Skip to content

This issue was moved to a discussion.

You can continue the conversation there. Go to discussion →

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

文字识别后返回单字识别坐标目前paddleocr支持吗 #10815

Closed
jingmingtao opened this issue Sep 4, 2023 · 6 comments
Closed

文字识别后返回单字识别坐标目前paddleocr支持吗 #10815

jingmingtao opened this issue Sep 4, 2023 · 6 comments
Assignees

Comments

@jingmingtao
Copy link

文字识别后返回单字识别坐标目前paddleocr支持吗

@ToddBear ToddBear self-assigned this Sep 4, 2023
@ToddBear
Copy link
Collaborator

ToddBear commented Sep 4, 2023

支持的,现在PPStructure已有返回单字坐标的功能,可以参考PR: #10515

如果想在普通的识别中使用该功能的话,只需修改一下部分源码即可

@jingmingtao
Copy link
Author

thanks,i will have a try

@jingmingtao
Copy link
Author

支持的,现在PPStructure已有返回单字坐标的功能,可以参考PR: #10515

如果想在普通的识别中使用该功能的话,只需修改一下部分源码即可

您好,如果想在普通的识别中使用该功能的话,有具体修改好的代码工程吗,或者可以说一下具体修改哪里吗

@dengmingD
Copy link

dengmingD commented Sep 19, 2023

我改了一下:
1.predict_system,增加两个参数,a.rec_algorithm=CTCLabelDecode,b.return_word_box=True
2.修改tools/infer/predict_system.py,增加from ppstructure.utility import cal_ocr_word_box
3.修改这个文件里的main方法:如下
def main(args):
    """Run the detection + recognition pipeline over ``args.image_dir``.

    For every input image (or every selected PDF page) the text system is
    called, each recognised line is logged, and one tab-separated
    ``<name>\\t<json>`` record is collected.  All records are written to
    ``system_results.txt`` inside ``args.draw_img_save_dir``; a side-by-side
    visualisation of each image is saved to the same directory.

    When ``args.return_word_box`` is set and the recogniser returned a third
    element per line, ``cal_ocr_word_box`` is called for each line and its
    second return value is stored under the ``"word_boxs"`` key of the record
    and forwarded to ``draw_ocr_box_txt``.

    Args:
        args: parsed command-line namespace.  This function reads
            ``image_dir``, ``process_id``, ``total_process_num``,
            ``vis_font_path``, ``drop_score``, ``draw_img_save_dir``,
            ``warmup``, ``page_num``, ``return_word_box`` and ``benchmark``.
    """
    image_file_list = get_image_file_list(args.image_dir)
    image_file_list = image_file_list[args.process_id::args.total_process_num]
    text_sys = TextSystem(args)
    is_visualize = True
    font_path = args.vis_font_path
    drop_score = args.drop_score
    draw_img_save_dir = args.draw_img_save_dir
    os.makedirs(draw_img_save_dir, exist_ok=True)
    save_results = []

    logger.info(
        "In PP-OCRv3, rec_image_shape parameter defaults to '3, 48, 320', "
        "if you are using recognition model with PP-OCRv2 or an older version, please set --rec_image_shape='3,32,320"
    )

    # warm up 10 times
    if args.warmup:
        img = np.random.uniform(0, 255, [640, 640, 3]).astype(np.uint8)
        for _ in range(10):
            text_sys(img)

    _st = time.time()
    for idx, image_file in enumerate(image_file_list):

        img, flag_gif, flag_pdf = check_and_read(image_file)
        if not flag_gif and not flag_pdf:
            img = cv2.imread(image_file)
        if not flag_pdf:
            if img is None:
                logger.debug("error in loading image:{}".format(image_file))
                continue
            imgs = [img]
        else:
            # For PDFs `img` is indexed per page; 0 or an out-of-range
            # page_num means "all pages".
            page_num = args.page_num
            if page_num > len(img) or page_num == 0:
                page_num = len(img)
            imgs = img[:page_num]

        for index, img in enumerate(imgs):
            starttime = time.time()
            dt_boxes, rec_res, _ = text_sys(img)
            elapse = time.time() - starttime
            if len(imgs) > 1:
                logger.debug(
                    str(idx) + '_' + str(index) + "  Predict time of %s: %.3fs"
                    % (image_file, elapse))
            else:
                logger.debug(
                    str(idx) + "  Predict time of %s: %.3fs" % (image_file,
                                                                elapse))

            # Defensive: treat a missing result as "no text found" so an
            # empty page does not abort the whole batch.
            # NOTE(review): whether text_sys can return None here depends on
            # the TextSystem version -- confirm.
            if dt_boxes is None or rec_res is None:
                dt_boxes, rec_res = [], []

            # With word-box output enabled each rec_res entry carries a third
            # element, e.g.:
            #   第3页共10页, 0.996, [36.769230769230774,
            #       [['第'], ['3'], ['页', '共'], ['1', '0'], ['页']],
            #       [[2], [7], [13, 20], [25, 28], [33]],
            #       ['cn', 'en&num', 'cn', 'en&num', 'cn']]
            # Guard against an empty rec_res before peeking at entry 0.
            has_word_info = len(rec_res) > 0 and len(rec_res[0]) == 3
            if has_word_info:
                for text, score, word_info in rec_res:
                    logger.debug("{}, {:.3f}, {}".format(text, score,
                                                         word_info))
            else:
                for text, score in rec_res:
                    logger.debug("{}, {:.3f}".format(text, score))

            # One (possibly empty) list of word boxes per detected line, so
            # the visualiser always receives a list aligned with dt_boxes.
            word_boxs = [[] for _ in dt_boxes]
            # Only compute word boxes when they were requested AND the
            # recogniser actually produced the per-word info they need.
            want_word_boxes = bool(args.return_word_box) and has_word_info
            if want_word_boxes:
                for i, box in enumerate(dt_boxes):
                    _, word_box = cal_ocr_word_box(rec_res[i][0], box,
                                                   rec_res[i][2])
                    word_boxs[i] = word_box

            res = []
            for i, box in enumerate(dt_boxes):
                item = {
                    "transcription": rec_res[i][0],
                    "points": np.array(box).astype(np.int32).tolist(),
                }
                if want_word_boxes:
                    item["word_boxs"] = word_boxs[i]
                res.append(item)

            if len(imgs) > 1:
                save_pred = os.path.basename(image_file) + '_' + str(
                    index) + "\t" + json.dumps(
                        res, ensure_ascii=False) + "\n"
            else:
                save_pred = os.path.basename(image_file) + "\t" + json.dumps(
                    res, ensure_ascii=False) + "\n"
            save_results.append(save_pred)

            if is_visualize:
                image = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
                boxes = dt_boxes
                txts = [rec[0] for rec in rec_res]
                scores = [rec[1] for rec in rec_res]
                # Example of per-word output from cal_ocr_word_box:
                #   ['第', '3', '页', '共', '10', '页']
                #   [((876.0, 2553.0), (882.0, 2553.0),
                #     (882.0, 2592.0), (876.0, 2592.0)), ...]
                draw_img = draw_ocr_box_txt(
                    image,
                    boxes,
                    word_boxs,
                    txts,
                    scores,
                    drop_score=drop_score,
                    font_path=font_path)

                if flag_gif:
                    save_file = image_file[:-3] + "png"
                elif flag_pdf:
                    save_file = image_file.replace('.pdf',
                                                   '_' + str(index) + '.png')
                else:
                    save_file = image_file
                # Write to the configured output directory (the same path
                # that is logged below), not a hard-coded drive.
                vis_path = os.path.join(draw_img_save_dir,
                                        os.path.basename(save_file))
                # draw_img is RGB; reverse the channel axis for cv2 (BGR).
                cv2.imwrite(vis_path, draw_img[:, :, ::-1])
                logger.debug("The visualized image saved in {}".format(
                    vis_path))

    logger.info("The predict total time is {}".format(time.time() - _st))
    if args.benchmark:
        text_sys.text_detector.autolog.report()
        text_sys.text_recognizer.autolog.report()
    with open(
            os.path.join(draw_img_save_dir, "system_results.txt"),
            'w',
            encoding='utf-8') as f:
        f.writelines(save_results)

4.修改tools/infer/utility.py,增加了一个参数word_boxs,用于画单个字的矩形框
def draw_ocr_box_txt(image,
                     boxes,
                     word_boxs,
                     txts=None,
                     scores=None,
                     drop_score=0.5,
                     font_path="./doc/fonts/simfang.ttf"):
    """Render a side-by-side OCR visualisation with per-word rectangles.

    The left half is ``image`` blended 50/50 with randomly coloured, filled
    line polygons.  The right half is a white canvas onto which the text
    rendering returned by ``draw_box_txt_fine`` is composited for each line,
    with a rectangle drawn around every word box of that line.

    Args:
        image: PIL image to annotate (``height``, ``width`` and ``copy()``
            are used).
        boxes: sequence of text-line polygons.
        word_boxs: per-line sequences of word boxes, aligned with ``boxes``.
            Each word box is indexed as ``b[0]`` / ``b[2]`` for the two
            opposite rectangle corners.  ``None`` or a list shorter than
            ``boxes`` is padded with empty entries so every line is still
            drawn.
        txts: recognised text per line; ignored (treated as ``None`` for
            every line) when its length does not match ``boxes``.
        scores: optional per-line confidences; lines scoring below
            ``drop_score`` are skipped.
        drop_score: confidence threshold used with ``scores``.
        font_path: font file forwarded to ``draw_box_txt_fine``.

    Returns:
        numpy array of the combined image, twice as wide as ``image``.
    """
    h, w = image.height, image.width
    img_left = image.copy()
    img_right = np.ones((h, w, 3), dtype=np.uint8) * 255
    # Fixed seed keeps the per-line colours reproducible between runs.
    random.seed(0)

    draw_left = ImageDraw.Draw(img_left)
    if txts is None or len(txts) != len(boxes):
        txts = [None] * len(boxes)

    # zip() stops at the shortest input, so a missing or short word_boxs
    # would silently suppress drawing of the remaining lines. Pad it with
    # empty entries instead.
    word_boxs = [] if word_boxs is None else list(word_boxs)
    missing = len(boxes) - len(word_boxs)
    if missing > 0:
        word_boxs.extend([] for _ in range(missing))

    for idx, (box, txt, word_box) in enumerate(zip(boxes, txts, word_boxs)):
        if scores is not None and scores[idx] < drop_score:
            continue
        color = (random.randint(0, 255), random.randint(0, 255),
                 random.randint(0, 255))
        draw_left.polygon(box, fill=color)
        img_right_text = draw_box_txt_fine((w, h), box, txt, font_path)
        for b in word_box:
            b = np.array(b, np.int32)
            # Hand OpenCV plain Python int tuples; numpy rows / numpy scalars
            # are not accepted as points by every OpenCV build.
            corner_a = (int(b[0][0]), int(b[0][1]))
            corner_b = (int(b[2][0]), int(b[2][1]))
            cv2.rectangle(img_right_text, corner_a, corner_b, color, 1)
        img_right = cv2.bitwise_and(img_right, img_right_text)
    img_left = Image.blend(image, img_left, 0.5)
    img_show = Image.new('RGB', (w * 2, h), (255, 255, 255))
    img_show.paste(img_left, (0, 0, w, h))
    img_show.paste(Image.fromarray(img_right), (w, 0, w * 2, h))
    return np.array(img_show)

一定要修改ppocr2.6,由于改的时间较长,有点忘记改哪了,有问题请留言

参数如下:
--image_dir=D:/imgs/ocr/img1.jpg
--det_model_dir=D:/models/ocrv4/ch_PP-OCRv4_det_infer
--rec_model_dir=D:/models/ocrv4/ch_PP-OCRv4_rec_infer
--rec_char_dict_path=D:/models/ppocr_keys_v1.txt
--rec_algorithm=CTCLabelDecode
--det_db_thresh=0.3
--det_db_box_thresh=0.3
--det_db_unclip_ratio=1.5
--use_gpu=True
--return_word_box=True

@jingmingtao
Copy link
Author

好的谢谢您,有问题再请教您,万分感谢

@yangy996
Copy link

这种后处理方式,如果文字间隔比较大的话,单个文字定位就很差

@PaddlePaddle PaddlePaddle locked and limited conversation to collaborators May 25, 2024
@SWHL SWHL converted this issue into discussion #12227 May 25, 2024

This issue was moved to a discussion.

You can continue the conversation there. Go to discussion →

Labels
None yet
Projects
None yet
Development

No branches or pull requests

5 participants