Skip to content

Commit

Permalink
feat(translator): add font mapping and enhance style checks
Browse files: browse the repository at this point in the history
- introduce FontMapper to handle font mappings
- add page_font_map to store fonts for each page
- update get_translate_input to accept and use page_font_map
- enhance style check conditions to consider font mappings and styles
  • Loading branch information
awwaawwa committed Jan 21, 2025
1 parent 9393797 commit 7621b6a
Showing 1 changed file with 32 additions and 2 deletions.
34 changes: 32 additions & 2 deletions yadt/document_il/midend/il_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
PdfSameStyleCharacters,
PdfSameStyleUnicodeCharacters,
PdfStyle,
PdfFont,
)
from yadt.document_il.translator.translator import BaseTranslator
from yadt.document_il.utils.fontmap import FontMapper
from yadt.document_il.utils.layout_helper import (
get_char_unicode_string,
is_same_style,
Expand Down Expand Up @@ -122,6 +124,7 @@ def __init__(
):
self.translate_engine = translate_engine
self.translation_config = translation_config
self.font_mapper = FontMapper(translation_config)

def translate(self, docs: Document):
tracker = DocumentTranslateTracker()
Expand Down Expand Up @@ -151,12 +154,16 @@ def process_page(
tracker: PageTranslateTracker = None,
):
for paragraph in page.pdf_paragraph:
page_font_map = {}
for font in page.pdf_font:
page_font_map[font.font_id] = font
# self.translate_paragraph(paragraph, pbar)
executor.submit(
self.translate_paragraph,
paragraph,
pbar,
tracker.new_paragraph(),
page_font_map,
)

class TranslateInput:
Expand Down Expand Up @@ -204,7 +211,11 @@ def create_rich_text_placeholder(
right_placeholder,
)

def get_translate_input(self, paragraph: PdfParagraph):
def get_translate_input(
self,
paragraph: PdfParagraph,
page_font_map: dict[str, PdfFont] = None,
):
if not paragraph.pdf_paragraph_composition:
return
if len(paragraph.pdf_paragraph_composition) == 1:
Expand Down Expand Up @@ -244,14 +255,32 @@ def get_translate_input(self, paragraph: PdfParagraph):
chars.append(composition.pdf_character)
elif composition.pdf_same_style_characters:
if (
# Style matches the paragraph's base style; no placeholder needed
is_same_style(
composition.pdf_same_style_characters.pdf_style,
paragraph.pdf_style,
)
# Font size differs by a factor between 0.7 and 1.3, possibly a drop-cap (enlarged initial letter) effect; no placeholder needed
or is_same_style_except_size(
composition.pdf_same_style_characters.pdf_style,
paragraph.pdf_style,
)
or (
# Style is identical to the base style except for the font, and both fonts map to the same font; no placeholder needed
is_same_style_except_font(
composition.pdf_same_style_characters.pdf_style,
paragraph.pdf_style,
)
and self.font_mapper.map(
page_font_map[
composition.pdf_same_style_characters.pdf_style.font_id
],
"1",
).font_id
== self.font_mapper.map(
page_font_map[paragraph.pdf_style.font_id], "1"
).font_id
)
or len(composition.pdf_same_style_characters.pdf_character) == 1
):
chars.extend(composition.pdf_same_style_characters.pdf_character)
Expand Down Expand Up @@ -398,14 +427,15 @@ def translate_paragraph(
paragraph: PdfParagraph,
pbar: tqdm | None = None,
tracker: ParagraphTranslateTracker = None,
page_font_map: dict[str, PdfFont] = None,
):
with PbarContext(pbar):
if paragraph.vertical:
return

tracker.set_pdf_unicode(paragraph.unicode)

translate_input = self.get_translate_input(paragraph)
translate_input = self.get_translate_input(paragraph, page_font_map)
if not translate_input:
return

Expand Down

0 comments on commit 7621b6a

Please sign in to comment.