Skip to content

Commit

Permalink
refactor(styles_and_formulas): update bracket and special character handling
Browse files Browse the repository at this point in the history

- add (cid:104) and (cid:105) to the list of left and right brackets
- improve readability by formatting the list of characters

refactor(layout_helper): define and use HEIGHT_NOT_USFUL_CHAR_IN_CHAR

- introduce a constant for characters to ignore in height calculations
- update formular_height_ignore_char to use the new constant

refactor(paragraph_finder): use HEIGHT_NOT_USFUL_CHAR_IN_CHAR

- replace hardcoded list with the new constant for better maintainability
  • Loading branch information
awwaawwa committed Jan 20, 2025
1 parent 226b0d0 commit 01c754e
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 10 deletions.
8 changes: 2 additions & 6 deletions yadt/document_il/midend/paragraph_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
Layout,
add_space_dummy_chars,
get_char_unicode_string,
HEIGHT_NOT_USFUL_CHAR_IN_CHAR,
)


Expand Down Expand Up @@ -202,12 +203,7 @@ def get_layout(
] = "middle",
):
# 这几个符号,解析出来的大小经常只有实际大小的一点点。
if xy_mode != 'bottomright' and char.char_unicode in [
"∑︁",
# 来源于 arXiv:2310.18608v2 第九页公式大括号
"(cid:17)",
"(cid:16)",
]:
if xy_mode != "bottomright" and char.char_unicode in HEIGHT_NOT_USFUL_CHAR_IN_CHAR:
return self.get_layout(char, page, "bottomright")
# current layouts
# {
Expand Down
18 changes: 16 additions & 2 deletions yadt/document_il/midend/styles_and_formulas.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,11 +530,25 @@ def split_formula_by_comma(

for char in formula.pdf_character:
# 检查是否是左括号
if char.char_unicode in ["(cid:8)", "(", "(cid:16)", "{", "["]:
if char.char_unicode in [
"(cid:8)",
"(",
"(cid:16)",
"{",
"[",
"(cid:104)",
]:
bracket_level += 1
current_chars.append(char)
# 检查是否是右括号
elif char.char_unicode in ["(cid:9)", ")", "(cid:17)", "}", "]"]:
elif char.char_unicode in [
"(cid:9)",
")",
"(cid:17)",
"}",
"]",
"(cid:105)",
]:
bracket_level = max(0, bracket_level - 1) # 防止括号不匹配的情况
current_chars.append(char)
# 检查是否是逗号,且不在括号内
Expand Down
17 changes: 15 additions & 2 deletions yadt/document_il/utils/layout_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,26 @@
PdfParagraphComposition,
)

HEIGHT_NOT_USFUL_CHAR_IN_CHAR = (
"∑︁",
# 暂时假设cid:17和cid 16是特殊情况
# 来源于 arXiv:2310.18608v2 第九页公式大括号
"(cid:17)",
"(cid:16)",
# arXiv:2411.19509v2 第四页 []
"(cid:104)",
"(cid:105)",
"∑︁",
)

def formular_height_ignore_char(char: PdfCharacter):
return char.pdf_character_id is None or char.char_unicode in (
# 暂时假设cid:17和cid 16是特殊情况
# 来源于 arXiv:2310.18608v2 第九页公式大括号

"(cid:17)",
"(cid:16)",
"(cid:104)",
"(cid:105)",
"∑︁",
)


Expand Down

0 comments on commit 01c754e

Please sign in to comment.