Adding parameters for target image and anchor text sizes
jakep-allenai committed Oct 18, 2024
1 parent 1c8602c commit 492a3f6
Showing 1 changed file with 9 additions and 10 deletions.
19 changes: 9 additions & 10 deletions pdelfin/birrpipeline.py
@@ -321,9 +321,9 @@ def close(self):
thread.join()


-def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int) -> dict:
-    image_base64 = render_pdf_to_base64png(local_pdf_path, page, 1024)
-    anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport")
+def build_page_query(local_pdf_path: str, pretty_pdf_path: str, page: int, target_longest_image_dim: int, target_anchor_text_len: int) -> dict:
+    image_base64 = render_pdf_to_base64png(local_pdf_path, page, target_longest_image_dim=target_longest_image_dim)
+    anchor_text = get_anchor_text(local_pdf_path, page, pdf_engine="pdfreport", target_length=target_anchor_text_len)

return {
"custom_id": f"{pretty_pdf_path}-{page}",
@@ -423,7 +423,7 @@ def get_pdf_num_pages(s3_path: str) -> Optional[int]:

return None

-def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_round: int) -> list[dict]:
+def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_round: int, target_longest_image_dim: int, target_anchor_text_len: int) -> list[dict]:
db = DatabaseManager(s3_workspace)

existing_pages = db.get_index_entries(pdf.s3_path)
@@ -447,13 +447,13 @@ def build_pdf_queries(s3_workspace: str, pdf: DatabaseManager.PDFRecord, cur_round: int) -> list[dict]:

if has_errored_previously:
# Retry the page at least one more time regularly
-new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len), "round": cur_round})

# TODO: If the rotation was previously invalid, then apply a rotation

# TODO: Try to provide a smaller prompt hint
else:
-new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num), "round": cur_round})
+new_queries.append({**build_page_query(tf.name, pdf.s3_path, target_page_num, target_longest_image_dim, target_anchor_text_len), "round": cur_round})
except Exception as ex:
print(f"Warning, could not get batch inferences lines for {pdf.s3_path} due to {ex}")

@@ -550,9 +550,8 @@ def get_current_round(s3_workspace: str) -> int:
parser = argparse.ArgumentParser(description='Manager for running millions of PDFs through a batch inference pipeline')
parser.add_argument('workspace', help='The S3 path where work will be done e.g., s3://bucket/prefix/)')
parser.add_argument('--add_pdfs', help='Path to add pdfs stored in s3 to the workspace, can be a glob path s3://bucket/prefix/*.pdf or path to file containing list of pdf paths', default=None)
-parser.add_argument('--prefilter_lang', help='If set, tries to detect the language of the pdf and only accepts it if it matches (ex. ENGLISH)')
-parser.add_argument('--prefilter_spam', help='If set, tries to detect spammy pdfs and not include them')
-
+parser.add_argument('--target_longest_image_dim', type=int, help='Dimension to use for rendering image', default=1024)
+parser.add_argument('--target_anchor_text_len', type=int, help='Maximum amount of anchor text to use', default=6000)
parser.add_argument('--workspace_profile', help='S3 configuration profile for accessing the workspace', default=None)
parser.add_argument('--pdf_profile', help='S3 configuration profile for accessing the raw pdf documents', default=None)
parser.add_argument('--max_size_mb', type=int, default=250, help='Max file size in MB')
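
In isolation, the two new flags behave as in this self-contained argparse sketch; the parser here is illustrative, and the real parser in birrpipeline.py defines many more options around these:

import argparse

# Standalone sketch of the two new sizing flags and their defaults.
sketch = argparse.ArgumentParser(description='Sizing flags only (illustrative)')
sketch.add_argument('--target_longest_image_dim', type=int, default=1024,
                    help='Dimension to use for rendering image')
sketch.add_argument('--target_anchor_text_len', type=int, default=6000,
                    help='Maximum amount of anchor text to use')

opts = sketch.parse_args([])   # no flags supplied, so the defaults apply
assert opts.target_longest_image_dim == 1024
assert opts.target_anchor_text_len == 6000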
@@ -631,7 +630,7 @@ def get_current_round(s3_workspace: str) -> int:
potentially_done_pdfs = db.get_pdfs_by_status("pending")
else:
print(f"\nCreating batch inference files for new PDFs")
-future_to_path = {executor.submit(build_pdf_queries, args.workspace, pdf, current_round): pdf for pdf in db.get_pdfs_by_status("pending")}
+future_to_path = {executor.submit(build_pdf_queries, args.workspace, pdf, current_round, args.target_longest_image_dim, args.target_anchor_text_len): pdf for pdf in db.get_pdfs_by_status("pending")}
potentially_done_pdfs = []
lines_written = 0
new_inference_writer = BatchWriter(f"{args.workspace}/inference_inputs/round_{current_round}", args.max_size_mb)
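Taken together, the flags flow from the command line through build_pdf_queries and into build_page_query roughly as sketched below; the helper function, its name, and the sequential loop are simplifications of the thread-pool code above:

# Simplified, illustrative flow of the new parameters; rounds, error handling
# and the ThreadPoolExecutor from the real pipeline are omitted.
def sketch_build_all_queries(db, args, current_round):
    all_queries = []
    for pdf in db.get_pdfs_by_status("pending"):
        all_queries.extend(
            build_pdf_queries(
                args.workspace,
                pdf,
                current_round,
                args.target_longest_image_dim,   # ends up in render_pdf_to_base64png
                args.target_anchor_text_len,     # ends up in get_anchor_text
            )
        )
    return all_queries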
