diff --git a/olmocr/train/hf/convertjsontoparquet.py b/olmocr/train/hf/convertjsontoparquet.py index 7ffac5c..3f76a5e 100644 --- a/olmocr/train/hf/convertjsontoparquet.py +++ b/olmocr/train/hf/convertjsontoparquet.py @@ -14,9 +14,7 @@ import multiprocessing import os import re -import shutil import sqlite3 -import tempfile from dataclasses import dataclass from typing import Dict, List, Optional, Set, Tuple from urllib.parse import urlparse diff --git a/olmocr/train/hf/hfhub_upload.py b/olmocr/train/hf/hfhub_upload.py index 3e7a6a8..33f922a 100644 --- a/olmocr/train/hf/hfhub_upload.py +++ b/olmocr/train/hf/hfhub_upload.py @@ -1,11 +1,9 @@ import logging import os import tarfile -from concurrent.futures import ProcessPoolExecutor, as_completed from math import ceil from huggingface_hub import HfApi -from tqdm import tqdm # Configuration pdf_dir = "pdfs" # Directory with PDF files (flat structure)