-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_to_hf_dataset.py
58 lines (48 loc) · 1.8 KB
/
convert_to_hf_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
convert_to_hf_dataset.py
Usage:
python3 convert_to_hf_dataset.py --dataset-dir synthetic_dataset
Description:
This script takes the JSON files from your synthetic data generator
and converts them into a Hugging Face dataset (with text, bbox, and image path).
It does not modify or remove any lines in your existing generation code.
"""
import os
import json
import argparse
from datasets import Dataset
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset-dir",
type=str,
default="synthetic_dataset",
help="Directory containing the generated synthetic images and .json annotations."
)
args = parser.parse_args()
records = []
# Loop over all per-image .json files
for file_name in sorted(os.listdir(args.dataset_dir)):
if file_name.endswith(".json"):
json_path = os.path.join(args.dataset_dir, file_name)
with open(json_path, "r") as f:
annotation = json.load(f)
# Collect relevant fields
# 'annotation["image"]' should be the .png path
# 'annotation["text"]' the GT text
# 'annotation["bbox"]' or 'annotation["bboxes"]' the bounding boxes
record = {
"image_path": annotation["image"],
"text": annotation["text"],
"bbox": annotation.get("bbox", [])
}
records.append(record)
# Build a Hugging Face Dataset
hf_dataset = Dataset.from_list(records)
# Save it to disk in the same directory or a subfolder
save_path = os.path.join(args.dataset_dir, "hf_dataset")
hf_dataset.save_to_disk(save_path)
print(f"HF dataset saved to: {save_path}")
print(f"Number of samples: {len(hf_dataset)}")
if __name__ == "__main__":
main()