forked from nachiketashunya/Amazon-ML-Challenge-2024
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataprep.py
57 lines (47 loc) · 1.9 KB
/
dataprep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import csv
import json
import os
# Build a ShareGPT-style JSON dataset from a CSV of labelled image links.
#
# For every CSV row whose image file is present in the local image folder,
# one two-turn conversation is emitted: a user question naming the entity to
# read off the image, and an assistant answer holding the labelled value.
# Rows whose image is missing locally are reported on stdout and skipped.

# Paths
csv_file_path = 'data/processed_train.csv'  # Input CSV; read columns: image_link, entity_name, entity_value
image_folder_path = '/scratch/m23csa016/aml/train'  # Folder where the images are stored
output_json_file = 'data/processed_train.json'  # Output JSON file

# Conversations collected so far, in CSV row order
dataset = []

# newline='' is what the csv module requires so that line breaks embedded in
# quoted fields are parsed correctly; the explicit encoding keeps the result
# independent of the platform's locale default.
with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Keep only the last path component of the link,
        # e.g. '61I9XdN6OFL.jpg', and look for it in the image folder.
        image_url = row['image_link']
        image_filename = os.path.basename(image_url)
        image_path = os.path.join(image_folder_path, image_filename)

        # Guard clause: a row without a local image cannot be used.
        if not os.path.exists(image_path):
            print(f"Image {image_filename} not found in {image_folder_path}, skipping.")
            continue

        entity_name = row['entity_name']
        entity_value = row['entity_value']

        # One conversation in ShareGPT format: the user prompt starts with the
        # literal '<image>' tag and the image path is listed under "images".
        conversation = {
            "messages": [
                {
                    "content": f"<image>What is the {entity_name}?",
                    "role": "user",
                },
                {
                    "content": f"{entity_value}",
                    "role": "assistant",
                },
            ],
            "images": [
                image_path,
            ],
        }
        dataset.append(conversation)

# Save the dataset to a JSON file (a single top-level list of conversations)
with open(output_json_file, 'w', encoding='utf-8') as json_file:
    json.dump(dataset, json_file, indent=4)

print(f"Dataset successfully saved to {output_json_file}")