forked from All-Hands-AI/OpenHands
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_dataset.py
47 lines (35 loc) · 1.41 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# This file was used to create the Hugging Face dataset from the exercism/python
# GitHub repo.
# Refer to: https://github.com/exercism/python/tree/main/exercises/practice
import os
from pathlib import Path
from datasets import Dataset
# Location of the checked-out ``exercises/practice`` folder and the hub
# repository the finished dataset is uploaded to.
PRACTICE_DIR = 'practice/'
HUB_REPO_ID = 'RajMaheshwari/Exercism-Python'


def _read_utf8(path):
    """Return the text of *path*, always decoded as UTF-8.

    The encoding is passed explicitly so the result does not depend on the
    platform's default locale encoding.
    """
    return path.read_text(encoding='utf-8')


def _read_instruction(testdir):
    """Return the full instruction text for the exercise in *testdir*.

    The text is the concatenation, in this order and with no separator, of:

    * ``.docs/introduction.md`` (only if present),
    * ``.docs/instructions.md`` (must exist),
    * ``.docs/instructions.append.md`` (only if present).

    Raises:
        FileNotFoundError: if ``.docs/instructions.md`` is missing.
    """
    docs = testdir / '.docs'
    parts = []

    introduction = docs / 'introduction.md'
    if introduction.exists():
        parts.append(_read_utf8(introduction))

    parts.append(_read_utf8(docs / 'instructions.md'))

    instructions_append = docs / 'instructions.append.md'
    if instructions_append.exists():
        parts.append(_read_utf8(instructions_append))

    return ''.join(parts)


def build_dataset(practice_dir=PRACTICE_DIR) -> dict:
    """Collect every exercise found under *practice_dir* into column lists.

    Args:
        practice_dir: path (``str`` or ``Path``) of the folder that holds one
            sub-directory per exercise.

    Returns:
        A dict with the keys ``instance_id``, ``instance_name``,
        ``instruction``, ``signature`` and ``test``. Each value is a list
        with one entry per exercise, ordered by sorted directory name.

    Raises:
        FileNotFoundError: if *practice_dir* or one of an exercise's required
            files (instructions, stub module, test module) does not exist.
    """
    root = Path(practice_dir)
    # Only sub-directories are exercises; a stray file in the folder would
    # otherwise make the run fail when its ``.docs`` files are looked up.
    exercise_names = sorted(
        name for name in os.listdir(root) if (root / name).is_dir()
    )

    dataset = {
        'instance_id': [],
        'instance_name': [],
        'instruction': [],
        'signature': [],
        'test': [],
    }
    for i, name in enumerate(exercise_names):
        testdir = root / name
        # Directory names use dashes, the Python files inside use underscores.
        module_name = name.replace('-', '_')

        dataset['instance_id'].append(i)
        dataset['instance_name'].append(module_name)
        dataset['instruction'].append(_read_instruction(testdir))
        dataset['signature'].append(_read_utf8(testdir / (module_name + '.py')))
        dataset['test'].append(_read_utf8(testdir / (module_name + '_test.py')))
    return dataset


def main(practice_dir=PRACTICE_DIR, repo_id=HUB_REPO_ID) -> None:
    """Build the dataset from *practice_dir* and push it to *repo_id*."""
    ds = Dataset.from_dict(build_dataset(practice_dir))
    ds.push_to_hub(repo_id)


if __name__ == '__main__':
    main()