-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_conll.py
99 lines (70 loc) · 3.55 KB
/
split_conll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
def count_sentences(file_path):
    """Count the sentences in a CoNLL-style file.

    A sentence is counted for every data line whose third
    whitespace-separated column parses to the integer ``0`` (presumably the
    index of the token within its sentence -- confirm against the corpus
    format). Blank lines, lines whose first non-blank character is ``#``,
    and lines with fewer than three columns are ignored.

    Args:
        file_path: Path of the file to scan; read as UTF-8.

    Returns:
        The number of sentence-opening lines found.

    Raises:
        ValueError: If the third column of a data line is not an integer.
    """
    sentence_count = 0
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            columns = line.split()
            # Nothing but whitespace, or a '#begin document' / '#end document'
            # style marker: not a token line.
            if not columns or columns[0].startswith('#'):
                continue
            # A truncated line has no third column to inspect; skip it
            # instead of failing with IndexError.
            if len(columns) < 3:
                continue
            if int(columns[2]) == 0:
                sentence_count += 1
    return sentence_count
def split_conll_file(file_path, output_directory, split_ratio=0.6):
    """Split one CoNLL-style document into two consecutive part files.

    The input is streamed once. Its lines are copied to
    ``<name>_1.gold_conll`` until the split point is reached and to
    ``<name>_2.gold_conll`` afterwards, where ``<name>`` is the input file
    name without its extension. Every occurrence of ``<name>`` in a copied
    line is rewritten to the name of the part that receives it, and each part
    is wrapped in ``#begin document (...); part 0`` / ``#end document``
    markers. ``#`` lines of the input are dropped; blank lines are kept as
    sentence separators.

    Args:
        file_path: Path of the document to split; read as UTF-8.
        output_directory: Directory that receives the two part files. It is
            created when missing; an empty string means the current
            directory.
        split_ratio: Fraction of the sentence count used to place the split
            point. Defaults to ``0.6``, the value that used to be hard-coded.

    Raises:
        ValueError: If the third column of a data line is not an integer.
    """
    sentence_count = count_sentences(file_path)
    # 1-based index of the sentence that opens the second part.
    # NOTE(review): because the switch happens when this sentence *starts*,
    # part 1 receives one sentence fewer than this number. That matches the
    # previous behaviour and is kept on purpose; confirm it is the intended
    # split before changing it.
    switch_at = int(sentence_count * split_ratio)

    document_name = os.path.splitext(os.path.basename(file_path))[0]
    part1_name = '{}_1'.format(document_name)
    part2_name = '{}_2'.format(document_name)

    # os.makedirs('') raises, so only create a directory when one was named.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    file1_path = os.path.join(output_directory, part1_name + '.gold_conll')
    file2_path = os.path.join(output_directory, part2_name + '.gold_conll')

    with open(file_path, 'r', encoding='utf-8') as input_file, \
            open(file1_path, 'w', encoding='utf-8') as output_file1, \
            open(file2_path, 'w', encoding='utf-8') as output_file2:
        current_sentence_count = 0
        current_output_file = output_file1
        current_name = part1_name
        current_output_file.write(
            '#begin document ({}); part 0\n'.format(current_name))

        for line in input_file:
            line = line.lstrip()
            if not line:
                # Whitespace-only line: keep the sentence separator.
                current_output_file.write('\n')
                continue
            if line[0] == '#':
                # Original document markers are replaced by the ones written
                # here, so they are not copied.
                continue

            columns = line.split()
            # Lines too short to carry a third column cannot open a sentence;
            # they are copied through unchanged instead of raising IndexError.
            if len(columns) >= 3 and int(columns[2]) == 0:
                current_sentence_count += 1
                if current_sentence_count == switch_at:
                    current_output_file.write('#end document\n')
                    current_output_file = output_file2
                    current_name = part2_name
                    # Use the bare part name (no directory) so the header
                    # agrees with part 1 and with the rewritten document id.
                    current_output_file.write(
                        '#begin document ({}); part 0\n'.format(current_name))

            line = line.replace(document_name, current_name)
            # A final line without a newline would otherwise be glued to the
            # closing marker.
            if not line.endswith('\n'):
                line += '\n'
            current_output_file.write(line)

        current_output_file.write('#end document' + '\n')

    print("Split complete. Split files: '{}' and '{}'".format(file1_path, file2_path))
# file_path = 'litbank/dev/36_the_war_of_the_worlds_brat.gold_conll'
# split_conll_file(file_path)
# Source partitions and, position by position, the directories that receive
# their split files.
input_directories = ['litbank/dev', 'litbank/train', 'litbank/test']
output_directories = ['litbank_splitted/dev', 'litbank_splitted/train', 'litbank_splitted/test']

for input_directory, output_directory in zip(input_directories, output_directories):
    # Split every regular file of the partition; sub-directories are skipped.
    for filename in os.listdir(input_directory):
        file_path = os.path.join(input_directory, filename)
        if not os.path.isfile(file_path):
            continue
        split_conll_file(file_path, output_directory)