convert_msmarco_passage_to_anserini.py

'''Converts MSMARCO's tsv collection to Anserini jsonl files with field configurations.'''
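
# Example usage (file names below are illustrative, not prescribed by this script):
#
#   python convert_msmarco_passage_to_anserini.py \
#       --collection_path collection.tsv \
#       --predictions predicted_queries.txt \
#       --output_folder ./msmarco_passage_jsonl
#
# The resulting jsonl files are in the format Anserini's JsonCollection indexer expects.
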
import argparse
import json
import os

# NLTK English stopwords
stop_words = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there',
              'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they',
              'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into',
              'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who',
              'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below',
              'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me',
              'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our',
              'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she',
              'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and',
              'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then',
              'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not',
              'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too',
              'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't',
              'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it',
              'how', 'further', 'was', 'here', 'than'}

# process text by tokenizing and removing stopwords
def process_text(text):
    processed = text.lower().replace('.', ' ').replace(',', ' ').replace('?', ' ')
    return [word for word in processed.split() if word not in stop_words]
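
# For illustration (not part of the original script):
#   process_text('What is the capital of France?') -> ['capital', 'france']
# since 'what', 'is', 'the', and 'of' are stopwords and the '?' is replaced by a space.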

# split new and repeated prediction words
def split_new_repeated(pred_text, doc_text):
    pred_repeated = []
    pred_new = []
    doc_text_set = set(process_text(doc_text))
    processed_pred_text = process_text(pred_text)
    for word in processed_pred_text:
        if word in doc_text_set:
            pred_repeated.append(word)
        else:
            pred_new.append(word)
    return pred_new, pred_repeated
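
# For illustration (not part of the original script), with
#   doc_text  = 'Paris is the capital of France'
#   pred_text = 'what country is paris the capital of'
# split_new_repeated(pred_text, doc_text) returns (['country'], ['paris', 'capital']):
# 'country' does not occur in the passage, while 'paris' and 'capital' are repeated.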

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Converts MSMARCO\'s tsv collection to Anserini jsonl '
                    'files.')
    parser.add_argument('--collection_path', required=True, help='MS MARCO .tsv collection file')
    parser.add_argument('--predictions', required=True, help='File containing predicted queries.')
    parser.add_argument('--output_folder', required=True, help='output folder')
    parser.add_argument('--max_docs_per_file', default=1000000, type=int,
                        help='maximum number of documents in each jsonl file.')
    # parameters to simulate BM25F via duplicated text
    parser.add_argument('--original_copies', default=1, type=int, help='number of original text duplicates.')
    parser.add_argument('--prediction_copies', default=1, type=int, help='number of predicted text duplicates.')
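    # Note (added for clarity, not in the original script): duplicating text is a
    # rough stand-in for per-field BM25 weights, since each extra copy of a term
    # raises its term frequency. For example, --original_copies 1 with
    # --prediction_copies 2 gives predicted query terms roughly twice the weight
    # of passage terms; the effect is only approximate because document length grows too.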
    # parameters to separate new and repeated prediction text
    parser.add_argument('--split_predictions', action='store_true',
                        help='separate predicted text into repeated and new.')
    parser.add_argument('--repeated_prediction_copies', default=1, type=int,
                        help='number of repeated predicted text duplicates; requires --split_predictions.')
    parser.add_argument('--new_prediction_copies', default=1, type=int,
                        help='number of new predicted text duplicates; requires --split_predictions.')
    args = parser.parse_args()

    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    print('Converting collection...')
    file_index = 0
    new_words = 0
    total_words = 0
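    # Each output line is one JSON record; the values below are illustrative:
    #   {"id": "1234", "contents": "passage text followed by predicted query terms ..."}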
    with open(args.collection_path) as f_corpus, open(args.predictions) as f_pred:
        for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
            # Write to a new file when the current one reaches maximum capacity.
            if i % args.max_docs_per_file == 0:
                if i > 0:
                    output_jsonl_file.close()
                output_path = os.path.join(args.output_folder, f'docs{file_index:02d}.json')
                output_jsonl_file = open(output_path, 'w')
                file_index += 1

            doc_id, doc_text = line_doc.rstrip().split('\t')
            pred_text = line_pred.rstrip()
            contents = ''
            if args.split_predictions:
                pred_new, pred_repeated = split_new_repeated(pred_text, doc_text)
                new_words += len(pred_new)
                total_words += len(pred_new) + len(pred_repeated)
                contents += (doc_text + ' ') * args.original_copies
                contents += (' '.join(pred_repeated) + ' ') * args.repeated_prediction_copies
                contents += (' '.join(pred_new) + ' ') * args.new_prediction_copies
            else:
                contents += (doc_text + ' ') * args.original_copies
                contents += (pred_text + ' ') * args.prediction_copies

            output_dict = {'id': doc_id, 'contents': contents}
            output_jsonl_file.write(json.dumps(output_dict) + '\n')

            if i % 100000 == 0:
                print('Converted {} docs in {} files'.format(i, file_index))

    if args.split_predictions:
        print(f"Found {100 * new_words/total_words}% new predicted text")

    output_jsonl_file.close()
    print('Done!')