Fix TREC-COVID regressions (#2196)

castorini · Sep 18, 2023 · 88935fc
1 parent bde7fb4
commit 88935fc
Show file tree

Hide file tree

Showing 10 changed files with 147 additions and 147 deletions.
diff --git a/src/main/python/trec-covid/covid_baseline_tools.py b/src/main/python/trec-covid/covid_baseline_tools.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
+import math
 import os
 import re
 import subprocess
@@ -22,11 +22,11 @@
 
 
 def perform_runs(round_number, indexes):
-    base_topics = f'src/main/resources/topics-and-qrels/topics.covid-round{round_number}.xml'
-    udel_topics = f'src/main/resources/topics-and-qrels/topics.covid-round{round_number}-udel.xml'
+    base_topics = f'tools/topics-and-qrels/topics.covid-round{round_number}.xml'
+    udel_topics = f'tools/topics-and-qrels/topics.covid-round{round_number}-udel.xml'
 
     # Use cumulative qrels from previous round for relevance feedback runs
-    cumulative_qrels = f'src/main/resources/topics-and-qrels/qrels.covid-round{round_number - 1}-cumulative.txt'
+    cumulative_qrels = f'tools/topics-and-qrels/qrels.covid-round{round_number - 1}-cumulative.txt'
 
     print('')
     print('## Running on abstract index...')
@@ -119,7 +119,7 @@ def perform_fusion(round_number, run_checksums, check_md5=True):
 
 def prepare_final_submissions(round_number, run_checksums, check_md5=True):
     # Remove the cumulative qrels from the previous round.
-    qrels = f'src/main/resources/topics-and-qrels/qrels.covid-round{round_number - 1}-cumulative.txt'
+    qrels = f'tools/topics-and-qrels/qrels.covid-round{round_number - 1}-cumulative.txt'
 
     print('')
     print('## Preparing final submission files by removing qrels...')
@@ -235,8 +235,8 @@ def evaluate_runs(qrels, runs, expected={}, check_md5=True):
             for key in ['topics', 'ndcg_cut_10', 'judged_cut_10', 'ndcg_cut_20',
                         'judged_cut_20', 'map', 'recall_1000', 'judged_cut_1000']:
                 if key in expected[run]:
-                    assert metrics[key] == expected[run][key],\
-                        f'\'{key}\' doesn\'t match, expected {expected[run][key]} got {metrics[key]}!'
+                    assert math.isclose(metrics[key], expected[run][key], rel_tol=1e-4), \
+                        f'\'{key}\' doesn\'t match, expected {expected[run][key]:.4f} got {metrics[key]:.4f}!'
 
         if check_md5:
             assert metrics['md5'] == runs[run], f'Error in producing {run}!'

diff --git a/src/main/python/trec-covid/generate_query_udel.py b/src/main/python/trec-covid/generate_query_udel.py
@@ -95,14 +95,14 @@
     original_query_file_name = f'topics.covid-round{args.round}.xml'
     original_query_file = os.path.join(
         args.anserini_root,
-        'src/main/resources/topics-and-qrels',
+        'tools/topics-and-qrels',
         original_query_file_name
     )
 
     output_query_file_name = f'topics.covid-round{args.round}-udel.xml'
     output_query_file = os.path.join(
         args.anserini_root,
-        'src/main/resources/topics-and-qrels',
+        'tools/topics-and-qrels',
         output_query_file_name
     )
 

diff --git a/src/main/python/trec-covid/generate_round1_baselines.py b/src/main/python/trec-covid/generate_round1_baselines.py
@@ -54,8 +54,8 @@
 
 
 def perform_runs():
-    base_topics = f'src/main/resources/topics-and-qrels/topics.covid-round1.xml'
-    udel_topics = f'src/main/resources/topics-and-qrels/topics.covid-round1-udel.xml'
+    base_topics = f'tools/topics-and-qrels/topics.covid-round1.xml'
+    udel_topics = f'tools/topics-and-qrels/topics.covid-round1-udel.xml'
 
     print('')
     print('## Running on abstract index...')
@@ -178,7 +178,7 @@ def main():
     if not (os.path.isdir(indexes[0]) and os.path.isdir(indexes[1]) and os.path.isdir(indexes[2])):
         print('Required indexes do not exist. Please download first.')
 
-    round1_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round1.txt'
+    round1_qrels = 'tools/topics-and-qrels/qrels.covid-round1.txt'
 
     # Note that this script was written after this issue was noted: https://github.com/castorini/anserini/issues/1669
     # Thus, no point in checking MD5.
@@ -189,7 +189,7 @@ def main():
 
     expected_metrics = {
         'anserini.covid-r1.abstract.query.bm25.txt':
-            {'ndcg_cut_10': 0.4100, 'judged_cut_10': 0.8267, 'recall_1000': 0.5279},
+            {'ndcg_cut_10': 0.4100, 'judged_cut_10': 0.8267, 'recall_1000': 0.5285},
         'anserini.covid-r1.abstract.question.bm25.txt':
             {'ndcg_cut_10': 0.5179, 'judged_cut_10': 0.9833, 'recall_1000': 0.6313},
         'anserini.covid-r1.abstract.query+question.bm25.txt':
@@ -213,17 +213,17 @@ def main():
         'anserini.covid-r1.full-text.query-covid19.bm25.txt':
             {'ndcg_cut_10': 0.2434, 'judged_cut_10': 0.5233, 'recall_1000': 0.5692},
         'anserini.covid-r1.paragraph.query.bm25.txt':
-            {'ndcg_cut_10': 0.4302, 'judged_cut_10': 0.8400, 'recall_1000': 0.4327},
+            {'ndcg_cut_10': 0.4303, 'judged_cut_10': 0.8400, 'recall_1000': 0.4324},
         'anserini.covid-r1.paragraph.question.bm25.txt':
-            {'ndcg_cut_10': 0.4410, 'judged_cut_10': 0.9167, 'recall_1000': 0.5111},
+            {'ndcg_cut_10': 0.4410, 'judged_cut_10': 0.9167, 'recall_1000': 0.5108},
         'anserini.covid-r1.paragraph.query+question.bm25.txt':
             {'ndcg_cut_10': 0.5450, 'judged_cut_10': 0.9733, 'recall_1000': 0.5743},
         'anserini.covid-r1.paragraph.query+question+narrative.bm25.txt':
             {'ndcg_cut_10': 0.4899, 'judged_cut_10': 0.8967, 'recall_1000': 0.5918},
         'anserini.covid-r1.paragraph.query-udel.bm25.txt':
             {'ndcg_cut_10': 0.5544, 'judged_cut_10': 0.9200, 'recall_1000': 0.5640},
         'anserini.covid-r1.paragraph.query-covid19.bm25.txt':
-            {'ndcg_cut_10': 0.3180, 'judged_cut_10': 0.5333, 'recall_1000': 0.3552},
+            {'ndcg_cut_10': 0.3180, 'judged_cut_10': 0.5300, 'recall_1000': 0.3552},
         'anserini.covid-r1.fusion1.txt':
             {'ndcg_cut_10': 0.5716, 'judged_cut_10': 0.9867, 'recall_1000': 0.8122},
         'anserini.covid-r1.fusion2.txt':

diff --git a/src/main/python/trec-covid/generate_round2_baselines.py b/src/main/python/trec-covid/generate_round2_baselines.py
@@ -50,8 +50,8 @@
 
 
 def perform_runs():
-    base_topics = f'src/main/resources/topics-and-qrels/topics.covid-round2.xml'
-    udel_topics = f'src/main/resources/topics-and-qrels/topics.covid-round2-udel.xml'
+    base_topics = f'tools/topics-and-qrels/topics.covid-round2.xml'
+    udel_topics = f'tools/topics-and-qrels/topics.covid-round2-udel.xml'
 
     print('')
     print('## Running on abstract index...')
@@ -135,7 +135,7 @@ def perform_fusion(run_checksums, check_md5=True):
 
 def prepare_final_submissions(run_checksums, check_md5=True):
     # Remove the cumulative qrels from the previous round.
-    qrels = f'src/main/resources/topics-and-qrels/qrels.covid-round1.txt'
+    qrels = f'tools/topics-and-qrels/qrels.covid-round1.txt'
 
     print('')
     print('## Preparing final submission files by removing qrels...')
@@ -170,8 +170,8 @@ def main():
     if not (os.path.isdir(indexes[0]) and os.path.isdir(indexes[1]) and os.path.isdir(indexes[2])):
         print('Required indexes do not exist. Please download first.')
 
-    round1_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round1.txt'
-    round2_qrels = 'src/main/resources/topics-and-qrels/qrels.covid-round2.txt'
+    round1_qrels = 'tools/topics-and-qrels/qrels.covid-round1.txt'
+    round2_qrels = 'tools/topics-and-qrels/qrels.covid-round2.txt'
 
     # Note that this script was written after this issue was noted: https://github.com/castorini/anserini/issues/1669
     # Thus, no point in checking MD5.
@@ -183,23 +183,23 @@ def main():
 
     expected_metrics = {
         'anserini.covid-r2.abstract.qq.bm25.txt':
-            {'topics': 35, 'ndcg_cut_10': 0.3522, 'judged_cut_10': 0.5371, 'ndcg_cut_20': 0.3171,
+            {'topics': 35, 'ndcg_cut_10': 0.3521, 'judged_cut_10': 0.5371, 'ndcg_cut_20': 0.3170,
              'judged_cut_20': 0.5100, 'map': 0.1752, 'recall_1000': 0.6601, 'judged_cut_1000': 0.1013},
         'anserini.covid-r2.abstract.qdel.bm25.txt':
             {'topics': 35, 'ndcg_cut_10': 0.3781, 'judged_cut_10': 0.5371, 'ndcg_cut_20': 0.3462,
-             'judged_cut_20': 0.4829, 'map': 0.1804, 'recall_1000': 0.6485, 'judged_cut_1000': 0.0958},
+             'judged_cut_20': 0.4829, 'map': 0.1803, 'recall_1000': 0.6485, 'judged_cut_1000': 0.0958},
         'anserini.covid-r2.full-text.qq.bm25.txt':
             {'topics': 35, 'ndcg_cut_10': 0.2070, 'judged_cut_10': 0.4286, 'ndcg_cut_20': 0.1931,
              'judged_cut_20': 0.3929, 'map': 0.1159, 'recall_1000': 0.5953, 'judged_cut_1000': 0.0995},
         'anserini.covid-r2.full-text.qdel.bm25.txt':
             {'topics': 35, 'ndcg_cut_10': 0.3123, 'judged_cut_10': 0.4229, 'ndcg_cut_20': 0.2738,
              'judged_cut_20': 0.3929, 'map': 0.1473, 'recall_1000': 0.6517, 'judged_cut_1000': 0.1022},
         'anserini.covid-r2.paragraph.qq.bm25.txt':
-            {'topics': 35, 'ndcg_cut_10': 0.2772, 'judged_cut_10': 0.4400, 'ndcg_cut_20': 0.2579,
+            {'topics': 35, 'ndcg_cut_10': 0.2770, 'judged_cut_10': 0.4400, 'ndcg_cut_20': 0.2578,
              'judged_cut_20': 0.4529, 'map': 0.1607, 'recall_1000': 0.7248, 'judged_cut_1000': 0.1220},
         'anserini.covid-r2.paragraph.qdel.bm25.txt':
-            {'topics': 35, 'ndcg_cut_10': 0.3353, 'judged_cut_10': 0.4343, 'ndcg_cut_20': 0.2956,
-             'judged_cut_20': 0.4329, 'map': 0.1772, 'recall_1000': 0.7196, 'judged_cut_1000': 0.1136},
+            {'topics': 35, 'ndcg_cut_10': 0.3350, 'judged_cut_10': 0.4343, 'ndcg_cut_20': 0.2954,
+             'judged_cut_20': 0.4329, 'map': 0.1772, 'recall_1000': 0.7196, 'judged_cut_1000': 0.1137},
         'anserini.covid-r2.fusion1.txt':
             {'topics': 35, 'ndcg_cut_10': 0.3297, 'judged_cut_10': 0.4657, 'ndcg_cut_20': 0.3060,
              'judged_cut_20': 0.4643, 'map': 0.1914, 'recall_1000': 0.7561, 'judged_cut_1000': 0.1304},
@@ -216,7 +216,7 @@ def main():
             {'topics': 35, 'ndcg_cut_10': 0.4827, 'judged_cut_10': 0.9543, 'ndcg_cut_20': 0.4512,
              'judged_cut_20': 0.8614, 'map': 0.2431, 'recall_1000': 0.6475, 'judged_cut_1000': 0.1463},
         'anserini.final-r2.fusion2.txt':
-            {'topics': 35, 'ndcg_cut_10': 0.5553, 'judged_cut_10': 0.9743, 'ndcg_cut_20': 0.5058,
+            {'topics': 35, 'ndcg_cut_10': 0.5553, 'judged_cut_10': 0.9714, 'ndcg_cut_20': 0.5058,
              'judged_cut_20': 0.8957, 'map': 0.2739, 'recall_1000': 0.6832, 'judged_cut_1000': 0.1528},
     }
     evaluate_runs(round2_qrels, final_runs, expected=expected_metrics, check_md5=check_md5_flag)