From 8f48f8e40a37e5f6b5910a3a3b5c050a0f9be914 Mon Sep 17 00:00:00 2001
From: Jimmy Lin <jimmylin@uwaterloo.ca>
Date: Sat, 25 Apr 2020 12:27:45 -0400
Subject: [PATCH] Updating CORD-19 to 4/24 data drop (#1117)

---
 docs/experiments-covid.md | 57 ++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 27 deletions(-)

diff --git a/docs/experiments-covid.md b/docs/experiments-covid.md
index c440fabce9..108dee45d9 100644
--- a/docs/experiments-covid.md
+++ b/docs/experiments-covid.md
@@ -13,9 +13,9 @@ If you don't want to build the index yourself, you can download the latest pre-b
 
 | Type | Version | Size | Link| Checksum |
 |:-----|:--------|:-----|:----|:---------|
-| Title + Abstract | 2020-04-17 | 1.2G | [[Dropbox]](https://www.dropbox.com/s/xogxcrvyx75vxoj/lucene-index-covid-2020-04-17.tar.gz) | `d57b17eadb1b44fc336b4121c139a598`
-| Full-Text | 2020-04-17 | 2.2G | [[Dropbox]](https://www.dropbox.com/s/gs054ecxna5xm0f/lucene-index-covid-full-text-2020-04-17.tar.gz) | `677546e0a1b7855a48eee8b6fbd7d7af`
-| Paragraph | 2020-04-17 | 4.7G| [[Dropbox]](https://www.dropbox.com/s/u3a0z53pdaxekfe/lucene-index-covid-paragraph-2020-04-17.tar.gz) | `c11e46230b744a46747f84e49acc9c2b`
+| Abstract | 2020-04-24 | 1.3G | [[Dropbox]](https://www.dropbox.com/s/ntfg6ykr3ed3acn/lucene-index-cord19-abstract-2020-04-24.tar.gz) | `93540ae00e166ee433db7531e1bb51c8`
+| Full-Text | 2020-04-24 | 2.4G | [[Dropbox]](https://www.dropbox.com/s/twb1defsb19ss4x/lucene-index-cord19-full-text-2020-04-24.tar.gz) | `fa927b0fc9cf1cd382413039cdc7b736`
+| Paragraph | 2020-04-24 | 5.0G | [[Dropbox]](https://www.dropbox.com/s/xg2b4aapjvmx3ve/lucene-index-cord19-paragraph-2020-04-24.tar.gz) | `7c6de6298e0430b8adb3e03310db32d8`
 
 "Size" refers to the output of `ls -lh`, "Version" refers to the dataset release date from AI2.
 For our answer to the question, "which one should I use?" see below.
@@ -24,19 +24,19 @@ We've kept around older versions of the index for archival purposes &mdash; scro
 
 ## Data Prep
 
-The latest distribution available is from 2020/04/17.
+The latest distribution available is from 2020/04/24.
 First, download the data:
 
 ```bash
-DATE=2020-04-17
-DATA_DIR=./covid-"${DATE}"
+DATE=2020-04-24
+DATA_DIR=./cord19-"${DATE}"
 mkdir "${DATA_DIR}"
 
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/comm_use_subset.tar.gz -P "${DATA_DIR}"
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/noncomm_use_subset.tar.gz -P "${DATA_DIR}"
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/custom_license.tar.gz -P "${DATA_DIR}"
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/biorxiv_medrxiv.tar.gz -P "${DATA_DIR}"
-wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/"${DATE}"/metadata.csv -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/comm_use_subset.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/noncomm_use_subset.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/custom_license.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/biorxiv_medrxiv.tar.gz -P "${DATA_DIR}"
+wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/latest/metadata.csv -P "${DATA_DIR}"
 
 ls "${DATA_DIR}"/*.tar.gz | xargs -I {} tar -zxvf {} -C "${DATA_DIR}"
 # If the above doesn't work due to cross-OS compatibility issues with xargs, untar all folders individually
@@ -65,14 +65,14 @@ We can index titles and abstracts only with `CovidCollection`, as follows:
 sh target/appassembler/bin/IndexCollection \
   -collection CovidCollection -generator CovidGenerator \
   -threads 8 -input "${DATA_DIR}" \
-  -index "${DATA_DIR}"/lucene-index-covid-"${DATE}" \
-  -storePositions -storeDocvectors -storeContents -storeRaw > log.covid.${DATE}.txt
+  -index "${DATA_DIR}"/lucene-index-cord19-abstract-"${DATE}" \
+  -storePositions -storeDocvectors -storeContents -storeRaw > log.cord19-abstract."${DATE}".txt
 ```
 
 The output message should be something like this:
 
 ```bash
-2020-04-20 11:42:44,075 INFO  [main] index.IndexCollection (IndexCollection.java:879) - Total 52,389 documents indexed in 00:01:11
+2020-04-25 09:22:40,284 INFO  [main] index.IndexCollection (IndexCollection.java:879) - Total 57,356 documents indexed in 00:01:13
 ```
 
 The `contents` field of each Lucene document is a concatenation of the article's title and abstract.
@@ -85,14 +85,14 @@ We can index the full text, with `CovidFullTextCollection`, as follows:
 sh target/appassembler/bin/IndexCollection \
   -collection CovidFullTextCollection -generator CovidGenerator \
   -threads 8 -input "${DATA_DIR}" \
-  -index "${DATA_DIR}"/lucene-index-covid-full-text-"${DATE}" \
-  -storePositions -storeDocvectors -storeContents -storeRaw  > log.covid-full-text.${DATE}.txt
+  -index "${DATA_DIR}"/lucene-index-cord19-full-text-"${DATE}" \
+  -storePositions -storeDocvectors -storeContents -storeRaw > log.cord19-full-text."${DATE}".txt
 ```
 
 The output message should be something like this:
 
 ```bash
-2020-04-20 11:47:06,839 INFO  [main] index.IndexCollection (IndexCollection.java:879) - Total 52,391 documents indexed in 00:03:31
+2020-04-25 09:27:31,978 INFO  [main] index.IndexCollection (IndexCollection.java:879) - Total 57,359 documents indexed in 00:04:42
 ```
 
 The `contents` field of each Lucene document is a concatenation of the article's title and abstract, and the full text JSON (if available).
@@ -105,14 +105,14 @@ We can build a paragraph index with `CovidParagraphCollection`, as follows:
 sh target/appassembler/bin/IndexCollection \
   -collection CovidParagraphCollection -generator CovidGenerator \
   -threads 8 -input "${DATA_DIR}" \
-  -index "${DATA_DIR}"/lucene-index-covid-paragraph-"${DATE}" \
-  -storePositions -storeDocvectors -storeContents -storeRaw > log.covid-paragraph.${DATE}.txt
+  -index "${DATA_DIR}"/lucene-index-cord19-paragraph-"${DATE}" \
+  -storePositions -storeDocvectors -storeContents -storeRaw > log.cord19-paragraph."${DATE}".txt
 ```
 
 The output message should be something like this:
 
 ```bash
-2020-04-20 12:20:33,823 INFO  [main] index.IndexCollection (IndexCollection.java:879) - Total 1,455,628 documents indexed in 00:14:54
+2020-04-25 09:43:40,546 INFO  [main] index.IndexCollection (IndexCollection.java:879) - Total 1,689,378 documents indexed in 00:15:51
 ```
 
 In this configuration, the indexer creates multiple Lucene Documents for each source article:
@@ -173,8 +173,8 @@ solrini/bin/solr create -n anserini -c covid
 We can now index into Solr:
 
 ```
-DATE=2020-04-17
-DATA_DIR=./covid-"${DATE}"
+DATE=2020-04-24
+DATA_DIR=./cord19-"${DATE}"
 
 sh target/appassembler/bin/IndexCollection -collection CovidCollection -generator CovidGenerator \
    -threads 8 -input "${DATA_DIR}" \
@@ -190,16 +190,19 @@ All versions of pre-built indexes:
 
 | Type | Version | Size | Link| Checksum |
 |:-----|:--------|:-----|:----|:---------|
-| Title + Abstract | 2020-04-17 | 1.2G | [[Dropbox]](https://www.dropbox.com/s/xogxcrvyx75vxoj/lucene-index-covid-2020-04-17.tar.gz) | `d57b17eadb1b44fc336b4121c139a598`
-| Title + Abstract | 2020-04-10 | 1.2G | [[Dropbox]](https://www.dropbox.com/s/j55t617yhvmegy8/lucene-index-covid-2020-04-10.tar.gz) | `ec239d56498c0e7b74e3b41e1ce5d42a`
-| Title + Abstract | 2020-04-03 | 1.1G | [[Dropbox]](https://www.dropbox.com/s/d6v9fensyi7q3gb/lucene-index-covid-2020-04-03.tar.gz) | `5d0d222e746d522a75f94240f5ab9f23`
-| Title + Abstract | 2020-03-27 | 1.1G | [[Dropbox]](https://www.dropbox.com/s/j1epbu4ufunbbzv/lucene-index-covid-2020-03-27.tar.gz) | `c5f7247e921c80f41ac6b54ff38eb229`
-| Title + Abstract | 2020-03-20 | 1.0G | [[Dropbox]](https://www.dropbox.com/s/uvjwgy4re2myq5s/lucene-index-covid-2020-03-20.tar.gz) | `281c632034643665d52a544fed23807a`
+| Abstract | 2020-04-24 | 1.3G | [[Dropbox]](https://www.dropbox.com/s/ntfg6ykr3ed3acn/lucene-index-cord19-abstract-2020-04-24.tar.gz) | `93540ae00e166ee433db7531e1bb51c8`
+| Abstract | 2020-04-17 | 1.2G | [[Dropbox]](https://www.dropbox.com/s/xogxcrvyx75vxoj/lucene-index-covid-2020-04-17.tar.gz) | `d57b17eadb1b44fc336b4121c139a598`
+| Abstract | 2020-04-10 | 1.2G | [[Dropbox]](https://www.dropbox.com/s/j55t617yhvmegy8/lucene-index-covid-2020-04-10.tar.gz) | `ec239d56498c0e7b74e3b41e1ce5d42a`
+| Abstract | 2020-04-03 | 1.1G | [[Dropbox]](https://www.dropbox.com/s/d6v9fensyi7q3gb/lucene-index-covid-2020-04-03.tar.gz) | `5d0d222e746d522a75f94240f5ab9f23`
+| Abstract | 2020-03-27 | 1.1G | [[Dropbox]](https://www.dropbox.com/s/j1epbu4ufunbbzv/lucene-index-covid-2020-03-27.tar.gz) | `c5f7247e921c80f41ac6b54ff38eb229`
+| Abstract | 2020-03-20 | 1.0G | [[Dropbox]](https://www.dropbox.com/s/uvjwgy4re2myq5s/lucene-index-covid-2020-03-20.tar.gz) | `281c632034643665d52a544fed23807a`
+| Full-Text | 2020-04-24 | 2.4G | [[Dropbox]](https://www.dropbox.com/s/twb1defsb19ss4x/lucene-index-cord19-full-text-2020-04-24.tar.gz) | `fa927b0fc9cf1cd382413039cdc7b736`
 | Full-Text | 2020-04-17 | 2.2G | [[Dropbox]](https://www.dropbox.com/s/gs054ecxna5xm0f/lucene-index-covid-full-text-2020-04-17.tar.gz) | `677546e0a1b7855a48eee8b6fbd7d7af`
 | Full-Text | 2020-04-10 | 3.3G | [[Dropbox]](https://www.dropbox.com/s/gtq2c3xq81mjowk/lucene-index-covid-full-text-2020-04-10.tar.gz) | `401a6f5583b0f05340c73fbbeb3279c8`
 | Full-Text | 2020-04-03 | 3.0G | [[Dropbox]](https://www.dropbox.com/s/abhuqks7aa1xs79/lucene-index-covid-full-text-2020-04-03.tar.gz) | `9aafb86fec39e0882bd9ef0688d7a9cc`
 | Full-Text | 2020-03-27 | 2.9G | [[Dropbox]](https://www.dropbox.com/s/hjsf7qldn4t10vm/lucene-index-covid-full-text-2020-03-27.tar.gz) | `3c126344f9711720e6cf627c9bc415eb`
 | Full-Text | 2020-03-20 | 2.6G | [[Dropbox]](https://www.dropbox.com/s/w74nmpmvdgw7o00/lucene-index-covid-full-text-2020-03-20.tar.gz) | `30cae90b85fa8f1b53acaa62413756e3`
+| Paragraph | 2020-04-24 | 5.0G | [[Dropbox]](https://www.dropbox.com/s/xg2b4aapjvmx3ve/lucene-index-cord19-paragraph-2020-04-24.tar.gz) | `7c6de6298e0430b8adb3e03310db32d8`
 | Paragraph | 2020-04-17 | 4.7G| [[Dropbox]](https://www.dropbox.com/s/u3a0z53pdaxekfe/lucene-index-covid-paragraph-2020-04-17.tar.gz) | `c11e46230b744a46747f84e49acc9c2b`
 | Paragraph | 2020-04-10 | 3.4G| [[Dropbox]](https://www.dropbox.com/s/ivk87journyajw3/lucene-index-covid-paragraph-2020-04-10.tar.gz) | `8b87a2c55bc0a15b87f11e796860216a`
 | Paragraph | 2020-04-03 | 3.1G| [[Dropbox]](https://www.dropbox.com/s/rfzxrrstwlck4wh/lucene-index-covid-paragraph-2020-04-03.tar.gz) | `523894cfb52fc51c4202e76af79e1b10`