atlas-2024-odfr-hi: add file index generation script

cernopendata · Nov 25, 2024 · 8d878d0 · 8d878d0
1 parent 1be28d9
commit 8d878d0
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,7 @@ venv/
 *.err
 *.pyc
 atlas-2024-odfr/test
+atlas-2024-odfr-hi/test
 cms-2010-collision-datasets/outputs/*.json
 cms-2010-simulated-datasets/outputs/*.json
 cms-2011-collision-datasets/code/das.py

diff --git a/README.rst b/README.rst
@@ -32,6 +32,7 @@ Specific data ingestion and curation campaigns:
 - `atlas-2016-masterclasses <atlas-2016-masterclasses>`_ -- helper scripts for the ATLAS 2016 masterclasses release
 - `atlas-2016-outreach <atlas-2016-outreach>`_ -- helper scripts for the ATLAS 2016 outreach release
 - `atlas-2024-odfr <atlas-2024-odfr>`_ -- helper scripts for the ATLAS 2024 Open Data For Research release
+- `atlas-2024-odfr-hi <atlas-2024-odfr-hi>`_ -- helper scripts for the ATLAS 2024 Open Data For Research heavy ion release
 - `cms-2010-collision-datasets <cms-2010-collision-datasets>`_ -- helper scripts for the CMS 2010 open data release (collision datasets)
 - `cms-2010-simulated-datasets <cms-2010-simulated-datasets>`_ -- helper scripts for the CMS 2010 open data release (simulated datasets)
 - `cms-2011-collision-datasets <cms-2011-collision-datasets>`_ -- helper scripts for the CMS 2011 open data release (collision datasets)

diff --git a/atlas-2024-odfr-hi/README.md b/atlas-2024-odfr-hi/README.md
@@ -27,3 +27,11 @@ From running the scripts and the transfers, a number of metadata json records ar
 
 To generate the open data records themselves, a final script is provided, `mk_hi_json.py`. This script takes in the above-created text and json files and attempts to stitch together the actual open data portal json files. Three json files are created for records: one for MC, one for data, and one to link them. Individual json files are also created for each dataset with the file information for that dataset.
 
+Finally, the records are to be enriched with file indexes by means of the `create_file_indexes.py` script:
+
+```
+$ python ./create_file_indexes.py > test/x.sh
+$ cd test && zip -r x.zip x.sh eos-file-indexes
+```
+
+The generated helper script `x.sh` is to be executed on LXPLUS by the CERN Open Data team to copy the generated EOS file indexes to the expected place in EOSPUBLIC.
diff --git a/atlas-2024-odfr-hi/create_file_indexes.py b/atlas-2024-odfr-hi/create_file_indexes.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import sys
+import zlib
+
+os.makedirs("test/eos-file-indexes", exist_ok=True)
+os.makedirs("test/records", exist_ok=True)
+
+
+def get_file_size(afile):
+    "Return file size of a file."
+    return os.path.getsize(afile)
+
+
+def get_file_checksum(afile):
+    """Return the ADLER32 checksum of a file."""
+    checksum = zlib.adler32(open(afile, "rb").read(), 1) & 0xFFFFFFFF
+    checksum = "{:#010x}".format(checksum).split("0x")[1]
+    return checksum
+
+
+for AFIXTUREFILE in [
+    "test/atlas-hi-2024-hi-2015-data.json",
+    "test/atlas-hi-2024-mc-hi-minbias.json",
+    "test/atlas-hi-2024-summary.json",
+]:
+
+    with open(AFIXTUREFILE, "r") as fdesc:
+        records = json.loads(fdesc.read())
+
+        for record in records:
+
+            # first, fix the license information
+            record["license"]["attribution"] = "CC0"
+
+            # second, fix the file information
+            files_new = []
+            for afile in record.get("files", []):
+                afilename = afile["filename"]
+
+                basename = os.path.basename(afilename)
+                basename = basename.replace("_filelist.json", "")
+
+                prefixes = []
+
+                with open(f"test/{afilename}", "r") as fdr:
+                    rootfileinfos = json.loads(fdr.read())
+
+                    for rootfileinfo in rootfileinfos:
+                        rootfileinfo["checksum"] = rootfileinfo["checksum"].replace(
+                            "adler32", "adler32:"
+                        )
+                        prefix = rootfileinfo["filename"].split(":", 1)[0]
+                        if prefix not in prefixes:
+                            prefixes.append(prefix)
+                        del rootfileinfo["events"]
+                        del rootfileinfo["type"]
+                        rootfileinfo["uri"] = rootfileinfo["uri_root"].replace(
+                            ":1094//eos/opendata", "//eos/opendata"
+                        )
+                        del rootfileinfo["uri_root"]
+
+                if len(prefixes) > 1:
+                    print("[ERROR] Several prefixes found: {prefixes}")
+                    sys.exit(1)
+
+                prefix = prefixes[0]
+
+                with open(
+                    f"test/eos-file-indexes/{prefix}_{basename}_file_index.txt", "w"
+                ) as fdw:
+                    for rootfileinfo in rootfileinfos:
+                        fdw.write(rootfileinfo["uri"] + "\n")
+
+                with open(
+                    f"test/eos-file-indexes/{prefix}_{basename}_file_index.json", "w"
+                ) as fdw:
+                    new_content = json.dumps(
+                        rootfileinfos,
+                        indent=2,
+                        sort_keys=True,
+                        ensure_ascii=False,
+                        separators=(",", ": "),
+                    )
+                    fdw.write(new_content + "\n")
+
+                files_new.append(
+                    {
+                        "checksum": f"adler32:{get_file_checksum(f'test/eos-file-indexes/{prefix}_{basename}_file_index.json')}",
+                        "size": get_file_size(
+                            f"test/eos-file-indexes/{prefix}_{basename}_file_index.json"
+                        ),
+                        "type": "index.json",
+                        "uri": f"root://eospublic.cern.ch//eos/opendata/atlas/rucio/{prefix}/file-indexes/{prefix}_{basename}_file_index.json",
+                    }
+                )
+                files_new.append(
+                    {
+                        "checksum": f"adler32:{get_file_checksum(f'test/eos-file-indexes/{prefix}_{basename}_file_index.json')}",
+                        "size": get_file_size(
+                            f"test/eos-file-indexes/{prefix}_{basename}_file_index.json"
+                        ),
+                        "type": "index.txt",
+                        "uri": f"root://eospublic.cern.ch//eos/opendata/atlas/rucio/{prefix}/file-indexes/{prefix}_{basename}_file_index.txt",
+                    }
+                )
+                record["files"] = files_new
+
+                # print EOS copy command statements
+                print(f"eos mkdir -p /eos/opendata/atlas/rucio/{prefix}/file-indexes")
+                print(
+                    f"eos cp eos-file-indexes/{prefix}_{basename}_file_index.json /eos/opendata/atlas/rucio/{prefix}/file-indexes"
+                )
+                print(
+                    f"eos cp eos-file-indexes/{prefix}_{basename}_file_index.txt /eos/opendata/atlas/rucio/{prefix}/file-indexes"
+                )
+
+        new_content = json.dumps(
+            records,
+            indent=2,
+            sort_keys=True,
+            ensure_ascii=False,
+            separators=(",", ": "),
+        )
+
+        with open(f"test/records/{os.path.basename(AFIXTUREFILE)}", "w") as fdesc:
+            fdesc.write(new_content + "\n")