Skip to content

Commit

Permalink
atlas-2024-odfr-hi: add file index generation script
Browse files Browse the repository at this point in the history
  • Loading branch information
tiborsimko committed Nov 25, 2024
1 parent 1be28d9 commit 8d878d0
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ venv/
*.err
*.pyc
atlas-2024-odfr/test
atlas-2024-odfr-hi/test
cms-2010-collision-datasets/outputs/*.json
cms-2010-simulated-datasets/outputs/*.json
cms-2011-collision-datasets/code/das.py
Expand Down
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Specific data ingestion and curation campaigns:
- `atlas-2016-masterclasses <atlas-2016-masterclasses>`_ -- helper scripts for the ATLAS 2016 masterclasses release
- `atlas-2016-outreach <atlas-2016-outreach>`_ -- helper scripts for the ATLAS 2016 outreach release
- `atlas-2024-odfr <atlas-2024-odfr>`_ -- helper scripts for the ATLAS 2024 Open Data For Research release
- `atlas-2024-odfr-hi <atlas-2024-odfr-hi>`_ -- helper scripts for the ATLAS 2024 Open Data For Research heavy ion release
- `cms-2010-collision-datasets <cms-2010-collision-datasets>`_ -- helper scripts for the CMS 2010 open data release (collision datasets)
- `cms-2010-simulated-datasets <cms-2010-simulated-datasets>`_ -- helper scripts for the CMS 2010 open data release (simulated datasets)
- `cms-2011-collision-datasets <cms-2011-collision-datasets>`_ -- helper scripts for the CMS 2011 open data release (collision datasets)
Expand Down
8 changes: 8 additions & 0 deletions atlas-2024-odfr-hi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,11 @@ From running the scripts and the transfers, a number of metadata json records ar

To generate the open data records themselves, a final script is provided, `mk_hi_json.py`. This script takes in the above-created text and json files and attempts to stitch together the actual open data portal json files. Three json files are created for records: one for MC, one for data, and one to link them. Individual json files are also created for each dataset with the file information for that dataset.

Finally, the records are to be enriched with file indexes by means of the `create_file_indexes.py` script:

```
$ python ./create_file_indexes.py > test/x.sh
$ cd test && zip -r x.zip x.sh eos-file-indexes
```

The generated helper script `x.sh` is to be executed on LXPLUS by the CERN Open Data team to copy the generated EOS file indexes to the expected place in EOSPUBLIC.
129 changes: 129 additions & 0 deletions atlas-2024-odfr-hi/create_file_indexes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python3

import json
import os
import sys
import zlib

# Create the output directories up front; already-existing ones are fine.
for output_dir in ("test/eos-file-indexes", "test/records"):
    os.makedirs(output_dir, exist_ok=True)


def get_file_size(afile):
    """Return the size in bytes of the file at path ``afile``."""
    return os.stat(afile).st_size


def get_file_checksum(afile):
    """Return the ADLER32 checksum of a file.

    The file is opened in binary mode inside a ``with`` block so the handle
    is always closed, and it is read in fixed-size chunks so that large
    files are never loaded into memory in one piece.

    Args:
        afile: Path of the file to checksum.

    Returns:
        The checksum as an eight-character, zero-padded, lowercase
        hexadecimal string without the ``0x`` prefix.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    checksum = 1  # ADLER32 starting value, same as the zlib default
    with open(afile, "rb") as fdesc:
        while True:
            chunk = fdesc.read(1024 * 1024)
            if not chunk:
                break
            # feed the running value back in to checksum the whole stream
            checksum = zlib.adler32(chunk, checksum)
    return "{:08x}".format(checksum & 0xFFFFFFFF)


# Main processing loop.  For every fixture file:
#   * set the license attribution of each record to "CC0";
#   * for each file list a record references, write a plain-text and a JSON
#     EOS file index under test/eos-file-indexes/ and replace the record's
#     "files" with entries describing those two index files;
#   * print the `eos` commands that copy the generated indexes into place;
#   * write the updated records under test/records/.
# Standard output carries only the generated shell commands, so diagnostics
# are sent to standard error.
for AFIXTUREFILE in [
    "test/atlas-hi-2024-hi-2015-data.json",
    "test/atlas-hi-2024-mc-hi-minbias.json",
    "test/atlas-hi-2024-summary.json",
]:

    with open(AFIXTUREFILE, "r") as fdesc:
        records = json.load(fdesc)

    for record in records:

        # first, fix the license information
        record["license"]["attribution"] = "CC0"

        # second, fix the file information
        files_new = []
        for afile in record.get("files", []):
            afilename = afile["filename"]

            basename = os.path.basename(afilename)
            basename = basename.replace("_filelist.json", "")

            with open(f"test/{afilename}", "r") as fdr:
                rootfileinfos = json.load(fdr)

            # normalise each ROOT file entry in place and collect the
            # distinct prefixes (the part of "filename" before the first ":")
            prefixes = []
            for rootfileinfo in rootfileinfos:
                rootfileinfo["checksum"] = rootfileinfo["checksum"].replace(
                    "adler32", "adler32:"
                )
                prefix = rootfileinfo["filename"].split(":", 1)[0]
                if prefix not in prefixes:
                    prefixes.append(prefix)
                del rootfileinfo["events"]
                del rootfileinfo["type"]
                rootfileinfo["uri"] = rootfileinfo["uri_root"].replace(
                    ":1094//eos/opendata", "//eos/opendata"
                )
                del rootfileinfo["uri_root"]

            # exactly one prefix is required to name the index files
            if not prefixes:
                print(
                    f"[ERROR] No files listed in test/{afilename}",
                    file=sys.stderr,
                )
                sys.exit(1)
            if len(prefixes) > 1:
                print(
                    f"[ERROR] Several prefixes found: {prefixes}",
                    file=sys.stderr,
                )
                sys.exit(1)

            prefix = prefixes[0]
            index_name = f"{prefix}_{basename}_file_index"
            index_txt = f"test/eos-file-indexes/{index_name}.txt"
            index_json = f"test/eos-file-indexes/{index_name}.json"
            eos_dir = f"/eos/opendata/atlas/rucio/{prefix}/file-indexes"

            with open(index_txt, "w") as fdw:
                for rootfileinfo in rootfileinfos:
                    fdw.write(rootfileinfo["uri"] + "\n")

            with open(index_json, "w") as fdw:
                new_content = json.dumps(
                    rootfileinfos,
                    indent=2,
                    sort_keys=True,
                    ensure_ascii=False,
                    separators=(",", ": "),
                )
                fdw.write(new_content + "\n")

            # each entry describes its own index file: checksum and size are
            # taken from the file whose URI is recorded
            files_new.append(
                {
                    "checksum": f"adler32:{get_file_checksum(index_json)}",
                    "size": get_file_size(index_json),
                    "type": "index.json",
                    "uri": f"root://eospublic.cern.ch/{eos_dir}/{index_name}.json",
                }
            )
            files_new.append(
                {
                    "checksum": f"adler32:{get_file_checksum(index_txt)}",
                    "size": get_file_size(index_txt),
                    "type": "index.txt",
                    "uri": f"root://eospublic.cern.ch/{eos_dir}/{index_name}.txt",
                }
            )

            # print EOS copy command statements for this pair of index files
            print(f"eos mkdir -p {eos_dir}")
            print(f"eos cp eos-file-indexes/{index_name}.json {eos_dir}")
            print(f"eos cp eos-file-indexes/{index_name}.txt {eos_dir}")

        record["files"] = files_new

    new_content = json.dumps(
        records,
        indent=2,
        sort_keys=True,
        ensure_ascii=False,
        separators=(",", ": "),
    )

    with open(f"test/records/{os.path.basename(AFIXTUREFILE)}", "w") as fdesc:
        fdesc.write(new_content + "\n")

0 comments on commit 8d878d0

Please sign in to comment.