From 4f5a6613f21c785e45b83270fac9be18f2c1f5dc Mon Sep 17 00:00:00 2001 From: Alexander Held <45009355+alexander-held@users.noreply.github.com> Date: Fri, 4 Aug 2023 13:29:44 +0200 Subject: [PATCH] fix: sync changes from #185 to script (#186) * run jupytext to sync notebook changes to script version --- .../ttbar_analysis_pipeline.py | 51 ++++++++++--------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py index c62c25e7..dde37474 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # format_version: '1.3' -# jupytext_version: 1.14.1 +# jupytext_version: 1.14.7 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -40,7 +40,7 @@ # %% [markdown] # ### Imports: setting up our environment -# %% tags=[] +# %% import asyncio import logging import os @@ -89,7 +89,7 @@ # # The input files are all in the 1–3 GB range. -# %% tags=[] +# %% ### GLOBAL CONFIGURATION # input files per process, set to e.g. 10 (smaller number = faster) N_FILES_MAX_PER_SAMPLE = 5 @@ -114,7 +114,7 @@ # - calculating systematic uncertainties at the event and object level, # - filling all the information into histograms that get aggregated and ultimately returned to us by `coffea`. -# %% tags=[] +# %% # functions creating systematic variations def flat_variation(ones): # 2.5% weight variations @@ -316,8 +316,13 @@ def postprocess(self, accumulator): # # Here, we gather all the required information about the files we want to process: paths to the files and asociated metadata. -# %% tags=[] -fileset = utils.construct_fileset(N_FILES_MAX_PER_SAMPLE, use_xcache=False, af_name=config["benchmarking"]["AF_NAME"]) # local files on /data for ssl-dev +# %% +fileset = utils.construct_fileset( + N_FILES_MAX_PER_SAMPLE, + use_xcache=False, + af_name=config["benchmarking"]["AF_NAME"], + input_from_eos=config["benchmarking"]["INPUT_FROM_EOS"] + ) # local files on /data for ssl-dev as af_name print(f"processes in fileset: {list(fileset.keys())}") print(f"\nexample of information in fileset:\n{{\n 'files': [{fileset['ttbar__nominal']['files'][0]}, ...],") @@ -329,7 +334,7 @@ def postprocess(self, accumulator): # # Define the func_adl query to be used for the purpose of extracting columns and filtering. -# %% tags=[] +# %% def get_query(source: ObjectStream) -> ObjectStream: """Query for event / column selection: >=4j >=1b, ==1 lep with pT>25 GeV, return relevant columns """ @@ -355,7 +360,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # # Using the queries created with `func_adl`, we are using `ServiceX` to read the CMS Open Data files to build cached files with only the specific event information as dictated by the query. -# %% tags=[] +# %% if USE_SERVICEX: # dummy dataset on which to generate the query dummy_ds = ServiceXSourceUpROOT("cernopendata://dummy", "Events", backend_name="uproot") @@ -385,7 +390,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # # When `USE_SERVICEX` is false, the input files need to be processed during this step as well. -# %% tags=[] +# %% NanoAODSchema.warn_missing_crossrefs = False # silences warnings about branches we will not use here if USE_DASK: executor = processor.DaskExecutor(client=utils.get_client(af=config["global"]["AF"])) @@ -410,7 +415,7 @@ def get_query(source: ObjectStream) -> ObjectStream: print(f"\nexecution took {exec_time:.2f} seconds") -# %% tags=[] +# %% # track metrics dataset_source = "/data" if fileset["ttbar__nominal"]["files"][0].startswith("/data") else "https://xrootd-local.unl.edu:1094" # TODO: xcache support metrics.update({ @@ -448,7 +453,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # Let's have a look at the data we obtained. # We built histograms in two phase space regions, for multiple physics processes and systematic variations. -# %% tags=[] +# %% utils.set_style() all_histograms[110j::hist.rebin(2), "4j1b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1, edgecolor="grey") @@ -456,7 +461,7 @@ def get_query(source: ObjectStream) -> ObjectStream: plt.title(">= 4 jets, 1 b-tag") plt.xlabel("HT [GeV]"); -# %% tags=[] +# %% all_histograms[:, "4j2b", :, "nominal"].stack("process")[::-1].plot(stack=True, histtype="fill", linewidth=1,edgecolor="grey") plt.legend(frameon=False) plt.title(">= 4 jets, >= 2 b-tags") @@ -471,7 +476,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # # We are making of [UHI](https://uhi.readthedocs.io/) here to re-bin. -# %% tags=[] +# %% # b-tagging variations all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "nominal"].plot(label="nominal", linewidth=2) all_histograms[110j::hist.rebin(2), "4j1b", "ttbar", "btag_var_0_up"].plot(label="NP 1", linewidth=2) @@ -482,7 +487,7 @@ def get_query(source: ObjectStream) -> ObjectStream: plt.xlabel("HT [GeV]") plt.title("b-tagging variations"); -# %% tags=[] +# %% # jet energy scale variations all_histograms[:, "4j2b", "ttbar", "nominal"].plot(label="nominal", linewidth=2) all_histograms[:, "4j2b", "ttbar", "pt_scale_up"].plot(label="scale up", linewidth=2) @@ -497,7 +502,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # We'll save everything to disk for subsequent usage. # This also builds pseudo-data by combining events from the various simulation setups we have processed. -# %% tags=[] +# %% utils.save_histograms(all_histograms, fileset, "histograms.root") # %% [markdown] @@ -510,7 +515,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # A statistical model has been defined in `config.yml`, ready to be used with our output. # We will use `cabinetry` to combine all histograms into a `pyhf` workspace and fit the resulting statistical model to the pseudodata we built. -# %% tags=[] +# %% config = cabinetry.configuration.load("cabinetry_config.yml") # rebinning: lower edge 110 GeV, merge bins 2->1 @@ -523,13 +528,13 @@ def get_query(source: ObjectStream) -> ObjectStream: # %% [markdown] # We can inspect the workspace with `pyhf`, or use `pyhf` to perform inference. -# %% tags=[] +# %% # !pyhf inspect workspace.json | head -n 20 # %% [markdown] # Let's try out what we built: the next cell will perform a maximum likelihood fit of our statistical model to the pseudodata we built. -# %% tags=[] +# %% model, data = cabinetry.model_utils.model_and_data(ws) fit_results = cabinetry.fit.fit(model, data) @@ -540,7 +545,7 @@ def get_query(source: ObjectStream) -> ObjectStream: # %% [markdown] # For this pseudodata, what is the resulting ttbar cross-section divided by the Standard Model prediction? -# %% tags=[] +# %% poi_index = model.config.poi_index print(f"\nfit result for ttbar_norm: {fit_results.bestfit[poi_index]:.3f} +/- {fit_results.uncertainty[poi_index]:.3f}") @@ -548,23 +553,23 @@ def get_query(source: ObjectStream) -> ObjectStream: # Let's also visualize the model before and after the fit, in both the regions we are using. # The binning here corresponds to the binning used for the fit. -# %% tags=[] +# %% model_prediction = cabinetry.model_utils.prediction(model) figs = cabinetry.visualize.data_mc(model_prediction, data, close_figure=True, config=config) figs[0]["figure"] -# %% tags=[] +# %% figs[1]["figure"] # %% [markdown] # We can see very good post-fit agreement. -# %% tags=[] +# %% model_prediction_postfit = cabinetry.model_utils.prediction(model, fit_results=fit_results) figs = cabinetry.visualize.data_mc(model_prediction_postfit, data, close_figure=True, config=config) figs[0]["figure"] -# %% tags=[] +# %% figs[1]["figure"] # %% [markdown]