#!/usr/bin/env python3
import pycurl
from io import BytesIO
import ast
import subprocess
import pandas as pd
import argparse
from bs4 import BeautifulSoup
import numpy as np
import os
import json
import sys
import itertools

## Helpers
base_cert_url = "https://cms-service-dqmdc.web.cern.ch/CAF/certification/"
base_cert_path = "/eos/user/c/cmsdqm/www/CAF/certification/"

def get_url_clean(url):
    """Fetch a URL with pycurl and return its text content stripped of HTML tags."""
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()

    return BeautifulSoup(buffer.getvalue(), "lxml").text

def get_lumi_ranges(i):
    """Collapse a sorted iterable of lumisection numbers into contiguous [first, last] ranges."""
    result = []
    for _, b in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0]):
        b = list(b)
        result.append([b[0][1], b[-1][1]])
    return result

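# For illustration, get_lumi_ranges([1, 2, 3, 7, 8, 10]) returns [[1, 3], [7, 8], [10, 10]]:
# consecutive lumisections share the same (value - index) and are grouped together.
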
def das_do_command(cmd):
    """Run a dasgoclient command in a shell and return its output split into lines."""
    out = subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode('utf8')
    return out.split("\n")

def das_key(dataset):
    """Build the DAS query key: block names contain '#', dataset names do not."""
    return 'dataset='+dataset if "#" not in dataset else 'block='+dataset

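# For example, das_key('/ZeroBias/Run2024J-v1/RAW') gives 'dataset=/ZeroBias/Run2024J-v1/RAW',
# while a block name containing '#' yields a 'block=...' key instead.
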
def das_file_site(dataset, site):
    """List the files of a dataset available at a given site as a one-column dataframe."""
    cmd = "dasgoclient --query='file %s site=%s'"%(das_key(dataset),site)
    out = das_do_command(cmd)
    df = pd.DataFrame(out,columns=["file"])

    return df

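# For example, with site="T2_CH_CERN" the query above becomes
#   dasgoclient --query='file dataset=/DisplacedJet/Run2024C-v1/RAW site=T2_CH_CERN'
# (dataset name reused from the help string below; the site is an arbitrary example).
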
def das_file_data(dataset,opt=""):
    """Return a dataframe of file name and number of events for a dataset."""
    cmd = "dasgoclient --query='file %s %s| grep file.name, file.nevents'"%(das_key(dataset),opt)
    out = das_do_command(cmd)
    # in the space-split output, index 0 holds the file name and index 3 the event count
    out = [np.array(r.split(" "))[[0,3]] for r in out if len(r) > 0]

    df = pd.DataFrame(out,columns=["file","events"])
    df.events = df.events.values.astype(int)

    return df

def das_lumi_data(dataset,opt=""):
    """Return a dataframe of file, run and lumis for a dataset."""
    cmd = "dasgoclient --query='file,lumi,run %s %s'"%(das_key(dataset),opt)

    out = das_do_command(cmd)
    out = [r.split(" ") for r in out if len(r)>0]

    df = pd.DataFrame(out,columns=["file","run","lumis"])

    return df

def das_run_events_data(dataset,run,opt=""):
    """Return the total number of events of a dataset in a given run."""
    cmd = "dasgoclient --query='file %s run=%s %s | sum(file.nevents) '"%(das_key(dataset),run,opt)
    out = das_do_command(cmd)[0]

    # drop the 'sum(...)' label and keep the numeric value
    out = [o for o in out.split(" ") if "sum" not in o]
    out = int([r.split(" ") for r in out if len(r)>0][0][0])

    return out

def das_run_data(dataset,opt=""):
    """Return the list of runs of a dataset."""
    cmd = "dasgoclient --query='run %s %s '"%(das_key(dataset),opt)
    out = das_do_command(cmd)

    return out

def no_intersection():
    # relies on best_json and dataset being defined in __main__
    print("No intersection between:")
    print(" - json   : ", best_json)
    print(" - dataset: ", dataset)
    print("Exiting.")
    sys.exit(1)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset','-d', default=None, help="Dataset name (e.g. '/DisplacedJet/Run2024C-v1/RAW'); may also be a block (e.g. '/ZeroBias/Run2024J-v1/RAW#d8058bab-4e55-45b0-abb6-405aa3abc2af')",type=str,required=True)
    parser.add_argument('--threshold','-t', help="Event threshold per file",type=int,default=-1)
    parser.add_argument('--events','-e', help="Total number of events targeted",type=int,default=-1)
    parser.add_argument('--outfile','-o', help='Dump results to file', type=str, default=None)
    parser.add_argument('--pandas', '-pd',action='store_true',help="Store the whole dataset (no event or threshold cut) in a csv")
    parser.add_argument('--proxy','-p', help='Path to an x509 proxy to use, if needed', type=str, default=None)
    parser.add_argument('--site','-s', help='Only consider data at a specific site', type=str, default=None)
    parser.add_argument('--lumis','-l', help='Output file for the lumi ranges of the selected files (if blank, no lumi ranges are calculated)', type=str, default=None)
    parser.add_argument('--precheck','-pc', action='store_true', help='Check run by run before building the dataframes, to avoid huge caching.')
    parser.add_argument('--nogolden','-ng', action='store_true', help='Do not crosscheck the dataset runs and lumis against a Golden json for data certification')
    parser.add_argument('--run','-r', help="Target one or more specific runs",type=int,default=None,nargs="+")
    args = parser.parse_args()

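    # Illustrative invocation (dataset taken from the help string above; the event
    # target and output file name are arbitrary placeholders):
    #   <this-script> -d /DisplacedJet/Run2024C-v1/RAW -e 100000 -o files.txt
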
    if args.proxy is not None:
        os.environ["X509_USER_PROXY"] = args.proxy
    elif "X509_USER_PROXY" not in os.environ:
        print("No X509 proxy set. Exiting.")
        sys.exit(1)

    ## Check if we are in the cms-bot "environment"
    testing = "JENKINS_PREFIX" in os.environ
    dataset   = args.dataset
    events    = args.events
    threshold = args.threshold
    outfile   = args.outfile
    site      = args.site
    lumis     = args.lumis
    runs      = args.run
    das_opt = ""

    if runs is not None:
        das_opt = "run in %s"%(str([int(r) for r in runs]))

    if not args.nogolden:

        ## pick the golden json covering the widest run range
        year = dataset.split("Run")[1][2:4] # from 20XX to XX
        PD = dataset.split("/")[1]
        cert_type = "Collisions" + str(year)
        if "Cosmics" in dataset:
            cert_type = "Cosmics" + str(year)
        elif "Commissioning" in dataset:
            cert_type = "Commissioning2020"
        elif "HI" in PD:
            cert_type = "Collisions" + str(year) + "HI"

        cert_path = base_cert_path + cert_type + "/"
        web_fallback = False

        ## if we have access to eos we get the json from there ...
        if os.path.isdir(cert_path):
            json_list = os.listdir(cert_path)
            if len(json_list) == 0:
                web_fallback = True
            json_list = [c for c in json_list if "Golden" in c and "era" not in c]
            json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")]
        else:
            web_fallback = True
        ## ... if not, we go to the website
        if web_fallback:
            cert_url = base_cert_url + cert_type + "/"
            json_list = get_url_clean(cert_url).split("\n")
            json_list = [c for c in json_list if "Golden" in c and "era" not in c and "Cert_C" in c]
            json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list]

        # the larger the range, the better, assuming the file naming schema
        # Cert_X_RunStart_RunFinish_Type.json
        # TODO if args.run, keep only the golden json with the right range

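        # e.g. a hypothetical Cert_Collisions2024_378981_386951_Golden.json
        # would span 386951 - 378981 = 7970 runs
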
        run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list]
        best_json = str(json_list[np.argmax(run_ranges)])
        if not web_fallback:
            with open(cert_path + "/" + best_json) as js:
                golden = json.load(js)
        else:
            golden = get_url_clean(cert_url + best_json)
            golden = ast.literal_eval(golden) # converts the string to a dict

        # skim for the runs given in input
        if runs is not None:
            # iterate over a copy: popping while iterating the dict itself raises
            # a RuntimeError; golden json keys are strings, so compare as int
            for k in list(golden):
                if int(k) not in runs:
                    golden.pop(k)

        # golden json with all the lumisections flattened out
        golden_flat = {}
        for k in golden:
            R = []
            for r in golden[k]:
                R = R + list(range(r[0], r[1]+1))
            golden_flat[k] = R
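
        # e.g. a golden entry {"356001": [[1, 5], [10, 12]]} flattens to
        # {"356001": [1, 2, 3, 4, 5, 10, 11, 12]} (the run number is illustrative)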

        # let's just check there's an intersection between the
        # dataset and the json
        data_runs = das_run_data(dataset)
        golden_data_runs = [r for r in data_runs if r in golden_flat]

        if (len(golden_data_runs)==0):
            no_intersection()

        # building the dataframe, cleaning for bad lumis
        golden_data_runs_tocheck = golden_data_runs

        if testing or args.precheck:
            golden_data_runs_tocheck = []
            # Here we check run by run.
            # This implies more dasgoclient queries but smaller outputs,
            # useful when running the IB/PR tests, so as not to have huge
            # query results that have to be cached.

            sum_events = 0

            for r in golden_data_runs:
                sum_events = sum_events + int(das_run_events_data(dataset,r))
                golden_data_runs_tocheck.append(r)
                if events > 0 and sum_events > events:
                    break

            das_opt = "run in %s"%(str([int(g) for g in golden_data_runs_tocheck]))

    # merge file information with run and lumis
    df = das_lumi_data(dataset,opt=das_opt).merge(das_file_data(dataset,opt=das_opt),on="file",how="inner")
    # das returns the lumi list as a string like "[1,2,3]": strip the brackets and convert to ints
    df["lumis"] = [[int(ff) for ff in f.replace("[","").replace("]","").split(",")] for f in df.lumis.values]

    if not args.nogolden:

        df_rs = []
        for r in golden_data_runs_tocheck:
            cut = (df["run"] == r)
            if not any(cut):
                continue

            df_r = df[cut]

            # skip runs with low event content
            if df_r["events"].sum() < threshold:
                continue

            # keep only files whose lumis are all in the golden json
            good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis])
            n_lumis = np.array([len(l) for l in df_r.lumis])
            df_rs.append(df_r[good_lumis==n_lumis])

        if (len(df_rs)==0):
            no_intersection()

        df = pd.concat(df_rs)

    df.loc[:,"min_lumi"] = [min(f) for f in df.lumis]
    df.loc[:,"max_lumi"] = [max(f) for f in df.lumis]
    df = df.sort_values(["run","min_lumi","max_lumi"])

    if site is not None:
        df = df.merge(das_file_site(dataset,site),on="file",how="inner")

    if args.pandas:
        df.to_csv(dataset.replace("/","")+".csv")

    if events > 0:
        df = df[df["events"] <= events] # skip files that are too big
        df.loc[:,"sum_evs"] = df.loc[:,"events"].cumsum()
        df = df[df["sum_evs"] < events]

    files = df.file

    if lumis is not None:
        lumi_ranges = {
            int(r): list(get_lumi_ranges(np.sort(np.concatenate(df.loc[df["run"]==r,"lumis"].values).ravel()).tolist()))
            for r in np.unique(df.run.values).tolist()
        }

        with open(lumis, 'w') as fp:
            json.dump(lumi_ranges, fp)

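    # The dumped json maps each run to its contiguous lumi ranges,
    # e.g. {"356001": [[1, 5], [10, 12]]} (the run number is illustrative).
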
    if outfile is not None:
        with open(outfile, 'w') as f:
            for line in files:
                f.write(f"{line}\n")
    else:
        print("\n".join(files))

    sys.exit(0)