import pycurl
from io import BytesIO
import ast
import subprocess
import pandas as pd
import argparse
from bs4 import BeautifulSoup
import numpy as np
import os
import json
import sys
import itertools


base_cert_url = "https://cms-service-dqmdc.web.cern.ch/CAF/certification/"
base_cert_path = "/eos/user/c/cmsdqm/www/CAF/certification/"
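
# The Golden (certification) JSONs are looked for on EOS first, with the DQM
# certification web page as a fallback (see the web_fallback logic below).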

def get_url_clean(url):
    """Fetch a URL with pycurl and return its text content, stripped of HTML tags."""
    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()

    return BeautifulSoup(buffer.getvalue(), "lxml").text

def get_lumi_ranges(i):
    """Collapse a sorted list of lumisection numbers into contiguous [first, last] ranges."""
    result = []
    for _, b in itertools.groupby(enumerate(i), lambda pair: pair[1] - pair[0]):
        b = list(b)
        result.append([b[0][1], b[-1][1]])
    return result
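
# e.g. get_lumi_ranges([1, 2, 3, 7, 8, 10]) returns [[1, 3], [7, 8], [10, 10]]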

def das_do_command(cmd):
    """Run a shell command (here, a dasgoclient query) and return its stdout split into lines."""
    out = subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode('utf8')
    return out.split("\n")
0042
0043 def das_key(dataset):
0044 return 'dataset='+dataset if "#" not in dataset else 'block='+dataset
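
# e.g. das_key('/ZeroBias/Run2024J-v1/RAW') returns 'dataset=/ZeroBias/Run2024J-v1/RAW',
# while a block name containing '#' yields 'block=...'.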

def das_file_site(dataset, site):
    """Return a DataFrame with the files of a dataset (or block) available at a given site."""
    cmd = "dasgoclient --query='file %s site=%s'" % (das_key(dataset), site)
    out = das_do_command(cmd)
    df = pd.DataFrame(out, columns=["file"])

    return df
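
# e.g. das_file_site('/ZeroBias/Run2024J-v1/RAW', 'T0_CH_CERN_Disk') lists the files
# hosted at that site (the site name here is only illustrative).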
0052
0053 def das_file_data(dataset,opt=""):
0054 cmd = "dasgoclient --query='file %s %s| grep file.name, file.nevents'"%(das_key(dataset),opt)
0055 out = das_do_command(cmd)
0056 out = [np.array(r.split(" "))[[0,3]] for r in out if len(r) > 0]
0057
0058 df = pd.DataFrame(out,columns=["file","events"])
0059 df.events = df.events.values.astype(int)
0060
0061 return df
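
# e.g. the underlying DAS query for the default options reads:
#   dasgoclient --query='file dataset=/ZeroBias/Run2024J-v1/RAW | grep file.name, file.nevents'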
0062
0063 def das_lumi_data(dataset,opt=""):
0064
0065 cmd = "dasgoclient --query='file,lumi,run %s %s'"%(das_key(dataset),opt)
0066
0067 out = das_do_command(cmd)
0068 out = [r.split(" ") for r in out if len(r)>0]
0069
0070 df = pd.DataFrame(out,columns=["file","run","lumis"])
0071
0072 return df
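
# The 'lumis' column holds the raw string representation of the lumi list
# (e.g. '[1,2,3]'); it is parsed into lists of ints in the main block below.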

def das_run_events_data(dataset, run, opt=""):
    """Return the total number of events of a dataset (or block) in a given run."""
    cmd = "dasgoclient --query='file %s run=%s %s | sum(file.nevents)'" % (das_key(dataset), run, opt)
    out = das_do_command(cmd)[0]

    out = [o for o in out.split(" ") if "sum" not in o]
    out = int([r.split(" ") for r in out if len(r) > 0][0][0])

    return out
0082
0083 def das_run_data(dataset,opt=""):
0084 cmd = "dasgoclient --query='run %s %s '"%(das_key(dataset),opt)
0085 out = das_do_command(cmd)
0086
0087 return out

def no_intersection():
    """Print a diagnostic and exit when the dataset and the Golden JSON share no runs."""
    print("No intersection between:")
    print(" - json   : ", best_json)
    print(" - dataset: ", dataset)
    print("Exiting.")
    sys.exit(1)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset','-d', default=None, help="Dataset name (e.g. '/DisplacedJet/Run2024C-v1/RAW'); may also be a block (e.g. '/ZeroBias/Run2024J-v1/RAW#d8058bab-4e55-45b0-abb6-405aa3abc2af')", type=str, required=True)
    parser.add_argument('--threshold','-t', help="Minimum number of events per run (runs below the threshold are skipped)", type=int, default=-1)
    parser.add_argument('--events','-e', help="Total number of events targeted", type=int, default=-1)
    parser.add_argument('--outfile','-o', help='Dump results to file', type=str, default=None)
    parser.add_argument('--pandas','-pd', action='store_true', help="Store the whole dataset (no event or threshold cut) in a CSV")
    parser.add_argument('--proxy','-p', help='Path to an x509 proxy, if needed', type=str, default=None)
    parser.add_argument('--site','-s', help='Only consider data at a specific site', type=str, default=None)
    parser.add_argument('--lumis','-l', help='Output file for the lumi ranges of the selected files (if not set, no lumi ranges are computed)', type=str, default=None)
    parser.add_argument('--precheck','-pc', action='store_true', help='Check run by run before building the dataframes, to avoid huge caching.')
    parser.add_argument('--nogolden','-ng', action='store_true', help='Do not crosscheck the dataset runs and lumis with a Golden JSON for data certification')
    parser.add_argument('--run','-r', help="Target one or more specific runs", type=int, default=None, nargs="+")
    args = parser.parse_args()
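
    # Illustrative invocation (the script file name is assumed here):
    #   python3 das-up-to-nevents.py -d /ZeroBias/Run2024J-v1/RAW -e 100000 -o files.txt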

    if args.proxy is not None:
        os.environ["X509_USER_PROXY"] = args.proxy
    elif "X509_USER_PROXY" not in os.environ:
        print("No X509 proxy set. Exiting.")
        sys.exit(1)

    testing = "JENKINS_PREFIX" in os.environ
    dataset = args.dataset
    events = args.events
    threshold = args.threshold
    outfile = args.outfile
    site = args.site
    lumis = args.lumis
    runs = args.run
    das_opt = ""

    if runs is not None:
        das_opt = "run in %s" % (str([int(r) for r in runs]))

    if not args.nogolden:

        # pick the certification folder matching the dataset
        year = dataset.split("Run")[1][2:4]  # e.g. '.../Run2024C-...' -> '24'
        PD = dataset.split("/")[1]
        cert_type = "Collisions" + str(year)
        if "Cosmics" in dataset:
            cert_type = "Cosmics" + str(year)
        elif "Commissioning" in dataset:
            cert_type = "Commissioning2020"
        elif "HI" in PD:
            cert_type = "Collisions" + str(year) + "HI"

        cert_path = base_cert_path + cert_type + "/"
        web_fallback = False

        # if we have access to EOS we take the JSON list from there ...
        if os.path.isdir(cert_path):
            json_list = os.listdir(cert_path)
            if len(json_list) == 0:
                web_fallback = True
            json_list = [c for c in json_list if "Golden" in c and "era" not in c]
            json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")]
        else:
            web_fallback = True

        # ... otherwise we fall back to the certification web page
        if web_fallback:
            cert_url = base_cert_url + cert_type + "/"
            json_list = get_url_clean(cert_url).split("\n")
            json_list = [c for c in json_list if "Golden" in c and "era" not in c and "Cert_C" in c]
            json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list]
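
        # The JSONs are assumed to follow the usual naming schema
        # Cert_X_RunStart_RunFinish_Type.json, so fields 2 and 3 of the
        # underscore-split name are the first and last certified run.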

        # pick the JSON covering the widest run range
        run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list]
        latest_json = np.array(json_list[np.argmax(run_ranges)]).reshape(1, -1)[0].astype(str)
        best_json = str(latest_json[0])
        if not web_fallback:
            with open(cert_path + "/" + best_json) as js:
                golden = json.load(js)
        else:
            golden = get_url_clean(cert_url + best_json)
            golden = ast.literal_eval(golden)  # converts the string to a dict

        # keep only the runs explicitly requested, if any
        # (JSON keys are strings, so compare as ints; iterate over a copy
        # of the keys since we pop entries while looping)
        if runs is not None:
            for k in list(golden):
                if int(k) not in runs:
                    golden.pop(k)

        # flatten the Golden JSON: run -> explicit list of all certified lumisections
        golden_flat = {}
        for k in golden:
            R = []
            for r in golden[k]:
                R = R + [f for f in range(r[0], r[1] + 1)]
            golden_flat[k] = R
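
        # e.g. {"380115": [[1, 45], [48, 100]]} becomes {"380115": [1, 2, ..., 45, 48, ..., 100]}
        # (the run number here is only illustrative)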

        # check that the dataset and the Golden JSON intersect at all
        data_runs = das_run_data(dataset)
        golden_data_runs = [r for r in data_runs if r in golden_flat]

        if len(golden_data_runs) == 0:
            no_intersection()

        # In CI tests (JENKINS_PREFIX set) or with --precheck, query the event
        # count run by run and stop as soon as the target is reached, so that
        # only as many runs as needed are fetched (slow: one dasgoclient call per run).
        golden_data_runs_tocheck = golden_data_runs

        if testing or args.precheck:
            golden_data_runs_tocheck = []
            sum_events = 0

            for r in golden_data_runs:
                sum_events = sum_events + int(das_run_events_data(dataset, r))
                golden_data_runs_tocheck.append(r)
                if events > 0 and sum_events > events:
                    break

        das_opt = "run in %s" % (str([int(g) for g in golden_data_runs_tocheck]))
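
        # das_opt now reads e.g. "run in [380115, 380116]" (illustrative run numbers)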

    # merge the per-file run/lumi information with the per-file event counts
    df = das_lumi_data(dataset, opt=das_opt).merge(das_file_data(dataset, opt=das_opt), on="file", how="inner")
    # turn the raw lumi strings (e.g. '[1,2,3]') into lists of ints
    df["lumis"] = [[int(ff) for ff in f.replace("[", "").replace("]", "").split(",")] for f in df.lumis.values]

    if not args.nogolden:

        # keep, run by run, only the files whose lumisections are all certified
        df_rs = []
        for r in golden_data_runs_tocheck:
            cut = (df["run"] == r)
            if not any(cut):
                continue

            df_r = df[cut]

            # skip runs with a low event content
            if df_r["events"].sum() < threshold:
                continue

            good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis])
            n_lumis = np.array([len(l) for l in df_r.lumis])
            df_rs.append(df_r[good_lumis == n_lumis])

        if len(df_rs) == 0:
            no_intersection()

        df = pd.concat(df_rs)

    df.loc[:, "min_lumi"] = [min(f) for f in df.lumis]
    df.loc[:, "max_lumi"] = [max(f) for f in df.lumis]
    df = df.sort_values(["run", "min_lumi", "max_lumi"])

    if site is not None:
        df = df.merge(das_file_site(dataset, site), on="file", how="inner")

    if args.pandas:
        df.to_csv(dataset.replace("/", "") + ".csv")

    # if a target number of events is requested, drop files that alone exceed it
    # and keep files only until the cumulative sum reaches the target
    if events > 0:
        df = df[df["events"] <= events]
        df.loc[:, "sum_evs"] = df.loc[:, "events"].cumsum()
        df = df[df["sum_evs"] < events]

    files = df.file

    if lumis is not None:
        lumi_ranges = {
            int(r): list(get_lumi_ranges(np.sort(np.concatenate(df.loc[df["run"] == r, "lumis"].values).ravel()).tolist()))
            for r in np.unique(df.run.values).tolist()
        }

        with open(lumis, 'w') as fp:
            json.dump(lumi_ranges, fp)
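
        # the dumped JSON maps each run to its certified lumi ranges,
        # e.g. {"380115": [[1, 45], [48, 100]]} (illustrative values)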

    if outfile is not None:
        with open(outfile, 'w') as f:
            for line in files:
                f.write(f"{line}\n")
    else:
        print("\n".join(files))

    sys.exit(0)