import pycurl
from io import BytesIO
import ast
import subprocess
import pandas as pd
import argparse
from bs4 import BeautifulSoup
import numpy as np
import os
import json
import sys


base_cert_url = "https://cms-service-dqmdc.web.cern.ch/CAF/certification/"
base_cert_path = "/eos/user/c/cmsdqm/www/CAF/certification/"

def get_url_clean(url):
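    """Fetch `url` with pycurl and return its content as plain text, stripped of HTML tags."""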

    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, url)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()

    return BeautifulSoup(buffer.getvalue(), "lxml").text

def das_do_command(cmd):
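    """Run a dasgoclient query in a bash shell and return its output split into lines."""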
    out = subprocess.check_output(cmd, shell=True, executable="/bin/bash").decode('utf8')
    return out.split("\n")

def das_file_site(dataset, site):
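    """Return a DataFrame with the files of `dataset` available at `site`."""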
    cmd = "dasgoclient --query='file dataset=%s site=%s'"%(dataset,site)
    out = das_do_command(cmd)
    df = pd.DataFrame(out, columns=["file"])

    return df

def das_file_data(dataset, opt=""):
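    """Return a DataFrame with the file names and event counts of `dataset`."""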
    cmd = "dasgoclient --query='file dataset=%s %s| grep file.name, file.nevents'"%(dataset,opt)
    out = das_do_command(cmd)
    # keep fields 0 and 3 of the space-split line: the file name and the event count
    out = [np.array(r.split(" "))[[0,3]] for r in out if len(r) > 0]

    df = pd.DataFrame(out, columns=["file","events"])
    df.events = df.events.values.astype(int)

    return df

def das_lumi_data(dataset, opt=""):
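    """Return a DataFrame mapping each file of `dataset` to its run and lumisection list."""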
    cmd = "dasgoclient --query='file,lumi,run dataset=%s %s'"%(dataset,opt)

    out = das_do_command(cmd)
    out = [r.split(" ") for r in out if len(r) > 0]

    df = pd.DataFrame(out, columns=["file","run","lumis"])

    return df

def das_run_events_data(dataset, run, opt=""):
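    """Return the total number of events of `dataset` in `run`."""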
    cmd = "dasgoclient --query='file dataset=%s run=%s %s | sum(file.nevents) '"%(dataset,run,opt)
    out = das_do_command(cmd)[0]

    # drop the 'sum(...)' label and keep the numeric value
    out = [o for o in out.split(" ") if "sum" not in o]
    out = int([r.split(" ") for r in out if len(r) > 0][0][0])

    return out

def das_run_data(dataset, opt=""):
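    """Return the list of runs in `dataset`."""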
    cmd = "dasgoclient --query='run dataset=%s %s '"%(dataset,opt)
    out = das_do_command(cmd)

    return out

def no_intersection():
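    """Print that the golden JSON and the dataset share no runs, then exit.

    Relies on `best_json` and `dataset` being defined at module level."""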
    print("No intersection between:")
    print(" - json   : ", best_json)
    print(" - dataset: ", dataset)
    print("Exiting.")
    sys.exit(1)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset','-d', default=None, help="Dataset name (e.g. '/DisplacedJet/Run2024C-v1/RAW')", type=str, required=True)
    parser.add_argument('--threshold','-t', help="Event threshold per file", type=int, default=-1)
    parser.add_argument('--events','-e', help="Total number of events targeted", type=int, default=-1)
    parser.add_argument('--outfile','-o', help='Dump results to file', type=str, default=None)
    parser.add_argument('--pandas','-pd', action='store_true', help="Store the whole dataset (no event or threshold cut) in a csv")
    parser.add_argument('--proxy','-p', help='Path to an x509 proxy, if needed', type=str, default=None)
    parser.add_argument('--site','-s', help='Only consider data at a specific site', type=str, default=None)
    parser.add_argument('--precheck','-pc', action='store_true', help='Check run per run before building the dataframes, to avoid huge caching.')
    args = parser.parse_args()

    if args.proxy is not None:
        os.environ["X509_USER_PROXY"] = args.proxy
    elif "X509_USER_PROXY" not in os.environ:
        print("No X509 proxy set. Exiting.")
        sys.exit(1)

    testing = "JENKINS_PREFIX" in os.environ
    dataset = args.dataset
    events = args.events
    threshold = args.threshold
    outfile = args.outfile
    site = args.site

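    # Derive the certification folder from the dataset name,
    # e.g. '/DisplacedJet/Run2024C-v1/RAW' -> 'Collisions24'.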
    year = dataset.split("Run")[1][2:4]
    PD = dataset.split("/")[1]
    cert_type = "Collisions" + str(year)
    if "Cosmics" in dataset:
        cert_type = "Cosmics" + str(year)
    elif "Commisioning" in dataset:
        cert_type = "Commisioning2020"
    elif "HI" in PD:
        cert_type = "Collisions" + str(year) + "HI"

    cert_path = base_cert_path + cert_type + "/"
    web_fallback = False

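    # Look for the golden JSONs on the local EOS area first; fall back to the
    # certification web page if the area is missing or empty.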
    if os.path.isdir(cert_path):
        json_list = os.listdir(cert_path)
        if len(json_list) == 0:
            web_fallback = True
        json_list = [c for c in json_list if "Golden" in c and "era" not in c]
        json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")]
    else:
        web_fallback = True

    if web_fallback:
        cert_url = base_cert_url + cert_type + "/"
        json_list = get_url_clean(cert_url).split("\n")
        json_list = [c for c in json_list if "Golden" in c and "era" not in c]
        json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list]

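    # Pick the golden JSON covering the widest run range, assumed to be the most complete one.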
    run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list]
    best_json = str(json_list[np.argmax(run_ranges)])
    if not web_fallback:
        with open(cert_path + "/" + best_json) as js:
            golden = json.load(js)
    else:
        golden = get_url_clean(cert_url + best_json)
        golden = ast.literal_eval(golden)

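    # Flatten the golden JSON into a run -> [lumisections] map.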
    golden_flat = {}
    for k in golden:
        R = []
        for r in golden[k]:
            R = R + [f for f in range(r[0], r[1]+1)]
        golden_flat[k] = R

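    # Keep only the runs of the dataset that appear in the golden JSON.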
    data_runs = das_run_data(dataset)
    golden_data_runs = [r for r in data_runs if r in golden_flat]

    if len(golden_data_runs) == 0:
        no_intersection()

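    # When pre-checking (or running in CI), add runs one by one until the targeted
    # number of events is reached; otherwise query all the golden runs at once.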
    golden_data_runs_tocheck = golden_data_runs
    das_opt = ""
    if testing or args.precheck:
        golden_data_runs_tocheck = []
        sum_events = 0

        for r in golden_data_runs:
            sum_events = sum_events + int(das_run_events_data(dataset, r))
            golden_data_runs_tocheck.append(r)
            if events > 0 and sum_events > events:
                break

    das_opt = "run in %s"%(str([int(g) for g in golden_data_runs_tocheck]))

    df = das_lumi_data(dataset, opt=das_opt).merge(das_file_data(dataset, opt=das_opt), on="file", how="inner")

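    # DAS returns the lumisection list as a string, e.g. "[1,2,3]": parse it into a list of ints.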
    df["lumis"] = [[int(ff) for ff in f.replace("[","").replace("]","").split(",")] for f in df.lumis.values]

    df_rs = []
    for r in golden_data_runs_tocheck:
        cut = (df["run"] == r)
        if not any(cut):
            continue

        df_r = df[cut]

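        # skip runs whose summed events fall below the threshold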
        if df_r["events"].sum() < threshold:
            continue

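        # keep only files whose lumisections are all certified in the golden JSON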
        good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis])
        n_lumis = np.array([len(l) for l in df_r.lumis])
        df_rs.append(df_r[good_lumis == n_lumis])

    if len(df_rs) == 0:
        no_intersection()

    df = pd.concat(df_rs)
    df.loc[:,"min_lumi"] = [min(f) for f in df.lumis]
    df.loc[:,"max_lumi"] = [max(f) for f in df.lumis]
    df = df.sort_values(["run","min_lumi","max_lumi"])

    if site is not None:
        df = df.merge(das_file_site(dataset, site), on="file", how="inner")

    if args.pandas:
        df.to_csv(dataset.replace("/","") + ".csv")

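    # keep files until the cumulative number of events reaches the target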
    if events > 0:
        df = df[df["events"] <= events]
        df.loc[:,"sum_evs"] = df.loc[:,"events"].cumsum()
        df = df[df["sum_evs"] < events]

    files = df.file

    if outfile is not None:
        with open(outfile, 'w') as f:
            for line in files:
                f.write(f"{line}\n")
    else:
        print("\n".join(files))

    sys.exit(0)