File indexing completed on 2024-04-06 11:56:23
0001
0002
0003
0004
# ---------------------------------------------------------------------------
# Command line:
#   $1  final iteration number (the main loop runs while i <= $1)
#   $2  output directory; must already contain the job*/ sub-directories
#   $3  IOV list file (one entry per line)
# ---------------------------------------------------------------------------

# Refuse to continue with missing or bad arguments: an empty $odir would turn
# later paths such as "$odir/job*/..." into paths under the filesystem root.
if ( $#argv < 3 ) then
    echo "Usage: <script> <final iteration> <output directory> <IOV file>"
    exit 1
endif
if ( ! -d "$2" ) then
    echo "Output directory $2 does not exist. Terminating..."
    exit 1
endif
if ( ! -f "$3" ) then
    echo "IOV file $3 does not exist. Terminating..."
    exit 1
endif

# Absolute path two levels above the current directory; it is passed to the
# batch jobs as their argument and used by upload.csh to set up the runtime.
set workdir = `cd ../../; pwd`

set iter = $1
# Absolute form of the output directory and its last path component.
set odir = `cd $2; pwd`
set name = `basename $odir`
set iovfile = `basename $3`
# Number of job*/ directories found under the output directory.
set jobs = `ls -d $odir/job*/ | wc -l`
# Written into +JobFlavour of the per-job submit files.
set queue = tomorrow
set startdir = `pwd`

# Limits written to the DAGMan configuration file.
set allowed_n_jobs_submitted = 1000
set allowed_n_jobs_idle = 1000
# Per-iteration limits on recovery, rescue and resubmission attempts.
set allowed_n_recover = 2
set allowed_n_rescue = 5
set allowed_n_resub = 2
# Retry count written for every node of the DAG.
set allowed_n_retry = 2
0021
0022
0023
# Keep a copy of the IOV list next to the results.
cp $3 $odir

# Without a usable IOV list there is nothing to initialise, and the iteration
# lookup after the loop would read a file name built from an empty entry.
if ( ! -e $odir/$iovfile || -z $odir/$iovfile ) then
    echo "IOV file $odir/$iovfile is missing or empty. Terminating..."
    exit 1
endif

# k counts the IOV entries; it is reused later to create one collect job per IOV.
set k = 0

# The double quotes make every line of the IOV file one loop word.
foreach line ( "`cat $odir/$iovfile`" )
    @ k++
    echo $line
    # Run the initial configuration of this IOV, capturing stdout and stderr.
    cmsRun $odir/main/initial_cfg_$k.py >& $odir/main/initial_$line.out
    set lastrun = $line
end

# The iteration file of the last IOV holds the last completed iteration;
# processing starts with the one after it.
@ i = `cat $odir/main/IOIteration_$lastrun.root` + 1
0036
0037
echo Starting iteration = $i, final iteration = $iter, number of jobs = $jobs
echo ""

echo "Writing Condor and DAGMan files..."
# -p: a DAG directory left over from an earlier run is not an error.
mkdir -p $odir/DAG/

# Every file below is built with >>. Remove copies from an earlier run first,
# otherwise a rerun on the same output directory duplicates every line.
rm -f $odir/upload.csh $odir/upload.jdl $odir/DAG/sequence.cfg

# --- upload.csh: executable of the "upload" DAG node ------------------------
echo "#"\!"/bin/tcsh" >> $odir/upload.csh
echo "cd $workdir" >> $odir/upload.csh
echo "eval "\`"scramv1 runtime -csh"\` >> $odir/upload.csh
echo "cd $odir" >> $odir/upload.csh
echo "cp $odir/main/IOIteration*.root $odir/" >> $odir/upload.csh
echo "cp $odir/main/IOAlignedPositions*.root $odir/" >> $odir/upload.csh
echo "set kk = 0" >> $odir/upload.csh
# NOTE(review): the IOV list is expanded here, at generation time, and written
# inside one pair of double quotes -- confirm that the generated loop iterates
# once per IOV as intended.
echo "foreach line ( "\""`cat $odir/$iovfile`"\"" )" >> $odir/upload.csh
echo " @ kk++" >> $odir/upload.csh
echo " cmsRun upload_cfg_"\$"kk.py" >> $odir/upload.csh
echo "end" >> $odir/upload.csh
echo "rm -f $odir/*.root" >> $odir/upload.csh
echo "cd -" >> $odir/upload.csh

# --- upload.jdl: Condor submit description for upload.csh -------------------
echo "universe = vanilla" >> $odir/upload.jdl
echo "executable = $odir/upload.csh" >> $odir/upload.jdl
echo "output = $odir/upload.out" >> $odir/upload.jdl
echo "error = $odir/upload.out" >> $odir/upload.jdl
echo "log = $odir/upload.log" >> $odir/upload.jdl
echo "+JobFlavour = "\""espresso"\" >> $odir/upload.jdl
echo "+JobBatchName = "\""upload"\" >> $odir/upload.jdl
echo "queue" >> $odir/upload.jdl

# --- DAGMan configuration: limits on submitted and idle jobs ----------------
echo "DAGMAN_MAX_JOBS_SUBMITTED = $allowed_n_jobs_submitted" >> $odir/DAG/sequence.cfg
echo "DAGMAN_MAX_JOBS_IDLE = $allowed_n_jobs_idle" >> $odir/DAG/sequence.cfg

chmod +x $odir/upload.csh
0072
0073
# Builds one submit file per alignment job and per IOV collect job, and the
# DAG description:  job1..jobN  ->  collect1..collectK  ->  upload

# The DAG file is built with >>; start from a clean file so that a rerun on
# the same output directory does not duplicate its entries.
rm -f $odir/DAG/sequence.dag

# Accumulators for the dependency and retry lines written after the loops.
set parent_line_jb = "PARENT"
set parent_line_iov = "PARENT"
set child_line_jb = "CHILD"
set retry_line_jb = ""

# --- alignment jobs ---------------------------------------------------------
foreach jb (`seq 1 1 $jobs`)

    set job_jdl = $odir/job$jb/runScript.jdl
    # Same reason as for the DAG file: the submit file is appended to.
    rm -f $job_jdl

    echo "universe = vanilla" >> $job_jdl
    echo "executable = $odir/job$jb/runScript.csh" >> $job_jdl
    echo "arguments = $workdir" >> $job_jdl
    echo "output = $odir/job$jb/align.out" >> $job_jdl
    echo "error = $odir/job$jb/align.out" >> $job_jdl
    echo "log = $odir/job$jb/align.log" >> $job_jdl
    echo "+JobFlavour = "\""$queue"\" >> $job_jdl
    echo "+JobBatchName = "\""$name/align_$jb"\" >> $job_jdl
    echo "queue" >> $job_jdl

    echo "JOB job$jb $odir/job$jb/runScript.jdl" >> $odir/DAG/sequence.dag
    set parent_line_jb = "$parent_line_jb job$jb"

    # The "\n" is kept literally here; it becomes a line break only when the
    # accumulated text is echoed into the DAG file below.
    set retry_line_jb = "${retry_line_jb}Retry job$jb $allowed_n_retry\n"

end

# --- collect jobs, one per IOV ----------------------------------------------
foreach iov (`seq 1 1 $k`)

    set collect_jdl = $odir/main/runScript_$iov.jdl
    rm -f $collect_jdl

    echo "universe = vanilla" >> $collect_jdl
    echo "executable = $odir/main/runScript_$iov.csh" >> $collect_jdl
    echo "arguments = $workdir" >> $collect_jdl
    echo "output = $odir/main/collect_$iov.out" >> $collect_jdl
    echo "error = $odir/main/collect_$iov.out" >> $collect_jdl
    echo "log = $odir/main/collect_$iov.log" >> $collect_jdl
    echo "+JobFlavour = "\""espresso"\" >> $collect_jdl
    echo "+JobBatchName = "\""$name/collect_$iov"\" >> $collect_jdl
    echo "queue" >> $collect_jdl

    echo "JOB collect$iov $odir/main/runScript_$iov.jdl" >> $odir/DAG/sequence.dag
    # Every collect job is a child of all alignment jobs and a parent of upload.
    set child_line_jb = "$child_line_jb collect$iov"
    set parent_line_iov = "$parent_line_iov collect$iov"
    set retry_line_jb = "${retry_line_jb}Retry collect$iov $allowed_n_retry\n"
end

# --- upload node, dependencies, retries and configuration -------------------
echo "JOB upload $odir/upload.jdl" >> $odir/DAG/sequence.dag
set retry_line_jb = "${retry_line_jb}Retry upload $allowed_n_retry"

echo "$parent_line_jb $child_line_jb" >> $odir/DAG/sequence.dag
echo "$parent_line_iov CHILD upload \n$retry_line_jb" >> $odir/DAG/sequence.dag
echo "CONFIG $odir/DAG/sequence.cfg" >> $odir/DAG/sequence.dag
0126
# Main loop: one DAGMan run per iteration, from the starting iteration $i up to
# and including $iter. Each pass submits the DAG, supervises it (resubmission,
# recovery and rescue attempts are limited by the allowed_n_* settings), then
# archives the outputs of the iteration under iteration-specific names.
while ($i <= $iter)

    echo "-----------------------------------------------------------------------"
    echo " Starting iteration $i "
    echo "-----------------------------------------------------------------------"

    echo "Submitting DAGMan file..."
    condor_submit_dag $odir/DAG/sequence.dag

    set n_recover = 0
    set n_rescue = 0
    set n_resub = 0
    set iteration_done = 0
    # Supervise the DAG until it completes or one of the attempt limits is hit.
    while (!($iteration_done) && ($n_recover<$allowed_n_recover) && ($n_rescue<$allowed_n_rescue) && ($n_resub<$allowed_n_resub))

        # A metrics file at this point is treated as DAGMan having ended
        # without doing its work: remove it and submit again.
        if ( -f "$odir/DAG/sequence.dag.metrics" ) then
            echo "DAGMAN submission has finished unexpectedly. Restarting submission..."
            rm $odir/DAG/sequence.dag.metrics
            @ n_resub++
            condor_submit_dag $odir/DAG/sequence.dag
            continue
        endif

        echo "Submission was succesful."
        echo "Wait untill jobs stop running..."
        sleep 30
        condor_wait $odir/DAG/sequence.dag.dagman.log
        # Save the exit status immediately: it is overwritten by the commands
        # that follow, so it cannot be read again inside the error message.
        set wait_status = $status
        if ( $wait_status != 0 ) then
            echo "condor_wait finished unexpectedly, returning $wait_status. Terminating..."
            exit 1
        endif

        # A rescue file beyond those already counted means this run failed.
        # Look in the DAG directory: the script keeps (and later cleans up) all
        # sequence.dag.* products there, not in the current directory.
        @ found_rescue = 1
        if (`ls $odir/DAG | grep rescue | wc -l` < `expr $n_rescue + 1`) then
            @ found_rescue = 0
        else
            @ n_rescue++
        endif

        # DAGMan reports its own termination near the end of its .out file.
        @ log_done = 0
        if ( -f $odir/DAG/sequence.dag.dagman.out && "`tail $odir/DAG/sequence.dag.dagman.out |grep "\""EXITING WITH STATUS"\""`" != "") then
            @ log_done = 1
        endif

        # A removal by the user must not be answered with a resubmission.
        if ( `cat $odir/DAG/sequence.dag.dagman.log | grep "Job was aborted by the user" | wc -l` > 0 ) then
            echo "USER interruption detected. Terminating..."
            exit 1
        endif

        if ( !($found_rescue) && $log_done) then
            # No new rescue file and DAGMan reported its exit: iteration done.
            echo "Iteration $i finished."
            @ iteration_done = 1
        else if (!($found_rescue) && !($log_done)) then
            # DAGMan stopped without reporting an exit: resume in recovery mode.
            @ n_recover++
            echo "Sumbit recovery attempt $n_recover"
            condor_submit_dag -DoRecovery $odir/DAG/sequence.dag
        else if ($found_rescue) then
            echo "Submit rescue attempt $n_rescue"
            # The run that produced the rescue file may have left a metrics
            # file behind; remove it so that the check at the top of the loop
            # does not mistake it for a failure of the new submission.
            rm -f $odir/DAG/sequence.dag.metrics
            condor_submit_dag $odir/DAG/sequence.dag
        endif
    end
    # The supervision loop ended because an attempt limit was reached.
    if !($iteration_done) then
        exit 1
    endif
    cd $startdir

    # Keep the alignment database of this iteration under its own name.
    mv $odir/alignments.db $odir/alignments_iter${i}.db

    rm -rf $odir/job*/IOUserVariables.root

    # Archive and compress the per-job, per-IOV and upload logs of this
    # iteration so that the next iteration starts with fresh log files.
    foreach jb (`seq 1 1 $jobs`)
        mv $odir/job$jb/align.out $odir/job$jb/align$i.out
        mv $odir/job$jb/align.log $odir/job$jb/align$i.log
        gzip -f $odir/job$jb/align$i.out
        gzip -f $odir/job$jb/align$i.log
    end
    foreach iov (`seq 1 1 $k`)
        mv $odir/main/collect_$iov.out $odir/main/collect_"$iov"_it$i.out
        mv $odir/main/collect_$iov.log $odir/main/collect_"$iov"_it$i.log
        gzip -f $odir/main/collect_"$iov"_it$i.out
        gzip -f $odir/main/collect_"$iov"_it$i.log
    end
    mv $odir/upload.out $odir/upload$i.out
    mv $odir/upload.log $odir/upload$i.log
    gzip -f $odir/upload$i.out
    gzip -f $odir/upload$i.log
    # Keep the DAGMan log of this iteration, then remove all other DAGMan
    # by-products (including rescue files) so the next submission starts clean.
    mv $odir/DAG/sequence.dag.dagman.out $odir/DAG/sequence$i.dag.dagman.out
    rm -rf $odir/DAG/sequence.dag.*

    @ i++
end
0224