Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2024-04-06 12:07:42

#! /bin/zsh
# Configuration shared by the functions and the main flow below.
HOMEDIR="$(dirname "$0")"                    # directory part of the path this script was invoked with
EMAIL="$1"                                   # first command-line argument: notification address
TMPDIR="/tmp"                                # not referenced elsewhere in this script
AGENTS=(fileCollector producerFileCleanner)  # agent names that are checked and (re)started
LOGDIR="/home/dqmprolocal/agents"            # where alive-check and agent logs are written
STOPFILE="/tmp/stopModules"                  # while this file exists the agents are stopped
0008 
0009 ######################################################################
0010 # Support functions
# Launch one agent, or every agent listed in AGENTS, in the background.
# Globals:   AGENTS, HOMEDIR, LOGDIR (read)
# Arguments: $1 - "fileCollector", "producerFileCleanner", or "all" for
#                 every entry of AGENTS. Any other value starts nothing.
# Outputs:   each launch command is traced with `set -x`; the agent's
#            stdout and stderr are piped into visDQMRotateLogs.
startAgents(){
  # Keep the work list and the loop variable private: the callers of this
  # function iterate with their own $a.
  local -a agents
  local a

  if [[ $1 == "all" ]]; then
    agents=("${AGENTS[@]}")
  elif [[ -n $1 ]]; then
    agents=("$1")
  fi

  for a in "${agents[@]}"
  do
    case $a in
      "fileCollector" )
        # The outer subshell confines `set -x` to this launch; `|&` sends
        # both stdout and stderr of the agent into the log rotator.
        # NOTE(review): the address handed to the agent is hard-coded
        # rather than $EMAIL - confirm this is intended.
        # NOTE(review): `</dev/null` on the right-hand side of the pipe
        # presumably relies on zsh MULTIOS, which concatenates the pipe and
        # the file; in other shells it would replace the pipe - confirm
        # before porting.
        (set -x
         ("$HOMEDIR/fileCollector.py" lilopera@cern.ch \
                /home/dqmprolocal/output \
                /home/dqmprolocal/done  \
                /dqmdata/dqm/uploads
         ) |& "$HOMEDIR/visDQMRotateLogs" "$LOGDIR/fcollect-%Y%m%d%H%M.txt" </dev/null 86400 &
        )
        ;;

      "producerFileCleanner" )
        # Same launch pattern as above, with this agent's own arguments and
        # log file prefix.
        (set -x
         ("$HOMEDIR/producerFileCleanner.py" lilopera@cern.ch \
          /home/dqmprolocal/done \
          /home/dqmprolocal/output \
          /dqmdata/dqm/repository/original
         ) |& "$HOMEDIR/visDQMRotateLogs" "$LOGDIR/pfclean-%Y%m%d%H%M.txt" </dev/null 86400 &
        )
        ;;
    esac
  done
}
0040 
# Stop every process owned by the current user whose command line matches
# a pattern, escalating through TERM, TERM, QUIT and KILL until it is gone.
# Arguments: $1 - title printed in the "INFO: Stopping ..." banner
#            $2 - pattern handed to `pgrep -f`
# Outputs:   progress report on stdout: the banner once (only if at least
#            one process matched), then one line per process listing the
#            signals that were sent.
killproc() {
  local title pat nextmsg pid psline sig delay
  title="$1" pat="$2"
  nextmsg="INFO: Stopping ${title}:"
  # Matching PIDs are handled in descending numeric order (sort -rn).
  for pid in $(pgrep -u "$(id -u)" -f "$pat" | sort -rn); do
    # First five whitespace-separated fields of the ps line (pid, start
    # time, beginning of the command line) for the report.
    psline=$(ps -o pid=,bsdstart=,args= "$pid" |
             perl -n -e 'print join(" ", (split)[0..4])')
    if [[ -n $nextmsg ]]; then
      printf '%s\n' "$nextmsg"
      nextmsg=
    fi
    printf '%s' "Stopping $pid ($psline):"
    # Restart the back-off for every process so that each one gets the
    # same escalation schedule instead of inheriting the delay reached
    # while stopping the previous one.
    delay=1
    for sig in TERM TERM QUIT KILL; do
      printf ' SIG%s' "$sig"
      kill -"$sig" "$pid"
      sleep 1
      # `kill -0` only probes whether the process still exists.
      kill -0 "$pid" 2>/dev/null || break
      sleep "$delay"
      delay=$(( delay * 2 ))
      kill -0 "$pid" 2>/dev/null || break
    done
    printf '\n'
  done
}
0063 
# Append timestamped entries to $LOGDIR/alivecheck-<YYYYmmddHHMM>.
# Globals:   LOGDIR (read)
# Arguments: the words of the message. When called without arguments,
#            every line read from stdin is logged as its own entry.
# Returns:   always 0.
logme(){
  local timeTag logFile line
  timeTag=$(date +"%Y%m%d%H%M")
  logFile=$LOGDIR/alivecheck-${timeTag}
  if [[ $# -eq 0 ]]
  then
    # IFS= and -r keep each line verbatim (no backslash processing, no
    # whitespace trimming); the extra test also logs a final line that
    # lacks a trailing newline.
    while IFS= read -r line || [[ -n $line ]]
    do
      # `>>!` is the zsh append redirection that creates the file even
      # when NO_CLOBBER is in effect.
      printf '%s [aliveCheck.sh/%s] %s\n' \
             "$(date +"%Y-%m-%d %H:%M:%S")" "$$" "$line" >>! "$logFile"
    done
  else
    printf '%s [aliveCheck.sh/%s] %s\n' \
           "$(date +"%Y-%m-%d %H:%M:%S")" "$$" "$*" >>! "$logFile"
  fi
  return 0
}
0078 
######################################################################
# Setting up the environment
# Create the log directory, then load the CMSSW runtime from ~/prod.
# Every failure path leaves the script with a non-zero status.
mkdir -p "$LOGDIR"
cd ~ || { logme "ERROR: Could not change to the home directory"; exit 1; }
if [[ -d prod && -e bin/setup_cmssw.sh ]]
then
  cd prod || { logme "ERROR: Could not change to ~/prod"; exit 1; }
  # NOTE(review): the guard above tests ~/bin/setup_cmssw.sh, but after
  # `cd prod` this relative path resolves inside ~/prod - confirm which
  # file is meant to be sourced.
  source bin/setup_cmssw.sh
  eval "$(scramv1 runtime -sh)"
  cd ~ || { logme "ERROR: Could not change to the home directory"; exit 1; }
else
  logme "ERROR: Could not find prod release of CMSSW please make sure" \
        "that ~/prod exists and is a symbolic link (ln -s) to the" \
        "CMSSW area used by dqmpro and the online consumers." \
        "Also make sure that ~/bin/setup_cmssw.sh points to the right" \
        "CMSSW installation area on nfs"
  exit 1
fi
0097 
# Refuse to run without a notification address ending in @cern.ch, and
# report the refusal through a non-zero exit status.
# NOTE(review): the glob only constrains the single character right before
# "@cern.ch"; any text may precede it - tighten if stricter validation is
# wanted.
if [[ -z $EMAIL || $EMAIL != *[a-zA-Z0-9\-\_\.]@cern.ch ]]
then
  logme "ERROR: missing or unacceptable email address '$EMAIL'. e.g. " \
        "$HOMEDIR/aliveCheck.sh yourEmail@cern.ch"
  exit 1
fi
0104    
# Stop mode
# While the stop file exists, every agent that is still running is stopped
# and the script ends without restarting anything.
if [[ -e $STOPFILE ]]
then
  logme "INFO: Found stop file (${STOPFILE}) at $HOSTNAME. Please" \
        "remove the file to restart the agents"
  # Declare the list as a real array. (`set -a NAME` does not do that: it
  # switches on ALL_EXPORT and overwrites the positional parameters.)
  runningAgents=()
  for a in "${AGENTS[@]}"
  do
    pgrep -f "$a" > /dev/null && runningAgents+=("$a")
  done
  for a in "${runningAgents[@]}"
  do
    # killproc reports on stdout; logme timestamps each reported line.
    killproc "FMS Module [$a] " "$a" | logme
  done
  exit 0
fi
0121 
# Running Mode:
# Collect the agents for which pgrep finds no matching process.
# The list is a real array. (`set -a NAME` would not declare one: it
# switches on ALL_EXPORT and overwrites the positional parameters.)
deadAgents=()
for a in "${AGENTS[@]}"
do
  pgrep -f "$a" > /dev/null || deadAgents+=("$a")
done

# If there are no dead agents just finish
(( ${#deadAgents[@]} == 0 )) && exit 0

# Same text goes to the log file and to the notification mail.
deadMessage="${deadAgents[*]} where stopped and restarted now at $HOSTNAME."
logme "$deadMessage"
printf '%s\n' "$deadMessage" | mail -s "File management modules not Running" "$EMAIL"

# deadAgents keeps the order of AGENTS, so restarting its entries one by
# one also covers the case where every agent is down.
for a in "${deadAgents[@]}"
do
  startAgents "$a"
done