File indexing completed on 2024-04-06 12:07:42
0001
# ---------------------------------------------------------------------------
# Configuration shared by the functions and the main flow below.
# ---------------------------------------------------------------------------
# Directory this script was invoked from; the agent executables and
# visDQMRotateLogs are launched from here. Quoted so a path containing
# whitespace is handed to dirname as a single argument.
HOMEDIR=$(dirname -- "$0")
# First command-line argument: address that receives the notification mail.
EMAIL=$1
TMPDIR=/tmp
# Process names that are monitored, started and stopped by this script.
AGENTS=("fileCollector" "producerFileCleanner")
# Directory receiving the alivecheck-* logs and the rotated agent logs.
LOGDIR=/home/dqmprolocal/agents
# While this file exists the agents are stopped instead of restarted.
STOPFILE=/tmp/stopModules
0008
0009
0010
#######################################
# Launch one agent, or every configured agent, in the background.
# Each agent's stdout and stderr are piped into visDQMRotateLogs, which is
# given a strftime-style file name pattern under $LOGDIR and the argument
# 86400 (presumably the rotation period in seconds - TODO confirm).
# Globals:   AGENTS, HOMEDIR, LOGDIR (all read)
# Arguments: $1 - "all" to start every entry of AGENTS, otherwise the name
#                 of a single agent
# Outputs:   `set -x` trace of the launch on stderr; a warning on stderr
#            for an agent name that has no launch recipe
#######################################
startAgents() {
  local -a agents
  local a

  # Expand the whole AGENTS array explicitly: a bare $AGENTS is only the
  # first element under bash, which would make "all" start a single agent.
  if [[ $1 == "all" ]]; then
    agents=("${AGENTS[@]}")
  else
    agents=("$1")
  fi

  for a in "${agents[@]}"; do
    case $a in
      fileCollector)
        # NOTE(review): the address given to the agent is hard-coded and is
        # not taken from $EMAIL - confirm this is intended.
        # The logger must read the pipe. A `</dev/null` on the logger only
        # works with zsh multios (pipe and file are concatenated); in bash
        # it would replace the pipe and the log would stay empty.
        (
          set -x
          (
            "$HOMEDIR/fileCollector.py" lilopera@cern.ch \
              /home/dqmprolocal/output \
              /home/dqmprolocal/done \
              /dqmdata/dqm/uploads
          ) 2>&1 | "$HOMEDIR/visDQMRotateLogs" "$LOGDIR/fcollect-%Y%m%d%H%M.txt" 86400 &
        )
        ;;

      producerFileCleanner)
        (
          set -x
          (
            "$HOMEDIR/producerFileCleanner.py" lilopera@cern.ch \
              /home/dqmprolocal/done \
              /home/dqmprolocal/output \
              /dqmdata/dqm/repository/original
          ) 2>&1 | "$HOMEDIR/visDQMRotateLogs" "$LOGDIR/pfclean-%Y%m%d%H%M.txt" 86400 &
        )
        ;;

      *)
        # Nothing is started for an unknown name; say so instead of
        # ignoring it silently.
        printf 'startAgents: unknown agent "%s"\n' "$a" >&2
        ;;
    esac
  done
}
0040
#######################################
# Stop every process of the current user whose command line matches a
# pattern, escalating through TERM, TERM, QUIT and KILL until it is gone.
# Globals:   none
# Arguments: $1 - title printed once in the "INFO: Stopping ..." heading
#            $2 - pattern handed to `pgrep -f`
# Outputs:   one heading line (only when something matched) and one
#            progress line per process, on stdout
#######################################
killproc() {
  local T title pat nextmsg pid psline sig
  T=1 title="$1" pat="$2"
  nextmsg="INFO: Stopping ${title}:"

  # Highest PID first. The command substitution is left unquoted on
  # purpose so that each PID becomes its own word.
  for pid in $(pgrep -u "$(id -u)" -f "$pat" | sort -rn); do
    # Keep only the first five whitespace-separated fields of the ps line.
    psline=$(ps -o pid=,bsdstart=,args= "$pid" |
      perl -n -e 'print join(" ", (split)[0..4])')

    # The heading is printed once, in front of the first process.
    if [[ -n $nextmsg ]]; then
      printf '%s\n' "$nextmsg"
      nextmsg=
    fi

    printf 'Stopping %s (%s):' "$pid" "$psline"
    for sig in TERM TERM QUIT KILL; do
      printf ' SIG%s' "$sig"
      kill "-$sig" "$pid"
      sleep 1
      # `kill -0` only probes for existence; it sends no signal.
      kill -0 "$pid" 2>/dev/null || break
      sleep "$T"
      # NOTE(review): T is not reset per process, so the back-off keeps
      # doubling across PIDs - confirm this is intended.
      T=$((T * 2))
      kill -0 "$pid" 2>/dev/null || break
    done
    printf '\n'
  done
}
0063
#######################################
# Append time-stamped lines to $LOGDIR/alivecheck-<YYYYmmddHHMM>.
# With arguments, they are joined with spaces and logged as one line.
# Without arguments, every line read from stdin is logged, which is how
# `killproc ... | logme` uses it.
# Globals:   LOGDIR (read)
# Arguments: $@ - optional message words
# Returns:   always 0
#######################################
logme() {
  local timeTag logFile line
  # The file name is fixed once per call, at minute resolution.
  timeTag=$(date +"%Y%m%d%H%M")
  logFile=$LOGDIR/alivecheck-${timeTag}

  # The condition was truncated in the dump; "no arguments" is the only
  # reading that fits the two branches (stdin loop versus "$*").
  if [[ $# -eq 0 ]]; then
    # IFS= and -r keep leading blanks and backslashes intact; the extra
    # test also logs a last line that lacks a trailing newline.
    while IFS= read -r line || [[ -n $line ]]; do
      printf '%s [aliveCheck.sh/%s] %s\n' \
        "$(date +"%Y-%m-%d %H:%M:%S")" "$$" "$line" >>"$logFile"
    done
  else
    # Plain `>>` is used: zsh's `>>!` would write to a file named "!"
    # under bash.
    printf '%s [aliveCheck.sh/%s] %s\n' \
      "$(date +"%Y-%m-%d %H:%M:%S")" "$$" "$*" >>"$logFile"
  fi
  return 0
}
0078
0079
0080
# ---------------------------------------------------------------------------
# Environment setup: create the log directory, then load the CMSSW runtime
# environment from the production release linked at ~/prod.
# ---------------------------------------------------------------------------
mkdir -p "$LOGDIR"
cd ~ || exit 1
if [[ -d prod && -e bin/setup_cmssw.sh ]]; then
  cd prod || exit 1
  # Source the very file the guard above checked (~/bin/setup_cmssw.sh).
  # A relative "bin/setup_cmssw.sh" would resolve inside ~/prod after the
  # cd, a path nothing has verified.
  source ~/bin/setup_cmssw.sh
  # scramv1 prints shell code setting up the release environment; it is
  # evaluated while the working directory is the release area.
  eval "$(scramv1 runtime -sh)"
  cd ~ || exit 1
else
  logme "ERROR: Could not find prod release of CMSSW please make sure" \
    "that ~/prod exists and is a symbolic link (ln -s) to the" \
    "CMSSW area used by dqmpro and the online consumers." \
    "Also make sure that ~/bin/setup_cmssw.sh points to the right" \
    "CMSSW installation area on nfs"
  # Fatal: report failure instead of inheriting logme's status 0.
  exit 1
fi
0097
# ---------------------------------------------------------------------------
# Argument check: a notification address is mandatory. The glob only
# constrains the character directly in front of "@cern.ch".
# ---------------------------------------------------------------------------
if [[ -z $EMAIL || $EMAIL != *[a-zA-Z0-9\-\_\.]@cern.ch ]]; then
  logme "ERROR: missing or unacceptable email address '$EMAIL'. e.g. " \
    "$HOMEDIR/aliveCheck.sh yourEmail@cern.ch"
  # Usage error: report failure instead of inheriting logme's status 0.
  exit 1
fi
0104
0105
# ---------------------------------------------------------------------------
# Stop-file handling: while $STOPFILE exists nothing is (re)started; any
# agent that is still running is stopped, and the script ends here.
# ---------------------------------------------------------------------------
if [[ -e $STOPFILE ]]; then
  logme "INFO: Found stop file (${STOPFILE}) at $HOSTNAME. Please" \
    "remove the file to restart the agents"

  # Collect the agents that currently have a matching process. A plain
  # array assignment is used: `set -a` switches on allexport and replaces
  # the positional parameters; it does not declare an array.
  runningAgents=()
  for a in "${AGENTS[@]}"; do
    pgrep -f "$a" >/dev/null && runningAgents+=("$a")
  done

  # killproc's progress output is routed into the alivecheck log.
  for a in "${runningAgents[@]}"; do
    killproc "FMS Module [$a] " "$a" | logme
  done
  exit 0
fi
0121
0122
0123
# ---------------------------------------------------------------------------
# Alive check: find the agents without a running process, report them by
# log and mail, and start them again.
# ---------------------------------------------------------------------------
deadAgents=()
for a in "${AGENTS[@]}"; do
  pgrep -f "$a" >/dev/null || deadAgents+=("$a")
done

# Every agent is alive: nothing to report, nothing to restart.
[[ ${#deadAgents[@]} -eq 0 ]] && exit 0

logme "${deadAgents[*]}" where stopped and restarted now at "$HOSTNAME".
printf '%s\n' "${deadAgents[*]} where stopped and restarted now at $HOSTNAME." |
  mail -s "File management modules not Running" "$EMAIL"

# If all agents are down use the "all" shortcut, otherwise start exactly
# the ones that are missing.
if [[ ${#deadAgents[@]} -eq ${#AGENTS[@]} ]]; then
  startAgents all
else
  for a in "${deadAgents[@]}"; do
    startAgents "$a"
  done
fi