Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2022-05-24 22:29:53

#!/bin/bash

# ---- option defaults (most can be overridden on the command line) ----

# container runtime / hardware selection
USEDOCKER=""
GPU=""
IMAGE=fastml/triton-torchgeo:21.06-py3-geometric
SANDBOX=""
COMPAT_USR=""

# server instance name and the models it should serve
SERVER=triton_server_instance
REPOS=()
MODELS=()

# logging: VERBOSE stays empty unless -v copies VERBOSE_ARGS into it
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"
VERBOSE=""

# startup behaviour: seconds to wait for the server, and start attempts
WTIME=300
RETRIES=3
FORCE=""
DRYRUN=""
PARENTPID=""

# working directory handling
CLEANUP=true
TMPDIR=""

# network ports: NPORTS consecutive ports starting at BASEPORT
BASEPORT=8000
NPORTS=3
AUTOPORT=""
0024 
# Print the Singularity sandbox directory to use.
# Globals:  SANDBOX (read) - explicit sandbox directory, may be empty
#           IMAGE   (read) - image name, used to build the default cvmfs path
# Outputs:  the sandbox path on stdout, followed by a newline
get_sandbox(){
	# printf with a quoted argument: the value is not word-split or globbed,
	# and a value that looks like an echo option (e.g. "-n") is printed as is
	if [ -z "$SANDBOX" ]; then
		printf '%s\n' "/cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}"
	else
		printf '%s\n' "$SANDBOX"
	fi
}
0032 
0033 usage() {
0034         ECHO="echo -e"
0035         $ECHO "cmsTriton [options] [start|stop]"
0036         $ECHO
0037         $ECHO "Options:"
0038         $ECHO "-c          \t don't cleanup temporary dir (for debugging)"
0039         $ECHO "-C [dir]    \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
0040         $ECHO "-D          \t dry run: print container commands rather than executing them"
0041         $ECHO "-d          \t use Docker instead of Singularity"
0042         $ECHO "-f          \t force reuse of (possibly) existing container instance"
0043         $ECHO "-g          \t use GPU instead of CPU"
0044         $ECHO "-i [name]   \t server image name (default: ${IMAGE})"
0045         $ECHO "-M [dir]    \t model repository (can be given more than once)"
0046         $ECHO "-m [dir]    \t specific model directory (can be given more than once)"
0047         $ECHO "-n [name]   \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
0048         $ECHO "-P [port]   \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
0049         $ECHO "-p [pid]    \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
0050         $ECHO "-r [num]    \t number of retries when starting container (default: ${RETRIES})"
0051         $ECHO "-s [dir]    \t Singularity sandbox directory (default: $(get_sandbox))"
0052         $ECHO "-t [dir]    \t non-default hidden temporary dir"
0053         $ECHO "-v          \t (verbose) start: activate server debugging info; stop: keep server logs"
0054         $ECHO "-w [time]   \t maximum time to wait for server to start (default: ${WTIME} seconds)"
0055         $ECHO "-h          \t print this message and exit"
0056         $ECHO
0057         $ECHO "Operations:"
0058         $ECHO "start       \t start server"
0059         $ECHO "stop        \t stop server"
0060         $ECHO "check       \t check if server can run on this system"
0061         exit $1
0062 }
0063 
# choose the host shared-memory location: /run/shm when it exists,
# otherwise fall back to /dev/shm
SHM=/run/shm
if [ ! -e "$SHM" ]; then
	SHM=/dev/shm
fi
0069 
# Parse command-line options into the global settings declared above.
# Options followed by ':' in the option string take an argument ($OPTARG).
while getopts "cC:Ddfgi:M:m:n:P:p:r:s:t:vw:h" opt; do
	case "$opt" in
		# keep the temporary dir
		c) CLEANUP=""
		;;
		# user-specified compatibility driver directory
		C) COMPAT_USR="$OPTARG"
		;;
		# dry run: DRYRUN is prepended to commands, so they are echoed instead of run
		D) DRYRUN=echo
		;;
		d) USEDOCKER=true
		;;
		f) FORCE=true
		;;
		g) GPU=true
		;;
		i) IMAGE="$OPTARG"
		;;
		# -M and -m may be repeated; each use appends to its array
		M) REPOS+=("$OPTARG")
		;;
		m) MODELS+=("$OPTARG")
		;;
		n) SERVER="$OPTARG"
		;;
		# -1 requests automatic search for a free port range
		P) if [ "$OPTARG" -eq -1 ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi
		;;
		# -1 means: watch the process that launched this script
		p) if [ "$OPTARG" -eq -1 ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi
		;;
		r) RETRIES="$OPTARG"
		;;
		s) SANDBOX="$OPTARG"
		;;
		t) TMPDIR="$OPTARG"
		;;
		v) VERBOSE="$VERBOSE_ARGS"
		;;
		w) WTIME="$OPTARG"
		;;
		h) usage 0
		;;
		# getopts reports an unknown option or a missing argument by setting
		# opt to "?" (after printing its own diagnostic): stop with the help
		# text instead of silently carrying on with a half-parsed command line
		*) usage 1
		;;
	esac
done
0110 
# drop the parsed options; the first remaining word is the operation
shift $((OPTIND - 1))
OP=$1

# only the three known operations are accepted
case "$OP" in
	start|stop|check)
	;;
	*) usage 1
	;;
esac

# always allow at least one start attempt
if [ "$RETRIES" -le 0 ]; then RETRIES=1; fi
0121 
# directory the script was launched from; logs are moved back here on stop
TOPDIR=$PWD
# working directory: hidden dir named after the instance unless -t was given
if [ -z "$TMPDIR" ]; then
	TMPDIR="${TOPDIR}/.${SERVER}"
else
	# quoted so that a path containing whitespace is resolved as one argument
	TMPDIR=$(readlink -f "$TMPDIR")
fi

# resolve the sandbox (default or user-provided) to a canonical path
SANDBOX=$(get_sandbox)
SANDBOX=$(readlink -f "${SANDBOX}")
# expanded unquoted where used, so that it splits into "sudo" and "docker"
DOCKER="sudo docker"
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"
LIB=lib
# strings searched for in the server log by wait_server
STARTED_INDICATOR="Started GRPCInferenceService"
SEGFAULT_INDICATOR="Address already in use"
EXTRA=""
# path of the compatibility script inside the image / sandbox
COMPAT_SCRIPT=/etc/shinit_v2
0139 
# Derive the three service ports from BASEPORT and export them.
# Globals: BASEPORT (read); HTTPPORT, GRPCPORT, METRPORT (written, exported)
compute_ports(){
	HTTPPORT=$BASEPORT
	GRPCPORT=$((BASEPORT+1))
	METRPORT=$((BASEPORT+2))
	export HTTPPORT GRPCPORT METRPORT
}
0146 
# Test whether a TCP port is free.
# Arguments: $1 - port number
# Returns:   0 if the port is free, 1 if it is in use
check_port(){
	# opening /dev/tcp/<host>/<port> succeeds only if something accepts the
	# connection, i.e. the port is in use
	2>/dev/null >"/dev/tcp/0.0.0.0/$1" && return 1
	return 0
}
0155 
# Test whether all three service ports are free.
# Globals: HTTPPORT, GRPCPORT, METRPORT (read)
# Returns: 0 if every port is free, 1 as soon as one is found in use
check_ports(){
	check_port $HTTPPORT || return 1
	check_port $GRPCPORT || return 1
	check_port $METRPORT || return 1
	return 0
}
0163 
# Advance BASEPORT in steps of NPORTS until check_ports reports a free range,
# then announce the gRPC port that was selected.
# Globals: BASEPORT (read/written), NPORTS (read), GRPCPORT (read)
# Outputs: "CMS_TRITON_GRPC_PORT: <port>" on stdout
find_ports(){
	until check_ports; do
		BASEPORT=$((BASEPORT+NPORTS))
		compute_ports
	done
	echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0171 
# Settle on the ports to use and build the server's port arguments.
# Globals: AUTOPORT (read), HTTPPORT/GRPCPORT/METRPORT (read),
#          PORTARGS (written, exported)
# Returns: 1 if fixed ports were requested and at least one is in use
handle_ports(){
	compute_ports
	if [ -z "$AUTOPORT" ]; then
		# fixed ports requested: they must all be free
		if ! check_ports; then
			echo "Error: requested port in use"
			return 1
		fi
	else
		# automatic mode: search for a free range
		find_ports
	fi
	export PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
}
0183 
# Start the Triton server in a detached Docker container.
# Globals: REPOS (read) - model repository directories
#          COMPAT, COMPAT_SCRIPT_MOUNT (read) - optional extra volume specs
#          DRYRUN, DOCKER, SERVER, IMAGE, EXTRA, PORTARGS, VERBOSE,
#          HTTPPORT, GRPCPORT, METRPORT (read)
# Returns: exit status of the `docker run` command
start_docker(){
	local repo
	# arrays keep each path as one argument even if it contains whitespace
	local -a mount_args=() repo_args=()

	# mount all model repositories
	for repo in "${REPOS[@]}"; do
		mount_args+=("-v${repo}:${repo}")
		repo_args+=("--model-repository=${repo}")
	done

	# compatibility driver environment
	if [ -n "$COMPAT" ]; then
		mount_args+=("-v${COMPAT}")
		if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
			mount_args+=("-v${COMPAT_SCRIPT_MOUNT}")
		fi
	fi

	# DRYRUN, DOCKER, EXTRA, PORTARGS and VERBOSE hold zero or more
	# space-separated words and are deliberately left unquoted
	$DRYRUN $DOCKER run -d --name "${SERVER}" \
		--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \
		-p${HTTPPORT}:${HTTPPORT} -p${GRPCPORT}:${GRPCPORT} -p${METRPORT}:${METRPORT} $EXTRA "${mount_args[@]}" \
		"${IMAGE}" tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE
}
0206 
# Start a Singularity instance from the sandbox and launch the Triton server
# inside it in the background.
# Globals: REPOS (read) - model repository directories
#          COMPAT, COMPAT_SCRIPT_MOUNT (read) - optional extra bind specs
#          DRYRUN, LIB, SANDBOX, SHM, SERVER, EXTRA, PORTARGS, VERBOSE, LOG (read)
#          START_EXIT, REDIR, TMPD, PATH (written)
# Returns: status of `singularity instance start` if that fails
start_singularity(){
	local repo
	# arrays keep each path as one argument even if it contains whitespace
	local -a mount_args=() repo_args=()

	# triton server image may need to modify contents of opt/tritonserver/lib/
	# but cvmfs is read-only
	# -> make a writable local directory with the same contents
	# this is no longer needed as of triton 2.11.0, but kept for compatibility w/ older server versions
	$DRYRUN mkdir "${LIB}"
	$DRYRUN ln -s "${SANDBOX}"/opt/tritonserver/lib/* "${LIB}/"

	# mount all model repositories
	for repo in "${REPOS[@]}"; do
		mount_args+=(-B "${repo}")
		repo_args+=("--model-repository=${repo}")
	done

	# compatibility driver environment
	if [ -n "$COMPAT" ]; then
		mount_args+=(-B "$COMPAT")
		if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
			mount_args+=(-B "$COMPAT_SCRIPT_MOUNT")
		fi
	fi

	# workaround for nvidia libs w/ singularity-in-singularity
	# from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970
	if [ -d /.singularity.d/libs ]; then
		TMPD=$(mktemp -d)
		# wrapper that points ldconfig at a cache file inside the temp dir
		(echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > "$TMPD/ldconfig"
		chmod +x "$TMPD/ldconfig"
		PATH=$TMPD:$PATH
		# this does not work with LD_LIBRARY_PATH from cmsenv
		ldconfig /.singularity.d/libs
	fi

	# start instance
	# need to bind /cvmfs for above symlinks to work inside container
	# (DRYRUN and EXTRA are deliberately unquoted: zero or more words)
	$DRYRUN singularity instance start \
		-B "${SHM}:/run/shm" -B "${LIB}:/opt/tritonserver/lib" -B "${SANDBOX}" "${mount_args[@]}" $EXTRA \
		"${SANDBOX}" "${SERVER}"

	START_EXIT=$?
	if [ "$START_EXIT" -ne 0 ]; then
		rm -rf "${LIB}"
		return "$START_EXIT"
	fi

	# run the actual server; in a dry run the echoed command goes to stdout,
	# otherwise the server output is collected in the log file
	if [ -z "$DRYRUN" ]; then
		REDIR="$LOG"
	else
		REDIR=/dev/stdout
	fi
	$DRYRUN singularity run "instance://${SERVER}" \
		tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE >& "${REDIR}" &
	# in a dry run, wait for the backgrounded echo so its output is complete
	[ -z "$DRYRUN" ] || wait
}
0264 
# Stop and remove the Docker container, saving its log first when verbose.
# Globals: DRYRUN, VERBOSE, DOCKER, SERVER, LOG (read)
stop_docker(){
	# keep log (only for a real run with verbosity enabled)
	if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
		$DOCKER logs ${SERVER} >& "$LOG"
	fi

	$DRYRUN $DOCKER stop ${SERVER}
	$DRYRUN $DOCKER rm ${SERVER}
}
0274 
# Stop the Singularity instance named by SERVER.
# Globals: DRYRUN, SERVER (read)
stop_singularity(){
	# DRYRUN is deliberately unquoted (empty or "echo"); the instance name is
	# quoted so it is always passed as exactly one argument
	$DRYRUN singularity instance stop "${SERVER}"
}
0278 
# Search the Docker container's log for a pattern.
# Arguments: $1 - pattern passed to grep
# Returns:   grep's exit status (0 if the pattern was found)
test_docker(){
	# docker logs print to stderr, so merge stderr into the pipe
	${DOCKER} logs ${SERVER} 2>&1 | grep "$1"
}
0283 
# Search the server log file for a pattern.
# Globals:   LOG (read) - path of the log file
# Arguments: $1 - pattern passed to grep
# Returns:   grep's exit status (0 if the pattern was found)
test_singularity(){
	# LOG is derived from the instance name; quote it so that it is always
	# treated as a single file name
	grep "$1" "$LOG"
}
0287 
# Poll the server log until the server reports that it has started.
# Globals: DRYRUN, TEST_FN, STARTED_INDICATOR, SEGFAULT_INDICATOR, WTIME (read)
#          COUNT, PORT_EXIT (written)
# Returns: 0 once the server is ready (immediately in a dry run);
#          1 if a port conflict was seen and handle_ports succeeded, so the
#          caller should retry
# Exits:   with handle_ports' status if it failed after a port conflict, or
#          with 1 after more than WTIME polls; the server is stopped first
wait_server(){
	# nothing was started in a dry run, so there is nothing to wait for
	[ -z "$DRYRUN" ] || return 0

	COUNT=0
	until $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
		if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
			handle_ports
			PORT_EXIT=$?
			# port exit is zero if it found a new, good port; so retry
			[ "$PORT_EXIT" -eq 0 ] && return 1
			auto_stop true
			exit $PORT_EXIT
		fi

		if [ "$COUNT" -gt "$WTIME" ]; then
			echo "timed out waiting for server to start"
			auto_stop true
			exit 1
		fi

		COUNT=$((COUNT + 1))
		sleep 1
	done

	echo "server is ready!"
}
0317 
# Build the final list of model repositories.
# Every entry of MODELS contributes the directory above the model directory;
# all repositories are then resolved to canonical paths and de-duplicated.
# Globals: MODELS (read), REPOS (read/written: replaced by the sorted,
#          unique list of resolved paths)
list_models(){
	local model repo resolved
	local -a resolved_repos=()

	# make list of model repositories
	for model in "${MODELS[@]}"; do
		# check if file was provided rather than directory
		if [ -f "$model" ]; then
			model="$(dirname "$model")"
		fi
		REPOS+=("$(dirname "$model")")
	done

	for repo in "${REPOS[@]}"; do
		# avoid issues w/ multiple levels of symlinks
		resolved=$(readlink -f "$repo")
		# readlink prints nothing for a path it cannot resolve: drop those
		if [ -n "$resolved" ]; then
			resolved_repos+=("$resolved")
		fi
	done

	# make unique list; one path per line keeps embedded spaces intact
	REPOS=()
	if [ "${#resolved_repos[@]}" -gt 0 ]; then
		mapfile -t REPOS < <(printf '%s\n' "${resolved_repos[@]}" | sort -u)
	fi
}
0334 
# Stop the server, optionally only after a watched process has gone away,
# then collect the logs and remove the temporary directory.
# Arguments: $1 - if non-empty, turn verbosity on for this call
#            $2 - PID to watch; if empty the server is stopped right away
# Globals:   VERBOSE, PARENTPID, PCOUNTER, PMAX (written)
#            VERBOSE_ARGS, STOP_FN, DRYRUN, LOG, STOPLOG, TOPDIR, TMPDIR,
#            CLEANUP (read)
auto_stop(){
	# allow enabling verbosity here even if disabled at top level
	# but otherwise use top-level setting
	[ -z "$1" ] || VERBOSE="$VERBOSE_ARGS"
	PARENTPID="$2"

	if [ -n "$PARENTPID" ]; then
		if [ -n "$VERBOSE" ]; then
			echo "watching PID $PARENTPID"
			ps
		fi
		PMAX=5
		PCOUNTER=0
		# poll once per second until the PID has been missing on more than
		# PMAX consecutive polls
		until [ "$PCOUNTER" -gt "$PMAX" ]; do
			if kill -0 $PARENTPID >& /dev/null; then
				# must get N in a row, otherwise reset
				if [ "$PCOUNTER" -gt 0 ] && [ -n "$VERBOSE" ]; then
					echo "reset:"
					ps
				fi
				PCOUNTER=0
			else
				PCOUNTER=$((PCOUNTER+1))
				if [ -n "$VERBOSE" ]; then
					echo "trigger $PCOUNTER:"
					ps
				fi
			fi
			sleep 1
		done
	fi
	$STOP_FN

	# move logs out of tmp dir (real runs with verbosity only)
	if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
		mv "$LOG" "$TOPDIR"
		# only keep non-empty log
		if [ -s "$STOPLOG" ]; then
			mv "$STOPLOG" "$TOPDIR"
		fi
	fi

	if [ -n "$CLEANUP" ]; then
		# leave the tmp dir before deleting it
		$DRYRUN cd "$TOPDIR"
		$DRYRUN rm -rf "$TMPDIR"
	fi
}
0384 
# Create the temporary working directory and change into it.
# Globals: DRYRUN, TMPDIR (read); MKDIR_EXIT (written)
# Exits:   with mkdir's status if the directory could not be created
make_tmp(){
	# make sure everything happens in tmp dir
	$DRYRUN mkdir "$TMPDIR"
	MKDIR_EXIT=$?
	[ "$MKDIR_EXIT" -eq 0 ] || {
		echo "Could not create temp dir: $TMPDIR"
		exit "$MKDIR_EXIT"
	}
	$DRYRUN cd "$TMPDIR"
}
0395 
# Query a tag of a scram tool from inside the CMSSW area.
# Globals:   CMSSW_BASE (read) - directory to run scram from
# Arguments: $1 - tool name, $2 - tag name
# Outputs:   whatever `scram tool tag` prints on stdout (its stderr is dropped)
# Returns:   always 0; a failed lookup simply produces no output
# Note:      changes the current directory, so call it in a subshell,
#            e.g. via command substitution
scram_tag(){
	local tool="$1"
	local tag="$2"
	# do not run scram from some other directory if CMSSW_BASE is unusable
	cd "$CMSSW_BASE" || return 0
	scram tool tag "$tool" "$tag" 2> /dev/null || true
}
0402 
# Print the CUDA driver version recorded in the Docker image's environment.
# Globals: IMAGE (read)
# Outputs: the value of CUDA_DRIVER_VERSION (without the "NAME=" prefix)
# Returns: non-zero if the image environment has no such variable
driver_docker(){
	local line
	# anchored match, first hit only: other variables whose name merely
	# contains CUDA_DRIVER_VERSION must not be picked up
	line=$(docker run --rm --entrypoint env "${IMAGE}" | grep -m1 '^CUDA_DRIVER_VERSION=') || return
	# print just the version
	printf '%s\n' "${line#CUDA_DRIVER_VERSION=}"
}
0406 
# Print the CUDA driver version declared by the sandbox's environment file.
# Globals: SANDBOX (read)
# Outputs: the value of CUDA_DRIVER_VERSION after sourcing the file
# Returns: non-zero if the environment file could not be sourced
# Note:    sources the file into the current shell; call it in a subshell
#          (e.g. command substitution) to keep the caller's environment clean
driver_singularity(){
	source "${SANDBOX}/.singularity.d/env/10-docker2singularity.sh" && printf '%s\n' "$CUDA_DRIVER_VERSION"
}
0410 
# Copy the compatibility script out of the Docker image into the current
# directory.
# Globals: IMAGE, COMPAT_SCRIPT (read)
# Returns: status of `docker create` if it fails, otherwise of `docker cp`
compat_docker(){
	local container status
	container=$(docker create --rm "${IMAGE}") || return
	docker cp "${container}:${COMPAT_SCRIPT}" .
	status=$?
	# the container is only created, never started, so it has to be removed
	# explicitly or it would be left behind
	docker rm "$container" > /dev/null
	return "$status"
}
0414 
# Copy the compatibility script out of the sandbox into the current directory.
# Globals: SANDBOX, COMPAT_SCRIPT (read)
# Returns: exit status of cp
compat_singularity(){
	# quoted so that a sandbox path containing whitespace stays one argument
	cp "${SANDBOX}/${COMPAT_SCRIPT}" .
}
0418 
# Decide whether the host Nvidia driver is recent enough for the image and,
# if it is not, look for a directory of compatibility drivers to mount.
# Globals: DRIVER_FN, COMPAT_FN, COMPAT_USR, CMSSW_BASE, COMPAT_SCRIPT,
#          TMPDIR, DRYRUN (read)
#          CUDA_DRIVER_VERSION, DRIVER_VERSION, COMPAT (exported), CUDA_BASE,
#          COMPAT_CMSSW, COMPAT_SYS, COMPAT_SCRIPT_BASE,
#          COMPAT_SCRIPT_MOUNT (written)
# Returns: 0 if the host driver suffices or compatibility drivers were found,
#          1 otherwise
check_drivers(){
	# get sandbox env vars in subshell
	CUDA_DRIVER_VERSION=$($DRIVER_FN)
	# copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh
	DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
	# compare the leading (major) version components only
	[[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]] && return 0

	export COMPAT=""

	# 1. check for user-specified compatibility drivers
	if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then
		COMPAT="$COMPAT_USR"
	fi

	# 2. check for compatibility drivers from CMSSW if available
	# based on $CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers
	# but need to check drivers, not runtime
	if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then
		CUDA_BASE=$(scram_tag cuda CUDA_BASE)
		COMPAT_CMSSW=${CUDA_BASE}/drivers

		if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then
			COMPAT="$COMPAT_CMSSW"
		fi
	fi

	# 3. finally, check expected system location
	COMPAT_SYS=/usr/local/cuda/compat
	if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then
		COMPAT="$COMPAT_SYS"
	fi

	# none of the three locations exists: the server cannot use this system
	if [ -z "$COMPAT" ]; then
		return 1
	fi

	# in order to mount drivers where Triton expects them
	export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"
	# workaround for bug in Triton/Nvidia compatibility check
	# (only when running inside the tmp dir, where the script copy is made)
	if [ "$PWD" == "$TMPDIR" ]; then
		$DRYRUN $COMPAT_FN
		COMPAT_SCRIPT_BASE=$(basename $COMPAT_SCRIPT)
		$DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' $COMPAT_SCRIPT_BASE
		COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_BASE:$COMPAT_SCRIPT"
	fi
	return 0
}
0468 
# bind the backend-specific helper functions (Singularity unless -d was given)
# and, when a GPU was requested, the backend's flag for enabling it
if [ -z "$USEDOCKER" ]; then
	PROG_NAME=Singularity
	START_FN=start_singularity
	TEST_FN=test_singularity
	STOP_FN=stop_singularity
	DRIVER_FN=driver_singularity
	COMPAT_FN=compat_singularity
	if [ -n "$GPU" ]; then EXTRA="--nv"; fi
else
	PROG_NAME=Docker
	START_FN=start_docker
	TEST_FN=test_docker
	STOP_FN=stop_docker
	DRIVER_FN=driver_docker
	COMPAT_FN=compat_docker
	if [ -n "$GPU" ]; then EXTRA="--gpus all"; fi
fi
0490 
# dispatch on the requested operation
case "$OP" in
check)
	# only report whether the drivers allow the server to run here
	check_drivers
	exit $?
	;;
start)
	# handle cleaning up
	if [ -n "$FORCE" ]; then
		auto_stop
	elif [ -d "$TMPDIR" ]; then
		echo "Error: this container may already exist (override with -f)"
		exit 1
	fi

	handle_ports
	PORT_EXIT=$?
	if [ "$PORT_EXIT" -ne 0 ]; then exit $PORT_EXIT; fi

	list_models

	make_tmp

	# after make_tmp because this may create file in tmp dir
	check_drivers
	DRIVER_EXIT=$?
	if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi

	# if parent PID is provided, automatically stop server when finished
	# do this before actually trying to start the server in case of ctrl+c
	if [ -n "$PARENTPID" ]; then
		auto_stop "" "$PARENTPID" >& "$STOPLOG" &
	fi

	START_EXIT=0
	for ((counter=0; counter < ${RETRIES}; counter++)); do
		# a failed attempt ended in auto_stop, so the tmp dir has to be
		# set up again before retrying
		if [ "$START_EXIT" -ne 0 ]; then make_tmp; fi

		$START_FN
		START_EXIT=$?
		if [ "$START_EXIT" -eq 0 ]; then
			wait_server
			WAIT_EXIT=$?
			[ "$WAIT_EXIT" -eq 0 ] && break
			# allow to keep retrying if there was a port issue
			counter=$((counter-1))
			START_EXIT="$WAIT_EXIT"
		fi

		if [ "$START_EXIT" -ne 0 ]; then
			auto_stop
			echo "Retrying after container issue..."
		fi
	done
	if [ "$START_EXIT" -ne 0 ]; then
		echo "Error from $PROG_NAME"
		exit "$START_EXIT"
	fi
	;;
*)
	# stop: check for tmp dir
	if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
		$DRYRUN cd "$TMPDIR"
	elif [ -z "$FORCE" ]; then
		echo "Error: attempt to stop unknown container $SERVER"
		exit 1
	fi

	auto_stop
	;;
esac