Back to home page

Project CMSSW displayed by LXR

 
 

    


File indexing completed on 2023-01-31 03:06:34

#!/bin/bash

# ---- default settings ----

# container runtime, hardware and image selection
USEDOCKER=
GPU=
IMAGE=fastml/triton-torchgeo:21.06-py3-geometric
SANDBOX=
COMPAT_USR=

# server instance name, startup timeout (seconds) and number of start attempts
SERVER=triton_server_instance
WTIME=300
RETRIES=3

# logging: VERBOSE stays empty unless -v copies VERBOSE_ARGS into it
VERBOSE=
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"

# model repositories (-M) and individual model directories (-m)
REPOS=()
MODELS=()

# working directory handling and run modes
FORCE=
CLEANUP=true
TMPDIR=
DRYRUN=
PARENTPID=

# ports: NPORTS consecutive ports are used, starting at BASEPORT
BASEPORT=8000
AUTOPORT=
NPORTS=3
0024 
# Print the Apptainer sandbox directory to use.
# Globals:
#   SANDBOX (read) - user-specified sandbox directory, may be empty
#   IMAGE   (read) - image name, appended to the cvmfs prefix for the default
# Outputs:
#   the sandbox path on stdout, followed by a newline
get_sandbox(){
	# printf with a quoted argument prints the value verbatim: no word
	# splitting, no globbing, and no misreading of a value as an echo option
	if [ -z "$SANDBOX" ]; then
		printf '%s\n' "/cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}"
	else
		printf '%s\n' "$SANDBOX"
	fi
}
0032 
# Print the help text and exit.
# Arguments:
#   $1 - exit status to terminate the script with
# Outputs:
#   the help text on stdout
usage() {
	local line
	# one entry per output line; "\t" is turned into a tab by "echo -e" below
	local -a HELP_LINES=(
		"cmsTriton [options] [start|stop]"
		""
		"Options:"
		"-c          \t don't cleanup temporary dir (for debugging)"
		"-C [dir]    \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
		"-D          \t dry run: print container commands rather than executing them"
		"-d          \t use Docker instead of Apptainer"
		"-f          \t force reuse of (possibly) existing container instance"
		"-g          \t use GPU instead of CPU"
		"-i [name]   \t server image name (default: ${IMAGE})"
		"-M [dir]    \t model repository (can be given more than once)"
		"-m [dir]    \t specific model directory (can be given more than once)"
		"-n [name]   \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
		"-P [port]   \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
		"-p [pid]    \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
		"-r [num]    \t number of retries when starting container (default: ${RETRIES})"
		"-s [dir]    \t Apptainer sandbox directory (default: $(get_sandbox))"
		"-t [dir]    \t non-default hidden temporary dir"
		"-v          \t (verbose) start: activate server debugging info; stop: keep server logs"
		"-w [time]   \t maximum time to wait for server to start (default: ${WTIME} seconds)"
		"-h          \t print this message and exit"
		""
		"Operations:"
		"start       \t start server"
		"stop        \t stop server"
		"check       \t check if server can run on this system"
	)
	for line in "${HELP_LINES[@]}"; do
		echo -e "$line"
	done
	exit $1
}
0063 
# pick the host shared-memory directory: /run/shm when it exists, /dev/shm otherwise
if [ -e /run/shm ]; then
	SHM=/run/shm
else
	SHM=/dev/shm
fi
0069 
# parse command-line options; the usage function describes each of them
while getopts "cC:Ddfgi:M:m:n:P:p:r:s:t:vw:h" opt; do
	case "$opt" in
		c) CLEANUP=""
		;;
		C) COMPAT_USR="$OPTARG"
		;;
		# dry run: commands are prefixed with "echo" instead of being executed
		D) DRYRUN=echo
		;;
		d) USEDOCKER=true
		;;
		f) FORCE=true
		;;
		g) GPU=true
		;;
		i) IMAGE="$OPTARG"
		;;
		M) REPOS+=("$OPTARG")
		;;
		m) MODELS+=("$OPTARG")
		;;
		n) SERVER="$OPTARG"
		;;
		# -1 requests an automatic search for free ports; compared as a string
		# so that a non-numeric argument does not make "[" print an error
		P) if [ "$OPTARG" = "-1" ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi
		;;
		# -1 means: watch the process that launched this script
		p) if [ "$OPTARG" = "-1" ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi
		;;
		r) RETRIES="$OPTARG"
		;;
		s) SANDBOX="$OPTARG"
		;;
		t) TMPDIR="$OPTARG"
		;;
		v) VERBOSE="$VERBOSE_ARGS"
		;;
		w) WTIME="$OPTARG"
		;;
		h) usage 0
		;;
		# getopts reports unknown options and missing option arguments as "?";
		# stop with the help text instead of continuing with a partial setup
		*) usage 1
		;;
	esac
done
0110 
# drop the parsed options; the first remaining argument is the operation
shift $((OPTIND - 1))
OP=$1

# only start, stop and check are valid operations
if [ "$OP" != start ] && [ "$OP" != stop ] && [ "$OP" != check ]; then
	usage 1
fi

# always make at least one start attempt
if [ "$RETRIES" -le 0 ]; then
	RETRIES=1
fi

# the temporary dir defaults to a hidden directory named after the server,
# located in the directory the script was called from
TOPDIR=$PWD
if [ -z "$TMPDIR" ]; then
	TMPDIR="${TOPDIR}/.${SERVER}"
else
	# quoted so that a user-supplied path containing whitespace stays one argument
	TMPDIR=$(readlink -f "$TMPDIR")
fi
0128 
# find executables
# DOCKER and APPTAINER are left untouched when they already hold a value
if [ -n "$USEDOCKER" ]; then
	: "${DOCKER:=sudo docker}"
elif [ -z "$APPTAINER" ]; then
	# prefer apptainer, fall back to singularity
	if type apptainer >& /dev/null; then
		APPTAINER=apptainer
	elif type singularity >& /dev/null; then
		APPTAINER=singularity
	else
		echo "Missing apptainer and singularity"
		exit 1
	fi
fi
0146 
0147 
# resolve the sandbox location to a canonical path
SANDBOX=$(get_sandbox)
# quoted so that a sandbox path containing whitespace stays one argument
SANDBOX=$(readlink -f "${SANDBOX}")
# server log and log of the background stop watcher
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"
# local directory bound over /opt/tritonserver/lib in the Apptainer container
LIB=lib
# log lines searched for while waiting for the server
STARTED_INDICATOR="Started GRPCInferenceService"
# despite the variable name, this text is treated as a port conflict by wait_server
SEGFAULT_INDICATOR="Address already in use"
# extra container options, filled in when GPU use is requested
EXTRA=""
# script inside the image that is copied and patched by check_drivers
COMPAT_SCRIPT=/etc/shinit_v2
0157 
# Derive the three service ports from BASEPORT and export them.
# Globals:
#   BASEPORT (read); HTTPPORT, GRPCPORT, METRPORT (written, exported)
compute_ports(){
	# the ports are consecutive: HTTP, then gRPC, then metrics
	HTTPPORT=$BASEPORT
	GRPCPORT=$((BASEPORT + 1))
	METRPORT=$((BASEPORT + 2))
	export HTTPPORT GRPCPORT METRPORT
}
0164 
# Test whether a TCP port is free.
# Arguments:
#   $1 - port number
# Returns:
#   0 if the port is free, 1 if it is in use
check_port(){
	# opening /dev/tcp/<host>/<port> only succeeds when the port is in use,
	# so a failed redirection means the port is free
	if ! 2>/dev/null >"/dev/tcp/0.0.0.0/$1"; then
		return 0
	fi
	return 1
}
0173 
# Test whether all three service ports are free.
# Globals:
#   HTTPPORT, GRPCPORT, METRPORT (read)
# Returns:
#   0 if every port is free, 1 as soon as one of them is in use
check_ports(){
	local port
	for port in "$HTTPPORT" "$GRPCPORT" "$METRPORT"; do
		# stop at the first busy port; later ports are not probed
		check_port "$port" || return 1
	done
	return 0
}
0181 
# Advance BASEPORT in steps of NPORTS until check_ports succeeds, then report the gRPC port.
# Globals:
#   BASEPORT (read, written), NPORTS (read), GRPCPORT (read)
# Outputs:
#   "CMS_TRITON_GRPC_PORT: <port>" on stdout
find_ports(){
	until check_ports; do
		BASEPORT=$((BASEPORT + NPORTS))
		# refresh the derived port numbers for the new base
		compute_ports
	done
	echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0189 
# Decide which ports the server uses and export the matching server arguments.
# Globals:
#   AUTOPORT (read), HTTPPORT/GRPCPORT/METRPORT (read), PORTARGS (written, exported)
# Returns:
#   1 if fixed ports were requested and one of them is in use
handle_ports(){
	compute_ports
	if [ -z "$AUTOPORT" ]; then
		# fixed ports: they must all be free
		if ! check_ports; then
			echo "Error: requested port in use"
			return 1
		fi
	else
		# automatic mode: search for a free range
		find_ports
	fi
	export PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
}
0201 
# Start the Triton server in a detached Docker container.
# Globals (read):
#   REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, DRYRUN, DOCKER, SERVER, IMAGE, EXTRA,
#   HTTPPORT, GRPCPORT, METRPORT, PORTARGS, VERBOSE
# Returns:
#   the exit status of the (possibly dry-run) docker command
start_docker(){
	local repo
	# arrays keep every mount/repository argument intact, even with whitespace in a path
	local -a mount_args=() repo_args=()

	# mount all model repositories at the same path inside the container
	for repo in "${REPOS[@]}"; do
		mount_args+=("-v${repo}:${repo}")
		repo_args+=("--model-repository=${repo}")
	done

	# compatibility driver environment
	if [ -n "$COMPAT" ]; then
		mount_args+=("-v${COMPAT}")
		if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
			mount_args+=("-v${COMPAT_SCRIPT_MOUNT}")
		fi
	fi

	# DRYRUN, DOCKER, EXTRA, PORTARGS and VERBOSE hold several words each
	# and are split on purpose
	$DRYRUN $DOCKER run -d --name "${SERVER}" \
		--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \
		-p"${HTTPPORT}:${HTTPPORT}" -p"${GRPCPORT}:${GRPCPORT}" -p"${METRPORT}:${METRPORT}" $EXTRA "${mount_args[@]}" \
		"${IMAGE}" tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE
}
0224 
# Start an Apptainer instance and launch the Triton server inside it.
# Globals (read):
#   REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, DRYRUN, APPTAINER, SANDBOX, SERVER,
#   SHM, LIB, EXTRA, PORTARGS, VERBOSE, LOG
# Globals (written):
#   START_EXIT, REDIR, TMPD, PATH (the last two only in the nested-container workaround)
# Returns:
#   non-zero if the instance could not be started
start_apptainer(){
	local repo
	# arrays keep every bind/repository argument intact, even with whitespace in a path
	local -a mount_args=() repo_args=()

	# triton server image may need to modify contents of opt/tritonserver/lib/
	# but cvmfs is read-only
	# -> make a writable local directory with the same contents
	# this is no longer needed as of triton 2.11.0, but kept for compatibility w/ older server versions
	$DRYRUN mkdir "${LIB}"
	# the glob stays outside the quotes so that it still expands
	$DRYRUN ln -s "${SANDBOX}"/opt/tritonserver/lib/* "${LIB}/"

	# mount all model repositories
	for repo in "${REPOS[@]}"; do
		mount_args+=(-B "$repo")
		repo_args+=("--model-repository=${repo}")
	done

	# compatibility driver environment
	if [ -n "$COMPAT" ]; then
		mount_args+=(-B "$COMPAT")
		if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
			mount_args+=(-B "$COMPAT_SCRIPT_MOUNT")
		fi
	fi

	# workaround for nvidia libs w/ singularity-in-singularity
	# from https://github.com/hpcng/singularity/issues/5759#issuecomment-919523970
	if [ -d /.singularity.d/libs ]; then
		TMPD=$(mktemp -d)
		# wrapper that makes ldconfig use a cache file inside the temporary directory
		(echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > "$TMPD/ldconfig"
		chmod +x "$TMPD/ldconfig"
		PATH=$TMPD:$PATH
		# this does not work with LD_LIBRARY_PATH from cmsenv
		ldconfig /.singularity.d/libs
	fi

	# start instance
	# need to bind /cvmfs for above symlinks to work inside container
	# DRYRUN, APPTAINER and EXTRA may hold several words and are split on purpose
	$DRYRUN $APPTAINER instance start \
		-B "${SHM}:/run/shm" -B "${LIB}:/opt/tritonserver/lib" -B "${SANDBOX}" "${mount_args[@]}" $EXTRA \
		"${SANDBOX}" "${SERVER}"

	START_EXIT=$?
	if [ "$START_EXIT" -ne 0 ]; then
		# remove the local lib directory again when the instance did not start
		rm -rf "${LIB}"
		return "$START_EXIT"
	fi

	# run the actual server
	# its output goes to the log file, or to stdout in a dry run
	if [ -z "$DRYRUN" ]; then
		REDIR="$LOG"
	else
		REDIR=/dev/stdout
	fi
	$DRYRUN $APPTAINER run "instance://${SERVER}" \
		tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE >& "${REDIR}" &
	# in a dry run, wait for the backgrounded echo so that its output is complete
	[ -z "$DRYRUN" ] || wait
}
0282 
# Stop and remove the Docker container of the server.
# Globals (read):
#   DRYRUN, VERBOSE, DOCKER, SERVER, LOG
stop_docker(){
	# keep log: in verbose mode the container log is saved before the
	# container is removed (never during a dry run)
	if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
		$DOCKER logs "${SERVER}" >& "$LOG"
	fi

	# DRYRUN and DOCKER may hold several words and are split on purpose;
	# the container name is passed as exactly one argument
	$DRYRUN $DOCKER stop "${SERVER}"
	$DRYRUN $DOCKER rm "${SERVER}"
}
0292 
# Stop the Apptainer instance of the server.
# Globals (read):
#   DRYRUN, APPTAINER, SERVER
stop_apptainer(){
	# DRYRUN and APPTAINER are split on purpose; the instance name is one argument
	$DRYRUN $APPTAINER instance stop "${SERVER}"
}
0296 
# Search the Docker container log for a pattern.
# Arguments:
#   $1 - grep pattern
# Returns:
#   the status of grep (0 when the pattern was found)
test_docker(){
	# docker logs print to stderr, so stderr is merged into the pipe;
	# "--" keeps a pattern starting with "-" from being read as a grep option
	${DOCKER} logs "${SERVER}" 2>&1 | grep -- "$1"
}
0301 
# Search the server log file for a pattern.
# Arguments:
#   $1 - grep pattern
# Globals (read):
#   LOG - path of the log file
# Returns:
#   the status of grep (0 when the pattern was found)
test_apptainer(){
	# the log name is derived from the server name, so it is quoted;
	# "--" keeps a pattern starting with "-" from being read as a grep option
	grep -- "$1" "$LOG"
}
0305 
# Wait until the server log shows that the server has started.
# Globals (read):
#   DRYRUN, TEST_FN, STARTED_INDICATOR, SEGFAULT_INDICATOR, WTIME
# Globals (written):
#   COUNT, PORT_EXIT
# Returns:
#   0 once the server is ready (or immediately in a dry run),
#   1 if a port conflict was seen and handle_ports succeeded, so the caller may retry.
#   Exits the script if handle_ports fails or the wait times out.
wait_server(){
	# nothing was started in a dry run
	[ -z "$DRYRUN" ] || return 0

	COUNT=0
	until $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
		if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
			handle_ports
			PORT_EXIT=$?
			# port exit is zero if it found a new, good port; so retry
			if [ "$PORT_EXIT" -eq 0 ]; then
				return 1
			fi
			auto_stop true
			exit $PORT_EXIT
		fi

		if [ "$COUNT" -gt "$WTIME" ]; then
			echo "timed out waiting for server to start"
			auto_stop true
			exit 1
		fi

		# poll once per second
		COUNT=$(($COUNT + 1))
		sleep 1
	done

	echo "server is ready!"
}
0335 
# Build the final, de-duplicated list of model repositories.
# Globals:
#   MODELS (read)          - model directories, or files inside a model directory
#   REPOS  (read, written) - repository directories; replaced by the canonical,
#                            unique list
list_models(){
	local model repo resolved
	local -a resolved_repos=()

	# make list of model repositories
	for model in "${MODELS[@]}"; do
		# check if file was provided rather than directory
		if [ -f "$model" ]; then
			model="$(dirname "$model")"
		fi
		# the repository is the directory that contains the model directory
		REPOS+=("$(dirname "$model")")
	done

	for repo in "${REPOS[@]}"; do
		# avoid issues w/ multiple levels of symlinks
		resolved=$(readlink -f "$repo")
		# entries that cannot be resolved are dropped
		if [ -n "$resolved" ]; then
			resolved_repos+=("$resolved")
		fi
	done

	# make unique list; one path per line, so whitespace inside a path survives
	REPOS=()
	if [ "${#resolved_repos[@]}" -gt 0 ]; then
		mapfile -t REPOS < <(printf '%s\n' "${resolved_repos[@]}" | sort -u)
	fi
}
0352 
# Stop the server, optionally only after a watched process has ended, then tidy up.
# Arguments:
#   $1 - non-empty: switch on verbose mode for this call
#   $2 - PID to watch; when given, the server is stopped only once that process is gone
# Globals (read):
#   VERBOSE_ARGS, STOP_FN, DRYRUN, LOG, STOPLOG, TOPDIR, TMPDIR, CLEANUP
# Globals (written):
#   VERBOSE, PARENTPID, PCOUNTER, PMAX
auto_stop(){
	# allow enabling verbosity here even if disabled at top level
	# but otherwise use top-level setting
	[ -z "$1" ] || VERBOSE="$VERBOSE_ARGS"
	PARENTPID="$2"

	if [ -n "$PARENTPID" ]; then
		if [ -n "$VERBOSE" ]; then
			echo "watching PID $PARENTPID"
			ps
		fi
		PMAX=5
		PCOUNTER=0
		# the loop ends once PCOUNTER exceeds PMAX
		until [ "$PCOUNTER" -gt "$PMAX" ]; do
			if kill -0 $PARENTPID >& /dev/null; then
				# must get N in a row, otherwise reset
				if [ "$PCOUNTER" -gt 0 ] && [ -n "$VERBOSE" ]; then
					echo "reset:"
					ps
				fi
				PCOUNTER=0
			else
				# the process did not answer the signal-0 probe
				PCOUNTER=$((PCOUNTER+1))
				if [ -n "$VERBOSE" ]; then
					echo "trigger $PCOUNTER:"
					ps
				fi
			fi
			sleep 1
		done
	fi
	$STOP_FN

	# move logs out of tmp dir
	if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
		mv "$LOG" "$TOPDIR"
		# only keep non-empty log
		if [ -s "$STOPLOG" ]; then
			mv "$STOPLOG" "$TOPDIR"
		fi
	fi

	# leave the temporary dir before deleting it
	if [ -n "$CLEANUP" ]; then
		$DRYRUN cd "$TOPDIR"
		$DRYRUN rm -rf "$TMPDIR"
	fi
}
0402 
# Create the temporary working directory and change into it.
# Globals:
#   DRYRUN, TMPDIR (read); MKDIR_EXIT (written)
# Exits the script with mkdir's status if the directory cannot be created.
make_tmp(){
	# make sure everything happens in tmp dir
	$DRYRUN mkdir "$TMPDIR"
	MKDIR_EXIT=$?
	[ "$MKDIR_EXIT" -eq 0 ] || {
		echo "Could not create temp dir: $TMPDIR"
		exit "$MKDIR_EXIT"
	}
	$DRYRUN cd "$TMPDIR"
}
0413 
# Print one tag of a scram tool, queried from inside the CMSSW area.
# Arguments:
#   $1 - tool name
#   $2 - tag name
# Globals:
#   CMSSW_BASE (read); TOOL, TAG (written)
# Outputs:
#   whatever "scram tool tag" prints on stdout; its stderr is discarded
# Returns:
#   always 0 (best effort: an empty output means "not available")
# Note:
#   changes the working directory; check_drivers calls it inside $(...), so
#   the change stays in that subshell
scram_tag(){
	# quoted so that an area path containing whitespace works; without the
	# area there is nothing to query, so give up quietly with empty output
	cd "$CMSSW_BASE" || return 0
	TOOL="$1"
	TAG="$2"
	scram tool tag "$TOOL" "$TAG" 2> /dev/null || true
}
0420 
# Print the CUDA driver version that the Docker image declares in its environment.
# Globals (read):
#   DOCKER, IMAGE
# Outputs:
#   the bare value of CUDA_DRIVER_VERSION (same form as driver_apptainer prints)
# Returns:
#   non-zero if the image environment has no CUDA_DRIVER_VERSION entry
driver_docker(){
	# match the variable name exactly, so that variables merely containing
	# the text are ignored, and print only what follows the first "="
	$DOCKER run --rm --entrypoint env "${IMAGE}" \
		| awk -F= '$1 == "CUDA_DRIVER_VERSION" { print substr($0, index($0, "=") + 1); found = 1 } END { exit !found }'
}
0424 
# Print the CUDA driver version recorded in the sandbox environment script.
# Globals (read):
#   SANDBOX
# Outputs:
#   the value of CUDA_DRIVER_VERSION after sourcing the script
# Note:
#   sources the script into the current shell; check_drivers calls this
#   inside $(...), which confines the sourced variables to that subshell
driver_apptainer(){
	# quoted so that a sandbox path containing whitespace works
	source "${SANDBOX}/.singularity.d/env/10-docker2singularity.sh" && echo "$CUDA_DRIVER_VERSION"
}
0428 
# Copy the compatibility script out of the Docker image into the current directory.
# Globals (read):
#   DOCKER, IMAGE, COMPAT_SCRIPT
# Returns:
#   the status of "docker cp", or of "docker create" if that failed
compat_docker(){
	local cid status
	# files can only be copied from a container, so create one without starting it
	cid=$($DOCKER create --rm "${IMAGE}") || return
	$DOCKER cp "${cid}:${COMPAT_SCRIPT}" .
	status=$?
	# --rm only takes effect when a container exits; this one never runs,
	# so remove it explicitly to avoid leaving it behind
	$DOCKER rm "$cid" > /dev/null
	return "$status"
}
0432 
# Copy the compatibility script out of the sandbox into the current directory.
# Globals (read):
#   SANDBOX, COMPAT_SCRIPT
# Returns:
#   the status of cp
compat_apptainer(){
	# quoted so that a sandbox path containing whitespace works
	cp "${SANDBOX}/${COMPAT_SCRIPT}" .
}
0436 
# Check whether the host Nvidia driver is recent enough for the image and,
# if not, look for compatibility drivers to mount into the container.
# Globals (read):
#   DRIVER_FN, COMPAT_FN, COMPAT_USR, CMSSW_BASE, COMPAT_SCRIPT, TMPDIR, DRYRUN
# Globals (written):
#   CUDA_DRIVER_VERSION, DRIVER_VERSION, COMPAT (exported), COMPAT_SYS,
#   CUDA_BASE, COMPAT_CMSSW, COMPAT_SCRIPT_BASE, COMPAT_SCRIPT_MOUNT
# Returns:
#   0 if the host driver suffices or compatibility drivers were found, 1 otherwise
check_drivers(){
	# get sandbox env vars in subshell
	CUDA_DRIVER_VERSION=$($DRIVER_FN)
	# copied from https://github.com/triton-inference-server/server/blob/v2.11.0/nvidia_entrypoint.sh
	DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
	# only the part before the first dot is compared
	if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then
		return 0
	fi

	export COMPAT=""
	COMPAT_SYS=/usr/local/cuda/compat

	# 1. check for user-specified compatibility drivers
	if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then
		COMPAT="$COMPAT_USR"
	fi

	# 2. check for compatibility drivers from CMSSW if available
	# based on $CMSSW_BASE/config/SCRAM/hooks/runtime/00-nvidia-drivers
	# but need to check drivers, not runtime
	if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then
		CUDA_BASE=$(scram_tag cuda CUDA_BASE)
		COMPAT_CMSSW=${CUDA_BASE}/drivers

		if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then
			COMPAT="$COMPAT_CMSSW"
		fi
	fi

	# 3. finally, check expected system location
	if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then
		COMPAT="$COMPAT_SYS"
	fi

	# nothing usable was found
	if [ -z "$COMPAT" ]; then
		return 1
	fi

	# in order to mount drivers where Triton expects them
	export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"
	# workaround for bug in Triton/Nvidia compatibility check
	# (only done when the current directory is the temporary dir, where the copy is placed)
	if [ "$PWD" == "$TMPDIR" ]; then
		$DRYRUN $COMPAT_FN
		COMPAT_SCRIPT_BASE=$(basename $COMPAT_SCRIPT)
		$DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' $COMPAT_SCRIPT_BASE
		COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_BASE:$COMPAT_SCRIPT"
	fi
	return 0
}
0486 
# select the backend-specific functions, GPU option and display name
if [ -z "$USEDOCKER" ]; then
	# Apptainer (default)
	PROG_NAME=Apptainer
	START_FN=start_apptainer
	STOP_FN=stop_apptainer
	TEST_FN=test_apptainer
	DRIVER_FN=driver_apptainer
	COMPAT_FN=compat_apptainer
	if [ -n "$GPU" ]; then
		EXTRA="--nv"
	fi
else
	# Docker
	PROG_NAME=Docker
	START_FN=start_docker
	STOP_FN=stop_docker
	TEST_FN=test_docker
	DRIVER_FN=driver_docker
	COMPAT_FN=compat_docker
	if [ -n "$GPU" ]; then
		EXTRA="--gpus all"
	fi
fi
0508 
# run the requested operation
case "$OP" in
check)
	# report through the exit status whether the server can run here
	check_drivers
	exit $?
	;;
start)
	# handle cleaning up
	if [ -n "$FORCE" ]; then
		auto_stop
	elif [ -d "$TMPDIR" ]; then
		echo "Error: this container may already exist (override with -f)"
		exit 1
	fi

	handle_ports
	PORT_EXIT=$?
	if [ "$PORT_EXIT" -ne 0 ]; then exit $PORT_EXIT; fi

	list_models

	make_tmp

	# after make_tmp because this may create file in tmp dir
	check_drivers
	DRIVER_EXIT=$?
	if [ "$DRIVER_EXIT" -ne 0 ]; then exit $DRIVER_EXIT; fi

	# if parent PID is provided, automatically stop server when finished
	# do this before actually trying to start the server in case of ctrl+c
	if [ -n "$PARENTPID" ]; then
		# background watcher; its output is collected in the stop log
		auto_stop "" "$PARENTPID" >& "$STOPLOG" &
	fi

	START_EXIT=0
	counter=0
	while (( counter < RETRIES )); do
		# a failed attempt removed the tmp dir (via auto_stop), so recreate it
		if [ "$START_EXIT" -ne 0 ]; then make_tmp; fi

		$START_FN
		START_EXIT=$?
		if [ "$START_EXIT" -eq 0 ]; then
			wait_server
			WAIT_EXIT=$?
			if [ "$WAIT_EXIT" -eq 0 ]; then
				break
			fi
			# allow to keep retrying if there was a port issue
			counter=$((counter-1))
			START_EXIT="$WAIT_EXIT"
		fi

		if [ "$START_EXIT" -ne 0 ]; then
			auto_stop
			echo "Retrying after container issue..."
		fi

		counter=$((counter+1))
	done
	if [ "$START_EXIT" -ne 0 ]; then
		echo "Error from $PROG_NAME"
		exit "$START_EXIT"
	fi
	;;
*)
	# stop: check for tmp dir
	if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
		$DRYRUN cd "$TMPDIR"
	elif [ -z "$FORCE" ]; then
		echo "Error: attempt to stop unknown container $SERVER"
		exit 1
	fi

	auto_stop
	;;
esac