File indexing completed on 2024-04-06 12:15:47
0001
0002
0003
# Default settings; most can be overridden by the command-line options
# described in usage().
USEDOCKER=""                       # -d: non-empty selects Docker instead of Apptainer
GPU=""                             # -g: non-empty selects GPU instead of CPU
VERBOSE=""                         # -v: set to VERBOSE_ARGS when verbosity is enabled
VERBOSE_ARGS="--log-verbose=1 --log-error=1 --log-warning=1 --log-info=1"
WTIME=600                          # -w: maximum time (seconds) to wait for server start
SERVER=triton_server_instance      # -n: container instance name
RETRIES=3                          # -r: number of retries when starting the container
REPOS=()                           # -M: model repositories (repeatable)
MODELS=()                          # -m: specific model directories (repeatable)
FORCE=""                           # -f: force reuse of a possibly existing instance
CLEANUP=true                       # -c clears this to keep the temporary dir
TMPDIR=""                          # -t: non-default hidden temporary dir
INSTANCES=0                        # -I: number of model instances (0: no config editing)
LOCALMODELREPO=""                  # assigned in list_models
DRYRUN=""                          # -D: set to "echo" to print container commands
PARENTPID=""                       # -p: PID whose exit triggers automatic shutdown
BASEPORT=8000                      # -P: base port number for services
AUTOPORT=""                        # -P -1: non-empty enables automatic port search
NPORTS=3                           # step used when searching for a free port range
IMAGE=fastml/triton-torchgeo:22.07-py3-geometric   # -i: server image name
SANDBOX=""                         # -s: Apptainer sandbox directory
COMPAT_USR=""                      # -C: directory with Nvidia compatibility drivers
EXTRAPATH=/cvmfs/oasis.opensciencegrid.org/mis/apptainer/current/bin   # -E: extra executable path(s)
0027
# Print the Apptainer sandbox directory on stdout.
# Globals: SANDBOX (read), IMAGE (read)
# Outputs: SANDBOX when it is non-empty; otherwise the path under
#          /cvmfs/unpacked.cern.ch built from IMAGE.
get_sandbox() {
  if [ -n "$SANDBOX" ]; then
    printf '%s\n' "$SANDBOX"
  else
    # quoted so an unusual image name is printed verbatim (no splitting/globbing)
    printf '%s\n' "/cvmfs/unpacked.cern.ch/registry.hub.docker.com/${IMAGE}"
  fi
}
0035
# Print the help text on stdout and exit.
# Arguments: $1 - exit status (when omitted the script exits with 0)
# Globals:   EXTRAPATH, IMAGE, INSTANCES, SERVER, BASEPORT, RETRIES, WTIME (read)
# Note: "echo -e" is kept so the literal \t sequences in the text become tabs.
usage() {
  echo -e "cmsTriton [options] [start|stop|check]"
  echo -e
  echo -e "Options:"
  echo -e "-c \t don't cleanup temporary dir (for debugging)"
  echo -e "-C [dir] \t directory containing Nvidia compatibility drivers (checks CMSSW_BASE by default if available)"
  echo -e "-D \t dry run: print container commands rather than executing them"
  echo -e "-d \t use Docker instead of Apptainer"
  echo -e "-E [path] \t include extra path(s) for executables (default: ${EXTRAPATH})"
  echo -e "-f \t force reuse of (possibly) existing container instance"
  echo -e "-g \t use GPU instead of CPU"
  echo -e "-i [name] \t server image name (default: ${IMAGE})"
  echo -e "-I [num] \t number of model instances (default: ${INSTANCES} -> means no local editing of config files)"
  echo -e "-M [dir] \t model repository (can be given more than once)"
  echo -e "-m [dir] \t specific model directory (can be given more than once)"
  echo -e "-n [name] \t name of container instance, also used for default hidden temporary dir (default: ${SERVER})"
  echo -e "-P [port] \t base port number for services (-1: automatically find an unused port range) (default: ${BASEPORT})"
  echo -e "-p [pid] \t automatically shut down server when process w/ specified PID ends (-1: use parent process PID)"
  echo -e "-r [num] \t number of retries when starting container (default: ${RETRIES})"
  echo -e "-s [dir] \t Apptainer sandbox directory (default: $(get_sandbox))"
  echo -e "-t [dir] \t non-default hidden temporary dir"
  echo -e "-v \t (verbose) start: activate server debugging info; stop: keep server logs"
  echo -e "-w [time] \t maximum time to wait for server to start (default: ${WTIME} seconds)"
  echo -e "-h \t print this message and exit"
  echo -e
  echo -e "Operations:"
  echo -e "start \t start server"
  echo -e "stop \t stop server"
  echo -e "check \t check if server can run on this system"
  # a missing argument previously fell through to the status of the last echo (0)
  exit "${1:-0}"
}
0068
0069
# Host shared-memory directory bound into the container as /run/shm:
# /run/shm when it exists on this host, /dev/shm otherwise.
SHM=/dev/shm
if [ -e /run/shm ]; then
  SHM=/run/shm
fi
0074
# Parse command-line options (see usage() for their meaning).
# -E is accepted here because usage() documents it; unknown options or
# missing option arguments print the usage text and exit with status 1.
while getopts "cC:DdE:fgi:I:M:m:n:P:p:r:s:t:vw:h" opt; do
  case "$opt" in
    c) CLEANUP="" ;;
    C) COMPAT_USR="$OPTARG" ;;
    D) DRYRUN=echo ;;
    d) USEDOCKER=true ;;
    E) EXTRAPATH="$OPTARG" ;;
    f) FORCE=true ;;
    g) GPU=true ;;
    i) IMAGE="$OPTARG" ;;
    I) INSTANCES="$OPTARG" ;;
    M) REPOS+=("$OPTARG") ;;
    m) MODELS+=("$OPTARG") ;;
    n) SERVER="$OPTARG" ;;
    P)
      # -1 requests an automatic search for a free port range
      # (string comparison: no error message for non-numeric input)
      if [ "$OPTARG" = "-1" ]; then AUTOPORT=true; else BASEPORT="$OPTARG"; fi
      ;;
    p)
      # -1 means: watch the process that launched this script
      if [ "$OPTARG" = "-1" ]; then PARENTPID="$PPID"; else PARENTPID="$OPTARG"; fi
      ;;
    r) RETRIES="$OPTARG" ;;
    s) SANDBOX="$OPTARG" ;;
    t) TMPDIR="$OPTARG" ;;
    v) VERBOSE="$VERBOSE_ARGS" ;;
    w) WTIME="$OPTARG" ;;
    h) usage 0 ;;
    *) usage 1 ;;
  esac
done
0117
# Drop the parsed options; the first remaining argument is the operation.
shift $((OPTIND - 1))
OP=$1

case "$OP" in
  start | stop | check) ;;
  *) usage 1 ;;
esac

# At least one start attempt is always made. A non-numeric value is also
# reset: it would otherwise make the start loop run zero times.
if ! [[ "$RETRIES" =~ ^[0-9]+$ ]] || [ "$RETRIES" -le 0 ]; then
  RETRIES=1
fi

# Working directory for the instance: hidden dir named after the server
# unless -t gave an explicit location (resolved to an absolute path).
TOPDIR=$PWD
if [ -z "$TMPDIR" ]; then
  TMPDIR="${TOPDIR}/.${SERVER}"
else
  TMPDIR=$(readlink -f "$TMPDIR")
fi
0135
0136
# Make executables from the extra path(s) take precedence.
if [ -n "$EXTRAPATH" ]; then
  export PATH="${EXTRAPATH}:${PATH}"
fi

# Choose the container command. DOCKER / APPTAINER values that are already
# set (e.g. inherited from the environment) are respected.
if [ -n "$USEDOCKER" ]; then
  if [ -z "$DOCKER" ]; then
    DOCKER="sudo docker"
  fi
elif [ -z "$APPTAINER" ]; then
  # prefer apptainer, fall back to singularity
  if command -v apptainer > /dev/null 2>&1; then
    APPTAINER=apptainer
  elif command -v singularity > /dev/null 2>&1; then
    APPTAINER=singularity
  else
    echo "Missing apptainer and singularity"
    exit 1
  fi
fi
0158
# Settings derived from the parsed options.
SANDBOX=$(get_sandbox)
SANDBOX=$(readlink -f "${SANDBOX}")
LOG="log_${SERVER}.log"
STOPLOG="log_stop_${SERVER}.log"
LIB=lib
# log line that wait_server treats as "server is up"
STARTED_INDICATOR="Started GRPCInferenceService"
# log line that wait_server treats as a port conflict
SEGFAULT_INDICATOR="Address already in use"
EXTRA=""
COMPAT_SCRIPT=/etc/shinit_v2

# Model configs are only edited locally (list_models) for CPU running with
# an explicit instance count.
THREADCONTROL=""
if [ "$INSTANCES" -gt 0 ] && [ -z "$GPU" ]; then
  THREADCONTROL=true
fi
0174
# Derive the three service ports from BASEPORT and export them.
# Globals: BASEPORT (read); HTTPPORT, GRPCPORT, METRPORT (exported)
compute_ports() {
  export HTTPPORT=$BASEPORT
  export GRPCPORT=$((BASEPORT + 1))
  export METRPORT=$((BASEPORT + 2))
}
0181
# Test whether a TCP port is free on this host.
# Arguments: $1 - port number
# Returns:   0 when nothing accepted a connection (port considered free),
#            1 when the connection succeeded (port in use)
check_port() {
  # a redirection to /dev/tcp/<host>/<port> makes bash attempt a TCP
  # connection; it only succeeds when something is listening
  if 2>/dev/null >"/dev/tcp/0.0.0.0/$1"; then
    return 1
  fi
  return 0
}
0190
# Returns 0 only when all three service ports are free, 1 otherwise.
# Globals: HTTPPORT, GRPCPORT, METRPORT (read)
check_ports() {
  check_port "$HTTPPORT" && check_port "$GRPCPORT" && check_port "$METRPORT"
}
0198
# Advance BASEPORT in steps of NPORTS until check_ports reports a free
# range, then announce the chosen gRPC port on stdout.
# Globals: BASEPORT (updated), NPORTS (read), GRPCPORT (read)
# Returns: 0 on success, 1 when the search runs past the valid port range
find_ports() {
  while ! check_ports; do
    BASEPORT=$((BASEPORT + NPORTS))
    # ports above 65535 cannot be opened, so check_port would report them
    # as free; stop instead of handing out an unusable range
    if [ "$((BASEPORT + NPORTS - 1))" -gt 65535 ]; then
      echo "Error: no free port range found" >&2
      return 1
    fi
    compute_ports
  done
  echo "CMS_TRITON_GRPC_PORT: $GRPCPORT"
}
0206
# Settle the service ports and export the matching server arguments.
# With AUTOPORT set, a free range is searched; otherwise the requested
# range must be free.
# Globals: AUTOPORT, HTTPPORT, GRPCPORT, METRPORT (read); PORTARGS (exported)
# Returns: 0 on success, 1 when no usable ports are available
handle_ports() {
  compute_ports
  if [ -n "$AUTOPORT" ]; then
    find_ports || return 1
  elif ! check_ports; then
    echo "Error: requested port in use"
    return 1
  fi
  export PORTARGS="--http-port=${HTTPPORT} --grpc-port=${GRPCPORT} --metrics-port=${METRPORT}"
}
0218
# Start the server in a detached Docker container.
# Globals: REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, SERVER, IMAGE, HTTPPORT,
#          GRPCPORT, METRPORT, EXTRA, PORTARGS, VERBOSE, DOCKER, DRYRUN (read)
# Returns: exit status of the docker command (or of echo in dry-run mode)
start_docker() {
  local repo
  # arrays keep repository paths containing whitespace intact
  local -a mount_args=() repo_args=()

  # mount every model repository at the same path inside the container
  for repo in "${REPOS[@]}"; do
    mount_args+=("-v${repo}:${repo}")
    repo_args+=("--model-repository=${repo}")
  done

  # compatibility drivers (and the patched init script, when prepared)
  if [ -n "$COMPAT" ]; then
    mount_args+=("-v${COMPAT}")
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      mount_args+=("-v${COMPAT_SCRIPT_MOUNT}")
    fi
  fi

  # DRYRUN, DOCKER, EXTRA, PORTARGS and VERBOSE hold several words on
  # purpose and are therefore left unquoted
  $DRYRUN $DOCKER run -d --name "${SERVER}" \
    --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 --ipc="host" --cap-add=IPC_OWNER \
    -p"${HTTPPORT}:${HTTPPORT}" -p"${GRPCPORT}:${GRPCPORT}" -p"${METRPORT}:${METRPORT}" $EXTRA "${mount_args[@]}" \
    "${IMAGE}" tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE
}
0241
# Start the server as an Apptainer instance and launch tritonserver in it.
# Globals: LIB, SANDBOX, SHM, REPOS, COMPAT, COMPAT_SCRIPT_MOUNT, SERVER,
#          EXTRA, PORTARGS, VERBOSE, LOG, APPTAINER, DRYRUN (read);
#          START_EXIT, REDIR, TMPD, PATH (written)
# Returns: non-zero status of "instance start" when it fails, otherwise 0
#          (the server itself runs in the background, logging to LOG)
start_apptainer() {
  local repo
  # arrays keep repository paths containing whitespace intact
  local -a mount_args=() repo_args=()

  # local directory of symlinks to the server libraries in the sandbox;
  # it is bound over /opt/tritonserver/lib in the container below
  $DRYRUN mkdir "${LIB}"
  $DRYRUN ln -s "${SANDBOX}"/opt/tritonserver/lib/* "${LIB}/"

  for repo in "${REPOS[@]}"; do
    mount_args+=(-B "$repo")
    repo_args+=("--model-repository=${repo}")
  done

  # compatibility drivers (and the patched init script, when prepared)
  if [ -n "$COMPAT" ]; then
    mount_args+=(-B "$COMPAT")
    if [ -n "$COMPAT_SCRIPT_MOUNT" ]; then
      mount_args+=(-B "$COMPAT_SCRIPT_MOUNT")
    fi
  fi

  # When /.singularity.d/libs exists (presumably this script is itself
  # running inside a container - confirm), put an ldconfig wrapper first in
  # PATH that keeps its cache in a private temp dir, and register those libs.
  if [ -d /.singularity.d/libs ]; then
    TMPD=$(mktemp -d)
    (echo '#!/bin/bash'; echo 'exec /usr/sbin/ldconfig -C '"$TMPD"'/ld.so.cache "$@"') > "$TMPD/ldconfig"
    chmod +x "$TMPD/ldconfig"
    PATH=$TMPD:$PATH
    ldconfig /.singularity.d/libs
  fi

  # DRYRUN, APPTAINER, EXTRA, PORTARGS and VERBOSE hold several words on
  # purpose and are therefore left unquoted
  $DRYRUN $APPTAINER instance start \
    -B "${SHM}:/run/shm" -B "${LIB}:/opt/tritonserver/lib" -B "${SANDBOX}" "${mount_args[@]}" $EXTRA \
    "${SANDBOX}" "${SERVER}"

  START_EXIT=$?
  if [ "$START_EXIT" -ne 0 ]; then
    rm -rf "${LIB}"
    return "$START_EXIT"
  fi

  # server output goes to the log file, or to stdout in dry-run mode
  if [ -z "$DRYRUN" ]; then
    REDIR="$LOG"
  else
    REDIR=/dev/stdout
  fi
  $DRYRUN $APPTAINER run "instance://${SERVER}" \
    tritonserver $PORTARGS "${repo_args[@]}" $VERBOSE >& "${REDIR}" &
  # in dry-run mode wait for the background echo so output stays ordered
  [ -z "$DRYRUN" ] || wait
}
0299
# Stop and remove the Docker container.
# When verbose (and not a dry run), the container logs are saved to LOG
# first, since they are gone once the container is removed.
# Globals: DRYRUN, VERBOSE, DOCKER, SERVER, LOG (read)
stop_docker() {
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    $DOCKER logs "${SERVER}" >& "$LOG"
  fi

  $DRYRUN $DOCKER stop "${SERVER}"
  $DRYRUN $DOCKER rm "${SERVER}"
}
0309
# Stop the Apptainer instance named SERVER.
# Globals: DRYRUN, APPTAINER, SERVER (read)
stop_apptainer() {
  $DRYRUN $APPTAINER instance stop "${SERVER}"
}
0313
# Search the container logs (stdout and stderr) for a pattern.
# Arguments: $1 - grep pattern
# Returns:   grep's status: 0 when the pattern was found
test_docker() {
  ${DOCKER} logs "${SERVER}" |& grep -- "$1"
}
0318
# Search the server log file for a pattern.
# Arguments: $1 - grep pattern
# Globals:   LOG (read)
# Returns:   grep's status: 0 when the pattern was found
test_apptainer() {
  grep -- "$1" "$LOG"
}
0322
# Poll once per second until the server reports that it has started.
# Globals: DRYRUN, TEST_FN, STARTED_INDICATOR, SEGFAULT_INDICATOR, WTIME (read)
# Returns: 0 when the server is ready (or immediately in dry-run mode);
#          1 when a port conflict was seen and handle_ports succeeded, so
#            the caller can retry.
# Exits:   with handle_ports' status when it fails, or with 1 once more
#          than WTIME polls have passed (the server is stopped first).
wait_server() {
  if [ -n "$DRYRUN" ]; then
    return 0
  fi

  local count=0 port_exit
  while ! $TEST_FN "$STARTED_INDICATOR" >& /dev/null; do
    if $TEST_FN "$SEGFAULT_INDICATOR" >& /dev/null; then
      # port conflict: re-run the port handling before the caller retries
      handle_ports
      port_exit=$?
      if [ "$port_exit" -ne 0 ]; then
        auto_stop true
        exit "$port_exit"
      fi
      return 1
    elif [ "$count" -gt "$WTIME" ]; then
      echo "timed out waiting for server to start"
      auto_stop true
      exit 1
    else
      count=$((count + 1))
      sleep 1
    fi
  done

  echo "server is ready!"
}
0352
# Turn the requested model directories (MODELS) into model repositories.
# With THREADCONTROL set, every model config is copied into a local
# repository under TMPDIR by cmsTritonConfigTool with INSTANCES threads;
# otherwise the parent directory of each model is added to REPOS, which is
# then resolved to absolute paths and de-duplicated.
# Globals: THREADCONTROL, TMPDIR, MODELS, INSTANCES, DRYRUN (read);
#          LOCALMODELREPO, REPOS (written)
# Exits:   with the failing command's status when the local repository
#          cannot be created or the config tool fails.
list_models() {
  local model status r

  LOCALMODELREPO="local_model_repo"
  if [ -n "$THREADCONTROL" ]; then
    # start from an empty local repository
    if [ -d "$TMPDIR/$LOCALMODELREPO" ]; then
      rm -rf "$TMPDIR/$LOCALMODELREPO"
    fi
    $DRYRUN mkdir "$TMPDIR/$LOCALMODELREPO"
    status=$?
    if [ "$status" -ne 0 ]; then
      echo "Could not create local_model_repo dir: $TMPDIR/$LOCALMODELREPO"
      exit "$status"
    fi
  fi

  for model in "${MODELS[@]}"; do
    # a file (e.g. the config itself) stands for its containing directory
    if [ -f "$model" ]; then
      model="$(dirname "$model")"
    fi
    if [ -n "$THREADCONTROL" ]; then
      $DRYRUN cmsTritonConfigTool threadcontrol -c "${model}/config.pbtxt" --copy "$TMPDIR/$LOCALMODELREPO" --nThreads "$INSTANCES"
      status=$?
      if [ "$status" -ne 0 ]; then
        echo "Could not apply threadcontrol to $model"
        exit "$status"
      fi
    else
      REPOS+=("$(dirname "$model")")
    fi
  done

  if [ -n "$THREADCONTROL" ]; then
    # as before, only the first repository entry is replaced
    REPOS[0]="$TMPDIR/$LOCALMODELREPO"
  else
    for ((r = 0; r < ${#REPOS[@]}; r++)); do
      REPOS[$r]=$(readlink -f "${REPOS[$r]}")
    done
    # de-duplicate line by line so paths with spaces survive; empty
    # entries (unresolvable paths) are dropped
    mapfile -t REPOS < <(printf '%s\n' "${REPOS[@]}" | sort -u | grep -v '^$')
  fi
}
0395
# Stop the server, optionally after a watched process has gone away.
# Arguments: $1 - non-empty: enable verbose mode from here on
#            $2 - PID to watch; when given, the server is only stopped
#                 after that process has been missing for several
#                 consecutive one-second polls
# Globals:   VERBOSE (may be set), VERBOSE_ARGS, STOP_FN, DRYRUN, LOG,
#            STOPLOG, TOPDIR, CLEANUP, TMPDIR (read)
auto_stop() {
  # verbosity can be switched on here even if it was off at top level
  if [ -n "$1" ]; then VERBOSE="$VERBOSE_ARGS"; fi
  # kept local: assigning the global PARENTPID here erased the user's -p
  # value whenever auto_stop was called without a PID (e.g. with -f)
  local watch_pid="$2"
  local misses=0
  local -r max_misses=5

  if [ -n "$watch_pid" ]; then
    if [ -n "$VERBOSE" ]; then
      echo "watching PID $watch_pid"
      ps
    fi

    # leave the loop only after more than max_misses consecutive polls
    # in which the process could not be signalled
    while [ "$misses" -le "$max_misses" ]; do
      if ! kill -0 "$watch_pid" >& /dev/null; then
        misses=$((misses + 1))
        if [ -n "$VERBOSE" ]; then
          echo "trigger $misses:"
          ps
        fi
      else
        # process is (again) visible: restart the count
        if [ "$misses" -gt 0 ] && [ -n "$VERBOSE" ]; then
          echo "reset:"
          ps
        fi
        misses=0
      fi
      sleep 1
    done
  fi
  $STOP_FN

  # in verbose mode keep the logs by moving them next to the caller
  if [ -z "$DRYRUN" ] && [ -n "$VERBOSE" ]; then
    mv "$LOG" "$TOPDIR"
    # only keep a non-empty stop log
    if [ -s "$STOPLOG" ]; then
      mv "$STOPLOG" "$TOPDIR"
    fi
  fi

  if [ -n "$CLEANUP" ]; then
    $DRYRUN cd "$TOPDIR"
    $DRYRUN rm -rf "$TMPDIR"
  fi
}
0444
# Create the temporary working directory and change into it.
# Globals: TMPDIR, DRYRUN (read)
# Exits:   with mkdir's status when the directory cannot be created (for
#          example because it already exists), or with 1 when it cannot
#          be entered.
make_tmp() {
  local status

  $DRYRUN mkdir "$TMPDIR"
  status=$?
  if [ "$status" -ne 0 ]; then
    echo "Could not create temp dir: $TMPDIR"
    exit "$status"
  fi
  # everything afterwards uses relative paths, so a failed cd must not
  # go unnoticed
  if ! $DRYRUN cd "$TMPDIR"; then
    echo "Could not enter temp dir: $TMPDIR"
    exit 1
  fi
}
0455
# Print the value of a scram tool tag, looked up from within CMSSW_BASE.
# Arguments: $1 - tool name, $2 - tag name
# Outputs:   whatever "scram tool tag" prints; its errors are discarded
# Returns:   always 0 (a failing lookup just prints nothing)
# Note: changes the working directory; check_drivers calls it inside a
#       command substitution, which keeps that change contained.
scram_tag() {
  local tool="$1"
  local tag="$2"
  cd "$CMSSW_BASE"
  scram tool tag "$tool" "$tag" 2> /dev/null || true
}
0462
# Print the CUDA driver version recorded in the image's environment.
# Only the value is printed (not the "CUDA_DRIVER_VERSION=" prefix), which
# matches driver_apptainer; nothing is printed when the variable is absent.
# Globals: DOCKER, IMAGE (read)
driver_docker() {
  $DOCKER run --rm --entrypoint env "${IMAGE}" | sed -n 's/^CUDA_DRIVER_VERSION=//p'
}
0466
# Print the CUDA driver version defined by the sandbox's
# docker2singularity environment script; prints nothing when that script
# does not exist.
# Globals: SANDBOX (read)
# Note: sources the script into the current shell; check_drivers calls it
#       inside a command substitution, which keeps that contained.
driver_apptainer() {
  local env_script="${SANDBOX}/.singularity.d/env/10-docker2singularity.sh"
  if [ -f "$env_script" ]; then
    source "$env_script" && echo "$CUDA_DRIVER_VERSION"
  fi
}
0473
# Copy the image's compatibility init script (COMPAT_SCRIPT) into the
# current directory.
# Globals: DOCKER, IMAGE, COMPAT_SCRIPT (read)
# Returns: status of "docker cp" (or of "docker create" when that fails)
compat_docker() {
  local cid status

  # a container is needed to copy from; it is never started, so it has to
  # be removed explicitly afterwards (--rm only acts when a container exits)
  cid=$($DOCKER create "${IMAGE}") || return
  $DOCKER cp "${cid}:${COMPAT_SCRIPT}" .
  status=$?
  $DOCKER rm "$cid" > /dev/null
  return "$status"
}
0477
# Copy the sandbox's compatibility init script (COMPAT_SCRIPT) into the
# current directory.
# Globals: SANDBOX, COMPAT_SCRIPT (read)
# Returns: status of cp
compat_apptainer() {
  cp "${SANDBOX}/${COMPAT_SCRIPT}" .
}
0481
# Decide whether the host Nvidia driver is recent enough for the image and,
# if not, locate compatibility drivers to mount into the container.
# Globals: DRIVER_FN, COMPAT_USR, CMSSW_BASE, COMPAT_FN, COMPAT_SCRIPT,
#          TMPDIR, DRYRUN (read); CUDA_DRIVER_VERSION, DRIVER_VERSION,
#          COMPAT (exported), COMPAT_SCRIPT_MOUNT and helpers (written)
# Returns: 0 when the host driver suffices or compatibility drivers were
#          found, 1 when neither is the case
check_drivers() {
  # driver version the image was built for
  CUDA_DRIVER_VERSION=$($DRIVER_FN)
  # driver version of the host kernel module (empty without Nvidia driver)
  DRIVER_VERSION=$(sed -n 's/^NVRM.*Kernel Module *\([0-9.]*\).*$/\1/p' /proc/driver/nvidia/version 2>/dev/null || true)
  # compare major versions only; empty values count as 0 inside [[ -ge ]]
  if [[ "${DRIVER_VERSION%%.*}" -ge "${CUDA_DRIVER_VERSION%%.*}" ]]; then
    return 0
  fi

  export COMPAT=""

  # 1) directory given by the user
  if [ -n "$COMPAT_USR" ] && [ -d "$COMPAT_USR" ]; then
    COMPAT="$COMPAT_USR"
  fi

  # 2) drivers shipped with the CUDA tool of the CMSSW release
  if [ -z "$COMPAT" ] && [ -n "$CMSSW_BASE" ]; then
    CUDA_BASE=$(scram_tag cuda CUDA_BASE)
    COMPAT_CMSSW=${CUDA_BASE}/drivers
    if [ -n "$CUDA_BASE" ] && [ -d "$COMPAT_CMSSW" ]; then
      COMPAT="$COMPAT_CMSSW"
    fi
  fi

  # 3) system-wide location
  COMPAT_SYS=/usr/local/cuda/compat
  if [ -z "$COMPAT" ] && [ -d "$COMPAT_SYS" ]; then
    COMPAT="$COMPAT_SYS"
  fi

  if [ -z "$COMPAT" ]; then
    return 1
  fi

  # turn the directory into a "host:container" mount specification
  export COMPAT="${COMPAT}:${COMPAT_SYS}/lib.real"

  # the init script is only copied and patched when running inside the
  # temporary directory (i.e. not for a bare "check")
  if [ "$PWD" == "$TMPDIR" ]; then
    $DRYRUN $COMPAT_FN
    COMPAT_SCRIPT_BASE=$(basename "$COMPAT_SCRIPT")
    # append LD_PRELOAD="" to the library-path assignment, once
    if ! grep -Fq 'LD_PRELOAD=""' "$COMPAT_SCRIPT_BASE"; then
      $DRYRUN sed -i 's/LD_LIBRARY_PATH="${_CUDA_COMPAT_REALLIB}"/&; LD_PRELOAD=""/' "$COMPAT_SCRIPT_BASE"
      COMPAT_SCRIPT_PATH=$(readlink -f "$TMPDIR/$COMPAT_SCRIPT_BASE")
      COMPAT_SCRIPT_MOUNT="$COMPAT_SCRIPT_PATH:$COMPAT_SCRIPT"
    fi
  fi
  return 0
}
0535
# Bind the runtime-specific helper functions and the GPU flag, so the rest
# of the script can stay independent of Docker vs. Apptainer.
if [ -n "$USEDOCKER" ]; then
  PROG_NAME=Docker
  START_FN=start_docker
  TEST_FN=test_docker
  STOP_FN=stop_docker
  DRIVER_FN=driver_docker
  COMPAT_FN=compat_docker
  if [ -n "$GPU" ]; then
    EXTRA="--gpus all"
  fi
else
  PROG_NAME=Apptainer
  START_FN=start_apptainer
  TEST_FN=test_apptainer
  STOP_FN=stop_apptainer
  DRIVER_FN=driver_apptainer
  COMPAT_FN=compat_apptainer
  if [ -n "$GPU" ]; then
    EXTRA="--nv"
  fi
fi
0557
# Run the requested operation.
case "$OP" in
  check)
    # exit status tells whether the server can run on this system
    check_drivers
    exit $?
    ;;
  start)
    # with -f any existing instance is stopped first; without it an
    # existing temporary dir is taken as a sign of a running instance
    if [ -n "$FORCE" ]; then
      auto_stop
    elif [ -d "$TMPDIR" ]; then
      echo "Error: this container may already exist (override with -f)"
      exit 1
    fi

    handle_ports
    PORT_EXIT=$?
    if [ "$PORT_EXIT" -ne 0 ]; then exit "$PORT_EXIT"; fi

    # background watcher: stops the server once the given process is
    # gone; its output goes to the stop log
    if [ -n "$PARENTPID" ]; then
      auto_stop "" "$PARENTPID" >& "$STOPLOG" &
    fi

    START_EXIT=0
    for ((counter=0; counter < ${RETRIES}; counter++)); do
      make_tmp

      # the model list is built once, except with thread control, where
      # the local repository lives in the temp dir recreated per attempt
      if [ "$counter" -eq 0 ] || [ -n "$THREADCONTROL" ]; then list_models; fi

      if [ -n "$GPU" ]; then
        check_drivers
        DRIVER_EXIT=$?
        if [ "$DRIVER_EXIT" -ne 0 ]; then exit "$DRIVER_EXIT"; fi
      fi

      $START_FN
      START_EXIT=$?
      if [ "$START_EXIT" -eq 0 ]; then
        wait_server
        WAIT_EXIT=$?
        if [ "$WAIT_EXIT" -eq 0 ]; then
          break
        else
          # a failed wait (port conflict) does not use up a retry
          counter=$((counter-1))
          START_EXIT="$WAIT_EXIT"
        fi
      fi

      if [ "$START_EXIT" -ne 0 ]; then
        auto_stop
        echo "Retrying after container issue..."
      fi
    done
    if [ "$START_EXIT" -ne 0 ]; then
      echo "Error from $PROG_NAME"
      exit "$START_EXIT"
    fi
    ;;
  *)
    # stop: work from the instance's temp dir when it exists; an unknown
    # instance is an error unless -f was given
    if [ -d "$TMPDIR" ] || [ -n "$DRYRUN" ]; then
      $DRYRUN cd "$TMPDIR"
    elif [ -z "$FORCE" ]; then
      echo "Error: attempt to stop unknown container $SERVER"
      exit 1
    fi

    auto_stop
    ;;
esac