#!/bin/sh
# set -xv
#
# Periodically read SGE log files and create mark files
# for jobs that have finished.
# If log files are not available, scan SGE for finished (absent) jobs
# and create mark files for them.
#
#
# usage: scan_sge_job control_dir ...

# ARC1 passes first the config file.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi

# Resolve the directory containing this script to an absolute path.
basedir=`dirname $0`
basedir=`cd $basedir > /dev/null && pwd` || exit $?

# Directory holding the ARC helper scripts, relative to this script.
pkglibdir="$basedir/../../lib/arc"
pkglibdir=`cd $pkglibdir > /dev/null && pwd` || exit $?

# Assume that gm-kick is installed in the same directory
GMKICK=${basedir}/gm-kick

##############################################################
# Set SGE specific environment.
##############################################################
. "${pkglibdir}/configure-sge-env.sh" 1>&2 || exit $?

##############################################################

# Write a timestamped, script-tagged log message to stderr.
olog () {
  printf '%s\n' "[$(date '+%Y-%m-%d %T')] scan-sge-job: $*" >&2
}


# Create files world-readable regardless of the caller's umask.
umask 022

# Numeric uid of the user running this script; only root (uid 0) is
# allowed to inspect jobs owned by other users (checked further below).
my_id=`id -u`

# Prints the uid of the owner of the file given as argument
# Perl is used because it's more protable than using the stat command
printuid () {
  code='my @s = stat($ARGV[0]); print($s[4] || 0)'
  /usr/bin/perl -we "$code" "$1"
}

#
# Attempts to switch to uid passed as the first argument and then runs the
# commands passed as the second argument in a shell. The remaining arguments
# are passed as arguments to the shell. No warning is given in case switching
# uid is not possible.
#
do_as_uid () {
    test $# -ge 2 || { log "do_as_uid requires 2 arguments"; return 1; }

    script='use English;
            my ($uid, @args) = @ARGV;
            if ( $UID == 0 ) {
                eval { $UID = $uid };
                print STDERR "Cannot switch to uid($UID): $@\n" if $@;
            }
            system("/bin/sh","-c",@args);
            exit ($?>>8||128+($?&127));
    '
    /usr/bin/perl -we "$script" "$@"
}


# At least one control directory must be supplied.
[ -n "$1" ] || exit 1

# Scratch-file directory (honours TMPDIR when set).
TMP_DIR=${TMPDIR:-/tmp}

# The first control_dir is used for storing our own files;
# every argument is a control directory to be scanned.
control_dir=$1
control_dirs=
for arg in "$@" ; do
  control_dirs="${control_dirs} ${arg}"
done

# Append .comment (containing STDOUT & STDERR of the job wrapper) to .errors.
# The append is performed under the job owner's uid, since both files live
# in the user's session/control area.
save_commentfile () {
  owner=$1
  comments=$2
  errors=$3
  append_cmd="
    { echo '---------- Contents of output stream forwarded by SGE -----------'
      cat '$comments' 2> /dev/null
      echo '------------------------- End of output -------------------------'
    } >> '$errors'
  "
  do_as_uid "$owner" "$append_cmd"
}


# GD: no attempt to look for SGE Manager logfiles, restrict to job logs.


# Get the local ids of all jobs currently known to SGE (pending or running).
# NOTE(review): $? after this assignment is the exit status of the sed stage
# of the pipeline, not of qstat itself, so a qstat failure is unlikely to be
# caught by the check below -- confirm before relying on it.
pids=`${SGE_BIN_PATH}/qstat -u '*' 2>/dev/null | sed -n 's/^ *\([0-9][0-9]*\) .*/\1/p'`
if [ $? != 0 ]; then
  olog "Failed running ${SGE_BIN_PATH}/qstat"
  sleep 60
  exit 1
fi

# Go through directories: for each control directory, find jobs that the
# Grid Manager believes are in the LRMS but which have vanished from the
# qstat listing, collect their accounting info and write .lrms_done files.
for ctr_dir in $control_dirs ; do
  # Obtain ids of pending/running jobs stored in job.*.local
  rjobs=`find $ctr_dir -name 'job.*.status' 2>/dev/null | xargs egrep -lv 'DELETED|FINISHED' 2>/dev/null | sed s/status$/local/`
  if [ -z "$rjobs" ] ; then continue ; fi
  ids=`echo $rjobs | xargs grep -h '^localid=' 2>/dev/null | sed 's/^localid=\([^ ]*\)/\1/'`
  if [ -z "$ids" ] ; then continue ; fi
  # compare them to running jobs and find missing
  # NOTE(review): this grep is anchored only at the start, so id 12 is
  # considered still present if any job 12* appears in qstat's output; an
  # exact match would need a trailing anchor ("^$id\$") like the check at
  # the bottom of this script -- confirm before changing.
  bids=
  for id in $ids ; do
    found=`echo "$pids" | grep "^$id"`
    if [ -z "$found" ] ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job corresponding to current local id
    jobfile=`find $ctr_dir -name 'job.*.local' 2>/dev/null | xargs grep -l "localid=$id\$" 2>/dev/null`
    if [ -z "$jobfile" ] ; then continue ; fi
    # unless running as superuser, skip jobs not belonging to the current user
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$jobfile" ] ; then continue ; fi
    fi
    # find who runs this job
    uid=`printuid "${jobfile}"`
    # extract grid id
    gridid=`basename "$jobfile" '.local' | sed 's/^job\.//'`
    # control files derived from the grid id
    gramifile="${ctr_dir}/job.${gridid}.grami"
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    failedfile="${ctr_dir}/job.${gridid}.failed"
    errorsfile="${ctr_dir}/job.${gridid}.errors"
    # skip jobs already marked done by the LRMS
    if [ -f "$donefile" ] ; then continue ; fi
    statusfile="${ctr_dir}/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi
    status=`cat "$statusfile"`
    # only jobs currently in the LRMS (or being cancelled) are of interest
    if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ] ; then continue ; fi
    # get session directory of this job
    session=`grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/'`
    commentfile="${session}.comment"
    # NOTE(review): extraargs, noaccounting and overlimit are not reset at
    # the top of this loop, so values set while handling one job can leak
    # into the next iteration (e.g. extraargs may point qacct at a temp
    # file that has already been removed) -- confirm whether intended.
    if [  -d "$session" ] ; then
      diagfile="${session}.diag"
      # try to obtain the exit code
      # $diagfile is owned by the user running the job. Switch user to access it.
      exitcode=`do_as_uid "$uid" "sed -n 's/^exitcode=//p' '$diagfile'" | tail -n 1`

      diagfile_tmp=`mktemp "$TMP_DIR/diag_tmp.XXXXXX"` || { sleep 60; exit 1; }
      diagfile_acct=`mktemp "$TMP_DIR/diag_acct.XXXXXX"` || { sleep 60; exit 1; }
      noaccounting=

      # qacct can take quite long. Here is a workaround.
      # Find the accounting file, and copy the last 50000
      # records to a temp file.
      acctfile=$SGE_ROOT/$SGE_CELL/common/accounting
      if [ -f  "$acctfile" ]; then
        briefacct=`mktemp "$TMP_DIR/accounting.XXXXXX"` || { sleep 60; exit 1; }
        tail -n 50000 "$acctfile" > "$briefacct"
        if [ $? = 0 ]; then extraargs="-f $briefacct"; fi
      else
        # Accounting file is not accessible on this host.
        noaccounting=1
      fi

      # get accounting info. write diag file
      # The perl filter converts qacct's output into diag-file attributes
      # (node name, times in seconds, memory in kB) plus SGE's raw 'failed'
      # and 'exit_status' fields; it prints nothing unless a jobnumber line
      # was seen, leaving $diagfile_acct empty when there is no record.
      ${SGE_BIN_PATH}/qacct -j $id $extraargs 2> /dev/null \
          | /usr/bin/perl -e 'while(<>){
                         $nodename=$1         if /^hostname\s+(\S+)/;
                         $id=$1               if /^jobnumber\s+(\S+)/;
                         $exitcode=$1         if /^exit_status\s+(\S+)/;
                         $failed=$1           if /^failed\s+(.*\S)/;
                         $CPUTime=$1          if /^cpu\s+(\d+)/;
                         $KernelTime=$1       if /^ru_stime\s+(\d+)/;
                         $WallTime=$1         if /^ru_wallclock\s+(\d+)/;
                         $UsedMemory=$1       if /^maxvmem\s+(\S+)M/;
                         $UsedMemory=$1*1024  if /^maxvmem\s+(\S+)G/;
                       }
                       END {
                         exit unless $id;
                         print "nodename=${nodename}\n";
                         print "CPUTime=${CPUTime}.0s\n";
                         print "WallTime=${WallTime}.0s\n";
                         print "KernelTime=${KernelTime}.0s\n";
                         print "UserTime=".int($CPUTime-$KernelTime).".0s\n";
                         print "AverageTotalMemory=".int($UsedMemory*1024)."kB\n";
                         print "failed=$failed\n";
                         print "\nsgeexitcode=$exitcode\n";
                       }' \
          > "$diagfile_acct"
      # pull the values just written back out for the limit checks below
      used_walltime=`sed -n 's/^WallTime=\(.*\).0s/\1/p' "$diagfile_acct" | tail -n 1`
      used_cputime=`sed -n 's/^CPUTime=\(.*\).0s/\1/p' "$diagfile_acct" | tail -n 1`
      used_memory=`sed -n 's/^AverageTotalMemory=\(.*\)kB/\1/p' "$diagfile_acct" | tail -n 1`
      sgeexitcode=`sed -n 's/^sgeexitcode=//p' "$diagfile_acct" | tail -n 1`
      failedreason=`sed -n 's/^failed=//p' "$diagfile_acct" | tail -n 1`
      failedcode=`echo $failedreason | awk '{print $1}'`

      if [ "x$briefacct" != "x" ]; then rm -f "$briefacct"; fi

      # If the last qacct record is about migration,
      # we should wait for the next qacct record to appear
      # Delete file, like there was no accounting present at all!
      if grep -q "^failed=24 *: migrating"    "$diagfile_acct" \
      || grep -q "^failed=25 *: rescheduling" "$diagfile_acct"; then
          rm -f "$diagfile_acct"
          olog "SGE job $id: the last record in qacct is about migration. Waiting for next record to appear"
      fi

      # Add accounting info to $diagfile
      if [ -s "$diagfile_acct" ]; then

          # Strip from the user's diag file the attributes that the
          # accounting record will supply, then append the record.
          do_as_uid "$uid"  "cat '$diagfile'" \
                        | grep -v "^nodename=" \
                        | grep -v "^WallTime=" \
                        | grep -v "^KernelTime=" \
                        | grep -v "^UserTime=" \
                        | grep -v "^CPUTime=" \
                        | grep -v "^MaxResidentMemory=" \
                        | grep -v "^AverageTotalMemory=" \
          > "$diagfile_tmp"

          cat "$diagfile_tmp" "$diagfile_acct" \
            | grep -v '^sgeexitcode=' \
            | do_as_uid "$uid" "cat > '$diagfile'"

          # Check for exceeded resources limits: flag 'overlimit' when
          # usage reached more than 95% of the corresponding request.
          if [ -s "$gramifile" ]; then
            req_walltime=`sed -n "s/^joboption_walltime=//p" "$gramifile" | tail -n 1`
            req_cputime=`sed -n "s/^joboption_cputime=//p" "$gramifile" | tail -n 1`
            req_memory=`sed -n "s/^joboption_memory=//p" "$gramifile" | tail -n 1`

            # used_memory is kB, req_memory is MB, hence the /1024
            if [ ! -z "$used_memory" ] && [ ! -z "$req_memory" ] \
            && [ "$req_memory" != "" ] && [ "$req_memory" -gt 0 ] \
            && [ $(( 100*$used_memory/1024/$req_memory )) -gt 95 ]; then
              overlimit="memory"
            fi
            if [ ! -z "$used_cputime" ] && [ ! -z "$req_cputime" ] \
            && [ "$req_cputime" != "" ] && [ "$req_cputime" -gt 0 ] \
            && [ $(( 100*$used_cputime/$req_cputime )) -gt 95 ]; then
              overlimit="cputime"
            fi
            if [ ! -z "$used_walltime" ] && [ ! -z "$req_walltime" ] \
            && [ "$req_walltime" != "" ] && [ "$req_walltime" -gt 0 ] \
            && [ $(( 100*$used_walltime/$req_walltime )) -gt 95 ]; then
              overlimit="walltime"
            fi

            # record requested vs. used resources in the job's errors file
            echo ++++++++++++++++++++++++++   >> "$errorsfile"
            echo Resources:                   >> "$errorsfile"
            echo ++++++++++++++++++++++++++   >> "$errorsfile"
            echo req_memory=$req_memory Mb    >> "$errorsfile"
            echo req_cputime=$req_cputime     >> "$errorsfile"
            echo req_walltime=$req_walltime   >> "$errorsfile"
            echo used_memory=$used_memory kB  >> "$errorsfile"
            echo used_cputime=$used_cputime   >> "$errorsfile"
            echo used_walltime=$used_walltime >> "$errorsfile"
            if [ ! -z "$overlimit" ]; then
              echo overlimit=$overlimit       >> "$errorsfile"
            fi
            echo ++++++++++++++++++++++++++   >> "$errorsfile"

          fi # grami file

          save_commentfile "$uid" "$commentfile" "$errorsfile"

          # Decide the final verdict from SGE's 'failed' field first,
          # falling back to the job's own exit code.
          if [ -z "$failedcode" ]; then
            # Should never happen
            olog "SGE job $id failed: SGE accouting record is incomplete"
            echo "-1 SGE accouting record is incomplete" > "$donefile"

          elif [ "$failedcode" = "0" ]; then

            if [ -z "$exitcode" ]; then
              olog "SGE job $id failed with unknown exit code"
              if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi
              echo "$sgeexitcode Job failed with unknown exit code" > "$donefile"

            elif [ "$exitcode" = "0" ]; then
              #olog "SGE job $id finished succesfully"
              echo "0" > "$donefile"

            else
              #olog "SGE job $id failed with exit code $exitcode"
              if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi
              echo "$sgeexitcode Job failed with exit code $exitcode" > "$donefile"
            fi

          else  # SGE reports a problem

            if [ "$failedcode" = "25" ]; then
              failedreason="SGE error $failedcode: Job will be rescheduled"
            elif [ "$failedcode" = "24" ]; then
              failedreason="SGE error $failedcode: Job will be migrated"
            elif [ "$failedcode" = "100" ]; then
	      # This happens when SGE signals the job, as in the case when a
	      # resource limit is exceeded.  We don't know for sure whether
	      # they were enforced or not but if a job is killed by SGE, this
	      # might be the likely cause.
	      if [ -z "$overlimit" ]; then
                failedreason="SGE error $failedreason"
              elif [ $overlimit = "memory" ]; then
                failedreason="job killed: vmem"
              elif [ $overlimit = "cputime" ]; then
                failedreason="job killed: cput"
              elif [ $overlimit = "walltime" ]; then
                failedreason="job killed: wall"
              fi
            else
              failedreason="SGE error $failedreason"
            fi

            olog "SGE job $id failed: $failedreason"
            if [ -z "$sgeexitcode" ] || [ "$sgeexitcode" = 0 ]; then sgeexitcode="-1"; fi
            echo "271 $failedreason" > "$donefile"

          fi # failedcode

          # wake up GM
          $GMKICK "$statusfile" >> "$errorsfile"

          rm -f "$countfile"
          rm -f "$diagfile_tmp" "$diagfile_acct"

          # we're done, go to next job id
          continue

      fi # accounting info ok

      rm -f "$diagfile_tmp" "$diagfile_acct"

    fi # session directory exists

    # This section is only reached when accounting info is not present

    # NOTE(review): noaccounting is only (re)set inside the
    # session-directory branch above; if the session directory is missing,
    # the value left over from a previous iteration is tested here.
    if [ -n "$noaccounting" ]; then

      # Accounting file is not accessible on this host.
      echo "scan-sge-job: WARNING: SGE's accounting file is not accessible on the Grid frontend node" >> "$errorsfile"
      echo "scan-sge-job: WARNING: Resource usage reported for this job may be inaccurate or incomplete" >> "$errorsfile"

      # write the verdict based solely on the diag file's exitcode
      if [ -z "$exitcode" ]; then
        echo "-1 Job failed with unknown exit code" > "$donefile"
      elif [ "$exitcode" = "0" ]; then
        echo "0" > "$donefile"
      else
        echo "$exitcode Job failed with exit code $exitcode" > "$donefile"
      fi

      $GMKICK "$statusfile" >> "$errorsfile"

      # we're done, go to next job id
      continue
    fi

    # There is a certain lag between the end of the job
    # and the time when accouting information becomes available.
    # We do 5 retries, keeping the count in $countfile

    counter=0
    if [ -f "$countfile" ] ; then
      counter=`cat "$countfile"`
      counter=$(( $counter + 1 ))
    fi

    if [ "$counter" -gt 5 ]; then
      # Cannot wait more for accounting info.
      echo "scan-sge-job: WARNING: No SGE accounting record found for this job" >> "$errorsfile"
      echo "scan-sge-job: WARNING: Resource usage reported for this job may be inaccurate or incomplete" >> "$errorsfile"

      save_commentfile "$uid" "$commentfile" "$errorsfile"

      # give up waiting: settle on the diag file's exitcode alone
      if [ -z "$exitcode" ]; then
        olog "No SGE accounting record found for job $id. No exitcode in diag file"
        echo "-1 Job failed with unknown exit code" > "$donefile"
      elif [ "$exitcode" = "0" ]; then
        olog "No SGE accounting record found for job $id. exitcode=$exitcode in diag file"
        echo "0" > "$donefile"
      else 
        olog "No SGE accounting record found for job $id. exitcode=$exitcode in diag file"
        echo "$exitcode Job failed with exit code $exitcode" > "$donefile"
      fi
      rm -f "$countfile"

      # wake up GM
      $GMKICK "$statusfile" >> "$errorsfile"

    else
      # test again for job existence, only count if not known
      ${SGE_BIN_PATH}/qstat -j $id > /dev/null 2>&1
      if [ $? -ne 0 ]; then
        echo "$counter" > "$countfile"
      else
        olog "SGE job $id disappeared and then reappeared!"
        rm -f "$countfile"
      fi
    fi
  done # loop over bids

  # Detect the unlikely situation when a job reappears in the qstat listing
  # after being absent in the previous run of the scan-*-job (which updated
  # the counter file)
  for countfile in `find $ctr_dir -name 'job.*.lrms_job'`; do
    localfile=${countfile%.lrms_job}.local
    pid=`sed -n 's/^localid=\([^ ]*\)/\1/p' "$localfile" 2>/dev/null`
    if [ -z "$pid" ]; then continue; fi
    if echo "$pids" | grep "^$pid\$" >/dev/null; then
      # NOTE(review): $id here is stale (left over from the bids loop);
      # the reappeared job's local id is $pid -- confirm and fix message.
      olog "SGE job $id disappeared and then reappeared!"
      rm -f "$countfile"
    fi
  done

done # loop over control_dirs
# Throttle: pause before the caller re-invokes this scanner.
sleep 60
exit 0

