#!/bin/bash
#
#   Periodically read the PBS server log files and write mark files
# for jobs that have finished.
#   If the log files are not available, scan PBS for finished (absent)
# jobs instead and write mark files for those jobs.
#
# usage: scan_pbs_job control_dir ...

# ARC1 passes first the config file.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi

# At least one control directory is required.
if [ -z "$1" ] ; then exit 1 ; fi

# Absolute directory of this script.  The expansions are quoted so that an
# installation path containing whitespace or glob characters still resolves.
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?

pkglibdir="$basedir/../../lib/arc"
pkglibdir=$(cd "$pkglibdir" > /dev/null && pwd) || exit $?

# Sourced for its settings; the CONFIG_* and PBS_BIN_PATH variables used in
# this script are presumably defined there (not visible in this file).
. "${pkglibdir}/configure-pbs-env.sh" || exit $?

# Assume that gm-kick is installed in the same directory
GMKICK=${basedir}/gm-kick

# Where to store temporary files
TMP_DIR=${TMPDIR:-/tmp}

# directory containing PBS server logs
pbs_log_dir=${CONFIG_pbs_log_path:-/var/spool/pbs/server_logs}

# Default is NFS (shared file system): unset or empty means 'yes'.
RUNTIME_NODE_SEES_FRONTEND=${CONFIG_shared_filesystem:-yes}
# locally empty means no
if [ "${RUNTIME_NODE_SEES_FRONTEND}" = 'no' ] ; then
  RUNTIME_NODE_SEES_FRONTEND=
fi

# first control_dir is used for storing own files
control_dir=$1
# All control directories are kept as one string that is later re-split with
#   eval "set -- $control_dirs"
# Each entry is escaped with printf %q so that any character in a directory
# name (quotes, '$', backticks, blanks) survives that eval literally instead
# of being interpreted by the shell.
control_dirs=
while [ $# -gt 0 ] ; do
  control_dirs="${control_dirs} $(printf '%q' "$1")"
  shift
done

my_id=$(id -u)

state_file=$control_dir/pbs_log_scan.$(id -un)

# The state file holds one line "<log name> <number of matching lines seen>"
# as written by process_log_file.  Anything missing or non-numeric counts as
# 0; values are read as decimal even when they carry leading zeros.
ldt=
lines=
lines_skip=0
ldate=0
if [ -f "$state_file" ] && [ -r "$state_file" ] ; then
  read -r ldt lines _ < "$state_file" 2>/dev/null
  case "$ldt" in
    ''|*[!0-9]*) ;;
    *) ldate=$(( 10#$ldt )) ;;
  esac
  case "$lines" in
    ''|*[!0-9]*) ;;
    *) lines_skip=$(( 10#$lines )) ;;
  esac
fi

# Prints the owner (login name) of the file given as argument.
# Perl is used because it's more portable than using the stat command.
# The one-liner stats the path and maps the owner uid through getpwuid:
#   - stat fails: perl dies with "<path>: <error>" on stderr
#   - uid has no passwd entry: nothing is printed, exit status 1
printowner () {
  set -- "$1" '$f=$ARGV[0];if(@s=stat $f){@p=getpwuid $s[4];if(@p){print $p[0]}else{exit 1}}else{die "$f: $!\n"}'
  /usr/bin/perl -w -e "$2" "$1"
}

# Append .comment (containing STDOUT & STDERR of the job wrapper) to .errors
# This file can also contain a message from PBS (i.e. the reason for killing the job).
#
# Arguments:
#   $1 - user to run as through su; empty means run as the current user
#   $2 - path of the comment file to copy
#   $3 - path of the errors file that is appended to
# A missing or unreadable comment file is not an error: only the two marker
# lines are appended then.  The status is that of the append itself.
save_commentfile () {
  local username=$1
  local commentfile=$2
  local errorsfile=$3
  local header='---------- Contents of output stream forwarded by PBS -----------'
  local footer='------------------------- End of output -------------------------'
  local quoted_comment quoted_errors

  if [ -z "$username" ] ; then
    # Run directly with quoted variables; no eval, so any character in the
    # paths (including single quotes) is taken literally.
    {
      printf '%s\n' "$header"
      cat "$commentfile" 2> /dev/null
      printf '%s\n' "$footer"
    } >> "$errorsfile"
  else
    # su takes a single command string, so the paths must be embedded in it.
    # Each embedded single quote becomes '\'' which keeps the path one
    # literal word for the user's shell.
    quoted_comment=$(printf '%s\n' "$commentfile" | sed "s/'/'\\\\''/g")
    quoted_errors=$(printf '%s\n' "$errorsfile" | sed "s/'/'\\\\''/g")
    su "$username" -c "{ echo '$header'
      cat '$quoted_comment' 2> /dev/null
      echo '$footer'
    } >> '$quoted_errors'"
  fi
}

# Function for translating time from 00:01:01 format to 0061s format
#
# Arguments:
#   $1 - time as [[DD:]HH:]MM:SS, or plain seconds; may be empty
# Result:
#   ttts_new_time (global) - the number of seconds, 0 for empty input
# Fields are weighted from the right (seconds, minutes, hours, days), so a
# value with fewer than three fields is not mistaken for hours.
translate_time_to_seconds () {
    local orig_time=$1
    ttts_new_time=$(printf '%s\n' "$orig_time" | awk -F: '
        BEGIN { unit[0] = 1; unit[1] = 60; unit[2] = 3600; unit[3] = 86400 }
        {
            total = 0
            for (i = NF; i >= 1 && NF - i <= 3; i--) total += $i * unit[NF - i]
            print total
        }')
}

# Print the path of the first job.*.local file, searching the control
# directories in order, that contains the string "localid=$job_id".
# Reads the globals control_dirs (eval-quoted directory list) and job_id.
# Prints nothing when no file matches.
find_by_local() {
  local ctr_dir
  eval "set -- $control_dirs"
  for ctr_dir in "$@"; do
    # Quoted so the directory is passed to find as one word whatever IFS the
    # caller has in effect.
    find "${ctr_dir}" -name 'job.*.local' -print0 \
      | xargs -0 grep -F -l "localid=$job_id" 2>/dev/null
  done \
  | head -n 1
}

# Print the job.*.local path that corresponds to the first job.*.grami file,
# searching the control directories in order, that contains the string
# "joboption_jobid=$job_id".  The .grami suffix of the match is rewritten to
# .local; whether that .local file exists is not checked here.
# Reads the globals control_dirs (eval-quoted directory list) and job_id.
find_by_grami() {
  local ctr_dir
  eval "set -- $control_dirs"
  for ctr_dir in "$@"; do
    # Quoted so the directory is passed to find as one word whatever IFS the
    # caller has in effect.
    find "${ctr_dir}" -name 'job.*.grami' -print0 \
      | xargs -0 grep -F -l "joboption_jobid=$job_id" 2>/dev/null
  done \
  | sed 's/\.grami$/.local/' \
  | head -n 1
}

#
#  Main function for processing one PBS log.
#  Extracts log lines with code 0010 (job exited) and 0008 (job killed)
#
#  Takes no arguments; it works on shell variables set by the caller:
#    lname         path of the log file to read
#    cname         log name that is recorded in the state file
#    lines_skip    number of selected log lines to skip as already handled
#    state_file    file rewritten with "<cname> <selected lines seen>"
#    control_dirs  eval-quoted list of control directories
#    my_id         numeric uid of the caller ('0' enables the su code paths)
#    RUNTIME_NODE_SEES_FRONTEND  empty enables waiting for the diag file
#    GMKICK        command run on the job's .status file at the end
#
#  The body is one pipeline of three stages:
#    1. egrep selects the relevant lines of the log,
#    2. the first brace group skips lines handled earlier, strips the leading
#       field and rewrites the state file,
#    3. after 'sort -u', the second brace group handles each record: it may
#       append times to the job's .diag file, appends the comment file to
#       .errors, writes .lrms_done and runs GMKICK.
#  As pipeline stages the brace groups run in subshells under bash, so their
#  changes to lines_skip and IFS are not seen by the caller.
#
# TODO this should be split into smaller functions
process_log_file () {
# Load the control directories into the positional parameters.
# NOTE(review): "$@" is not referenced again in this function; find_by_local
# and find_by_grami perform their own eval.
eval "set -- $control_dirs"
# Stage 1: keep lines whose second ';'-separated field is 0010 with a "Job"
# fourth field, or 0008 "Job" lines whose message starts with "Exit_status="
# or "Job deleted".
egrep '^[^;]*;0010;[^;]*;Job;|^[^;]*;0008;[^;]*;Job;[^;]*;Exit_status=|^[^;]*;0008;[^;]*;Job;[^;]*;Job deleted' ${lname} | \
{
  # Stage 2: counts every selected line, passes on only the new ones.
  lines_processed='0'
  # skip already processed lines
  while [ $lines_skip -gt '0' ] ; do
    read rest_line
    if [ $? != '0' ] ; then break ; fi
    lines_skip=$(( $lines_skip - 1 ))
    lines_processed=$(( $lines_processed + 1 ))
  done
  # Emit the remaining lines without their first ';'-separated field, so
  # each record now starts with the 0010/0008 code.
  while true ; do
    read rest_line
    if [ $? != '0' ] ; then break ; fi
    echo "$rest_line" | sed 's/^[^;]*;//'
    lines_processed=$(( $lines_processed + 1 ))
  done
  # Defensive clamp; the counter is only ever incremented above.
  if [ "$lines_processed" -lt '0' ] ; then
    lines_processed=0;
  fi
  # Remember how many selected lines of this log have been seen so far.
  echo "$cname $lines_processed"> $state_file
} | \
sort -u | \
{
  # Stage 3: one iteration per distinct record.
  # parse by ;
  # (IFS stays ';' for the rest of this stage, so unquoted expansions below
  # are split on ';' only.)
  IFS=';'
  while true ; do
    # split line into fields
    # After stage 2 these are: code, the field between code and "Job"
    # (named pbs_server here), the literal "Job", the job id, the message.
    read pbs_code pbs_server pbs_job job_id job_message rest_line
    if [ "$?" != '0' ] ; then
      break
    fi
    # Try to extract exit code of PBS (note: if executable fails it's code goes to PBS)
    # exit_code is empty when the message has no Exit_status= part.
    exit_code=`echo "$job_message" | sed -n 's/^.*Exit_status=\([-0-9]*\).*/\1/p'`
    # Keep only the first two '.'-separated components of the job id.
    job_id=`echo "$job_id" | awk '{split($0,field,".");print field[1]"."field[2]}'`

    # Try to extract walltime and cputime
    # The sed scripts leave the captured value on a match and blank the line
    # otherwise ("t leave" skips the blanking when the substitution worked).
    #WallTime
    walltime=`echo "$job_message" | sed 's/^.*resources_used.walltime=\(\([0-9]*:\)*[0-9][0-9]\).*/\1/;t leave;s/.*//;:leave'`
    #UserTime
    cputime=`echo "$job_message" | sed 's/^.*resources_used.cput=\(\([0-9]*:\)*[0-9][0-9]\).*/\1/;t leave;s/.*//;:leave'`

    # look for this id in job.ID.local, then in job.ID.grami
    # Records that match no job file are ignored.
    name=`find_by_local`
    if [ -z "$name" ]; then
        name=`find_by_grami`
        if [ -z "$name" ]; then continue; fi
    fi

        # When not running as uid 0, only handle job files owned by this user.
        if [ "$my_id" != '0' ] ; then
          if [ ! -O "$name" ] ; then continue ; fi
        fi
        # Strip the .local suffix; sed -n .. p yields nothing if it is absent.
        base_name=`echo "$name" 2>/dev/null | sed -n 's/\.local$//p'`
        if [ -z "${base_name}" ] ; then continue ; fi

        # Check if this is already written in .diag file.
	if [ -z "`grep WallTime $base_name.diag`" ]; then
            # Check if it is possible to write to file
            if [ ! -w $base_name.diag ]; then
                echo "Error: job $job_id has finished without writing WallTime to diag-file" 1>&2
                echo " and the diag-file is not writeable by grid-manager." 1>&2
                continue
            fi

            # It has not been written, write info to .diag file.
            # translate_time_to_seconds returns its result in ttts_new_time.
	    translate_time_to_seconds "$walltime"
	    echo "WallTime=${ttts_new_time}s" >> $base_name.diag
	    translate_time_to_seconds "$cputime"
	    echo "UserTime=${ttts_new_time}s" >> $base_name.diag
	    echo "KernelTime=0s" >> $base_name.diag

	fi

	# check if job already reported
        if [ -f "${base_name}.lrms_done" ] ; then continue ; fi

        # more protection - check if grid-manager thinks job is still running
        egrep 'INLRMS|SUBMIT|CANCELING' "${base_name}.status" >/dev/null 2>&1
        if [ ! $? = '0' ] ; then continue ; fi

	# So far only PBS exit code is available
	# It would be nice to have exit code of main executable
	# Naming: exit_code is what PBS logged, exitcode is the value of the
	# "exitcode=" line in the job's session .diag file.
	exitcode=''
        # get session directory of this job
        session=`grep -h '^sessiondir=' "${base_name}.local" | sed 's/^sessiondir=\(.*\)/\1/'`
        diagfile="${session}.diag"
        commentfile="${session}.comment"
	# As uid 0 the session files are read through su as the owner of the
	# .local file; otherwise they are read directly (username empty).
	if [ "$my_id" = '0' ] ; then
          username=`printowner "${name}"`
	else
	  username=
	fi
        if [ ! -z "$session" ] ; then
          # have chance to obtain exit code
          if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ] ; then
            # In case of non-NFS setup it may take some time till
            # diagnostics file is delivered. Up to 20 attempts are made with
            # a 10 second sleep after each failed one (about 200 seconds).
            diag_tries=20
            while [ "$diag_tries" -gt 0 ] ; do
	      if [ -z "$username" ] ; then
                exitcode=`grep '^exitcode=' "$diagfile" 2>/dev/null | sed 's/^exitcode=//'`
	      else
                exitcode=`su "${username}" -c "grep '^exitcode=' $diagfile" 2>/dev/null | sed 's/^exitcode=//'`
	      fi
              if [ ! -z "$exitcode" ] ; then break ; fi
              sleep 10
              diag_tries=$(( $diag_tries - 1 ))
            done
          else
            # Shared file system: a single attempt, no waiting.
	    if [ -z "$username" ] ; then
              exitcode=`grep '^exitcode=' "$diagfile" 2>/dev/null | sed 's/^exitcode=//'`
	    else
              exitcode=`su "${username}" -c "grep '^exitcode=' $diagfile" 2>/dev/null | sed 's/^exitcode=//'`
	    fi
	  fi
	fi
	# Try to obtain message from PBS if any
	# (the last line of the comment file)
        if [ -z "$username" ] ; then
  	  pbs_comment=`tail -n 1 "$commentfile"`
	else
  	  pbs_comment=`su "${username}" -c "tail -n 1 $commentfile"`
	fi
        save_commentfile "$username" "$commentfile" "${base_name}.errors"

        # Write .lrms_done as "<code> <message>" (or just "0" on success).
        if [ -z "$exitcode" ] ; then
          # No exit code of job means job was most probably killed
          # A missing PBS exit code is reported as -1.
          if [ -z "$exit_code" ] ; then exit_code='-1'; fi
	  if [ "$exit_code" = '0' ] ; then 
            echo "Job $job_id failed but PBS have not noticed that" 1>&2
            echo "-1 Job failed but PBS reported 0 exit code." > "${base_name}.lrms_done"
	  elif [ -z "$pbs_comment" ] ; then
            echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
            echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
	  else
            echo "Job $job_id failed with PBS exit code $exit_code" 1>&2
            echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
	  fi
	else
          # The job's own exit code is known.
          if [ -z "$exit_code" ] ; then exit_code='-1'; fi
          if [ ! "$exitcode" = 0 ] ; then
            # Job failed: never report 0 even if PBS logged 0.
  	    if [ "$exit_code" = '0' ] ; then exit_code='-1'; fi
            echo "Job $job_id failed with exit code $exitcode, PBS reported $exit_code." 1>&2
	    echo "$exit_code Job failed with exit code $exitcode." > "${base_name}.lrms_done"
	  else
            # Job exited 0: a non-zero PBS code still marks it as failed.
  	    if [ ! "$exit_code" = '0' ] ; then
              echo "Job finished properly but PBS reported $exit_code." 1>&2
	      if [ -z "$pbs_comment" ] ; then
                echo "$exit_code Job was killed by PBS." > "${base_name}.lrms_done"
	      else
                echo "$exit_code $pbs_comment" > "${base_name}.lrms_done"
              fi
	    else
              # echo "Job finished without errors." 1>&2
              echo "0" > "${base_name}.lrms_done"
	    fi  
	  fi	    
	fi
	# wake up GM
	${GMKICK} "${base_name}.status"
  done
}
}

# Pass 1: process every readable log in the PBS log directory whose name
# consists of digits only, in glob (name) order.  Logs named lower than the
# date stored in the state file are skipped; for logs named higher, no lines
# are skipped.  lname and cname are the globals read by process_log_file and
# end up naming the last digits-only entry seen.
readable_logs=no
if [ -n "${pbs_log_dir}" ] ; then
  # A glob replaces parsing the output of ls, and every path is quoted, so
  # a log directory containing whitespace works.  If nothing matches, the
  # literal pattern is rejected by the name check below.
  for log_path in "${pbs_log_dir}"/* ; do
    log_name=${log_path##*/}
    case "$log_name" in
      ''|*[!0-9]*) continue ;;
    esac
    cname=$log_name
    lname=$log_path
    if [ ! -r "$lname" ] ; then continue ; fi
    readable_logs=yes
    if [ "$cname" -lt "$ldate" ] ; then
      continue
    elif [ "$cname" -gt "$ldate" ] ; then
      lines_skip=0
    fi
    echo "Date: " "$cname"
    last_modified=$(stat "$lname" | grep Modify)
    process_log_file
  done
fi

# Pass 2: if any log was readable, watch the last log for changes.  Its
# "Modify" line from stat is compared every 10 seconds for 61 rounds; on a
# change the state file is re-read and the log is processed again.  The
# script then exits without falling through to the qstat scan.
if [ "$readable_logs" = 'yes' ] ; then
  time_count=0
  while true ; do
    new_modified=$(stat "$lname" | grep Modify)
    if [ "$new_modified" != "$last_modified" ] ; then
      last_modified="$new_modified"
      # State line is "<log name> <lines seen>"; anything missing or
      # non-numeric counts as 0, and values are read as decimal.
      ldt=
      lines=
      lines_skip=0
      ldate=0
      if [ -f "$state_file" ] && [ -r "$state_file" ] ; then
        read -r ldt lines _ < "$state_file" 2>/dev/null
        case "$ldt" in
          ''|*[!0-9]*) ;;
          *) ldate=$(( 10#$ldt )) ;;
        esac
        case "$lines" in
          ''|*[!0-9]*) ;;
          *) lines_skip=$(( 10#$lines )) ;;
        esac
      fi
      process_log_file
    fi
    sleep 10
    time_count=$(( time_count + 1 ))
    if [ "$time_count" -gt 60 ] ; then break ; fi
  done
  exit 0
fi

# If no PBS logs found try ordinary 'qstat'
# The control directories become the positional parameters for the loop
# over "$@" further down.
eval "set -- $control_dirs"
# Get all running jobs

# The output of 'qstat -a' is captured in a temporary file (pidslist).
if ! pidslist=$(mktemp "$TMP_DIR/qstat.XXXXXX") ; then
  # FS problems ?
  # TODO debug output here
  sleep 60
  exit 1
fi
# Quoted so that a PBS_BIN_PATH containing whitespace is run as one word.
if ! "${PBS_BIN_PATH}/qstat" -a 2>/dev/null 1>"$pidslist" ; then
  rm -f "$pidslist"
  # PBS server down ?
  sleep 60
  exit 1
fi

# Filter stdin to stdout, dropping every line whose 10th
# whitespace-separated field is exactly "C".  Lines with fewer than ten
# fields are passed through unchanged.
exclude_completed () {
  awk '{ if ($10 != "C") print $0 }'
}

# Numeric ids of all jobs that qstat lists in a state other than "C".
pids=$(grep '^[0-9][0-9]*\.' "$pidslist" | exclude_completed | sed 's/^\([0-9][0-9]*\).*/\1/')
rm -f "$pidslist"
# Go through directories
for ctr_dir in "$@" ; do
  # Obtain ids stored in job.*.local
  # (the directory is quoted so that paths containing whitespace work)
  ids=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -h "^localid=" 2>/dev/null | sed 's/^localid=\([0-9]*\).*/\1/')
  if [ -z "$ids" ] ; then continue ; fi
  # compare them to running jobs and find missing
  bids=
  for id in $ids ; do
    found=$(echo "$pids" | grep "^$id$")
    if [ -z "$found" ] ; then
      bids="$bids $id"
    fi
  done
  # go through missing ids
  for id in $bids ; do
    # find grid job corresponding to current local id
    # Only the first match is used, so jobfile is always a single path.
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$id." 2>/dev/null | head -n 1)
    if [ -z "$jobfile" ] ; then continue ; fi
    # extract grid id
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    donefile="${ctr_dir}/job.${gridid}.lrms_done"
    if [ -f "$donefile" ] ; then continue ; fi
    statusfile="${ctr_dir}/job.${gridid}.status"
    if [ ! -f "$statusfile" ] ; then continue ; fi
    # Only jobs the grid-manager still has in INLRMS or CANCELING are handled.
    status=$(cat "$statusfile")
    if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ]; then continue ; fi
    # As uid 0 the session files are accessed through su as the owner of the
    # .local file; otherwise directly (username empty).
    if [ "$my_id" = '0' ] ; then
      username=$(printowner "${jobfile}")
    else
      username=
    fi
    # get session directory of this job
    session=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
    if [ -n "$session" ] ; then
      # have chance to obtain exit code
      diagfile="${session}.diag"
      if [ -z "$username" ] ; then
        exitcode=$(grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//')
      else
        # The path is single-quoted inside the su command so that blanks and
        # '$' in it are not interpreted by the user's shell.  A path that
        # itself contains a single quote is still not supported here.
        exitcode=$(su "${username}" -c "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
      fi
      if [ -n "$exitcode" ] ; then
        # job finished and exit code is known
        save_commentfile "$username" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
        echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
        ${GMKICK} "$statusfile"
        echo "Job $gridid finished with exit code $exitcode"
        continue
      fi
    fi
    # job has probably finished and exit code is not known
    # The job is only declared lost after it has been found missing on
    # several consecutive scans; the count is kept in the .lrms_job file.
    exitcode='-1'
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    counter=0
    if [ -f "$countfile" ] ; then
      counter=$(cat "$countfile")
      # An empty or corrupt counter file counts as 0 instead of raising an
      # arithmetic error; the value is read as decimal.
      case "$counter" in
        ''|*[!0-9]*) counter=0 ;;
      esac
      counter=$(( 10#$counter + 1 ))
    fi
    if [ "$counter" -gt 5 ] ; then
      rm -f "$countfile"
      save_commentfile "$username" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
      echo "$exitcode Job was lost with unknown exit code" > "$donefile"
      ${GMKICK} "$statusfile"
      echo "Job $gridid finished with unknown exit code"
    else
      echo "$counter" > "$countfile"
    fi
  done
  # go through existing ids
  for id in $pids ; do
    # find grid job corresponding to current local id
    jobfile=$(find "${ctr_dir}" -name 'job.*.local' -print0 | xargs -0 grep -F -l "localid=$id." 2>/dev/null | head -n 1)
    if [ -z "$jobfile" ] ; then continue ; fi
    gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
    countfile="${ctr_dir}/job.${gridid}.lrms_job"
    # reset failure counter
    rm -f "$countfile"
  done
done
sleep 60
exit 0

