#!/bin/bash
#
#
#   Scan for finished LSF jobs, using bjobs
#
# usage: scan_lsf_job control_dir ...

# Set variables:
#   LSF_BIN_PATH

# ARC1 passes the config file first: "--config <file>" is consumed here and
# remembered in ARC_CONFIG, so that $1 is the first control directory afterwards.
if [ "$1" = "--config" ]; then shift; ARC_CONFIG=$1; shift; fi

# Absolute directory containing this script.  All expansions are quoted so an
# installation path containing whitespace still resolves; cd's own output is
# discarded so that only pwd's result is captured.
basedir=$(dirname "$0")
basedir=$(cd "$basedir" > /dev/null && pwd) || exit $?

# Absolute path of the library directory, expected at ../../lib/arc relative
# to this script.  The script aborts if it does not exist.
pkglibdir="$basedir/../../lib/arc"
pkglibdir=$(cd "$pkglibdir" > /dev/null && pwd) || exit $?

# Assume that gm-kick is installed in the same directory
GMKICK=${basedir}/gm-kick

# Load the LSF environment settings (presumably this is what provides
# LSF_BIN_PATH, which is checked further down); abort if sourcing fails.
. "${pkglibdir}/configure-lsf-env.sh" || exit $?

# Files created from here on are not writable by group or others.
umask 022

# Prints the login name of the owner of the file given as argument.
# Perl is used because it's more portable than using the stat command.
# Arguments: $1 - path of the file to inspect
# Outputs:   the owner's login name on stdout (no trailing newline)
# Returns:   0 on success; 1 if the owning uid has no passwd entry;
#            non-zero with "<file>: <reason>" on stderr if stat fails
printowner () {
  # Kept local so the perl one-liner does not leak into the script's globals.
  local code='$f=$ARGV[0];if(@s=stat $f){@p=getpwuid $s[4];if(@p){print $p[0]}else{exit 1}}else{die "$f: $!\n"}'
  # "--" ends perl's own switch parsing, so a path that starts with "-" is
  # still handed to the program as $ARGV[0].
  /usr/bin/perl -we "$code" -- "$1"
}

# At least one control directory must be given (usage: control_dir ...).
if [ -z "$1" ] ; then
    echo "Missing control directory as arg1" 1>&2
    exit 1
fi

# first control_dir is used for storing own files

echo "$(date) : control_dir=$1" 1>&2

control_dir=$1

# Collect all control directories into one string.  That string is re-read
# later with `eval "set -- $control_dirs"`, so every entry is stored in
# shell-quoted form (printf %q): directories containing whitespace or shell
# metacharacters then survive the eval as single, literal words.  Empty
# arguments are dropped, as the plain space-joined form dropped them too.
control_dirs=
while [ $# -gt 0 ] ; do
  if [ -n "$1" ] ; then
    printf -v quoted_dir '%q' "$1"
    control_dirs="${control_dirs} ${quoted_dir}"
  fi
  shift
done

# Numeric uid of the account running the scanner ('0' selects the su-based
# code paths further down).
my_id=$(id -u)

# Login name of the same account.
my_name=$(id -un)

# Wraps a string in single quotes so that a POSIX shell reads it back as one
# literal word; every embedded single quote is rewritten as '\''.
# Arguments: $1 - string to quote
# Outputs:   the quoted word on stdout
_sh_squote () {
  printf "'%s'" "$(printf '%s' "$1" | sed "s/'/'\\\\''/g")"
}

# Append .comment (containing STDOUT & STDERR of the job wrapper) to .errors
# Arguments: $1 - user to write the file as; empty means the current user
#            $2 - path of the .comment file (a missing/unreadable file is
#                 tolerated: only the two marker lines are written then)
#            $3 - path of the .errors file to append to
# Returns:   status of the append operation (or of su when $1 is given)
save_commentfile () {
  # Locals: the caller uses a global named "username" as well, which must not
  # be overwritten from in here.
  local username=$1
  local commentfile=$2
  local errorsfile=$3
  local header='---------- Contents of output stream forwarded by LSF -----------'
  local footer='------------------------- End of output -------------------------'
  local action

  if [ -z "$username" ] ; then
    # Same user: run the commands directly instead of eval-ing generated
    # shell text, so that any path is taken literally.
    {
      printf '%s\n' "$header"
      cat "$commentfile" 2> /dev/null
      printf '%s\n' "$footer"
    } >> "$errorsfile"
  else
    # Different user: the commands have to travel as text through "su -c",
    # therefore both paths are embedded as properly quoted shell words.
    action="{ echo '$header'
      cat $(_sh_squote "$commentfile") 2> /dev/null
      echo '$footer'
    } >> $(_sh_squote "$errorsfile")"
    su "$username" -c "$action"
  fi
}


# Get all running jobs
#

# The LSF client commands are looked up under LSF_BIN_PATH; give up when it
# is unset or empty.  The variable is quoted so the test is well-formed for
# any value, and the message names the variable instead of expanding it
# (which printed only " not set").
if [ -z "${LSF_BIN_PATH}" ]; then
    echo "LSF_BIN_PATH not set" 1>&2
    exit 1
fi

# Filters bjobs output (read from stdin) down to the ids of jobs that LSF
# still holds as pending, running or suspended.
# The state is compared against the third column (STAT in the default bjobs
# layout) as a whole word, so a status keyword that merely occurs in a user,
# host or job name does not make a finished job look alive.
# Outputs: one job id (first column) per line
lsf_active_job_ids () {
  awk '$3 ~ /^(PSUSP|USUSP|SSUSP|RUN|PEND)$/ { print $1 }'
}

# Ask LSF for the jobs of all users, including recently finished ones.
# NOTE(review): only the queue "normal" is queried; jobs submitted to any
# other queue will not be listed here -- confirm this is intended.
lsf_stat=$("${LSF_BIN_PATH}/bjobs" -a -u all -q normal 2>/dev/null)

# An empty answer is only reported; the scan deliberately continues.
if [ -z "${lsf_stat}" ] ; then
    echo "bjobs returned empty result" 1>&2
fi

# Ids of all jobs LSF still knows as active, one per line.
pids=$(printf '%s\n' "${lsf_stat}" | lsf_active_job_ids)

# Prints the path of the first job.*.local file below a control directory
# whose "localid=" line carries exactly the given LRMS job id.  The id must be
# followed by a non-digit or the end of the line, so id 12 does not match
# "localid=123", while both "localid=12" and "localid=12.<suffix>" do.
# Arguments: $1 - control directory, $2 - numeric LRMS job id
# Outputs:   path of the matching file, or nothing if there is none
find_local_jobfile () {
  find "$1" -name 'job.*.local' -print0 \
    | xargs -0 grep -l -E "^localid=$2([^0-9]|\$)" 2>/dev/null \
    | head -n 1
}

# Restore the control directories as positional parameters.
eval "set -- $control_dirs"

# Go through directories (every control directory given on the command line)
for ctr_dir in "$@" ; do

    # Obtain ids stored in job.*.local
    ids=$(find "$ctr_dir" -name 'job.*.local' -print0 \
        | xargs -0 grep -h "^localid=" 2>/dev/null \
        | sed 's/^localid=\([0-9]*\).*/\1/')

    if [ -z "$ids" ] ; then continue ; fi

    # Compare them to the jobs LSF reports as active ($pids): ids LSF no
    # longer lists go to bids, ids that are still listed go to alive_ids.
    bids=
    alive_ids=
    for id in $ids ; do
        if printf '%s\n' "$pids" | grep -q -x -F -- "$id" ; then
            alive_ids="$alive_ids $id"
        else
            bids="$bids $id"
        fi
    done

    # go through missing ids
    for id in $bids ; do

        # find grid job corresponding to current local id
        jobfile=$(find_local_jobfile "$ctr_dir" "$id")
        if [ -z "$jobfile" ] ; then continue ; fi

        # extract grid id from the file name job.<gridid>.local
        gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')

        # a job that already has its .lrms_done file needs no further work
        donefile="${ctr_dir}/job.${gridid}.lrms_done"
        if [ -f "$donefile" ] ; then continue ; fi

        statusfile="${ctr_dir}/job.${gridid}.status"
        if [ ! -f "$statusfile" ] ; then continue ; fi

        # only jobs recorded as INLRMS or CANCELING are handled here
        status=$(cat "$statusfile")
        if [ "$status" != "INLRMS" ] && [ "$status" != "CANCELING" ] ; then continue ; fi

        # When running as root, the job's files are accessed as the owner of
        # the .local file (via su); otherwise as the current user.
        if [ "$my_id" = '0' ] ; then
            username=$(printowner "${jobfile}")
        else
            username=
        fi

        # get session directory of this job
        session=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
        if [ -n "$session" ] ; then
            # have chance to obtain exit code from <session>.diag
            diagfile="${session}.diag"
            if [ -z "$username" ] ; then
                exitcode=$(grep '^exitcode=' "$diagfile" | sed 's/^exitcode=//')
            else
                # the path is quoted inside the command text handed to su so
                # that whitespace in it does not split the argument
                exitcode=$(su "${username}" -c "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
            fi

            if [ -n "$exitcode" ] ; then
                # job finished and exit code is known
                save_commentfile "$username" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
                echo "$exitcode Executable finished with exit code $exitcode" > "$donefile"
                "${GMKICK}" "$statusfile"
                continue
            fi
        fi

        # job has probably finished and exit code is not known
        exitcode='-1'
        countfile="${ctr_dir}/job.${gridid}.lrms_job"

        # Number of scans for which the job has been missing.  The stored
        # value is validated so that an empty or corrupted counter file cannot
        # break the arithmetic and keep the job from ever being declared lost.
        counter=0
        if [ -f "$countfile" ] ; then
            counter=$(cat "$countfile")
            case "$counter" in
                ''|*[!0-9]*) counter=0 ;;
            esac
            counter=$(( 10#$counter + 1 ))
        fi

        if [ "$counter" -gt 5 ] ; then
            # missing for too many scans: give up and report the job as lost
            rm -f "$countfile"
            save_commentfile "$username" "${session}.comment" "${ctr_dir}/job.${gridid}.errors"
            echo "$exitcode Job was lost with unknown exit code" > "$donefile"
            "${GMKICK}" "$statusfile"
        else
            echo "$counter" > "$countfile"
        fi

    done

    # Go through the local ids that LSF still lists.  Only ids recorded in
    # this control directory can have a counter file, so iterating over them
    # avoids one find/grep run per LSF job of every user.
    for id in $alive_ids ; do
        # find grid job corresponding to current local id
        jobfile=$(find_local_jobfile "$ctr_dir" "$id")
        if [ -z "$jobfile" ] ; then continue ; fi
        gridid=$(basename "$jobfile" '.local' | sed 's/^job\.//')
        countfile="${ctr_dir}/job.${gridid}.lrms_job"
        # reset failure counter
        rm -f "$countfile"
    done

done

# Wait one minute before reporting success.
# NOTE(review): presumably this throttles how often the caller can re-run the
# scan -- confirm against whatever invokes this script.
sleep 60
exit 0
