#!/bin/sh
#
# $Header: /home/vikas/src/nocol/utility/RCS/keepalive_monitors,v 1.14 2000/01/27 05:17:50 vikas Exp $
#
# Make sure that the various nocol programs are doing their job and are
# not dead. Mails out the old errors file to the OPSMAIL folks.
#
# This file checks to see if the list of 'PROGRAMS' is running on HOST
# and all the ALTPROGRAMS are running on ALTHOST (the lists can be empty).
# Set these lists and hostnames. Set a mail address for OPSMAIL also.
#
# Can run this from the crontab every 30 minutes.
#
#     5,35 * * * *      $ROOTDIR/bin/keepalive_monitors
#
#	-Vikas Aggarwal,  vikas@navya.com, Feb 1997
#
# Keep group write permission on files we create; mask write for others.
umask 002

## Site configuration -- adjust these for the local installation.
##
ROOTDIR=/pbulk/work/net/nocol/work/.destdir/usr/pkg
PIDDIR=/pbulk/work/net/nocol/work/.destdir/var/run
MAIL=/usr/bin/mail		# location of 'mail' program
OPSMAIL=root@localhost		# mail on restarting
CRITMAIL=root@localhost

# Flags that make 'ps' show all processes:
#   -axw on most systems, -ef on Irix, Solaris, HP-UX.
PSFLAGS=-axw
#PSFLAGS='-ef'

# Monitors that must be running on HOST.
HOST=victory.netbsd.org
PROGRAMS="noclogd etherload ippingmon rpcpingmon nsmon ntpmon portmon \
radiusmon hostmon tpmon"

# Some monitors (such as another etherload) may run on a second host and
# write to the same NFS shared data directory.  If so, name that host and
# its monitors here (the list may be empty).
ALTHOST=host2.navya.com
ALTPROGRAMS=etherload.host2

## Nothing below this point should need site-specific changes.
##
BIN=${ROOTDIR}/bin
# Per-run report file, named with our pid; kept in PIDDIR, not under /tmp.
TMPFILE=${PIDDIR}/keepalive.tmp$$

# Search the nocol bin directory and the usual system directories first,
# then whatever PATH we were started with.
OPATH=${PATH}
PATH="${BIN}:/bin:/usr/bin:/sbin:/usr/sbin:/etc:/usr/etc\
:/usr/ucb:/usr/bsd:/usr/local/bin:/usr/local/gnu/bin:${OPATH}"

# On HUP, INT, QUIT or TERM: report, remove the report file and exit 1.
# SIGKILL (9) is deliberately not listed: it cannot be caught, and asking
# to trap it is undefined behaviour.  The action is single-quoted so that
# $0 and $TMPFILE are expanded (and the file name quoted) when it runs.
trap 'echo "$0 Exiting on signal"; rm -f "$TMPFILE"; exit 1' 1 2 3 15

##
# Pick the list of programs that belongs to the host we are running on.
THISHOST=`hostname`

case "X${THISHOST}" in
  "X$HOST")
    PROGLIST="$PROGRAMS"
    ;;
  "X$ALTHOST")
    PROGLIST="$ALTPROGRAMS"
    ;;
  *)
    # Neither of the configured hosts: say so and quit without error.
    echo "Current host $THISHOST is not one of $HOST or $ALTHOST"
    exit 0
    ;;
esac

# An empty list means there is nothing to watch on this host.
[ "X${PROGLIST}" != "X" ] || exit 0

# Work from inside $BIN.  Both the directory test and the cd are checked,
# so that the script never carries on from some other working directory
# (for example when $BIN exists but cannot be entered).
if [ ! -d "$BIN" ]; then
  echo "Directory $BIN not found, exiting"
  exit 1
fi
cd "$BIN" || {
  echo "Cannot change directory to $BIN, exiting"
  exit 1
}

## The syntax for asking 'ps' about a single pid differs between systems.
## If a plain "ps 1" only produces a usage message, pass the pid with
## "-f -p" instead; otherwise no extra flags are needed.
PSNUMFLAGS=""
if (ps 1) 2>&1 | egrep -i '^usage' >/dev/null 2>&1; then
  PSNUMFLAGS='-f -p'
fi

## Account for the programs that have a pid file, and those that don't.
#  See which monitors are running, and restart the ones that are not.

# monitor_running_by_name NAME
#   Search the 'ps ${PSFLAGS}' listing for a line in which NAME appears
#   preceded by start of line, a blank or a '/', and followed by a blank
#   or end of line.  Anchoring the left side keeps one monitor from being
#   mistaken for another whose name merely ends with it (e.g. "tpmon"
#   inside "ntpmon").  The exit status is egrep's: 0 when a matching line
#   exists, 1 when none does.
monitor_running_by_name()
{
  # PSFLAGS is unquoted on purpose so it may hold several flags.
  # 'grep -v grep' drops the egrep command itself from the listing.
  ps ${PSFLAGS} 2>&1 | grep -v grep |
    egrep "(^|[ /])$1( |\$)" >/dev/null 2>&1
}

# Start with an empty report file.
>"$TMPFILE"

for p in ${PROGLIST}
do
  # First line of the monitor's pid file, if it has one.
  if [ -f "${PIDDIR}/${p}.pid" ]; then
    PID=`head -1 "${PIDDIR}/${p}.pid"`
  else
    PID=""
  fi

  if [ "X${PID}" = "X" ]; then
    # No recorded pid: look for the program name in the process list.
    pgmname=`basename "$p"`
    monitor_running_by_name "$pgmname"
  else
    # Recorded pid: check that this pid still shows the monitor's name.
    # PSNUMFLAGS and PID are unquoted on purpose so they are word-split.
    ps ${PSNUMFLAGS} $PID | grep "$p" >/dev/null 2>&1
  fi

  # Only status 1 ("no matching line") counts as dead.  Any other failure
  # of the check itself does not trigger a restart.
  if [ $? = 1 ]; then
    # The program is looked up relative to the current directory.
    if test -f "$p"; then
      #if [ $?prompt ]; then
      #  echo "Starting $p at `date` on $THISHOST"
      #fi

      echo "" >> "$TMPFILE"
      echo "Restarting NOCOL monitor  $p  on $THISHOST" >> "$TMPFILE"
      # Include whatever the previous run left in its error file,
      # before the restart below overwrites it.
      if test -s "${PIDDIR}/${p}.error"; then
        echo "Previous error file for NOCOL monitor  ${p}" >> "$TMPFILE"
        echo "                                      --------" >> "$TMPFILE"
        cat "${PIDDIR}/${p}.error" >> "$TMPFILE"
      fi

      # Restart in the background, capturing its output in the error file.
      "${BIN}/${p}" >"${PIDDIR}/${p}.error" 2>&1 &
    fi
  fi
done

# If the report file has any content, sign it and mail it to OPSMAIL.
if [ -s $TMPFILE ]; then
  echo "" >> $TMPFILE
  echo '                   -keepalive_monitors' >> $TMPFILE
  case "X${MAIL}" in
    X/usr/ucb/mail)
      # Only this mailer is given a subject line via -s.
      ${MAIL} -s "NOCOL keepalive_monitors" $OPSMAIL < $TMPFILE
      ;;
    *)
      ${MAIL} $OPSMAIL < $TMPFILE
      ;;
  esac
fi

# The report file is per-run scratch; always remove it.
rm -f $TMPFILE

####
