pro check_process, timelimit=timelimit,  cpulimit=cpulimit, nomail=nomail , $
	loud=loud
;+
;NAME:
;	check_process
;PURPOSE:
;	To monitor  jobs that have hung up and are eating up CPU
;CALLING SEQUENCE:
;	check_process [,timelimit=timelimit, cpulimit=cpulimit, /nomail, /loud]
;KEYWORD PARAMETERS:
;	timelimit - a job is "over time" when the first ':'-delimited field
;	            of its colon-bearing ps column is >= this value
;	            (presumably minutes of CPU time - confirm against local ps).
;	            Defaults to the value in the limits file, else 50.
;	            If undefined on entry it is set to the default on return.
;	cpulimit  - a job is "over cpu" when the third ps column (read as a
;	            float) is >= this value.
;	            Defaults to the value in the limits file, else 2.
;	            If undefined on entry it is set to the default on return.
;	nomail    - if set, do not send mail (also passed to check_oldproc)
;	loud      - if set, print a diagnostic line for every job examined
;FILES:
;	$DIR_SITE_SETUPD/pid_limits.dat - optional; six values read in order:
;	            cpu limit, time limit, kill patterns, exception patterns,
;	            old-job age, old-job kill patterns
;	$DIR_SITE_LOGS/chk_process.log  - rewritten with the current ps listing
;	$DIR_SITE_LOGS/chk_proc_<time>  - archive copy of the same listing
;SIDE EFFECTS:
;	Jobs that match a kill pattern AND exceed both limits are killed with
;	'kill -9 <pid>'.  Jobs that only exceed both limits are reported.
;	Jobs whose user column contains 'root' or whose ps line contains an
;	exception pattern are never examined.
;HISTORY:
;       22-Apr-94 (SLF) - Rewrite MDM kill_ftp - parameter/generic PID checks
;       25-Apr-94 (SLF)	- read from file if it exists
;       29-Apr-94 (SLF) - actually spawn command! (was claiming it spawned)
;       11-Jul-94 (SLF) - remove ps warning lines from output ('ps:...')
;       15-Feb-95 (SLF) - Add call to check_oldprocess - added 2 parameters
;                         to the pid_limits data file, create archive logs
;       27-Feb-95 (slf) - allow BSD syntax (OSF, etc)
;-
loud=keyword_set(loud)
mailit=1- keyword_set(nomail)

; ---------- site limits: from file if present, else built-in ----------
limitfile=concat_dir('$DIR_SITE_SETUPD','pid_limits.dat')
if file_exist(limitfile) then begin
   flimits=rd_tfile(limitfile,6)
   fcpu=float(flimits(0))
   ftime=float(flimits(1))
   fkill=str2arr(flimits(2))
   fexcept=str2arr(flimits(3))
   oage=float(flimits(4))		; flag jobs older than this (user)
   okill=str2arr(flimits(5))		; kill these older jobs
endif else begin
   fcpu=2.
   ftime=50.
   fkill =['ftp']
   fexcept=['idl']
   oage=6
   okill=['ftp','/usr/bin/rsh']
endelse

; slf 15-feb - call check_oldprocess prior to checking current jobs
; (purge is 30 when running as user 'software', otherwise 0)
message,/info,"Checking for old/hung processes"
check_oldproc, kill_pattern=okill, hours_age=oage, nomail=nomail, $
	purge=((get_user() eq 'software'))*30.

; now look at the new stuff
; slf, 27-feb-95 - use diff syntax for bsd systems
bsd_sys=['osf']
cmd = 'ps ' + (['-',''])(is_member(!version.os,bsd_sys,/ignore_case)) + 'auxw'
spawn, cmd, result

; The first line is dropped as the ps header, so at least two lines are
; required; otherwise result(1:*) is an illegal subscript range.
if n_elements(result) lt 2 then begin
   message,/info,'No process listing returned by: ' + cmd + ', exiting...'
   return
endif

logfile=concat_dir('$DIR_SITE_LOGS','chk_process.log')
joblist=result(1:*)
file_append,logfile,joblist,/new
; create archive file for check_oldprocess (next time through)
archive=concat_dir('$DIR_SITE_LOGS','chk_proc_'+ ex2fid(syst2ex()))
file_append,archive,joblist,/new
;
killjobs=fkill					; add to list for auto-kills
if n_elements(timelimit) eq 0 then timelimit=ftime
if n_elements(cpulimit) eq 0 then cpulimit=fcpu

parse=rd_tfile(logfile,10,/compress,nocomment='ps:')		; use rd_tfile to make table

; The table is read back with the 'ps:' warning lines removed, but joblist
; still holds them.  Both are indexed with the same row numbers below, and
; the PID to kill comes from the table while the pattern match uses joblist,
; so the two MUST line up.  Re-align joblist; if that is not possible, stop
; rather than risk acting on the wrong PID.
; NOTE(review): assumes rd_tfile drops 'ps:' lines and blank lines - confirm.
nrows=n_elements(parse(0,*))
if nrows ne n_elements(joblist) then begin
   keep=where((strpos(strtrim(joblist,1),'ps:') ne 0) and $
              (strtrim(joblist,2) ne ''), nkeep)
   if nkeep eq nrows then joblist=joblist(keep) else begin
      message,/info,'Process table and job list are inconsistent, exiting...'
      return
   endelse
endif

notroot=where(strpos(parse(0,*),'root') eq -1,nr)
; --------------- dont do anything to root jobs -----------------
if nr eq 0 then begin
   message,/info,'No non-root jobs running, exiting...'
   return
endif
; ---------------------------------------------------------------

parse=parse(*,notroot)
list = joblist(notroot)

; ---- remove jobs whose ps line contains any exception pattern ----
except=fexcept
excepti=0
repeat begin							; kludge****
   weed=where(strpos(list,except(excepti)) eq -1,nremain)
   if nremain ne 0 then begin
      list=list(weed)
      parse=parse(*,weed)
   endif
   excepti=excepti+1
endrep until excepti eq n_elements(except) or nremain eq 0

if nremain eq 0 then begin
   message,/info, 'No non-essential jobs running...'
   return
endif

; ---- flag jobs whose ps line contains any auto-kill pattern ----
killit=intarr(nremain)

for i=0,n_elements(killjobs)-1 do begin
   killss=where(strpos(list,killjobs(i)) ne -1,kcnt)
   if kcnt gt 0 then killit(killss) = killit(killss) or 1
endfor

killcmd = 'kill -9 '
node=str2arr(get_host(),'.')
node=node(0)					; short host name

; ---- examine each remaining job against the limits ----
for i=0,n_elements(list)-1 do begin
    ; locate the time field by looking for a ':' in the table row
    ; NOTE(review): more than one column may contain ':' (e.g. a start
    ; time as well as a cpu time); that case is passed to str2arr
    ; unchanged - confirm which column is intended.
    ip = where(strpos(parse(*,i), ':') ne -1, ncolon)
    if ncolon gt 0 then begin
       time=str2arr(parse(ip,i),':')
       timeout=(long(time(0)) ge timelimit)
    endif else timeout=0			; no time field -> not over time
    cpu=float(parse(2,i))
    cpuout=(cpu ge cpulimit)
    overlimit=(timeout and cpuout)		; must exceed BOTH limits
    dokill = killit(i) and overlimit
    msgtext=''
    if loud then begin
       print,'CHK: ' + strmid((list(i)),0,75)
       print,'Time limit: ' + strtrim(timelimit,2), '  ' , $
	'CPU limit: ' + strtrim(cpulimit,2)
    endif
    case 1 of
       dokill: begin
          spawncmd=killcmd + parse(1,i)
          msgtext = ['Check_process Message', $
            'I am killing the following process on machine '  + node,list(i)]
          subject='Killing PID: ' + parse(1,i) + '[' + node + ']'
          message,/info,spawncmd
          spawn, spawncmd, result
       endcase
       overlimit: begin
          msgtext = ['Check_process Message', $
		'Process may need to be killed on ' + get_host(), list(i)]
          subject='Overlimit PID: ' + parse(1,i) + '[' + node + ']'
       endcase
       else: if loud then print, 'CHK: Does not exceed cpu or time limit'
    endcase
    ; report (and optionally mail) only when something was flagged
    if msgtext(0) ne '' then begin
       prstr,msgtext
       if mailit then mail, msgtext, user='software@isass0.solar.isas.ac.jp', subj=subject
    endif
endfor

end

