#! /usr/bin/python

import sys
import os
import signal
import time
import tempfile

import skyd_func
from tiny import hide_env, start, say

# 'boss' is the machine controlling skyd_wait
# grunts are machines controlling skyd_orbit

# Short host name (domain part stripped) of the machine running skyd_wait.
# $HOSTNAME is used when it is set, exactly as before. Shells do not always
# export it, so fall back to the kernel node name instead of dying with a
# KeyError while the module is being imported.
boss = (os.environ.get('HOSTNAME') or os.uname()[1]).split('.')[0]
stime = time.time()			# Module start time; used for the uptime shown in the wait message

# Floor division keeps long_pause an integer under true division as well;
# it is handed to signal.alarm(), which only takes whole seconds.
long_pause    = 86400//2		# Seconds without any signal before the alarm fires (12 hr)
report_delay  = 10			# Seconds to keep looking for a report file after a SIGUSR1
report_prefix = 'skyd_'			# All report file names start with this prefix

def skyd_load(claim):
	"""
#+
# NAME:
#	skyd_load
# PURPOSE:
#	Read conf file and set up the dictionary needed for
#	tracking the skyd_orbit processes launched on all grunts.
# CALLING SEQUENCE:
#	skyd_load(claim)
# INPUTS:
#	claim		boolean
#				True : initial load; conf entries are added/updated
#				       and the conf file is written back
#				False: reload; the conf file is only read
#				claim is also passed on to skyd_func.skyd_read_conf
# SIDE EFFECTS:
#	Sets the global new_runs (always) and the global reportdir
#	(initial load only).
#
#	On the initial load several entries in the conf file are
#	updated/added:
#	'pid' is set with pid of skyd_wait
#	'boss' is set to hostname of machine that controls skyd_wait
#	'time' is set to time at which skyd_wait was started
#	'cur_group' is set to the index of the last group, so that
#	the next orbit processed will be from group 0.
#
#	If 'max_load' (# processes per grunt) is not set then
#	'max_load' is set to 2.
#	If 'grunts' (list of machines that run skyd_orbit) is not
#	set then 'grunts' is set to 'boss' (i.e. the daemon will
#	run skyd_orbit locally only).
#	If 'reportdir' (dir for report files) is not set, or is not
#	an existing directory, then it is set to $TUB.
# PROCEDURE:
#	The global variable new_runs is set up as a dictionary
#	with one entry for each grunt listed in the conf file.
#	A grunt can be listed as <name> or as <name>:<load>. The
#	entry for each grunt is a list of max_load elements, or
#	of min(load,max_load) elements if a load was given.
#	Each element is whatever skyd_func.skyd_empty_run returns.
#	According to the original notes this is a dictionary
#	describing a process by:
#		'status': 'dead','start' or 'runs'
#		'wmark'	: '' or 'watermark'
#		'result': '', 'runs','done','kill'
#	with processes initialized as 'dead' with a blank watermark.
#	The watermark comes from a filename of type
#	<reportdir>/skyd_<random> with <random> a unique set
#	of characters (created by the tempfile module in skyd_start).
# MODIFICATION HISTORY:
#	DEC-2005, Paul  Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	global new_runs				# Bookkeeping for all run slots
	global reportdir

	csay = 'skyd_load'

	status = {'number': 0, 'message': ''}

	status = skyd_func.skyd_read_conf(cffile,claim,status)
	if status['number'] != 0:
		say(csay,'E','cf',status['message']+'\nterminated')

	conf = status.pop('conf')

	if claim:					# Only for the initial load

		conf['pid' ] = '%d'%pid	# Daemon process id
		conf['boss'] = boss		# Machine controlling daemon
		conf['time'] = time.strftime('%Y_%j_%H%M%S')
								# Time of (re)load
		# Count number of groups. Set cur_group to the last one
		# so that the next orbit processed will be from group 0.

		ngroups = len([key for key in conf.keys() if key.find('group_') == 0])
		conf['cur_group'] = '%d'%(ngroups-1)

		if 'max_load' in conf:
			max_load = int(conf['max_load'])
		else:
			max_load = 2
			conf['max_load'] = '%d'%max_load

		if 'grunts' in conf:
			grunts = conf['grunts'].split(',')
		else:
			grunts = [boss]
			# Store the same comma-separated string form that is read
			# back above and on every reload; storing the Python list
			# itself would not survive the split(',') on reload.
			conf['grunts'] = ','.join(grunts)

		reportdir = ''
		if 'reportdir' in conf:
			reportdir = os.path.expandvars(conf['reportdir'])
			if not os.path.isdir(reportdir):
				say(csay,'I','#'+reportdir,'does not exist')
				reportdir = ''

		if reportdir == '':			# Missing or unusable: fall back to $TUB
			reportdir = os.environ['TUB']
			conf['reportdir'] = hide_env(reportdir)
			say(csay,'I','#'+conf['reportdir'],'is report directory')

		status['conf'] = conf
		status = skyd_func.skyd_write_conf(cffile,status)

		if status['number'] != 0:
			say(csay,'E','cf',status['message']+'\nterminated')

	else:

		# Reload: the entries were filled in during the initial load
		max_load = int(conf['max_load'])
		grunts   = conf['grunts'].split(',')

	# Split <name>:<load> entries. A per-grunt load can only lower
	# the number of slots; it is capped at max_load.

	grunts_load = dict()

	for i, entry in enumerate(grunts):
		if ':' not in entry:
			grunts_load[entry] = max_load
		else:
			run = entry.split(':')
			grunts[i] = run[0]
			grunts_load[run[0]] = min(int(run[1]),max_load)

	# Initialize the list of runs with one empty run per slot

	new_runs  = dict()

	for grunt in grunts:
		new_runs[grunt] = [skyd_func.skyd_empty_run() for _ in range(grunts_load[grunt])]

	return

def skyd_reload():
	"""
#+
# NAME:
#	skyd_reload
# PURPOSE:
#	Reread the conf file and rebuild the bookkeeping of skyd_orbit
#	runs without losing track of the runs that are still going.
# CALLING SEQUENCE:
#	skyd_reload()
# SIDE EFFECTS:
#	The globals new_runs and old_runs are rebuilt.
#	The global terminator is cleared if it was set.
#	skyd_start is called for every grunt in the new new_runs.
# PROCEDURE:
#	skyd_reload is called from the SIGHUP and SIGALRM handlers.
#
#	1. The run slots in new_runs for which skyd_func.skyd_not_running
#	   returns no index (according to the original notes: those
#	   marked 'start' or 'runs') are collected in a new old_runs.
#	2. The previous old_runs (leftovers of earlier reloads) is
#	   merged into it.
#	3. skyd_load(False) sets up a fresh new_runs.
#	4. For grunts present in both lists, runs are moved from
#	   old_runs back into the first slots of new_runs.
#	5. Whatever cannot be moved back (grunt dropped from the conf
#	   file, or fewer slots than before) stays in old_runs.
# MODIFICATION HISTORY:
#	DEC-2005, Paul  Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	global new_runs
	global old_runs
	global terminator

	csay = 'skyd_reload'
	say(csay,'I','reload','\nconfiguration file\n')

	# Hang on to the leftovers of earlier reloads (an empty
	# dictionary on the first reload); they are merged back below.

	leftover = old_runs

	# Build the new old_runs from copies of the new_runs lists
	# (copies, so the lists in new_runs are not touched), with every
	# slot reported by skyd_not_running taken out. Grunts with
	# nothing left are not entered at all.

	old_runs = dict()
	for grunt in new_runs.keys():
		live = list(new_runs[grunt])
		idle = skyd_func.skyd_not_running(live)
		while idle != -1:
			del live[idle:idle+1]
			idle = skyd_func.skyd_not_running(live)
		if len(live) > 0:
			old_runs[grunt] = live

	# Merge in the leftovers, emptying that dictionary as we go.

	while len(leftover) > 0:
		grunt, runs = leftover.popitem()
		if grunt in old_runs:
			old_runs[grunt].extend(runs)
		else:
			old_runs[grunt] = runs

	# old_runs now holds all unfinished runs (and may be empty).

	skyd_load(False)

	# skyd_load has set up a fresh new_runs. Where a grunt is on both
	# lists, move as many old runs as fit into the first new slots.

	for grunt in old_runs.keys():
		if grunt not in new_runs:
			continue
		fresh = new_runs[grunt]
		stale = old_runs[grunt]
		n = min(len(fresh),len(stale))
		fresh[0:n] = stale[0:n]
		del stale[0:n]			# Remove transferred runs from old list

	# Drop grunts without runs from old_runs. Runs can remain if the
	# grunt is no longer in new_runs, or if there were more runs than
	# there are slots now. Walk a snapshot of the keys because
	# entries are deleted along the way.

	for grunt in list(old_runs.keys()):
		if len(old_runs[grunt]) == 0:
			del old_runs[grunt]

	if terminator:
		say(csay,'I','terminator','cleared\n')
		terminator = False

	# Fire up the skyd_orbit processes

	count = 0
	for grunt in list(new_runs.keys()):
		count = skyd_start(grunt, count)

	return

def skyd_find_run( reports, lst_runs ):
	"""
#+
# NAME:
#	skyd_find_run
# PURPOSE:
#	Match one of the report files to one of the runs on a run list.
# CALLING SEQUENCE:
#	rtn = skyd_find_run( reports, lst_runs )
# INPUTS:
#	reports		list of strings
#				names (no directory) of candidate report files
#	lst_runs	dictionary
#				run list (old_runs or new_runs): for each grunt
#				a list of run dictionaries with at least the
#				entries 'report', 'status' and 'wmark'
# OUTPUTS:
#	rtn		dictionary
#				if a matching report is found:
#				'report'  	'report' entry of the run, followed
#						by '_' and 'result'
#				'wmark'		'wmark' entry of the run
#				'grunt' 	name of grunt running process
#				'run'		index of run in list for grunt
#				'status'	either 'start' or 'runs'
#				'result'	'runs' or 'kill' if 'status'='start'
#						'done' or 'kill' if 'status'='runs'
#				if no matching report is found
#				then only one entry is present:
#				'report'	set to null string
# PROCEDURE:
#	Grunts and runs are tried in list order; the first match wins.
#	Only the file name part of the 'report' entry of a run is
#	compared with 'reports'.
#	For a run marked 'start' the names <name>_runs and <name>_kill
#	are tried, in that order.
#	For a run marked 'runs' the names <name>_done and <name>_kill
#	are tried, in that order.
#	A run marked 'dead' is never matched; an error is reported
#	if it still carries a report name.
#	An error is also reported for a 'start' or 'runs' entry
#	without report name.
#	Runs with any other status are ignored.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	csay = 'skyd_find_run'

	# Report suffixes that can follow each live status, in the order tried
	follow_up = {'start': ('runs','kill'), 'runs': ('done','kill')}

	for grunt in lst_runs.keys():

		for run, entry in enumerate(lst_runs[grunt]):

			report = entry['report']	# <reportdir>/skyd_<wmark>
			status = entry['status']

			if status == 'dead':
				if report != '':
					say(csay,'E',grunt,'run %d is %s, but has a watermark\nterminated'%(run,status))
				continue

			if status not in follow_up:
				continue

			if report == '':
				say(csay,'E',grunt,'run %d is %s, but has no watermark\nterminated'%(run,status))

			name = os.path.split(report)[1]	# skyd_<wmark>

			for result in follow_up[status]:
				if name+'_'+result in reports:
					return {
						'report': report+'_'+result,
						'wmark' : entry['wmark'],
						'grunt' : grunt,
						'run'   : run,
						'status': status,
						'result': result,
					}

	return {'report': ''}

def skyd_find():
	"""
#+
# NAME:
#	skyd_find
# PURPOSE:
#	Match a file in reportdir against a process in the old_runs
#	or new_runs dictionary.
# CALLING SEQUENCE:
#	rtn = skyd_find()
# OUTPUTS:
#	rtn	dictionary	dictionary returned from skyd_find_run
#				with one extra item if a matching report
#				was found:
#				'list' is set to 'old_runs' or 'new_runs'
#				reflecting the process list
# SIDE EFFECTS:
#	The full name of a matched report is appended to the global
#	list reports_claimed.
# CALLS:
#	skyd_find_run
# PROCEDURE:
#	The content of reportdir is picked up. Files already on the
#	reports_claimed list and files not starting with report_prefix
#	are dropped.
#	reports_claimed holds full path names while the directory
#	listing holds bare file names, so the comparison is made on
#	the file name part only.
#	First old_runs is checked for a matching report file.
#	If unsuccessful, the new_runs list is tried.
#	It is essential that the old_runs list is tried first.
#	Ideally this list is empty already. If not we want to clean
#	it out before processing new_runs.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	global reports_claimed

	# File names of the reports that have been claimed already.
	# The claimed list stores what skyd_find_run returns, i.e. the
	# full path, which never equals a name returned by os.listdir.

	claimed = [os.path.split(path)[1] for path in reports_claimed]

	reports = []

	for report in os.listdir(reportdir):	# Loop over all files in reportdir
		if report in claimed:
			say('skyd_find','I','report','avoiding collision with %s'%report)
			continue		# Omit reports on the reports_claimed list
		if report.find(report_prefix) != 0:
			continue		# Omit files not starting with report_prefix
		reports.append(report)

	# Try old_runs before new_runs; the first list with a match wins

	for label, lst_runs in (('old_runs',old_runs), ('new_runs',new_runs)):
		rtn = skyd_find_run( reports, lst_runs )
		if rtn['report'] != '':		# Add report to reports_claimed list
			reports_claimed.append(rtn['report'])
			rtn['list'] = label
			return rtn

	return rtn				# No match: {'report': ''}

def skyd_start(grunt,count):
	"""
#+
# NAME:
#	skyd_start
# PURPOSE:
#	Start skyd_orbit runs on the specified grunt until all of its
#	run slots are in use.
# CALLING SEQUENCE:
#	new_count = skyd_start(grunt,count)
# INPUTS:
#	grunt		string		name of a grunt in new_runs
#	count		integer		number of minutes to delay start
#					of next skyd_orbit run
# OUTPUTS:
#	new_count	integer		input count, plus number of
#					new processes started here
# SIDE EFFECTS:
#	The slots used in new_runs[grunt] get a new 'report', 'wmark'
#	and 'time' entry and are marked 'start'.
# PROCEDURE:
#	Nothing is started if the terminator flag is set.
#
#	All processes marked as 'dead' in new_runs[grunt]
#	are selected to be launched again.
#	skyd_orbit is set up to be submitted to the at batch
#	queue on grunt with a delay of count minutes.
#	For each process launched count is incremented by one.
#	The final count value is returned.
#
#	skyd_start is typically run in a loop like this.
#
#	count = 0
#	for grunt in grunts:
#		count = skyd_start(grunt, count)
#
#	As a result skyd_orbit are launched across all grunts
#	at intervals of roughly 1 minute. This should reduce the
#	risk of multiple skyd_orbit runs accessing the conf file or
#	the user catalogue at the same time (skyd_orbit actually
#	provides some defense against this, but better safe than sorry).
#
#	Submitting the command is tried up to three times. If it
#	keeps failing a warning is given; the slots remain marked
#	'start'.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	global new_runs

	csay = 'skyd_start'

	if terminator:
		say(csay,'I','terminator','is set; no new runs started')
		return count

	# Processes are marked 'dead', 'start' or 'runs'
	# Look for 'dead' processes, mark them as 'start'
	# and build a single cmd with multiple skyd_orbit calls in it.

	status = 'dead'
	result = 'start'

	if grunt == boss:
		one_cmd = local_cmd
	else:
		one_cmd = remote_cmd

	count_save = count
	cmds = []					# One skyd_orbit submission per slot

	for run in new_runs[grunt]:

		if run['status'] != status:
			continue

		# NamedTemporaryFile actually creates the file (and deletes
		# it again when the file object is discarded right away).
		# So this also tests that reportdir is writable by boss.
		# Keywords are used because the positional order of the
		# arguments is not the same in all Python versions.

		report = tempfile.NamedTemporaryFile(mode='w+b',suffix='',prefix=report_prefix,dir=reportdir).name

		run['report'] = report
		run['wmark' ] = os.path.split(report)[1][len(report_prefix):]
		run['status'] = result
		run['time'  ] = time.strftime('%Y_%j_%H%M%S')

		cmds.append(one_cmd%(report,count))
		count += 1

	print('')					# Blank line on the log

	if count == count_save:		# Grunt is fully committed
		say(csay,'I',grunt,'no new process '+result)
		return count

	say(csay,'I',grunt,result+' %d process(es)'%(count-count_save))

	cmd = '; '.join(cmds)

	if grunt != boss:

		# For execution on a remote machine the command is sent over
		# using ssh. The command is submitted to the local at queue,
		# because we don't want to wait for the ssh connection to complete
		# (remember that cmd submits skyd_orbit to the at queue on the
		# remote machine).

		cmd = 'echo "ssh '+grunt+' \\"'+cmd+'\\"" | at now'

	# This goes wrong sometimes with an error message
	# "interrupted system call". Don't know why. We just
	# retry a couple of times.

	for attempt in range(3):
		try:
			os.popen(cmd)
		except (IOError, OSError):
			print(cmd)
			say(csay,'W','failed','system commmand; try again in a sec')
			time.sleep(10)
		else:
			break
	else:
		# All attempts failed. The slots stay marked 'start', so
		# make sure this does not go by unnoticed.
		say(csay,'W',grunt,'giving up on system command after 3 attempts')

	return count

def skyd_sighup(signum, frame):
	"""
#+
# NAME:
#	skyd_sighup
# PURPOSE:
#	Signal handler: reread the configuration file.
# CALLING SEQUENCE:
#	signal.signal(signal.SIGHUP, skyd_sighup)
# INPUTS:
#	signum, frame	standard signal handler arguments; not used
# CALLS:
#	skyd_reload
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	skyd_reload()

def skyd_alarm(signum, frame):
	"""
#+
# NAME:
#	skyd_alarm
# PURPOSE:
#	Signal handler: called when the alarm set in the main loop
#	goes off, i.e. no signal has come in for too long.
# CALLING SEQUENCE:
#	signal.signal(signal.SIGALRM, skyd_alarm)
# INPUTS:
#	signum, frame	standard signal handler arguments; not used
# CALLS:
#	skyd_reload, skyd_func.skyd_kill_runs
# PROCEDURE:
#	A warning is given and the configuration file is reloaded.
#	Then the runs that are left in old_runs after the reload
#	are passed to skyd_func.skyd_kill_runs.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	say('skyd_alarm','W','<time>','got alarm signal, no activity for too long')

	skyd_reload()

	# Deal with what remains in old_runs. The order matters: per the
	# original notes this has to come AFTER the reload to avoid
	# processing SIGUSR1 signals.

	skyd_func.skyd_kill_runs(old_runs,boss)

def skyd_ctrlc(signum, frame):
	"""
#+
# NAME:
#	skyd_ctrlc
# PURPOSE:
#	Signal handler: request an orderly shutdown.
# CALLING SEQUENCE:
#	signal.signal(signal.SIGINT, skyd_ctrlc)
# INPUTS:
#	signum, frame	standard signal handler arguments; not used
# SIDE EFFECTS:
#	The global terminator is set to True.
# PROCEDURE:
#	Only the terminator flag is raised; runs in progress are not
#	killed. With the flag set skyd_start does not launch new runs.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	global terminator

	terminator = True
	say('skyd_ctrlc','W','ctrl-c','terminator set; coasting to finish')

def skyd_go(signum, frame):
	"""
#+
# NAME:
#	skyd_go
# PURPOSE:
#	Signal handler: process the report file of an skyd_orbit run.
# CALLING SEQUENCE:
#	signal.signal(signal.SIGUSR1, skyd_go)
# INPUTS:
#	signum, frame	standard signal handler arguments; not used
# SIDE EFFECTS:
#	The report file is read and removed.
#	The globals new_runs, old_runs, ncount, terminator and
#	reports_claimed are updated.
# CALLS:
#	skyd_find, skyd_start
# PROCEDURE:
#	Called when a SIGUSR1 signal is received from an skyd_orbit run.
#	These are sent after a 'report' file has been put in reportdir.
#	Try to match one of the files in reportdir to one of processes
#	in the old_runs or new_runs dictionary using the watermark.
#
#	A run marked 'start' becomes 'runs'. The last word on the
#	first line of the report is stored as its pid (-1 if the
#	report is empty or does not end in a number).
#	A run marked 'runs' has finished: a slot in new_runs is
#	emptied and refilled by calling skyd_start, an entry in
#	old_runs is removed.
#	A 'kill' result from a run in new_runs sets the terminator
#	flag; from a run in old_runs it is ignored.
#	Once the terminator flag is set and no runs are left the
#	program is stopped.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#-
	"""
	global terminator
	global new_runs
	global old_runs
	global ncount
	global reports_claimed

	csay = 'skyd_go'

	# Try for report_delay seconds to find the report file.
	# Since the SIGUSR1 was sent AFTER the report file was written, this should
	# be enough (actually skyd_orbit waits for a couple of seconds after writing
	# the report file before sending the SIGUSR1 signal).
	# The only significant delay I can think of is a delayed write on
	# an NFS mount (can delayed write be switched off for an NFS mount??).

	n = 0
	while n < report_delay:
		rtn = skyd_find()
		report = rtn['report']
		if report != '':
			break
		time.sleep(1)
		n += 1
	else:
		say(csay,'W','<time>','received SIGUSR1 signal,'+	\
			' but no report found within %d seconds'%report_delay)
		return

	wmark  = rtn['wmark' ]
	grunt  = rtn['grunt' ]
	run    = rtn['run'	 ]
	status = rtn['status']
	result = rtn['result']

	# Read the message in the report file, and delete it.
	# Occasionally the report file is empty for some unknown reason.
	# The content currently is not used in any significant way.
	# For status = 'start' the file contains the pid of the process
	# started on the grunt. This pid is not used for program control and
	# is only printed to the screen.
	# The file is closed explicitly before it is removed.

	handle = open(report,'r')
	try:
		message = handle.read()
	finally:
		handle.close()

	# The script occasionally claims the report file doesn't
	# exist after just reading it successfully and (I think) deleting it
	# successfully. The try block avoids a crash. The name of the report
	# is kept on the reports_claimed list just in case the file still
	# exists (don't want to process it again).

	try:
		os.remove(report)				# Remove report
		reports_claimed.remove(report)	# Remove report from reports_claimed list
	except (OSError, ValueError):
		say(csay,'W','<time>','ignoring os.remove() error for %s'%report)

	if status == 'start':
		if len(message) == 0:
			say(csay,'I','<time>','empty report file, %s'%hide_env(report))
			gpid = -1
		else:
			# The pid is only informational. A report that does not end
			# its first line with a number must not bring down the
			# daemon from inside a signal handler.
			try:
				gpid = int((message.split('\n'))[0].split()[-1])
			except (IndexError, ValueError):
				say(csay,'W','<time>','no pid found in report file, %s'%hide_env(report))
				gpid = -1

	say(csay,'I','<time>','\n\n%s\n'%hide_env(report))
	say(csay,'I',grunt,'run %d (%s): %s -> %s'%(run,wmark,status,result)+'\n%s'%message)

	if rtn['list'] == 'new_runs':

		# The report came from a process in new_runs
		# - Update the status indicator:
		#		start -> runs (pick up pid on grunt from 'message')
		#		runs  -> dead
		# - If new status is 'dead' then erase the watermark
		# - If the returned result was 'kill' then set the terminate flag
		#		to indicate that no new processes are to be started (just
		#		wait till all runs have completed then stop skyd_wait)

		if status == 'start':
			new_runs[grunt][run]['status'] = 'runs'
			new_runs[grunt][run]['pid'   ] = gpid
			new_runs[grunt][run]['time'  ] = time.strftime('%Y_%j_%H%M%S')

		elif status == 'runs':
			new_runs[grunt][run] = skyd_func.skyd_empty_run()

			if result == 'kill':
				terminator = True
			else:
				ncount += 1

			skyd_start(grunt,0)		# Refill the slot (no-op if terminator is set)

		else:					# This cannot happen
			say(csay,'E','stop','rats 2\nterminated')

	else:

		# The report came from a process in old_runs
		# - If the status indicator was 'start' then update it to
		#	'runs' (the 'done' or 'kill' report is yet to come).
		# - If the status indicator was 'runs' then the run has
		#	finished as 'done' or 'kill'. In both cases remove
		#	the process from the old_runs list.

		if status == 'start':
			old_runs[grunt][run]['status'] = 'runs'
			old_runs[grunt][run]['pid'   ] = gpid
			old_runs[grunt][run]['time'  ] = time.strftime('%Y_%j_%H%M%S')

		elif status == 'runs':
			old_runs[grunt][run:run+1] = []
			if len(old_runs[grunt]) == 0:
				old_runs.pop(grunt)

			if result == 'kill':
				say(csay,'W',grunt,'ignoring temination signal from old grunt')
			else:
				ncount += 1

		else:					# This cannot happen
			say(csay,'E','stop','rats 2\nterminated')

	if terminator:
		count = skyd_func.skyd_count_runs(old_runs)+skyd_func.skyd_count_runs(new_runs)

		if count == 0:
			skyd_func.skyd_show_runs(old_runs,'',boss)
			skyd_func.skyd_show_runs(new_runs,'',boss)
			say(csay,'S','stop','all processes finished\nterminated')

		if count == 1:
			say(csay,'I','terminator','is set; %d process has not finished yet'%count)
		else:
			say(csay,'I','terminator','is set; %d processes have not finished yet'%count)

	return

def skyd_wait(conf_file):
	"""
#+
# NAME:
#	skyd_wait
# PURPOSE:
#	Main routine for SMEI indexing daemon. This is the program
#	set up by skyd.py as a daemon
# CALLING SEQUENCE:
#	skyd_wait(conf_file)
# INPUTS:
#	conf_file	string		name of configuration file
# RESTRICTIONS:
#	Outstanding issues:
#
#	Sometimes the daemon crashes when trying to remove a report file,
#	because the report file doesn't exist anymore. Presumably the same
#	report file is picked up before the filesystem is updated after
#	a remove during processing of the previous signal.
#	It should be possible to make sure that the same report file is
#	processed only once.
#
#	I'm not sure what happens if the two report files (_runs and
#	_done (or _terminate) are present at the same time. This could
#	happen when processing many signals in a short time.
# PROCEDURE:
#	There are a number of requirements for this whole scheme to work.
#
#	0. all software must be accessible from all machines.
#		Currently everything is located in the SMEI software tree on the SMEI
#		server (which is NFS exported to all machines on the subnet)
#	1. sshd must be running on all machines (boss launches remote skyd_orbits
#		by issuing a command over ssh; grunts send back signals to boss launching
#		the kill command over ssh).
#	2. all grunts must have an account with the same username as the account
#		on boss (running skyd_wait). Moreover, the accounts must allow access
#		in both directions using SSH keys (i.e. no passwords needed).
#		(see href=ssh_to=.py to set this up)
#		Currently an account with user name skyd exists on all machines on the
#		the subnet.
#	3. the batch utility 'at' must be installed on all grunts
#		(skyd_orbit is launched as a batchjob using 'at')
#	4  the configuration file and user catalogue must be located
#		where all machines can access it (e.g. in a directory on a shared
#		NFS volume), and all machines should have read & write permission,
#		and should be able to switch read-protection on/off.
#	5. reportdir, the directory were skyd_orbit puts the report files, also must
#		be accessible by all machines.
#	6. when 'at' finishes a job it sends an email to the local account (currently
#		the account skyd). Email forwarding is set up on all machines sending
#		all these email to the machine running the daemon. The easiest way to
#		do this is to create a soft link in the home directory of the skyd
#		account pointing to a .forward file located somewhere where all
#		machines have read access.
#
#	Most configuration file entries are optional with reasonable defaults
#	if absent. The configuration file has the structure
#
#	<name_1>=<value_1>
#	<name_2>=<value_2>
#	..
#	<name_n>=<value_n>
#	group_0:
#	<_name_1>=<value0_1>
#	..
#	<_name_m>=<value0_m>
#	group_1:
#	<_name_1>=<value0_1>
#	..
#	<_name_m>=<value0_m>
#	..
#	..
#	group_k:
#	<_name_1>=<value0_1>
#	..
#	<_name_2>=<value0_2>
#
#	The first block are 'global entries'. The following entries are separate
#	groups with instruction to run a specific group of orbits. skyd_orbit
#	calls by picking an orbit from group one, then group two, and so on,
#	cycling through all groups.
#
#	Global entries:
#
#	reportdir=reportdir	directory where report files from skyd_orbit
#				runs are put. Default: $TUB
#	max_load=max_load	max number of concurrent processes run on
#				each grunt. Default: 2
#	grunts=grunt1,grunt2	comma separated list of machines on which
#				to run skyd_orbit. Default: boss
#
#	The following main entries are set when skyd_wait is started.
#
#	boss=boss		machine controlling skyd_wait
#	pid=pid			process id of skyd_wait
#	time=2005_354_162151	time at which skyd_wait started
#	cur_group=0		first group for which orbit is selected
#				(this field incremented in skyd_orbit)
#
#	Group entries:
#
#	Many of these correspond to keywords needed for the indexing program
#	smeidb_htm
#
#	_camera=camera		camera id (1,2,3); default: 1
#	_mode=mode		mode id (0,1,2); default: -1 (this effectively
#				select the main science mode for each camera
#				(mode 2 for cam 1 and 2; mode 1 for cam 3)
#	_min_orbit=min_orbit	minimum orbit to run; zero means no restriction
#				on minimum orbit; default: 0
#	_max_orbit=max_orbit	maximum orbit to run; zero means no restriction
#				on maximum orbit; default: 1
#	_source=source		source for SMEI frames; default: SMEIDC?
#	_destination=destination
#				destination directory for sky maps
#	_level=level		indexing level
#	_keepglare=0/1		0: subtract glare; 1; keep glare; default: 0
#	_catalogue=catalogue	user catalogue
#	_checkversion=0/1	0: don't check version number; 1: update only
#				if smeidb_skyd version is higher than version
#				in existing skymap
#	_overwrite=0/1		0: don't overwrite existing skymaps
#				1: overwrite existing skymaps
#
#	The user catalogue is an ascii file with one line for each orbit in the
#	following format:
#
#   orbnr  orbnr+1 YYYY_DOY_hhmmss status
#
#	orbnr			orbit number to be processed
#	orbnr+1			orbit number, plus one
#	YYYY_doy_hhmmss	start time of orbit orbnr
#	status			status of orbit; can be 'make','skip','busy','done'
#
#	The catalogue can be constructed from a list of available skymaps
#	using the IDL procedure href=skyd_cat=.
# MODIFICATION HISTORY:
#	DEC-2005, Paul Hick (UCSD/CASS)
#	NOV-2006, Paul Hick (UCSD/CASS)
#		The script sometimes crashed while executing the os.remove(report)
#		command claiming the file didn't exist even though the file
#		was just read succesfully, and the remove actually worked correctly.
#		Bracketed os.remove(report) command with try block to avoid crash.
#		Also added the reports_claimed dictionary to store the names of
#		report files that have been identified by skyd_find_run. The name
#		is deleted from the dictionary only if the os.remove() succeeds.
#	JAN-2007, Paul Hick (UCSD/CASS)
#		Moved initialization of global reports_claimed in front of the loop
#		starting the initial indexing runs.
#	MAR-2008, Paul Hick (UCSD/CASS)
#		Fixed bug in processing of -overwrite keyword.
#		Now if -overwrite is set for one or more groups then the
#		orbit catalogue for the range of orbits specified is updated
#		by changing the status for all "done" orbits to "make".
#		This way, skyd_orbit only needs to look at status "make"
#		if -overwrite is set. Without this change skyd_orbit will
#		keep processing the same "done" orbit over and over again.
#	NOV-2008, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#		The call to skyd_orbit is now prefixed with ". $HOME/.bashrc".
#		This did not use to be necessary. It is needed now for FC9;
#		not sure why.
#-
	"""
	csay = os.path.splitext(os.path.split(sys.argv[0])[1])[0]
	say(csay,'I',csay,'is now up\n')

	global cffile
	global local_cmd
	global remote_cmd
	global old_runs
	global terminator
	global pid
	global ncount
	global reports_claimed

	# pid cannot be moved to the top of the program
	# (it would pick up the wrong pid when skyd_wait is started from skyd.py)

	pid  = os.getpid()
	cffile = conf_file
	ncount = 0

	if not os.path.isfile(cffile):
		say(csay,'E','#'+cffile,'does not exist\nterminated')

	# In principle skyd_orbit can extract pid and boss from cffile
	# Specifying it also on the command line allows skyd_orbit to signal back
	# that it has started without reading cffile (skyd_orbit needs the pid
	# to send SIGUSR1 signals back to this script).
	# The trailing %s is left in place: skyd_start fills in the report name.

	orbit_cmd = (
		'. $HOME/.bashrc; LD_LIBRARY_PATH='+os.environ['LD_LIBRARY_PATH']+'; '+
		'export LD_LIBRARY_PATH; '+
		'skyd_orbit.py -pid=%d -boss=%s -cffile=%s'%(pid,boss,cffile)+' -report=%s'
	)

	# Set up ssh command to launch indexing of next orbit
	# Set up local command to launch indexing of next orbit
	# skyd_orbit is submitted to the at queue (with a delay);
	# the %d is filled in by skyd_start with the delay in minutes.

	local_cmd  = 'echo "'+orbit_cmd+'" | at now+%d minutes 2> /dev/null'
	remote_cmd = 'echo \\\\\\"export HOSTNAME; '+orbit_cmd+'\\\\\\" | at now+%d minutes 2> /dev/null'

	skyd_load(True)
	old_runs = dict()				# Must exist

	# skyd_load will read the conf file and set reportdir,
	# so this loop canNOT be moved up.

	# Clean up leftover files from indexing runs
	# on remote machines.

	for name in os.listdir(reportdir):
		if name.find(report_prefix) == 0:
			leftover = os.path.join(reportdir,name)
			say(csay,'W','#'+leftover,'... cleanup')
			os.remove(leftover)

	# Check for -overwrite switch in cf file.
	# If -overwrite is set then the list file is modified
	# by setting the status of all "done" orbits to "make"

	status = {'number': 0, 'message': ''}
	status = skyd_func.skyd_read_conf(cffile,False,status)

	if status['number'] != 0:
		say(csay,'E','#'+cffile,status['message']+'\nterminated')
		return

	conf = status.pop('conf')			# Contents of config file

	for key in conf.keys():

		if key.find('group_') != 0: 	# Find group dictionaries
			continue

		group = conf[key]

		if 'overwrite' not in group:
			continue

		if not int(group['overwrite']):	# Check if overwrite key is set
			continue

		catalogue = group['catalogue']
		catalogue = os.path.expandvars(catalogue)
		if not os.path.isfile(catalogue):
			continue

		status = skyd_func.skyd_claim(False,status,catalogue)
		if status['number'] != 0:
			continue

		orbits = status.pop('contents')

		# NOTE(review): the header above says that zero means "no
		# restriction", but a max_orbit of 0 excludes every orbit in
		# the test below. Confirm whether skyd_read_conf already
		# replaces a zero by an actual upper limit.

		min_orbit = int(group['min_orbit'])
		max_orbit = int(group['max_orbit'])

		m = 0
		for i, orbit in enumerate(orbits):

			if len(orbit) == 0:
				continue
			if orbit.isspace():
				continue
			if orbit.find('#') == 0:
				continue

			n = orbit.find('done')
			if n == -1:
				continue

			tmp = orbit.split()

			if int(tmp[0]) < min_orbit:
				continue
			if int(tmp[0]) > max_orbit:
				continue

			# Replace "done" by "make". The line is replaced by position
			# rather than by searching the list for its content again.
			m += 1
			orbits[i] = orbit[0:n]+'make 0'

		if m != 0:
			status['contents'] = orbits
			status = skyd_func.skyd_release(status,catalogue)
			say(csay,'I','#'+catalogue,'done -> make: %d orbits'%m)

	# The handlers installed below use these globals. Set them up first,
	# so that a signal arriving right after a handler is installed does
	# not find them undefined.

	reports_claimed = []
	terminator = False

	# Set up the signal handlers
	# SIGUSR1 indicates that a report is ready for pickup

	signal.signal(signal.SIGUSR1, skyd_go	 )
	signal.signal(signal.SIGALRM, skyd_alarm )
	signal.signal(signal.SIGINT , skyd_ctrlc )
	signal.signal(signal.SIGHUP , skyd_sighup)

	# Loop over all grunts and start initial indexing runs

	count = 0
	for grunt in new_runs.keys():
		count = skyd_start(grunt, count)

	# Go into main processing loop
	# An alarm clock is set to go off if no signal has come in for
	# long_pause seconds (12 hours), in which case skyd_alarm reloads the
	# configuration. In normal operations skyd_orbit sends a SIGUSR1
	# signal whenever it has put a report file in reportdir.
	# The loop never ends by itself; all the work is done in the signal
	# handlers.

	while True:

		skyd_func.skyd_show_runs(old_runs,'',boss)
		skyd_func.skyd_show_runs(new_runs,'',boss)

		say(csay,'I','wait','\n\n... pid %d (%d;%.2f hr) ...'%(pid,ncount,(time.time()-stime)/3600))

		signal.alarm(long_pause)# Set alarm
		signal.pause()			# Wait for signal
		signal.alarm(0)			# Cancel alarm

	return

if __name__ == '__main__':

	# start() is expected to hand back the value of the -cffile=<cffile>
	# command line argument, or a null string if it was not given
	# (assumption based on its use here; see module tiny).

	conf_file = start('-cffile=', sys.argv)

	if conf_file == '':
		say('skyd_wait','E',__name__,'specify config file with -cffile=<cffile>\nterminated')

	skyd_wait(conf_file)