#! /usr/bin/python

#+
# NAME:
#	check_html
# PURPOSE:
#	Checks whether files referenced in an html file exist
# CALLING SEQUENCE:
#	check_html file_name
# INPUTS:
#	file_name	name of ascii file or directory name
#			If a directory is specified all files with
#			extension .htm and .html in the directory
#			are checked.
# PROCEDURE:
# 	Python script
# MODIFICATION HISTORY:
#	SEP-2002, Paul Hick (UCSD/CASS)
#	MAR-2003, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#		Add check for http:// references
#-

import sys, os, httplib

argv  = sys.argv
narg  = len(argv)-1

if narg != 1:
	file = raw_input('File or directory : ')
else:
	file = sys.argv[1]

if os.path.isdir(file):
	files = os.listdir(file)

	for i in range( len(files) ):
		files[i] = os.path.join(file,files[i])

else:
	if not os.path.exists(file):
		print 'file does not exist, ',file
		sys.exit()

	files = [file]


for file in files:
 
	ok = os.path.isfile(file)
	if ok:
		ext = (os.path.splitext(file))[1]
		ext = ext.lower()
		ok = ext == '.htm' or ext == '.html'

	if ok:
		dir = os.path.dirname(file)

		iu = open(file, 'r')
		content = iu.read()
		iu.close()

		# Look for href= and src= (case-insensitive

		targets = ['href="','src="','background="']

		count = 0
		content_low = content.lower()

		for target in targets:

			pos_href = content_low.find(target)

			while pos_href != -1:

				pos_beg = pos_href+len(target)
				pos_end = content.find('"', pos_beg)

				ref     = content    [pos_beg:pos_end]
				ref_low = content_low[pos_beg:pos_end]

				if ref_low.find('http://') == 0:

					site = ref[len('http://'):]
					pos  = site.find('/')
					if pos == -1:
						page = '/'
					else:
						page = site[pos:]
						site = site[0:pos]

					if page.find('/cgi-bin/') != -1:
						cmd = "POST"
					else:
						cmd = "GET"

					conn = httplib.HTTPConnection( site )

					try:
						conn.request(cmd, page)
					except:
						print '   exception on', ref
					else:
						answer = conn.getresponse()

						# answer.status = 200 seems a safe bet for success, but
						# I've also seen status = 302 for a page that existed.
						# answer.reason usually starts with 'OK' or 'Found' or
						# who knows what else gets added here.

						if answer.status != 200 and	\
							answer.reason.find('Authorization Required') != 0 and \
							answer.reason.find('Forbidden') and \
							answer.reason.find('Found') != 0:
							count = count+1
							print '   ',answer.reason, ref

					conn.close()

				elif ref_low.find('mailto:') == 0:
					dummy = 0
					#print 'ignoring', ref

				elif ref_low.find('ftp:') == 0:
					dummy = 0
					#print 'ignoring', ref

				elif ref_low.find('javascript:') == 0:
					dummy = 0
					#print 'ignoring', ref

				else:

					if ref_low.find('file:') == 0:
						ref     = ref    [len('file:'):]
						ref_low = ref_low[len('file:'):]
						
					pos = ref.find('#')
					if pos != -1:
						ref     = ref    [:pos]
						ref_low = ref_low[:pos]

					if len(ref) > 0:
						if ref.find('.') == 0:
							ref = os.path.join(dir,ref)
						elif ref[0] != '/':
							ref = os.path.join(dir,ref)

						exists = os.path.exists(ref)
						islink = os.path.islink(ref)
						isfile = os.path.isfile(ref)
						isdir  = os.path.isdir (ref)

						if exists == 0:

							print '   ','non-existent reference:', ref
							count = count+1

						elif islink:
							if isfile == 0 and isdir == 0:

								print '   ','bad symlink:', ref
								count = count+1

				pos_href = content_low.find(target,pos_end)

		if count > 0:
			print file, 'Found', count, 'problems'
		#else:
			#    print '>>>', file, 'no problems'