#! /usr/bin/python
#+
# NAME:
# check_html
# PURPOSE:
# Checks whether files referenced in an html file exist
# CALLING SEQUENCE:
# check_html file_name
# INPUTS:
# file_name name of ascii file or directory name
# If a directory is specified all files with
# extension .htm and .html in the directory
# are checked.
# PROCEDURE:
# Python script
# MODIFICATION HISTORY:
# SEP-2002, Paul Hick (UCSD/CASS)
# MAR-2003, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
# Add check for http:// references
#-
import sys, os, httplib
# Every message is printed as print(<one pre-formatted string>): that form
# produces the same text whether print is a statement or a function.

# Schemes that are recognized but deliberately not checked.
IGNORED_SCHEMES = ('mailto:', 'ftp:', 'javascript:')

# Attribute prefixes whose double-quoted value is treated as a reference.
REFERENCE_TARGETS = ('href="', 'src="', 'background="')

# A non-200 answer whose reason phrase starts with one of these is still
# accepted: the page exists, it is merely protected or redirected.
TOLERATED_REASONS = ('Authorization Required', 'Forbidden', 'Found')


def find_references(content):
    """Return the values of all href="..", src=".." and background=".."
    attributes found in the string *content*.

    Attribute names are matched case-insensitively; the returned values
    keep their original case.  Values are grouped per attribute (all href
    values first, then src, then background), each group in document order.
    Only double-quoted values are recognized.
    """
    lowered = content.lower()
    refs = []
    for target in REFERENCE_TARGETS:
        start = lowered.find(target)
        while start != -1:
            begin = start + len(target)
            end = content.find('"', begin)
            if end == -1:
                # Unterminated value: there is no closing quote anywhere
                # after this point, so nothing usable follows.
                break
            refs.append(content[begin:end])
            start = lowered.find(target, end)
    return refs


def split_http_reference(ref):
    """Split an http:// or https:// reference into (site, page, method).

    *ref* must contain '://'.  The '#fragment' part is dropped because it
    is resolved by the browser and must not be sent to the server.  *page*
    is '/' when the reference names only a site.  *method* is 'POST' for
    pages below /cgi-bin/ and 'GET' for everything else.
    """
    hash_pos = ref.find('#')
    if hash_pos != -1:
        ref = ref[:hash_pos]
    rest = ref[ref.find('://') + len('://'):]
    slash = rest.find('/')
    if slash == -1:
        site, page = rest, '/'
    else:
        site, page = rest[:slash], rest[slash:]
    if page.find('/cgi-bin/') != -1:
        method = 'POST'
    else:
        method = 'GET'
    return site, page, method


def check_http_reference(ref):
    """Request the http:// or https:// reference *ref* from its server.

    Returns True (after printing the server's reason phrase) when the
    answer counts as a problem: status other than 200 and a reason phrase
    that does not start with one of TOLERATED_REASONS.  Returns False
    otherwise.  A failure to connect or to obtain an answer is printed as
    ' exception on <ref>' but is not counted as a problem.
    """
    site, page, method = split_http_reference(ref)
    if ref.lower().startswith('https://'):
        # HTTPSConnection is only present when Python was built with SSL.
        connect = getattr(httplib, 'HTTPSConnection', None)
        if connect is None:
            return False
    else:
        connect = httplib.HTTPConnection

    conn = None
    try:
        conn = connect(site)
        conn.request(method, page)
        answer = conn.getresponse()
    except Exception:
        # Exception (not a bare except) so Ctrl-C still stops the script.
        print(' exception on %s' % ref)
        return False
    finally:
        if conn is not None:
            conn.close()

    # status == 200 is the normal success; some existing pages answer with
    # another status (e.g. 302 'Found'), hence the reason-phrase check.
    if answer.status != 200 and not answer.reason.startswith(TOLERATED_REASONS):
        print('  %s %s' % (answer.reason, ref))
        return True
    return False


def check_local_reference(ref, base_dir):
    """Check that the local reference *ref* exists on disk.

    A leading 'file:' and a trailing '#fragment' are removed first.  A
    reference that does not start with '/' is taken relative to
    *base_dir*.  Returns True (after printing a message) for a
    non-existent path or a symlink that does not lead to a file or
    directory; returns False otherwise, including for a reference that is
    empty once the fragment is removed.
    """
    if ref.lower().startswith('file:'):
        ref = ref[len('file:'):]
    hash_pos = ref.find('#')
    if hash_pos != -1:
        ref = ref[:hash_pos]
    if not ref:
        return False
    if not ref.startswith('/'):
        ref = os.path.join(base_dir, ref)

    # islink must be tested before exists: os.path.exists follows links and
    # is therefore False for a dangling symlink.
    if os.path.islink(ref) and not (os.path.isfile(ref) or os.path.isdir(ref)):
        print('  bad symlink: %s' % ref)
        return True
    if not os.path.exists(ref):
        print('  non-existent reference: %s' % ref)
        return True
    return False


def is_html_file(path):
    """Return True when *path* is a regular file with extension .htm or
    .html (case-insensitive)."""
    if not os.path.isfile(path):
        return False
    return os.path.splitext(path)[1].lower() in ('.htm', '.html')


def check_file(path):
    """Check every reference in the html file *path*.

    Prints one line per problem found, followed by a summary line when
    there is at least one problem.  Returns the number of problems.
    """
    with open(path, 'r') as handle:
        content = handle.read()
    base_dir = os.path.dirname(path)

    count = 0
    for ref in find_references(content):
        lowered = ref.lower()
        if lowered.startswith(('http://', 'https://')):
            problem = check_http_reference(ref)
        elif lowered.startswith(IGNORED_SCHEMES):
            problem = False
        else:
            problem = check_local_reference(ref, base_dir)
        if problem:
            count = count + 1

    if count > 0:
        print('%s Found %s problems' % (path, count))
    return count


def main():
    """Check the file or directory named on the command line.

    When exactly one argument is not given, the name is prompted for.  For
    a directory, every .htm/.html file directly inside it is checked.
    """
    if len(sys.argv) - 1 != 1:
        target = raw_input('File or directory : ')
    else:
        target = sys.argv[1]

    if os.path.isdir(target):
        paths = [os.path.join(target, name) for name in os.listdir(target)]
    else:
        if not os.path.exists(target):
            print('file does not exist,  %s' % target)
            sys.exit()
        paths = [target]

    for path in paths:
        if is_html_file(path):
            check_file(path)


if __name__ == '__main__':
    main()