#! /usr/bin/python
#+
# NAME:
#   check_html
# PURPOSE:
#   Checks whether files referenced in an html file exist
# CALLING SEQUENCE:
#   check_html file_name
# INPUTS:
#   file_name   name of ascii file or directory name
#               If a directory is specified all files with
#               extension .htm and .html in the directory
#               are checked.
# PROCEDURE:
#   Python script. Runs unchanged under Python 2.7 and Python 3.
#   Every href="...", src="..." and background="..." attribute is
#   extracted. http:// and https:// references are requested from the
#   server; mailto:, ftp: and javascript: references are ignored; all
#   other references are looked up on the local file system, relative
#   to the directory of the html file.
# MODIFICATION HISTORY:
#   SEP-2002, Paul Hick (UCSD/CASS)
#   MAR-2003, Paul Hick (UCSD/CASS; pphick@ucsd.edu)
#       Add check for http:// references
#-

import os
import re
import sys

try:
    import httplib                      # Python 2
except ImportError:
    import http.client as httplib       # Python 3

try:
    _read_line = raw_input              # Python 2
except NameError:
    _read_line = input                  # Python 3

# Attribute openers that introduce a reference (matched case-insensitively).
TARGETS = ('href="', 'src="', 'background="')

# Only files with these (lower-cased) extensions are checked.
HTML_EXTENSIONS = ('.htm', '.html')

# References with these prefixes are skipped without any check.
IGNORED_SCHEMES = ('mailto:', 'ftp:', 'javascript:')

# A non-200 response whose reason phrase starts with one of these is still
# accepted: the page exists but is protected or redirected (e.g. 302 Found).
ACCEPTED_REASONS = ('Authorization Required', 'Forbidden', 'Found')

# Seconds to wait for a web server before giving up on a reference.
HTTP_TIMEOUT = 30

# One pattern per target: the opener, then everything up to the closing quote.
_TARGET_PATTERNS = [
    re.compile(re.escape(target) + r'([^"]*)"', re.IGNORECASE)
    for target in TARGETS
]


def _report(*items):
    """Print items separated by single spaces.

    A single-argument print() call produces the same output under the
    Python 2 print statement and the Python 3 print function.
    """
    print(' '.join(str(item) for item in items))


def read_text(path):
    """Return the content of the file at path as a native string."""
    with open(path, 'rb') as handle:
        raw = handle.read()
    if isinstance(raw, str):
        # Python 2: bytes already is the native string type.
        return raw
    # Python 3: decode leniently so one odd byte cannot abort the check.
    return raw.decode('utf-8', 'replace')


def find_references(content):
    """Return all references found in content.

    References are grouped per target in the order of TARGETS (all href,
    then all src, then all background), each group in document order.
    An attribute whose closing quote is missing is skipped, since there
    is no way to tell where the reference ends.
    """
    refs = []
    for pattern in _TARGET_PATTERNS:
        refs.extend(match.group(1) for match in pattern.finditer(content))
    return refs


def split_url(ref):
    """Split an http:// or https:// reference into its request parts.

    Returns a tuple (scheme, site, page, method), or None if ref is not
    an http(s) reference. scheme is 'http' or 'https', site is the host
    part, page is the path (including any query string) to request, and
    method is 'POST' for pages under /cgi-bin/ and 'GET' otherwise.
    """
    lowered = ref.lower()
    for scheme in ('http', 'https'):
        prefix = scheme + '://'
        if lowered.startswith(prefix):
            break
    else:
        return None

    # The fragment is resolved by the browser and is never sent to the server.
    rest = ref[len(prefix):].split('#', 1)[0]
    slash = rest.find('/')
    if slash == -1:
        site, page = rest, '/'
    else:
        site, page = rest[:slash], rest[slash:]
    method = 'POST' if '/cgi-bin/' in page else 'GET'
    return scheme, site, page, method


def is_problem_response(status, reason):
    """Return True if a server response indicates a broken reference.

    Status 200 is always accepted. Any other status is accepted only if
    the reason phrase starts with one of ACCEPTED_REASONS.
    """
    return status != 200 and not reason.startswith(ACCEPTED_REASONS)


def check_remote_reference(ref):
    """Request an http(s) reference; return True if it counts as a problem.

    A response rejected by is_problem_response() is printed and counted.
    A failure to get any response at all (bad host, refused connection,
    timeout, ...) is printed as an exception but is not counted.
    """
    scheme, site, page, method = split_url(ref)
    conn = None
    try:
        if scheme == 'https':
            connection_class = httplib.HTTPSConnection
        else:
            connection_class = httplib.HTTPConnection
        conn = connection_class(site, timeout=HTTP_TIMEOUT)
        conn.request(method, page)
        answer = conn.getresponse()
        status, reason = answer.status, answer.reason
    except Exception:
        # Deliberately broad: one unreachable server must not stop the
        # check of the remaining references.
        _report(' exception on', ref)
        return False
    finally:
        if conn is not None:
            conn.close()

    if is_problem_response(status, reason):
        _report(' ', reason, ref)
        return True
    return False


def check_local_reference(base_dir, ref):
    """Check a reference against the local file system.

    A leading 'file:' and a trailing '#fragment' are removed first. A
    relative reference is resolved against base_dir. Returns None if the
    reference is empty or is fine, otherwise the problem description.
    """
    if ref.lower().startswith('file:'):
        ref = ref[len('file:'):]
    ref = ref.split('#', 1)[0]
    if not ref:
        # Pure fragment: refers to the html file itself.
        return None

    # os.path.join returns an absolute second argument unchanged.
    path = os.path.join(base_dir, ref)

    if not os.path.exists(path):
        # exists() follows symlinks, so a dangling link ends up here too.
        if os.path.islink(path):
            return 'bad symlink: ' + path
        return 'non-existent reference: ' + path
    if os.path.islink(path) and not (os.path.isfile(path) or
                                     os.path.isdir(path)):
        # Link resolves, but to neither a regular file nor a directory.
        return 'bad symlink: ' + path
    return None


def check_reference(base_dir, ref):
    """Check a single reference; return True if it counts as a problem."""
    lowered = ref.lower()
    if lowered.startswith(('http://', 'https://')):
        return check_remote_reference(ref)
    if lowered.startswith(IGNORED_SCHEMES):
        return False
    message = check_local_reference(base_dir, ref)
    if message is None:
        return False
    _report(' ', message)
    return True


def check_file(path):
    """Check all references in one html file; return the problem count.

    Each problem is printed as it is found, followed by a summary line
    if there was at least one.
    """
    base_dir = os.path.dirname(path)
    count = 0
    for ref in find_references(read_text(path)):
        if check_reference(base_dir, ref):
            count += 1
    if count > 0:
        _report(path, 'Found', count, 'problems')
    #else:
    #    _report('>>>', path, 'no problems')
    return count


def collect_files(target):
    """Return the html files selected by target.

    target is a file name or a directory name. For a directory all
    entries directly inside it are considered. Only regular files with an
    extension in HTML_EXTENSIONS are returned. If target does not exist a
    message is printed and the script exits.
    """
    if os.path.isdir(target):
        candidates = [os.path.join(target, name)
                      for name in os.listdir(target)]
    elif os.path.exists(target):
        candidates = [target]
    else:
        _report('file does not exist, ', target)
        sys.exit()

    return [path for path in candidates
            if os.path.isfile(path)
            and os.path.splitext(path)[1].lower() in HTML_EXTENSIONS]


def main(argv=None):
    """Run the check for the file or directory named on the command line.

    If argv does not hold exactly one argument after the program name,
    the file or directory name is prompted for instead.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) - 1 != 1:
        target = _read_line('File or directory : ')
    else:
        target = argv[1]

    for path in collect_files(target):
        check_file(path)


if __name__ == '__main__':
    main()