#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Checks if all internal links in an HTML file (<a href="#anchor">)
# can be resolved. This is not checked by tidy.
# (c) Thomas Güttler, June 2003
# Copying, Changing allowed
# Feedback welcome thomas@thomas-guettler.de
"""Report links of an HTML file that cannot be resolved.

Three kinds of problems are printed to standard output:

* ``href="#target"`` links for which the document defines no anchor,
* anchors that are defined more than once,
* links to local files that do not exist.

Usage: pass the HTML file to check as the only command line argument.
The code runs unchanged on Python 2.6+ and Python 3.
"""

import sys
import os

try:
    from html.parser import HTMLParser
    from urllib.parse import unquote, urlsplit
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    from urllib import unquote
    from urlparse import urlsplit


class LinkParser(HTMLParser):
    """HTML parser that prints unresolved links while parsing.

    Attributes:
        link_to: dict mapping the target of every ``href="#target"`` link
            to the ``(line, offset)`` position of the last link using it.
        links: dict whose keys are the anchors defined by the document
            (``<a name="...">`` and ``id="..."`` on any element).
        base_dir: directory against which relative links to local files
            are resolved.  The empty default resolves them against the
            current working directory.
    """

    base_dir = ""

    def reset(self):
        """Reset the parser and forget all links and anchors seen so far.

        ``HTMLParser.__init__`` calls this method, so the state below is
        created per instance instead of being shared class-level dicts.
        """
        HTMLParser.reset(self)
        self.link_to = {}
        self.links = {}
        self._links_checked = False

    def handle_starttag(self, tag, attributes):
        """Record the links and anchors defined by one start tag.

        Args:
            tag: lower-cased tag name.
            attributes: list of ``(name, value)`` pairs; when an attribute
                is repeated the last occurrence wins.
        """
        attrs = dict(attributes)

        if tag == "a" and attrs.get("href"):
            self._check_href(attrs["href"])

        # An element may carry the same anchor as both "name" and "id";
        # the set keeps that from being reported as a duplicate.
        anchors = set()
        if tag == "a" and attrs.get("name"):
            anchors.add(attrs["name"])
        if attrs.get("id"):
            anchors.add(attrs["id"])
        for anchor in sorted(anchors):
            self._register_anchor(anchor)

    def handle_endtag(self, tag):
        """Report the unresolved internal links once ``</html>`` is seen."""
        if tag == "html":
            self.check_internal_links()

    def close(self):
        """Finish parsing.

        The closing ``</html>`` tag is optional, so the internal links are
        checked here if no such tag triggered the check before.
        """
        HTMLParser.close(self)
        if not self._links_checked:
            self.check_internal_links()

    def check_internal_links(self):
        """Print every internal link whose anchor has not been defined.

        Links are reported in document order.
        """
        self._links_checked = True
        by_position = sorted(self.link_to.items(), key=lambda item: item[1])
        for link, pos in by_position:
            if link not in self.links:
                # "unkown" (sic): the original wording is kept so that
                # consumers of the output are unaffected.
                print('Link "%s" is unkown: %s' % (link, pos))

    def _check_href(self, href):
        """Remember an internal link or verify a link to a local file."""
        if href[0] == "#":
            # Internal link: resolved later, the anchor may follow.
            self.link_to[href[1:]] = self.getpos()
            return

        try:
            parts = urlsplit(href)
        except ValueError:
            # Malformed URL (e.g. a broken IPv6 host): cannot be resolved.
            print("%s not found: %s" % (href, self.getpos()))
            return

        if parts.scheme or parts.netloc:
            # External link (http:, ftp:, mailto:, //host/...): not checked.
            return
        if not parts.path:
            # Only a query string: the link refers to this very document.
            return

        # The fragment and the query string are not part of the file name.
        path = os.path.join(self.base_dir, unquote(parts.path))
        if not os.path.isfile(path):
            print("%s not found: %s" % (href, self.getpos()))

    def _register_anchor(self, name):
        """Store an anchor definition and report it if it is a duplicate."""
        if name in self.links:
            print('Anchor "%s" appears twice: %s' % (name, self.getpos()))
        self.links[name] = 1


def check_file(filename):
    """Parse *filename* and print every link problem found in it.

    Relative links to local files are resolved against the directory that
    contains *filename*.

    Returns:
        The ``LinkParser`` used, for inspection of ``links``/``link_to``.
    """
    with open(filename) as fd:
        content = fd.read()
    parser = LinkParser()
    parser.base_dir = os.path.dirname(filename)
    parser.feed(content)
    parser.close()
    return parser


def main(argv=None):
    """Command line entry point; returns the process exit status.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        program = os.path.basename(argv[0]) if argv else "checklinks"
        sys.stderr.write("Usage: %s FILE.html\n" % program)
        return 2
    check_file(argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())