#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Checks if all internal links in an HTML file (<a href="#name">)
# can be resolved. This is not checked by tidy.
# (c) Thomas Güttler, June 2003
# Copying, Changing allowed
# Feedback welcome thomas@thomas-guettler.de
import sys
import os
from HTMLParser import HTMLParser
class LinkParser(HTMLParser):
    """Report unresolved internal links and missing local files in HTML.

    Feed a document with feed() and finish with close().  Findings are
    printed to stdout:

    * an <a href="#x"> with no matching <a name="x"> in the document,
    * an <a name="x"> that is defined more than once,
    * an <a href="..."> pointing to a local file that does not exist.

    Hrefs starting with "http" or "ftp" are treated as external and are
    not checked.  Relative file paths are passed to os.path.isfile() as
    they are, i.e. they are resolved against the current working
    directory.

    Attributes:
        link_to: maps each referenced fragment name to the position
            (as returned by getpos()) of the last link referencing it.
        links: maps each anchor name defined in the document to 1.
    """

    def reset(self):
        """Reset the parser and forget every link and anchor seen so far.

        HTMLParser calls reset() at instantiation time, so the
        bookkeeping dicts are created per instance here instead of being
        shared between all parsers as class attributes.
        """
        # Explicit base-class call: works whether or not HTMLParser is a
        # new-style class in the running Python version.
        HTMLParser.reset(self)
        self.link_to = {}
        self.links = {}
        self._reported = False

    def handle_starttag(self, tag, attributes):
        """Record the anchor name and check the href of every <a> tag.

        tag is the tag name, attributes a list of (name, value) pairs as
        delivered by HTMLParser.
        """
        if tag != "a":
            return
        href = None
        name = None
        for att, value in attributes:
            if att == "href":
                href = value
            elif att == "name":
                name = value
        if href:
            self._check_href(href)
        if name:
            if name in self.links:
                print('Name "%s" appears twice: %s' % (name, self.getpos()))
            self.links[name] = 1

    def handle_endtag(self, tag):
        """Report unresolved internal links once </html> is reached."""
        if tag == "html":
            self._report_unresolved()

    def close(self):
        """Finish parsing and report unresolved links if not done yet.

        The </html> end tag is optional, so the report must not depend
        on it being present in the document.
        """
        HTMLParser.close(self)
        self._report_unresolved()

    def _check_href(self, href):
        """Check one non-empty href found at the current parser position."""
        if href.startswith("#"):
            fragment = href[1:]
            # A bare "#" addresses the top of the document, so there is
            # no anchor that would have to exist for it.
            if fragment:
                self.link_to[fragment] = self.getpos()
            return
        if href.startswith(("http", "ftp")):
            return  # external link, not checked
        # Drop a trailing "#fragment" so that "page.html#top" checks for
        # the file "page.html".
        path = href.split("#", 1)[0]
        if not os.path.isfile(path):
            print("%s not found: %s" % (href, self.getpos()))

    def _report_unresolved(self):
        """Print every referenced fragment that has no anchor, only once."""
        if self._reported:
            return
        self._reported = True
        # Sort by position so the output order is deterministic.
        by_position = sorted(self.link_to.items(), key=lambda item: item[1])
        for link, pos in by_position:
            if link not in self.links:
                print('Link "%s" is unkown: %s' % (link, pos))
def main(argv=None):
    """Check the HTML file named on the command line.

    argv is the full argument vector (program name first); it defaults
    to sys.argv.  The file argv[1] is read and fed to a LinkParser, which
    prints its findings to stdout.

    Returns 0 after the file has been checked, or 2 (after printing a
    usage message to stderr) when no file argument was given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else "checklinks"
        sys.stderr.write("Usage: %s FILE.html\n" % prog)
        return 2
    # The with-block closes the file even if read() raises.
    with open(argv[1]) as fd:
        content = fd.read()
    parser = LinkParser()
    parser.feed(content)
    parser.close()
    return 0


if __name__ == "__main__":
    sys.exit(main())