root/pydotorg/pyfaq/htmlload.py

Revision 140 (by effbot, 03/17/06 03:40:37)

stuff to extract FAQ entries from the FAQ documents on python.org

# $Id$
# XHTML/HTML loader

import os
import cElementTree as ET

NS_XHTML = "{http://www.w3.org/1999/xhtml}"

##
# Loads an XHTML or HTML file into an Element structure.  Note that
# HTML files are converted to XHTML in place, via <b>tidy</b>.

def load(file, loader=None):

    if not loader:
        loader = ET.parse

    try:
        elem = loader(file)
    except:
        # FIXME: needs locking! (atomic rename should be good enough)
        os.system("tidy -qnm -asxml \"%s\"" % file)
        elem = loader(file) # if this fails, the file was too broken

    # clean up namespace
    for node in elem.getiterator():
        if node.tag.startswith(NS_XHTML):
            node.tag = node.tag[len(NS_XHTML):]

    return elem
Note: See TracBrowser for help on using the browser.