The sgmllib Module | File Formats

The sgmllib module, shown in Example 5-5, provides a basic SGML parser. It works pretty much the same as the xmllib parser, but is less restrictive (and less complete).

Like in xmllib, this parser calls methods in itself to deal with things like start tags, data sections, end tags, and entities. If you're only interested in a few tags, you can define special start and end methods.

Example 5-5. Using the sgmllib Module to Extract the Title Element

File: sgmllib-example-1.py

import sgmllib
import string

class FoundTitle(Exception):
 pass

class ExtractTitle(sgmllib.SGMLParser):

 def _ _init_ _(self, verbose=0):
 sgmllib.SGMLParser._ _init_ _(self, verbose)
 self.title = self.data = None

 def handle_data(self, data):
 if self.data is not None:
 self.data.append(data)

 def start_title(self, attrs):
 self.data = []

 def end_title(self):
 self.title = string.join(self.data, "")
 raise FoundTitle # abort parsing!

def extract(file):
 # extract title from an HTML/SGML stream
 p = ExtractTitle()
 try:
 while 1:
 # read small chunks
 s = file.read(512)
 if not s:
 break
 p.feed(s)
 p.close()
 except FoundTitle:
 return p.title
 return None

#
# try it out

print "html", "=>", extract(open("samples/sample.htm"))
print "sgml", "=>", extract(open("samples/sample.sgm"))

html => A Title.
sgml => Quotations

To handle all tags, overload the unknown_starttag and unknown_endtag methods instead, as Example 5-6 demonstrates.

Example 5-6. Using the sgmllib Module to Format an SGML Document

File: sgmllib-example-2.py

import sgmllib
import cgi, sys

class PrettyPrinter(sgmllib.SGMLParser):
 # A simple SGML pretty printer

 def _ _init_ _(self):
 # initialize base class
 sgmllib.SGMLParser._ _init_ _(self)
 self.flag = 0

 def newline(self):
 # force newline, if necessary
 if self.flag:
 sys.stdout.write("
")
 self.flag = 0

 def unknown_starttag(self, tag, attrs):
 # called for each start tag

 # the attrs argument is a list of (attr, value)
 # tuples. convert it to a string.
 text = ""
 for attr, value in attrs:
 text = text + " %s='%s'" % (attr, cgi.escape(value))

 self.newline()
 sys.stdout.write("<%s%s>
" % (tag, text))

 def handle_data(self, text):
 # called for each text section
 sys.stdout.write(text)
 self.flag = (text[-1:] != "
")

 def handle_entityref(self, text):
 # called for each entity
 sys.stdout.write("&%s;" % text)
 
 def unknown_endtag(self, tag):
 # called for each end tag
 self.newline()
 sys.stdout.write("<%s>" % tag)

#
# try it out

file = open("samples/sample.sgm")

p = PrettyPrinter()
p.feed(file.read())
p.close()


Quotations


eff-bot, June 1997



Nobody expects the Spanish Inquisition! Amongst
our weaponry are such diverse elements as fear, surprise,
ruthless efficiency, and an almost fanatical devotion to
Guido, and nice red uniforms — oh, damn!

Example 5-7 checks if an SGML document is "well-formed", in the XML sense. In a well-formed document, all elements are properly nested, with one end tag for each start tag.

To check this, we simply keep a list of open tags, and check that each end tag closes a matching start tag and that there are no open tags when we reach the end of the document.

Example 5-7. Using the sgmllib Module to Check Well-Formedness

File: sgmllib-example-3.py

import sgmllib

class WellFormednessChecker(sgmllib.SGMLParser):
 # check that an SGML document is 'well-formed'
 # (in the XML sense).

 def _ _init_ _(self, file=None):
 sgmllib.SGMLParser._ _init_ _(self)
 self.tags = []
 if file:
 self.load(file)

 def load(self, file):
 while 1:
 s = file.read(8192)
 if not s:
 break
 self.feed(s)
 self.close()

 def close(self):
 sgmllib.SGMLParser.close(self)
 if self.tags:
 raise SyntaxError, "start tag %s not closed" % self.tags[-1]

 def unknown_starttag(self, start, attrs):
 self.tags.append(start)

 def unknown_endtag(self, end):
 start = self.tags.pop()
 if end != start:
 raise SyntaxError, "end tag %s does't match start tag %s" %
 (end, start)

try:
 c = WellFormednessChecker()
 c.load(open("samples/sample.htm"))
except SyntaxError:
 raise # report error
else:
 print "document is well-formed"

Traceback (innermost last):
...
SyntaxError: end tag head does't match start tag meta

Finally, Example 5-8 shows a class that allows you to filter HTML and SGML documents. To use this class, create your own base class, and implement the start and end methods.

Example 5-8. Using the sgmllib Module to Filter SGML Documents

File: sgmllib-example-4.py

import sgmllib
import cgi, string, sys

class SGMLFilter(sgmllib.SGMLParser):
 # sgml filter. override start/end to manipulate
 # document elements

 def _ _init_ _(self, outfile=None, infile=None):
 sgmllib.SGMLParser._ _init_ _(self)
 if not outfile:
 outfile = sys.stdout
 self.write = outfile.write
 if infile:
 self.load(infile)

 def load(self, file):
 while 1:
 s = file.read(8192)
 if not s:
 break
 self.feed(s)
 self.close()

 def handle_entityref(self, name):
 self.write("&%s;" % name)

 def handle_data(self, data):
 self.write(cgi.escape(data))

 def unknown_starttag(self, tag, attrs):
 tag, attrs = self.start(tag, attrs)
 if tag:
 if not attrs:
 self.write("<%s>" % tag)
 else:
 self.write("<%s" % tag)
 for k, v in attrs:
 self.write(" %s=%s" % (k, repr(v)))
 self.write(">")

 def unknown_endtag(self, tag):
 tag = self.end(tag)
 if tag:
 self.write("" % tag)

 def start(self, tag, attrs):
 return tag, attrs # override

 def end(self, tag):
 return tag # override

class Filter(SGMLFilter):

 def fixtag(self, tag):
 if tag == "em":
 tag = "i"
 if tag == "string":
 tag = "b"
 return string.upper(tag)

 def start(self, tag, attrs):
 return self.fixtag(tag), attrs

 def end(self, tag):
 return self.fixtag(tag)

c = Filter()
c.load(open("samples/sample.htm"))

Core Modules

More Standard Modules

Threads and Processes

Data Representation

File Formats

Mail and News Message Processing

Network Protocols

Internationalization

Multimedia Modules

Data Storage

Tools and Utilities

Platform-Specific Modules

Implementation Support Modules

Other Modules