I l@ve RuBoard

12.13 Module: Converting a List of Equal-Length Lists into XML

Credit: Julius Welby

Parsers of tabular data or comma-separated values (CSV) files usually output a list of lists. Converting these into XML allows them to be manipulated with XSLT and other XML tools. Example 12-2 takes a list of equal-length lists and converts it into XML (or, optionally, into an HTML table).

Example 12-2. Converting a list of equal-length lists into XML

# LL2XML.py -- Version 0.3 -- 15 July 2001
# http://www.outwardlynormal.com/python/ll2XML.htm for the full docs
import string

# Set up exceptions
class Error(Exception):
    """Exception raised by LL2XML when its input is malformed.

    errcode selects the human-readable message:

    "Length Error - Sublists"
        the sublists are not all the same length.
    "Heading Error - Empty Item"
        at least one heading is empty.
    "Heading Error - heading/sublist mismatch" or
    "Length Error: heading/sublist mismatch"
        the number of headings differs from the sublist length;
        heading_num and sublist_length are reported in the message.

    Any other errcode yields an empty message.

    Attributes: errcode (the code passed in), message (list of message
    fragments) and errmsg (the fragments joined with single spaces,
    which is also what str() returns).
    """

    def __init__(self, errcode, heading_num=0, sublist_length=0):
        self.errcode = errcode
        if errcode == "Length Error - Sublists":
            self.message = ["All the sublists must be of uniform length."]
        elif errcode == "Heading Error - Empty Item":
            self.message = ["There is at least one empty heading item.\n",
                            "Please supply non-empty headings."]
        elif errcode in ("Heading Error - heading/sublist mismatch",
                         "Length Error: heading/sublist mismatch"):
            # Both spellings are accepted: the first is the code LL2XML
            # raises, the second is the code this class always recognized.
            # repr() inserts the actual counts into the message.
            self.message = ["Number of headings =", repr(heading_num), "\n",
                            "Number of elements in sublists =",
                            repr(sublist_length), "\n",
                            "These numbers must be equal."]
        else:
            self.message = [""]
        self.errmsg = " ".join(self.message)

    def __str__(self):
        return self.errmsg

def escape(s):
    """ Replace special characters '&', "'", '<', '>', and '"'
       with XML entities.

       s is a string; the escaped copy is returned. """
    # '&' must be handled first, otherwise the ampersands introduced by
    # the other replacements would themselves be escaped again.
    s = s.replace("&", "&amp;")
    s = s.replace("'", "&apos;")
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
    s = s.replace('"', "&quot;")
    return s

def cleanTag(s):
    """ Turn s into a string usable as an element name.

       A non-string value is first converted with repr(). The result is
       lowercased, its spaces are replaced by underscores, and it is
       passed through escape(). """
    if not isinstance(s, str):
        # Use the value's own representation rather than a fixed
        # placeholder, so different non-string headings stay distinct.
        s = repr(s)
    s = s.lower()
    s = s.replace(" ", "_")
    return escape(s)

def LL2XML(LL, headings_tuple = (  ), root_element = "rows",
        row_element = "row", xml_declared = "yes"):
    """ Convert a list of equal-length lists into an XML string.

       LL             -- list of sublists, one per row.
       headings_tuple -- element names for the items of each row. If
                         empty, the first sublist of LL supplies the
                         names and is not output as data. The special
                         value "table" makes every item a <td>, every
                         row a <tr> and the root <table>, and turns the
                         XML declaration off.
       root_element   -- name of the enclosing element.
       row_element    -- name of the element wrapping each row.
       xml_declared   -- "yes" to start the output with an XML
                         declaration; any other value omits it.

       Element names are normalized with cleanTag(); item values that
       are not strings are converted with repr(), and all values are
       passed through escape().

       Returns the document as a single string. An empty LL gives a
       document containing only the (empty) root element.

       Raises Error("Length Error - Sublists") if the sublists differ
       in length, Error("Heading Error - heading/sublist mismatch", ...)
       if the number of headings differs from the sublist length, and
       Error("Heading Error - Empty Item") if any heading is false. """

    if headings_tuple == "table":
        if LL:
            headings_tuple = ("td",) * len(LL[0])
        root_element = "table"
        row_element = "tr"
        xml_declared = "no"

    root_element = cleanTag(root_element)
    row_element = cleanTag(row_element)

    if xml_declared == "yes":
        xml_declaration = '<?xml version="1.0" encoding="iso-8859-1"?>\n'
    else:
        xml_declaration = ""

    if not LL:
        # No rows at all: there is nothing to validate, and LL[0] below
        # would fail, so emit just the empty root element.
        return "".join([xml_declaration, '<', root_element, '>',
                        "\n</", root_element, ">"])

    if not headings_tuple:
        headings =  LL[0]
        firstRow = "headings"
    else:
        headings = headings_tuple
        firstRow = "data"

    # Sublists all of the same length?
    sublist_length = len(LL[0])
    for sublist in LL:
        if len(sublist) != sublist_length:
            raise Error("Length Error - Sublists")

    # Check headings
    heading_num = len(headings)
    if heading_num != sublist_length:
        raise Error("Heading Error - heading/sublist mismatch",
                     heading_num, sublist_length)

    for item in headings:
        if not item:
            raise Error("Heading Error - Empty Item")

    # Do the conversion
    # The element names are the same for every row, so clean them once.
    tags = [cleanTag(heading) for heading in headings]
    bits = [xml_declaration, '<', root_element, '>']
    if firstRow == "headings":
        LL = LL[1:]         # Remove redundant heading row, if present
    for sublist in LL:
        bits.extend(["\n  <", row_element, ">\n"])
        # Lengths were validated above, so zip pairs every item with its tag.
        for tag, item in zip(tags, sublist):
            if not isinstance(item, str):
                item = repr(item)
            item = escape(item)
            bits.extend(["    <", tag, ">", item, "</", tag, ">\n"])
        bits.extend(["  </", row_element, ">"])
    bits.extend(["\n</", root_element, ">"])
    return "".join(bits)

def test(  ):
    """ Print the output of LL2XML for four example calls.

       Each print call takes a single argument, so the text written is
       the same whether print is a statement or a function. """
    LL = [
   ['Login', 'First Name', 'Last Name', 'Job', 'Group', 'Office', 'Permission'],
   ['auser', 'Arnold', 'Atkins', 'Partner', 'Tax', 'London', 'read'],
   ['buser', 'Bill', 'Brown', 'Partner', 'Tax', 'New York', 'read'],
   ['cuser', 'Clive', 'Cutler', 'Partner', 'Management', 'Brussels', 'read'],
   ['duser', 'Denis', 'Davis', 'Developer', 'ISS', 'London', 'admin'],
   ['euser', 'Eric', 'Ericsson', 'Analyst', 'Analysis', 'London', 'admin'],
   ['fuser', 'Fabian', 'Fowles', 'Partner', 'IP', 'London', 'read']
         ]

    # The same data without its heading row, for the explicit-headings case
    LL_no_heads = LL[1:]

    # Example 1
    print("Example 1: Simple case, using defaults.\n")
    print(LL2XML(LL))
    print("")

    # Example 2
    print("""Example 2: LL has its headings in the first line,
and we define our root and row element names.\n""")
    print(LL2XML(LL,(  ),"people","person"))
    print("")

    # Example 3
    print("""Example 3: headings supplied using the headings argument(tuple),
using default root and row element names.\n""")
    print(LL2XML(LL_no_heads,
        ("Login","First Name","Last Name","Job","Group","Office","Permission")))
    print("")

    #Example 4
    print("""Example 4: The special case where we ask for an HTML table as
    output by just giving the string "table" as the second argument.\n""")
    print(LL2XML(LL,"table"))
    print("")

# Run the examples only when this file is executed as a script.
if __name__ == '__main__':
    test(  )

If the first sublist is a list of headings, these are used to form the element names for the rest of the data; otherwise, the element names can be supplied in the function call. The root and row elements can also be named if required.

This recipe is coded for compatibility with all versions of Python, including extremely old versions, to the point of reimplementing the escape functionality rather than relying on the functions supplied by Python's standard library.

12.13.1 See Also

For the specific job of parsing CSV you should probably use one of the existing Python modules available at the Vaults of Parnassus (http://www.vex.net/parnassus/apyllo.py?find=csv); two such parsers are at http://tratt.net/laurie/python/asv/ and http://www.object-craft.com.au/projects/csv/; the permanent home of this module is http://www.outwardlynormal.com/python/ll2XML.htm.

I l@ve RuBoard