python xml.dom.minidom output whitespace removal

Started by
5 comments, last by Oluseyi 15 years, 10 months ago
Using Python 2.5 with xml.dom.minidom, my output is:
<?xml version="1.0" ?>
<Log>
    <msg>
        game start!
    </msg>
    <msg>
        game end.
    </msg>
</Log>
I want indentation, but I don't want whitespace between an element's tags and its text. I want output like this:
<?xml version="1.0" ?>
<Log>
    <msg>game start!</msg>
    <msg>game end.</msg>
</Log>
This is my code:
import os
from xml.dom.minidom import getDOMImplementation
class LoggerXML():
	"""Build an XML log document in memory with xml.dom.minidom.

	Elements and text are appended to a DOM tree held in ``self.doc``;
	nothing is written to disk until ``writexml()`` is called.
	"""

	def __init__(self, filename, root_element):
		"""Create an empty document whose root tag is *root_element*.

		filename     -- log file name; it is joined onto the 'logs' directory.
		root_element -- tag name of the document's root element.
		"""
		self.filename = os.path.join('logs', filename)
		impl = getDOMImplementation()
		doc = self.doc = impl.createDocument(None, root_element, None)
		self.root = doc.documentElement

	def log(self, msg):
		"""Append a <msg> element containing *msg* to the root element."""
		self.set_text(self.add_element(self.root, 'msg'), msg)

	def add_element(self, parent, element_name):
		"""Create element *element_name* as the last child of *parent*.

		If *parent* is None the new element is attached to the root.
		Returns the newly created element.
		"""
		if parent is None:
			parent = self.root
		element = self.doc.createElement(element_name)
		parent.appendChild(element)
		return element

	def set_text(self, parent, text):
		"""Append *text* to *parent*, then merge adjacent text nodes."""
		self.add_text(parent, text)
		self.element_normalize(parent)

	def get_text(self, parent):
		"""Return the value of *parent*'s first child as a string.

		Returns None when *parent* has no children.
		"""
		if parent.childNodes:
			return str(parent.childNodes[0].nodeValue)
		return None  # bad element / no text

	def add_text(self, parent, text):
		"""Append a text node holding ``str(text)`` to *parent*.

		Returns *parent* (not the text node) so calls can be chained.
		"""
		text_node = self.doc.createTextNode(str(text))
		parent.appendChild(text_node)
		return parent

	def element_normalize(self, element):
		"""Merge adjacent text nodes under *element* into single nodes.

		Delegates to the DOM ``normalize()`` method, which also drops empty
		text nodes and applies to the whole subtree below *element*.
		"""
		element.normalize()

	def writexml(self, indent='', addindent='    ', newl='\n'):
		"""Write the document to ``self.filename``.

		indent    -- indentation of the current (top) level.
		addindent -- extra indentation added for each nested level.
		newl      -- string used to terminate lines.

		The target directory must already exist; open() raises otherwise.
		"""
		out = open(self.filename, "w")
		try:
			self.doc.writexml(out, indent, addindent, newl)
		finally:
			# Always release the handle so buffered output is flushed to disk.
			out.close()

if __name__ == '__main__':
	# Demo: build a <Log> document in memory and record two messages.
	logger = LoggerXML("log.xml", "Log")
	for message in ("game start!", "game end."):
		logger.log(message)

Modify your writexml to apply addindent (and the newline) only when an element's children are other elements, not when its only child is a text node.
Can you point me in the right direction, e.g. with pseudocode for the recursive function? I'm trying to rewrite .writexml(), but it's becoming more complicated than I thought.

I think I need a recursive function, but I'm not exactly sure what it is supposed to do, so it's a mess.
def _write_subtree(self, element, writer, indent='', addindent='    ', newl='\n'):
	"""Recursively write *element* and everything below it to *writer*.

	element   -- DOM element to serialize.
	writer    -- object with a write(str) method.
	indent    -- indentation placed before *element*'s opening tag.
	addindent -- extra indentation added for each nested level.
	newl      -- string used to terminate lines.

	An element whose only child is a text node is kept on one line
	(<msg>text</msg>); an element without children is written as <tag/>.
	"""
	from xml.sax.saxutils import escape

	writer.write("%s<%s" % (indent, element.tagName))

	# Attributes are emitted in sorted order so the output is deterministic.
	attrs = element.attributes
	for name in sorted(attrs.keys()):
		value = escape(attrs[name].value, {'"': '&quot;'})
		writer.write(' %s="%s"' % (name, value))

	children = element.childNodes
	if not children:
		writer.write("/>%s" % newl)
		return

	if len(children) == 1 and children[0].nodeType == element.TEXT_NODE:
		# Text-only element: no newline or indent between the tags and the text.
		writer.write(">")
		children[0].writexml(writer, "", "", "")
		writer.write("</%s>%s" % (element.tagName, newl))
		return

	writer.write(">%s" % newl)
	for child in children:
		if child.nodeType == element.ELEMENT_NODE:
			self._write_subtree(child, writer, indent + addindent, addindent, newl)
		else:
			# Text, comments, etc. already know how to serialize themselves.
			child.writexml(writer, indent + addindent, addindent, newl)
	writer.write("%s</%s>%s" % (indent, element.tagName, newl))


def _custom_writexml(self, writer, indent='', addindent='    ', newl='\n'):
	"""Write the XML declaration and the tree rooted at ``self.root``.

	Like minidom's writexml(), except that elements containing only text
	are not split over several lines. Parameters are as for
	``_write_subtree``.
	"""
	writer.write('<?xml version="1.0" ?>%s' % newl)
	self._write_subtree(self.root, writer, indent, addindent, newl)
This is the sample XML file that I'm trying to generate:
<?xml version="1.0" ?>
<Log>
    <msg>game start!</msg>
    <graphics>
        <resolution h="800" w="800"/>
        <LOD value="1"/>
    </graphics>
    <character name="jake">
        <loc x="20.3" z="-5"/>
    </character>
    <msg>game end.</msg>
</Log>

The problem is simple. In an XML document like this:
<?xml version="1.0" ?>
<Log>
    <msg>game start!</msg>
    <msg>game end.</msg>
</Log>

the msg nodes have child nodes (text nodes), and the default writer in xml.dom.minidom inserts a newline whenever a node has child nodes. You need to supply a writer that doesn't insert a newline when a node has a single text node child.

That's a rather elaborate undertaking just to change a single feature, so do what I did instead:

# This is a modification of your original code, with the function below added
# and a few new lines in your main function.
def new_writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for xml.dom.minidom.Element.writexml.

    Behaves like the stock method, except that an element whose only child
    is a text node is written on a single line (<tag>text</tag>).

    writer    -- object with a write(str) method.
    indent    -- current indentation.
    addindent -- indentation to add for each nested level.
    newl      -- newline string.
    """
    from xml.sax.saxutils import escape

    writer.write(indent + "<" + self.tagName)

    # Sorted attribute order keeps the output deterministic.
    attrs = self.attributes
    for a_name in sorted(attrs.keys()):
        a_value = escape(attrs[a_name].value, {'"': '&quot;'})
        writer.write(' %s="%s"' % (a_name, a_value))

    if not self.childNodes:
        writer.write("/>%s" % newl)
        return

    # This branch is the only difference from the stock Element.writexml.
    # The text node serializes itself so that '&', '<' and '>' are escaped.
    if len(self.childNodes) == 1 and self.childNodes[0].nodeType == self.TEXT_NODE:
        writer.write(">")
        self.childNodes[0].writexml(writer, "", "", "")
        writer.write("</%s>%s" % (self.tagName, newl))
        return

    writer.write(">%s" % newl)
    for node in self.childNodes:
        node.writexml(writer, indent + addindent, addindent, newl)
    writer.write("%s</%s>%s" % (indent, self.tagName, newl))


if __name__ == '__main__':
    import xml.dom.minidom

    # And here's where we hook up our new code to the existing codebase.
    # The stock method is kept in oldwritexml so it can be restored later.
    oldwritexml = xml.dom.minidom.Element.writexml
    xml.dom.minidom.Element.writexml = new_writexml

    l = LoggerXML("log.xml", "Log")
    l.log("game start!")
    l.log("game end.")
    l.writexml()

Let me know how it works out for you.
With some minor mods, Oluseyi's code does the trick. I've posted the details here:

(I'll also paste the updated code below.)

A tweak was needed to correctly handle entities within text nodes.

Thanks to Oluseyi for the solution,

import xml.dom.minidom


def fixed_writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for xml.dom.minidom.Element.writexml.

    An element whose only child is a text node is written on one line
    (<tag>text</tag>); every other element is written as minidom does.

    writer    -- object with a write(str) method.
    indent    -- current indentation.
    addindent -- indentation to add for each nested level.
    newl      -- newline string.
    """
    from xml.sax.saxutils import escape

    writer.write(indent + "<" + self.tagName)

    # Sorted attribute order keeps the output deterministic.
    attrs = self.attributes
    for a_name in sorted(attrs.keys()):
        writer.write(" %s=\"" % a_name)
        writer.write(escape(attrs[a_name].value, {'"': '&quot;'}))
        writer.write("\"")

    if self.childNodes:
        only_text = (
            len(self.childNodes) == 1
            and self.childNodes[0].nodeType == xml.dom.minidom.Node.TEXT_NODE
        )
        if only_text:
            writer.write(">")
            # Let the text node write itself so entities are escaped.
            self.childNodes[0].writexml(writer, "", "", "")
            writer.write("</%s>%s" % (self.tagName, newl))
            return
        writer.write(">%s" % (newl))
        for node in self.childNodes:
            node.writexml(writer, indent + addindent, addindent, newl)
        writer.write("%s</%s>%s" % (indent, self.tagName, newl))
    else:
        writer.write("/>%s" % (newl))


# replace minidom's function with ours
xml.dom.minidom.Element.writexml = fixed_writexml

[Edited by - Ron Rothman on June 15, 2008 11:24:38 AM]
Good news: I've come across a much simpler solution (if PyXML is installed, but you still want to use minidom to generate the document): xml.dom.ext.PrettyPrint. Here's a small wrapper for it:

from xml.dom.ext import PrettyPrint
from StringIO import StringIO


def toprettyxml_fixed(node, encoding='utf-8'):
    """Return *node* pretty-printed by PrettyPrint as a string.

    node     -- DOM node passed to PrettyPrint.
    encoding -- encoding name passed through to PrettyPrint.
    """
    # PrettyPrint writes to a stream, so capture its output in memory.
    buf = StringIO()
    PrettyPrint(node, stream=buf, encoding=encoding)
    return buf.getvalue()

[Edited by - Ron Rothman on June 15, 2008 11:23:37 AM]
Nice find, Ron! And thanks for the credit on your blog! [smile]

This topic is closed to new replies.
