python xml.dom.minidom output whitespace removal

Started by
5 comments, last by Oluseyi 15 years, 10 months ago
Using Python 2.5 with xml.dom.minidom, my output is:
<?xml version="1.0" ?>
<Log>
    <msg>
        game start!
    </msg>
    <msg>
        game end.
    </msg>
</Log>
I want indentation, but I don't want whitespace in between the tags and the text of an element. Like this output:
<?xml version="1.0" ?>
<Log>
    <msg>game start!</msg>
    <msg>game end.</msg>
</Log>
This is my code:
import os
from xml.dom.minidom import getDOMImplementation
class LoggerXML():
	"""A small XML logger built on xml.dom.minidom.

	Messages are collected in an in-memory DOM document and are only
	written to disk when writexml() is called.
	"""

	def __init__(self, filename, root_element):
		"""Create an empty document whose root element is `root_element`.

		filename     -- output file name; it is joined onto the 'logs' directory.
		root_element -- tag name of the document's root element.
		"""
		self.filename = os.path.join('logs', filename)
		impl = getDOMImplementation()
		doc = self.doc = impl.createDocument(None, root_element, None)
		self.root = doc.documentElement

	def log(self, msg):
		"""Log message. Same as add_element() on the root followed by set_text()."""
		self.set_text(self.add_element(self.root, 'msg'), msg)

	def add_element(self, parent, element_name):
		"""Create a new `element_name` element as the last child of `parent`.

		When `parent` is None the element is appended to the root element.
		Returns the new element.
		"""
		if parent is None:
			parent = self.root
		element = self.doc.createElement(element_name)
		parent.appendChild(element)
		return element

	def set_text(self, parent, text):
		"""Append `text` to `parent`, then merge adjacent text nodes into one."""
		self.add_text(parent, text)
		self.element_normalize(parent)

	def get_text(self, parent):
		"""Return the text held by the first child of `parent` as a str.

		Returns None when `parent` has no children, or when its first child
		carries no value (e.g. it is an element rather than a text node).
		"""
		if not parent.childNodes:
			return None  # bad element / no text
		value = parent.childNodes[0].nodeValue
		if value is None:
			# Without this check str(None) would hand back the string 'None'.
			return None
		return str(value)

	def add_text(self, parent, text):
		"""Append a text node holding str(text) to `parent`.

		Returns `parent` (not the text node) so that calls can be chained.
		"""
		text_node = self.doc.createTextNode(str(text))
		parent.appendChild(text_node)
		return parent

	def element_normalize(self, element):
		"""Merge all adjacent text nodes below `element` into single nodes."""
		element.normalize()

	def writexml(self, indent='', addindent='    ', newl='\n'):
		"""Write the document to self.filename.

		indent    -- indentation of the current (top) level.
		addindent -- indentation added for every nesting level.
		newl      -- newline string written after each node.
		"""
		out = open(self.filename, "w")
		# try/finally (rather than 'with') keeps this runnable on Python 2.5
		# while still guaranteeing the file is closed and flushed.
		try:
			self.doc.writexml(out, indent, addindent, newl)
		finally:
			out.close()

if __name__ == '__main__':
	# Demo run: record two messages, then save the document to logs/log.xml.
	l = LoggerXML("log.xml", "Log")
	for message in ("game start!", "game end."):
		l.log(message)
	l.writexml()

Advertisement
Modify your writexml to apply addindent only when the current element's child is another element rather than text.
Can you point me in the right direction, e.g. pseudocode for the recursive function? I'm trying to rewrite .writexml(), but it's becoming more complicated than I thought.

I think I need a recursive function, but the problem is that I'm not exactly sure what I'm supposed to do, so it's a mess.
def _write_element(element, writer, indent, addindent, newl):
    """Write `element` and its subtree, keeping text-only elements on one line.

    element   -- a minidom Element node.
    writer    -- any object with a write(str) method.
    indent    -- indentation written before the element's opening tag.
    addindent -- extra indentation for each nesting level.
    newl      -- newline string written after each closing (or empty) tag.
    """
    # Local import so the helper does not depend on the file's import block.
    from xml.sax.saxutils import escape

    writer.write("%s<%s" % (indent, element.nodeName))
    attrs = element.attributes
    for name in sorted(attrs.keys()):
        value = escape(attrs[name].value, {'"': "&quot;"})
        writer.write(' %s="%s"' % (name, value))

    children = element.childNodes
    if not children:
        writer.write("/>" + newl)
        return

    writer.write(">")
    if all(child.nodeType == child.TEXT_NODE for child in children):
        # Text only: no indentation or newlines between the tags and the
        # text. The text node's own writexml() takes care of escaping.
        for child in children:
            child.writexml(writer, "", "", "")
    else:
        writer.write(newl)
        for child in children:
            if child.nodeType == child.ELEMENT_NODE:
                _write_element(child, writer, indent + addindent, addindent, newl)
            else:
                # Comments, mixed-content text, ...: minidom's default output.
                child.writexml(writer, indent + addindent, addindent, newl)
        writer.write(indent)
    writer.write("</%s>%s" % (element.nodeName, newl))


def _write_subtree(self, element, writer, indent='', addindent='    ', newl='\n'):
    """Write every child of `element` (not `element` itself) to `writer`.

    Children are written one level deeper than `indent`, i.e. at
    indent + addindent. Elements that contain nothing but text stay on a
    single line: <msg>game start!</msg>.
    """
    child_indent = indent + addindent
    for child in element.childNodes:
        if child.nodeType == child.ELEMENT_NODE:
            _write_element(child, writer, child_indent, addindent, newl)
        else:
            child.writexml(writer, child_indent, addindent, newl)


def _custom_writexml(self, writer, indent='', addindent='    ', newl='\n'):
    """Like .writexml(), except elements are not split onto multiple lines
    when all they contain is text.

    Writes the XML declaration followed by the tree below self.root.
    """
    writer.write('<?xml version="1.0" ?>' + newl)
    _write_element(self.root, writer, indent, addindent, newl)
This is the sample XML file that I'm trying to generate:
<?xml version="1.0" ?>
<Log>
    <msg>game start!</msg>
    <graphics>
        <resolution h="800" w="800"/>
        <LOD value="1"/>
    </graphics>
    <character name="jake">
        <loc x="20.3" z="-5"/>
    </character>
    <msg>game end.</msg>
</Log>


The problem is simple. In an XML document like this:
<?xml version="1.0" ?>
<Log>
    <msg>game start!</msg>
    <msg>game end.</msg>
</Log>

the msg nodes have child nodes (text nodes), and the default writer in xml.dom.minidom inserts a newline whenever a node has child nodes. You need to supply a writer that doesn't insert a newline when a node has a single text node child.

That's a rather elaborate undertaking just to change a single feature, so do what I did instead:

# This is a modification of your original code, with the function below added
# and a few new lines in your main function.
def new_writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for xml.dom.minidom.Element.writexml.

    Behaves like the stock method, except that an element whose only child
    is a text node is written on one line (<msg>text</msg>) instead of
    having the text placed on its own indented line.

    indent    -- current indentation
    addindent -- indentation to add to higher levels
    newl      -- newline string
    """
    # Local import so the function works wherever it is pasted.
    from xml.sax.saxutils import escape

    writer.write(indent + "<" + self.tagName)

    attrs = self._get_attributes()
    # sorted() works on both Python 2 lists and Python 3 key views.
    for a_name in sorted(attrs.keys()):
        value = escape(attrs[a_name].value, {'"': "&quot;"})
        writer.write(' %s="%s"' % (a_name, value))

    if not self.childNodes:
        writer.write("/>%s" % newl)
        return

    # This branch is the only difference between new_writexml and
    # xml.dom.minidom.Element.writexml.
    if len(self.childNodes) == 1 and self.childNodes[0].nodeType == self.TEXT_NODE:
        writer.write(">")
        # Let the text node write itself so '<' and '&' get escaped.
        self.childNodes[0].writexml(writer, "", "", "")
        writer.write("</%s>%s" % (self.tagName, newl))
        return

    writer.write(">%s" % newl)
    for node in self.childNodes:
        node.writexml(writer, indent + addindent, addindent, newl)
    writer.write("%s</%s>%s" % (indent, self.tagName, newl))


if __name__ == '__main__':
    import xml.dom.minidom

    # And here's where we hook up our new code to the existing codebase:
    # keep a reference to the stock method so it can be restored later.
    oldwritexml = xml.dom.minidom.Element.writexml
    xml.dom.minidom.Element.writexml = new_writexml

    l = LoggerXML("log.xml", "Log")
    l.log("game start!")
    l.log("game end.")
    l.writexml()


Let me know how it works out for you.
With some minor mods, Oluseyi's code does the trick. I've posted the details here:

http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace

(I'll also paste the updated code below.)

A tweak was needed to correctly handle entities within text nodes.

Thanks to Oluseyi for the solution,
Ron

import xml.dom.minidom


def fixed_writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for xml.dom.minidom.Element.writexml.

    Identical to the stock method except that an element whose only child
    is a text node is written on a single line, with no whitespace added
    around the text.

    indent    -- current indentation
    addindent -- indentation to add to higher levels
    newl      -- newline string
    """
    # Public escaping helper; minidom's private _write_data() does not have
    # the same signature in every Python release.
    from xml.sax.saxutils import escape

    writer.write(indent + "<" + self.tagName)

    attrs = self._get_attributes()
    for a_name in sorted(attrs.keys()):
        writer.write(" %s=\"" % a_name)
        writer.write(escape(attrs[a_name].value, {'"': "&quot;"}))
        writer.write("\"")

    if self.childNodes:
        if len(self.childNodes) == 1 and self.childNodes[0].nodeType == xml.dom.minidom.Node.TEXT_NODE:
            writer.write(">")
            # The text node escapes entities itself; no indent/newline args.
            self.childNodes[0].writexml(writer, "", "", "")
            writer.write("</%s>%s" % (self.tagName, newl))
            return
        writer.write(">%s" % newl)
        for node in self.childNodes:
            node.writexml(writer, indent + addindent, addindent, newl)
        writer.write("%s</%s>%s" % (indent, self.tagName, newl))
    else:
        writer.write("/>%s" % newl)


# replace minidom's function with ours
xml.dom.minidom.Element.writexml = fixed_writexml


[Edited by - Ron Rothman on June 15, 2008 11:24:38 AM]
Good news: I've come across a much simpler solution (if PyXML is installed, but you still want to use minidom to generate the document): xml.dom.ext.PrettyPrint. Here's a small wrapper for it:

from xml.dom.ext import PrettyPrint
from StringIO import StringIO


def toprettyxml_fixed(node, encoding='utf-8'):
    """Return `node` pretty-printed by PyXML's PrettyPrint as a string.

    node     -- the DOM node to serialise.
    encoding -- encoding name handed through to PrettyPrint.
    """
    buffer = StringIO()
    PrettyPrint(node, stream=buffer, encoding=encoding)
    pretty = buffer.getvalue()
    return pretty


[Edited by - Ron Rothman on June 15, 2008 11:23:37 AM]
Nice find, Ron! And thanks for the credit on your blog! [smile]

This topic is closed to new replies.

Advertisement