Sunday, April 28, 2013

Convert xml formatting using python

I needed a script to convert an XML file from one format to another.


For instance,

Change this:

<subSection name="tag1" title="tag 1 title" >
</subSection>

To:

<subForm name="tag1form" title="tag 1 title" >
<section name="tag1" >

</section>
</subForm>


I first tried doing this in bash, where sed was the obvious choice, but it didn't seem feasible because I also wanted to change the values of XML attributes.

Here is my attempt with sed; it is unfinished:



# $1: input file
# Incomplete: this only renames the subSection tags; it does not touch
# the name attribute or add the nested <section> element.

# IFS= and -r keep leading whitespace and backslashes in each line intact.
while IFS= read -r line
do
    # Anchor on the opening "<" so "<subSection" becomes "<subForm" (not
    # "<<subForm") and closing tags are left for the second expression.
    printf '%s\n' "$line" | sed -e 's/<subSection/<subForm/g' -e 's/<\/subSection/<\/subForm/g'
done < "$1"

A quick-and-dirty solution seemed more feasible in Python:


# Usage: python convert_xml.py source_file.xml > converted_file.xml

import sys
import re

# Matches a name="..." / Name='...' attribute (optional spaces after "=")
# whose value contains only letters, digits, underscores and hyphens.
# Written as a raw string so the regex escape "\-" is not parsed as an
# (invalid) string escape sequence.
formNameMatch = re.compile(r"""[nN]ame= *["'][a-zA-Z0-9_\-]+["']""")

def ProcessXML(file):
    """Convert the XML file at path *file* line by line.

    Each line is handed, in order and unmodified, to ProcessLine, which
    prints the converted form.

    Args:
        file: Path of the XML file to read.
    """
    # The context manager closes the handle even if ProcessLine raises;
    # iterating the file object streams it instead of loading it whole.
    with open(file, 'r') as source:
        for line in source:
            ProcessLine(line)

def _emit(text):
    """Write *text* to stdout followed by exactly one newline."""
    sys.stdout.write(text + "\n")


def _add_form_suffix(line):
    """Append "Form" to the value of the first name attribute in *line*.

    Returns:
        A ``(new_line, name_attr)`` pair, where *name_attr* is the original
        attribute text (for example ``name="tag1"``), or ``None`` when the
        line has no attribute matched by ``formNameMatch``.
    """
    found = formNameMatch.search(line)
    if found is None:
        return None
    name_attr = found.group(0)
    # A match always ends with its closing quote, so the suffix goes just
    # before the last character, whichever quote style was used.
    form_attr = name_attr[:-1] + "Form" + name_attr[-1]
    # Splice by position so only the matched attribute is rewritten, even
    # if another name-like attribute appears later on the same line.
    return line[:found.start()] + form_attr + line[found.end():], name_attr


def ProcessLine(line):
    """Convert one line of the source XML and write the result to stdout.

    Rules, tried in order:

    * ``<subSections ...``  -> renamed to ``subForms``.
    * ``<subSection ...``   -> renamed to ``subForm`` with "Form" appended to
      its name value, followed by an opening ``<sections>`` and a
      ``<section>`` carrying the original name attribute.  If the tag has
      no name attribute, ``ERROR!!!`` is written in its place.
    * ``</subSection>``     -> closes ``</section>`` and ``</sections>``,
      then writes ``</subForm>``.
    * ``</subSections>``    -> ``</subForms>``.
    * ``multiSubSection``   -> renamed to ``multiSubForm``, with "Form"
      appended to the name value when one is present.
    * anything else         -> written unchanged.

    Args:
        line: One line of input, with or without its line terminator.
    """
    # Drop the terminator so each output line ends in exactly one newline
    # instead of the input's newline plus the one added on output.
    line = line.rstrip("\r\n")

    if re.search(r'<subSection', line):
        if re.search(r'subSections', line):
            _emit(re.sub(r'subSections', 'subForms', line))
            return

        renamed = _add_form_suffix(re.sub(r'subSection', 'subForm', line))
        if renamed is None:
            _emit("ERROR!!!")
            return
        form_line, name_attr = renamed
        _emit(form_line)
        _emit(" <sections>")
        # Joining with single spaces reproduces the spacing of the original
        # comma-separated print statement.
        _emit(" ".join((" <section ", name_attr, " >")))
    elif re.search(r'<\/subSection *>', line):
        _emit(" </section>")
        _emit(" </sections>")
        _emit(re.sub(r'subSection', 'subForm', line))
    elif re.search(r'<\/subSections>', line):
        _emit(re.sub(r'subSections', 'subForms', line))
    elif re.search(r'multiSubSection', line):
        line = re.sub(r'subSection', 'subForm', line)
        line = re.sub(r'SubSection', 'SubForm', line)
        renamed = _add_form_suffix(line)
        # Tags without a name attribute (such as the closing tag) are still
        # written after the rename rather than being dropped.
        _emit(line if renamed is None else renamed[0])
    else:
        _emit(line)


def main(*args):
    """Command-line entry point: convert the file named by the sole argument.

    Args:
        *args: The argument vector laid out like ``sys.argv``: the program
            name followed by the path of the XML file to convert.

    Returns:
        0 after the file has been processed, 1 when the argument count is
        wrong.
    """
    if len(args) != 2:
        # Report on stderr: stdout is meant to be redirected into the
        # converted file, so a message there would end up in the output.
        sys.stderr.write("must supply file to process\n")
        return 1
    ProcessXML(args[1])
    return 0

if __name__ == "__main__":
    # Hand main()'s return value to sys.exit so it becomes the process
    # exit status (a return value of None exits with status 0).
    sys.exit(main(*sys.argv))



No comments:

Post a Comment