Pages

Sunday, August 10, 2014

Parse xml with namespace and store data to data dictionary - Python


import sys
import xml.etree.ElementTree as ET

class XmlParser:
    def fixAttrs(self, attrs, c):
        nattrs = {}
        for attr in attrs:
            nattrs[self.buildXmlnsKey(attr, c)] = attrs[attr]
        return nattrs

    def buildXmlnsKey(self, tagtxt, tcounter):
        done = False
        if not tagtxt.startswith("{"):
            return tagtxt
        #print 'Search for: ', tagtxt, ' in ', tcounter
        if self.nsmap.has_key(str(tcounter)):
            nslmap = self.nsmap[str(tcounter)]
            #print 'nslmap-->', nslmap
            for obj in nslmap:
                if done is False:
                    for key in obj.keys():
                        kstr = '{' + key + '}'
                        #print 'key--->', key, ', --->', kstr
                        if tagtxt.startswith(kstr) and done is False:
                            done = True
                            kval = obj[key]
                            #print 'need to replace to: ', kval
                            if len(kval):
                                tagtxt = tagtxt.replace(kstr, kval+':', 1)
                            else:
                                tagtxt = tagtxt.replace(kstr, '', 1)

        if done is False and tcounter > 0:
            tcounter = tcounter - 1
            return self.buildXmlnsKey(tagtxt, tcounter)
        return tagtxt
                        

    def xmlToDict(self, node, dictclass = None):
        if dictclass is None:
            dictclass = {}
        self.ncounter = self.ncounter + 1
        if len(node):        
            if node.attrib:
                #print node.attrib
                dictclass['<<attr>>'] = self.fixAttrs(node.attrib, self.ncounter);            
            for child in node:
                tagtxt = self.buildXmlnsKey(child.tag, self.ncounter)
                newItem = self.xmlToDict(child)
                #tagtxt = child.tag
                if dictclass.has_key(tagtxt):
                    if type(dictclass[tagtxt]) is type([]):
                        dictclass[tagtxt].append(newItem)
                    else:
                        oldItem = dictclass[tagtxt];
                        dictclass[tagtxt] = [];
                        dictclass[tagtxt].append(oldItem);
                        dictclass[tagtxt].append(newItem);
                else:
                    dictclass[tagtxt] = newItem
        else:
            if node.text is None:
                text = ''
            else:
                text = node.text.strip()

            
            if node.attrib:
                #print fixAttrs(node.attrib, ncounter)
                dictclass['<<attr>>'] = self.fixAttrs(node.attrib, self.ncounter)
                dictclass['<<value>>'] = text;
            else:
                dictclass = text;

        return dictclass

    def printDic(self, dic, pos = None):
        if pos is None:
            pos = 0
        for key in dic.keys():
            #print 'key--->', key
            if dic[key] is None:
                print self.getLenStr(pos), key, ''
            elif type(dic[key]) is type({}):
                npos = pos + 1
                print self.getLenStr(pos) + str(key), '{'
                self.printDic(dic[key], npos)
                print self.getLenStr(pos) + '}'
            elif type(dic[key]) is type([]):
                npos = pos + 1
                print self.getLenStr(pos) + str(key), '['
                self.printList(dic[key], npos)
                print self.getLenStr(pos) + ']'
            else:
                print self.getLenStr(pos) + str(key), ': ', dic[key]

    def printList(self, dic, pos = None):
        if pos is None:
            pos = 0
        lindex = -1
        for obj in dic:
            lindex = lindex + 1            
            if obj is not None:
                if type(obj) is type({}):
                    print self.getLenStr(pos) + str(lindex)+'. {'
                    npos = pos + 1
                    self.printDic(obj, npos)
                    print self.getLenStr(pos) + '}'
                elif type(obj) is type([]):
                    print self.getLenStr(pos) + str(lindex)+'. ['
                    npos = pos + 1
                    self.printList(obj, npos)
                    print self.getLenStr(pos) +']'
                else:
                    print self.getLenStr(pos), str(lindex)+'.', obj

    def getLenStr(self, pos):
        sstr = ''
        while pos > 0:
            sstr = sstr + '   '
            pos = pos - 1
        return sstr

    def __init__(self, fileLocation):
        self.location = fileLocation;

    def parse(self):
        tree = ET.parse(self.location)
        root = tree.getroot()
        self.nsmap = {}
        self.lcounter = 0

        for event, elem in ET.iterparse(self.location, events=('start', 'end', 'start-ns', 'end-ns')):
            if event == 'start-ns':
                #print 'start-ns', lcounter
                a, b = elem
                if b is not None and len(b):
                    #print "A-->", a, ", B-->" , b
                    scounter = self.lcounter + 0
                    if not self.nsmap.has_key(str(scounter)):
                        self.nsmap[str(scounter)] = []
                        self.nsmap[str(scounter)].append({'http://www.w3.org/XML/1998/namespace': 'xml'});
                    self.nsmap[str(scounter)].append({b: str(a)})
            elif event == 'start':
                #print 'start', lcounter
                if not self.nsmap.has_key(str(self.lcounter)) and self.nsmap.has_key(str(self.lcounter - 1)):
                    #print 'exist: ', nsmap[str(lcounter - 1)]
                    self.nsmap[str(self.lcounter)] = self.nsmap[str(self.lcounter - 1)];
                self.lcounter = self.lcounter + 1       

        self.ncounter = 0
        self.dic = self.xmlToDict(root)
        self.dic = {self.buildXmlnsKey(root.tag, 0): self.dic}
        return self.dic;

if __name__ == "__main__":
    xmlObj = XmlParser('xml7.xml');
    theXmlDictionary = xmlObj.parse()
    xmlObj.printDic(theXmlDictionary);

Suppose you have the xml as file:


<?xml version="1.0"?>
<lib:library
    xmlns:lib="http://eric.van-der-vlist.com/ns/library"
    xmlns:hr="http://eric.van-der-vlist.com/ns/person">
    <lib:book id="b0836217462" available="true">
        <lib:isbn>0836217462</lib:isbn>
        <lib:title xml:lang="en">Being a Dog Is a Full-Time Job</lib:title>
        <hr:author id="CMS">
            <hr:name>Charles M Schulz</hr:name>
            <hr:born>1922-11-26</hr:born>
            <hr:dead>2000-02-12</hr:dead>
        </hr:author>
        <lib:character id="PP">
            <hr:name>Peppermint Patty</hr:name>
            <hr:born>1966-08-22</hr:born>
            <lib:qualification>bold, brash and tomboyish</lib:qualification>
        </lib:character>
        <lib:character id="Snoopy">
            <hr:name>Snoopy</hr:name>
            <hr:born>1950-10-04</hr:born>
            <lib:qualification>extroverted beagle</lib:qualification>
        </lib:character>
        <lib:character id="Schroeder">
            <hr:name>Schroeder</hr:name>
            <hr:born>1951-05-30</hr:born>
            <lib:qualification>brought classical music to the Peanuts strip</lib:qualification>
        </lib:character>
        <lib:character id="Lucy">
            <hr:name>Lucy</hr:name>
            <hr:born>1952-03-03</hr:born>
            <lib:qualification>bossy, crabby and selfish</lib:qualification>
        </lib:character>
    </lib:book>
    <Purchase>
        <PurchaseId>AAAAA</PurchaseId>
        <PurchaseType>ONLINE</PurchaseType>
    </Purchase>
    <Purchase>
        <PurchaseId>BBBBB</PurchaseId>
        <PurchaseType>OFFLINE</PurchaseType>
    </Purchase>
    <Purchase paid='True'>
        <Purchase age='30'>
            <Purchase>HMM 1</Purchase>
            <Purchase>HMM 2</Purchase>
        </Purchase>
    </Purchase>
</lib:library>

Output would be like this:

lib:library {
   Purchase [
      0. {
         PurchaseId :  AAAAA
         PurchaseType :  ONLINE
      }
      1. {
         PurchaseId :  BBBBB
         PurchaseType :  OFFLINE
      }
      2. {
         <<attr>> {
            paid :  True
         }
         Purchase {
            <<attr>> {
               age :  30
            }
            Purchase [
                0. HMM 1
                1. HMM 2
            ]
         }
      }
   ]
   lib:book {
      hr:author {
         <<attr>> {
            id :  CMS
         }
         hr:name :  Charles M Schulz
         hr:born :  1922-11-26
         hr:dead :  2000-02-12
      }
      <<attr>> {
         available :  true
         id :  b0836217462
      }
      lib:title {
         <<attr>> {
            xml:lang :  en
         }
         <<value>> :  Being a Dog Is a Full-Time Job
      }
      lib:isbn :  0836217462
      lib:character [
         0. {
            <<attr>> {
               id :  PP
            }
            hr:name :  Peppermint Patty
            hr:born :  1966-08-22
            lib:qualification :  bold, brash and tomboyish
         }
         1. {
            <<attr>> {
               id :  Snoopy
            }
            hr:name :  Snoopy
            hr:born :  1950-10-04
            lib:qualification :  extroverted beagle
         }
         2. {
            <<attr>> {
               id :  Schroeder
            }
            hr:name :  Schroeder
            hr:born :  1951-05-30
            lib:qualification :  brought classical music to the Peanuts strip
         }
         3. {
            <<attr>> {
               id :  Lucy
            }
            hr:name :  Lucy
            hr:born :  1952-03-03
            lib:qualification :  bossy, crabby and selfish
         }
      ]
   }
}

No comments:

Post a Comment