Showing posts with label parse raw xml. Show all posts
Showing posts with label parse raw xml. Show all posts

Sunday, September 11, 2016

Python parse xml


import sys
import xml.etree.ElementTree as ET

class XmlParser:
    def fixAttrs(self, attrs, c):
        nattrs = {}
        for attr in attrs:
            nattrs[self.buildXmlnsKey(attr, c)] = attrs[attr]
        return nattrs

    def buildXmlnsKey(self, tagtxt, tcounter):
        done = False
        if not tagtxt.startswith("{"):
            return tagtxt
        #print 'Search for: ', tagtxt, ' in ', tcounter
        if self.nsmap.has_key(str(tcounter)):
            nslmap = self.nsmap[str(tcounter)]
            #print 'nslmap-->', nslmap
            for obj in nslmap:
                if done is False:
                    for key in obj.keys():
                        kstr = '{' + key + '}'
                        #print 'key--->', key, ', --->', kstr
                        if tagtxt.startswith(kstr) and done is False:
                            done = True
                            kval = obj[key]
                            #print 'need to replace to: ', kval
                            if len(kval):
                                tagtxt = tagtxt.replace(kstr, kval+':', 1)
                            else:
                                tagtxt = tagtxt.replace(kstr, '', 1)

        if done is False and tcounter > 0:
            tcounter = tcounter - 1
            return self.buildXmlnsKey(tagtxt, tcounter)
        return tagtxt
                        

    def xmlToDict(self, node, dictclass = None):
        if dictclass is None:
            dictclass = {}
        self.ncounter = self.ncounter + 1
        if len(node):        
            if node.attrib:
                #print node.attrib
                dictclass['<<attr>>'] = self.fixAttrs(node.attrib, self.ncounter);            
            for child in node:
                tagtxt = self.buildXmlnsKey(child.tag, self.ncounter)
                newItem = self.xmlToDict(child)
                #tagtxt = child.tag
                if dictclass.has_key(tagtxt):
                    if type(dictclass[tagtxt]) is type([]):
                        dictclass[tagtxt].append(newItem)
                    else:
                        oldItem = dictclass[tagtxt];
                        dictclass[tagtxt] = [];
                        dictclass[tagtxt].append(oldItem);
                        dictclass[tagtxt].append(newItem);
                else:
                    dictclass[tagtxt] = newItem
        else:
            if node.text is None:
                text = ''
            else:
                text = node.text.strip()

            
            if node.attrib:
                #print fixAttrs(node.attrib, ncounter)
                dictclass['<<attr>>'] = self.fixAttrs(node.attrib, self.ncounter)
                dictclass['<<value>>'] = text;
            else:
                dictclass = text;

        return dictclass

    def printDic(self, dic, pos = None):
        if pos is None:
            pos = 0
        for key in dic.keys():
            #print 'key--->', key
            if dic[key] is None:
                print self.getLenStr(pos), key, ''
            elif type(dic[key]) is type({}):
                npos = pos + 1
                print self.getLenStr(pos) + str(key), '{'
                self.printDic(dic[key], npos)
                print self.getLenStr(pos) + '}'
            elif type(dic[key]) is type([]):
                npos = pos + 1
                print self.getLenStr(pos) + str(key), '['
                self.printList(dic[key], npos)
                print self.getLenStr(pos) + ']'
            else:
                print self.getLenStr(pos) + str(key), ': ', dic[key]

    def printList(self, dic, pos = None):
        if pos is None:
            pos = 0
        lindex = -1
        for obj in dic:
            lindex = lindex + 1            
            if obj is not None:
                if type(obj) is type({}):
                    print self.getLenStr(pos) + str(lindex)+'. {'
                    npos = pos + 1
                    self.printDic(obj, npos)
                    print self.getLenStr(pos) + '}'
                elif type(obj) is type([]):
                    print self.getLenStr(pos) + str(lindex)+'. ['
                    npos = pos + 1
                    self.printList(obj, npos)
                    print self.getLenStr(pos) +']'
                else:
                    print self.getLenStr(pos), str(lindex)+'.', obj

    def getLenStr(self, pos):
        sstr = ''
        while pos > 0:
            sstr = sstr + '   '
            pos = pos - 1
        return sstr

    def __init__(self, fileLocation):
        self.location = fileLocation;

    def parse(self):
        tree = ET.parse(self.location)
        root = tree.getroot()
        self.nsmap = {}
        self.lcounter = 0

        for event, elem in ET.iterparse(self.location, events=('start', 'end', 'start-ns', 'end-ns')):
            if event == 'start-ns':
                #print 'start-ns', lcounter
                a, b = elem
                if b is not None and len(b):
                    #print "A-->", a, ", B-->" , b
                    scounter = self.lcounter + 0
                    if not self.nsmap.has_key(str(scounter)):
                        self.nsmap[str(scounter)] = []
                        self.nsmap[str(scounter)].append({'http://www.w3.org/XML/1998/namespace': 'xml'});
                    self.nsmap[str(scounter)].append({b: str(a)})
            elif event == 'start':
                #print 'start', lcounter
                if not self.nsmap.has_key(str(self.lcounter)) and self.nsmap.has_key(str(self.lcounter - 1)):
                    #print 'exist: ', nsmap[str(lcounter - 1)]
                    self.nsmap[str(self.lcounter)] = self.nsmap[str(self.lcounter - 1)];
                self.lcounter = self.lcounter + 1       

        self.ncounter = 0
        self.dic = self.xmlToDict(root)
        self.dic = {self.buildXmlnsKey(root.tag, 0): self.dic}
        return self.dic;

if __name__ == "__main__":
    xmlObj = XmlParser('xml7.xml');
    theXmlDictionary = xmlObj.parse()
    xmlObj.printDic(theXmlDictionary);

Suppose you have the xml as file:


<?xml version="1.0"?>
<lib:library
    xmlns:lib="http://eric.van-der-vlist.com/ns/library"
    xmlns:hr="http://eric.van-der-vlist.com/ns/person">
    <lib:book id="b0836217462" available="true">
        <lib:isbn>0836217462</lib:isbn>
        <lib:title xml:lang="en">Being a Dog Is a Full-Time Job</lib:title>
        <hr:author id="CMS">
            <hr:name>Charles M Schulz</hr:name>
            <hr:born>1922-11-26</hr:born>
            <hr:dead>2000-02-12</hr:dead>
        </hr:author>
        <lib:character id="PP">
            <hr:name>Peppermint Patty</hr:name>
            <hr:born>1966-08-22</hr:born>
            <lib:qualification>bold, brash and tomboyish</lib:qualification>
        </lib:character>
        <lib:character id="Snoopy">
            <hr:name>Snoopy</hr:name>
            <hr:born>1950-10-04</hr:born>
            <lib:qualification>extroverted beagle</lib:qualification>
        </lib:character>
        <lib:character id="Schroeder">
            <hr:name>Schroeder</hr:name>
            <hr:born>1951-05-30</hr:born>
            <lib:qualification>brought classical music to the Peanuts strip</lib:qualification>
        </lib:character>
        <lib:character id="Lucy">
            <hr:name>Lucy</hr:name>
            <hr:born>1952-03-03</hr:born>
            <lib:qualification>bossy, crabby and selfish</lib:qualification>
        </lib:character>
    </lib:book>
    <Purchase>
        <PurchaseId>AAAAA</PurchaseId>
        <PurchaseType>ONLINE</PurchaseType>
    </Purchase>
    <Purchase>
        <PurchaseId>BBBBB</PurchaseId>
        <PurchaseType>OFFLINE</PurchaseType>
    </Purchase>
    <Purchase paid='True'>
        <Purchase age='30'>
            <Purchase>HMM 1</Purchase>
            <Purchase>HMM 2</Purchase>
        </Purchase>
    </Purchase>
</lib:library>

Output would be like this:

lib:library {
   Purchase [
      0. {
         PurchaseId :  AAAAA
         PurchaseType :  ONLINE
      }
      1. {
         PurchaseId :  BBBBB
         PurchaseType :  OFFLINE
      }
      2. {
         <<attr>> {
            paid :  True
         }
         Purchase {
            <<attr>> {
               age :  30
            }
            Purchase [
                0. HMM 1
                1. HMM 2
            ]
         }
      }
   ]
   lib:book {
      hr:author {
         <<attr>> {
            id :  CMS
         }
         hr:name :  Charles M Schulz
         hr:born :  1922-11-26
         hr:dead :  2000-02-12
      }
      <<attr>> {
         available :  true
         id :  b0836217462
      }
      lib:title {
         <<attr>> {
            xml:lang :  en
         }
         <<value>> :  Being a Dog Is a Full-Time Job
      }
      lib:isbn :  0836217462
      lib:character [
         0. {
            <<attr>> {
               id :  PP
            }
            hr:name :  Peppermint Patty
            hr:born :  1966-08-22
            lib:qualification :  bold, brash and tomboyish
         }
         1. {
            <<attr>> {
               id :  Snoopy
            }
            hr:name :  Snoopy
            hr:born :  1950-10-04
            lib:qualification :  extroverted beagle
         }
         2. {
            <<attr>> {
               id :  Schroeder
            }
            hr:name :  Schroeder
            hr:born :  1951-05-30
            lib:qualification :  brought classical music to the Peanuts strip
         }
         3. {
            <<attr>> {
               id :  Lucy
            }
            hr:name :  Lucy
            hr:born :  1952-03-03
            lib:qualification :  bossy, crabby and selfish
         }
      ]
   }
}