Showing posts with label namespace. Show all posts
Showing posts with label namespace. Show all posts

Sunday, August 10, 2014

Parse xml with namespace and store data to data dictionary - Python


import sys
import xml.etree.ElementTree as ET

class XmlParser:
    def fixAttrs(self, attrs, c):
        nattrs = {}
        for attr in attrs:
            nattrs[self.buildXmlnsKey(attr, c)] = attrs[attr]
        return nattrs

    def buildXmlnsKey(self, tagtxt, tcounter):
        done = False
        if not tagtxt.startswith("{"):
            return tagtxt
        #print 'Search for: ', tagtxt, ' in ', tcounter
        if self.nsmap.has_key(str(tcounter)):
            nslmap = self.nsmap[str(tcounter)]
            #print 'nslmap-->', nslmap
            for obj in nslmap:
                if done is False:
                    for key in obj.keys():
                        kstr = '{' + key + '}'
                        #print 'key--->', key, ', --->', kstr
                        if tagtxt.startswith(kstr) and done is False:
                            done = True
                            kval = obj[key]
                            #print 'need to replace to: ', kval
                            if len(kval):
                                tagtxt = tagtxt.replace(kstr, kval+':', 1)
                            else:
                                tagtxt = tagtxt.replace(kstr, '', 1)

        if done is False and tcounter > 0:
            tcounter = tcounter - 1
            return self.buildXmlnsKey(tagtxt, tcounter)
        return tagtxt
                        

    def xmlToDict(self, node, dictclass = None):
        if dictclass is None:
            dictclass = {}
        self.ncounter = self.ncounter + 1
        if len(node):        
            if node.attrib:
                #print node.attrib
                dictclass['<<attr>>'] = self.fixAttrs(node.attrib, self.ncounter);            
            for child in node:
                tagtxt = self.buildXmlnsKey(child.tag, self.ncounter)
                newItem = self.xmlToDict(child)
                #tagtxt = child.tag
                if dictclass.has_key(tagtxt):
                    if type(dictclass[tagtxt]) is type([]):
                        dictclass[tagtxt].append(newItem)
                    else:
                        oldItem = dictclass[tagtxt];
                        dictclass[tagtxt] = [];
                        dictclass[tagtxt].append(oldItem);
                        dictclass[tagtxt].append(newItem);
                else:
                    dictclass[tagtxt] = newItem
        else:
            if node.text is None:
                text = ''
            else:
                text = node.text.strip()

            
            if node.attrib:
                #print fixAttrs(node.attrib, ncounter)
                dictclass['<<attr>>'] = self.fixAttrs(node.attrib, self.ncounter)
                dictclass['<<value>>'] = text;
            else:
                dictclass = text;

        return dictclass

    def printDic(self, dic, pos = None):
        if pos is None:
            pos = 0
        for key in dic.keys():
            #print 'key--->', key
            if dic[key] is None:
                print self.getLenStr(pos), key, ''
            elif type(dic[key]) is type({}):
                npos = pos + 1
                print self.getLenStr(pos) + str(key), '{'
                self.printDic(dic[key], npos)
                print self.getLenStr(pos) + '}'
            elif type(dic[key]) is type([]):
                npos = pos + 1
                print self.getLenStr(pos) + str(key), '['
                self.printList(dic[key], npos)
                print self.getLenStr(pos) + ']'
            else:
                print self.getLenStr(pos) + str(key), ': ', dic[key]

    def printList(self, dic, pos = None):
        if pos is None:
            pos = 0
        lindex = -1
        for obj in dic:
            lindex = lindex + 1            
            if obj is not None:
                if type(obj) is type({}):
                    print self.getLenStr(pos) + str(lindex)+'. {'
                    npos = pos + 1
                    self.printDic(obj, npos)
                    print self.getLenStr(pos) + '}'
                elif type(obj) is type([]):
                    print self.getLenStr(pos) + str(lindex)+'. ['
                    npos = pos + 1
                    self.printList(obj, npos)
                    print self.getLenStr(pos) +']'
                else:
                    print self.getLenStr(pos), str(lindex)+'.', obj

    def getLenStr(self, pos):
        sstr = ''
        while pos > 0:
            sstr = sstr + '   '
            pos = pos - 1
        return sstr

    def __init__(self, fileLocation):
        self.location = fileLocation;

    def parse(self):
        tree = ET.parse(self.location)
        root = tree.getroot()
        self.nsmap = {}
        self.lcounter = 0

        for event, elem in ET.iterparse(self.location, events=('start', 'end', 'start-ns', 'end-ns')):
            if event == 'start-ns':
                #print 'start-ns', lcounter
                a, b = elem
                if b is not None and len(b):
                    #print "A-->", a, ", B-->" , b
                    scounter = self.lcounter + 0
                    if not self.nsmap.has_key(str(scounter)):
                        self.nsmap[str(scounter)] = []
                        self.nsmap[str(scounter)].append({'http://www.w3.org/XML/1998/namespace': 'xml'});
                    self.nsmap[str(scounter)].append({b: str(a)})
            elif event == 'start':
                #print 'start', lcounter
                if not self.nsmap.has_key(str(self.lcounter)) and self.nsmap.has_key(str(self.lcounter - 1)):
                    #print 'exist: ', nsmap[str(lcounter - 1)]
                    self.nsmap[str(self.lcounter)] = self.nsmap[str(self.lcounter - 1)];
                self.lcounter = self.lcounter + 1       

        self.ncounter = 0
        self.dic = self.xmlToDict(root)
        self.dic = {self.buildXmlnsKey(root.tag, 0): self.dic}
        return self.dic;

if __name__ == "__main__":
    xmlObj = XmlParser('xml7.xml');
    theXmlDictionary = xmlObj.parse()
    xmlObj.printDic(theXmlDictionary);

Suppose you have the xml as file:


<?xml version="1.0"?>
<lib:library
    xmlns:lib="http://eric.van-der-vlist.com/ns/library"
    xmlns:hr="http://eric.van-der-vlist.com/ns/person">
    <lib:book id="b0836217462" available="true">
        <lib:isbn>0836217462</lib:isbn>
        <lib:title xml:lang="en">Being a Dog Is a Full-Time Job</lib:title>
        <hr:author id="CMS">
            <hr:name>Charles M Schulz</hr:name>
            <hr:born>1922-11-26</hr:born>
            <hr:dead>2000-02-12</hr:dead>
        </hr:author>
        <lib:character id="PP">
            <hr:name>Peppermint Patty</hr:name>
            <hr:born>1966-08-22</hr:born>
            <lib:qualification>bold, brash and tomboyish</lib:qualification>
        </lib:character>
        <lib:character id="Snoopy">
            <hr:name>Snoopy</hr:name>
            <hr:born>1950-10-04</hr:born>
            <lib:qualification>extroverted beagle</lib:qualification>
        </lib:character>
        <lib:character id="Schroeder">
            <hr:name>Schroeder</hr:name>
            <hr:born>1951-05-30</hr:born>
            <lib:qualification>brought classical music to the Peanuts strip</lib:qualification>
        </lib:character>
        <lib:character id="Lucy">
            <hr:name>Lucy</hr:name>
            <hr:born>1952-03-03</hr:born>
            <lib:qualification>bossy, crabby and selfish</lib:qualification>
        </lib:character>
    </lib:book>
    <Purchase>
        <PurchaseId>AAAAA</PurchaseId>
        <PurchaseType>ONLINE</PurchaseType>
    </Purchase>
    <Purchase>
        <PurchaseId>BBBBB</PurchaseId>
        <PurchaseType>OFFLINE</PurchaseType>
    </Purchase>
    <Purchase paid='True'>
        <Purchase age='30'>
            <Purchase>HMM 1</Purchase>
            <Purchase>HMM 2</Purchase>
        </Purchase>
    </Purchase>
</lib:library>

Output would be like this:

lib:library {
   Purchase [
      0. {
         PurchaseId :  AAAAA
         PurchaseType :  ONLINE
      }
      1. {
         PurchaseId :  BBBBB
         PurchaseType :  OFFLINE
      }
      2. {
         <<attr>> {
            paid :  True
         }
         Purchase {
            <<attr>> {
               age :  30
            }
            Purchase [
                0. HMM 1
                1. HMM 2
            ]
         }
      }
   ]
   lib:book {
      hr:author {
         <<attr>> {
            id :  CMS
         }
         hr:name :  Charles M Schulz
         hr:born :  1922-11-26
         hr:dead :  2000-02-12
      }
      <<attr>> {
         available :  true
         id :  b0836217462
      }
      lib:title {
         <<attr>> {
            xml:lang :  en
         }
         <<value>> :  Being a Dog Is a Full-Time Job
      }
      lib:isbn :  0836217462
      lib:character [
         0. {
            <<attr>> {
               id :  PP
            }
            hr:name :  Peppermint Patty
            hr:born :  1966-08-22
            lib:qualification :  bold, brash and tomboyish
         }
         1. {
            <<attr>> {
               id :  Snoopy
            }
            hr:name :  Snoopy
            hr:born :  1950-10-04
            lib:qualification :  extroverted beagle
         }
         2. {
            <<attr>> {
               id :  Schroeder
            }
            hr:name :  Schroeder
            hr:born :  1951-05-30
            lib:qualification :  brought classical music to the Peanuts strip
         }
         3. {
            <<attr>> {
               id :  Lucy
            }
            hr:name :  Lucy
            hr:born :  1952-03-03
            lib:qualification :  bossy, crabby and selfish
         }
      ]
   }
}

Wednesday, August 14, 2013

How to parse XML with php code which contains namespace, cdata, ampersand

Download xml.class.php

Php code to parse xml which contains namespace, cdata and ampersand symbol included


<?php
class XmlToArrayParser {
    /** The array created by the parser can be assigned to any variable: $anyVarArr = $domObj->array.*/
    public $array = array();
    private $parse_error = false;
    private $parser;
    private $pointer;

    /** Constructor: $domObj = new xmlToArrayParser($xml); */
    public function __construct($xml) {
        $xml = str_replace(array('&'), array('&amp;'), $xml);
        $this->pointer =& $this->array;
        $this->parser = xml_parser_create("UTF-8");
        xml_set_object($this->parser, $this);
        xml_parser_set_option($this->parser, XML_OPTION_CASE_FOLDING, false);
        xml_set_element_handler($this->parser, "tag_open", "tag_close");
        xml_set_character_data_handler($this->parser, "cdata");
        $this->parse_error = xml_parse($this->parser, ltrim($xml)) ? false : true;
    }

    /** Free the parser. */
    public function __destruct() {
        xml_parser_free($this->parser);
    }

    /** Get the xml error if an an error in the xml file occured during parsing. */
    public function get_xml_error() {
        if ($this->parse_error) {
            $errCode = xml_get_error_code($this->parser);
            $thisError = "Error Code [" . $errCode . "] \"<strong style='color:red;'>" . xml_error_string($errCode) . "</strong>\",
                            at char " . xml_get_current_column_number($this->parser) . "
                            on line " . xml_get_current_line_number($this->parser) . "";
        } else {
            $thisError = $this->parse_error;
        }
        return $thisError;
    }

    private function tag_open($parser, $tag, $attributes) {
        $this->convert_to_array($tag, 'attrib');
        $idx = $this->convert_to_array($tag, 'cdata');
        if (isset($idx)) {
            $this->pointer[$tag][$idx] = Array(
                '@idx' => $idx,
                '@parent' => &$this->pointer
            );
            $this->pointer =& $this->pointer[$tag][$idx];
        } else {
            $this->pointer[$tag] = Array(
                '@parent' => &$this->pointer
            );
            $this->pointer =& $this->pointer[$tag];
        }
        if (!empty($attributes)) {
            $this->pointer['attrib'] = $attributes;
        }
    }

    /** Adds the current elements content to the current pointer[cdata] array. */
    private function cdata($parser, $cdata) {
        if (strlen(trim($cdata)) > 0) {
            if (isset($this->pointer['cdata'])) {
                $this->pointer['cdata'] .= $cdata;
            } else {
                $this->pointer['cdata'] = $cdata;
            }
        }
    }

    private function tag_close($parser, $tag) {
        $current =& $this->pointer;
        if (isset($this->pointer['@idx'])) {
            unset($current['@idx']);
        }

        $this->pointer =& $this->pointer['@parent'];
        unset($current['@parent']);

        if (isset($current['cdata']) && count($current) == 1) {
            $current = $current['cdata'];
        } else if (empty($current['cdata'])) {
            unset($current['cdata']);
        }
    }

    /** Converts a single element item into array(element[0]) if a second element of the same name is encountered. */
    private function convert_to_array($tag, $item) {
        if (isset($this->pointer[$tag][$item])) {
            $content = $this->pointer[$tag];
            $this->pointer[$tag] = array(
                (0) => $content
            );
            $idx = 1;
        } else if (isset($this->pointer[$tag])) {
            $idx = count($this->pointer[$tag]);
            if (!isset($this->pointer[$tag][0])) {
                foreach ($this->pointer[$tag] as $key => $value) {
                    unset($this->pointer[$tag][$key]);
                    $this->pointer[$tag][0][$key] = $value;
                }
            }
        } else {
            $idx = null;
        }
        return $idx;
    }
}
?>

Parsing XML example


<?php
$xmlString = "<?xml version=\"1.0\" encoding=\"utf-8\"?>
<soap:Envelope xmlns:soap=\"http://www.w3.org/2003/05/soap-envelope\"
    xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"
    xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\">
    <soap:Body>
        <CreateCustomerResponse xmlns=\"https://www.eway.com.au/gateway/managedpayment\">
            <CreateCustomerResult>9876543211000</CreateCustomerResult>
        </CreateCustomerResponse>
    </soap:Body>
    <soap:Body>
        <CreateCustomerResponse xmlns=\"https://www.eway.com.au/gateway/managedpayment\">
            <CreateCustomerResult>9876543211000</CreateCustomerResult>
        </CreateCustomerResponse>
    </soap:Body>
    <address name=\"pritom\">
        <actual content=\"Pritom Kumar\"/>
        <forward forwarding=\"yes\" content=\"pritom\">Forward to pritom & else one</forward>
        <response responding=\"yes\"><![CDATA[Thanks you.]]></response>
    </address>
</soap:Envelope>";

echo "<pre>";
$xmlObject = new XmlToArrayParser($xmlString);
print_r($xmlObject->array);
echo "</pre>";
?>

Output


Array
(
    [soap:Envelope] => Array
        (
            [attrib] => Array
                (
                    [xmlns:soap] => http://www.w3.org/2003/05/soap-envelope
                    [xmlns:xsi] => http://www.w3.org/2001/XMLSchema-instance
                    [xmlns:xsd] => http://www.w3.org/2001/XMLSchema
                )

            [soap:Body] => Array
                (
                    [0] => Array
                        (
                            [CreateCustomerResponse] => Array
                                (
                                    [attrib] => Array
                                        (
                                            [xmlns] => https://www.eway.com.au/gateway/managedpayment
                                        )

                                    [CreateCustomerResult] => 9876543211000
                                )

                        )

                    [1] => Array
                        (
                            [CreateCustomerResponse] => Array
                                (
                                    [attrib] => Array
                                        (
                                            [xmlns] => https://www.eway.com.au/gateway/managedpayment
                                        )

                                    [CreateCustomerResult] => 9876543211000
                                )

                        )

                )

            [address] => Array
                (
                    [attrib] => Array
                        (
                            [name] => pritom
                        )

                    [actual] => Array
                        (
                            [attrib] => Array
                                (
                                    [content] => Pritom Kumar
                                )

                        )

                    [forward] => Array
                        (
                            [attrib] => Array
                                (
                                    [forwarding] => yes
                                    [content] => pritom
                                )

                            [cdata] => Forward to pritom & else one
                        )

                    [response] => Array
                        (
                            [attrib] => Array
                                (
                                    [responding] => yes
                                )

                            [cdata] => Thanks you.
                        )

                )

        )

)