Debaditya Debaditya - 3 months ago 35
Python Question

Parse hierarchical XML tags

I need to parse hierarchical tags from an XML document and print each tag's label value as a path, in the format shown under Output below.

Input

<doc>
<pid id="231">
<label key="">Electronics</label>
<desc/>
<cid id="122">
<label key="">TV</label>
</cid>
<desc/>
<cid id="123">
<label key="">Computers</label>
<cid id="12433">
<label key="">Lenovo</label>
</cid>
<desc/>
<cid id="12434">
<label key="">IBM</label>
<desc/>
</cid>
<cid id="12435">
<label key="">Mac</label>
</cid>
<desc/>
</cid>
</pid>
<pid id="7764">
<label key="">Music</label>
<desc/>
<cid id="1224">
<label key="">Play</label>
<desc/>
<cid id="341">
<label key="">PQR</label>
</cid>
<desc/>
</cid>
<cid id="221">
<label key="">iTunes</label>
<cid id="341">
<label key="">XYZ</label>
</cid>
<desc/>
<cid id="515">
<label key="">ABC</label>
</cid>
<desc/>
</cid>
</pid>
</doc>


Output

Electronics/
Electronics/TV
Electronics/Computers/Lenovo
Electronics/Computers/IBM
Electronics/Computers/Mac
Music/
Music/Play/PQR
Music/iTunes/XYZ
Music/iTunes/ABC


What I have tried (in Python)

import xml.etree.ElementTree as ET
import os
import sys
import string

def perf_func(elem, func, level=0):
    """Apply ``func`` to ``elem`` and to every descendant, depth first.

    Args:
        elem: The element to start from. Iterating it must yield its direct
            children, as ``xml.etree.ElementTree.Element`` does.
        func: Callable invoked as ``func(element, level)`` for each element,
            parent before children.
        level: Depth reported for ``elem``; each generation of children is
            reported one level deeper.
    """
    func(elem, level)
    # Iterate the element directly: Element.getchildren() was deprecated and
    # then removed in Python 3.9, while plain iteration works everywhere.
    for child in elem:
        perf_func(child, func, level + 1)

def print_level(elem, level):
    """Print the tag of ``elem`` prefixed by one dash per nesting level.

    Args:
        elem: Element whose ``tag`` attribute is printed.
        level: Number of ``'-'`` characters written before the tag.
    """
    # The parenthesised single-argument form prints the same line under
    # both the Python 2 print statement and the Python 3 print function.
    print('-' * level + elem.tag)

# Parse the file once and keep the tree object, so that the root lookup
# has a defined name to work from.
tree = ET.parse('Products.xml')
root = tree.getroot()
perf_func(root, print_level)

# Added find logic
# `root` is the <doc> element itself, so search for its <pid> children;
# findall('doc') would look for <doc> elements nested inside <doc> and
# match nothing.
for n in root.findall('pid'):
    label = n.find('label').text
    print(label)


With the above code, I am able to get the nodes and their levels (just the tags, not their values), and also the first-level labels.
I need some suggestions (Perl or Python) on how to proceed to get the hierarchical structure in the format shown under Output.

Answer

We are going to use three pieces: find all of the <label> elements in the order in which they occur, get the depth of each one, and build a breadcrumb based on the depth and order. The code below uses lxml and expects the XML text in the string xml_str.

from lxml import etree
# Parse the XML text held in `xml_str` (not defined in this snippet; it must
# be supplied before this line runs) and keep the resulting root element.
xml = etree.fromstring(xml_str)
# XPath '//label' selects every <label> element anywhere in the document.
elems = xml.xpath(r'//label')  # xpath expression to find all '<label ...>' elements

def get_depth(element):
    """Return the number of ancestors ``element`` has.

    The count is obtained by following ``getparent()`` links until one of
    them returns ``None``; an element without a parent has depth 0.
    """
    levels = 0
    node = element
    while True:
        node = node.getparent()
        if node is None:
            return levels
        levels += 1

def reduce_by_depth(element_list):
    """Build a '/'-joined breadcrumb for each element, in input order.

    A running trail keeps, for every depth, the text of the element most
    recently seen at that depth.  When a new element is entered it replaces
    the trail entry at its own depth and drops every deeper entry, so the
    trail always describes the path to the current element.  This is
    intended for elements listed in the order they occur in the document.

    Args:
        element_list: Iterable of elements.  Each one is passed to
            ``get_depth`` and must expose a ``text`` attribute.

    Returns:
        A list with one breadcrumb string per element.  Levels whose text
        is empty or ``None`` contribute no path segment.
    """
    crumbs = []
    trail = []  # trail[d] is the text last seen at depth d ('' if none)
    for elem in element_list:
        depth = get_depth(elem)
        # Drop this level and everything deeper, then pad any levels that
        # were skipped so that the new text lands at index == depth.
        # Growing the list on demand removes the former fixed limit of 10
        # levels, which raised IndexError on more deeply nested documents.
        del trail[depth:]
        trail.extend([''] * (depth - len(trail)))
        trail.append(elem.text)
        # join all the non-empty strings to get the breadcrumb
        crumbs.append('/'.join([part for part in trail if part]))
    return crumbs

# Compute the list of breadcrumbs for the <label> elements held in `elems`.
reduce_by_depth(elems)

# output:
['Electronics',
 'Electronics/TV',
 'Electronics/Computers',
 'Electronics/Computers/Lenovo',
 'Electronics/Computers/IBM',
 'Electronics/Computers/Mac',
 'Music',
 'Music/Play',
 'Music/Play/PQR',
 'Music/iTunes',
 'Music/iTunes/XYZ',
 'Music/iTunes/ABC']