vpd vpd - 17 days ago 13
Python Question

Parsing anchor tag and tbody with ElementTree

This is a follow-up to my earlier question, "Iterate over python dictionary to retrieve only required rows".

My HTML is formatted by an external application, as shown in the string `s` below.
When I process this HTML input with the code below:

from xml.etree import ElementTree as ET

# Sample markup from the question: the header row sits inside <thead> and the
# two data rows inside <tbody>, so no <tr> is a direct child of <table>.
# The REFDB and URL cells each hold several <p> entries rather than plain text.
s = """<table class="test" style="border: 1px solid #c6c6c6;" width="100%">
<thead>
<tr>
<th style="background-color: #efefef; width: 13%;">
Release
</th>
<th style="background-color: #efefef; width: 23.7965%;">
REFDB
</th>
<th style="background-color: #efefef; width: 59.2035%;">
URL
</th>
</tr>
</thead>
<tbody>
<tr>
<td style="width: 13%;">
3.7.3
</td>
<td style="width: 23.7965%;">
<p>
12345
</p>
<p>
232323
</p>
<p>
4343454
</p>
<p>
5454554
</p>
</td>
<td style="width: 59.2035%;">
<p>
<a class="jive-link-external-small" href="http://google.com" rel="nofollow">
http://google.com
</a>
</p>
<p>
<a class="jive-link-external-small" href="http://test123.com" rel="nofollow">
http://test123.com
</a>
</p>
<p>
<a class="jive-link-external-small" href="http://www.yahoo.com" rel="nofollow">
http://www.yahoo.com
</a>
</p>
<p>
<a class="jive-link-external-small" href="http://www.test.com" rel="nofollow">
http://www.test.com
</a>
</p>
</td>
</tr>
<tr>
<td style="width: 13%;">
3.7.4
</td>
<td style="width: 23.7965%;">
<p>
456789
</p>
<p>
545454
</p>
<p>
5454545
</p>
<p>
545454
</p>
</td>
<td style="width: 59.2035%;">
<p>
<a class="jive-link-external-small" href="http://foo.com" rel="nofollow">
http://foo.com
</a>
</p>
<p>
<a class="jive-link-external-small" href="http://www.yahoo.com" rel="nofollow">
http://www.yahoo.com
</a>
</p>
<p>
<a class="jive-link-external-small" href="http://svn.com" rel="nofollow">
http://svn.com
</a>
</p>
<p>
<a class="jive-link-external-small" href="http://test.com" rel="nofollow">
http://test.com
</a>
</p>
</td>
</tr>
</tbody>
</table>
"""
def find_version(ver):
    """Return the dict whose 'Release' entry equals ``ver``, or None.

    The first child of the parsed root supplies the dict keys (the ``.text``
    of each of its children); every later child of the root supplies one
    candidate list of values in the same way.
    """
    children = iter(ET.XML(s))
    # Iterating an Element yields its direct children only, so for markup
    # that wraps rows in <thead>/<tbody> these are the wrappers, not <tr> rows.
    headers = [cell.text for cell in next(children)]
    for child in children:
        out = dict(zip(headers, (cell.text for cell in child)))
        # Plain indexing: raises KeyError when no 'Release' key was collected.
        if out['Release'] == ver:
            return out
    return None

# Look up release 3.7.3 and print every (column, value) pair as
# "column - value", or report that nothing matched.
res = find_version('3.7.3')
if not res:
    print ('Version not found')
else:
    for pair in res.items():
        print(' - '.join(pair))


I get the output below:

trs: [<Element 'th' at 0x0431CDE0>, <Element 'th' at 0x0431CE40>, <Element 'th' at 0x0431CEA0>]
ths: []
tds: []
out: OrderedDict()
Traceback (most recent call last):
File "parse_html.py", line 141, in <module>
res = find_version(ver)
File "parse_html.py", line 136, in find_version
if out['Release'] == ver:
KeyError: 'Release'


whereas my expected output is:

Release - 3.7.3
URL - http://google.com
REFDB - 12345

Answer

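No detailed commentary - see the code and the print() results below. In short: iterating the `<table>` element yields only its direct children (wrappers such as `<tbody>`), not the `<tr>` rows, so the rows have to be located beneath the wrapper, and `itertext()` is used to collect text nested inside child tags such as `<a>`.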

from xml.etree import ElementTree as ET

# Simplified sample table: the header row and both data rows are all <tr>
# children of a single <tbody>; each URL cell wraps its text in an <a> tag.
s = '''<table>
    <tbody>
        <tr>
            <th>Release</th>
            <th>REFDB</th>
            <th>URL</th>
        </tr>
        <tr>
            <td>3.7.3</td>
            <td>12345</td>
            <td><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a>
            </td>
        </tr>
        <tr>
            <td>3.7.4</td>
            <td>456789</td>
            <td><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a>
            </td>
        </tr>
    </tbody>
</table>'''

# --- functions ---

def _cell_text(cell):
    """Return all text inside ``cell``, nested tags included, stripped."""
    return " ".join(cell.itertext()).strip()


def find_version(ver, source=None):
    """Return the table row whose 'Release' column equals ``ver``.

    Args:
        ver: Release string to look for, e.g. ``'3.7.3'``.
        source: Optional table markup to parse. When omitted, the
            module-level sample ``s`` is parsed, as before.

    Returns:
        A dict mapping each header label to the text of the matching cell
        for the first row whose 'Release' value equals ``ver``; ``None``
        when no row matches or the table contains no rows.

    Raises:
        xml.etree.ElementTree.ParseError: if the markup is not well-formed.
    """
    table = ET.XML(s if source is None else source)

    # Collect every <tr> in document order wherever it sits (<thead>,
    # <tbody>, or directly under <table>) instead of assuming exactly one
    # wrapper level. Element.getchildren() was removed in Python 3.9, so
    # plain iteration / Element.iter() is used throughout.
    trs = list(table.iter('tr'))
    print('trs:', trs)
    if not trs:
        return None

    # Header labels may be surrounded by newlines in generated markup,
    # so they are stripped the same way as the data cells.
    ths = [_cell_text(th) for th in trs[0]]
    print('ths:', ths)

    for tr in trs[1:]:
        tds = [_cell_text(td) for td in tr]
        print('tds:', tds)

        out = dict(zip(ths, tds))
        print('out:', out)

        # .get() keeps a table without a 'Release' column from raising
        # KeyError; such rows simply never match.
        if out.get('Release') == ver:
            return out

    return None

# --- main ---

# Look up release 3.7.3 and print each column of the matching row as
# "column - value", or report that nothing matched.
res = find_version('3.7.3')

if not res:
    print ('Version not found')
else:
    for column, value in res.items():
        print(column, '-', value)

Result

trs: [<Element 'tr' at 0x7f26d73005e8>, <Element 'tr' at 0x7f26d7300e08>, <Element 'tr' at 0x7f26d7300868>]
ths: ['Release', 'REFDB', 'URL']
tds: ['3.7.3', '12345', 'http://google.com']
out: {'URL': 'http://google.com', 'REFDB': '12345', 'Release': '3.7.3'}
URL - http://google.com
REFDB - 12345
Release - 3.7.3