Darshan Deshmukh Darshan Deshmukh - 20 days ago 6
Python Question

Formatting the output with html2text library

I need to retrieve HTML table data (its rows and columns) from an API and share it with other teams.

import requests
import json
import html2text
#from bs4 import BeautifulSoup

# Fetch one content item from the API and print the raw HTML of its body.
# NOTE(review): the bearer token is hard-coded in the source; load it from an
# environment variable or a secret store before sharing this script.
headers = {
    'Authorization': 'Bearer hmy0w2ltszfxeysnq8cbjzfcyr4kzfk5k9a0vfca.t',
    'Content-Type': 'application/json',
}
# No request body is sent: this is a plain GET. The timeout keeps the script
# from hanging forever if the server never answers.
response = requests.get(
    'https://sandbox.jiveon.com/api/core/v3/contents/436669',
    headers=headers,
    timeout=30,
)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page as JSON.
response.raise_for_status()
data = response.json()
print(data['content']['text'])


To convert the HTML to text, I use html2text:

# Convert the HTML body fetched above into plain text with html2text.
# The converter is called `converter` (not `format`) so that the builtin
# format() is not shadowed.
converter = html2text.HTML2Text()
converter.ignore_links = True
# False keeps html2text's own table rendering instead of passing the raw
# <table> markup through.
converter.bypass_tables = False
converter.wrap_links = True
converter.ignore_images = True
converter.ignore_emphasis = True
print(converter.handle(data['content']['text']))


The output of the above code is:

<body><!-- [DocumentBodyStart:756f88b6-eed4-4030-ada9-f74dc8e4418b] --><div class="jive-rendered-content"><p>DB Release&#160;</p><p style="min-height: 8pt; padding: 0px;">&#160;</p><div class="j-rte-table"><table class="j-table jiveBorder" style="border: 1px solid #c6c6c6;" width="100%"><thead><tr style="background-color: #efefef;"><th style="width: 11%;">Release Version</th><th style="width: 10%;">REFDB_ID</th><th style="width: 160%;">SVN URL</th></tr></thead><tbody><tr><td style="width: 11%;">3.7.3</td><td style="width: 10%;"><p style="background-color: #ffffff; border: 0px; padding: 0px;">3710002</p><p style="background-color: #ffffff; border: 0px; padding: 0px;">3710003 <br/>3710005 <br/>3710007 <br/>3710009<br/>3710011</p></td><td style="width: 160%;"><p style="background-color: #ffffff; border: 0px; padding: 0px;"><a class="jive-link-external-small" href="http://svnurl.com" rel="nofollow">http://svnurl1.com&#160;</a></p><p style="background-color: #ffffff; border: 0px; padding: 0px;"><a class="jive-link-external-small" href="http://svnurl2.com" rel="nofollow">http://svnurl2.com</a></p></td></tr></tbody></table></div></div><!-- [DocumentBodyEnd:756f88b6-eed4-4030-ada9-f74dc8e4418b] --></body>

DB Release

Release Version| REFDB_ID| SVN URL
---|---|---
3.7.3|

3710002

3710003
3710005
3710007
3710009
3710011

|

http://svnurl1.com

http://svnurl2.com


Whereas my expected output is
enter image description here

Answer

I found a solution that filters the data based on a command-line argument.

import requests
import json
import sys
from bs4 import BeautifulSoup
from sys import argv
from xml.etree import ElementTree as ET


API_URL = 'https://sandbox.jiveon.com/api/core/v3/contents/436669'
# Release version used when none is given on the command line.
DEFAULT_VERSION = '3.7.4'

# NOTE(review): the bearer token is hard-coded in the source; load it from an
# environment variable or a secret store before sharing this script.
headers = {
    'Authorization': 'Bearer hmy0w2ltszfxeysnq8cbjzfcyr4kzfk5k9a0vfca.t',
    'Content-Type': 'application/json',
}


def fetch_content_html(url, request_headers):
    """Return the HTML stored under ``content.text`` of the document at *url*.

    Raises ``requests.HTTPError`` when the server answers with a 4xx/5xx
    status, so an error page is never parsed as if it were the document.
    """
    # Plain GET without a body; the timeout prevents an indefinite hang.
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()
    return response.json()['content']['text']


def release_row_values(html_doc, version):
    """Return the cell values of every table row that belongs to *version*.

    A row is selected when the stripped text of one of its ``<td>`` cells
    equals *version* exactly. Matching on the cell text (rather than on the
    row's raw HTML) avoids false hits in attributes and on longer versions
    such as ``3.7.40``.

    Every text fragment inside a selected cell becomes its own list entry,
    so values separated by ``<p>`` or ``<br/>`` (several IDs or URLs in one
    cell) are not glued together. Fragments are whitespace-stripped, which
    also removes non-breaking spaces.
    """
    soup = BeautifulSoup(html_doc, 'html.parser')
    values = []
    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        if not any(cell.get_text(strip=True) == version for cell in cells):
            continue
        for cell in cells:
            values.extend(cell.stripped_strings)
    return values


def main(args):
    """Print, one per line, the values of the rows for the requested version.

    *args* is an ``sys.argv``-style list; ``args[1]`` is the release version
    to look up. Without it, ``DEFAULT_VERSION`` is used.
    """
    version = args[1] if len(args) > 1 else DEFAULT_VERSION
    html_doc = fetch_content_html(API_URL, headers)
    for value in release_row_values(html_doc, version):
        print(value)


if __name__ == '__main__':
    main(sys.argv)