Astrophe Astrophe - 1 month ago 7
Python Question

Python 2.7 - search for a particular URL on a webpage

I have to retrieve a URL that is nested in a webpage. I have tried the following code but it does not find the URL of the main link (to a PDF).

import urllib2
from bs4 import BeautifulSoup

url = "http://www.cmc.gv.ao/sites/main/pt/Paginas/genericFileList.aspx?mid=9&smid=69&FilterField1=TipoConteudo_x003A_Code&FilterValue1=ENTREG"

conn = urllib2.urlopen(url)
html = conn.read()

soup = BeautifulSoup(html)
links = soup.find_all('a')

for tag in links:
link = tag.get('href',None)
if link is not None:
print link


The URL I would like to find is the main link on the web page:

http://www.cmc.gv.ao/sites/main/pt/Lists/CMC%20%20PublicaesFicheiros/Attachments/89/Lista%20de%20Institui%C3%A7%C3%B5es%20Registadas%20(actualizado%2024.10.16).pdf

In the bs4 documents it says that the find_all() method looks through a tag's descendants (direct children, children of direct children and so on) and retrieves all descendants that match your filters.

How do I get the URL from the webpage?

Answer

The pdf path is retrieved using an ajax request, you need to do a bit of work to mimic the request:

import urllib2

from bs4 import BeautifulSoup
import re

url = "http://www.cmc.gv.ao/sites/main/pt/Paginas/genericFileList.aspx?mid=9&smid=69&FilterField1=TipoConteudo_x003A_Code&FilterValue1=ENTREG"

conn = urllib2.urlopen(url)
html = conn.read()

# SharePoint REST endpoint; the list GUID parsed below is substituted for {}.
attach = "http://www.cmc.gv.ao/sites/main/pt/_api/web/lists/getbyid('{}')/items(89)/AttachmentFiles"

# Name the parser explicitly so bs4 does not warn and behavior does not
# depend on which parsers happen to be installed.
soup = BeautifulSoup(html, "html.parser")

# The GUID is contained inside a script tag; this pattern pulls what we need.
# The line we are after looks like:
#   ctx.editFormUrl = "http://.../listform.aspx?PageType=6&ListId=%7B...%7D";
# Raw string + escaped dots so the literal "ctx.editFormUrl" is matched exactly.
patt = re.compile(r'ctx\.editFormUrl\s+=\s+"(.*?)"')

# Find the script element whose text mentions ctx.editFormUrl.
scr = soup.find("script", text=re.compile("ctx.editFormUrl"))

# Extract the full editFormUrl value; we only need the trailing ListId.
ctx = patt.search(scr.text).group(1)

# The ListId is the last "="-separated piece; drop it into the REST URL
# and fetch the attachment listing.
soup2 = BeautifulSoup(urllib2.urlopen(attach.format(ctx.rsplit("=", 1)[-1])).read(), "html.parser")

# The response is XML; the pdf path is the text node starting with /sites/main/pt/List.
pdf_path = soup2.find(text=re.compile("^/sites/main/pt/List"))

Then you need to join to the base url:

from urlparse import urljoin

# Resolve the site-relative attachment path against the site root.
base = "http://www.cmc.gv.ao"
full_url = urljoin(base, pdf_path)
print(full_url)

Because the path contains non-ASCII characters, we also need to encode it to UTF-8 and percent-quote it:

from urllib import quote
from urlparse import urljoin

# Percent-encode the UTF-8 bytes of the path so its non-ASCII
# characters survive, then anchor the result to the site root.
encoded_path = quote(pdf_path.encode("utf-8"))
full_url = urljoin("http://www.cmc.gv.ao", encoded_path)

And finally to write:

from os.path import basename
from urllib import quote
from urlparse import urljoin
import shutil

# Build the final, percent-encoded URL.
full_url = urljoin("http://www.cmc.gv.ao", quote(pdf_path.encode("utf-8")))

# Save under the attachment's own file name. copyfileobj streams the
# response to disk in fixed-size chunks; iterating the response with
# writelines splits on newline bytes, which is meaningless for a binary PDF.
resp = urllib2.urlopen(full_url)
try:
    with open(basename(pdf_path.encode("utf-8")), "wb") as f:
        shutil.copyfileobj(resp, f)
finally:
    # urllib2 responses are not context managers in Python 2 — close explicitly.
    resp.close()

Which will give you a pdf file called Lista de Instituições Registadas (actualizado 24.10.16).pdf

If you use requests, it does a lot of the work for you:

import requests
from bs4 import BeautifulSoup
import re
from urlparse import urljoin
from os.path import basename

url = "http://www.cmc.gv.ao/sites/main/pt/Paginas/genericFileList.aspx?mid=9&smid=69&FilterField1=TipoConteudo_x003A_Code&FilterValue1=ENTREG"

# requests handles connection management and (below) quoting of the
# non-ASCII characters in the final URL for us.
html = requests.get(url).content

# SharePoint REST endpoint; {} is filled with the list GUID parsed below.
attach = "http://www.cmc.gv.ao/sites/main/pt/_api/web/lists/getbyid('{}')/items(89)/AttachmentFiles"

soup = BeautifulSoup(html, "html.parser")

# The GUID is embedded in a script tag as part of ctx.editFormUrl;
# raw string + escaped dots so the literal name is matched exactly.
patt = re.compile(r'ctx\.editFormUrl\s+=\s+"(.*?)"')
scr = soup.find("script", text=re.compile("ctx.editFormUrl"))

ctx = patt.search(scr.text).group(1)

# The ListId is the last "="-separated piece of the editFormUrl.
soup2 = BeautifulSoup(requests.get(attach.format(ctx.rsplit("=", 1)[-1])).content, "html.parser")

# The XML response contains the site-relative attachment path.
pdf_path = soup2.find(text=re.compile("/sites/main/pt/List"))

full_url = urljoin("http://www.cmc.gv.ao", pdf_path.encode("utf-8"))

# Write the raw body in one go: .content is the documented full-response
# bytes, whereas iterating the Response yields arbitrary small chunks.
with open(basename(pdf_path.encode("utf-8")), "wb") as f:
    f.write(requests.get(full_url).content)
Comments