Steve Steve - 6 months ago 23
Python Question

Can't decode output from BeautifulSoup in Python

I've been attempting to write a little scraper in Python using BeautifulSoup.
Everything goes smoothly until I attempt to print (or write to a file) the strings contained inside the various HTML elements. The site I'm scraping is http://www.yellowpages.ca/search/si/1/Boots/Montreal+QC, which contains various French characters. For some reason, when I attempt to print the content in the terminal or write it to a file, instead of decoding the string like it's supposed to, I'm getting the raw unicode output.
Here's the script:

from BeautifulSoup import BeautifulSoup as bs
import urllib as ul
##import re

# Scraper script (Python 2, BeautifulSoup 3): follows the category links found
# on the Montreal locations page and prints the title of every listing.
# NOTE(review): the loop bodies below carry no indentation, exactly as posted;
# the nesting has to be inferred from the logic.
base_url = 'http://www.yellowpages.ca'
# Opened in append mode; the only write to it further down is commented out,
# so as posted the file is opened and closed without being written.
data_file = open('yellow_file.txt', 'a')

# readlines() returns a *list* of byte strings, not one string.
data = ul.urlopen(base_url + '/locations/Quebec/Montreal/90014002.html').readlines()

# str() of a list produces the list's repr, so the parser receives text in
# which any non-ASCII byte has already become a literal "\x.." escape.
bt = bs(str(data))

# Second positional argument is matched against the tag's class attribute.
result = bt.findAll('div', 'ypgCategory')

# Re-parses the repr of the result list in order to search only inside the
# matched divs.
bt = bs(str(result))

result = bt.findAll('a')

# One request per category link found above.
for tag in result:
link = base_url + tag['href']
##print str(link)
data = ul.urlopen(link).readlines()

#data = str(data).decode('latin-1')
# Same str(list) conversion as above: accented characters reach the parser as
# the ASCII text "\xc3\xa9" rather than as bytes, so fromEncoding has nothing
# left to decode. This matches the "Qu\xc3\xa9bec" output reported below.
bt = bs(str(data), convertEntities=bs.HTML_ENTITIES, fromEncoding='latin-1')
titles = bt.findAll('span', 'listingTitle')
phones = bt.findAll('a', 'phoneNumber')

# Pairs titles with phone numbers by position; zip stops at the shorter list.
entries = zip(titles, phones)

for title, phone in entries:
#print title.prettify(encoding='latin-1')
#data_file.write(title.text.decode('utf-8') + " " + phone.text.decode('utf-8') + "\n")
print title.text

data_file.close()


/************/

And the output of this is: Projets Autochtones Du Qu\xc3\xa9bec

As you can see, the accented e (é) that's supposed to appear in Québec isn't displaying. I've tried everything mentioned on SO (calling unicode(), passing fromEncoding to the soup, .decode('latin-1')), but I'm getting nowhere.

Any ideas?

Answer

This should be something like what you want:

from BeautifulSoup import BeautifulSoup as bs
import urllib as ul

# Scrape every category linked from the Montreal locations page and record
# each listing's title and phone number, one per line, in yellow_file.txt
# (UTF-8 encoded), echoing each line to the terminal as it is written.
base_url = 'http://www.yellowpages.ca'

# The context manager guarantees the output file is flushed and closed even if
# a request or a parse raises part-way through the crawl.
with open('yellow_file.txt', 'a') as data_file:
    # Pass the response object itself to BeautifulSoup: it reads the raw bytes
    # and detects the encoding, so .text below is proper unicode. Wrapping the
    # data in str(list) instead would turn accented bytes into "\x.." escapes.
    bt = bs(ul.urlopen(base_url + '/locations/Quebec/Montreal/90014002.html'))

    for div in bt.findAll('div', 'ypgCategory'):
        for a in div.findAll('a'):
            link = base_url + a['href']

            # Separate name for the listing page so the index soup being
            # iterated over is not rebound inside its own loop.
            page = bs(ul.urlopen(link), convertEntities=bs.HTML_ENTITIES)

            titles = page.findAll('span', 'listingTitle')
            phones = page.findAll('a', 'phoneNumber')

            # Pairs are matched by position; zip stops at the shorter list.
            for title, phone in zip(titles, phones):
                line = '%s   %s\n' % (title.text, phone.text)
                # Encode exactly once, at the file boundary.
                data_file.write(line.encode('utf-8'))
                # NOTE(review): printing unicode relies on the terminal's
                # encoding; with stdout piped on Python 2 this may raise
                # UnicodeEncodeError. Confirm how the script is run.
                print(line.rstrip())