Python Question

UnicodeWarning: Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER

I am using Python + bs4 + PySide. Please take a look at the relevant part of the code below:

#coding:gb2312
import urllib2
import sys
import urllib
import urlparse
import random
import time
from datetime import datetime, timedelta
import socket
from bs4 import BeautifulSoup
import lxml.html
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *

# This is a method of my Downloader class (the rest of the class is not shown here).
def download(self, url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers or {})
    opener = self.opener or urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except Exception as e:
        print 'Download error:', str(e)
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return self._get(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return {'html': html, 'code': code}

def crawling_hdf(openfile):
    filename = open(openfile, 'r')
    namelist = filename.readlines()
    app = QApplication(sys.argv)
    for name in namelist:
        url = "http://so.haodf.com/index/search?type=doctor&kw=" + urllib.quote(name)
        # get doctor's home page
        D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=None, num_retries=DEFAULT_RETRIES, cache=None)
        html = D(url)
        soup = BeautifulSoup(html)
        tr = soup.find(attrs={'class': 'docInfo'})
        td = tr.find(attrs={'class': 'docName font_16'}).get('href')
        print td
        # get doctor's detail information page
        loadPage_bs4(td)

    filename.close()

if __name__ == '__main__':
    crawling_hdf("name_list.txt")


After I run the program, the following warning is shown:

Warning (from warnings module):
File "C:\Python27\lib\site-packages\bs4\dammit.py", line 231
"Some characters could not be decoded, and were "
UnicodeWarning: Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


I have used print str(html) and found that all the Chinese text inside the tags is garbled.

I have tried the "decode/encode" and "gzip" solutions that I found by searching this site, but they don't work in my case.
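
For reference, here is a minimal sketch of how I can check what encoding is being reported (this is not part of the script above, and the kw=test query string is only an example):

import urllib2
from bs4 import BeautifulSoup

url = "http://so.haodf.com/index/search?type=doctor&kw=test"
response = urllib2.urlopen(url)
# charset the server declares in its headers, if any
print response.info().getheader('Content-Type')

html = response.read()
soup = BeautifulSoup(html)
# encoding that bs4 actually guessed for the page
print soup.original_encoding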

Thank you very much for your help!

Answer

It looks like that page is encoded in GBK, but BeautifulSoup is decoding its bytes with the wrong codec, so every byte sequence it cannot decode is replaced with U+FFFD (the REPLACEMENT CHARACTER mentioned in the warning).

I've seen this workaround used before (it assumes html is text that has already been mis-decoded as Latin-1); try:

html.encode('latin-1').decode('gbk').encode('utf-8')
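
In your script, though, html coming back from the Downloader is the raw byte string returned by response.read(), so it may be simpler to decode those bytes as GBK yourself, or to tell BeautifulSoup the encoding up front. A minimal sketch, assuming the page really is GBK (the kw=test query is only an example):

import urllib2
from bs4 import BeautifulSoup

url = "http://so.haodf.com/index/search?type=doctor&kw=test"
html = urllib2.urlopen(url).read()      # raw bytes as served by the site

# Option 1: decode the bytes yourself before parsing
text = html.decode('gbk', 'replace')    # unicode string; 'replace' avoids hard failures
soup = BeautifulSoup(text)

# Option 2: hand BeautifulSoup the bytes and tell it the encoding explicitly
soup = BeautifulSoup(html, from_encoding='gbk')
print soup.original_encoding            # reports the encoding bs4 used

Either way, the Chinese text in the tags should come out readable instead of being replaced with U+FFFD.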