user61629 user61629 - 2 months ago 43
Python Question

Dynamically setting scrapy request call back

I'm working with scrapy. I want to rotate proxies on a per request basis and get a proxy from an api I have that returns a single proxy. My plan is to make a request to the api, get a proxy, then use it to set the proxy based on :

http://stackoverflow.com/questions/39430454/making-request-to-api-from-within-scrapy-function


I have the following:

class ContactSpider(Spider):
name = "contact"

def parse(self, response):

....
PR = Request(
'my_api'
headers=self.headers,
meta={'newrequest': Request(url_to_scrape, headers=self.headers),},
callback=self.parse_PR
)
yield PR


def parse_PR(self, response):
newrequest = response.meta['newrequest']
proxy_data = response.body
newrequest.meta['proxy'] = proxy_data
newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest.replace(callback= self.form_output) #TESTING

yield newrequest

def form_output(self, response):
open_in_browser(response)


but I'm getting:

Traceback (most recent call last):
File "C:\twisted\internet\defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "C:\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "C:\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "C:\scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "C:\scrapy\core\downloader\handlers\http11.py", line 60, in download_request
return agent.download_request(request)
File "C:\scrapy\core\downloader\handlers\http11.py", line 255, in download_request
agent = self._get_agent(request, timeout)
File "C:\scrapy\core\downloader\handlers\http11.py", line 235, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "C:\scrapy\core\downloader\webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "C:\scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
host = b(parsed.hostname)
File "C:\scrapy\core\downloader\webclient.py", line 17, in <lambda>
b = lambda s: to_bytes(s, encoding='ascii')
File "C:\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got NoneType


what am I doing wrong?

edit: changed code to:

def parse_PR(self, response):
newrequest = response.meta['newrequest']
proxy_data = response.body
newrequest.meta['proxy'] = proxy_data
newrequest = newrequest.replace(url = 'http://ipinfo.io/') #TESTING
newrequest = newrequest.replace(callback= self.form_output) #TESTING
print (newrequest)
yield newrequest


Now getting:

2016-09-19 21:28:47 [scrapy] ERROR: Error downloading <GET http://ipinfo.io/>
Traceback (most recent call last):
File "D:\ENVS\virtalenvs\contactD\lib\site-packages\twisted\internet\defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "D:\ENVS\virtalenvs\contactD\lib\site-packages\twisted\python\failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "c:scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "c:scrapy\utils\defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "c:scrapy\core\downloader\handlers\__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "c:scrapy\core\downloader\handlers\http11.py", line 60, in download_request
return agent.download_request(request)
File "c:scrapy\core\downloader\handlers\http11.py", line 255, in download_request
agent = self._get_agent(request, timeout)
File "c:scrapy\core\downloader\handlers\http11.py", line 235, in _get_agent
_, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
File "c:scrapy\core\downloader\webclient.py", line 37, in _parse
return _parsed_url_args(parsed)
File "c:scrapy\core\downloader\webclient.py", line 20, in _parsed_url_args
host = b(parsed.hostname)
File "c:scrapy\core\downloader\webclient.py", line 17, in <lambda>
b = lambda s: to_bytes(s, encoding='ascii')
File "c:scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got NoneType

Answer

The stacktrace info suggests Scrapy has encountered a request object whose url is None, which is expected to be of string type.

These two lines in your code:

newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest.replace(callback= self.form_output) #TESTING

would not work as expected, since method Request.replace returns a new instance instead of modifying the original request in-place.

You would need something like this:

newrequest = newrequest.replace(url = 'http://ipinfo.io/ip') #TESTING
newrequest = newrequest.replace(callback= self.form_output) #TESTING

or simply:

newrequest = newrequest.replace(
    url='http://ipinfo.io/ip',
    callback=self.form_output
)