MrKaikev MrKaikev - 2 months ago 30
ASP.NET (C#) Question

scrapy, how to parse AJAX response from asp.net page on POST

I want to look through the companies at: https://www.greg.gg/webCompSearch.aspx

I know that the ASP.NET form needs certain parameters that can be extracted from the page. When I send a POST in scrapy as a `FormRequest`, I also get a response with the additional data. My problem is that the response is only partially HTML, see:

1|#||4|1890|updatePanel|ctl00_updPanel|
<br />
<div id="login">
<div id="ctl00_pnlLogin" onkeypress="javascript:return WebForm_FireDefaultButton(event, &#39;ctl00_btnLogin&#39;)">


So the question is how I could parse the HTML properly.

Here is the minimal scrapy spider as reference:

# -*- coding: utf-8 -*-

import scrapy

class GgTestSpider(scrapy.Spider):
    """Minimal spider that drives the ASP.NET company search form.

    Flow: ``parse`` loads the search page and posts the search form as an
    asynchronous (UpdatePanel) postback; ``_parse_items`` reads the partial
    response and posts one follow-up request per result entry;
    ``_parse_company`` is the (still empty) handler for those follow-ups.
    """

    name = 'gg_test'
    allowed_domains = ['www.greg.gg']
    base_url = 'https://www.greg.gg/webCompSearch.aspx'
    start_urls = [base_url]
    custom_settings = {
        # Every POST goes to the same URL, so use the no-op dupe filter.
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    # Hidden ASP.NET state fields that must be echoed back on every postback.
    _STATE_FIELDS = ('__EVENTVALIDATION', '__VIEWSTATE', '__PREVIOUSPAGE')

    @staticmethod
    def _hidden_field(response, field):
        """Return the value of hidden form field ``field``, or ``None``.

        A full page carries the field as an element with a matching ``id``.
        An async postback response does not: the hidden fields are expected
        to arrive as ``|hiddenField|<name>|<value>|`` records of the
        pipe-delimited delta format, so that is tried as a fallback.
        NOTE(review): delta layout assumed from the ASP.NET AJAX
        partial-rendering format -- confirm against a captured response.
        """
        value = response.xpath(
            '//*[@id="{0}"]/@value'.format(field)).extract_first()
        if value is None:
            import re  # local import: only needed for the delta fallback
            match = re.search(
                r'\|hiddenField\|' + re.escape(field) + r'\|([^|]*)\|',
                response.text)
            if match:
                value = match.group(1)
        return value

    def parse(self, response):
        """Submit the search form as an async postback.

        Collects the ASP.NET state fields from the loaded page, adds the
        search parameters and returns a single ``FormRequest`` whose
        response is handled by ``_parse_items``. The form data is also
        passed along in ``meta['fdat']`` for reuse by the next callback.
        """
        fdat = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__ASYNCPOST': "true",
            'ctl00$ScriptManager2': "ctl00$cntPortal$updPanel|ctl00$cntPortal$btnSearch",
            'ctl00$cntPortal$radSearchType': "radStartsWith",
            'ctl00$cntPortal$chkPrevNames': "on",
            'ctl00$cntPortal$ddlRegister': "0",
            'ctl00$cntPortal$btnSearch': "Search",
        }
        # grep ASP.NET elements out of response
        for field in self._STATE_FIELDS:
            fdat[field] = self._hidden_field(response, field)

        # id to search
        fdat['ctl00$cntPortal$txtCompRegNum'] = "1"

        return scrapy.FormRequest.from_response(
            response,
            headers={
                'Referer': self.base_url,
                'X-MicrosoftAjax': 'Delta=true',
            },
            formdata=fdat,
            meta={'fdat': fdat},
            callback=self._parse_items,
        )

    def _parse_items(self, response):
        """Yield one follow-up POST per search result entry.

        Reads the result inputs from the partial response, refreshes the
        ASP.NET state fields and yields a ``FormRequest`` per entry, each
        handled by ``_parse_company``.
        """
        company_item = response.xpath(
            '//input[contains(@id, "ctl00$cntPortal$grdSearchResults$ctl")]/@value').extract()

        # Logger call instead of the Python 2 ``print`` statement, so the
        # module also parses on Python 3.
        self.logger.debug(
            "no data: %s %s %s %s",
            response.request.headers, response.meta['fdat'],
            company_item, response.xpath('/'))

        # Work on a copy so the dict shared through ``meta`` is not mutated.
        fdat = dict(response.meta['fdat'])
        for field in self._STATE_FIELDS:
            fdat[field] = self._hidden_field(response, field)
        # A plain FormRequest cannot encode ``None``; leave out any field
        # that has no value.
        fdat = dict((k, v) for k, v in fdat.items() if v is not None)

        # 'company_extra_id' is never set by ``parse``; ``.get`` avoids the
        # KeyError the direct lookup raised and forwards ``None`` instead.
        company_extra_id = response.meta.get('company_extra_id')

        # give as input to form (POST) to get redirected
        for i in company_item:
            form = dict(fdat)
            form['ctl00$ScriptManager2'] = 'ctl00$cntPortal$updPanel|{0}'.format(i)
            yield scrapy.FormRequest(
                url=self.base_url,
                formdata=form,
                meta={'company_extra_id': company_extra_id},
                callback=self._parse_company,
            )

    def _parse_company(self, response):
        """Handle a company detail response (not implemented yet)."""
        pass


Thanks in advance!

EDIT: I changed the title of the question from "how to get the full HTML as displayed in the browser" to "how to actually parse the partial HTML that is returned by the POST".

Answer

Using selectors

# Build a selector over the partial HTML returned by the POST.
# ``Selector(text=...)`` expects decoded text, so pass ``response.text``
# rather than the raw bytes in ``response.body``.
response_data = scrapy.Selector(text=response.text)
# this will give you selector object
# you should be able to use .xpath and .css on response_data