JT28 JT28 - 1 year ago 61
Ajax Question

Not able to scrape more than 10 records using Scrapy

I'm new to scrapy and python. I'm using scrapy for scraping the data.

The site uses AJAX for pagination, so I'm not able to get more than 10 records. I'm posting my code below:

from scrapy import Spider
from scrapy.selector import Selector
from scrapy import Request
from justdial.items import JustdialItem
import csv
from itertools import izip
import scrapy
import re

class JustdialSpider(Spider):
name = "JustdialSpider"
allowed_domains = ["justdial.com"]
start_urls = [

def start_requests(self):
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0'}
for url in self.start_urls:
yield Request(url, headers=headers)

def parse(self, response):
questions = Selector(response).xpath('//div[@class="col-sm-5 col-xs-8 store-details sp-detail paddingR0"]')
for question in questions:
item = JustdialItem()
item['name'] = question.xpath(
'//div[@class="col-sm-5 col-xs-8 store-details sp-detail paddingR0"]/h4/span/a/text()').extract()
item['contact'] = question.xpath(
'//div[@class="col-sm-5 col-xs-8 store-details sp-detail paddingR0"]/p[@class="contact-info"]/span/a/b/text()').extract()
with open('some.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(izip(item['name'], item['contact']))
return item

# Running the code above, I'm able to get the first 10 records of the page.

# This code is not working for getting more than 10 records; the site paginates using AJAX.
# Build the URL of the following AJAX page by incrementing the `page`
# query parameter of the current pagination URL.
url = 'http://www.justdial.com/functions/ajxsearch.php?national_search=0&act=pagination&city=Mumbai&search=Chemical+Dealers&where=&catid=944&psearch=&prid=&page=2&SID=&mntypgrp=0&toknbkt=&bookDate='
# Raw strings keep `\d` a regex digit class instead of a string escape.
next_page = int(re.findall(r'page=(\d+)', url)[0]) + 1
next_url = re.sub(r'page=\d+', 'page={}'.format(next_page), url)
# A parenthesized single-argument print runs on both Python 2 and Python 3.
print(next_url)

def parse_ajaxurl(self, response):
# e.g. http://www.justdial.com/Mumbai/Dentists/ct-385543
my_headers = {'Referer': response.url}
yield Request("ajax_request_url",

Please help me


Answer Source

Actually, if you disable JavaScript when viewing the page, you'll notice that the site offers traditional pagination instead of the "never-ending" AJAX one.

Using this, you can simply find the URL of the next page and continue from there:

def parse(self, response):
    """Yield one item per listing on the page, then request the next page.

    Every element whose ``class`` attribute contains "store-details"
    produces a dict with a ``name`` and a ``contact`` key. Both values
    come from ``extract_first()``, so a field is ``None`` when its XPath
    matches nothing inside that listing.

    After the items, the ``href`` of the ``<a rel="next">`` link is looked
    up. If one is present it is resolved against the current response and
    yielded as a new ``Request``; otherwise nothing further is yielded.
    """
    questions = response.xpath('//div[contains(@class,"store-details")]')
    for question in questions:
        item = {}
        # Relative XPaths (no leading //) read only from this listing.
        item['name'] = question.xpath("h4/span/a/text()").extract_first()
        item['contact'] = question.xpath("p[@class='contact-info']//b/text()").extract_first()
        yield item
    # next page
    next_page = response.xpath("//a[@rel='next']/@href").extract_first()
    if next_page:
        yield Request(response.urljoin(next_page))  # href may be relative

I also fixed up your XPaths, but overall the only bit that changed is the three lines under the `# next page` comment. As a side note, I noticed you are saving to CSV inside the spider, whereas you can use Scrapy's built-in feed exporter from the command line instead, e.g.: scrapy crawl myspider --output results.csv