Dan H Dan H - 1 year ago 89
Python Question

Scrapy: Looping through search results only returns first item

I'm scraping a site by going through the search page and then looping through all the results on it. However, it seems to return only the first result for each page. I also don't think it's picking up the results on the start page.

Secondly, the price is returning as some sort of Unicode (£ symbol) - how can I remove it altogether just leaving the price?

'regular_price': [u'\xa38.59'],

Here is the HTML:

Here's the spider:

import scrapy
import random
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from cdl.items import candleItem

# Spider as posted in the question.
# NOTE(review): this copy has lost all indentation and the `rules` list is
# truncated, so the nesting of the statements below is inferred, not shown.
class cdlSpider(CrawlSpider):
name = "cdl"
allowed_domains = ["www.xxxx.co.uk"]
start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

rules = [
# NOTE(review): the contents of `rules` are missing from this copy;
# presumably a Rule whose callback is parse_listings -- confirm against
# the original post.

# Selects every `a.product_img` element on the response and requests its
# href, with parse_item as the callback.
def parse_listings(self, response):
sel = Selector(response)
urls = sel.css('a.product_img')

for url in urls:
# extract()[0] takes the first extracted href; it raises IndexError if the
# element has no href attribute.
url = url.xpath('@href').extract()[0]
# `return` hands back a single Request and ends the method, so at most one
# product link per response is ever followed (the "only first item" symptom).
return scrapy.Request(url,callback=self.parse_item)

# Fills one candleItem with name, price fields, referrer and URL, then
# yields it.
def parse_item(self, response):

candle = candleItem()

n = response.css('.prod_info_name h1')

candle['name'] = n.xpath('.//text()').extract()[0]

# NOTE(review): with indentation lost it is unclear which of the three price
# assignments sit inside this `if` -- confirm against the original post.
if response.css('.regular_price'):
# extract() returns a list of strings, which is why the question shows the
# price as [u'\xa38.59'] (\xa3 is the pound sign).
candle['regular_price'] = response.css('.regular_price').xpath('.//text()').extract()
candle['was_price'] = response.css('.was_price strong').xpath('.//text()').extract()
candle['now_price'] = response.css('.now_price strong').xpath('.//text()').extract()

# Referer header of the request that produced this response, or None if the
# header is absent.
candle['referrer'] = response.request.headers.get('Referer', None)
candle['url'] = response.request.url

yield candle

Answer Source

Yes, it's returning only the first result because of your parse_listings method: `return` ends the method on the first URL, so only one request is ever produced per page. You should `yield` each request instead. I would do something like:

def parse_listings(self, response):
    """Yield one request per product link found on a search-results page.

    Every ``a.product_img`` href on ``response`` is followed, with
    ``self.parse_item`` as the callback. Because this is a generator, all
    links are requested rather than only the first one.

    Args:
        response: The Scrapy response for a search-results page.

    Yields:
        One request per product link, as built by ``response.follow``.
    """
    for href in response.css('a.product_img::attr(href)').extract():
        # response.follow (Scrapy >= 1.4) builds the Request itself, so no
        # separate `Request` import is needed, and it resolves relative
        # hrefs against the URL of the current page.
        yield response.follow(href, callback=self.parse_item)

In that case I would even do something like:

class CdlspiderSpider(CrawlSpider):
    """Crawl the search-results page and scrape prices from each product page.

    The single crawl rule extracts links from ``a.product_img`` elements and
    sends every linked page to ``parse_item``, so no hand-written listing
    callback is needed.
    """

    name = 'cdlSpider'
    allowed_domains = ['www.xxxx.co.uk']
    start_urls = ['https://www.xxxx.co.uk/advanced_search_result.php']

    rules = [
        Rule(LinkExtractor(restrict_css='a.product_img'), callback='parse_item'),
    ]

    def parse_item(self, response):
        """Build a ``candleItem`` holding the numeric prices of one product.

        Each price is taken with ``re_first``, which returns the first
        regex match as a plain string (or ``None`` when nothing matches),
        so the currency symbol is dropped and no list wrapper is returned.

        Args:
            response: The Scrapy response for a product page.

        Returns:
            The populated ``candleItem``. It has no price fields when the
            page has no ``.regular_price`` element.
        """
        # The item must be created before fields are assigned to it.
        candle = candleItem()
        # NOTE(review): all three prices are only read when `.regular_price`
        # is present -- confirm that sale pages also carry that element.
        if response.css('.regular_price'):
            candle['regular_price'] = response.css('.regular_price::text').re_first(r'\d+\.?\d*')
            candle['was_price'] = response.css('.was_price strong::text').re_first(r'\d+\.?\d*')
            candle['now_price'] = response.css('.now_price strong::text').re_first(r'\d+\.?\d*')
        return candle
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download