Neel Shah Neel Shah - 24 days ago 9
Python Question

Scraping different products information using scrapy

Following is the code that I use to scrape product information. There are many products on a page. I scrape them all and then move to next page. The problem is that the scrapy is selecting just the first product on a page instead of iterating over all products on a page. Where am I going wrong?

import re
import time
import sys
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.http import Request
import parsedatetime
from datetime import datetime
from airline_sentiment.items import *
from airline_sentiment.spiders.crawlerhelper import *

class TripAdvisorRestaurantBaseSpider(BaseSpider):
name = "shoebuy"

allowed_domains = ["shoebuy.com"]
base_uri = "http://www.shoebuy.com"
start_urls = [
base_uri + "/womens-leather-boots/category_2493?cm_sp=cat-_-d_womensboots_tiles_b1_leather-_-092216"
]


def parse(self, response):

sel = Selector(response)

snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product\")]')

for snode_restaurant in snode_airline:
tripadvisor_item = AirlineSentimentItem()

tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))

tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()'))
tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()'))
tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()'))
tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()'))

yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)


next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href'))
if next_page_url and len(next_page_url) > 0:
yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page)

def parse_next_page(self, response):
sel = Selector(response)

snode_airline = sel.xpath('//*[starts-with(@class, "pt_grid")]/div[starts-with(@class, "pt_product")]')

for snode_restaurant in snode_airline:

tripadvisor_item = AirlineSentimentItem()

tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))
tripadvisor_item['name'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/span[@class="pt_title"]/text()'))
tripadvisor_item['price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/span[@class="pt_price"]/text()'))
tripadvisor_item['discount'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_percent_off"]/text()'))
tripadvisor_item['orig_price'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_prices")]/div[@class="pt_discount"]/span[@class="pt_price_orig"]/text()'))
tripadvisor_item['stars'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//*[@class="bv-rating-ratio"]/span/span[3]/text()'))
tripadvisor_item['reviews'] = clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "bv-inline-rating-container")]/dl/dd[2]/span/text()'))

yield Request(url=tripadvisor_item['url'], meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_fetch_review)

next_page_url = clean_parsed_string(get_parsed_string(sel, '//div[@class="paging"]/a[@class="next"]/@href'))
if next_page_url and len(next_page_url) > 0:
yield Request(url=self.base_uri + next_page_url, meta={'tripadvisor_item': tripadvisor_item}, callback=self.parse_next_page)

def parse_fetch_review(self, response):

tripadvisor_item = response.meta['tripadvisor_item']
sel = Selector(response)

snode_reviews = sel.xpath('//*[starts-with(@class, "product_info_wrapper")]')

for snode_review in snode_reviews:

tripadvisor_item['img'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"large_thumb")]/img/@src'))

tripadvisor_item['desc'] = clean_parsed_string(get_parsed_string(snode_review, '//*[starts-with(@class,"product_information")]/div[1]/span/text()'))

tripadvisor_item['brand'] = clean_parsed_string(get_parsed_string(snode_review, '//div[starts-with(@class,"seo_module")]/h3/text()'))

yield tripadvisor_item

Answer

This is the faulty line:

        tripadvisor_item['url'] = self.base_uri + clean_parsed_string(get_parsed_string(snode_restaurant, '//div[starts-with(@class, "pt_info")]/a/@href'))

the xpath should start with a . like .//div to indicate relative node:

'.//div[starts-with(@class, "pt_info")]/a/@href'

Since you didn't make the xpath relative to your node (with the '.' notation) you always got the first product link on the page as url for every item. Now scrapy has automatic duplicate url filter so what happened is that all of your requests to retrieve reviews were filtered out later on and you ended up just getting the first item.

Tl;dr: just add a . before your // in your relative xpaths.