vaibhav jain vaibhav jain - 2 months ago 11
Python Question

Scrapy parse method not working

I am scraping a website and have written a spider in Scrapy. I am able to extract the product price using this:

hxs.select('//div[@class="product_list"]//div[@class="product_list_offerprice"]/text()').extract()


in the Scrapy shell.

But when I try to do the same in the spider, it returns an empty list.

Here is my spider code:

from eScraper.items import EscraperItem
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

#------------------------------------------------------------------------------

class ESpider(CrawlSpider):
name = "ashikamallSpider"

allowed_domains = ["ashikamall.com"]
URLSList = []

for n in range (1,51):
URLSList.append('http://ashikamall.com/products.aspx?id=222&page=' + str(n))

start_urls = URLSList


def parse(self, response):

hxs = HtmlXPathSelector(response)
sites = hxs.select('//div[@class="product_list"]')
items = []

for site in sites:
item = EscraperItem()
item['productDesc'] = ""
item['productSite'] = "http://1click1call.com/"
item['productTitle'] = site.select('div[@class="product_list_name"]/h3/text()').extract()
item['productPrice'] = site.select('div[@class="product_list_offerprice"]/text()').extract()
item['productURL'] = "http://ashikamall.com/" + site.select('div[@class="product_list_image"]/a/@href').extract()[0].encode('utf-8')
item['productImage'] = "http://ashikamall.com/" + site.select('div[@class="product_list_image"]/a/img/@src').extract()[0].encode('utf-8')
items.append(item)
return items


Here is my items.py

from scrapy.item import Item, Field

#------------------------------------------------------------------------------

class EscraperItem(Item):

image_urls = Field()
productURL = Field()
productDesc = Field()
image_paths = Field()
productSite = Field()
productTitle = Field()
productPrice = Field()
productImage = Field()


Can anybody help me out please?

Answer

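The problem is in your XPaths: inside the loop they should be relative to each product node and search all of its descendants (.//), because a bare div[...] step only matches direct children of that node: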

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider

from scrapy.item import Item, Field


class EscraperItem(Item):
    """Scrapy item describing one product scraped from a listing page."""
    # Not assigned by the spider in this file; presumably reserved for
    # Scrapy's image-download pipeline -- TODO confirm.
    image_urls = Field()
    # Site root joined with the href of the product's image link.
    productURL = Field()
    # The spider in this file always sets this to an empty string.
    productDesc = Field()
    # Not assigned by the spider in this file; presumably filled in by an
    # image pipeline alongside image_urls -- TODO confirm.
    image_paths = Field()
    # Fixed site URL string assigned by the spider.
    productSite = Field()
    # List of text nodes extracted from the product name <h3>.
    productTitle = Field()
    # List of text nodes extracted from the offer-price <div>.
    productPrice = Field()
    # Site root joined with the src of the product's <img>.
    productImage = Field()


class ESpider(CrawlSpider):
    """Scrape product listings from the ashikamall.com catalogue.

    Requests pages 1-50 of the ``id=222`` product listing and converts every
    ``<div class="product_list">`` block on a page into one ``EscraperItem``.

    NOTE(review): Scrapy's CrawlSpider documentation warns against overriding
    ``parse`` because CrawlSpider uses that method for its own rule handling.
    No rules are defined here, so a plain spider base class is probably the
    better fit -- confirm before changing the base class.
    """

    name = "ashikamallSpider"

    allowed_domains = ["ashikamall.com"]
    start_urls = ['http://ashikamall.com/products.aspx?id=222&page=%s' % n for n in range(1, 51)]

    # Prefix prepended to the site-relative links found in the listing markup.
    _site_root = "http://ashikamall.com/"

    def parse(self, response):
        """Build one ``EscraperItem`` per product block on a listing page.

        Args:
            response: the downloaded listing page.

        Returns:
            A list of ``EscraperItem``. ``productTitle`` and ``productPrice``
            are lists of extracted text nodes (possibly empty).
            ``productURL`` and ``productImage`` are strings, and are empty
            when the product block contains no image link.
        """
        hxs = HtmlXPathSelector(response)
        items = []

        for site in hxs.select('//div[@class="product_list"]'):
            item = EscraperItem()
            item['productDesc'] = ""
            # NOTE(review): this differs from the domain being scraped --
            # confirm it is intentional and not a copy-paste leftover.
            item['productSite'] = "http://1click1call.com/"
            # The leading './/' searches all descendants of this product
            # block; a bare 'div[...]' step would only match direct children.
            item['productTitle'] = site.select('.//div[@class="product_list_name"]/h3/text()').extract()
            item['productPrice'] = site.select('.//div[@class="product_list_offerprice"]/text()').extract()
            item['productURL'] = self._absolute_first(
                site, './/div[@class="product_list_image"]/a/@href')
            item['productImage'] = self._absolute_first(
                site, './/div[@class="product_list_image"]/a/img/@src')
            items.append(item)
        return items

    def _absolute_first(self, node, xpath):
        """Return the site root joined with the first match of ``xpath``.

        Returns an empty string when ``xpath`` matches nothing under ``node``.
        Indexing ``extract()[0]`` directly would raise ``IndexError`` in that
        case, aborting the whole page and losing items already collected.
        """
        matches = node.select(xpath).extract()
        if not matches:
            return ""
        # Encoding to UTF-8 is kept from the original code so the
        # concatenated result has the same type as before.
        return self._site_root + matches[0].encode('utf-8')
Comments