user1443063 user1443063 - 4 months ago 72
Python Question

Scrapy Images Downloading

My spider runs without displaying any errors but the images are not stored in the folder here are my scrapy files:

Spider.py:

import scrapy
import re
import os
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
name = "production"
allowed_domains = ["someurl.com"]
start_urls = [
"someurl.com"
]

def parse(self, response):
for sel in response.xpath('//html/body'):
item = ProductionItem()
img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
yield scrapy.Request(urlparse.urljoin(response.url, img_url),callback=self.parseBasicListingInfo, meta={'item': item})

def parseBasicListingInfo(item, response):
item = response.request.meta['item']
item = ListResidentialItem()
try:
image_urls = map(unicode.strip,response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
item['image_urls'] = [ x for x in image_urls]
except IndexError:
item['image_urls'] = ''

return item


settings.py:

from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline

BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1
IMAGE_STORE = '/images'

CONCURRENT_REQUESTS = 250

DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}


items.py

# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
image_urls = scrapy.Field()
images = scrapy.Field()

pass


My pipeline file is empty i'm not sure what i am suppose to add to the pipeline.py file.

Any help is greatly appreciated.

Answer

Since you don't know what to put in the pipelines I assume you can use the default pipeline for images provided by scrapy so in the settings.py file you can just declare it like

ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline':1
}

Also, your images path is wrong the / means that you are going to the absolute root path of your machine, so you either put the absolute path to where you want to save or just do a relative path from where you are running your crawler

IMAGES_STORE = '/home/user/Documents/scrapy_project/images'

or

IMAGES_STORE = 'images'

Now, in the spider you extract the url but you don't save it into the item

item['image_urls'] = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract_first()

The field has to literally be image_urls if you're using the default pipeline.

Now, in the items.py file you need to add the following 2 fields (both are required with this literal name)

image_urls=Field()
images=Field()

That should work