user1443063 user1443063 - 1 year ago 277
Python Question

Scrapy Images Downloading

My spider runs without displaying any errors, but the images are not stored in the folder. Here are my Scrapy files:

import scrapy
import re
import os
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    """Spider that follows a page's "show photos" link and collects image URLs.

    ``parse`` looks up the ``href`` of the anchor tagged
    ``data-tealium-id="detail_nav_showphotos"`` and requests that page;
    ``parseBasicListingInfo`` then returns a ``ListResidentialItem`` whose
    ``image_urls`` holds the stripped ``data-href`` values of every anchor
    with ``itemprop="contentUrl"``.

    Written for Python 2 (it relies on ``urlparse`` and ``unicode``).
    """

    name = "production"
    allowed_domains = [""]
    # NOTE(review): the list contents are missing from the source; the
    # original only shows an opening bracket. Fill in the real start URLs.
    start_urls = []

    def parse(self, response):
        """Yield a request for the photo page linked from the response.

        Raises:
            IndexError: if no ``detail_nav_showphotos`` anchor is found,
                because the first extracted value is indexed directly.
        """
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
            # The href may be relative, so resolve it against the page URL.
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseBasicListingInfo, meta={'item': item})

    # The first parameter receives the spider instance (this is registered
    # as the bound callback ``self.parseBasicListingInfo``); it is named
    # ``item`` and rebound immediately below.
    def parseBasicListingInfo(item, response):
        """Return a ``ListResidentialItem`` populated with the page's image URLs."""
        item = response.request.meta['item']
        # The item carried in ``meta`` is discarded in favour of a fresh one.
        item = ListResidentialItem()
        # NOTE(review): the source shows ``except IndexError`` without a
        # matching ``try``; its position here is assumed - confirm.
        try:
            image_urls = map(unicode.strip, response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
            item['image_urls'] = [x for x in image_urls]
        except IndexError:
            item['image_urls'] = ''

        return item

from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline

# Scrapy project settings for the "production" bot.
BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

# NOTE(review): Scrapy's images pipeline reads the setting named
# IMAGES_STORE; this name is missing the "S", and the leading "/" makes the
# path absolute from the filesystem root - confirm both are intended.
IMAGE_STORE = '/images'

# Item pipelines to enable, mapped to their order value. The source shows
# only the entry line; the enclosing dict assignment is restored here.
ITEM_PIPELINES = {
    'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}

# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    """Scrapy item declaring a single ``img_url`` field."""

    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    """Scrapy item declaring the ``image_urls`` and ``images`` fields."""

    # These are the literal field names Scrapy's default images pipeline uses.
    image_urls = scrapy.Field()
    images = scrapy.Field()


My pipeline file is empty; I'm not sure what I am supposed to add to the file.

Any help is greatly appreciated.

Answer Source

Since you don't know what to put in the pipelines, I assume you can use the default images pipeline provided by Scrapy, so in the settings file you can just declare it like:

ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

Also, your images path is wrong: the leading / means that you are pointing at the absolute root path of your machine, so either put the absolute path to where you want to save the images, or use a path relative to where you are running your crawler:

IMAGES_STORE = '/home/user/Documents/scrapy_project/images'

or

IMAGES_STORE = 'images'

Now, in the spider you extract the URL but you don't save it into the item:

item['image_urls'] = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract_first()

The field has to literally be image_urls if you're using the default pipeline.

Now, in the items file you need to add the following 2 fields (both are required with these literal names):

image_urls = scrapy.Field()
images = scrapy.Field()

That should work

Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download