V.Khakhil V.Khakhil - 1 year ago 107
MySQL Question

MySQL database error using scrapy

I am trying to save scraped data in a MySQL database. My script.py is:

# -*- coding: utf-8 -*-
import scrapy
import unidecode
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html

class ElementSpider(scrapy.Spider):
    """Crawl a Goodreads list and emit one item per book page.

    ``parse`` walks the paginated list page and schedules one request per
    book link; ``parse_books`` turns each book response into an item that the
    item pipeline can store.
    """

    name = 'books'
    download_delay = 3
    allowed_domains = ["goodreads.com"]
    start_urls = ["https://www.goodreads.com/list/show/19793.I_Marked_My_Calendar_For_This_Book_s_Release",]

    # NOTE(review): ``rules`` is only consulted by ``CrawlSpider``. On a plain
    # ``scrapy.Spider`` it has no effect, so pagination is followed by hand
    # in ``parse`` below. Kept so the class attributes stay unchanged.
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="next_page"]',)), callback="parse", follow= True),)

    def parse(self, response):
        """Yield a request for every book on the list page, then the next page.

        Args:
            response: The Scrapy response for one page of the list.

        Yields:
            ``scrapy.Request`` objects: one per book link (handled by
            ``parse_books``) and, when a "next page" link exists, one for the
            following list page (handled by ``parse`` again).
        """
        book_links = response.xpath('//div[@id="all_votes"]/table[@class="tableList js-dataTooltip"]/tr/td[2]/div[@class="js-tooltipTrigger tooltipTrigger"]/a/@href')
        for href in book_links:
            full_url = response.urljoin(href.extract())
            print(full_url)
            yield scrapy.Request(full_url, callback=self.parse_books)

        next_page = response.xpath('.//a[@class="next_page"]/@href').extract()
        if next_page:
            # urljoin resolves relative hrefs against the current page and
            # leaves absolute ones intact, unlike plain string concatenation.
            next_page_url = response.urljoin(next_page[0])
            print(next_page_url)
            yield scrapy.Request(next_page_url, callback=self.parse)

    def parse_books(self, response):
        """Yield one item describing the book page.

        Args:
            response: The Scrapy response for a single book page.

        Yields:
            A dict with the key ``'url'`` holding the page URL.
        """
        # NOTE(review): the posted body was truncated to the single entry
        # ``'url': response.url,``. Yielding a dict item with that key is
        # assumed, since the pipeline reads ``item['url']``. Confirm against
        # the full spider before relying on this.
        yield {
            'url': response.url,
        }

And pipeline.py is:

# -*- coding: utf-8 -*-

# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb
import hashlib
from scrapy.exceptions import DropItem

from scrapy.http import Request
import sys

class SQLStore(object):
    """Item pipeline that stores each scraped item's URL in MySQL.

    A single connection and cursor are opened when the pipeline is created
    and released again in ``close_spider``.
    """

    def __init__(self):
        """Open the MySQL connection and a cursor used for all inserts."""
        self.conn = MySQLdb.connect("localhost", "root", "", "books")
        self.cursor = self.conn.cursor()
        print("connected to DB")

    def process_item(self, item, spider):
        """Insert ``item['url']`` into ``books_data`` and pass the item on.

        Args:
            item: The scraped item; must provide a ``'url'`` key.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.

        Raises:
            KeyError: If ``item`` has no ``'url'`` key.
        """
        print("hi")
        try:
            # The parameters must be a sequence: ``(x,)`` is a one-element
            # tuple, whereas ``(x)`` is just ``x`` in parentheses.
            self.cursor.execute(
                """INSERT INTO books_data(next_page_url) VALUES (%s)""",
                (item['url'],)
            )
            # MySQLdb does not autocommit by default; without this the row
            # is discarded when the connection closes.
            self.conn.commit()
        except MySQLdb.Error as e:
            # Undo the failed statement and report it, but keep crawling.
            self.conn.rollback()
            print(e)
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()

When I run the script there is no error. The spider runs fine, but I think process_item is never called: it does not even print "hi".

Answer Source

Your method signature is wrong; it should take item and spider parameters:

process_item(self, item, spider)

You also need to have the pipeline set up in your settings.py file:

 # Scrapy expects a dict mapping each pipeline's import path to an integer
 # order (conventionally 0-1000; lower values run first), not a set.
 ITEM_PIPELINES = {"project_name.path.SQLStore": 300}

Your syntax is also incorrect: (item['url']) is just a parenthesized string, so you need a trailing comma to pass a one-element tuple:

  self.cursor.execute("""INSERT INTO books_data(next_page_url) VALUES (%s)""",
    (item['url'],))  # <- the trailing comma makes this a one-element tuple
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download