Ramesh KC Ramesh KC - 1 month ago 15
Python Question

How can I start writing unit tests for a Scrapy web spider in Python?

class AljazeeraSpider(XMLFeedSpider):
    """Spider that starts at the Al Jazeera front page.

    ``parse`` builds one ``NewsItem`` per matched title node and schedules a
    follow-up request for its link; ``parse_detail`` fills in the remaining
    fields from the page that request fetches.
    """

    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        """Extract title nodes and schedule one detail request per node.

        Sends a notification mail through ``MailNotify`` when the XPath
        matches nothing.

        Returns:
            A list of ``Request`` objects, one per matched node. Each request
            carries its partially filled ``NewsItem`` in ``meta['item']`` and
            is handled by ``parse_detail``.
        """
        hxs = HtmlXPathSelector(response)  # The xPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")

        requests = []
        # Use a distinct loop name so the collection being iterated is not
        # shadowed by its own elements.
        for title in titles:
            item = NewsItem()
            item['title'] = escape(''.join(title.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(title.select('a/@href').extract()))
            item['description'] = ''
            # Keep the item and the request under separate names; the item
            # travels to parse_detail through the request's meta dict.
            request = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            requests.append(request)

        return requests

    def parse_detail(self, response):
        """Complete the item carried in ``response.meta`` and return it.

        Fills ``details`` and ``published_date`` from the response and sets
        ``location`` to an empty string.
        """
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))

        return item


I am currently using Scrapy to crawl a website, and I have some knowledge of unittest in Python. How can I write a unit test that checks that the link is working and that item['location'] and item['details'] return values? I have read about Scrapy contracts but could not understand them. So, how can I write unit tests in this case?

Answer

If we are talking specifically about how to test the spiders (not pipelines or loaders), then what we did was provide a "fake response" built from a local HTML file. Sample code:

import os

from scrapy.http import Request, TextResponse

def fake_response(file_name=None, url=None):
    """Create a fake Scrapy HTTP response from a local HTML file.

    Args:
        file_name: Path of the HTML file to use as the response body. A
            relative path is resolved against the directory containing this
            module. When omitted, the response has an empty body.
        url: URL the response (and its request) should report. Defaults to
            ``http://www.example.com``.

    Returns:
        A ``TextResponse`` whose body is the file's content, declared as
        UTF-8.
    """
    if not url:
        url = 'http://www.example.com'

    request = Request(url=url)
    if file_name:
        if os.path.isabs(file_name):
            file_path = file_name
        else:
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)

        # Read raw bytes so the body does not depend on the platform's
        # default text encoding; the context manager closes the file even
        # if reading fails.
        with open(file_path, 'rb') as html_file:
            file_content = html_file.read()
    else:
        file_content = b''

    return TextResponse(url=url, request=request, body=file_content,
                        encoding='utf-8')

Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:

from unittest.case import TestCase

class MyTestCase(TestCase):
    """Run the spider's ``parse`` callback against a canned local response."""

    def setUp(self):
        """Create a fresh spider instance before each test."""
        self.spider = MySpider()

    def test_parse(self):
        """Feed ``input.html`` to ``parse`` and check the extracted title."""
        canned = fake_response('input.html')
        parsed = self.spider.parse(canned)
        # NOTE(review): assumes parse() returns a single item; if the spider
        # returns a list or generator, index into it first - confirm.
        self.assertEqual(parsed['title'], 'My Title')
        # ...

Aside from that, you should definitely start using Item Loaders with input and output processors - this would help achieve better modularity and, hence, isolation: the spider would just yield item instances, while data preparation and modification would be encapsulated inside the loader, which you would test separately.

Comments