student student - 1 month ago 9
Python Question

Problems while scraping a table from a website?

I am working on extracting the table from this site. Although I matched the XPaths and located the fields of the table, I'm not able to extract any content from the site. This is what my spider looks like:

# -*- coding: utf-8 -*-
import scrapy
from table.items import TableItem


class Table(scrapy.Spider):
    """Spider that reads the rows of the table inside the ``tabs-1`` element.

    Each selected row is emitted as a ``TableItem`` whose ``col1`` and
    ``col2`` fields hold the text nodes of the row's first and second cell.
    """

    name = "table1"
    start_urls = (
        'wesite.com',
    )
    #//div[4]//div[1]//div[1]//table[1]
    #

    def parse(self, response):
        """Yield one ``TableItem`` per selected table row.

        The first row and the last two rows of the match are skipped by the
        ``[1:-2]`` slice.
        """
        sites = response.xpath('//*[@id="tabs-1"]/table//tr')[1:-2]
        print('\n***********************************\n',sites)

        for site in sites:
            item = TableItem()
            # The XPaths are relative to the current row; .extract() keeps
            # every matching text node of the cell, as a list.
            item['col1'] = site.xpath('td[1]/text()').extract()
            item['col2'] = site.xpath('td[2]/text()').extract()
            yield item
            # Debug output; runs once the consumer resumes the generator.
            print('\n**********\n',item)


I guess that my main problem is this line:

sites = response.xpath('//*[@id="tabs-1"]/table[1]/tr')


I actually can retrieve the content. However, it comes back in a very large, repeated, incorrect format (it is malformed). Any idea how to get the table?

Answer

Sometimes browsers add their own DOM elements while rendering. For your given site, the right xpath selector is response.xpath('//*[@id="tabs-1"]/table//tr') to find table rows.

Edited: Added code to fetch the right elements from the table

# -*- coding: utf-8 -*-
import scrapy
from table.items import TableItem


class Table(scrapy.Spider):
    """Spider that scrapes the table inside the ``tabs-1`` element.

    Every ``<tr>`` found anywhere below the table is turned into a
    ``TableItem`` with two fields, ``col1`` and ``col2``.
    """

    name = "table1"
    start_urls = (
        'http://www.accessdata.fda.gov/scripts/drugshortages/default.cfm#tabs-1',
    )

    def parse(self, response):
        """Yield one ``TableItem`` per table row.

        ``col1`` is the first text node found under ``td/a`` (``None`` when
        the row has none). ``col2`` is the first text node found under
        ``td/em/strong`` with surrounding whitespace stripped, or the
        string ``'Not Available'`` when the row has no such node.
        """
        # "table//tr" (instead of "table/tr") also matches rows nested under
        # intermediate elements between the table and its rows.
        for row in response.xpath('//*[@id="tabs-1"]/table//tr'):
            item = TableItem()
            item['col1'] = row.xpath('td/a/text()').extract_first()

            # Evaluate the XPath once; extract_first() gives None when
            # nothing matched, which selects the fallback value.
            strong_text = row.xpath('td/em/strong/text()').extract_first()
            if strong_text is None:
                item['col2'] = 'Not Available'
            else:
                item['col2'] = strong_text.strip()

            yield item