Vijay Ramesh - 5 months ago
HTML Question

Unable to loop through multiple pages to scrape data

I need to move on to the next URL link: each page has about 20 rows that I need to extract, and those rows then need to be appended to the results from the following URLs.

There are about 360 URLs and I would like to extract data by running through all of them. My code is below. I would like to write the results to a CSV file later. Any suggestions would be much appreciated, as I am new to Python.

from urlparse import urljoin
import requests
from bs4 import BeautifulSoup
import csv

base_url = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-findall='
list_of_rows = []

next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=20&-findall='

while True:
    soup = BeautifulSoup(requests.get(next_page).content)
    soup.findAll('table')[1].findAll('tr')
    for row in soup.findAll('table')[1].findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('p'):
            text = cell.text.replace(' ','')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)

    try:
        next_page = urljoin(base_url, soup.select('/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=20&-findall=')[1].get('href'))
    except IndexError:
        break


print list_of_rows

outfile = open("./trialpage.csv","wb")
writer = csv.writer(outfile)
writer.writerows(list_of_rows)

Answer

I made some changes to your code. I set up the original URL with a variable I called skip; skip is incremented by 20 each time through the loop.

from urlparse import urljoin
import requests
from bs4 import BeautifulSoup
import csv

list_of_rows = []

skip = 0
next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=' + str(skip) + '&-findall='
print next_page
while True:
    soup = BeautifulSoup(requests.get(next_page).content)
    # the second table on the page holds the search results
    for row in soup.findAll('table')[1].findAll('tr'):
        list_of_cells = []
        for cell in row.findAll('p'):
            text = cell.text.replace(' ','')
            list_of_cells.append(text)
        list_of_rows.append(list_of_cells)

    try:
        skip += 20
        if skip > 300:  # stop once every page has been fetched
            break
        next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=20&-skip=' + str(skip) + '&-findall='
        print next_page
    except IndexError as e:
        print e
        break


# print list_of_rows

outfile = open("./trialpage.csv","wb")
writer = csv.writer(outfile)
writer.writerows(list_of_rows)

You could take bigger chunks since you are not limited by the screen view, and I think it would run faster. Try -max=200, and then step skip in increments of 200, for example:
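
Untested, but the loop would look something like the sketch below (assuming the FileMaker server honours -max=200; stopping once a page comes back empty avoids hard-coding the total record count):

import requests
from bs4 import BeautifulSoup

list_of_rows = []

skip = 0
while True:
    next_page = 'http://cricket.inhs.uiuc.edu/edwipweb/FMPro?-db=nvpassoc.fp5&-format=nvp_search_results.htm&-lay=web%20form&-max=200&-skip=' + str(skip) + '&-findall='
    tables = BeautifulSoup(requests.get(next_page).content).findAll('table')
    if len(tables) < 2 or not tables[1].findAll('tr'):
        break  # no results table or no rows left, so we are past the last page
    for row in tables[1].findAll('tr'):
        # same cell extraction as before, condensed into one line
        list_of_rows.append([cell.text.replace(' ', '') for cell in row.findAll('p')])
    skip += 200

The empty-page check replaces the skip > 300 guard, so the same loop works no matter how many records the search returns.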