ricky ricky - 4 years ago 130
Python Question

How to Iterate over paginations - Python scraping (BeautifulSoup)


How can I iterate over the paginated pages to get all the data? I have successfully fetched the data from the first page.


from bs4 import BeautifulSoup
import requests
# Fetch the first page of the movie listing and print one line per movie.
url = "https://yts.ag/browse-movies/"
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Each 'browse-movie-wrap' div is treated as one movie card.
items = soup.find_all('div', 'browse-movie-wrap')
for item in items:
    # Start every movie from blank fields so a card with missing markup
    # never prints the previous movie's values (or raises NameError on the
    # very first card).
    title = year = rating = genre = ''

    title_links = item.find_all('a', 'browse-movie-title')
    if title_links:
        title = title_links[0].text

    year_divs = item.find_all('div', 'browse-movie-year')
    if year_divs:
        year = year_divs[0].text

    for link in item.find_all('a', 'browse-movie-link'):
        # The first <h4> is read as the rating and the second as the genre;
        # checking the length replaces the old bare "except: pass".
        headings = link.find_all('h4')
        if headings:
            rating = headings[0].text
        if len(headings) > 1:
            genre = headings[1].text

    # A single pre-formatted string prints identically on Python 2 and 3.
    print("%s %s %s %s" % (year, rating, genre, title))

Answer Source

You could use range(1, 300) to iterate over the pages (1 through 299), putting each page number into the URL's "page" query parameter:

from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': 'Mozilla/5.0'}

# Walk the paginated listing (pages 1 through 299) and print one line per
# movie found on each page.
for page in range(1, 300):
    url = "https://yts.ag/browse-movies?page=%s" % page

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # Each 'browse-movie-wrap' div is treated as one movie card.
    items = soup.find_all('div', 'browse-movie-wrap')
    if not items:
        # No movie cards on this page: stop instead of requesting the
        # remaining page numbers for nothing.
        break

    for item in items:
        # Start every movie from blank fields so a card with missing markup
        # never prints the previous movie's values (or raises NameError on
        # the very first card).
        title = year = rating = genre = ''

        title_links = item.find_all('a', 'browse-movie-title')
        if title_links:
            title = title_links[0].text

        year_divs = item.find_all('div', 'browse-movie-year')
        if year_divs:
            year = year_divs[0].text

        for link in item.find_all('a', 'browse-movie-link'):
            # The first <h4> is read as the rating and the second as the
            # genre; checking the length replaces the old bare
            # "except: pass".
            headings = link.find_all('h4')
            if headings:
                rating = headings[0].text
            if len(headings) > 1:
                genre = headings[1].text

        # A single pre-formatted string prints identically on Python 2 and 3.
        print("%s %s %s %s" % (year, rating, genre, title))

P.S. You might want to add time.sleep(1) between requests to slow down a little, in case they block your IP for scraping their pages too aggressively.

Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download