SMth80 SMth80 - 3 years ago 103
Python Question

How to make my crawler parse data from start page

I've written some code in Python to grab details from a torrent site. When I run my code, I get the results I expected. The only problem with this crawler is that it skips the content of the first page [as the pagination URLs start from 2], which I can't fix. Any help on this would be highly appreciated.

import requests
from lxml import html

# Listing page the crawl starts from.
page_link = "https://yts.ag/browse-movies"
# Site root; prepended to the hrefs taken from the pagination bar.
b_link = "https://yts.ag"

def get_links(main_link):
    """Print the movies of every page linked from the pagination of ``main_link``.

    ``main_link`` is downloaded only to read its pagination bar; each anchor
    whose href contains "page" is resolved against ``b_link`` and handed to
    ``movie_details``. ``main_link`` itself is never passed to
    ``movie_details``.
    """
    page_source = requests.get(main_link).text
    document = html.fromstring(page_source)
    for anchor in document.cssselect('ul.tsc_pagination a'):
        href = anchor.attrib["href"]
        if "page" not in href:
            continue
        movie_details(b_link + href)

def movie_details(link):
    """Download ``link`` and print one line per movie card found on it.

    Each printed line holds the title, the year, the rating and the text of
    the first two ``h4`` elements inside the card's caption.
    """
    page_source = requests.get(link).text
    document = html.fromstring(page_source)
    for card in document.cssselect("div.browse-movie-wrap"):
        title = card.cssselect('div.browse-movie-bottom a.browse-movie-title')[0].text
        year = card.cssselect('div.browse-movie-year')[0].text
        rating = card.cssselect('figcaption.hidden-xs h4.rating')[0].text
        # NOTE(review): this selector also matches the h4.rating element used
        # above, so one of the two "genre" values may repeat the rating --
        # confirm against the page markup.
        captions = card.cssselect('figcaption.hidden-xs h4')
        genre = captions[0].text
        genre1 = captions[1].text
        print(title, year, rating, genre, genre1)

# Kick off the crawl from the browse listing.
get_links(page_link)

Answer Source

Why not just call the movie_details() function on the main_link before the loop?

def get_links(main_link):
    """Print the movies on ``main_link`` and on every page its pagination links to.

    The start page is processed first, then each distinct pagination target
    whose href contains "page" is resolved against ``b_link`` and passed to
    ``movie_details``.

    Args:
        main_link: URL of the listing page to start crawling from.
    """
    response = requests.get(main_link).text
    tree = html.fromstring(response)
    # The pagination bar only links to the *other* pages, so the start page
    # has to be handled explicitly before following those links.
    movie_details(main_link)
    # Pagination bars often link the same target more than once (numbered
    # link plus "next", or a bar repeated on the page) -- presumably true
    # here too; remember visited hrefs so each page is printed only once.
    seen = set()
    for item in tree.cssselect('ul.tsc_pagination a'):
        # .get() tolerates anchors without an href instead of raising KeyError.
        href = item.get("href", "")
        if "page" not in href or href in seen:
            continue
        seen.add(href)
        movie_details(b_link + href)
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download