Afzaal Ashraf Afzaal Ashraf - 8 months ago 25
Java Question

How to move from one page to the next while scraping all the results of a search query

package scraper;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a single page of search results for the query "java" and prints
 * one line per result row.
 *
 * <p>Only the page addressed by the hard-coded URL is scraped; pagination
 * links are not followed.
 */
public class Scraper {

    /**
     * Downloads the search results page and prints the text of the second and
     * third {@code div} matched inside every {@code div.row.result} element.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        final Document document = Jsoup.connect("https://www.indeed.com.pk/jobs?q=java&l=")
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();

        Elements rows = document.select("div.row.result");

        for (Element row : rows) {
            Elements innerDivs = row.select("div");
            // Indexes 1 and 2 are read below; skip rows that match fewer than
            // three divs instead of failing with IndexOutOfBoundsException.
            if (innerDivs.size() < 3) {
                continue;
            }
            String header = innerDivs.get(1).text();
            String content = innerDivs.get(2).text();
            System.out.println("header = " + header + " -> " + content);
        }
    }
}


In this code I am scraping the jobs returned for the search query "java", but it only scrapes the current page (the search URL hard-coded in the code). I want to scrape all the result pages for that query.

Please help

Answer Source

You need to find the pagination div, which has the .pagination class and then select the first inner link for the first page, second inner link for the second page, etc.

This is an example of how you might do this. You will need to modify it to load the correct page(s):

// Every link inside the pagination container points at another results page.
Elements pages = document.select("div.pagination a");
for (Element page : pages) {
    // Load the linked page. Read the attribute from the current link (page),
    // not from the whole collection, and use "abs:href" so a relative link
    // is turned into a full URL before connecting. get() performs the request
    // and returns the parsed Document.
    Document nextPage = Jsoup.connect(page.attr("abs:href")).get();
    // ... scrape nextPage here
}

Working example:

package scraper;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Scrapes the search results for the query "java", starting at the first
 * results page and following the pagination links found on each page.
 */
public class Scraper {

    /** First results page of the search. */
    private static final String START_URL = "https://www.indeed.com.pk/jobs?q=java&l=";

    /** Safety cap on the number of pages fetched so the crawl always terminates. */
    private static final int MAX_PAGES = 100;

    /**
     * Crawls the results pages breadth-first: each fetched page is scraped and
     * every pagination link on it that has not been seen yet is queued.
     *
     * @param args unused
     * @throws Exception if a page cannot be fetched
     */
    public static void main(String[] args) throws Exception {
        final Set<String> seen = new HashSet<>();
        final Deque<String> pending = new ArrayDeque<>();
        seen.add(START_URL);
        pending.add(START_URL);

        int fetched = 0;
        while (!pending.isEmpty() && fetched < MAX_PAGES) {
            final Document document = fetch(pending.poll());
            fetched++;
            scrape(document);

            // Queue the other results pages linked from this one. The same
            // page is usually linked several times, so dedupe on the
            // absolute URL.
            // NOTE(review): the start URL may differ textually from the
            // site's own link to page one, in which case that page is
            // scraped twice - confirm against the live markup.
            for (Element link : document.select("div.pagination a")) {
                final String url = link.attr("abs:href");
                if (!url.isEmpty() && seen.add(url)) {
                    System.out.println("Page link: " + link.attr("href"));
                    pending.add(url);
                }
            }
        }
    }

    /**
     * Prints one line per {@code div.row.result} element in the document,
     * using the text of the second and third {@code div} matched inside it.
     *
     * @param document a parsed results page
     */
    public static void scrape(Document document) {
        Elements rows = document.select("div.row.result");

        for (Element row : rows) {
            Elements innerDivs = row.select("div");
            // Indexes 1 and 2 are read below; skip rows that match fewer than
            // three divs instead of failing with IndexOutOfBoundsException.
            if (innerDivs.size() < 3) {
                continue;
            }
            String header = innerDivs.get(1).text();
            String content = innerDivs.get(2).text();
            System.out.println("header = " + header + " -> " + content);
        }
    }

    /**
     * Fetches and parses one page, sending the same user agent, cookie and
     * timeout on every request so follow-up pages are requested exactly like
     * the first one.
     *
     * @param url absolute URL of the page to load
     * @return the parsed page
     * @throws IOException if the request fails or times out
     */
    private static Document fetch(String url) throws IOException {
        return Jsoup.connect(url)
                .userAgent("Mozilla")
                .cookie("auth", "token")
                .timeout(3000)
                .get();
    }
}