Sasquatch3o3 Sasquatch3o3 - 1 month ago 12
Javascript Question

NodeJS multiple requests

I am writing a web scraper that makes multiple requests based on a list that looks like this

1. Category1
1a. categoryItem1
1b. categoryItem2
2. Category2
2a. categoryItem1
2b. categoryItem2
2c. categoryItem3
3. Category3
3a. categoryItem1


Both `Category` and `categoryItem` are links. Only one `Category` can be expanded at a time. The number of `Categories` and `categoryItems` can change, so I don't know the exact number beforehand.

I am gathering the data on each `categoryItem` page to be saved in a JSON file that looks like this:

{
    "Category1": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        }
    ],
    "Category2": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        },
        {
            "categoryItem3": {
                // Details saved here
            }
        }
    ],
    "Category3": [
        {
            "categoryItem1": {
                // Details saved here
            }
        }
    ]
}


The only thing left for me to figure out is how to make these steps run in order (i.e. behave synchronously):


  1. Get the opening page

  2. Open each `Category` list

  3. Open each `categoryItem` details page



THIS was the web scraper tutorial that I followed, if you would like to know. Because the requests are asynchronous, I don't know when the very last page has been parsed. Here is the structure of the script:

server.js

// Express app exposing GET /scrape. The handler fetches a list page, follows
// each category's expansion link, then follows each category item's link, and
// collects the results into globalJSON, which is written to output.json and
// sent back as the HTTP response.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function (req, res) {

// NOTE(review): globalJSON and baseUrl are assigned without var/let/const, so
// in non-strict mode they become implicit globals shared by every request.
globalJSON = {};

baseUrl = 'http://...';

// 1.) open page with list
request.get(baseUrl, function (error, response, html) {
// NOTE(review): there is no else branch, so when `error` is set nothing is
// sent back on `res`.
if (!error) {

var $ = cheerio.load(html);

// select the list
// .filter() is used here only to visit each matched element; its return
// value is discarded.
$('#categoryListSelector').filter(function () {
var data = $(this);

var listItem = data.find('#listItemSelector');

var expansionLink = listItem.find('a').attr('href'); // <a href=""></a>
var category = listItem.find('font').text();

// Save category to global json
globalJSON[category] = [];

// 2.) Expand the list by opening expansionLink
// This request is only started here; its callback runs later, after the
// surrounding synchronous code (including the writeFile/res.send below) has
// already executed.
request.get(baseUrl + expansionLink, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);

// Select the sub items of each list item
$('#subItem selector').filter(function () {
var data = $(this);

var categoryItemPageLinkElement = data.find('a');

var categoryItemName = categoryItemPageLinkElement.text();
var categoryItemLink = $(categoryItemPageLinkElement).attr('href');

// NOTE(review): "undefinded" is misspelled, so the typeof comparison is always
// true; the `!= null` comparison is what actually excludes undefined here.
if (typeof categoryItemLink != "undefinded" && categoryItemLink != null && categoryItemLink != "") {

// NOTE(review): both names below are assigned without a declaration, so they
// are shared rather than per-iteration. The request callback further down runs
// later and likely sees whatever objects the last iteration assigned - confirm.
categoryItemObject = {}; // { categoryItemName: categoryItemDetails }
categoryItemDetails = {};

// 3.) Open the categoryItem page to start gathering data
request.get(baseUrl + categoryItemLink, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);

// GATHER and save data here

// Done gathering data save to global json
categoryItemObject[categoryItemName] = categoryItemDetails;
globalJSON[category].push(categoryItemObject);

}
});
}
});
}
});
});

// This runs right after the category requests above have been started, not
// after they have finished, so globalJSON holds only the empty category arrays
// at this point.
// NOTE(review): `err` is never checked; the success message is logged even
// when the write fails.
fs.writeFile('output.json', JSON.stringify(globalJSON, null, 4), function (err) {
console.log('File successfully written!');
});
res.send(globalJSON);

}//END if(!error)
});

})//END app.get()

app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;


Update



I did get my issue solved with some help from the feller below, and this is what I came up with. Now, there might be a better way, feel free to let me know.

Basic Layout

// Basic layout: two fan-out stages in one flat promise chain. The second stage
// starts only after every category page has been fetched, and the finishing
// step runs only after every category item page has been fetched.
Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject) => {
    request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html) => {
        if (error) {
            return reject(error);
        }

        // build an array of ALL the categoryItemLinks

        // resolve() forwards only its first argument, so bundle both values
        // into a single object.
        return resolve({ response, html });
    });
}))).then(function (statesArray) {

    // Return the second stage so the chain waits for it and any rejection
    // reaches the single catch() at the end.
    return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject) => {
        request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html) => {
            if (error) {
                return reject(error);
            }
            // Gather Data and put into dataJson

            return resolve({ response, html });
        });
    })));

}).then(function (data) {

    // Do finishing stuff

}).catch(function (error) {
    // A failure in either stage ends up here instead of being silently dropped.
    console.error(error);
});


server.js

const express = require('express');
const fs = require('fs');
const request = require('request');
const cheerio = require('cheerio');

const app = express();

const baseUrl = 'http://www.blahblah.org';

/**
 * Fetch a page with `request` and expose the result as a promise.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} Resolves with the response body; rejects with the
 *     error reported by `request`.
 */
function fetchHtml(url) {
    return new Promise((resolve, reject) => {
        request.get(url, (error, response, html) => {
            if (error) {
                return reject(error);
            }
            // resolve() forwards a single value only; the body is all the callers need.
            return resolve(html);
        });
    });
}

/**
 * Collect the href of the first <a> found inside each element matching `selector`.
 *
 * @param {string} html - Markup to search.
 * @param {string} selector - Selector for the elements that wrap the links.
 * @returns {string[]} The non-empty hrefs, in document order.
 */
function collectLinks(html, selector) {
    const $ = cheerio.load(html);
    const links = [];

    $(selector).each(function () {
        const href = $(this).find('a').attr('href');

        // `!= null` rejects both null and undefined (elements without a link).
        if (href != null && href !== '') {
            links.push(href);
        }
    });

    return links;
}

/**
 * GET /scrape
 *
 * 1. Fetches the base page and reads every category link.
 * 2. Fetches all category pages in parallel and reads every category item link.
 * 3. Fetches all category item pages in parallel and gathers their data.
 *
 * The response is sent only after all three stages have finished; a failure in
 * any stage is answered with a 500.
 */
app.get('/scrape', function (req, res) {

    // Declared per request so that concurrent /scrape calls cannot overwrite
    // each other's results.
    const dataJson = {}; // holds all the gathered data

    fetchHtml(baseUrl)
        .then(html => {
            const categoriesArr = collectLinks(html, '#categorySelector')
                .map(categoryItemLink => ({ categoryItemLink }));

            return Promise.all(categoriesArr.map(
                categoryObj => fetchHtml(baseUrl + categoryObj.categoryItemLink)
            ));
        })
        .then(categoryPages => {
            const allCategoryItems = [];

            categoryPages.forEach(html => {
                collectLinks(html, '#categoryItemSelector').forEach(categoryItemPageLink => {
                    allCategoryItems.push({ categoryItemPageLink });
                });
            });

            // Returned so the chain waits for every item page before finishing.
            return Promise.all(allCategoryItems.map(
                categoryItemObject => fetchHtml(baseUrl + categoryItemObject.categoryItemPageLink)
                    .then(html => {
                        const $ = cheerio.load(html);
                        // Gather Data and put into dataJson
                    })
            ));
        })
        .then(() => {
            // Do finishing stuff
            res.send(dataJson);
        })
        .catch(error => {
            // Without this the HTTP request would hang and the rejection would go unhandled.
            console.error(error);
            res.status(500).send({ error: error.message });
        });

});

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;

Answer

You can use Promise.all(), so something like:

// Start one request per URL and wait for all of them. The resulting array is
// in the same order as `urls`.
Promise.all(urls.map(url => new Promise((resolve, reject) => {
    request.get(url, (err, res, html) => {
        if (err) {
            return reject(err);
        }
        // resolve() forwards only its first argument, so wrap both values in
        // one object instead of calling resolve(res, html).
        return resolve({ res, html });
    });
}))).then(results => {
    /* success: results[i] is { res, html } for urls[i] */
}).catch(err => {
    /* error: the first request that failed */
});

In that code, the .then() executes after all requests have come back with a response.