Simon Breton Simon Breton - 2 months ago 12
Javascript Question

Node.js web scraping with loop on array of URLs

I'm trying to build a little script to scrape some data. I have some basic knowledge of JavaScript; however, I'm kind of lost with all the async callback or promises stuff. Here is what I have now:

// Declare with `var` — the original assignment created an implicit global.
var url = "http://Blablablabla.com";

// Express-style request handler: fetches `url`, extracts the HTML of the
// first ".theitemIwant" element via cheerio, and sends it to the client.
var shares = function (req, res) {
  request(url, function (error, response, body) {
    if (error) {
      console.log("We've encountered an error: " + error);
      // Respond with an error status so the HTTP client isn't left hanging
      // (the original only logged, which leaves the request pending forever).
      return res.status(500).send("We've encountered an error: " + error);
    }
    var $ = cheerio.load(body);
    var share = $(".theitemIwant").html();
    return res.send(url + ":" + share);
  });
};


So everything is fine with this piece of code. What I would like to do is :


  1. Using an array of url
    var urls = [url1,url2,url3,etc...]

  2. Storing my scraped data into another array, something like this
    data = [{url: url1, shares: share},{url: url2, shares: share},etc...]



I know I need to use something like this
data.push({ urls: url, shares: share})})


and I understand that I need to loop over my first url array to push data into my second data array.

however I'm kind of lost with the
request
method and the way I should deal with the async issues in my situation.

thanks !

edit#1 :

I tried this to use promises :

var url = "www.blablabla.com"
// BUG (this is the source of the reported error): request() does NOT return
// a Promise — it returns a Request stream object, which has no .then method.
// The callback's `return` value is also discarded by request(), so nothing
// useful is captured here.
var geturl = request(url, function (error, response, body) {
// NOTE: `$ = cheerio.load(body)` assigns to an implicit global `$`.
if (!error) { return $ = cheerio.load(body) } else
{ console.log("We've encountered an error: " + error); }
});

// Fails with "geturl.then is not a function" because `geturl` is a Request
// object, not a thenable. To use promises, request() must be wrapped in
// `new Promise(...)` (or a deferred), as the answer below does with Q.
var shares = geturl.then( function() {
return $(".nb-shares").html();
})


but got the following error
geturl.then is not a function

Nix Nix
Answer

I took a stab at it. You need to install the q library and require it:

var Q = require('q');

//... where ever your function is
//start with an array of string urls
var urls = [ "http://Blablablabla.com", '...', '...'];

//store results in this array in the form:
//  {
//       url: url,
//       promise: <will be resolved when its done>,
//       share: 'code that you wanted'
//  }
var results = [];

//loop over each url and perform the request
urls.forEach(processUrl);

//kick off one scrape for `url` and record a {url, promise, share} entry
function processUrl(url) {
  //we use a deferred object so we can know when the request is done
  var deferred = Q.defer();

  //create a new result object and add it to results
  var result = {
    url: url,
    promise: deferred.promise
  };
  results.push(result);

  //perform the request
  request(url, function (error, response, body) {
    if (!error) {
      var $ = cheerio.load(body);
      var share = $(".theitemIwant").html();
      //store the extracted value BEFORE resolving, so code chained on the
      //promise is guaranteed to see result.share already populated
      result.share = share;
      //resolve the promise so we know this request is done;
      //consumers of the resolve value receive `share`
      deferred.resolve(share);
    } else {
      //request failed: reject the promise to abort the chain and fall
      //into the "catch" block below
      deferred.reject(error);
      console.log("We've encountered an error: " + error);
    }
  });
}

//results.map converts the result objects to just their promises.
//BUG FIX: the callback must RETURN the promise — without `return`,
//map() yields an array of undefined and Q.all resolves immediately,
//before any request has actually finished.
//Q.all waits for every promise; when all are done it calls the then/catch block.
Q.all(results.map(function (i) { return i.promise; }))
    .then(sendResponse) //when all promises are done it calls this
    .catch(sendError);  //if any promise fails it calls this

 function sendError(error){
   //NOTE(review): `res` must already be in scope (i.e. this code lives inside
   //a route handler) — confirm against the surrounding function
   res.status(500).json({failed: error});
 }
 function sendResponse(data){ //data = array of every resolved share value
  //process results and convert to your response:
  //sends the full results array ({url, promise, share} per input url)
  return res.send(results);
}