Francis Bongiovanni Francis Bongiovanni - 4 months ago 98
Javascript Question

NightmareJS - Web Crawler needs to iterate over JSON data

I'm building a web crawler that will cover 200+ sites. The current code I have runs on top of an external JSON file I've built of a dozen sites. Sample:

[
{
"company": "My Company",
"url": "http://example.com/jobs/",
"query": "div.job-listings>dt a",
"link": "div.job-listings>dt a"
},
{
"company": "Another Company",
"url": "http://anothercompany.com/careers/",
"query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a",
"link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a"
}
]


When I tried async.each it would log all of the original objects at the top of the function before attempting to enter the nightmare instance then return
error Nothing responds to "goto"
. Then I tried async.eachSeries, which prints out the correct result but stops after the first iteration.

var async = require ('async');
var data = require('./input.json')
var Nightmare = require('nightmare');
var nightmare = Nightmare({ show: false })

function crawl(data, cb) {
console.log(data) // When async.each will iterate all items then error
var nightmare = new Nightmare()
nightmare
.goto(data.url) // go to JSON specified url
.wait(data.query) // wait until CSS selector loads
.evaluate(function (data) {
positionsArr = []
obj = {}
obj.company = data.company
query = document.querySelectorAll(data.query)
link = document.querySelectorAll(data.link)
/* Set query and link equal to all elements with selector
itearte through appending text (innerText) from each element
with job url to obj*/
var i;
for (i = 0; i < query.length; i++) {
positionsObj = {}
positionsObj.title = query[i].innerText.trim()
// if each position has individual page
if (data.link !== null) {
positionsObj.url = link[i].href
} else {
positionsObj.url = data.url
}
positionsArr.push(positionsObj)
}
obj.positions = positionsArr
return obj
}, data)
.end()
.then(function (obj) {
console.log(obj)
console.log('done')
})
.catch(function (error) {
console.error('error', error);
});
}


async.eachSeries(data, crawl, function (err){
console.log('done!');
})


How can I have this work without having to write an individual file for each? Or is there a better way of crawling this amount of sites?

Source code

Answer

You have to use the callback (cb) if you want to execute the second step and so on:

.end()
.then(function (obj) {
    console.log(obj);
    console.log('done');
    cb();
})
.catch(function (error) {
    console.error('error', error);
    cb(error);
});