Adrian Adrian - 3 months ago 20
Javascript Question

CasperJS - How to save some data for each page from an array?

I'm trying to crawl a website using CasperJS but I ran into a problem.

One the first page I collect the links I want to crawl and save them to an array using the

getLinks()
function - this works well.

Then I want to crawl each page from this array (I got this part working) and I want to grab some details from each of these pages.

My code as follows (trimmed the working stuff like casper start and login etc.):

// Casper start here, and some login stuff, all these are working fine so I removed them to have a light example

// ....
// ....
// ....

// Function for saving members to an array
function getLinks() {
var links = document.querySelectorAll(".member_name_and_title");
return Array.prototype.map.call(links, function(link) {
return link.childNodes[1].childNodes[1].href
});
}

casper.then(function() {
// Aggregate results
links = this.evaluate(getLinks);

casper.each(links, function (self, link) {
self.thenOpen(link, function () {

var details = this.evaluate(function(){
document.getElementsByClassName('member_name')[0].textContent;
});

// Grab details for each member
var data = details + " - " + link;

// Save data
var fs = require('fs');
fs.write('results/output.txt', JSON.stringify(data, null, ' '), 'aw');

});
});

});

// Casper run
casper.run(function() {
this.exit();
});


The problem is that the
details
var will return null, so the final
output.txt
would be something like:

"null - domain.com/link1"
"null - domain.com/link2"
"null - domain.com/link3"
"null - domain.com/link4"
"null - domain.com/link5"


The
link
var is working fine but the
details
var is returning null.

When I go to any of the urls from the array (example: domain.com/link1) and run
document.getElementsByClassName('member_name')[0].textContent
in the browser console it returns the value correctly so I'm sure the targeting is fine.

I'm not sure what I'm missing or what I'm doing wrong. Any help would be much appreciated. Thanks!

Answer

Ok I figured this out in the end, totally rookie mistake ... The query was correct the problem was caused by the page load, or better to say the links from the array were not loaded before the actual query was ran.

To test this I used captureSelection() inside the self.thenOpen function to capture the state of the page when it's open, but right before the data is collected.

this.captureSelector('1.jpg', '#page');

I immediately noticed that the page was not fully loaded, hence the return document.querySelector('.member_name.').textContent; returning null.

To fix this I've added a 1.5s wait time, as follows:

casper.wait(1500, function() {
  var details = this.evaluate(function(){
    return document.querySelector('.member_name').textContent;
  });
});

Rookie mistake but might help someone else in the future.