Ales Maticic Ales Maticic - 5 months ago 224
Node.js Question

NodeJS x-ray web-scraper: how to follow links and get content from sub page

I am trying to scrape some content with the Node.js x-ray scraping framework. While I can get the content from a single page, I can't get my head around how to follow links and get content from a subpage in one go.

There is a sample on the x-ray GitHub page, but it returns empty data if I change the code to point at some other site.

I have simplified my code and made it crawl the SO questions for this sample.

The following works fine:

var Xray = require('x-ray');
var x = Xray();

// Page being scraped: one Stack Overflow question.
var questionUrl = 'http://stackoverflow.com/questions/9202531/minimizing-nexpectation-for-a-custom-distribution-in-mathematica';

// Fields collected inside the '#content' scope.
var questionFields = {
  title: '#question-header h1',
  question: '.question .post-text'
};

/**
 * Node-style callback: logs the error argument, then the scraped data.
 */
function printResult(err, obj) {
  console.log(err);
  console.log(obj);
}

// x(url, scope, selector) returns a function, which is invoked here with
// the callback that receives (err, obj).
x(questionUrl, '#content', [questionFields])(printResult);


This also works:

var Xray = require('x-ray');
var x = Xray();

// Nested x() call used as a field value: it is given the 'h3 a@href'
// selector of each summary and a single selector for the linked page.
var questionBody = x('h3 a@href', '#content .question .post-text');

// Fields collected for every element matched by the scope selector.
var summaryFields = {
  title: 'h3',
  question: questionBody
};

/**
 * Node-style callback: logs the error argument, then the scraped data.
 */
function printResult(err, obj) {
  console.log(err);
  console.log(obj);
}

// x(url, scope, selector) returns a function, which is invoked here with
// the callback that receives (err, obj).
x('http://stackoverflow.com/questions', '#questions .question-summary .summary', [summaryFields])(printResult);


But this gives me an empty `details` result, and I can't figure out what is wrong:

// Variant that the question reports as returning an empty `details` result.
var Xray = require('x-ray');
var x = Xray();

// Outer call: start URL, a scope selector, and an array-wrapped object of fields.
x('http://stackoverflow.com/questions', '#questions .question-summary .summary', [{

title: 'h3',
link: 'h3 a@href',
// Nested x() call intended to follow each question link and scrape the
// sub-page; here it is given a '#content' scope plus an array-wrapped
// object selector.
details: x('h3 a@href', '#content', [{
title: 'h1',
question: '.question .post-text',
}])

}])
// The value returned by x(...) is invoked immediately with a callback.
(function(err, obj) {

// Log the error argument first, then the scraped data.
console.log(err);
console.log(obj);

})


I would like my spider to crawl the page with listed questions and then follow the link to each question and retrieve additional information.

Answer

With some help I figured out what the problem was. I am posting this answer in case somebody else has the same problem.

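Working example (the nested `x()` call now takes a plain object selector instead of the `'#content'` scope plus an array):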

var Xray = require('x-ray');
var x = Xray();

// Nested x() call used as the `details` field: it receives the
// 'h3 a@href' selector and a plain object selector (no scope, no array).
var questionDetails = x('h3 a@href', {
  title: 'h1',
  question: '.question .post-text'
});

// Fields collected for every element matched by the scope selector.
var summaryFields = {
  title: 'h3',
  link: 'h3 a@href',
  details: questionDetails
};

/**
 * Node-style callback: logs the error argument, then the scraped data.
 */
function printResult(err, obj) {
  console.log(err);
  console.log(obj);
}

// x(url, scope, selector) returns a function, which is invoked here with
// the callback that receives (err, obj).
x('http://stackoverflow.com/questions', '#questions .question-summary .summary', [summaryFields])(printResult);
Comments