Jack Wolther Jack Wolther - 3 months ago 14
Node.js Question

Forum title web scraper

I'm writing a simple web scraper that pulls post titles, usernames and last-post times from a forum.

The problem is that the scraper only pulls the last entry in the table.

For example:
If the table is structured in this way:



<tbody>
<tr class="">
<td class="title">
<a href="/forums/marketplace/8827" title="View full post details">Title number 1</a>
</td>
<td class="author"><a href="/members/pursu" title="View member, pursu">pursu</a></td>
<td class="count">0</td>
<td class="last_post">9 minutes ago</td>
</tr>
<tr class="color2">
<td class="title">

<a href="/forums/marketplace/8826" title="View full post details">Title number 2</a>
</td>
<td class="author"><a href="/members/colinatx" title="View member, colinatx">colinatx</a></td>
<td class="count">0</td>
<td class="last_post">9 minutes ago</td>
</tr>
<tr class="">
<td class="title">
<a href="/forums/marketplace/8785" title="View full post details">Title number 3</a>
</td>
<td class="author"><a href="/members/Object117" title="View member, Object117">Object117</a></td>
<td class="count">11</td>
<td class="last_post">about 1 hour ago</td>
</tr>
</tbody>





The result that will be written into the .json output file is this:



{
"title": "Title number 3",
"author": "Object117",
"lastpost": "about 1 hour ago"
}





Instead, it should be like this:



{
"title": "Title number 1",
"author": "pursu",
"lastpost": "9 minutes ago"
}
{
"title": "Title number 2",
"author": "colinatx",
"lastpost": "9 minutes ago"
}
{
"title": "Title number 3",
"author": "Object117",
"lastpost": "about 1 hour ago"
}





My JavaScript:



var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

// GET /scrape: fetches the forum page, extracts title / author / last post
// time into a single `json` object, writes that object to output.json and
// replies with a short text message.
app.get('/scrape', function(req, res){

//This is the URL to pull data from
// (assigned without var/let/const, so this creates an implicit global)
url = 'http://www.pedalroom.com/forums/marketplace';

// The first parameter is our URL

// The callback function takes 3 parameters: an error, the response object and the html body
request(url, function(error, response, html){
if(!error){

//pulling HTML
var $ = cheerio.load(html);

//Variables that capture data
var title, author, lastpost;
// One single result object is shared by every callback below.
var json = { title : "", author : "", lastpost : ""};

// The callback runs once for every element with class "title". Each run
// overwrites json.title, so after the last run only the value taken from
// the last matching element remains. The return value of filter() is unused.
$('.title').filter(function(){

var data = $(this);

// Text of the first child of the cell (the <a> link in the sample markup).
title = data.children().first().text();

json.title = title;
})
// Same pattern for elements with class "author": json.author is overwritten
// on every run, so only the last match is kept.
$('.author').filter(function(){

var data = $(this);

author = data.children().first().text();

json.author = author;
})
// Same pattern for elements with class "last_post"; here the cell's own
// text is used rather than the text of its first child.
$('.last_post').filter(function(){

var data = $(this);

lastpost = data.text();

json.lastpost = lastpost;
})
}
// `json` is only assigned inside the if-block above, so when the request
// failed it is still undefined at this point.
// The write callback ignores `err` and logs success unconditionally.
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err){

console.log('File successfully written! - Check your project directory for the output.json file');

})

// Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
// (This is sent right after the write is started, without waiting for it to finish.)
res.send('Check your console!')

});
})

// Start the server on port 8081, log a startup message, and export the app.
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;





Is it that I need to somehow loop the code or perhaps something else?
Thanks!

Off Off
Answer

In your code, every `.filter()` callback overwrites the same `json` object, so only the values from the last row survive. You need to loop over the table rows and build one object per row.

Here is the working code:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app     = express();

/**
 * GET /scrape
 *
 * Downloads the forum page, builds one record per topic row
 * ({ title, author, lastpost }), writes the records to output.json as a
 * JSON array and then answers the HTTP request.
 *
 * Responses:
 *   200 - the page was fetched and output.json was written
 *   502 - the forum page could not be fetched (network error or non-2xx status)
 *   500 - output.json could not be written
 */
app.get('/scrape', function(req, res){

    // This is the URL to pull data from. Declared with `var` so it stays
    // local to the handler instead of leaking an implicit global.
    var url = 'http://www.pedalroom.com/forums/marketplace';

    // The callback receives an error (if any), the response object and the html body
    request(url, function(error, response, html){

        // Stop early when the page could not be fetched; otherwise we would
        // parse an error page (or nothing) and overwrite output.json with it.
        if(error || !response || response.statusCode < 200 || response.statusCode >= 300){
            console.error('Could not fetch ' + url + ':', error || ('HTTP ' + (response && response.statusCode)));
            res.status(502).send('Could not fetch the forum page.');
            return;
        }

        //pulling HTML
        var $ = cheerio.load(html);

        // One entry per topic row, in page order
        var data = [];

        // For each table row inside an element with class "topics"
        // (NOTE(review): selector assumes the live page wraps the table in
        // `.topics` - confirm against the actual markup)
        $('.topics tr').each(function(index, element){
            var row = $(element);
            var titleLink = row.find('.title a');

            // Rows without a title link carry no topic; skip them
            if(titleLink.length === 0){
                return;
            }

            // Use text() rather than html() so the output holds plain text
            // (decoded entities, no nested markup); trim surrounding whitespace.
            data.push({
                title: titleLink.first().text().trim(),
                author: row.find('.author a').first().text().trim(),
                lastpost: row.find('.last_post').first().text().trim()
            });
        });

        fs.writeFile('output.json', JSON.stringify(data, null, 4), function(err){

            // Report a failed write instead of claiming success
            if(err){
                console.error('Could not write output.json:', err);
                res.status(500).send('Could not write output.json.');
                return;
            }

            console.log('File successfully written! - Check your project directory for the output.json file');

            // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
            // Sent only after the write has finished, so the message is accurate.
            res.send('Check your console!');
        });
    });
});

// Start the HTTP server on port 8081 and announce it on the console.
app.listen('8081');
console.log('Magic happens on port 8081');

// Publish the Express app as this module's export and keep the local
// `exports` alias pointing at the same object.
module.exports = app;
exports = module.exports;
Comments