xtr33me xtr33me - 3 months ago 25
Node.js Question

ECONNRESET when using async to scrape archive

I am currently trying to get some article data for a TensorFlow implementation I'm working on. The first source I am trying to pull from is Buzzfeed. I'm using the node.js async.js library to scrape the page, but I keep getting an ECONNRESET error from their server after about 200 or so articles.

Now when I am running this, it does seem to make requests pretty quickly, but this was why I decided to use the async.eachSeries due to the delay it applies. I'm continuing to look at this, but if anyone sees something I am doing wrong or perhaps a better way of accomplishing this, I would love to hear what you have to say. Thanks!

const scraper = testHelperGetScraper();

// scrape links to each news in buzz archive page
const archiveUrl = 'http://www.buzzfeed.com/archive';
let fileNum = -1;

// Fetch the list of archive date URLs, then scrape each year's archive in series.
scraper.scrapeArchiveDates(archiveUrl, function(err, dates){
// NOTE(review): `err` is never checked here — a failed scrape would leave `dates` undefined.
async.eachSeries(dates.yearUrl, function(url, cb){
async.waterfall([
function(callback){
// Step 1: collect the article URLs for this archive page.
scraper.scrapeArchive(url, callback);
},
function(urlArr, callback){
// BUG(review): this loop invokes the waterfall `callback` once PER article,
// but async.waterfall expects each step's callback to fire exactly once.
// Every extra call re-triggers the next step, so article requests fan out
// unthrottled — the likely cause of the server resetting the connection
// (ECONNRESET).
for (var i in urlArr.url){
if (typeof (urlArr.url[i]) === 'string'){
scraper.scrapeArticle(urlArr.url[i], callback);
}
}
} ,function(res, callback){
// BUG(review): `cb()` tells eachSeries to move on to the next year while
// this year's article requests are still in flight, compounding the flood.
cb();
callback(null, res);
}
],
function(err, buzzresult) {
if (err){
_logger.error('Error while inserting test data', err);
// NOTE(review): execution falls through after an error, so `buzzresult`
// may be undefined when dereferenced below.
}

// Derive a stable numeric id from the article title for the output filename.
buzzresult.uniqueId = UniqueHelper.getUIntId({
text: buzzresult.title
, options: {
ignoreCase: true
, strip: ['urls', 'hashtags', 'users', 'quotes', 'punct']
}
});

let fileFullPath = _rootFolderWhereYouWantToSave
+ 'rawdata_' + buzzresult.uniqueId + '.txt';
//_logger.debug('fileFullPath', fileFullPath);

// File contents: "<title>\n<body>\n".
let finalDataWritten = buzzresult.title + os.EOL + buzzresult.body + os.EOL;

writeArticleFile(fileFullPath, finalDataWritten);
//console.log('Finsihed calling WriteArticleFile: ', finalDataWritten);
_counter += 1;
console.log('NumArticlesWritten: ', _counter);
});

}, function (error) {
if (error) {
throw error;
}
console.log('Finished!');
});
});

Answer

I simplified your code a little.
You can use named functions to make the code more readable.

const scraper = testHelperGetScraper();
const archiveUrl = 'http://www.buzzfeed.com/archive';
let fileNum = -1;

scraper.scrapeArchiveDates(archiveUrl, function(err, dates){
    if (err)
        throw err;
    // eachSeries scrapes one date at a time, which keeps the request rate low
    // enough to avoid the ECONNRESET flood.
    async.eachSeries(dates, parseDate, function(err) {
        if (err)
            throw err;
        console.log('Finished!');
    });
});

/**
 * Scrape one archive date: fetch its article URLs, then scrape each article
 * in series. Calls `callback(err)` once all articles are done (or on the
 * first failure).
 */
function parseDate(date, callback) {
    scraper.scrapeArchive(date.yearUrl, function(err, urls) {
        if (err)
            return callback(err);

        urls = urls.filter((url) => typeof url === 'string');
        // async.each's final callback receives only `err` — per-article results
        // must be handled inside the iterator, so wrap scrapeArticle here.
        // eachSeries (not each) keeps articles sequential for the same
        // rate-limiting reason as above.
        async.eachSeries(urls, function(url, next) {
            scraper.scrapeArticle(url, function(err, buzzresult) {
                if (err) {
                    _logger.error('Error while inserting test data', err);
                    return next(err);
                }

                // ... process buzzresult here (build uniqueId, write file, etc.)
                next();
            });
        }, callback);
    });
}
Comments