Miha Šušteršič Miha Šušteršič - 17 days ago 5
Node.js Question

Javascript - .map running out of memory

My libraries:

const Promise = require('bluebird');
const fs = Promise.promisifyAll(require('graceful-fs'));
const path = require('path');
const xml2js = Promise.promisifyAll(require('xml2js'));


I have a large number of XML files I want to parse. I am able to create an array of paths to all the files using this function:

// Collect the paths of all .XML files that live one directory level
// below rootPath. Resolves to a flat array of path strings.
// NOTE(review): .map/.filter/.reduce below are bluebird's Promise
// methods (the file promisifies with bluebird), not Array methods.
function getFileNames(rootPath) {
// Read content of path
return fs.readdirAsync(rootPath)
// Keep only the entries that are directories
.then(function(content) {
return content.filter(function(file) {
// Synchronous stat per entry — blocks the event loop, but keeps the filter simple
return fs.statSync(path.join(rootPath, file)).isDirectory();
});
})
// For every directory (bluebird Promise#map — runs with unbounded concurrency by default)
.map(function(directory) {
// Save current path
let currentPath = path.join(rootPath, directory);
// Read files in the directory
return fs.readdirAsync(currentPath)
// Filter out everything but the XMLs (extension match is case-sensitive: '.XML' only)
.filter(function(file) {
return path.extname(file) === '.XML';
})
// Return full path to each file
.map(function(file) {
return path.join(rootPath, directory, file);
});
})
// Flatten the array-of-arrays into one array
// NOTE(review): no initial value is passed to reduce — behavior on an
// empty directory list depends on bluebird's reduce semantics; verify.
.reduce(function(a, b) {
return a.concat(b);
});
}


and now I want to go trough every single file and parse it.

I have 2 function to do so:

// Read a file and resolve with its raw contents.
// The original chained `.then(fileData => fileData)`, which is a no-op:
// the promise returned by readFileAsync can be returned directly.
function openFile(filePath) {
  return fs.readFileAsync('./' + filePath);
}

// Parse an XML buffer/string and resolve with the resulting object.
// The original chained `.then(xmlObject => xmlObject)`, which is a no-op:
// the promise returned by parseStringAsync can be returned directly.
function parseFile(data) {
  return xml2js.parseStringAsync(data);
}


Now when I call this with the .map function (the
getFileNames
function outputs an array of over 20k file-path strings):

// NOTE(review): this is the code the question is about — it kicks off an
// openFile/parseFile chain for EVERY path at once (bluebird Promise#map
// with unbounded concurrency), so ~20k files are read and parsed
// simultaneously, which is what exhausts the heap. The inner promise is
// also not returned, so .map cannot even track completion.
getFileNames('./XML')
.map(function(file) {
openFile(file)
.then(function(data) {
parseFile(data)
.then(function(object) {
console.log(object);
});
});
});


I get a javascript heap out of memory error:


FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap
out of memory


But when I run the function a single time by passing in the path to the actual file:

// Processing a single file works fine — only one file is ever held in
// memory at a time, so the heap limit is never reached.
openFile('./XML/2016-10-1/EUROTIPOLD2016-10-1T00-00-22.5756240530.XML')
.then(function(data) {
parseFile(data)
.then(function(object) {
console.log(object);
});
});


I get the desired output.

What am I doing wrong?

Answer

Iterating over ~20k files this way happens asynchronously.

1) You're getting list of files

2) By doing .map you're calling openFile and parseFile — async functions whose reading and parsing take time — for every file at once.


So, because of this asynchronicity, the code proceeds to the next file without waiting for the previous one to finish — the garbage collector never gets a chance to free the memory already used, and that is where the insufficient-memory problem comes from.

Think about reading 20K files with different sizes at once.


So here is solution:

Use the async library to process the files strictly one at a time (eachSeries) or with a bounded level of concurrency (eachLimit).

const async = require('async'); // install: npm i --save async

// getFileNames returns a Promise, not an array — the original answer
// passed that Promise straight to async.eachSeries. Wait for it first.
getFileNames('./XML').then(function (files) {
  // Use async.eachLimit(files, 3, ...) instead to allow bounded parallelism.
  async.eachSeries(
    files,
    function (file, next) {
      openFile(file)
        .then(parseFile)
        .then(function (object) {
          // Successfully parsed file: log it and move on to the next one.
          console.log(object);
          next();
        })
        // A single .catch handles both open and parse failures; the
        // original called next() twice on an open error (once in the
        // rejection handler, once in the following success handler),
        // which async treats as "callback already called".
        .catch(function (err) {
          console.error('Cannot process file:', file, err);
          next(); // pass next(err) instead to abort the whole iteration
        });
    },
    // Completion callback: runs once every file has been handled.
    function (err) {
      if (err) {
        console.error('Iteration aborted:', err);
      } else {
        console.log('All files processed.');
      }
    }
  );
});

p.s. feel free to comment and fix code issue in my answer.