Leonardo Leonardo - 22 days ago 15
Node.js Question

Using Async With waterfall and Recursion in nodejs

I've created a script to migrate data from Dynamo to a Mysql DB.
First I was not using Async, but I started getting bottlenecks on the sql side, so I decided to "throttle" the dymano part using the async lib.
The problem: I have a recursion in the middle of the path, as long as dynamo has data I have to continue the process (ultra simple ETL), but I don't know how to perform the recursion inside the waterfall.
My code :

function main() {
async.waterfall([getMaxTimestamp, scanDynamoDB, printout, saveToMySQL], function(err, result) {
if(err) console.log(err)
console.log(result)
});
}

function getMaxTimestamp(callback) {
console.time("max query");
connection.query("SELECT MAX(created_at) as start_date from Tracking;", function(err, data) {
console.timeEnd("max query");
callback(err, data);
})
}

function scanDynamoDB(data, callback) {
if (data[0].start_date != null && data[0].start_date)
query.ExpressionAttributeValues[':v_ca'].N = data[0].start_date;

console.time("dynamo read");
dynamoDB.scan(query, function(err, data) {
console.timeEnd("dynamo read");
callback(err, data);
// if (!err) {
// if (data != undefined && data.Count > 0) {
// printout(data.Items) // Print out the subset of results.
// if (data.LastEvaluatedKey) { // Result is incomplete; there is more to come.
// query.ExclusiveStartKey = data.LastEvaluatedKey;
// scanDynamoDB(query);
// }
// } else {
// console.log('No fresh data found on Dynamo')
// } else console.dir(err);
});
};

function assembleSql() {
insertSql = "insert into Tracking (";
for (var i = 0; i < headers.length; i++) {
insertSql += headers[i];
if (i < headers.length - 1)
insertSql += ",";
}

insertSql += ") values ?;"
previousInsertSql = insertSql;
}

function saveToMySQL(items, callback) {
assembleSql();
//connection.connect();
console.time("insert sql")
connection.query(insertSql, [items], function(err, result) {
console.timeEnd("insert sql")
if (err){
callback(err, null)
return;
}

totalInserts += result.affectedRows;
callback(err, totalInserts)
//connection.end();
})
}

function printout(items, callback) {
var headersMap = {};
var values;
var header;
var value;

var out = [];

if (headers.length == 0) {
if (items.length > 0) {
for (var i = 0; i < items.length; i++) {
for (var key in items[i]) {
headersMap[key] = true;
}
}
}
for (var key in headersMap) {
headers.push(key);
}
}

for (index in items) {
values = [];
for (i = 0; i < headers.length; i++) {
value = "";
header = headers[i];
// Loop through the header rows, adding values if they exist
if (items[index].hasOwnProperty(header)) {
if (items[index][header].N) {
value = items[index][header].N;
} else if (items[index][header].S) {
value = items[index][header].S;
} else if (items[index][header].SS) {
value = items[index][header].SS.toString();
} else if (items[index][header].NS) {
value = items[index][header].NS.toString();
} else if (items[index][header].B) {
value = items[index][header].B.toString('base64');
} else if (items[index][header].M) {
value = JSON.stringify(items[index][header].M);
} else if (items[index][header].L) {
value = JSON.stringify(items[index][header].L);
} else if (items[index][header].BOOL !== undefined) {
value = items[index][header].BOOL.toString();
}
}
values.push(value)
}
out.push(values)
}
callback(null, out);
}
main();


The commented part is where the recursion happens, but I don't know where to place this inside my flow !

Any help would be appreciated !

Answer

Actually I was able to figure it out by myself.

async.whilst(function() { return canInsert}, function (callback){
          scanDynamoDB(query, callback)
        }, function(err, res) {}
function scanDynamoDB(data, callback) {
    console.time("dynamo read");

    dynamoDB.scan(query, function(err, data) {
        console.timeEnd("dynamo read");
        if (!err) {
            if (data != undefined && data.Count > 0) {
                canInsert = data.LastEvaluatedKey;
                if (data.LastEvaluatedKey) // Result is incomplete; there is more to come.
                    query.ExclusiveStartKey = data.LastEvaluatedKey;
            }
        } else console.dir(err);
    });
};

I could have done it just with a while(canInsert). Anyway, I avoided recursion and memory usage is way way lower.