holographix holographix - 6 months ago 33
Node.js Question

Batch screenshots with phantom for nodejs

I am desperately trying to render something like 200 screenshots in a single batch.
My first attempt was to follow the guidelines, with a simple script invoked 200 times:

// Render one screenshot of `url` into the file `fname` using a dedicated
// PhantomJS process, then report success through `callback`.
// NOTE(review): `phantom`, `url`, `fname`, `callback`, `phInstance` and
// `sitepage` are defined by the surrounding script, which is not shown here.
phantom.create()
  .then(function(instance) {
    console.log("1 - instance")
    phInstance = instance;
    return instance.createPage();
  })
  .then(function(page) {
    console.log("2 - page")
    sitepage = page;
    return page.open(url);
  })
  .then(function(status) {
    console.log("3 - render")
    if (status !== 'success') {
      // Rendering still proceeds, but the image is likely to be blank.
      console.warn("page did not load cleanly (status: " + status + "): " + url);
    }
    // Every inner promise is returned so the steps stay on one flat chain
    // and any rejection reaches the .catch() at the end.
    return sitepage.property('clipRect', {top: 0, left: 0, width:3000,height:890});
  })
  .then(function() {
    return sitepage.render(fname);
  })
  .then(function(finished) {
    console.log("\t\t\t---> finished");
    sitepage.close();
    // Exiting the instance is what shuts down the PhantomJS child process.
    // There is deliberately no `phantom.exit()` here: that is the global API
    // available inside a PhantomJS script, not a function of the Node module.
    phInstance.exit();
    callback({msg: 'ok'})
  })
  .catch(function(err) {
    console.error("screenshot failed:", err);
    // Do not leave an orphaned PhantomJS process behind on failure.
    if (typeof phInstance !== 'undefined' && phInstance) {
      phInstance.exit();
    }
    // NOTE(review): `callback` is not invoked on failure because its error
    // contract is not visible here — confirm how callers expect to be told.
  });


This approach kind of works, but it is really overwhelming for the CPU.
The problem is that doing things this way spawns 200 phantom processes, which quickly eat up all the memory.

A more efficient way of doing so would be to create a single phantom instance and then drive it to open
one page at a time and render it, something that could be done with a phantom script, like so:

// PhantomJS script setup: creates the page object, configures it, and loads
// the list of sites from "sites.txt" into `lines`.
var content, counter, f, fs, grab_screen, img, lines, next_screen, page, system, url;
page = require('webpage').create();
system = require('system');
fs = require('fs');
content = '';
lines = [];
url = '';
img = '';
counter = 0; // index into `lines` of the next site to capture

page.viewportSize = {
  width: 1200,
  height: 800
};

page.settings.userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.71 Safari/537.36';

// Read the whole site list, then release the file handle immediately.
f = fs.open("sites.txt", "r");
try {
  content = f.read();
} finally {
  f.close();
}

// Accept both LF and CRLF line endings, and drop blank lines (typically the
// trailing newline) so no request is ever made for "http:///".
lines = content.split(/\r?\n/)
  .map(function(line) {
    return line.trim();
  })
  .filter(function(line) {
    return line.length > 0;
  });

// Open the site at lines[counter] and save a screenshot of it under
// screens/<site>.png, then advance `counter` and hand over to next_screen().
// Returns whatever page.open() returns.
grab_screen = function() {
  var site;
  site = lines[counter];
  url = 'http://' + site + '/';
  img = 'screens/' + site + '.png';
  console.log("Grabbing screen for: " + url);
  return page.open(url, function(status) {
    if (status !== 'success') {
      // A failed load would only yield a blank or stale image, so skip the
      // render and move straight on to the next site.
      console.log("Failed to load: " + url + " (status: " + status + ")");
      counter++;
      return next_screen();
    }
    // Short delay before rendering, kept from the original script.
    return window.setTimeout(function() {
      page.render(img);
      counter++;
      return next_screen();
    }, 200);
  });
};

// Step the batch forward: capture the site at the current `counter`, or quit
// PhantomJS once every entry in `lines` has been handled.
next_screen = function() {
  console.log("On to " + counter + " ...");
  // Guard clause: nothing left to capture.
  if (counter >= lines.length) {
    return phantom.exit();
  }
  return grab_screen();
};

// Start the batch with the first site.
next_screen();


So I was wondering how to achieve the same thing with phantomjs-node.

Answer

I finally solved my problem with two things:

  1. Realizing that Node.js is NOT multithreaded.
  2. Using a single instance of phantom, to render multiple urls.

here's how it came out:

  /**
   * Render the screenshot for shots[id] with the shared PhantomJS instance,
   * then continue with shots[id + 1]. After the last shot it calls
   * cb("files_written", {}) followed by generatePDF().
   *
   * Shots are processed strictly one after another, so only one page is open
   * at any time.
   *
   * @param {number} id - index into `shots` of the screenshot to take.
   * @returns {Promise} settles once this shot and all following ones have
   *   been processed, or once a failure has been logged.
   */
  var webshot = function(id) {
      var shot = shots[id];
      // Kept local so no page handle is shared between invocations.
      var page = null;

      if (!shot) {
          console.error("webshot: no shot at index %s", id);
          return Promise.resolve();
      }

      console.log('makeshot ', shot.url);
      // Push this shot's options to the local service before the page is opened.
      requestSync("POST", "http://localhost:4041/options/set", { json:{ opts:JSON.stringify(shot.options) } });

      // Every step returns its promise so the chain stays flat and any
      // rejection ends up in the single .catch() at the bottom.
      return phInstance.createPage().then(function(_page) {
          console.log("2 - page")
          page = _page;
          return page.open(shot.url);
      })
      .then(function(status) {
          console.log("3 - render %s / %s", id, shots.length);
          return page.property('clipRect', {top: 0, left: 0, width:1500,height:220});
      })
      .then(function() {
          return page.render(shot.fname);
      })
      .then(function(finished) {
          console.log("\t\t\t---> finished");
          return page.close();
      })
      .then(function() {
          page = null;
          // Explicit radix so pack_id is always parsed as a decimal number.
          fnames[Math.ceil(parseInt(shot.options.pack_id, 10)/mt_per_snap)-1] = "localhost_" + shot.options.pack_id + ".png";
          if (id < shots.length - 1) {
              return webshot(id + 1);
          }
          console.log("all done: %s files has been written", shots.length);
          // invoke pdf generation for the pdf page
          cb("files_written", {  });
          generatePDF();
      })
      .catch(function(err) {
          console.error("webshot failed at shot %s:", id, err);
          // Release the page if the failure happened while it was still open.
          if (page) {
              page.close();
          }
      });
  }

So, long story short: I have put the page I wanted to render in a separate script, which I feed with variables before making each shot; this solves the "multithreading problem". Beyond that, I have a single variable named phInstance, which is initialized as follows:

  /**
   * Start the single PhantomJS process and store it in `phInstance`.
   *
   * @returns {Promise} resolves with the created instance once `phInstance`
   *   has been set; rejects if PhantomJS could not be started, so callers
   *   can wait for readiness and handle start-up failures.
   */
  var initPhantom = function() {
    return phantom.create()
        .then(function(instance) {
            console.log("1 - instance")
            phInstance = instance;
            return instance;
        });
  }

Remember to kill the phantom instance (phInstance.exit()) once you're done; otherwise it will stay there and keep consuming your resources.

Comments