FXAMN FXAMN - 1 year ago 75
Node.js Question

Using pdftotext, antiword, catdoc, etc using string stored in memory

Is it possible to call CLIs like pdftotext, antiword, or catdoc (text-extraction tools), passing a string held in memory instead of a file?

WARNING: This is just a proof of concept!

Currently I'm reading PDF files by calling pdftotext with child_process.spawn: I spawn a new process and store the result in a new variable. Everything is working fine.

Now I want to test whether I can pass the binary read by fs.readFile instead of a file path:

fs.readFile('./my-pdf', (error, binary) => {
// call pdftotext with child_process.spawn passing the binary

let event = child_process.spawn('pdftotext', [
//args here!

So, is it possible? If so, how can I do it?

Thanks in advance! :)

Answer Source

It's definitely possible.

spawn returns a ChildProcess object; you can pass an in-memory string (or binary) to it by writing to its stdin. The string should first be converted to a ReadableStream, which you can then pipe into the CLI's stdin.

createReadStream creates a ReadableStream from a file.

The following example downloads a PDF file, pipes its content to pdftotext, and then shows the first few bytes of the result.

// URL of the sample PDF used by this example.
const source = 'http://static.googleusercontent.com/media/research.google.com/en//archive/gfs-sosp2003.pdf'
const http = require('http')
const spawn = require('child_process').spawn

// NOTE(review): the head of this promise chain is missing from this excerpt —
// presumably something like download(source).then(pdftotext); confirm against
// the original answer. The handler below logs only result.slice(0, 77).
.then(result => console.log(result.slice(0, 77)))

// Starts an HTTP GET for `url` and returns a promise that resolves with the
// argument http.get passes to its callback (the response). Only `resolve` is
// wired up here, so request errors are not reported through the promise.
function download(url) {
  return new Promise(resolve => http.get(url, resolve))

// Runs the pdftotext CLI and returns a promise for everything it writes to
// stdout, as a single string.
// NOTE(review): `binaryStream` is not used in the lines visible here —
// presumably it is piped into command.stdin further down; confirm.
function pdftotext(binaryStream) {
  // '-' for both file arguments: read input from stdin and write to stdout
  const command = spawn('pdftotext', ['-', '-'])

  return new Promise(resolve => {
    // Collect each stdout chunk as a string; once stdout ends, resolve with
    // the chunks joined together.
    const result = []
    command.stdout.on('data', chunk => result.push(chunk.toString()))
    command.stdout.on('end', () => resolve(result.join('')))

For CLIs that have no option to read from stdin, you can use named pipes.

Edit: Add another example with named pipes.

Once the named pipes are created, you can use them like files. The following example creates temporary named pipes to send input and receive output, and shows the first few bytes of the result.

const fs = require('fs')
const spawn = require('child_process').spawn

// NOTE(review): the opening of this call is missing from this excerpt —
// presumably `pipeCommand({`; confirm against the original answer.
// `name` is the command to spawn and `input` is a read stream over the
// document to convert. The handler logs only result.slice(0, 77).
  name: 'wvText',
  input: fs.createReadStream('document.doc'),
}).then(result => console.log(result.slice(0, 77)))

// Spawns `mkfifo <name>` and returns a promise that resolves (with no value)
// when that process exits. The exit code is not inspected, so the promise
// resolves whether or not mkfifo succeeded.
function createPipe(name) {
  return new Promise(resolve =>
    spawn('mkfifo', [name]).on('exit', () => resolve()))

function pipeCommand({name, input}) {
  const inpipe = 'input.pipe'
  const outpipe = 'output.pipe'
  return Promise.all([inpipe, outpipe].map(createPipe)).then(() => {
    const result = []
    .on('data', chunk => result.push(chunk.toString()) )

    const command = spawn(name, [inpipe, outpipe])
    return new Promise(resolve =>
      command.on('exit', () => {
        [inpipe, outpipe].forEach(name => fs.unlink(name))