FXAMN FXAMN - 3 months ago 20
Node.js Question

Passing string stored in memory to pdftotext, antiword, catdoc, etc

Is it possible to call CLI tools like pdftotext, antiword, catdoc (text extractor scripts) passing a string instead of a file?

Currently, I read PDF files by calling pdftotext with child_process.spawn. I spawn a new process and store the result in a new variable. Everything works fine.

I’d like to pass the binary obtained from fs.readFile instead of the file itself:

// Load the PDF into memory; the goal is to hand `binary` to pdftotext
// rather than giving it a file path.
fs.readFile('./my.pdf', (error, binary) => {
  const pdftotextArgs = [
    // Args here!
  ];
  // Call pdftotext with child_process.spawn passing the binary.
  const child = child_process.spawn('pdftotext', pdftotextArgs);
});


How can I do that?

Answer

It's definitely possible, if the command can handle piped input.

spawn returns a ChildProcess object; you can pass the in-memory string (or binary) to it by writing to its stdin. Convert the data to a ReadableStream first, then pipe that stream into the stdin of the CLI.

createReadStream creates a ReadableStream from a file.

The following example downloads a PDF file, pipes its content to pdftotext, and then shows the first few bytes of the result.

const source = 'http://static.googleusercontent.com/media/research.google.com/en//archive/gfs-sosp2003.pdf'
const http = require('http')
const spawn = require('child_process').spawn

// Download the PDF, run it through pdftotext, and print the first 77
// characters of the extracted text. The trailing catch reports any failure
// in the chain instead of leaving the rejection unhandled.
download(source)
  .then(pdftotext)
  .then(result => console.log(result.slice(0, 77)))
  .catch(error => console.error(error))

/**
 * Start an HTTP GET request for `url`.
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<http.IncomingMessage>} Resolves with the response stream
 *   as soon as the response arrives (the body is not read here); rejects if
 *   the request emits 'error'.
 */
function download(url) {
  return new Promise((resolve, reject) => {
    // Without an 'error' listener a failed request (refused connection, DNS
    // failure, ...) is thrown as an unhandled 'error' event and the promise
    // would never settle.
    http.get(url, resolve).on('error', reject)
  })
}

/**
 * Extract the text of a PDF by streaming it through the `pdftotext` CLI.
 *
 * @param {stream.Readable} binaryStream - Readable stream of the PDF bytes.
 * @returns {Promise<string>} Resolves with everything pdftotext wrote to
 *   stdout once the process has closed with exit code 0. Rejects if the
 *   process cannot be started, exits with a non-zero code, or if reading the
 *   input / writing to the process fails.
 */
function pdftotext(binaryStream) {
  //read input from stdin and write to stdout
  const command = spawn('pdftotext', ['-', '-'])

  return new Promise((resolve, reject) => {
    const result = []

    // Spawn failures (e.g. pdftotext is not installed) are reported through
    // the child's 'error' event rather than thrown.
    command.on('error', reject)
    // If the child goes away before consuming all input, the write side of
    // the pipe fails; surface that instead of an unhandled 'error' event.
    command.stdin.on('error', reject)
    // pipe() does not end the destination when the source errors, so stop
    // the child explicitly and report the failure.
    binaryStream.on('error', error => {
      command.kill()
      reject(error)
    })

    // Let the stream decode the bytes so a multi-byte character split across
    // two chunks is not corrupted by per-chunk toString().
    command.stdout.setEncoding('utf8')
    command.stdout.on('data', chunk => result.push(chunk))

    // 'close' fires after the process has ended and its stdio streams have
    // been closed, so all output has been collected by then.
    command.on('close', code => {
      if (code === 0) {
        resolve(result.join(''))
      } else {
        reject(new Error(`pdftotext exited with code ${code}`))
      }
    })

    binaryStream.pipe(command.stdin)
  })
}

For CLIs that have no option to read from stdin, you can use named pipes.

Edit: Add another example with named pipes.

Once the named pipes are created, you can use them like files. The following example creates temporary named pipes to send the input and receive the output, and shows the first few bytes of the result.

const fs = require('fs')
const spawn = require('child_process').spawn

// Feed document.doc to wvText through pipeCommand and print the first 77
// characters of the result. The trailing catch reports any failure instead
// of leaving the rejection unhandled.
pipeCommand({
  name: 'wvText',
  input: fs.createReadStream('document.doc'),
})
  .then(result => console.log(result.slice(0, 77)))
  .catch(error => console.error(error))

/**
 * Create a named pipe (FIFO) at `name` by running the `mkfifo` command.
 *
 * @param {string} name - Path of the pipe to create.
 * @returns {Promise<void>} Resolves once mkfifo exits with code 0; rejects
 *   if mkfifo cannot be started or exits unsuccessfully (for example because
 *   the path already exists or its directory is missing).
 */
function createPipe(name) {
  return new Promise((resolve, reject) => {
    spawn('mkfifo', [name])
      // Spawn failures (e.g. mkfifo not available) arrive as 'error', and
      // no 'exit' event follows, so the promise must be settled here.
      .on('error', reject)
      .on('exit', (code, signal) => {
        if (code === 0) {
          resolve()
          return
        }
        const reason = signal ? `signal ${signal}` : `exit code ${code}`
        reject(new Error(`mkfifo ${name} failed with ${reason}`))
      })
  })
}

/**
 * Run a file-to-file CLI on streamed data by routing its input and output
 * through two temporary named pipes, 'input.pipe' and 'output.pipe', created
 * in the current working directory.
 *
 * @param {Object} options
 * @param {string} options.name - Command to run; it is invoked as
 *   `name input.pipe output.pipe`.
 * @param {stream.Readable} options.input - Data written to the input pipe.
 * @returns {Promise<string>} Resolves with everything read from the output
 *   pipe once the command has exited with code 0 and the output has been
 *   fully read. Rejects if the command cannot be started, exits
 *   unsuccessfully, or if either pipe stream fails. The pipes are removed
 *   before the promise settles.
 */
function pipeCommand({name, input}) {
  const inpipe = 'input.pipe'
  const outpipe = 'output.pipe'
  const pipes = [inpipe, outpipe]

  // fs.unlink requires a callback. Removal is best-effort (a pipe may never
  // have been created), so unlink failures are deliberately ignored; we only
  // wait for completion so a later call can recreate the pipes.
  const removePipes = () => Promise.all(pipes.map(pipe =>
    new Promise(done => fs.unlink(pipe, () => done()))))

  const run = () => new Promise((resolve, reject) => {
    const result = []
    let exited = false
    let drained = false
    // The 'exit' event can fire before the last output chunks are read, so
    // resolve only when both the process and the output stream are done.
    const finish = () => {
      if (exited && drained) resolve(result.join(''))
    }

    // Decode in the stream so multi-byte characters split across chunks
    // stay intact.
    fs.createReadStream(outpipe, {encoding: 'utf8'})
      .on('data', chunk => result.push(chunk))
      .on('end', () => {
        drained = true
        finish()
      })
      .on('error', reject)

    spawn(name, [inpipe, outpipe])
      .on('error', reject)
      .on('exit', (code, signal) => {
        if (code !== 0) {
          const reason = signal ? `signal ${signal}` : `exit code ${code}`
          reject(new Error(`${name} failed with ${reason}`))
          return
        }
        exited = true
        finish()
      })

    input.pipe(fs.createWriteStream(inpipe).on('error', reject))
  })

  return Promise.all(pipes.map(createPipe))
    .then(run)
    .then(
      text => removePipes().then(() => text),
      error => removePipes().then(() => { throw error }))
}