Description
The `child_process.spawn` stdout stream seems to be taking ~4x longer than other "similar" methods of I/O streaming with similarly sized data streams.
Running on Node.js v4.2.1, Mac OS X 10.11, Macbook Air 1.7GHz
As a baseline, the following program — where `file` is a 472MB gzip file, and `stdout` is set to `ignore` — takes 6.5 secs, with the gzip process using ~100% CPU:
// Baseline: spawn gzip with every stdio stream ignored, so no decompressed
// data ever crosses a pipe into Node. This isolates the child's own
// decompression time. Records the size of the first stdout chunk, if any
// (none here, since stdout is ignored and child.stdout is null).
var firstChunkSize = 0
var startTime = process.hrtime()
var child = spawn('gzip', ['-cd', file], {stdio: ['ignore', 'ignore', 'ignore']})
if (child.stdout != null) {
  child.stdout.on('data', function(chunk) {
    if (!firstChunkSize) firstChunkSize = chunk.length
  })
}
child.on('close', function(exitCode) {
  var elapsed = process.hrtime(startTime)
  console.log('stream took %d seconds', elapsed[0] + elapsed[1] / 1e9)
  if (firstChunkSize) console.log('buffer length was %d', firstChunkSize)
})
$ node spawn-bench.js
stream took 6.497701762 seconds
If I set the `stdout` option from `ignore` to `pipe`, it suddenly takes 27 secs, with gzip only using ~35% CPU and node using ~75%:
// Same benchmark, but with the child's stdout piped back into Node.
// Every decompressed chunk now flows through the spawn stdout stream;
// records the size of the first chunk to reveal the pipe's buffer size.
var firstChunkSize = 0
var startTime = process.hrtime()
var child = spawn('gzip', ['-cd', file], {stdio: ['ignore', 'pipe', 'ignore']})
if (child.stdout != null) {
  child.stdout.on('data', function(chunk) {
    if (!firstChunkSize) firstChunkSize = chunk.length
  })
}
child.on('close', function(exitCode) {
  var elapsed = process.hrtime(startTime)
  console.log('stream took %d seconds', elapsed[0] + elapsed[1] / 1e9)
  if (firstChunkSize) console.log('buffer length was %d', firstChunkSize)
})
$ node spawn-bench.js
stream took 27.406851714 seconds
buffer length was 8192
Fine, so that 4x overhead could just be standard pipe/stream overhead, but if I pipe in from stdin, there's really not much overhead at all and it finishes in 7.2 secs, with gzip using ~95% CPU and node ~30%:
// Alternative: decompress with an external gzip and pipe its output into
// this process's stdin (see the shell invocation below). Times how long
// draining stdin takes and records the first chunk's size.
var firstChunkSize = 0
var startTime = process.hrtime()
process.stdin.on('data', function(chunk) {
  if (!firstChunkSize) firstChunkSize = chunk.length
})
process.stdin.on('end', function() {
  var elapsed = process.hrtime(startTime)
  console.log('stream took %d seconds', elapsed[0] + elapsed[1] / 1e9)
  if (firstChunkSize) console.log('buffer length was %d', firstChunkSize)
})
$ gzip -cd file.gz | node spawn-bench.js
stream took 7.2209929479999995 seconds
buffer length was 65536
Similarly, if I process the entire gzip file in node using zlib, I get relatively ok performance too, taking 9.8 secs:
var dataLength = 0
var time = process.hrtime()
fs.createReadStream(file, {highWaterMark: 32 * 1024 * 1024})
.pipe(zlib.createGunzip({chunkSize: 32 * 1024 * 1024}))
.on('data', function(data) {
if (!dataLength) dataLength = data.length
})
.on('end', function() {
var diff = process.hrtime(time)
console.log('stream took %d seconds', diff[0] + diff[1] / 1e9)
if (dataLength) console.log('buffer length was %d', dataLength)
})
$ node spawn-bench.js
stream took 9.836914587 seconds
buffer length was 33554432
So one thing I suspected was the stream buffer size – you can see when spawning it's only 8k, whereas it's 64k when piping from stdin, and I had better performance upping it even further (to 32MB) when reading from the file and gzipping in node.
However, I couldn't see any way to change this with spawn, so I can't really determine if this is the cause.
Is there something else I'm missing? A 4x slowdown seems to be far greater than expected.