Skip to content

nondeterministic behavior when fed lots of files quickly #8

@rdpoor

Description

@rdpoor

When I call `pdf_text_extract()` with one file at a time, all works well. But when I hit it with a directory of PDF files (still one at a time, just fast), I get non-deterministic behavior where the resulting text is truncated.

The following is an example. Notice that calling reportOneFile(...) correctly processes and reports on individual files, but reportDirectory(...) reports that the processed text has a length of zero, 8191, or (once in a while) the correct length. Subsequent calls to reportDirectory() produce different results:

$ node
> var TestParser = require('./lib/test-parser')
undefined
> tp = new TestParser()
{}
> tp.reportOneFile('test/pdf-test/0908050036_91406141321_20140625.pdf')
undefined
> 8005 'test/pdf-test/0908050036_91406141321_20140625.pdf'
> tp.reportOneFile('test/pdf-test/0908050036_91405131615_20140522.pdf')
undefined
> 7999 'test/pdf-test/0908050036_91405131615_20140522.pdf'
> tp.reportDirectory('test/pdf-test')
undefined
> 0 'test/pdf-test/0908050036_91406141321_20140625.pdf'
0 'test/pdf-test/0908050036_91405131615_20140522.pdf'
0 'test/pdf-test/0908050036_91404126594_20140423.pdf'
0 'test/pdf-test/0908050036_91401135121_20140123.pdf'
0 'test/pdf-test/0908050036_91312128215_20131226.pdf'
0 'test/pdf-test/0908050036_91311122264_20131120.pdf'
0 'test/pdf-test/0908050036_9131064218_20131011.pdf'
8191 'test/pdf-test/0908050036_91503129909_20150323.pdf'
8191 'test/pdf-test/0908050036_91502129494_20150220.pdf'
8191 'test/pdf-test/0908050036_91501125167_20150122.pdf'
8191 'test/pdf-test/0908050036_91412121290_20141222.pdf'
8191 'test/pdf-test/0908050036_91411119510_20141120.pdf'
8191 'test/pdf-test/0908050036_91410143534_20141022.pdf'
8191 'test/pdf-test/0908050036_91409124480_20140923.pdf'
8191 'test/pdf-test/0908050036_91408133729_20140822.pdf'
8191 'test/pdf-test/0908050036_91407135090_20140724.pdf'
8191 'test/pdf-test/0908050036_91403128149_20140321.pdf'
8191 'test/pdf-test/0908050036_91402117758_20140220.pdf'
0 'test/pdf-test/0908050036_91310129406_20131022.pdf'
8876 'test/pdf-test/0908050036_91308125534_20130822.pdf'
8122 'test/pdf-test/0908050036_91307116578_20130724.pdf'
8128 'test/pdf-test/0908050036_91306120163_20130624.pdf'
8122 'test/pdf-test/0908050036_91305124314_20130523.pdf'
8122 'test/pdf-test/0908050036_91304127419_20130424.pdf'

undefined

Here is the code in its entirety:

// File: test-parser.js
"use strict";

var fs = require('fs');
var path = require('path');

// Constructor for the PDF test helper. It takes no arguments and sets no
// fields; all behavior lives on TestParser.prototype.
function TestParser() {}

// Pull the text out of one PDF file and deliver it as a single string.
//
// pdf_filename: path handed unchanged to pdf-text-extract.
// callback:     called as callback(err) when extraction reports an error,
//               otherwise as callback(null, text, pdf_filename), where text
//               is every extracted page joined with no separator.
TestParser.prototype.extractFile = function(pdf_filename, callback) {
    var extract = require('pdf-text-extract');
    var onExtracted = function extractionComplete(err, pages) {
        if (!err) {
            // Collapse the per-page array into one string for the caller.
            callback(null, pages.join(''), pdf_filename);
            return;
        }
        callback(err);
    };
    extract(pdf_filename, onExtracted);
};

// Call extractFile() on each file named *.pdf in the given directory.
//
// directory: path of the directory to list; it is read synchronously and
//            only its direct entries are considered.
// callback:  passed unchanged to extractFile() for every matching file, so it
//            is invoked once per PDF with extractFile()'s callback arguments.
//
// Uses Node's 'path' module, which this file must require at the top.
// Without that, path.extname only resolves inside the Node REPL (which
// exposes core modules as globals) and throws a ReferenceError elsewhere.
TestParser.prototype.extractDirectory = function(directory, callback) {
    var pdfs = fs.readdirSync(directory).filter(function(element) {
        // Match on the exact lower-case extension, as before.
        return path.extname(element) === '.pdf';
    });
    // Walk the listing from last to first. extractFile() is called for each
    // entry without waiting for the previous entry's callback.
    for (var i = pdfs.length - 1; i >= 0; --i) {
        var filename = directory + "/" + pdfs[i];
        this.extractFile(filename, callback);
    }
};

// Extract a single PDF and report the outcome through reportLength().
//
// pdf_filename: path of the PDF, forwarded to extractFile().
TestParser.prototype.reportOneFile = function(pdf_filename) {
    var parser = this;
    parser.extractFile(pdf_filename, parser.reportLength);
};

// Extract every PDF in a directory and report each outcome through
// reportLength().
//
// directory: path of the directory, forwarded to extractDirectory().
TestParser.prototype.reportDirectory = function(directory) {
    var parser = this;
    parser.extractDirectory(directory, parser.reportLength);
};

// Extraction callback that prints one line per file to stderr.
//
// err:      when truthy, it is printed on its own and nothing else happens.
// text:     extracted text; only its length is printed.
// filename: printed after the length so each line identifies its file.
TestParser.prototype.reportLength = function(err, text, filename) {
    if (!err) {
        console.error(text.length, filename);
        return;
    }
    console.error(err);
};

// Export the constructor itself, so callers do
// `var TestParser = require(...)` followed by `new TestParser()`.
module.exports = TestParser;

P.S.: Regrettably I cannot post the .pdfs themselves in a gist as they contain personal customer data. But I've verified this fails on other directories full of pdfs. If you want a gist, let me know.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions