From 584436f03351efa2e73129679ba7d3931e5e3181 Mon Sep 17 00:00:00 2001 From: Joe Hosteny Date: Thu, 16 May 2013 15:57:40 -0400 Subject: [PATCH 1/2] Add option to generate hOCR output. --- lib/docsplit/command_line.rb | 5 ++++- lib/docsplit/text_extractor.rb | 11 +++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb index 7c7af08..1ab580f 100755 --- a/lib/docsplit/command_line.rb +++ b/lib/docsplit/command_line.rb @@ -91,6 +91,9 @@ def parse_options opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o| @options[:ocr] = o end + opts.on('--hocr', 'force hOCR output when OCR enabled') do |h| + @options[:hocr] = h + end opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c| @options[:clean] = false end @@ -119,4 +122,4 @@ def parse_options end -end \ No newline at end of file +end diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 0d55f32..38bd412 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -60,13 +60,15 @@ def extract_from_ocr(pdf, pages) tempdir = Dir.mktmpdir base_path = File.join(@output, @pdf_name) escaped_pdf = ESCAPE[pdf] + additional_opts = "" + additional_opts += "hocr " if @use_hocr if pages pages.each do |page| tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif" escaped_tiff = ESCAPE[tiff] file = "#{base_path}_#{page}" run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1" + run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{additional_opts} 2>&1" clean_text(file + '.txt') if @clean_ocr FileUtils.remove_entry_secure tiff end @@ -74,7 +76,7 @@ def extract_from_ocr(pdf, pages) tiff = "#{tempdir}/#{@pdf_name}.tif" escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" - run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1" + run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{additional_opts} 2>&1" clean_text(base_path + '.txt') if @clean_ocr end ensure @@ -120,11 +122,12 @@ def extract_options(options) @output = options[:output] || '.' @pages = options[:pages] @force_ocr = options[:ocr] == true + @use_hocr = options[:hocr] == true @forbid_ocr = options[:ocr] == false - @clean_ocr = !(options[:clean] == false) + @clean_ocr = !(options[:clean] == false) && !@use_hocr @language = options[:language] || 'eng' end end -end \ No newline at end of file +end From 8106858e05da6e2e28a17d9d93ef7bfbf95ae1e6 Mon Sep 17 00:00:00 2001 From: Joe Hosteny Date: Fri, 17 May 2013 15:46:53 -0400 Subject: [PATCH 2/2] Keep the .tif files around if generating hOCR output. --- lib/docsplit/text_extractor.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb index 38bd412..422a270 100644 --- a/lib/docsplit/text_extractor.rb +++ b/lib/docsplit/text_extractor.rb @@ -70,6 +70,7 @@ def extract_from_ocr(pdf, pages) run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{additional_opts} 2>&1" clean_text(file + '.txt') if @clean_ocr + run "cp #{escaped_tiff} #{base_path}_#{page}.tif" if @use_hocr FileUtils.remove_entry_secure tiff end else @@ -77,6 +78,7 @@ def extract_from_ocr(pdf, pages) escaped_tiff = ESCAPE[tiff] run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1" run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{additional_opts} 2>&1" + run "cp #{escaped_tiff} #{base_path}.tif" if @use_hocr clean_text(base_path + '.txt') if @clean_ocr end ensure