forked from IU-Libraries-Joint-Development/essi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr.rb
68 lines (56 loc) · 2.09 KB
/
ocr.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
require 'shellwords'

module Processors
  # Derivative processor that produces an ALTO XML OCR file for a source file.
  #
  # Three strategies are tried, in order:
  # 1. copy a pre-derived "<basename>-alto.xml" file from the configured
  #    derivatives folder, if one exists;
  # 2. run the configured OCR pre-processor on the source and feed its output
  #    to tesseract;
  # 3. run tesseract directly on the source.
  #
  # All shell commands are built with Shellwords.escape so that paths
  # containing spaces or shell metacharacters are passed through intact.
  class OCR < Hydra::Derivatives::Processors::Processor
    include Hydra::Derivatives::Processors::ShellBasedProcessor

    # Writes the OCR derivative for +path+ to +output_file+.
    #
    # @param path [String] path of the source file to OCR
    # @param options [Hash] expects an :options entry holding the extra
    #   tesseract command-line flags (see #options_for)
    # @param output_file [String] destination path of the ALTO XML file
    def self.encode(path, options, output_file)
      existing_file = pre_ocr_file(File.basename(path))
      if existing_file
        Rails.logger.info "Copying Pre-derived OCR file #{existing_file} to #{output_file}."
        execute "cp #{Shellwords.escape(existing_file)} #{Shellwords.escape(output_file)}"
      elsif preprocess_ocr?
        Rails.logger.info "Pre-processing #{path} before OCR derivation."
        bitonal_file = ocr_clean_file(path)
        begin
          run_tesseract(bitonal_file, output_file, options)
        ensure
          # Clean up the intermediate file even when tesseract fails.
          remove_tmp_file(bitonal_file)
        end
      else
        Rails.logger.info "Deriving OCR directly from #{path}."
        run_tesseract(path, output_file, options)
      end
    end

    # Builds the options hash handed to .encode.
    #
    # @param _format [Object] unused
    # @return [Hash] hash with a single :options key holding the tesseract flags
    def options_for(_format)
      {
        options: string_options
      }
    end

    # Looks for a pre-derived ALTO file named "<basename>-alto.xml" in the
    # folder configured under ESSI.config[:essi][:derivatives_folder].
    #
    # @param filename [String] source file name (any extension is stripped)
    # @return [String, false] path of the pre-derived file, or false when no
    #   folder is configured or the file does not exist
    def self.pre_ocr_file(filename)
      Rails.logger.info 'Checking for a Pre-derived OCR folder.'
      derivatives_folder = ESSI.config.dig(:essi, :derivatives_folder)
      return false unless derivatives_folder

      Rails.logger.info 'Checking for a Pre-derived OCR file.'
      ocr_file = File.join(derivatives_folder, "#{File.basename(filename, '.*')}-alto.xml")
      return false unless File.exist?(ocr_file)

      ocr_file
    end

    # @return [Boolean] true when a pre-processor path is configured and the
    #   file it points to exists
    def self.preprocess_ocr?
      # File.exist? rather than the deprecated File.exists?, which newer
      # Ruby versions (3.2+) no longer provide.
      ocr_preprocessor.present? && File.exist?(ocr_preprocessor)
    end

    # @return [String, nil] memoized value of
    #   ESSI.config[:essi][:ocr_preprocessor_path]
    def self.ocr_preprocessor
      @ocr_preprocessor ||= ESSI.config.dig(:essi, :ocr_preprocessor_path)
    end

    # Runs the configured pre-processor as "<preprocessor> <input> <output>".
    #
    # @param path [String] path of the source file
    # @return [String] path of the pre-processed file, "clean_<basename>"
    #   under Hydra::Derivatives.temp_file_base
    def self.ocr_clean_file(path)
      clean_file = File.join(Hydra::Derivatives.temp_file_base, "clean_#{File.basename(path)}")
      execute "#{Shellwords.escape(ocr_preprocessor)} #{Shellwords.escape(path)} #{Shellwords.escape(clean_file)}"
      clean_file
    end

    # Deletes a temporary file.
    #
    # @param file [String] path of the file to remove
    def self.remove_tmp_file(file)
      # -f: a temp file that was never created must not turn cleanup into an
      # error that hides the failure which prevented its creation.
      execute "rm -f #{Shellwords.escape(file)}"
    end

    # Runs tesseract on +input_file+ with the "alto" config, limited to a
    # single OpenMP thread.
    #
    # @param input_file [String] image to recognise
    # @param output_file [String] desired ALTO XML path; only a trailing
    #   ".xml" is stripped to form the output base passed to tesseract
    # @param options [Hash] hash whose :options entry is inserted verbatim
    def self.run_tesseract(input_file, output_file, options)
      # Anchor on the end of the string so a ".xml" elsewhere in the path
      # (e.g. in a directory name) is left alone.
      output_base = output_file.to_s.sub(/\.xml\z/, '')
      # options[:options] is intentionally not escaped: it is a string of
      # several flags (e.g. "-l eng") that must stay separate arguments.
      execute "OMP_THREAD_LIMIT=1 tesseract #{Shellwords.escape(input_file)} " \
              "#{Shellwords.escape(output_base)} #{options[:options]} alto"
    end
    private_class_method :run_tesseract

    private

    # @return [String] tesseract flags selecting the OCR language
    def string_options
      "-l #{language}"
    end

    # @return [Symbol, Object] the :language directive, defaulting to :eng
    def language
      directives.fetch(:language, :eng)
    end
  end
end