# Review header for the patch below (the patch text itself is unchanged).
#
# Purpose: drops the `origami` gem and reads PDF outlines through the `qpdf`
# command-line tool instead.
#
# Per-file summary, as shown by the hunks:
#   * .circleci/.force_rebuild     - rebuild stamp 2024-02-08.2 -> 2025-02-13.1.
#   * Gemfile                      - removes `gem 'origami'` and its comment.
#   * Gemfile.lock                 - removes `origami (2.1.0)` and `colorize (0.8.1)`
#                                    (colorize is listed there as origami's dependency).
#   * lib/pdf_ebook.rb             - removes `require 'origami'`.
#   * lib/pdf_ebook/publication.rb - PDFEbook::Publication:
#       - `require "skylight"`, `include Skylight::Helpers` and every
#         `instrument_method` are removed; `require 'open3'` is added.
#       - `attr_reader` now also exposes `path` and `outlines`.
#       - `from_path_id` passes `path` straight to `new` (no `File.new`); it still
#         rescues StandardError, logs it, and returns a PublicationNullObject.
#       - `initialize(path, id)` runs `qpdf --json --json-key=outlines <path>` via
#         Open3.popen3, raises StandardError with the captured stderr when the exit
#         status is not successful, and stores `JSON.parse(stdout)` in `@outlines`.
#       - `intervals` memoizes `extract_titles_and_pages(@outlines["outlines"])`.
#       - `extract_titles_and_pages(outlines, depth = 1)` builds one
#         PDFEbook::Interval per entry from `title` and "page=<destpageposfrom1>",
#         recurses into `kids` with `depth + 1`, then sets `overall_index` on each
#         interval by position. `index` restarts at 0 in every recursive call, and
#         the outermost call's `overall_index` pass overwrites the nested ones.
#       - PublicationNullObject#initialize sets `@path = ''` (was `@pdf = ''`) and no
#         longer sets `@obj_to_page`.
#
# NOTE(review): `@path` is interpolated unquoted into a single command string, so
#   a path with spaces or shell metacharacters would presumably break the command
#   or be interpreted by the shell - consider the argv form, e.g.
#   Open3.capture3('qpdf', '--json', '--json-key=outlines', path). TODO confirm.
# NOTE(review): stdout is read to EOF before stderr is read; if qpdf writes a lot
#   to stderr this ordering may block - capture3 would avoid it. TODO confirm.
# NOTE(review): `outline["kids"].any?` raises NoMethodError when an entry has no
#   "kids" key - verify qpdf always emits it.
# NOTE(review): `JSON` is used, but no `require 'json'` is visible in this patch -
#   presumably loaded elsewhere; confirm.
# NOTE(review): the removed code skipped outline entries whose page could not be
#   resolved (`unless page.nil?`); the new code adds every entry, so a missing
#   `destpageposfrom1` would yield "page=" - confirm this is intended.
# NOTE(review): the added comment on `overall_index` says its purpose is not
#   remembered; worth documenting the consumer before merging.
#
diff --git a/.circleci/.force_rebuild b/.circleci/.force_rebuild index 281edca01..00a4c758e 100644 --- a/.circleci/.force_rebuild +++ b/.circleci/.force_rebuild @@ -1,2 +1,2 @@ # modify this file to force circleci to rebuild -2024-02-08.2 +2025-02-13.1 diff --git a/Gemfile b/Gemfile index 7994deca1..e243cd7aa 100644 --- a/Gemfile +++ b/Gemfile @@ -162,9 +162,6 @@ gem "nokogiri", ">= 1.13.6" gem "okcomputer", "~> 1.18.4" -# Read PDF ToC -gem 'origami' - # Force epub search results to be sentences gem 'pragmatic_segmenter', '~> 0.3' diff --git a/Gemfile.lock b/Gemfile.lock index 4a0d780a7..5fb73ddb5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -226,7 +226,6 @@ GEM execjs coffee-script-source (1.12.2) colorator (1.1.0) - colorize (0.8.1) commonjs (0.2.7) concurrent-ruby (1.2.3) config (5.1.0) @@ -741,8 +740,6 @@ GEM openurl (1.0.0) marc scrub_rb (~> 1.0) - origami (2.1.0) - colorize (~> 0.7) orm_adapter (0.5.0) os (1.1.4) ostruct (0.6.0) @@ -1302,7 +1299,6 @@ DEPENDENCIES oauth oauth2 (~> 1.2) okcomputer (~> 1.18.4) - origami pragmatic_segmenter (~> 0.3) prawn (~> 2.2) pry-rails diff --git a/lib/pdf_ebook.rb b/lib/pdf_ebook.rb index afc7e561d..23580c97c 100644 --- a/lib/pdf_ebook.rb +++ b/lib/pdf_ebook.rb @@ -36,11 +36,6 @@ def self.configure end end -# -# Require Dependencies -# -require 'origami' - # # Require Relative # diff --git a/lib/pdf_ebook/publication.rb b/lib/pdf_ebook/publication.rb index 88051a955..11fef1229 100644 --- a/lib/pdf_ebook/publication.rb +++ b/lib/pdf_ebook/publication.rb @@ -1,17 +1,15 @@ # frozen_string_literal: true -require "skylight" +require 'open3' module PDFEbook class Publication - include Skylight::Helpers private_class_method :new - attr_reader :id + attr_reader :id, :path, :outlines # Class Methods def self.from_path_id(path, id) - file = File.new(path) - new(file, id) + new(path, id) rescue StandardError => e ::PDFEbook.logger.info("Publication.from_path_id(#{path},#{id}) raised #{e} #{e.backtrace.join("\n")}") 
PublicationNullObject.send(:new) @@ -19,69 +17,42 @@ def self.from_path_id(path, id) # Public method def intervals - @intervals ||= extract_intervals + @intervals ||= extract_titles_and_pages(@outlines["outlines"]) end private - instrument_method - def initialize(file, id) - @pdf = Origami::PDF.read(file, verbosity: Origami::Parser::VERBOSE_QUIET, lazy: true) - @id = id - @obj_to_page = {} - end - - instrument_method - def extract_intervals - # Map of PDF page object number to a page number (pages start from 1) - if @obj_to_page.empty? - @pdf.pages.each_with_index do |p, i| - @obj_to_page[p.no] = i + 1 + def extract_titles_and_pages(outlines, depth = 1) + intervals = [] + index = 0 + outlines.each do |outline| + intervals << PDFEbook::Interval.from_title_level_cfi(id, index, outline['title'], depth, "page=#{outline['destpageposfrom1']}") + index += 1 + # Recursively process kids if they exist + if outline["kids"].any? + intervals.concat(extract_titles_and_pages(outline["kids"], depth + 1)) end end - @pdf.Catalog.Outlines.present? ? iterate_outlines(@pdf.Catalog.Outlines[:First]&.solve, 1) : [] + + # Add an "overall_index" to each Interval, I don't remember why we're doing this + intervals.each_with_index { |interval, i| interval.overall_index = i } end - # Takes Origami::OutlineItem and 1-based depth - instrument_method - def iterate_outlines(outline, depth) # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity - intervals = [] - index = 0 - until outline.nil? - page = nil - page = outline&.[](:A)&.solve&.[](:D) - # HELIO-3717 some "named destinations" have `:Dest` not `:A` here. 
The sample I'm looking at is PDF v1.3 - page ||= outline&.[](:Dest) + def initialize(path, id) + @id = id + @path = path + command = "qpdf --json --json-key=outlines #{@path}" + stdin, stdout, stderr, wait_thr = Open3.popen3(command) + stdin.close + stdout.binmode + out = stdout.read + stdout.close + err = stderr.read + stderr.close - if page.is_a?(Origami::Reference) # skips external links - begin - target = page.solve - rescue Origami::InvalidReferenceError - outline = outline[:Next]&.solve - next - end - page = target - elsif page.is_a?(Origami::LiteralString) - # At this point some ToC entries are "named destinations", essentially strings for some... - # different type of lookup directory than a page number type destination. See HELIO-3377. - page = @pdf.get_destination_by_name(page) - end + raise StandardError.new "ERROR command: \"#{command}\"\n#{err}" unless wait_thr.value.success? - page = page&.[](0)&.solve # gets to Origami::Page - page ||= outline[:Dest]&.solve&.[](0)&.solve - unless page.nil? - page_number = @obj_to_page[page.no] || 0 - # HELIO-4768: very rarely `title` is an `Origami::Reference` at this point, for whatever reason - title = outline[:Title].is_a?(Origami::Reference) ? outline[:Title]&.solve : outline[:Title] - intervals << PDFEbook::Interval.from_title_level_cfi(id, index, title.to_utf8, depth, "page=#{page_number}") - index += 1 - end - unless outline[:First]&.solve.nil? # Child outline - intervals += iterate_outlines(outline[:First].solve, depth + 1) - end - outline = outline[:Next]&.solve - end - intervals.each_with_index { |interval, i| interval.overall_index = i } + @outlines = JSON.parse(out) end end @@ -95,9 +66,8 @@ def intervals private def initialize - @pdf = '' + @path = '' @id = '' - @obj_to_page = {} end end end