From 20804245d03088e35cf8dcc590abb7039eb21505 Mon Sep 17 00:00:00 2001
From: BobLd <38405645+BobLd@users.noreply.github.com>
Date: Sun, 24 Nov 2024 20:24:36 +0000
Subject: [PATCH] Handle alternate Unicode name representation cXXX and fix #943 (#944)

Glyph names of the form cXXX (a 'c' or 'C' followed by a 2 or 3 character
decimal code point) are mapped to the corresponding Unicode character.

The number is read with int.TryParse and must be strictly positive. Names
that merely start with 'c' but are not numeric (for example "cat"), or that
parse to zero or a negative value, fall through to the existing
'return null' branch instead of throwing from int.Parse or
char.ConvertFromUtf32.

---
 src/UglyToad.PdfPig.Fonts/GlyphList.cs        |  7 ++++++
 .../Integration/GithubIssuesTests.cs          | 23 +++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/UglyToad.PdfPig.Fonts/GlyphList.cs b/src/UglyToad.PdfPig.Fonts/GlyphList.cs
index f17881ada..23a6d0b31 100644
--- a/src/UglyToad.PdfPig.Fonts/GlyphList.cs
+++ b/src/UglyToad.PdfPig.Fonts/GlyphList.cs
@@ -152,6 +152,13 @@ public string NameToUnicode(string name)
 
                 unicode = char.ConvertFromUtf32(codePoint);
             }
+            else if (name.StartsWith("c", StringComparison.OrdinalIgnoreCase) && name.Length >= 3 && name.Length <= 4
+                     && int.TryParse(name.AsSpanOrSubstring(1), NumberStyles.Integer, CultureInfo.InvariantCulture, out var decimalCodePoint)
+                     && decimalCodePoint > 0)
+            {
+                // Name representation cXXX (decimal code point); non-numeric names such as "cat" fall through to 'return null'.
+                unicode = char.ConvertFromUtf32(decimalCodePoint);
+            }
             else
             {
                 return null;
diff --git a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
index ff58b206a..e0b8479b3 100644
--- a/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/GithubIssuesTests.cs
@@ -1,9 +1,32 @@
 namespace UglyToad.PdfPig.Tests.Integration
 {
     using Content;
+    using DocumentLayoutAnalysis.PageSegmenter;
+    using DocumentLayoutAnalysis.WordExtractor;
 
     public class GithubIssuesTests
     {
+        [Fact]
+        public void Issue943()
+        {
+            var path = IntegrationHelpers.GetDocumentPath("MOZILLA-10225-0.pdf");
+
+            using (var document = PdfDocument.Open(path))
+            {
+                var page = document.GetPage(1);
+                Assert.NotNull(page);
+
+                var letters = page.Letters;
+                Assert.NotNull(letters);
+
+                var words = NearestNeighbourWordExtractor.Instance.GetWords(page.Letters);
+                var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
+
+                Assert.Equal("Rocket and Spacecraft Propulsion", blocks[0].TextLines[0].Text);
+                Assert.Equal("Principles, Practice and New Developments (Second Edition)", blocks[0].TextLines[1].Text);
+            }
+        }
+
         [Fact]
         public void Issue736()
         {