From ac736936126b77fe9479b8ea78e799899545cf86 Mon Sep 17 00:00:00 2001 From: tballison Date: Fri, 25 May 2018 10:31:21 -0400 Subject: [PATCH] allow more flexibility for OCR variations in a PDFParser test --- .../org/apache/tika/parser/pdf/PDFParserTest.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index 537a7ff322..3e4f293b5a 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -1252,11 +1252,14 @@ public void testEmbeddedDocsWithOCROnly() throws Exception { assertContains("Needle", xmlResult.xml); if (! strategy.equals(PDFParserConfig.OCR_STRATEGY.NO_OCR)) { // Tesseract may see the t in haystack as a ! some times... - String div = "
pdf_hays"; - if (xmlResult.xml.contains(div+"!ack")) { - assertContains(div+"!ack", xmlResult.xml); + //or it might see dehayslack... + //TODO: figure out how to make this test less hacky + String div = "
"; + if (xmlResult.xml.contains(div+"pdf_hays!ack")) { + } else if (xmlResult.xml.contains(div+"pdf_haystack")) { + } else if (xmlResult.xml.contains(div+"dehayslack")) { } else { - assertContains(div+"tack", xmlResult.xml); + fail("couldn't find acceptable variants of haystack"); } } else { assertNotContained("
pdf_haystack", xmlResult.xml);