diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 79b3a3b54e..cf5cf84ec4 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -58,11 +58,19 @@ * associated with a TikaInputStream should first use the * {@link #get(InputStream)} factory method to cast or wrap a given * {@link InputStream} into a TikaInputStream instance. + * <p>

+ * TikaInputStream includes a few safety features to protect against parsers + * that may fail to check for an EOF or may incorrectly rely on the unreliable + * value returned from {@link FileInputStream#skip}. These parser failures + * can lead to infinite loops. We strongly encourage the use of + * TikaInputStream. * * @since Apache Tika 0.8 */ public class TikaInputStream extends TaggedInputStream { + private static final int MAX_CONSECUTIVE_EOFS = 1000; + /** * Checks whether the given stream is a TikaInputStream instance. * The given stream can be null, in which case the return @@ -686,9 +694,21 @@ public long getPosition() { return position; } + /** + * This relies on {@link IOUtils#skip(InputStream, long)} to ensure + * that the alleged bytes skipped were actually skipped. + * + * @param ln the number of bytes to skip + * @return the number of bytes skipped + * @throws IOException if the number of bytes requested to be skipped does not match the number of bytes skipped + * or if there's an IOException during the read. + */ @Override public long skip(long ln) throws IOException { - long n = super.skip(ln); + long n = IOUtils.skip(super.in, ln); + if (n != ln) { + throw new IOException("tried to skip "+ln + " but actually skipped: "+n); + } position += n; return n; } @@ -732,7 +752,7 @@ protected void afterRead(int n) throws IOException { position += n; } else { consecutiveEOFs++; - if (consecutiveEOFs > 1000) { + if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) { throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." 
+ "If you think your file is not corrupt, please open an issue on Tika's JIRA"); } diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java index c3fb150cfd..85b20e837b 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java @@ -337,7 +337,12 @@ void deserializeFileNodeListFragment(FileNodeList data, FileChunkReference next, // + 4 bytes for the FileNode header CheckedFileNodePushBack pushBack = new CheckedFileNodePushBack(data); try { + long initialOffset = offset; FileNode fileNode = deserializeFileNode(data.children.get(data.children.size() - 1), curPath); + if (initialOffset == offset) { + //nothing read; avoid an infinite loop + break; + } if (fileNode.id == FndStructureConstants.ChunkTerminatorFND || fileNode.id == 0) { terminated = true; break; @@ -678,7 +683,6 @@ private FileNode deserializeFileNode(FileNode data, FileNodePtr curPath) throws end = backup.end; if (reserved != 1) { - System.exit(1); throw new TikaException("RESERVED_NONZERO"); }