diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java index 79b3a3b54e..cf5cf84ec4 100644 --- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java +++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java @@ -58,11 +58,19 @@ * associated with a TikaInputStream should first use the * {@link #get(InputStream)} factory method to cast or wrap a given * {@link InputStream} into a TikaInputStream instance. + *
+ * TikaInputStream includes a few safety features to protect against parsers
+ * that may fail to check for an EOF or may incorrectly rely on the unreliable
+ * value returned from {@link FileInputStream#skip}. These parser failures
+ * can lead to infinite loops. We strongly encourage the use of
+ * TikaInputStream.
*
* @since Apache Tika 0.8
*/
public class TikaInputStream extends TaggedInputStream {
+ private static final int MAX_CONSECUTIVE_EOFS = 1000;
+
/**
* Checks whether the given stream is a TikaInputStream instance.
* The given stream can be null
, in which case the return
@@ -686,9 +694,21 @@ public long getPosition() {
return position;
}
+ /**
+ * Skips exactly {@code ln} bytes by delegating to {@link IOUtils#skip(InputStream, long)},
+ * to ensure that the alleged bytes skipped were actually skipped, then advances the tracked position.
+ *
+ * @param ln the number of bytes to skip
+ * @return the number of bytes skipped; equal to {@code ln} whenever this method returns normally
+ * @throws IOException if the number of bytes requested to be skipped does not match the number of bytes skipped
+ * or if there's an IOException during the read.
+ */
@Override
public long skip(long ln) throws IOException {
- long n = super.skip(ln);
+ long n = IOUtils.skip(super.in, ln);
+ if (n != ln) {
+ throw new IOException("tried to skip "+ln + " but actually skipped: "+n);
+ }
position += n;
return n;
}
@@ -732,7 +752,7 @@ protected void afterRead(int n) throws IOException {
position += n;
} else {
consecutiveEOFs++;
- if (consecutiveEOFs > 1000) {
+ if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) {
throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." +
"If you think your file is not corrupt, please open an issue on Tika's JIRA");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
index c3fb150cfd..85b20e837b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
@@ -337,7 +337,12 @@ void deserializeFileNodeListFragment(FileNodeList data, FileChunkReference next,
// + 4 bytes for the FileNode header
CheckedFileNodePushBack pushBack = new CheckedFileNodePushBack(data);
try {
+ long initialOffset = offset;
FileNode fileNode = deserializeFileNode(data.children.get(data.children.size() - 1), curPath);
+ if (initialOffset == offset) {
+ //nothing read; avoid an infinite loop
+ break;
+ }
if (fileNode.id == FndStructureConstants.ChunkTerminatorFND || fileNode.id == 0) {
terminated = true;
break;
@@ -678,7 +683,6 @@ private FileNode deserializeFileNode(FileNode data, FileNodePtr curPath) throws
end = backup.end;
if (reserved != 1) {
- System.exit(1);
throw new TikaException("RESERVED_NONZERO");
}