Skip to content

Commit

Permalink
TIKA-3081 -- convert TikaInputStream's skip to the equivalent of skip…
Browse files Browse the repository at this point in the history
…Fully
  • Loading branch information
tballison committed Apr 8, 2020
1 parent 4cec35f commit 0f4d5de
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
24 changes: 22 additions & 2 deletions tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,19 @@
* associated with a TikaInputStream should first use the
* {@link #get(InputStream)} factory method to cast or wrap a given
* {@link InputStream} into a TikaInputStream instance.
* <p>
* TikaInputStream includes a few safety features to protect against parsers
* that may fail to check for an EOF or may incorrectly rely on the unreliable
* value returned from {@link FileInputStream#skip}. These parser failures
* can lead to infinite loops. We strongly encourage the use of
* TikaInputStream.
*
* @since Apache Tika 0.8
*/
public class TikaInputStream extends TaggedInputStream {

private static final int MAX_CONSECUTIVE_EOFS = 1000;

/**
* Checks whether the given stream is a TikaInputStream instance.
* The given stream can be <code>null</code>, in which case the return
Expand Down Expand Up @@ -686,9 +694,21 @@ public long getPosition() {
return position;
}

/**
 * Skips exactly <code>ln</code> bytes of the wrapped stream, or fails.
 * <p>
 * This relies on {@link IOUtils#skip(InputStream, long)} rather than on
 * the wrapped stream's own <code>skip</code> to ensure that the alleged
 * bytes skipped were actually skipped. If fewer bytes than requested
 * were skipped, an exception is thrown instead of returning a short
 * count, so a caller can never loop forever waiting for a skip that
 * makes no progress.
 * <p>
 * On success the tracked stream position is advanced by the number of
 * bytes skipped.
 *
 * @param ln the number of bytes to skip
 * @return the number of bytes skipped; always equal to <code>ln</code>
 *         when this method returns normally
 * @throws IOException if the number of bytes requested to be skipped does
 *                     not match the number of bytes skipped or if there's
 *                     an IOException during the read.
 */
@Override
public long skip(long ln) throws IOException {
    // Go straight to the wrapped stream: the value reported by a plain
    // skip() call is not trusted here.
    long n = IOUtils.skip(super.in, ln);
    if (n != ln) {
        // A short skip is reported as a failure; position is left untouched.
        throw new IOException("tried to skip "+ln + " but actually skipped: "+n);
    }
    position += n;
    return n;
}
Expand Down Expand Up @@ -732,7 +752,7 @@ protected void afterRead(int n) throws IOException {
position += n;
} else {
consecutiveEOFs++;
if (consecutiveEOFs > 1000) {
if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) {
throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." +
"If you think your file is not corrupt, please open an issue on Tika's JIRA");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,12 @@ void deserializeFileNodeListFragment(FileNodeList data, FileChunkReference next,
// + 4 bytes for the FileNode header
CheckedFileNodePushBack pushBack = new CheckedFileNodePushBack(data);
try {
long initialOffset = offset;
FileNode fileNode = deserializeFileNode(data.children.get(data.children.size() - 1), curPath);
if (initialOffset == offset) {
//nothing read; avoid an infinite loop
break;
}
if (fileNode.id == FndStructureConstants.ChunkTerminatorFND || fileNode.id == 0) {
terminated = true;
break;
Expand Down Expand Up @@ -678,7 +683,6 @@ private FileNode deserializeFileNode(FileNode data, FileNodePtr curPath) throws
end = backup.end;

if (reserved != 1) {
System.exit(1);
throw new TikaException("RESERVED_NONZERO");
}

Expand Down

0 comments on commit 0f4d5de

Please sign in to comment.