Skip to content

Commit

Permalink
TIKA-2645 -- make pool methods private for better encapsulation and add
Browse files Browse the repository at this point in the history
a pool for DOM building
  • Loading branch information
tballison committed May 23, 2018
1 parent c40045a commit cdca0f7
Show file tree
Hide file tree
Showing 24 changed files with 297 additions and 253 deletions.
22 changes: 19 additions & 3 deletions CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
Release 1.19 ???


* Use a pool for SAXParsers rather than creating a new one for every parse.
* Use a pool for SAXParsers and DOMBuilders rather than creating
a new parser/builder for every parse.
For better performance, set XMLReaderUtils.setPoolSize() to the
number of threads you're using with Tika (TIKA-2645).

* Add the RecursiveParserWrapperHandler to improve the RecursiveParserWrapper
API slightly (TIKA-2644).


Release 1.18 - 4/20/2018

* Upgrade Jackson to 2.9.5 (TIKA-2634).
Expand Down Expand Up @@ -68,11 +69,26 @@ Release 1.18 - 4/20/2018
* Fixed bug where TesseractOCRParser ignores configured ImageMagickPath,
and set rotation script to ignore Python warnings (TIKA-2509)

* Upgrade geo-apis to 3.0.1 (TIKA-2535).
* Upgrade geo-apis to 3.0.1 (TIKA-2535)

* Mime definition and magic improvements for text-based programming
and config formats (TIKA-2554, TIKA-2567, TIKA-1141)

* Added local Docker image build using dockerfile-maven-plugin to allow
images to be built from source (TIKA-1518).

* Support for SAS7BDAT data files (TIKA-2462)

* Handle .epub files using .htm rather than .html extensions for the
embedded contents (TIKA-1288)

* Mime magic for ACES Images (TIKA-2628) and DPX Images (TIKA-2629)

* For sparse XLSX and XLSB files, always output missing cells to
the left of filled ones (matching XLS), and optionally output
missing rows on all 3 formats if requested via the
OfficeParserContext (TIKA-2479)

Release 1.17 - 12/8/2017

***NOTE: THIS IS THE LAST VERSION OF TIKA THAT WILL RUN
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
Expand All @@ -37,6 +38,7 @@
* @since Apache Tika 0.4
*/
public class XmlRootExtractor {
private static final ParseContext EMPTY_CONTEXT = new ParseContext();

public QName extractRootElement(byte[] data) {
return extractRootElement(new ByteArrayInputStream(data));
Expand All @@ -47,15 +49,11 @@ public QName extractRootElement(byte[] data) {
*/
public QName extractRootElement(InputStream stream) {
ExtractorHandler handler = new ExtractorHandler();
SAXParser parser = null;
try {
parser = XMLReaderUtils.acquireSAXParser();
parser.parse(
XMLReaderUtils.parseSAX(
new CloseShieldInputStream(stream),
new OfflineContentHandler(handler));
new OfflineContentHandler(handler), EMPTY_CONTEXT);
} catch (Exception ignore) {
} finally {
XMLReaderUtils.releaseParser(parser);
}
return handler.rootElement;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ public List<Clause> getClauses() {
* @return a SAXParser
* @throws TikaException
*/
public static SAXParser acquireSAXParser()
private static SAXParser acquireSAXParser()
throws TikaException {
while (true) {
SAXParser parser = null;
Expand All @@ -343,7 +343,7 @@ public static SAXParser acquireSAXParser()
*
* @param parser parser to return
*/
public static void releaseParser(SAXParser parser) {
private static void releaseParser(SAXParser parser) {
try {
parser.reset();
} catch (UnsupportedOperationException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
Expand Down Expand Up @@ -126,12 +127,10 @@ public void parse(

TaggedContentHandler tagged = new TaggedContentHandler(
new OfflineContentHandler(handler));
SAXParser parser = null;
try {
parser = context.acquireSAXParser();
parser.parse(
XMLReaderUtils.parseSAX(
stream, new TeeContentHandler(
tagged, new MetaHandler(metadata)));
tagged, new MetaHandler(metadata)), context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException(
Expand All @@ -140,7 +139,6 @@ stream, new TeeContentHandler(
throw new TikaException(
"Unable to read network parser output", e);
} finally {
context.releaseParser(parser);
try {
thread.join(1000);
} catch (InterruptedException e) {
Expand Down
47 changes: 7 additions & 40 deletions tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,9 @@
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLResolver;
import javax.xml.stream.XMLStreamException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
Expand All @@ -40,6 +35,7 @@
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/**
* Parse context. Used to pass context information to Tika parsers.
Expand Down Expand Up @@ -121,7 +117,9 @@ public XMLReader getXMLReader() throws TikaException {
/**
* Returns the SAX parser specified in this parsing context. If a parser
* is not explicitly specified, then one is created using the specified
* or the default SAX parser factory.
* or the default SAX parser factory. Consider using
* {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)}
* for more efficient reuse of SAXParsers.
*
* @see #getSAXParserFactory()
* @since Apache Tika 0.8
Expand All @@ -137,39 +135,6 @@ public SAXParser getSAXParser() throws TikaException {
}
}

/**
* Returns the SAX parser specified in this parsing context. If a parser
* is not explicitly specified, then one is acquired from the pool.
* <p>
* Make sure to {@link #releaseParser(SAXParser)} as the
* first call in a <code>finally</code> block every time
* you call this!
* </p>
*
* @return SAXParser
* @throws TikaException
*/
public SAXParser acquireSAXParser() throws TikaException {
if (context.containsKey(SAXParser.class)) {
return get(SAXParser.class);
}
return XMLReaderUtils.acquireSAXParser();
}

/**
* If the context already has a SAXParser, this is a no-op.
* Otherwise, this returns the parser to the pool
*
* @param parser
* @throws TikaException
*/
public void releaseParser(SAXParser parser) {
if (context.containsKey(SAXParser.class)) {
return;
}
XMLReaderUtils.releaseParser(parser);
}

/**
* Returns the SAX parser factory specified in this parsing context.
* If a factory is not explicitly specified, then a default factory
Expand Down Expand Up @@ -228,6 +193,8 @@ private DocumentBuilderFactory getDocumentBuilderFactory() {
* instance is created and returned. The builder instance is
* configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
* and it sets the ErrorHandler to <code>null</code>.
* Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
* instead for more efficient reuse of document builders.
*
* @since Apache Tika 1.13
* @return DOM Builder
Expand Down
Loading

0 comments on commit cdca0f7

Please sign in to comment.