From 0e6c29e7ac54e4c928c56645cf180151fb687611 Mon Sep 17 00:00:00 2001
From: Kevin Risden
Date: Wed, 4 Oct 2023 09:13:56 -0400
Subject: [PATCH] Reduce FST block size for BlockTreeTermsWriter (#12604)

---
Review notes (free-form area, ignored by git-am):
  * PendingBlock.compileIndex no longer passes the constant 15 as the last
    Builder argument. It estimates the index size as prefix.length plus
    numBytes() of every non-null sub-index FST of the pending blocks, takes
    PackedInts.bitsRequired() of that estimate, and clamps the result to the
    range [6, 15] before handing it to the Builder.
  * FST gains a public numBytes() accessor that returns bytes.getPosition();
    it is what compileIndex sums over the sub-indices.

 lucene/CHANGES.txt                                |  5 +++++
 .../codecs/blocktree/BlockTreeTermsWriter.java    | 15 ++++++++++++++-
 .../src/java/org/apache/lucene/util/fst/FST.java  |  4 ++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4983d8250726..e7e54ca2a5e3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -26,6 +26,11 @@ Bug Fixes
 * GITHUB#12352: [Tessellator] Improve the checks that validate the diagonal between two polygon nodes so
   the resulting polygons are valid counter clockwise polygons. (Ignacio Vera)
 
+Optimizations
+---------------------
+* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter
+  to reduce GC load during indexing. (Guo Feng)
+
 ======================= Lucene 8.11.2 =======================
 
 Bug Fixes
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
index fd7c1fb7230c..adb6649e9bf1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/blocktree/BlockTreeTermsWriter.java
@@ -55,6 +55,7 @@
 import org.apache.lucene.util.fst.BytesRefFSTEnum;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.util.packed.PackedInts;
 
 /*
   TODO:
@@ -427,10 +428,22 @@ public void compileIndex(List<PendingBlock> blocks, RAMOutputStream scratchBytes
         }
       }
 
+      long estimateSize = prefix.length;
+      for (PendingBlock block : blocks) {
+        if (block.subIndices != null) {
+          for (FST<BytesRef> subIndex : block.subIndices) {
+            estimateSize += subIndex.numBytes();
+          }
+        }
+      }
+      int estimateBitsRequired = PackedInts.bitsRequired(estimateSize);
+      int pageBits = Math.min(15, Math.max(6, estimateBitsRequired));
+
+
       final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
       final Builder<BytesRef> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
                                                            0, 0, true, false, Integer.MAX_VALUE,
-                                                           outputs, true, 15);
+                                                           outputs, true, pageBits);
       //if (DEBUG) {
       //  System.out.println("  compile index for prefix=" + prefix);
       //}
diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
index d9dbacf3ca23..3fa1897f7af8 100644
--- a/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
+++ b/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
@@ -489,6 +489,10 @@ void finish(long newStartNode) throws IOException {
     startNode = newStartNode;
     bytes.finish();
   }
+
+  public long numBytes() {
+    return bytes.getPosition();
+  }
 
   public T getEmptyOutput() {
     return emptyOutput;