Merge branch 'better_hints' into 'master'
Misc fixes to CBO and index hints

See merge request manticoresearch/dev!358
glookka committed Mar 17, 2023
2 parents f9f2fde + 61a5f39 commit e77dd72
Showing 10 changed files with 91 additions and 27 deletions.
5 changes: 5 additions & 0 deletions manual/Changelog.md
@@ -6,13 +6,18 @@
* [Commit be6b](https://github.com/manticoresoftware/manticoresearch/commit/be6b2ea20b0cb720db645e63f208ca3d7be6c276) Fixed full paths to external files not being displayed correctly in SHOW CREATE TABLE
* [Issue #1052](https://github.com/manticoresoftware/manticoresearch/issues/1052) rt_attr_json column won't work with columnar storage
* [gl #3287] Fixed crash when the disk is possibly out of space
* [gl #548] Fixed DocidIndex cost calculation
* [gl #3361] Fixed index hints to support multiple attributes
* [gl #3363] Updated export ranker output to match `packedfactors()`
* [Commit 2196](https://github.com/manticoresoftware/manticoresearch/commit/21966fbf) Fixed wildcards in queries so they are not affected by ignore_chars
* [Commit 1990](https://github.com/manticoresoftware/manticoresearch/commit/1990e350) Fixed daemon crash on federated queries with aggregates

### Major new features
* Query optimizer now works for fulltext queries

### Minor changes
* Fixed DocidIndex cost calculation
* Added warnings on invalid index hints
* Queries using `count(*)` with a single filter now utilize precalculated data from secondary indexes (if available), resulting in significantly faster query times.
* Added [UINT64()](../Functions/Type_casting_functions.md#UINT64%28%29) type conversion function.
* String fields/attributes that are both `indexed` and `attribute` are now treated as a single field on `INSERT`, `DESC` and `ALTER`.
15 changes: 12 additions & 3 deletions src/costestimate.cpp
@@ -38,7 +38,7 @@ class CostEstimate_c : public CostEstimate_i
static constexpr float COST_INDEX_READ_DENSE_BITMAP = 1.5f;
static constexpr float COST_INDEX_READ_SPARSE = 30.0f;
static constexpr float COST_INDEX_UNION_COEFF = 4.0f;
static constexpr float COST_LOOKUP_READ = 7.0f;
static constexpr float COST_LOOKUP_READ = 20.0f;
static constexpr float COST_INDEX_ITERATOR_INIT = 150.0f;

const CSphVector<SecondaryIndexInfo_t> & m_dSIInfo;
@@ -60,6 +60,7 @@ class CostEstimate_c : public CostEstimate_i
float CalcFilterCost ( bool bFromIterator, float fDocsAfterIndexes ) const;
float CalcAnalyzerCost() const;
float CalcLookupCost() const;
float CalcPushCost ( float fDocsAfterFilters ) const;
float CalcMTCost ( float fCost ) const;

float CalcGetFilterComplexity ( const SecondaryIndexInfo_t & tSIInfo, const CSphFilterSettings & tFilter ) const;
@@ -264,11 +265,19 @@ float CostEstimate_c::CalcLookupCost() const
if ( i.m_eType==SecondaryIndexType_e::LOOKUP )
iDocsToReadLookup += i.m_iRsetEstimate;

iDocsToReadLookup = ApplyCutoff(iDocsToReadLookup);
// no cutoff here since lookup reader fetches all docs and sorts them
return Cost_LookupRead ( iDocsToReadLookup );
}


float CostEstimate_c::CalcPushCost ( float fDocsAfterFilters ) const
{
int64_t iDocsToPush = fDocsAfterFilters*m_tCtx.m_iTotalDocs;
iDocsToPush = ApplyCutoff(iDocsToPush);
return Cost_Push ( iDocsToPush );
}


float CostEstimate_c::CalcMTCost ( float fCost ) const
{
if ( m_tCtx.m_iThreads==1 )
@@ -365,7 +374,7 @@ float CostEstimate_c::CalcQueryCost()
fCost += CalcIndexCost();

if ( m_tCtx.m_bCalcPushCost )
fCost += Cost_Push ( uint64_t(fDocsAfterFilters*m_tCtx.m_iTotalDocs) );
fCost += CalcPushCost(fDocsAfterFilters);

if ( !iNumIndexes && !iNumLookups ) // SI and docid lookups always run in a single thread
fCost = CalcMTCost(fCost);
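For context, a minimal sketch of the cutoff-aware push cost that the new `CalcPushCost()` introduces. The `COST_PUSH` coefficient, the `Ctx_t` struct and the exact `ApplyCutoff()` semantics are not part of this hunk and are assumptions; only the shape of `CalcPushCost()` mirrors the diff above.

```cpp
// Illustrative sketch only: coefficient value, Ctx_t fields and ApplyCutoff() semantics are assumed.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr float COST_PUSH = 1.0f; // assumed per-document cost of pushing a match to the sorter

struct Ctx_t
{
	int64_t m_iTotalDocs = 0;
	int64_t m_iCutoff = -1; // -1 means "no cutoff requested"
};

// Clamp the number of costed documents by the query cutoff, if one is set.
static int64_t ApplyCutoff ( int64_t iDocs, int64_t iCutoff )
{
	return iCutoff>=0 ? std::min ( iDocs, iCutoff ) : iDocs;
}

// Mirrors the new CalcPushCost(): turn the post-filter fraction into absolute documents,
// clamp by the cutoff (which the old inline Cost_Push call never did), then apply the coefficient.
static float CalcPushCost ( float fDocsAfterFilters, const Ctx_t & tCtx )
{
	int64_t iDocsToPush = int64_t ( fDocsAfterFilters*tCtx.m_iTotalDocs );
	iDocsToPush = ApplyCutoff ( iDocsToPush, tCtx.m_iCutoff );
	return float(iDocsToPush)*COST_PUSH;
}

int main()
{
	Ctx_t tCtx { 10000000, 1000 };
	printf ( "push cost: %.1f\n", CalcPushCost ( 0.2f, tCtx ) ); // 2M docs pass filters, but only 1000 are costed
	return 0;
}
```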
9 changes: 2 additions & 7 deletions src/docidlookup.cpp
@@ -462,12 +462,7 @@ bool RowidIterator_LookupRange_T<ROWID_LIMITS,BITMAP>::Fill()
{
DocID_t tLookupDocID = 0;
RowID_t tLookupRowID = INVALID_ROWID;

RowID_t * pRowIdStart = BASE::m_dCollected.Begin();
RowID_t * pRowIdMax = pRowIdStart + BASE::m_dCollected.GetLength()-1;
RowID_t * pRowID = pRowIdStart;

while ( pRowID<pRowIdMax && m_pReader->Read ( tLookupDocID, tLookupRowID ) )
while ( m_pReader->Read ( tLookupDocID, tLookupRowID ) )
{
m_iProcessed++;

@@ -578,7 +573,7 @@ bool RowidIterator_LookupRangeExclude_T<ROWID_LIMITS,BITMAP>::Fill()

static bool NeedBitmapStorage ( int64_t iRsetSize, DWORD uTotalDocs )
{
return float(iRsetSize)/uTotalDocs > 0.05f;
return float(iRsetSize)/uTotalDocs > 0.001f;
}


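To put the new `NeedBitmapStorage()` threshold into perspective, a small worked example (document counts are made up). Lowering the threshold appears to complement the removal of the collected-rowid limit in `RowidIterator_LookupRange_T<>::Fill()` above, since unbounded reads make large result sets more likely.

```cpp
// Worked example (made-up document counts): when does the lookup iterator
// switch from a plain collected-rowid vector to bitmap storage?
//
//   uTotalDocs = 10'000'000
//   old threshold (> 0.05):  bitmap once the estimated result set exceeds 500'000 docs
//   new threshold (> 0.001): bitmap once it exceeds 10'000 docs
//
// Signature simplified: the original takes a DWORD for uTotalDocs.
#include <cstdint>
#include <cstdio>

static bool NeedBitmapStorage ( int64_t iRsetSize, uint32_t uTotalDocs )
{
	return float(iRsetSize)/uTotalDocs > 0.001f;
}

int main()
{
	printf ( "50k of 10M docs -> bitmap: %d\n", (int)NeedBitmapStorage ( 50000, 10000000 ) ); // 1 now, 0 under the old 5% rule
	return 0;
}
```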
2 changes: 1 addition & 1 deletion src/docidlookup.h
@@ -204,7 +204,7 @@ class LookupReaderIterator_c : private LookupReader_c
return;

// tDocID is inside current block (check with next min docid)? do nothing
if ( tDocID < ( m_pCurCheckpoint+1 )->m_tBaseDocID )
if ( (uint64_t)tDocID < uint64_t ( ( m_pCurCheckpoint+1 )->m_tBaseDocID ) )
return;

// perform binary search starting with next checkpoint
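The single-line change above fixes the checkpoint test for docids with the high bit set. A self-contained illustration, assuming `DocID_t` is a signed 64-bit integer:

```cpp
// Why the cast matters (assuming DocID_t is a signed 64-bit integer): docids above 2^63-1
// wrap negative as signed values, so a signed '<' orders them before small base docids and
// the reader would wrongly conclude the docid is still inside the current block.
#include <cstdint>
#include <cstdio>

using DocID_t = int64_t;

int main()
{
	DocID_t tDocID = (DocID_t)0x8000000000000001ULL; // "huge" docid, negative when viewed as signed
	DocID_t tNextBaseDocID = 100;

	bool bSigned   = tDocID < tNextBaseDocID;                     // true  -- wrong, skips the checkpoint search
	bool bUnsigned = (uint64_t)tDocID < uint64_t(tNextBaseDocID); // false -- correct unsigned ordering
	printf ( "signed: %d, unsigned: %d\n", (int)bSigned, (int)bUnsigned );
	return 0;
}
```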
42 changes: 41 additions & 1 deletion src/secondaryindex.cpp
@@ -799,9 +799,48 @@ static void FetchNumSIIterators ( CSphVector<SecondaryIndexInfo_t> & dSIInfo, co
}
}


static bool CheckHints ( const CSphVector<SecondaryIndexInfo_t> & dSIInfo, const SelectIteratorCtx_t & tCtx, CSphString & sWarning )
{
for ( auto & i : tCtx.m_dHints )
switch ( i.m_eType )
{
case SecondaryIndexType_e::LOOKUP:
if ( i.m_sIndex!=sphGetDocidName() )
{
sWarning = "hint error: DocidIndex can only be applied to 'id' attribute";
return false;
}
break;

case SecondaryIndexType_e::ANALYZER:
case SecondaryIndexType_e::INDEX:
if ( !tCtx.m_tSchema.GetAttr ( i.m_sIndex.cstr() ) )
{
sWarning.SetSprintf ( "hint error: '%s' attribute not found", i.m_sIndex.cstr() );
return false;
}
break;

default:
break;
}

ARRAY_FOREACH ( i, tCtx.m_dFilters )
for ( auto & tHint : tCtx.m_dHints )
if ( tHint.m_sIndex==tCtx.m_dFilters[i].m_sAttrName && tHint.m_bForce )
if ( !dSIInfo[i].m_dCapabilities.any_of ( [&tHint]( auto eSupported ){ return tHint.m_eType==eSupported; } ) )
{
sWarning.SetSprintf ( "hint error: requested hint type not supported for '%s' attribute", tHint.m_sIndex.cstr() );
return false;
}

return true;
}

/////////////////////////////////////////////////////////////////////

CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost )
CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost, CSphString & sWarning )
{
fBestCost = FLT_MAX;

@@ -820,6 +859,7 @@ CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & t
DisableRowidFilters ( dSIInfo, tCtx );
FetchPartialColumnarMinMax ( dSIInfo, tCtx );
FetchNumSIIterators ( dSIInfo, tCtx );
CheckHints ( dSIInfo, tCtx, sWarning );

CSphVector<int> dCapabilities ( dSIInfo.GetLength() );
CSphVector<int> dBest ( dSIInfo.GetLength() );
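Note that the warning in the second loop of `CheckHints()` is produced only for forced hints whose type the corresponding filter cannot serve. A simplified, standalone sketch of that check; the surrounding types are assumptions, since the real `m_dCapabilities` contents are filled in elsewhere by the CBO.

```cpp
// Standalone sketch of the forced-hint capability check (types heavily simplified; assumptions).
#include <algorithm>
#include <string>
#include <vector>

enum class SecondaryIndexType_e { NONE, FILTER, LOOKUP, ANALYZER, INDEX };

struct IndexHint_t          { std::string m_sIndex; SecondaryIndexType_e m_eType; bool m_bForce; };
struct SecondaryIndexInfo_t { std::vector<SecondaryIndexType_e> m_dCapabilities; };

// A forced hint is rejected (with a warning) when the filter on that attribute cannot be
// served by the requested iterator type -- the same rule the new CheckHints() applies.
static bool HintSupported ( const IndexHint_t & tHint, const SecondaryIndexInfo_t & tSIInfo, std::string & sWarning )
{
	if ( !tHint.m_bForce )
		return true;

	bool bSupported = std::any_of ( tSIInfo.m_dCapabilities.begin(), tSIInfo.m_dCapabilities.end(),
		[&tHint]( SecondaryIndexType_e eSupported ){ return tHint.m_eType==eSupported; } );

	if ( !bSupported )
		sWarning = "hint error: requested hint type not supported for '" + tHint.m_sIndex + "' attribute";

	return bSupported;
}

int main()
{
	SecondaryIndexInfo_t tInfo { { SecondaryIndexType_e::FILTER, SecondaryIndexType_e::ANALYZER } };
	IndexHint_t tHint { "price", SecondaryIndexType_e::INDEX, true }; // force SecondaryIndex() where it can't apply
	std::string sWarning;
	return HintSupported ( tHint, tInfo, sWarning ) ? 0 : 1; // returns 1, sWarning filled
}
```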
2 changes: 1 addition & 1 deletion src/secondaryindex.h
@@ -46,7 +46,7 @@ const CSphFilterSettings * GetRowIdFilter ( const CSphVector<CSphFilterSettings>

bool ReturnIteratorResult ( RowID_t * pRowID, RowID_t * pRowIdStart, RowIdBlock_t & dRowIdBlock );

CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost );
CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost, CSphString & sWarning );

namespace SI
{
18 changes: 11 additions & 7 deletions src/sphinx.cpp
@@ -1387,7 +1387,7 @@ class CSphIndex_VLN : public CSphIndex, public IndexAlterHelper_c, public DebugC

bool SplitQuery ( CSphQueryResult & tResult, const CSphQuery & tQuery, const VecTraits_T<ISphMatchSorter *> & dAllSorters, const CSphMultiQueryArgs & tArgs, int64_t tmMaxTimer ) const;
RowidIterator_i * SpawnIterators ( const CSphQuery & tQuery, CSphQueryContext & tCtx, CreateFilterContext_t & tFlx, const ISphSchema & tMaxSorterSchema, CSphQueryResultMeta & tMeta, int iCutoff, int iThreads, CSphVector<CSphFilterSettings> & dModifiedFilters, ISphRanker * pRanker ) const;
bool SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads ) const;
bool SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads, CSphString & sWarning ) const;

bool IsQueryFast ( const CSphQuery & tQuery ) const;
bool CheckEnabledIndexes ( const CSphQuery & tQuery, int iThreads, bool & bFastQuery ) const;
@@ -3033,8 +3033,9 @@ bool CSphIndex_VLN::CheckEnabledIndexes ( const CSphQuery & tQuery, int iThreads
float fCost = FLT_MAX;
int iCutoff = ApplyImplicitCutoff ( tQuery, {} );

CSphString sWarning;
SelectIteratorCtx_t tCtx ( tQuery.m_dFilters, tQuery.m_dFilterTree, tQuery.m_dIndexHints, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), tQuery.m_eCollation, iCutoff, m_iDocinfo, iThreads );
CSphVector<SecondaryIndexInfo_t> dEnabledIndexes = SelectIterators ( tCtx, fCost );
CSphVector<SecondaryIndexInfo_t> dEnabledIndexes = SelectIterators ( tCtx, fCost, sWarning );

// disable pseudo sharding if any of the queries use secondary indexes/docid lookups
if ( dEnabledIndexes.any_of ( []( const SecondaryIndexInfo_t & tSI ){ return tSI.m_eType==SecondaryIndexType_e::INDEX || tSI.m_eType==SecondaryIndexType_e::LOOKUP; } ) )
@@ -7892,6 +7893,7 @@ RowidIterator_i * CSphIndex_VLN::CreateColumnarAnalyzerOrPrefilter ( CSphVector<
ToColumnarFilters ( dFilters, dColumnarFilters, dFilterMap, tSchema, eCollation, sWarning );

// remove disabled analyzers
int iRemoved = 0;
for ( size_t i = 0; i < dFilterMap.size(); )
{
bool bAnalyzer = dSIInfo[i].m_eType==SecondaryIndexType_e::ANALYZER;
@@ -7900,8 +7902,10 @@ RowidIterator_i * CSphIndex_VLN::CreateColumnarAnalyzerOrPrefilter ( CSphVector<
{
int iColumnarFilter = dFilterMap[i];
dFilterMap.erase ( dFilterMap.begin()+i );
iRemoved++;

if ( iColumnarFilter!=-1 )
dColumnarFilters.erase ( dColumnarFilters.begin()+iColumnarFilter );
dColumnarFilters.erase ( dColumnarFilters.begin() + ( iColumnarFilter-iRemoved+1 ) );
}
else
i++;
@@ -7943,7 +7947,7 @@ static void RecreateFilters ( const CSphVector<SecondaryIndexInfo_t> & dSIInfo,
}


bool CSphIndex_VLN::SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads ) const
bool CSphIndex_VLN::SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads, CSphString & sWarning ) const
{
bool bForce = false;
for ( const auto & tHint : tQuery.m_dIndexHints )
@@ -7966,7 +7970,7 @@ bool CSphIndex_VLN::SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * p
SelectIteratorCtx_t tSelectIteratorCtx ( tQuery.m_dFilters, tQuery.m_dFilterTree, tQuery.m_dIndexHints, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), tQuery.m_eCollation, iCutoff, m_iDocinfo, iThreads );
tSelectIteratorCtx.IgnorePushCost();
float fBestCost = FLT_MAX;
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost );
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost, sWarning );

// check that we have anything non-plain-filter. if not, bail out
if ( !dSIInfo.any_of ( []( const auto & tInfo ){ return tInfo.m_eType==SecondaryIndexType_e::LOOKUP || tInfo.m_eType==SecondaryIndexType_e::INDEX || tInfo.m_eType==SecondaryIndexType_e::ANALYZER; } ) )
@@ -8024,11 +8028,11 @@ RowidIterator_i * CSphIndex_VLN::SpawnIterators ( const CSphQuery & tQuery, CSph
// For now we use approach b) as it is simpler
float fBestCost = FLT_MAX;
SelectIteratorCtx_t tSelectIteratorCtx ( tQuery.m_dFilters, tQuery.m_dFilterTree, tQuery.m_dIndexHints, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), tQuery.m_eCollation, iCutoff, m_iDocinfo, iThreads );
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost );
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost, tMeta.m_sWarning );
}
else
{
if ( !SelectIteratorsFT ( tQuery, pRanker, dSIInfo, iCutoff, iThreads ) )
if ( !SelectIteratorsFT ( tQuery, pRanker, dSIInfo, iCutoff, iThreads, tMeta.m_sWarning ) )
return nullptr;
}

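The `iRemoved` bookkeeping in `CreateColumnarAnalyzerOrPrefilter()` appears to account for the fact that `dFilterMap` stores indexes computed against `dColumnarFilters` before any erasures. A generic illustration of that pattern, with made-up data rather than the project's types:

```cpp
// Generic illustration (made-up data): indexes recorded against a vector's original layout
// must be shifted down by the number of elements erased in front of them -- the pattern the
// new iRemoved counter appears to implement when erasing from dColumnarFilters.
#include <cassert>
#include <vector>

int main()
{
	std::vector<int> dValues  = { 10, 11, 12, 13, 14 };
	std::vector<int> dToErase = { 1, 3 }; // positions in the ORIGINAL layout, ascending

	int iRemoved = 0;
	for ( int iOriginalIdx : dToErase )
	{
		dValues.erase ( dValues.begin() + ( iOriginalIdx - iRemoved ) ); // adjust for earlier erasures
		iRemoved++;
	}

	assert ( dValues == ( std::vector<int> { 10, 12, 14 } ) );
	return 0;
}
```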
17 changes: 11 additions & 6 deletions src/sphinxql.y
@@ -1177,28 +1177,33 @@ hint_list:
| hint_list hint_item
;

hint_attr_list:
ident
| hint_attr_list ',' ident {TRACK_BOUNDS ( $$, $1, $3 );}
;

hint_item:
TOK_HINT_SECONDARY '(' ident ')'
TOK_HINT_SECONDARY '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::INDEX, true, $3 );
}
| TOK_HINT_NO_SECONDARY '(' ident ')'
| TOK_HINT_NO_SECONDARY '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::INDEX, false, $3 );
}
| TOK_HINT_DOCID '(' ident ')'
| TOK_HINT_DOCID '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::LOOKUP, true, $3 );
}
| TOK_HINT_NO_DOCID '(' ident ')'
| TOK_HINT_NO_DOCID '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::LOOKUP, false, $3 );
}
| TOK_HINT_COLUMNAR '(' ident ')'
| TOK_HINT_COLUMNAR '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::ANALYZER, true, $3 );
}
| TOK_HINT_NO_COLUMNAR '(' ident ')'
| TOK_HINT_NO_COLUMNAR '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::ANALYZER, false, $3 );
}
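The grammar now accepts a comma-separated attribute list per hint (exercised by the `NO_ColumnarScan(price,brand_id)` query added to test_430 below). A hypothetical sketch of how such a list could be expanded into per-attribute hints; the real parser passes token spans (`TRACK_BOUNDS`) to `AddIndexHint()`, whose signature is not shown in this diff, so every name and type here is an assumption.

```cpp
// Hypothetical sketch only: expand "price,brand_id" into one hint per attribute.
#include <sstream>
#include <string>
#include <vector>

enum class SecondaryIndexType_e { LOOKUP, ANALYZER, INDEX };

struct IndexHint_t
{
	std::string          m_sIndex;
	SecondaryIndexType_e m_eType;
	bool                 m_bForce;
};

static void AddIndexHint ( std::vector<IndexHint_t> & dHints, SecondaryIndexType_e eType, bool bForce, const std::string & sAttrList )
{
	std::stringstream tStream ( sAttrList );
	std::string sAttr;
	while ( std::getline ( tStream, sAttr, ',' ) )
		if ( !sAttr.empty() )
			dHints.push_back ( { sAttr, eType, bForce } );
}

int main()
{
	std::vector<IndexHint_t> dHints;
	AddIndexHint ( dHints, SecondaryIndexType_e::ANALYZER, false, "price,brand_id" ); // NO_ColumnarScan(price,brand_id)
	return (int)dHints.size(); // 2
}
```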
2 changes: 1 addition & 1 deletion test/test_430/model.bin

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions test/test_430/test.xml
@@ -180,6 +180,12 @@ show meta;

select title from META:all where match('two') and property='Ten' /*+ NO_OptimizeFulltextFilters */;
show meta;

select brand_id from test_col where price&gt;500 and brand_id&gt;5;
show meta;
select brand_id from test_col where price&gt;500 and brand_id&gt;5 /*+ NO_ColumnarScan(price,brand_id) */;
show meta;

</sphinxql></queries>

</test>
