Merge branch 'better_hints' into 'master'
Misc fixes to CBO and index hints

See merge request manticoresearch/dev!358
glookka committed Mar 17, 2023
2 parents f9f2fde + 61a5f39 commit e77dd72
Showing 10 changed files with 91 additions and 27 deletions.
5 changes: 5 additions & 0 deletions manual/Changelog.md
@@ -6,13 +6,18 @@
* [Commit be6b](https://github.com/manticoresoftware/manticoresearch/commit/be6b2ea20b0cb720db645e63f208ca3d7be6c276) Fixed full paths to external files not being displayed correctly in SHOW CREATE TABLE
* [Issue #1052](https://github.com/manticoresoftware/manticoresearch/issues/1052) rt_attr_json column won't work with columnar storage
* [gl #3287] Fixed crash when the disk is possibly out of space
* [gl #548] Fixed DocidIndex cost calculation
* [gl #3361] Fixed index hints to support multiple attributes
* [gl #3363] Updated export ranker output to match `packedfactors()`
* [Commit 2196](https://github.com/manticoresoftware/manticoresearch/commit/21966fbf) Fixed wildcards in queries so they are not affected by ignore_chars
* [Commit 1990](https://github.com/manticoresoftware/manticoresearch/commit/1990e350) Fixed daemon crash on federated queries with aggregates

### Major new features
* Query optimizer now works for fulltext queries

### Minor changes
* Fixed DocidIndex cost calculation
* Added warnings on invalid index hints
* Queries using `count(*)` with a single filter now utilize precalculated data from secondary indexes (if available), resulting in significantly faster query times.
* Added [UINT64()](../Functions/Type_casting_functions.md#UINT64%28%29) type conversion function.
* String fields/attributes that are both `indexed` and `attribute` are now treated as a single field on `INSERT`, `DESC` and `ALTER`.
15 changes: 12 additions & 3 deletions src/costestimate.cpp
@@ -38,7 +38,7 @@ class CostEstimate_c : public CostEstimate_i
static constexpr float COST_INDEX_READ_DENSE_BITMAP = 1.5f;
static constexpr float COST_INDEX_READ_SPARSE = 30.0f;
static constexpr float COST_INDEX_UNION_COEFF = 4.0f;
static constexpr float COST_LOOKUP_READ = 7.0f;
static constexpr float COST_LOOKUP_READ = 20.0f;
static constexpr float COST_INDEX_ITERATOR_INIT = 150.0f;

const CSphVector<SecondaryIndexInfo_t> & m_dSIInfo;
@@ -60,6 +60,7 @@ class CostEstimate_c : public CostEstimate_i
float CalcFilterCost ( bool bFromIterator, float fDocsAfterIndexes ) const;
float CalcAnalyzerCost() const;
float CalcLookupCost() const;
float CalcPushCost ( float fDocsAfterFilters ) const;
float CalcMTCost ( float fCost ) const;

float CalcGetFilterComplexity ( const SecondaryIndexInfo_t & tSIInfo, const CSphFilterSettings & tFilter ) const;
@@ -264,11 +265,19 @@ float CostEstimate_c::CalcLookupCost() const
if ( i.m_eType==SecondaryIndexType_e::LOOKUP )
iDocsToReadLookup += i.m_iRsetEstimate;

iDocsToReadLookup = ApplyCutoff(iDocsToReadLookup);
// no cutoff here since lookup reader fetches all docs and sorts them
return Cost_LookupRead ( iDocsToReadLookup );
}


float CostEstimate_c::CalcPushCost ( float fDocsAfterFilters ) const
{
int64_t iDocsToPush = fDocsAfterFilters*m_tCtx.m_iTotalDocs;
iDocsToPush = ApplyCutoff(iDocsToPush);
return Cost_Push ( iDocsToPush );
}


float CostEstimate_c::CalcMTCost ( float fCost ) const
{
if ( m_tCtx.m_iThreads==1 )
@@ -365,7 +374,7 @@ float CostEstimate_c::CalcQueryCost()
fCost += CalcIndexCost();

if ( m_tCtx.m_bCalcPushCost )
fCost += Cost_Push ( uint64_t(fDocsAfterFilters*m_tCtx.m_iTotalDocs) );
fCost += CalcPushCost(fDocsAfterFilters);

if ( !iNumIndexes && !iNumLookups ) // SI and docid lookups always run in a single thread
fCost = CalcMTCost(fCost);
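For context, a minimal sketch of the cutoff-aware push cost that the new `CalcPushCost()` introduces. The `COST_PUSH` coefficient, the `Ctx_t` struct and the exact `ApplyCutoff()` semantics are not part of this hunk and are assumptions; only the shape of `CalcPushCost()` mirrors the diff above.

```cpp
// Illustrative sketch only: coefficient value, Ctx_t fields and ApplyCutoff() semantics are assumed.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr float COST_PUSH = 1.0f; // assumed per-document cost of pushing a match to the sorter

struct Ctx_t
{
	int64_t m_iTotalDocs = 0;
	int64_t m_iCutoff = -1; // -1 means "no cutoff requested"
};

// Clamp the number of costed documents by the query cutoff, if one is set.
static int64_t ApplyCutoff ( int64_t iDocs, int64_t iCutoff )
{
	return iCutoff>=0 ? std::min ( iDocs, iCutoff ) : iDocs;
}

// Mirrors the new CalcPushCost(): turn the post-filter fraction into absolute documents,
// clamp by the cutoff (which the old inline Cost_Push call never did), then apply the coefficient.
static float CalcPushCost ( float fDocsAfterFilters, const Ctx_t & tCtx )
{
	int64_t iDocsToPush = int64_t ( fDocsAfterFilters*tCtx.m_iTotalDocs );
	iDocsToPush = ApplyCutoff ( iDocsToPush, tCtx.m_iCutoff );
	return float(iDocsToPush)*COST_PUSH;
}

int main()
{
	Ctx_t tCtx { 10000000, 1000 };
	printf ( "push cost: %.1f\n", CalcPushCost ( 0.2f, tCtx ) ); // 2M docs pass filters, but only 1000 are costed
	return 0;
}
```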
9 changes: 2 additions & 7 deletions src/docidlookup.cpp
@@ -462,12 +462,7 @@ bool RowidIterator_LookupRange_T<ROWID_LIMITS,BITMAP>::Fill()
{
DocID_t tLookupDocID = 0;
RowID_t tLookupRowID = INVALID_ROWID;

RowID_t * pRowIdStart = BASE::m_dCollected.Begin();
RowID_t * pRowIdMax = pRowIdStart + BASE::m_dCollected.GetLength()-1;
RowID_t * pRowID = pRowIdStart;

while ( pRowID<pRowIdMax && m_pReader->Read ( tLookupDocID, tLookupRowID ) )
while ( m_pReader->Read ( tLookupDocID, tLookupRowID ) )
{
m_iProcessed++;

@@ -578,7 +573,7 @@ bool RowidIterator_LookupRangeExclude_T<ROWID_LIMITS,BITMAP>::Fill()

static bool NeedBitmapStorage ( int64_t iRsetSize, DWORD uTotalDocs )
{
return float(iRsetSize)/uTotalDocs > 0.05f;
return float(iRsetSize)/uTotalDocs > 0.001f;
}


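To put the new `NeedBitmapStorage()` threshold into perspective, a small worked example (document counts are made up). Lowering the threshold appears to complement the removal of the collected-rowid limit in `RowidIterator_LookupRange_T<>::Fill()` above, since unbounded reads make large result sets more likely.

```cpp
// Worked example (made-up document counts): when does the lookup iterator
// switch from a plain collected-rowid vector to bitmap storage?
//
//   uTotalDocs = 10'000'000
//   old threshold (> 0.05):  bitmap once the estimated result set exceeds 500'000 docs
//   new threshold (> 0.001): bitmap once it exceeds 10'000 docs
//
// Signature simplified: the original takes a DWORD for uTotalDocs.
#include <cstdint>
#include <cstdio>

static bool NeedBitmapStorage ( int64_t iRsetSize, uint32_t uTotalDocs )
{
	return float(iRsetSize)/uTotalDocs > 0.001f;
}

int main()
{
	printf ( "50k of 10M docs -> bitmap: %d\n", (int)NeedBitmapStorage ( 50000, 10000000 ) ); // 1 now, 0 under the old 5% rule
	return 0;
}
```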
2 changes: 1 addition & 1 deletion src/docidlookup.h
@@ -204,7 +204,7 @@ class LookupReaderIterator_c : private LookupReader_c
return;

// tDocID is inside current block (check with next min docid)? do nothing
if ( tDocID < ( m_pCurCheckpoint+1 )->m_tBaseDocID )
if ( (uint64_t)tDocID < uint64_t ( ( m_pCurCheckpoint+1 )->m_tBaseDocID ) )
return;

// perform binary search starting with next checkpoint
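The single-line change above fixes the checkpoint test for docids with the high bit set. A self-contained illustration, assuming `DocID_t` is a signed 64-bit integer:

```cpp
// Why the cast matters (assuming DocID_t is a signed 64-bit integer): docids above 2^63-1
// wrap negative as signed values, so a signed '<' orders them before small base docids and
// the reader would wrongly conclude the docid is still inside the current block.
#include <cstdint>
#include <cstdio>

using DocID_t = int64_t;

int main()
{
	DocID_t tDocID = (DocID_t)0x8000000000000001ULL; // "huge" docid, negative when viewed as signed
	DocID_t tNextBaseDocID = 100;

	bool bSigned   = tDocID < tNextBaseDocID;                     // true  -- wrong, skips the checkpoint search
	bool bUnsigned = (uint64_t)tDocID < uint64_t(tNextBaseDocID); // false -- correct unsigned ordering
	printf ( "signed: %d, unsigned: %d\n", (int)bSigned, (int)bUnsigned );
	return 0;
}
```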
42 changes: 41 additions & 1 deletion src/secondaryindex.cpp
@@ -799,9 +799,48 @@ static void FetchNumSIIterators ( CSphVector<SecondaryIndexInfo_t> & dSIInfo, co
}
}


static bool CheckHints ( const CSphVector<SecondaryIndexInfo_t> & dSIInfo, const SelectIteratorCtx_t & tCtx, CSphString & sWarning )
{
for ( auto & i : tCtx.m_dHints )
switch ( i.m_eType )
{
case SecondaryIndexType_e::LOOKUP:
if ( i.m_sIndex!=sphGetDocidName() )
{
sWarning = "hint error: DocidIndex can only be applied to 'id' attribute";
return false;
}
break;

case SecondaryIndexType_e::ANALYZER:
case SecondaryIndexType_e::INDEX:
if ( !tCtx.m_tSchema.GetAttr ( i.m_sIndex.cstr() ) )
{
sWarning.SetSprintf ( "hint error: '%s' attribute not found", i.m_sIndex.cstr() );
return false;
}
break;

default:
break;
}

ARRAY_FOREACH ( i, tCtx.m_dFilters )
for ( auto & tHint : tCtx.m_dHints )
if ( tHint.m_sIndex==tCtx.m_dFilters[i].m_sAttrName && tHint.m_bForce )
if ( !dSIInfo[i].m_dCapabilities.any_of ( [&tHint]( auto eSupported ){ return tHint.m_eType==eSupported; } ) )
{
sWarning.SetSprintf ( "hint error: requested hint type not supported for '%s' attribute", tHint.m_sIndex.cstr() );
return false;
}

return true;
}

/////////////////////////////////////////////////////////////////////

CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost )
CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost, CSphString & sWarning )
{
fBestCost = FLT_MAX;

@@ -820,6 +859,7 @@ CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & t
DisableRowidFilters ( dSIInfo, tCtx );
FetchPartialColumnarMinMax ( dSIInfo, tCtx );
FetchNumSIIterators ( dSIInfo, tCtx );
CheckHints ( dSIInfo, tCtx, sWarning );

CSphVector<int> dCapabilities ( dSIInfo.GetLength() );
CSphVector<int> dBest ( dSIInfo.GetLength() );
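Note that the warning in the second loop of `CheckHints()` is produced only for forced hints whose type the corresponding filter cannot serve. A simplified, standalone sketch of that check; the surrounding types are assumptions, since the real `m_dCapabilities` contents are filled in elsewhere by the CBO.

```cpp
// Standalone sketch of the forced-hint capability check (types heavily simplified; assumptions).
#include <algorithm>
#include <string>
#include <vector>

enum class SecondaryIndexType_e { NONE, FILTER, LOOKUP, ANALYZER, INDEX };

struct IndexHint_t          { std::string m_sIndex; SecondaryIndexType_e m_eType; bool m_bForce; };
struct SecondaryIndexInfo_t { std::vector<SecondaryIndexType_e> m_dCapabilities; };

// A forced hint is rejected (with a warning) when the filter on that attribute cannot be
// served by the requested iterator type -- the same rule the new CheckHints() applies.
static bool HintSupported ( const IndexHint_t & tHint, const SecondaryIndexInfo_t & tSIInfo, std::string & sWarning )
{
	if ( !tHint.m_bForce )
		return true;

	bool bSupported = std::any_of ( tSIInfo.m_dCapabilities.begin(), tSIInfo.m_dCapabilities.end(),
		[&tHint]( SecondaryIndexType_e eSupported ){ return tHint.m_eType==eSupported; } );

	if ( !bSupported )
		sWarning = "hint error: requested hint type not supported for '" + tHint.m_sIndex + "' attribute";

	return bSupported;
}

int main()
{
	SecondaryIndexInfo_t tInfo { { SecondaryIndexType_e::FILTER, SecondaryIndexType_e::ANALYZER } };
	IndexHint_t tHint { "price", SecondaryIndexType_e::INDEX, true }; // force SecondaryIndex() where it can't apply
	std::string sWarning;
	return HintSupported ( tHint, tInfo, sWarning ) ? 0 : 1; // returns 1, sWarning filled
}
```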
2 changes: 1 addition & 1 deletion src/secondaryindex.h
@@ -46,7 +46,7 @@ const CSphFilterSettings * GetRowIdFilter ( const CSphVector<CSphFilterSettings>

bool ReturnIteratorResult ( RowID_t * pRowID, RowID_t * pRowIdStart, RowIdBlock_t & dRowIdBlock );

CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost );
CSphVector<SecondaryIndexInfo_t> SelectIterators ( const SelectIteratorCtx_t & tCtx, float & fBestCost, CSphString & sWarning );

namespace SI
{
18 changes: 11 additions & 7 deletions src/sphinx.cpp
@@ -1387,7 +1387,7 @@ class CSphIndex_VLN : public CSphIndex, public IndexAlterHelper_c, public DebugC

bool SplitQuery ( CSphQueryResult & tResult, const CSphQuery & tQuery, const VecTraits_T<ISphMatchSorter *> & dAllSorters, const CSphMultiQueryArgs & tArgs, int64_t tmMaxTimer ) const;
RowidIterator_i * SpawnIterators ( const CSphQuery & tQuery, CSphQueryContext & tCtx, CreateFilterContext_t & tFlx, const ISphSchema & tMaxSorterSchema, CSphQueryResultMeta & tMeta, int iCutoff, int iThreads, CSphVector<CSphFilterSettings> & dModifiedFilters, ISphRanker * pRanker ) const;
bool SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads ) const;
bool SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads, CSphString & sWarning ) const;

bool IsQueryFast ( const CSphQuery & tQuery ) const;
bool CheckEnabledIndexes ( const CSphQuery & tQuery, int iThreads, bool & bFastQuery ) const;
@@ -3033,8 +3033,9 @@ bool CSphIndex_VLN::CheckEnabledIndexes ( const CSphQuery & tQuery, int iThreads
float fCost = FLT_MAX;
int iCutoff = ApplyImplicitCutoff ( tQuery, {} );

CSphString sWarning;
SelectIteratorCtx_t tCtx ( tQuery.m_dFilters, tQuery.m_dFilterTree, tQuery.m_dIndexHints, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), tQuery.m_eCollation, iCutoff, m_iDocinfo, iThreads );
CSphVector<SecondaryIndexInfo_t> dEnabledIndexes = SelectIterators ( tCtx, fCost );
CSphVector<SecondaryIndexInfo_t> dEnabledIndexes = SelectIterators ( tCtx, fCost, sWarning );

// disable pseudo sharding if any of the queries use secondary indexes/docid lookups
if ( dEnabledIndexes.any_of ( []( const SecondaryIndexInfo_t & tSI ){ return tSI.m_eType==SecondaryIndexType_e::INDEX || tSI.m_eType==SecondaryIndexType_e::LOOKUP; } ) )
@@ -7892,6 +7893,7 @@ RowidIterator_i * CSphIndex_VLN::CreateColumnarAnalyzerOrPrefilter ( CSphVector<
ToColumnarFilters ( dFilters, dColumnarFilters, dFilterMap, tSchema, eCollation, sWarning );

// remove disabled analyzers
int iRemoved = 0;
for ( size_t i = 0; i < dFilterMap.size(); )
{
bool bAnalyzer = dSIInfo[i].m_eType==SecondaryIndexType_e::ANALYZER;
@@ -7900,8 +7902,10 @@ RowidIterator_i * CSphIndex_VLN::CreateColumnarAnalyzerOrPrefilter ( CSphVector<
{
int iColumnarFilter = dFilterMap[i];
dFilterMap.erase ( dFilterMap.begin()+i );
iRemoved++;

if ( iColumnarFilter!=-1 )
dColumnarFilters.erase ( dColumnarFilters.begin()+iColumnarFilter );
dColumnarFilters.erase ( dColumnarFilters.begin() + ( iColumnarFilter-iRemoved+1 ) );
}
else
i++;
@@ -7943,7 +7947,7 @@ static void RecreateFilters ( const CSphVector<SecondaryIndexInfo_t> & dSIInfo,
}


bool CSphIndex_VLN::SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads ) const
bool CSphIndex_VLN::SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * pRanker, CSphVector<SecondaryIndexInfo_t> & dSIInfo, int iCutoff, int iThreads, CSphString & sWarning ) const
{
bool bForce = false;
for ( const auto & tHint : tQuery.m_dIndexHints )
@@ -7966,7 +7970,7 @@ bool CSphIndex_VLN::SelectIteratorsFT ( const CSphQuery & tQuery, ISphRanker * p
SelectIteratorCtx_t tSelectIteratorCtx ( tQuery.m_dFilters, tQuery.m_dFilterTree, tQuery.m_dIndexHints, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), tQuery.m_eCollation, iCutoff, m_iDocinfo, iThreads );
tSelectIteratorCtx.IgnorePushCost();
float fBestCost = FLT_MAX;
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost );
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost, sWarning );

// check that we have anything non-plain-filter. if not, bail out
if ( !dSIInfo.any_of ( []( const auto & tInfo ){ return tInfo.m_eType==SecondaryIndexType_e::LOOKUP || tInfo.m_eType==SecondaryIndexType_e::INDEX || tInfo.m_eType==SecondaryIndexType_e::ANALYZER; } ) )
@@ -8024,11 +8028,11 @@ RowidIterator_i * CSphIndex_VLN::SpawnIterators ( const CSphQuery & tQuery, CSph
// For now we use approach b) as it is simpler
float fBestCost = FLT_MAX;
SelectIteratorCtx_t tSelectIteratorCtx ( tQuery.m_dFilters, tQuery.m_dFilterTree, tQuery.m_dIndexHints, m_tSchema, m_pHistograms, m_pColumnar.get(), m_pSIdx.get(), tQuery.m_eCollation, iCutoff, m_iDocinfo, iThreads );
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost );
dSIInfo = SelectIterators ( tSelectIteratorCtx, fBestCost, tMeta.m_sWarning );
}
else
{
if ( !SelectIteratorsFT ( tQuery, pRanker, dSIInfo, iCutoff, iThreads ) )
if ( !SelectIteratorsFT ( tQuery, pRanker, dSIInfo, iCutoff, iThreads, tMeta.m_sWarning ) )
return nullptr;
}

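The `iRemoved` bookkeeping in `CreateColumnarAnalyzerOrPrefilter()` appears to account for the fact that `dFilterMap` stores indexes computed against `dColumnarFilters` before any erasures. A generic illustration of that pattern, with made-up data rather than the project's types:

```cpp
// Generic illustration (made-up data): indexes recorded against a vector's original layout
// must be shifted down by the number of elements erased in front of them -- the pattern the
// new iRemoved counter appears to implement when erasing from dColumnarFilters.
#include <cassert>
#include <vector>

int main()
{
	std::vector<int> dValues  = { 10, 11, 12, 13, 14 };
	std::vector<int> dToErase = { 1, 3 }; // positions in the ORIGINAL layout, ascending

	int iRemoved = 0;
	for ( int iOriginalIdx : dToErase )
	{
		dValues.erase ( dValues.begin() + ( iOriginalIdx - iRemoved ) ); // adjust for earlier erasures
		iRemoved++;
	}

	assert ( dValues == ( std::vector<int> { 10, 12, 14 } ) );
	return 0;
}
```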
17 changes: 11 additions & 6 deletions src/sphinxql.y
@@ -1177,28 +1177,33 @@ hint_list:
| hint_list hint_item
;

hint_attr_list:
ident
| hint_attr_list ',' ident {TRACK_BOUNDS ( $$, $1, $3 );}
;

hint_item:
TOK_HINT_SECONDARY '(' ident ')'
TOK_HINT_SECONDARY '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::INDEX, true, $3 );
}
| TOK_HINT_NO_SECONDARY '(' ident ')'
| TOK_HINT_NO_SECONDARY '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::INDEX, false, $3 );
}
| TOK_HINT_DOCID '(' ident ')'
| TOK_HINT_DOCID '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::LOOKUP, true, $3 );
}
| TOK_HINT_NO_DOCID '(' ident ')'
| TOK_HINT_NO_DOCID '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::LOOKUP, false, $3 );
}
| TOK_HINT_COLUMNAR '(' ident ')'
| TOK_HINT_COLUMNAR '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::ANALYZER, true, $3 );
}
| TOK_HINT_NO_COLUMNAR '(' ident ')'
| TOK_HINT_NO_COLUMNAR '(' hint_attr_list ')'
{
pParser->AddIndexHint ( SecondaryIndexType_e::ANALYZER, false, $3 );
}
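The grammar now accepts a comma-separated attribute list per hint (exercised by the `NO_ColumnarScan(price,brand_id)` query added to test_430 below). A hypothetical sketch of how such a list could be expanded into per-attribute hints; the real parser passes token spans (`TRACK_BOUNDS`) to `AddIndexHint()`, whose signature is not shown in this diff, so every name and type here is an assumption.

```cpp
// Hypothetical sketch only: expand "price,brand_id" into one hint per attribute.
#include <sstream>
#include <string>
#include <vector>

enum class SecondaryIndexType_e { LOOKUP, ANALYZER, INDEX };

struct IndexHint_t
{
	std::string          m_sIndex;
	SecondaryIndexType_e m_eType;
	bool                 m_bForce;
};

static void AddIndexHint ( std::vector<IndexHint_t> & dHints, SecondaryIndexType_e eType, bool bForce, const std::string & sAttrList )
{
	std::stringstream tStream ( sAttrList );
	std::string sAttr;
	while ( std::getline ( tStream, sAttr, ',' ) )
		if ( !sAttr.empty() )
			dHints.push_back ( { sAttr, eType, bForce } );
}

int main()
{
	std::vector<IndexHint_t> dHints;
	AddIndexHint ( dHints, SecondaryIndexType_e::ANALYZER, false, "price,brand_id" ); // NO_ColumnarScan(price,brand_id)
	return (int)dHints.size(); // 2
}
```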
2 changes: 1 addition & 1 deletion test/test_430/model.bin

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions test/test_430/test.xml
@@ -180,6 +180,12 @@ show meta;

select title from META:all where match('two') and property='Ten' /*+ NO_OptimizeFulltextFilters */;
show meta;

select brand_id from test_col where price&gt;500 and brand_id&gt;5;
show meta;
select brand_id from test_col where price&gt;500 and brand_id&gt;5 /*+ NO_ColumnarScan(price,brand_id) */;
show meta;

</sphinxql></queries>

</test>
