Skip to content

Commit

Permalink
add more extensive checking of shared data sets and fix bit compressi…
Browse files Browse the repository at this point in the history
…on bugs and asserts
  • Loading branch information
paulbkoch committed Nov 11, 2022
1 parent 1176253 commit c9b43de
Show file tree
Hide file tree
Showing 16 changed files with 895 additions and 482 deletions.
18 changes: 9 additions & 9 deletions shared/ebm_native/ApplyUpdate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ struct ApplyUpdateInternal final {
EBM_ASSERT(cItemsPerBitPack <= k_cBitsForStorageType);

cBitsPerItemMax = GetCountBits<StorageDataType>(cItemsPerBitPack);
EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForStorageType);

cShift = static_cast<ptrdiff_t>((cSamples - 1) % cItemsPerBitPack * cBitsPerItemMax);
cShiftReset = static_cast<ptrdiff_t>((cItemsPerBitPack - 1) * cBitsPerItemMax);

EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForSizeT);
maskBits = (~size_t { 0 }) >> (k_cBitsForSizeT - cBitsPerItemMax);
maskBits = static_cast<size_t>(MakeLowMask<StorageDataType>(cBitsPerItemMax));

pInputData = pData->m_aPacked;
}
Expand Down Expand Up @@ -245,13 +245,13 @@ struct ApplyUpdateInternal<2, compilerBitPack, bKeepGradHess, bCalcMetric, bWeig
EBM_ASSERT(cItemsPerBitPack <= k_cBitsForStorageType);

cBitsPerItemMax = GetCountBits<StorageDataType>(cItemsPerBitPack);
EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForStorageType);

cShift = static_cast<ptrdiff_t>((cSamples - 1) % cItemsPerBitPack * cBitsPerItemMax);
cShiftReset = static_cast<ptrdiff_t>((cItemsPerBitPack - 1) * cBitsPerItemMax);

EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForSizeT);
maskBits = (~size_t { 0 }) >> (k_cBitsForSizeT - cBitsPerItemMax);
maskBits = static_cast<size_t>(MakeLowMask<StorageDataType>(cBitsPerItemMax));

pInputData = pData->m_aPacked;
}
Expand Down Expand Up @@ -392,13 +392,13 @@ struct ApplyUpdateInternal<k_regression, compilerBitPack, bKeepGradHess, bCalcMe
EBM_ASSERT(cItemsPerBitPack <= k_cBitsForStorageType);

cBitsPerItemMax = GetCountBits<StorageDataType>(cItemsPerBitPack);
EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForStorageType);

cShift = static_cast<ptrdiff_t>((cSamples - 1) % cItemsPerBitPack * cBitsPerItemMax);
cShiftReset = static_cast<ptrdiff_t>((cItemsPerBitPack - 1) * cBitsPerItemMax);

EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForSizeT);
maskBits = (~size_t { 0 }) >> (k_cBitsForSizeT - cBitsPerItemMax);
maskBits = static_cast<size_t>(MakeLowMask<StorageDataType>(cBitsPerItemMax));

pInputData = pData->m_aPacked;
}
Expand Down
6 changes: 3 additions & 3 deletions shared/ebm_native/BinSumsBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,13 +64,13 @@ INLINE_RELEASE_TEMPLATED static ErrorEbm BinSumsBoostingInternal(BinSumsBoosting
EBM_ASSERT(cItemsPerBitPack <= k_cBitsForStorageType);

cBitsPerItemMax = GetCountBits<StorageDataType>(cItemsPerBitPack);
EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForStorageType);

cShift = static_cast<ptrdiff_t>((cSamples - 1) % cItemsPerBitPack * cBitsPerItemMax);
cShiftReset = static_cast<ptrdiff_t>((cItemsPerBitPack - 1) * cBitsPerItemMax);

EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForSizeT);
maskBits = (~size_t { 0 }) >> (k_cBitsForSizeT - cBitsPerItemMax);
maskBits = static_cast<size_t>(MakeLowMask<StorageDataType>(cBitsPerItemMax));

pInputData = pParams->m_aPacked;
}
Expand Down
6 changes: 3 additions & 3 deletions shared/ebm_native/BinSumsInteraction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,14 @@ INLINE_RELEASE_TEMPLATED static ErrorEbm BinSumsInteractionInternal(BinSumsInter
EBM_ASSERT(cItemsPerBitPack <= k_cBitsForStorageType);

const size_t cBitsPerItemMax = GetCountBits<StorageDataType>(cItemsPerBitPack);
EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForStorageType);
pDimensionalData->m_cBitsPerItemMax = cBitsPerItemMax;

pDimensionalData->m_cShift = static_cast<ptrdiff_t>(((cSamples - 1) % cItemsPerBitPack + 1) * cBitsPerItemMax);
pDimensionalData->m_cShiftReset = static_cast<ptrdiff_t>((cItemsPerBitPack - 1) * cBitsPerItemMax);

EBM_ASSERT(1 <= cBitsPerItemMax);
EBM_ASSERT(cBitsPerItemMax <= k_cBitsForSizeT);
const size_t maskBits = (~size_t { 0 }) >> (k_cBitsForSizeT - cBitsPerItemMax);
const size_t maskBits = static_cast<size_t>(MakeLowMask<StorageDataType>(cBitsPerItemMax));
pDimensionalData->m_maskBits = maskBits;

pDimensionalData->m_cBins = pParams->m_acBins[iDimensionInit];
Expand Down
46 changes: 37 additions & 9 deletions shared/ebm_native/BoosterCore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,15 +238,22 @@ ErrorEbm BoosterCore::Create(
// give ownership of our object back to the caller, even if there is a failure
*ppBoosterCoreOut = pBoosterCore;

size_t cSamples = 0;
size_t cFeatures = 0;
size_t cWeights = 0;
size_t cTargets = 0;
error = GetDataSetSharedHeader(pDataSetShared, &cSamples, &cFeatures, &cWeights, &cTargets);
SharedStorageDataType countSamples;
size_t cFeatures;
size_t cWeights;
size_t cTargets;
error = GetDataSetSharedHeader(pDataSetShared, &countSamples, &cFeatures, &cWeights, &cTargets);
if(Error_None != error) {
// already logged
return error;
}

if(IsConvertError<size_t>(countSamples)) {
LOG_0(Trace_Error, "ERROR BoosterCore::Create IsConvertError<size_t>(countSamples)");
return Error_IllegalParamVal;
}
size_t cSamples = static_cast<size_t>(countSamples);

if(size_t { 1 } < cWeights) {
LOG_0(Trace_Warning, "WARNING BoosterCore::Create size_t { 1 } < cWeights");
return Error_IllegalParamVal;
Expand All @@ -257,7 +264,10 @@ ErrorEbm BoosterCore::Create(
}

ptrdiff_t cClasses;
GetDataSetSharedTarget(pDataSetShared, 0, &cClasses);
if(nullptr == GetDataSetSharedTarget(pDataSetShared, 0, &cClasses)) {
LOG_0(Trace_Warning, "WARNING BoosterCore::Create cClasses cannot fit into ptrdiff_t");
return Error_IllegalParamVal;
}

size_t cTrainingSamples;
size_t cValidationSamples;
Expand All @@ -284,24 +294,30 @@ ErrorEbm BoosterCore::Create(

size_t iFeatureInitialize = size_t { 0 };
do {
size_t cBins;
bool bMissing;
bool bUnknown;
bool bNominal;
bool bSparse;
SharedStorageDataType countBins;
SharedStorageDataType defaultValSparse;
size_t cNonDefaultsSparse;
GetDataSetSharedFeature(
pDataSetShared,
iFeatureInitialize,
&cBins,
&bMissing,
&bUnknown,
&bNominal,
&bSparse,
&countBins,
&defaultValSparse,
&cNonDefaultsSparse
);
EBM_ASSERT(!bSparse); // we do not handle yet
if(IsConvertError<size_t>(countBins)) {
LOG_0(Trace_Error, "ERROR BoosterCore::Create IsConvertError<size_t>(countBins)");
return Error_IllegalParamVal;
}
const size_t cBins = static_cast<size_t>(countBins);
if(0 == cBins) {
if(0 != cSamples) {
LOG_0(Trace_Error, "ERROR BoosterCore::Create countBins cannot be zero if either 0 < cTrainingSamples OR 0 < cValidationSamples");
Expand Down Expand Up @@ -445,9 +461,21 @@ ErrorEbm BoosterCore::Create(
if(LIKELY(size_t { 1 } != cTensorBins)) {
EBM_ASSERT(1 <= cRealDimensions);

const size_t cBitsRequiredMin = CountBitsRequired(cTensorBins - 1);
const size_t iTensorBinMax = cTensorBins - size_t { 1 };

if(IsConvertError<StorageDataType>(iTensorBinMax)) {
LOG_0(Trace_Warning, "WARNING BoosterCore::Create IsConvertError<StorageDataType>(iTensorBinMax)");
return Error_OutOfMemory;
}

const size_t cBitsRequiredMin = CountBitsRequired(iTensorBinMax);
EBM_ASSERT(1 <= cBitsRequiredMin); // 1 < cTensorBins otherwise we'd have filtered it out above
EBM_ASSERT(cBitsRequiredMin <= k_cBitsForSizeT);
EBM_ASSERT(cBitsRequiredMin <= k_cBitsForStorageType);

cItemsPerBitPack = static_cast<ptrdiff_t>(GetCountItemsBitPacked<StorageDataType>(cBitsRequiredMin));
EBM_ASSERT(ptrdiff_t { 1 } <= cItemsPerBitPack);
EBM_ASSERT(cItemsPerBitPack <= ptrdiff_t { k_cBitsForStorageType });

if(size_t { 1 } == cRealDimensions) {
cSingleDimensionBinsMax = EbmMax(cSingleDimensionBinsMax, cSingleDimensionBins);
Expand Down
54 changes: 35 additions & 19 deletions shared/ebm_native/DataSetBoosting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ INLINE_RELEASE_UNTEMPLATED static StorageDataType * ConstructTargetData(
ptrdiff_t cClasses;
const void * const aTargets = GetDataSetSharedTarget(pDataSetShared, 0, &cClasses);
EBM_ASSERT(1 <= cClasses); // this should be classification, and 0 < cSetSamples
EBM_ASSERT(nullptr != aTargets);
EBM_ASSERT(nullptr != aTargets); // we previously called GetDataSetSharedTarget and got back non-null result

const size_t countClasses = static_cast<size_t>(cClasses);

Expand Down Expand Up @@ -260,10 +260,11 @@ INLINE_RELEASE_UNTEMPLATED static StorageDataType * * ConstructInputData(
} else {
EBM_ASSERT(1 <= pTerm->GetTermBitPack());
const size_t cItemsPerBitPackTo = static_cast<size_t>(pTerm->GetTermBitPack());
// for a 32/64 bit storage item, we can't have more than 32/64 bit packed items stored
EBM_ASSERT(1 <= cItemsPerBitPackTo);
EBM_ASSERT(cItemsPerBitPackTo <= k_cBitsForStorageType);

const size_t cBitsPerItemMaxTo = GetCountBits<StorageDataType>(cItemsPerBitPackTo);
// if we have 1 item, it can't be larger than the number of bits of storage
EBM_ASSERT(1 <= cBitsPerItemMaxTo);
EBM_ASSERT(cBitsPerItemMaxTo <= k_cBitsForStorageType);

EBM_ASSERT(1 <= cSetSamples);
Expand Down Expand Up @@ -298,44 +299,56 @@ INLINE_RELEASE_UNTEMPLATED static StorageDataType * * ConstructInputData(
EBM_ASSERT(!IsConvertError<size_t>(indexFeature)); // we converted it previously
const size_t iFeature = static_cast<size_t>(indexFeature);

size_t cBinsUnused;
bool bMissing;
bool bUnknown;
bool bNominal;
bool bSparse;
SharedStorageDataType cBinsUnused;
SharedStorageDataType defaultValSparse;
size_t cNonDefaultsSparse;
const void * pInputDataFrom = GetDataSetSharedFeature(
pDataSetShared,
iFeature,
&cBinsUnused,
&bMissing,
&bUnknown,
&bNominal,
&bSparse,
&cBinsUnused,
&defaultValSparse,
&cNonDefaultsSparse
);
EBM_ASSERT(nullptr != pInputDataFrom);
EBM_ASSERT(cBinsUnused == cBins);
EBM_ASSERT(!IsConvertError<size_t>(cBinsUnused)); // since we previously extracted cBins and checked
EBM_ASSERT(static_cast<size_t>(cBinsUnused) == cBins);
EBM_ASSERT(!bSparse); // we don't support sparse yet

pDimensionInfoInit->m_pInputData = static_cast<const SharedStorageDataType *>(pInputDataFrom);
pDimensionInfoInit->m_cBins = cBins;

const size_t cBitsRequiredMin = CountBitsRequired(cBins - 1);
const size_t cBitsRequiredMin = CountBitsRequired(cBins - size_t { 1 });
EBM_ASSERT(1 <= cBitsRequiredMin);
EBM_ASSERT(cBitsRequiredMin <= k_cBitsForSharedStorageType);
EBM_ASSERT(cBitsRequiredMin <= k_cBitsForSharedStorageType); // comes from shared data set
EBM_ASSERT(cBitsRequiredMin <= k_cBitsForSizeT); // since cBins fits into size_t (previous call to GetDataSetSharedFeature)
// we previously calculated the tensor bin count, and with that determined that the total number of
// bins minus one (the maximum tensor index) would fit into a StorageDataType. Since any particular
// dimensional index must be less than the multiple of all of them, we know that the number of bits
// will fit into a StorageDataType
EBM_ASSERT(cBitsRequiredMin <= k_cBitsForStorageType);

const size_t cItemsPerBitPackFrom = GetCountItemsBitPacked<SharedStorageDataType>(cBitsRequiredMin);
EBM_ASSERT(1 <= cItemsPerBitPackFrom);
EBM_ASSERT(cItemsPerBitPackFrom <= k_cBitsForSharedStorageType);

const size_t cBitsPerItemMaxFrom = GetCountBits<SharedStorageDataType>(cItemsPerBitPackFrom);
EBM_ASSERT(cBitsPerItemMaxFrom <= k_cBitsForSharedStorageType);
EBM_ASSERT(1 <= cBitsPerItemMaxFrom);
EBM_ASSERT(cBitsPerItemMaxFrom <= k_cBitsForSizeT);
const size_t maskBitsFrom = (~size_t{ 0 }) >> (k_cBitsForSizeT - cBitsPerItemMaxFrom);
EBM_ASSERT(cBitsPerItemMaxFrom <= k_cBitsForSharedStorageType);

// we can only guarantee that cBitsPerItemMaxFrom is less than or equal to k_cBitsForSharedStorageType
// so we need to construct our mask in that type, but afterwards we can convert it to a
// StorageDataType since we know the ultimate answer must fit into that. If in theory
// SharedStorageDataType were allowed to be a billion bits, then the mask could be 65 bits while the end
// result would be forced to be 64 bits or less since we use the maximum number of bits per item possible
const size_t maskBitsFrom = static_cast<size_t>(MakeLowMask<SharedStorageDataType>(cBitsPerItemMaxFrom));

pDimensionInfoInit->m_cItemsPerBitPackFrom = cItemsPerBitPackFrom;
pDimensionInfoInit->m_cBitsPerItemMaxFrom = cBitsPerItemMaxFrom;
Expand Down Expand Up @@ -393,19 +406,21 @@ INLINE_RELEASE_UNTEMPLATED static StorageDataType * * ConstructInputData(
size_t tensorMultiple = 1;
InputDataPointerAndCountBins * pDimensionInfo = &dimensionInfo[0];
do {
SharedStorageDataType dataFrom = *pDimensionInfo->m_pInputData;
const size_t iData = static_cast<size_t>(dataFrom >> (pDimensionInfo->m_iShiftFrom * pDimensionInfo->m_cBitsPerItemMaxFrom)) & pDimensionInfo->m_maskBitsFrom;
const SharedStorageDataType indexDataCombined = *pDimensionInfo->m_pInputData;
EBM_ASSERT(pDimensionInfo->m_iShiftFrom * pDimensionInfo->m_cBitsPerItemMaxFrom < k_cBitsForSharedStorageType);
const size_t iData = static_cast<size_t>(indexDataCombined >>
(pDimensionInfo->m_iShiftFrom * pDimensionInfo->m_cBitsPerItemMaxFrom)) &
pDimensionInfo->m_maskBitsFrom;

// we check our dataSet when we get the header, and cBins has been checked to fit into size_t
EBM_ASSERT(iData < pDimensionInfoInit->m_cBins);

pDimensionInfo->m_iShiftFrom -= 1;
if(pDimensionInfo->m_iShiftFrom < ptrdiff_t { 0 }) {
pDimensionInfo->m_pInputData += 1;
pDimensionInfo->m_iShiftFrom += pDimensionInfo->m_cItemsPerBitPackFrom;
}

if(pDimensionInfo->m_cBins <= iData) {
// TODO: I think this check has been moved to constructing the shared dataset
LOG_0(Trace_Error, "ERROR DataSetBoosting::ConstructInputData iData value must be less than the number of bins");
goto free_all;
}
// we check for overflows during Term construction, but let's check here again
EBM_ASSERT(!IsMultiplyError(tensorMultiple, pDimensionInfo->m_cBins));

Expand All @@ -416,7 +431,8 @@ INLINE_RELEASE_UNTEMPLATED static StorageDataType * * ConstructInputData(
++pDimensionInfo;
} while(pDimensionInfoInit != pDimensionInfo);

EBM_ASSERT(!IsConvertError<StorageDataType>(tensorIndex)); // this was checked when determining packing
// during term construction we check that the maximum tensor index fits into StorageDataType
EBM_ASSERT(!IsConvertError<StorageDataType>(tensorIndex));
iTensor = static_cast<StorageDataType>(tensorIndex);
}

Expand Down
Loading

0 comments on commit c9b43de

Please # to comment.