Skip to content

Commit

Permalink
Optimize case insensitive search for ASCII text, issue #236.
Browse files: browse the repository at this point in the history
  • Loading branch information
zufuliu committed Oct 13, 2020
1 parent fb919f9 commit 526cee6
Showing 1 changed file with 40 additions and 22 deletions.
62 changes: 40 additions & 22 deletions scintilla/src/Document.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -2028,19 +2028,19 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con
std::vector<char> searchThing((lengthFind + 1) * UTF8MaxBytes * maxFoldingExpansion + 1);
const size_t lenSearch =
pcf->Fold(searchThing.data(), searchThing.size(), search, lengthFind);
char bytes[UTF8MaxBytes + 1] = "";
char folded[UTF8MaxBytes * maxFoldingExpansion + 1] = "";
const unsigned char * const searchData = reinterpret_cast<const unsigned char *>(searchThing.data());
while (forward ? (pos < endPos) : (pos >= endPos)) {
int widthFirstCharacter = 0;
Sci::Position posIndexDocument = pos;
size_t indexSearch = 0;
bool characterMatches = true;
for (;;) {
const unsigned char leadByte = cb.UCharAt(posIndexDocument);
bytes[0] = static_cast<char>(leadByte);
char bytes[UTF8MaxBytes + 1];
int widthChar = 1;
if (!UTF8IsAscii(leadByte)) {
const int widthCharBytes = UTF8BytesOfLead(leadByte);
bytes[0] = static_cast<char>(leadByte);
for (int b = 1; b < widthCharBytes; b++) {
bytes[b] = cb.CharAt(posIndexDocument + b);
}
Expand All @@ -2050,11 +2050,17 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con
widthFirstCharacter = widthChar;
if ((posIndexDocument + widthChar) > limitPos)
break;
const size_t lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
// memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing
assert((indexSearch + lenFlat) <= searchThing.size());
// Does folded match the buffer
characterMatches = 0 == memcmp(folded, searchThing.data() + indexSearch, lenFlat);
size_t lenFlat = 1;
if (widthChar == 1) {
characterMatches = searchData[indexSearch] == MakeLowerCase(leadByte);
} else {
char folded[UTF8MaxBytes * maxFoldingExpansion + 1];
lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
// memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing
assert((indexSearch + lenFlat) <= searchThing.size());
// Does folded match the buffer
characterMatches = 0 == memcmp(folded, searchData + indexSearch, lenFlat);
}
if (!characterMatches)
break;
posIndexDocument += widthChar;
Expand All @@ -2080,26 +2086,32 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con
constexpr size_t maxFoldingExpansion = 4;
std::vector<char> searchThing((lengthFind + 1) * maxBytesCharacter * maxFoldingExpansion + 1);
const size_t lenSearch = pcf->Fold(searchThing.data(), searchThing.size(), search, lengthFind);
const unsigned char * const searchData = reinterpret_cast<const unsigned char *>(searchThing.data());
while (forward ? (pos < endPos) : (pos >= endPos)) {
Sci::Position indexDocument = 0;
size_t indexSearch = 0;
bool characterMatches = true;
while (characterMatches &&
((pos + indexDocument) < limitPos) &&
(indexSearch < lenSearch)) {
char bytes[maxBytesCharacter + 1];
bytes[0] = cb.CharAt(pos + indexDocument);
const Sci::Position widthChar = IsDBCSLeadByteNoExcept(bytes[0]) ? 2 : 1;
if (widthChar == 2)
bytes[1] = cb.CharAt(pos + indexDocument + 1);
const unsigned char leadByte = cb.UCharAt(pos + indexDocument);
const Sci::Position widthChar = IsDBCSLeadByteNoExcept(leadByte) ? 2 : 1;
if ((pos + indexDocument + widthChar) > limitPos)
break;
char folded[maxBytesCharacter * maxFoldingExpansion + 1];
const size_t lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
// memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing
assert((indexSearch + lenFlat) <= searchThing.size());
// Does folded match the buffer
characterMatches = 0 == memcmp(folded, searchThing.data() + indexSearch, lenFlat);
size_t lenFlat = 1;
if (widthChar == 1) {
characterMatches = searchData[indexSearch] == MakeLowerCase(leadByte);
} else {
char bytes[maxBytesCharacter + 1];
bytes[0] = static_cast<char>(leadByte);
bytes[1] = cb.CharAt(pos + indexDocument + 1);
char folded[maxBytesCharacter * maxFoldingExpansion + 1];
lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar);
// memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing
assert((indexSearch + lenFlat) <= searchThing.size());
// Does folded match the buffer
characterMatches = 0 == memcmp(folded, searchData + indexSearch, lenFlat);
}
indexDocument += widthChar;
indexSearch += lenFlat;
}
Expand All @@ -2116,13 +2128,19 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con
const Sci::Position endSearch = (startPos <= endPos) ? endPos - lengthFind + 1 : endPos;
std::vector<char> searchThing(lengthFind + 1);
pcf->Fold(searchThing.data(), searchThing.size(), search, lengthFind);
const char * const searchData = searchThing.data();
while (forward ? (pos < endSearch) : (pos >= endSearch)) {
bool found = (pos + lengthFind) <= limitPos;
for (int indexSearch = 0; (indexSearch < lengthFind) && found; indexSearch++) {
const char ch = CharAt(pos + indexSearch);
char folded[2];
pcf->Fold(folded, sizeof(folded), &ch, 1);
found = folded[0] == searchThing[indexSearch];
const char chTest = searchData[indexSearch];
if (UTF8IsAscii(ch)) {
found = chTest == MakeLowerCase(ch);
} else {
char folded[2];
pcf->Fold(folded, sizeof(folded), &ch, 1);
found = folded[0] == chTest;
}
}
if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) {
return pos;
Expand Down

0 comments on commit 526cee6

Please sign in to comment.