From 526cee6363f32c041a7c366421878ef7a8362aeb Mon Sep 17 00:00:00 2001 From: zufuliu Date: Tue, 13 Oct 2020 21:31:06 +0800 Subject: [PATCH] Optimize case insensitive search for ASCII text, issue #236. --- scintilla/src/Document.cxx | 62 ++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/scintilla/src/Document.cxx b/scintilla/src/Document.cxx index 110c5ac232..8cabdfc11b 100644 --- a/scintilla/src/Document.cxx +++ b/scintilla/src/Document.cxx @@ -2028,8 +2028,7 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con std::vector<char> searchThing((lengthFind + 1) * UTF8MaxBytes * maxFoldingExpansion + 1); const size_t lenSearch = pcf->Fold(searchThing.data(), searchThing.size(), search, lengthFind); - char bytes[UTF8MaxBytes + 1] = ""; - char folded[UTF8MaxBytes * maxFoldingExpansion + 1] = ""; + const unsigned char * const searchData = reinterpret_cast<const unsigned char *>(searchThing.data()); while (forward ? (pos < endPos) : (pos >= endPos)) { int widthFirstCharacter = 0; Sci::Position posIndexDocument = pos; @@ -2037,10 +2036,11 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con bool characterMatches = true; for (;;) { const unsigned char leadByte = cb.UCharAt(posIndexDocument); - bytes[0] = static_cast<char>(leadByte); + char bytes[UTF8MaxBytes + 1]; int widthChar = 1; if (!UTF8IsAscii(leadByte)) { const int widthCharBytes = UTF8BytesOfLead(leadByte); + bytes[0] = static_cast<char>(leadByte); for (int b = 1; b < widthCharBytes; b++) { bytes[b] = cb.CharAt(posIndexDocument + b); } @@ -2050,11 +2050,17 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con widthFirstCharacter = widthChar; if ((posIndexDocument + widthChar) > limitPos) break; - const size_t lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar); - // memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing - assert((indexSearch + lenFlat) <= 
searchThing.size()); - // Does folded match the buffer - characterMatches = 0 == memcmp(folded, searchThing.data() + indexSearch, lenFlat); + size_t lenFlat = 1; + if (widthChar == 1) { + characterMatches = searchData[indexSearch] == MakeLowerCase(leadByte); + } else { + char folded[UTF8MaxBytes * maxFoldingExpansion + 1]; + lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar); + // memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing + assert((indexSearch + lenFlat) <= searchThing.size()); + // Does folded match the buffer + characterMatches = 0 == memcmp(folded, searchData + indexSearch, lenFlat); + } if (!characterMatches) break; posIndexDocument += widthChar; @@ -2080,6 +2086,7 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con constexpr size_t maxFoldingExpansion = 4; std::vector<char> searchThing((lengthFind + 1) * maxBytesCharacter * maxFoldingExpansion + 1); const size_t lenSearch = pcf->Fold(searchThing.data(), searchThing.size(), search, lengthFind); + const unsigned char * const searchData = reinterpret_cast<const unsigned char *>(searchThing.data()); while (forward ? (pos < endPos) : (pos >= endPos)) { Sci::Position indexDocument = 0; size_t indexSearch = 0; @@ -2087,19 +2094,24 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con while (characterMatches && ((pos + indexDocument) < limitPos) && (indexSearch < lenSearch)) { - char bytes[maxBytesCharacter + 1]; - bytes[0] = cb.CharAt(pos + indexDocument); - const Sci::Position widthChar = IsDBCSLeadByteNoExcept(bytes[0]) ? 2 : 1; - if (widthChar == 2) - bytes[1] = cb.CharAt(pos + indexDocument + 1); + const unsigned char leadByte = cb.UCharAt(pos + indexDocument); + const Sci::Position widthChar = IsDBCSLeadByteNoExcept(leadByte) ? 
2 : 1; if ((pos + indexDocument + widthChar) > limitPos) break; - char folded[maxBytesCharacter * maxFoldingExpansion + 1]; - const size_t lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar); - // memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing - assert((indexSearch + lenFlat) <= searchThing.size()); - // Does folded match the buffer - characterMatches = 0 == memcmp(folded, searchThing.data() + indexSearch, lenFlat); + size_t lenFlat = 1; + if (widthChar == 1) { + characterMatches = searchData[indexSearch] == MakeLowerCase(leadByte); + } else { + char bytes[maxBytesCharacter + 1]; + bytes[0] = static_cast<char>(leadByte); + bytes[1] = cb.CharAt(pos + indexDocument + 1); + char folded[maxBytesCharacter * maxFoldingExpansion + 1]; + lenFlat = pcf->Fold(folded, sizeof(folded), bytes, widthChar); + // memcmp may examine lenFlat bytes in both arguments so assert it doesn't read past end of searchThing + assert((indexSearch + lenFlat) <= searchThing.size()); + // Does folded match the buffer + characterMatches = 0 == memcmp(folded, searchData + indexSearch, lenFlat); + } indexDocument += widthChar; indexSearch += lenFlat; } @@ -2116,13 +2128,19 @@ Sci::Position Document::FindText(Sci::Position minPos, Sci::Position maxPos, con const Sci::Position endSearch = (startPos <= endPos) ? endPos - lengthFind + 1 : endPos; std::vector<char> searchThing(lengthFind + 1); pcf->Fold(searchThing.data(), searchThing.size(), search, lengthFind); + const char * const searchData = searchThing.data(); while (forward ? 
(pos < endSearch) : (pos >= endSearch)) { bool found = (pos + lengthFind) <= limitPos; for (int indexSearch = 0; (indexSearch < lengthFind) && found; indexSearch++) { const char ch = CharAt(pos + indexSearch); - char folded[2]; - pcf->Fold(folded, sizeof(folded), &ch, 1); - found = folded[0] == searchThing[indexSearch]; + const char chTest = searchData[indexSearch]; + if (UTF8IsAscii(ch)) { + found = chTest == MakeLowerCase(ch); + } else { + char folded[2]; + pcf->Fold(folded, sizeof(folded), &ch, 1); + found = folded[0] == chTest; + } } if (found && MatchesWordOptions(word, wordStart, pos, lengthFind)) { return pos;