BookStackApp · ssddanbrown · Feb 14, 2025 · Feb 7, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/app/Search/SearchIndex.php b/app/Search/SearchIndex.php
@@ -16,7 +16,13 @@ class SearchIndex
     /**
      * A list of delimiter characters used to break-up parsed content into terms for indexing.
      */
-    public static string $delimiters = " \n\t.,!?:;()[]{}<>`'\"";
+    public static string $delimiters = " \n\t.-,!?:;()[]{}<>`'\"«»";
+
+    /**
+     * A list of delimiters which are commonly used within a single term but also indicate a break between terms.
+     * The indexer will index the full term containing these delimiters, plus the terms split via these delimiters.
+     */
+    public static string $softDelimiters = ".-";
 
     public function __construct(
         protected EntityProvider $entityProvider
@@ -196,15 +202,44 @@ protected function generateTermScoreMapFromTags(array $tags): array
     protected function textToTermCountMap(string $text): array
     {
         $tokenMap = []; // {TextToken => OccurrenceCount}
-        $splitChars = static::$delimiters;
-        $token = strtok($text, $splitChars);
+        $softDelims = static::$softDelimiters;
+        $tokenizer = new SearchTextTokenizer($text, static::$delimiters);
+        // Term being built from consecutive tokens joined by soft delimiters (e.g. "a.b-c").
+        $extendedToken = '';
+        // Count of tokens currently forming $extendedToken.
+        $extendedLen = 0;
+
+        $token = $tokenizer->next();
 
         while ($token !== false) {
-            if (!isset($tokenMap[$token])) {
-                $tokenMap[$token] = 0;
+            $delim = $tokenizer->previousDelimiter();
+
+            if ($delim && str_contains($softDelims, $delim) && $token !== '') {
+                // Token directly follows a soft delimiter, so it extends the joined term.
+                $extendedToken .= $delim . $token;
+                $extendedLen++;
+            } else {
+                // Joined term has ended; only record it where it spans more than one token,
+                // since single tokens are counted on their own below.
+                if ($extendedLen > 1) {
+                    $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
+                }
+                $extendedToken = $token;
+                $extendedLen = 1;
             }
-            $tokenMap[$token]++;
-            $token = strtok($splitChars);
+
+            // Compare against the empty string rather than relying on truthiness,
+            // otherwise a "0" token would be skipped and never indexed.
+            if ($token !== '') {
+                $tokenMap[$token] = ($tokenMap[$token] ?? 0) + 1;
+            }
+
+            $token = $tokenizer->next();
+        }
+
+        // Record any joined term still pending when the text ran out.
+        if ($extendedLen > 1) {
+            $tokenMap[$extendedToken] = ($tokenMap[$extendedToken] ?? 0) + 1;
         }
 
         return $tokenMap;

diff --git a/app/Search/SearchOptions.php b/app/Search/SearchOptions.php
@@ -181,7 +181,9 @@ protected static function decodeEscapes(string $input): string
     protected static function parseStandardTermString(string $termString): array
     {
         $terms = explode(' ', $termString);
-        $indexDelimiters = SearchIndex::$delimiters;
+        // Hard delimiters only: the index delimiters with every soft delimiter removed.
+        $softDelimiters = str_split(SearchIndex::$softDelimiters);
+        $indexDelimiters = str_replace($softDelimiters, '', SearchIndex::$delimiters);
         $parsed = [
             'terms'  => [],
             'exacts' => [],

diff --git a/app/Search/SearchTextTokenizer.php b/app/Search/SearchTextTokenizer.php
@@ -0,0 +1,135 @@
+<?php
+
+namespace BookStack\Search;
+
+/**
+ * A custom text tokenizer which records & provides insight needed for our search indexing.
+ * We used to use basic strtok() but this class does the following which that lacked:
+ * - Tracks and provides the current/previous delimiter that we've stopped at.
+ * - Returns empty tokens upon parsing a delimiter.
+ * - Reads the text and delimiters as UTF-8 characters instead of single bytes, so that a
+ *   multi-byte delimiter (such as "«") does not split unrelated multi-byte characters.
+ */
+class SearchTextTokenizer
+{
+    /** Byte offset in the text at which the next token starts. */
+    protected int $currentIndex = 0;
+
+    /** Length of the text in bytes. */
+    protected int $length;
+
+    protected string $currentDelimiter = '';
+    protected string $previousDelimiter = '';
+
+    /**
+     * The delimiter characters as array keys, for whole-character lookups.
+     *
+     * @var array<string, true>
+     */
+    protected array $delimiterLookup = [];
+
+    /**
+     * @param string $text       The text to break into tokens.
+     * @param string $delimiters Characters which each mark a break between tokens.
+     */
+    public function __construct(
+        protected string $text,
+        protected string $delimiters = ' '
+    ) {
+        $this->length = strlen($this->text);
+
+        $delimiterLength = strlen($this->delimiters);
+        $offset = 0;
+        while ($offset < $delimiterLength) {
+            $char = static::characterAt($this->delimiters, $offset, $delimiterLength);
+            $this->delimiterLookup[$char] = true;
+            $offset += strlen($char);
+        }
+    }
+
+    /**
+     * Get the delimiter which ended the token most recently returned by next().
+     * Is an empty string before any token has been read, or where that token ran to the end of the text.
+     */
+    public function currentDelimiter(): string
+    {
+        return $this->currentDelimiter;
+    }
+
+    /**
+     * Get the delimiter found before the current one, which is the delimiter
+     * that directly preceded the token most recently returned by next().
+     */
+    public function previousDelimiter(): string
+    {
+        return $this->previousDelimiter;
+    }
+
+    /**
+     * Get the next token between delimiters.
+     * An empty string is returned where a delimiter is found with no content before it.
+     * Returns false if there's no further tokens.
+     */
+    public function next(): string|false
+    {
+        $start = $this->currentIndex;
+        $index = $start;
+
+        while ($index < $this->length) {
+            $char = $this->text[$index];
+            if (ord($char) >= 0x80) {
+                // Non-ASCII byte, so read the whole character it may belong to.
+                $char = static::characterAt($this->text, $index, $this->length);
+            }
+
+            if (isset($this->delimiterLookup[$char])) {
+                $this->previousDelimiter = $this->currentDelimiter;
+                $this->currentDelimiter = $char;
+                $this->currentIndex = $index + strlen($char);
+
+                return substr($this->text, $start, $index - $start);
+            }
+
+            $index += strlen($char);
+        }
+
+        // Trailing content without a closing delimiter. Checked via offsets rather than
+        // token truthiness so that a final "0" token is not mistaken for "no token".
+        if ($start < $this->length) {
+            $this->currentIndex = $this->length;
+            $this->previousDelimiter = $this->currentDelimiter;
+            $this->currentDelimiter = '';
+
+            return substr($this->text, $start);
+        }
+
+        return false;
+    }
+
+    /**
+     * Read the single UTF-8 character starting at the given byte offset.
+     * A byte which does not start a complete lead + continuation byte sequence is returned on its own.
+     */
+    protected static function characterAt(string $input, int $offset, int $length): string
+    {
+        $lead = ord($input[$offset]);
+        $width = match (true) {
+            $lead >= 0xC2 && $lead <= 0xDF => 2,
+            $lead >= 0xE0 && $lead <= 0xEF => 3,
+            $lead >= 0xF0 && $lead <= 0xF4 => 4,
+            default => 1,
+        };
+
+        if ($width === 1 || $offset + $width > $length) {
+            return $input[$offset];
+        }
+
+        for ($i = 1; $i < $width; $i++) {
+            if ((ord($input[$offset + $i]) & 0xC0) !== 0x80) {
+                return $input[$offset];
+            }
+        }
+
+        return substr($input, $offset, $width);
+    }
+}
diff --git a/tests/Entity/EntitySearchTest.php → tests/Search/EntitySearchTest.php b/tests/Entity/EntitySearchTest.php → tests/Search/EntitySearchTest.php
@@ -1,12 +1,9 @@
 <?php
 
-namespace Tests\Entity;
+namespace Tests\Search;
 
 use BookStack\Activity\Models\Tag;
 use BookStack\Entities\Models\Book;
-use BookStack\Entities\Models\Bookshelf;
-use BookStack\Entities\Models\Chapter;
-use Illuminate\Support\Str;
 use Tests\TestCase;
 
 class EntitySearchTest extends TestCase
@@ -312,113 +309,6 @@ public function test_entity_template_selector_search()
         $defaultListTest->assertDontSee($templatePage->name);
     }
 
-    public function test_sibling_search_for_pages()
-    {
-        $chapter = $this->entities->chapterHasPages();
-        $this->assertGreaterThan(2, count($chapter->pages), 'Ensure we\'re testing with at least 1 sibling');
-        $page = $chapter->pages->first();
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page");
-        $search->assertSuccessful();
-        foreach ($chapter->pages as $page) {
-            $search->assertSee($page->name);
-        }
-
-        $search->assertDontSee($chapter->name);
-    }
-
-    public function test_sibling_search_for_pages_without_chapter()
-    {
-        $page = $this->entities->pageNotWithinChapter();
-        $bookChildren = $page->book->getDirectVisibleChildren();
-        $this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling');
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$page->id}&entity_type=page");
-        $search->assertSuccessful();
-        foreach ($bookChildren as $child) {
-            $search->assertSee($child->name);
-        }
-
-        $search->assertDontSee($page->book->name);
-    }
-
-    public function test_sibling_search_for_chapters()
-    {
-        $chapter = $this->entities->chapter();
-        $bookChildren = $chapter->book->getDirectVisibleChildren();
-        $this->assertGreaterThan(2, count($bookChildren), 'Ensure we\'re testing with at least 1 sibling');
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$chapter->id}&entity_type=chapter");
-        $search->assertSuccessful();
-        foreach ($bookChildren as $child) {
-            $search->assertSee($child->name);
-        }
-
-        $search->assertDontSee($chapter->book->name);
-    }
-
-    public function test_sibling_search_for_books()
-    {
-        $books = Book::query()->take(10)->get();
-        $book = $books->first();
-        $this->assertGreaterThan(2, count($books), 'Ensure we\'re testing with at least 1 sibling');
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$book->id}&entity_type=book");
-        $search->assertSuccessful();
-        foreach ($books as $expectedBook) {
-            $search->assertSee($expectedBook->name);
-        }
-    }
-
-    public function test_sibling_search_for_shelves()
-    {
-        $shelves = Bookshelf::query()->take(10)->get();
-        $shelf = $shelves->first();
-        $this->assertGreaterThan(2, count($shelves), 'Ensure we\'re testing with at least 1 sibling');
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$shelf->id}&entity_type=bookshelf");
-        $search->assertSuccessful();
-        foreach ($shelves as $expectedShelf) {
-            $search->assertSee($expectedShelf->name);
-        }
-    }
-
-    public function test_sibling_search_for_books_provides_results_in_alphabetical_order()
-    {
-        $contextBook = $this->entities->book();
-        $searchBook = $this->entities->book();
-
-        $searchBook->name = 'Zebras';
-        $searchBook->save();
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book");
-        $this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras');
-
-        $searchBook->name = '1AAAAAAArdvarks';
-        $searchBook->save();
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextBook->id}&entity_type=book");
-        $this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
-    }
-
-    public function test_sibling_search_for_shelves_provides_results_in_alphabetical_order()
-    {
-        $contextShelf = $this->entities->shelf();
-        $searchShelf = $this->entities->shelf();
-
-        $searchShelf->name = 'Zebras';
-        $searchShelf->save();
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf");
-        $this->withHtml($search)->assertElementNotContains('a:first-child', 'Zebras');
-
-        $searchShelf->name = '1AAAAAAArdvarks';
-        $searchShelf->save();
-
-        $search = $this->actingAs($this->users->viewer())->get("/search/entity/siblings?entity_id={$contextShelf->id}&entity_type=bookshelf");
-        $this->withHtml($search)->assertElementContains('a:first-child', '1AAAAAAArdvarks');
-    }
-
     public function test_search_works_on_updated_page_content()
     {
         $page = $this->entities->page();
@@ -453,75 +343,6 @@ public function test_search_ranks_common_words_lower()
         $this->withHtml($search)->assertElementContains('.entity-list > .page:nth-child(2)', 'Test page A');
     }
 
-    public function test_terms_in_headers_have_an_adjusted_index_score()
-    {
-        $page = $this->entities->newPage(['name' => 'Test page A', 'html' => '
-            <p>TermA</p>
-            <h1>TermB <strong>TermNested</strong></h1>
-            <h2>TermC</h2>
-            <h3>TermD</h3>
-            <h4>TermE</h4>
-            <h5>TermF</h5>
-            <h6>TermG</h6>
-        ']);
-
-        $scoreByTerm = $page->searchTerms()->pluck('score', 'term');
-
-        $this->assertEquals(1, $scoreByTerm->get('TermA'));
-        $this->assertEquals(10, $scoreByTerm->get('TermB'));
-        $this->assertEquals(10, $scoreByTerm->get('TermNested'));
-        $this->assertEquals(5, $scoreByTerm->get('TermC'));
-        $this->assertEquals(4, $scoreByTerm->get('TermD'));
-        $this->assertEquals(3, $scoreByTerm->get('TermE'));
-        $this->assertEquals(2, $scoreByTerm->get('TermF'));
-        // Is 1.5 but stored as integer, rounding up
-        $this->assertEquals(2, $scoreByTerm->get('TermG'));
-    }
-
-    public function test_indexing_works_as_expected_for_page_with_lots_of_terms()
-    {
-        $this->markTestSkipped('Time consuming test');
-
-        $count = 100000;
-        $text = '';
-        $chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_#';
-        for ($i = 0; $i < $count; $i++) {
-            $text .= substr(str_shuffle($chars), 0, 5) . ' ';
-        }
-
-        $page = $this->entities->newPage(['name' => 'Test page A', 'html' => '<p>' . $text . '</p>']);
-
-        $termCount = $page->searchTerms()->count();
-
-        // Expect at least 90% unique rate
-        $this->assertGreaterThan($count * 0.9, $termCount);
-    }
-
-    public function test_name_and_content_terms_are_merged_to_single_score()
-    {
-        $page = $this->entities->newPage(['name' => 'TermA', 'html' => '
-            <p>TermA</p>
-        ']);
-
-        $scoreByTerm = $page->searchTerms()->pluck('score', 'term');
-
-        // Scores 40 for being in the name then 1 for being in the content
-        $this->assertEquals(41, $scoreByTerm->get('TermA'));
-    }
-
-    public function test_tag_names_and_values_are_indexed_for_search()
-    {
-        $page = $this->entities->newPage(['name' => 'PageA', 'html' => '<p>content</p>', 'tags' => [
-            ['name' => 'Animal', 'value' => 'MeowieCat'],
-            ['name' => 'SuperImportant'],
-        ]]);
-
-        $scoreByTerm = $page->searchTerms()->pluck('score', 'term');
-        $this->assertEquals(5, $scoreByTerm->get('MeowieCat'));
-        $this->assertEquals(3, $scoreByTerm->get('Animal'));
-        $this->assertEquals(3, $scoreByTerm->get('SuperImportant'));
-    }
-
     public function test_matching_terms_in_search_results_are_highlighted()
     {
         $this->entities->newPage(['name' => 'My Meowie Cat', 'html' => '<p>A superimportant page about meowieable animals</p>', 'tags' => [