Merge branch 'main' into faster-shard-scaling

quickwit-oss · Feb 24, 2025 · 33e28fe · 33e28fe
2 parents 0c8e9ca + 98098bb
commit 33e28fe
Show file tree

Hide file tree

Showing 47 changed files with 1,142 additions and 143 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -131,7 +131,8 @@ jobs:
         if: always() && steps.modified.outputs.rust_src == 'true'
         uses: taiki-e/cache-cargo-install-action@v2
         with:
-          tool: cargo-deny
+          # 0.18 requires rustc 1.85
+          tool: cargo-deny@0.17.0
       - name: cargo clippy
         if: always() && steps.modified.outputs.rust_src == 'true'
         run: cargo clippy --workspace --tests --all-features

diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv
@@ -117,7 +117,7 @@ difflib,https://github.com/DimaKudosh/difflib,MIT,Dima Kudosh <dimakudosh@gmail.
 digest,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers
 displaydoc,https://github.com/yaahc/displaydoc,MIT OR Apache-2.0,Jane Lusby <jlusby@yaah.dev>
 downcast,https://github.com/fkoep/downcast-rs,MIT,Felix Köpge <fkoep@mailbox.org>
-downcast-rs,https://github.com/marcianx/downcast-rs,MIT OR Apache-2.0,"Ashish Myles <marcianx@gmail.com>, Runji Wang <wangrunji0408@163.com>"
+downcast-rs,https://github.com/marcianx/downcast-rs,MIT OR Apache-2.0,The downcast-rs Authors
 dtoa,https://github.com/dtolnay/dtoa,MIT OR Apache-2.0,David Tolnay <dtolnay@gmail.com>
 dyn-clone,https://github.com/dtolnay/dyn-clone,MIT OR Apache-2.0,David Tolnay <dtolnay@gmail.com>
 ecdsa,https://github.com/RustCrypto/signatures/tree/master/ecdsa,Apache-2.0 OR MIT,RustCrypto Developers

diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md
@@ -365,7 +365,7 @@ fast:
 | `description` | Optional description for the field. | `None` |
 | `stored`    | Whether value is stored in the document store | `true` |
 | `indexed`   | Whether value is indexed | `true` |
-| `fast`     | Whether value is stored in a fast field. The default behaviour for text in the JSON is to store the text unchanged. An normalizer can be configured via `normalizer: lowercase`. ([See normalizers](#description-of-available-normalizers)) for a list of available normalizers. | `true` |
+| `fast`     | Whether value is stored in a fast field. The default behaviour for text in the JSON is to store the text unchanged. A normalizer can be configured via `normalizer: lowercase`. See [normalizers](#description-of-available-normalizers) for a list of available normalizers. | `false` |
 | `tokenizer` | **Only affects strings in the json object**. Name of the `Tokenizer`, choices between `raw`, `default`, `en_stem` and `chinese_compatible` | `raw` |
 | `record`    | **Only affects strings in the json object**. Describes the amount of information indexed, choices between `basic`, `freq` and `position` | `basic` |
 | `expand_dots`    | If true, json keys containing a `.` should be expanded. For instance, if `expand_dots` is set to true, `{"k8s.node.id": "node-2"}` will be indexed as if it was `{"k8s": {"node": {"id": "node2"}}}`. The benefit is that escaping the `.` will not be required at query time. In other words, `k8s.node.id:node2` will match the document. This does not impact the way the document is stored.  | `true` |
@@ -497,7 +497,7 @@ doc_mapping:
   dynamic_mapping:
     indexed: true
     stored: true
-    tokenizer: default
+    tokenizer: raw
     record: basic
     expand_dots: true
     fast: true

diff --git a/docs/configuration/node-config.md b/docs/configuration/node-config.md
@@ -159,6 +159,7 @@ This section contains the configuration options for an indexer. The split store
 | `merge_concurrency` | Maximum number of merge operations that can be executed on the node at one point in time. | `(2 x num threads available) / 3` |
 | `enable_otlp_endpoint` | If true, enables the OpenTelemetry exporter endpoint to ingest logs and traces via the OpenTelemetry Protocol (OTLP). | `false` |
 | `cpu_capacity` | Advisory parameter used by the control plane. The value can be expressed in threads (e.g. `2`) or in terms of millicpus (`2000m`). The control plane will attempt to schedule indexing pipelines on the different nodes proportionally to the cpu capacity advertised by the indexer. It is NOT used as a limit. All pipelines will be scheduled regardless of whether the cluster has sufficient capacity or not. The control plane does not attempt to spread the work equally when the load is well below the `cpu_capacity`. Users who need a balanced load on all of their indexer nodes can set the `cpu_capacity` to an arbitrarily low value as long as they keep it proportional to the number of threads available. | `num threads available` |
+| `enable_cooperative_indexing` | If true, shares resources more efficiently when the number of indexes actively written to is significantly higher than the number of cores. Enabling it might decrease the overall indexing throughput. | `false` |
 
 Example:
 
@@ -205,7 +206,7 @@ This section contains the configuration options for a Searcher.
 
 ### Searcher split cache configuration
 
-This section contains the configuration options for the on disk searcher split cache.
+This section contains the configuration options for the on-disk searcher split cache. Files are stored in the data directory under `searcher-split-cache/`.
 
 | Property | Description | Default value |
 | --- | --- | --- |

diff --git a/docs/configuration/template-config.md b/docs/configuration/template-config.md
@@ -0,0 +1,111 @@
+---
+title: Index template configuration
+sidebar_position: 7
+toc_max_heading_level: 4
+---
+
+This page describes how to configure an index template.
+
+Index templates let you dynamically create indexes according to predefined rules. Templates are used automatically when documents are received on the ingest API for an index that doesn't exist.
+
+The index template configuration lets you define the following parameters:
+- `template_id` (required)
+- `description`
+- `index_id_patterns` (required)
+- `index_root_uri`
+- `priority`
+
+Besides, the following parameters can also be configured and are the same as those found in the [index configuration](../configuration/index-config.md):
+- doc mapping (required)
+- indexing settings
+- search settings
+- retention policy
+
+You can manage templates using the [index template API](../reference/rest-api.md#index-template-api).
+
+## Config file format
+
+The index template configuration format is YAML or JSON. When a key is absent from the configuration file, the default value is used.
+Here is a complete example:
+
+```yaml
+version: 0.9 # File format version.
+
+template_id: "hdfs-dev"
+
+index_root_uri: "s3://my-bucket/hdfs-dev/"
+
+description: "HDFS log management dev"
+
+index_id_patterns:
+    - hdfs-dev-*
+    - hdfs-staging-*
+
+priority: 100
+
+doc_mapping:
+  mode: lenient
+  field_mappings:
+    - name: timestamp
+      type: datetime
+      input_formats:
+        - unix_timestamp
+      output_format: unix_timestamp_secs
+      fast_precision: seconds
+      fast: true
+    - name: severity_text
+      type: text
+      tokenizer: raw
+      fast:
+        - tokenizer: lowercase
+    - name: body
+      type: text
+      tokenizer: default
+      record: position
+    - name: resource
+      type: object
+      field_mappings:
+        - name: service
+          type: text
+          tokenizer: raw
+  tag_fields: ["resource.service"]
+  timestamp_field: timestamp
+  index_field_presence: true
+
+search_settings:
+  default_search_fields: [severity_text, body]
+
+retention:
+  period: 90 days
+  schedule: daily
+```
+
+## Template ID
+
+The `template_id` is a string that uniquely identifies the index template within the metastore. It may only contain uppercase or lowercase ASCII letters, digits, hyphens (`-`), and underscores (`_`). It must start with a letter and contain at least 3 characters but no more than 255.
+
+## Description
+
+An optional string that describes what the index template is used for.
+
+## Index root uri
+
+The `index_root_uri` defines where the index files (also called splits) should be stored.
+This parameter expects a [storage uri](storage-config#storage-uris).
+
+The actual URI of the index is the path concatenation of the `index_root_uri` with the index id. 
+
+If `index_root_uri` is not defined, the `default_index_root_uri` from [Quickwit's node config](node-config) will be used.
+
+## Index ID patterns
+
+`index_id_patterns` is a list of strings that define which indices should be created according to this template. Use [glob-like](https://en.wikipedia.org/wiki/Glob_(programming)) wildcard ( \* ) expressions to target indices that match a pattern: test\* or \*test or te\*t or \*test\*. You can also use negative patterns by prepending the hyphen `-` character.
+
+Each pattern must obey the following rules:
+- It must follow the regex `^-?[a-zA-Z\*][a-zA-Z0-9-_\.\*]{0,254}$`.
+- It cannot contain consecutive asterisks (`*`).
+- If it does not contain an asterisk (`*`), the length must be greater than or equal to 3 characters.
+
+## Priority
+
+When multiple templates match a new index ID, the template with the highest `priority` is used to configure the index.
diff --git a/docs/ingest-data/ingest-api.md b/docs/ingest-data/ingest-api.md
@@ -26,6 +26,8 @@ version: 0.7
 index_id: stackoverflow-schemaless
 doc_mapping:
   mode: dynamic
+  dynamic_mapping:
+    tokenizer: default
 indexing_settings:
   commit_timeout_secs: 30
 EOF
@@ -35,6 +37,8 @@ EOF
 curl -XPOST -H 'Content-Type: application/yaml' 'http://localhost:7280/api/v1/indexes' --data-binary @stackoverflow-schemaless-config.yaml
 ```
 
+Note that for this example, we configure the dynamic mapping to use the [default tokenizer](../configuration/index-config.md#description-of-available-tokenizers). This is necessary to enable full-text search on all text fields.
+
 ## Ingest data
 
 Let's first download a sample of the [StackOverflow dataset](https://www.kaggle.com/stackoverflow/stacksample).
@@ -83,6 +87,6 @@ By default, both ingestion services are enabled and ingest V2 is used. You can t
 
 :::note
 
-These configuration drive the ingest service used both by the `api/v1/<index-id>/ingest` endpoint and the [bulk API](../reference/es_compatible_api.md#_bulk--batch-ingestion-endpoint).
+These configurations drive the ingest service used both by the `api/v1/<index-id>/ingest` endpoint and the [bulk API](../reference/es_compatible_api.md#_bulk--batch-ingestion-endpoint).
 
 :::
diff --git a/docs/operating/data-directory.md b/docs/operating/data-directory.md
@@ -54,6 +54,10 @@ This directory is used for caching splits that will undergo a merge operation to
 
 You can [configure](../configuration/node-config#indexer-configuration) the number of splits the cache can hold with `split_store_max_num_splits` and limit the overall size in bytes of splits with `split_store_max_num_bytes`.
 
+### `/searcher-split-cache` directory
+
+This directory is used by searcher nodes to cache entire splits and reduce calls to the object store. It won't be created unless you set the `split_cache` fields in the [searcher configuration](../configuration/node-config.md#searcher-configuration).
+
 
 ## Setting the right splits cache limits
 

diff --git a/docs/overview/concepts/querying.md b/docs/overview/concepts/querying.md
@@ -98,10 +98,18 @@ Search stream queries can take a huge amount of RAM. Quickwit limits the number
 
 Quickwit does caching in many places to deliver a highly performing query engine.
 
+In memory:
+
 - Hotcache caching: A static cache that holds information about a split file internal representation. It helps speed up the opening of a split file. Its size can be defined via the `split_footer_cache_capacity` configuration parameter.
 - Fast field caching: Fast fields tend to be accessed very frequently by users especially for stream requests. They are cached in RAM, in a cache whose size can be limited by the `fast_field_cache_capacity` configuration value.
 - Partial request caching: In some cases, like when using dashboards, some very similar requests might be issued, with only timestamp bounds changing. Some partial results can be cached to make these requests faster and issue fewer requests to the storage. They are cached in RAM, in a cache whose size can be limited by the `partial_request_cache_capacity` configuration value.
 
+On disk:
+
+- The split cache stores entire splits on disk. It can be enabled by setting the `split_cache` configuration fields. This cache can help reduce object store costs and load. Searchers populate this cache when splits are created or queried and evict them with a simple LRU strategy.
+
+Learn more about cache parameters in the [searcher configuration docs](../../configuration/node-config.md#searcher-configuration).
+
 ### Scoring
 
 Quickwit supports sorting docs by their BM25 scores. In order to query by score, [fieldnorms](../../configuration/index-config.md#text-type) must be enabled for the field. By default, BM25 scoring is disabled to improve query latencies but it can be enabled by setting the `sort_by` option to `_score` in queries.

diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md
@@ -749,7 +749,7 @@ The multi-target expression has the following constraints:
 
     - It must follow the regex `^[a-zA-Z\*][a-zA-Z0-9-_\.\*]{0,254}$`.
     - It cannot contain consecutive asterisks (`*`).
-    - If it contains an asterisk (`*`), the length must be greater than or equal to 3 characters.
+    - If it does not contain an asterisk (`*`), the length must be greater than or equal to 3 characters.
 
 ### Examples
 ```

diff --git a/docs/reference/rest-api.md b/docs/reference/rest-api.md
@@ -96,7 +96,7 @@ The following are some constrains about the multi-target expression.
 
     - It must follow the regex `^[a-zA-Z\*][a-zA-Z0-9-_\.\*]{0,254}$`.
     - It cannot contain consecutive asterisks (`*`).
-    - If it contains an asterisk (`*`), the length must be greater than or equal to 3 characters.
+    - If it does not contain an asterisk (`*`), the length must be greater than or equal to 3 characters.
 
 #### Examples
 ```
@@ -794,3 +794,118 @@ Get the list of delete tasks for a given `index_id`.
 #### Response
 
 The response is an array of `DeleteTask`.
+
+
+## Index template API
+
+This API manages index template resources. Templates are higher level configuration objects used to automatically create indexes according to predefined rules. See [index template configuration](../configuration/template-config.md).
+
+### Create a template
+
+```
+POST api/v1/templates
+```
+
+#### POST payload
+
+Create an index template by posting a [template configuration](../configuration/template-config.md) payload. The API accepts JSON with the header `content-type: application/json` and YAML with `content-type: application/yaml`.
+
+**Example**
+
+```yaml
+version: 0.9 # File format version.
+
+template_id: "all-logs"
+
+index_root_uri: "s3://my-bucket/logs/"
+
+description: "All my logs"
+
+index_id_patterns:
+    - logs-*
+
+priority: 100
+
+doc_mapping:
+  mode: dynamic
+  field_mappings:
+    - name: timestamp
+      type: datetime
+      input_formats:
+        - unix_timestamp
+      output_format: unix_timestamp_secs
+      fast: true
+  timestamp_field: timestamp
+```
+
+#### Response
+
+The created index template configuration as JSON.
+
+
+### Update a template
+
+```
+PUT api/v1/templates/<template id>
+```
+
+#### Path variable
+
+| Variable      | Description   |
+| ------------- | ------------- |
+| `template id` | The template id  |
+
+
+#### PUT payload
+
+Update an index template by sending a [template configuration](../configuration/template-config.md) payload. The API accepts JSON with the header `content-type: application/json` and YAML with `content-type: application/yaml`.
+
+**Example**
+
+See [create endpoint](#create-a-template).
+
+#### Response
+
+The updated template configuration as JSON.
+
+### List the templates
+
+```
+GET api/v1/templates
+```
+
+#### Response
+
+An array with all the existing index template configurations as JSON.
+
+### Get a template
+
+```
+GET api/v1/templates/<template id>
+```
+
+#### Path variable
+
+| Variable      | Description   |
+| ------------- | ------------- |
+| `template id` | The template id  |
+
+#### Response
+
+The requested index template configuration as JSON.
+
+### Delete a template
+
+```
+DELETE api/v1/templates/<template id>
+```
+
+#### Path variable
+
+| Variable      | Description   |
+| ------------- | ------------- |
+| `template id` | The template id  |
+
+#### Response
+
+Empty response.