apache · morningman · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/docusaurus.config.js b/docusaurus.config.js
@@ -195,6 +195,20 @@ const config = {
                 ],
             }),
         ],
+        [
+            'content-docs',
+            /** @type {import('@docusaurus/plugin-content-docs').Options} */
+            ({
+                id: 'key-features',
+                path: 'key-features-docs',
+                routeBasePath: 'why-doris/key-features',
+                sidebarPath: false,
+                editUrl: 'https://github.com/apache/doris-website/edit/master/',
+                showLastUpdateAuthor: false,
+                showLastUpdateTime: false,
+            }),
+        ],
+        require.resolve('./plugins/key-features-glossary-index'),
         process.env.NODE_ENV === 'development' ? null : customDocusaurusPlugin,
         async function tailwindcssPlugin(context, options) {
             return {
@@ -207,6 +221,23 @@ const config = {
                 },
             };
         },
+        function yamlLoaderPlugin() { // Inline plugin whose only job is to contribute a webpack rule for YAML files.
+            return {
+                name: 'yaml-loader', // Name under which this plugin object identifies itself.
+                configureWebpack() { // Returns the webpack config fragment this plugin adds (module rules only).
+                    return {
+                        module: {
+                            rules: [
+                                { // Single rule: files whose names end in .yml or .yaml are handled by js-yaml-loader.
+                                    test: /\.ya?ml$/,
+                                    use: 'js-yaml-loader', // Loader package; it is declared in package.json.
+                                },
+                            ],
+                        },
+                    };
+                },
+            };
+        },
         [
             '@docusaurus/plugin-client-redirects',
             {

diff --git a/key-features-docs/_tags.yml b/key-features-docs/_tags.yml
@@ -0,0 +1,24 @@
+groups:
+  functional-area:
+    label: "Functional area"
+    tags:
+      storage:                    { label: "Storage",              color: "#3b82f6" }
+      query-engine:               { label: "Query Engine",         color: "#8b5cf6" }
+      indexing:                   { label: "Indexing",             color: "#10b981" }
+      search:                     { label: "Search",               color: "#7c3aed" }
+      lakehouse:                  { label: "Lakehouse",            color: "#f59e0b" }
+      ai:                         { label: "AI",                   color: "#ec4899" }
+      observability:              { label: "Observability",        color: "#06b6d4" }
+      ingestion:                  { label: "Ingestion",            color: "#14b8a6" }
+      compute-storage-decoupling: { label: "Compute-Storage Sep.", color: "#6366f1" }
+      availability:               { label: "Availability",         color: "#84cc16" }
+      security:                   { label: "Security",             color: "#ef4444" }
+  concept-type:
+    label: "Concept type"
+    tags:
+      algorithm:              { label: "Algorithm",      color: "#0891b2" }
+      data-structure:         { label: "Data Structure", color: "#9333ea" }
+      file-format:            { label: "File Format",    color: "#f97316" }
+      protocol:               { label: "Protocol",       color: "#64748b" }
+      mechanism:              { label: "Mechanism",      color: "#0d9488" }
+      architecture-component: { label: "Architecture",   color: "#dc2626" }
diff --git a/key-features-docs/features/.gitkeep b/key-features-docs/features/.gitkeep
diff --git a/key-features-docs/features/hybrid-search.md b/key-features-docs/features/hybrid-search.md
@@ -0,0 +1,27 @@
+---
+slug: hybrid-search
+title: Hybrid Search
+summary: Unified vector + full-text + scalar search in one engine.
+related_concepts:
+  - bm25
+  - inverted-index
+  - ann-search
+---
+
+# Hybrid Search
+
+## What it solves
+
+Modern applications need to combine three signals at query time: semantic similarity (vector search), keyword relevance (full-text), and traditional filters (scalar predicates). Most engines force a tradeoff — a vector database that can't filter by SQL, a search engine that can't do JOINs, or a data warehouse that can't rank by similarity. Doris brings all three into a single SQL surface.
+
+## How it works
+
+Doris stores vectors as a native type, with HNSW indexes for approximate nearest-neighbor search. The same table can also carry an inverted index for full-text columns and standard column-store indexes for scalar fields. A single SQL query can combine all three predicates, with the planner choosing pushdown order based on selectivity.
+
+The relevance scoring stack supports BM25 for full-text, cosine/L2 distance for vectors, and arbitrary SQL expressions for re-ranking. Hybrid scoring across modalities is expressed declaratively rather than via custom code.
+
+## When to use it
+
+Pick Doris over a specialized vector DB when your retrieval pipeline already needs SQL filters, joins, or aggregations. Pick it over a pure search engine when you need stronger consistency, transactional updates, or analytical queries on the same data.
+
+<RelatedConcepts />
diff --git a/key-features-docs/features/lakehouse.md b/key-features-docs/features/lakehouse.md
@@ -0,0 +1,26 @@
+---
+slug: lakehouse
+title: Lakehouse
+summary: Query Iceberg, Hudi, and Paimon directly without ETL.
+related_concepts:
+  - iceberg
+  - hudi
+---
+
+# Lakehouse
+
+## What it solves
+
+The "lakehouse" idea promises one storage layer that serves both analytical and operational workloads. In practice it requires three things: open table formats, compute that reads them at native speed, and federation across the catalogs your organization already uses. Doris provides all three as first-class capabilities — not bolt-on connectors.
+
+## How it works
+
+Doris reads Iceberg, Hudi, and Paimon tables directly from object storage. Catalog federation lets a single SQL query span Hive Metastore, AWS Glue, and Iceberg REST catalogs without staging data. Predicate pushdown reaches into Parquet and ORC scan layers; partition pruning and statistics-driven file skipping keep wide-table scans bounded.
+
+For incremental workloads, Doris materializes hot subsets into native tables while leaving the cold tail on the lake. The same query planner decides at runtime which path to take.
+
+## When to use it
+
+Choose this when your data already lives in an open table format and you want analytical SQL without copying it. Especially valuable when you have multiple teams reading from different catalogs.
+
+<RelatedConcepts />
diff --git a/key-features-docs/features/materialized-view.md b/key-features-docs/features/materialized-view.md
@@ -0,0 +1,25 @@
+---
+slug: materialized-view
+title: Async Materialized Views
+summary: Precompute results, transparently rewrite queries.
+related_concepts:
+  - async-materialized-view
+---
+
+# Async Materialized Views
+
+## What it solves
+
+Dashboards and APIs frequently re-issue the same aggregations over the same data. Caching at the application layer is brittle (invalidation is hard) and pre-aggregating into separate tables forces the application to know which table to query. Async materialized views (AMVs) solve both.
+
+## How it works
+
+You declare an AMV as a SQL definition. Doris maintains it asynchronously — refreshes run on a schedule rather than synchronously with each base-table write — so writes stay fast. At query time, the optimizer recognizes that an AMV can serve a query and rewrites the plan to use it transparently. The application keeps issuing the same SQL.
+
+Refresh strategies span full refresh, incremental refresh on partitioned base tables, and triggered refresh based on freshness SLAs.
+
+## When to use it
+
+Use AMVs when you have an expensive query that runs often enough to amortize the maintenance cost. The transparent rewrite means you don't have to coordinate application changes with view changes.
+
+<RelatedConcepts />
diff --git a/key-features-docs/glossary/.gitkeep b/key-features-docs/glossary/.gitkeep
diff --git a/key-features-docs/glossary/ann-search.md b/key-features-docs/glossary/ann-search.md
@@ -0,0 +1,24 @@
+---
+slug: ann-search
+title: ANN Search
+summary: Approximate nearest neighbor search — the practical algorithm class for vector retrieval at scale.
+tags: [search, ai, algorithm]
+---
+
+# ANN Search
+
+<TagChips />
+
+Approximate nearest neighbor (ANN) search is the practical alternative to exact k-NN when corpora exceed a few thousand vectors. Exact k-NN requires comparing the query vector against every candidate (O(n) per query); ANN trades a small amount of recall for orders-of-magnitude faster query time.
+
+## Common algorithms
+
+- **HNSW** (Hierarchical Navigable Small World): graph-based, excellent recall/latency tradeoff at the cost of memory.
+- **IVF** (Inverted File): partitions the vector space into Voronoi cells; queries probe a small number of cells.
+- **PQ** (Product Quantization): compresses vectors into compact codes, enabling in-memory storage of very large corpora.
+
+## In Doris
+
+Doris uses HNSW as the primary index for vector columns, with tunable `ef_construction` and `M` parameters per index. Vector search integrates with SQL: filters, joins, and aggregations apply to the same query that performs ANN retrieval.
+
+<RelatedConcepts ids={['bm25']} />
diff --git a/key-features-docs/glossary/async-materialized-view.md b/key-features-docs/glossary/async-materialized-view.md
@@ -0,0 +1,24 @@
+---
+slug: async-materialized-view
+title: Async Materialized View
+summary: Precomputed query result that is refreshed asynchronously and used to transparently accelerate matching queries.
+tags: [query-engine, mechanism]
+---
+
+# Async Materialized View
+
+<TagChips />
+
+An async materialized view (AMV) is a database object that stores the result of a SQL query and refreshes it asynchronously, decoupled from base-table writes. At query time, the optimizer can detect that a user query is structurally subsumed by an AMV's definition and rewrite the plan to read from the AMV instead — without the application changing its SQL.
+
+## Refresh strategies
+
+- **Full refresh**: re-runs the entire definition.
+- **Incremental refresh**: only recomputes the changed partitions/rows on the base tables.
+- **Triggered refresh**: refresh fires when a freshness threshold is breached.
+
+## Why "async"
+
+Synchronous materialized views, which are updated as part of every base-table commit, impose latency on writes. Async refresh keeps writes fast, at the cost of bounded staleness.
+
+<RelatedConcepts />
diff --git a/key-features-docs/glossary/bm25.md b/key-features-docs/glossary/bm25.md
@@ -0,0 +1,26 @@
+---
+slug: bm25
+title: BM25
+summary: Probabilistic ranking function for full-text relevance scoring.
+tags: [search, indexing, algorithm]
+---
+
+# BM25
+
+<TagChips />
+
+BM25 (Best Match 25) is a ranking function used by search engines to estimate the relevance of a document to a given query. It builds on TF-IDF but addresses two of its shortcomings: term frequency saturation (a document that mentions a term 100 times shouldn't be 100× more relevant than one that mentions it once) and document length normalization (longer documents shouldn't dominate just because they contain more words).
+
+## Formula
+
+For a query `Q` containing terms `q_1, ..., q_n`, the BM25 score of a document `D` is:
+
+`score(D, Q) = Σ IDF(q_i) · (f(q_i, D) · (k1 + 1)) / (f(q_i, D) + k1 · (1 - b + b · |D|/avgdl))`
+
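+The sum runs over all query terms `q_i`. Here `IDF(q_i)` is the inverse document frequency of `q_i`, `f(q_i, D)` is the frequency of `q_i` in `D`, `|D|` is the length of `D`, `avgdl` is the average document length across the corpus, and `k1` and `b` are tuning parameters (typically `k1=1.2`, `b=0.75`).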
+
+## In Doris
+
+BM25 is the default scoring function for inverted-index columns when running full-text matching queries. It can be combined with vector similarity scores in hybrid search via expression-level re-ranking.
+
+<RelatedConcepts ids={['inverted-index']} />
diff --git a/key-features-docs/glossary/hudi.md b/key-features-docs/glossary/hudi.md
@@ -0,0 +1,23 @@
+---
+slug: hudi
+title: Apache Hudi
+summary: Lakehouse table format optimized for streaming ingest and incremental processing.
+tags: [lakehouse, ingestion, file-format]
+---
+
+# Apache Hudi
+
+<TagChips />
+
+Apache Hudi is an open table format that brings transactional semantics — upserts, deletes, and incremental queries — to data lake storage. Where Iceberg emphasizes analytical correctness and schema evolution, Hudi emphasizes streaming write patterns and near-real-time freshness.
+
+## Two table types
+
+- **Copy-on-Write (CoW)**: rewrites entire files on update; reads are pure Parquet (fast, no merge cost).
+- **Merge-on-Read (MoR)**: writes deltas to log files, merged at query time (faster ingest, slower scan).
+
+## In Doris
+
+Doris supports both CoW and MoR Hudi tables via the multi-catalog interface. For MoR tables, the merge happens transparently during scan.
+
+<RelatedConcepts ids={['iceberg']} />
diff --git a/key-features-docs/glossary/iceberg.md b/key-features-docs/glossary/iceberg.md
@@ -0,0 +1,24 @@
+---
+slug: iceberg
+title: Apache Iceberg
+summary: Open table format for huge analytic datasets, with hidden partitioning and full schema evolution.
+tags: [lakehouse, file-format]
+---
+
+# Apache Iceberg
+
+<TagChips />
+
+Apache Iceberg is an open table format designed for petabyte-scale analytics. It separates the *table* concept (a stable name with schema and metadata) from the *file layout* (Parquet/ORC files in object storage), allowing safe schema and partition evolution without rewriting data.
+
+## What makes it different
+
+- **Hidden partitioning**: queries don't need to specify partition predicates; Iceberg derives them from filter columns.
+- **Snapshot isolation**: every write produces a new immutable snapshot, enabling time-travel queries and atomic rollbacks.
+- **Full schema evolution**: add, drop, rename, or reorder columns without rewriting files.
+
+## In Doris
+
+Doris reads Iceberg tables directly via the multi-catalog interface. Predicate pushdown extends down to Iceberg's metadata layer, so partition pruning and file skipping happen before any data is read.
+
+<RelatedConcepts ids={['hudi']} />
diff --git a/key-features-docs/glossary/inverted-index.md b/key-features-docs/glossary/inverted-index.md
@@ -0,0 +1,22 @@
+---
+slug: inverted-index
+title: Inverted Index
+summary: Maps terms to the set of documents containing them; the foundation of full-text search.
+tags: [indexing, search, data-structure]
+---
+
+# Inverted Index
+
+<TagChips />
+
+An inverted index is a data structure that maps each unique term in a corpus to the list of document IDs (and often positions within documents) where that term appears. The "inversion" is relative to a forward index, which would map document IDs to their term contents.
+
+## Why it matters
+
+Full-text search at scale only works because of inverted indexes. To answer "which documents contain the word `apache`?" you don't scan every document — you look up `apache` in the index and read its posting list directly.
+
+## In Doris
+
+Inverted indexes in Doris support exact match, phrase match, and prefix match queries. They feed BM25 scoring for ranked retrieval and can be combined with bitmap operations for boolean queries (AND/OR/NOT across multiple terms).
+
+<RelatedConcepts ids={['bm25']} />
diff --git a/key-features-docs/glossary/lz4-compression.md b/key-features-docs/glossary/lz4-compression.md
@@ -0,0 +1,22 @@
+---
+slug: lz4-compression
+title: LZ4 Compression
+summary: Fast lossless compression algorithm used in Doris storage and shuffle paths.
+tags: [storage, algorithm]
+---
+
+# LZ4 Compression
+
+<TagChips />
+
+LZ4 is a lossless byte-level compression algorithm that prioritizes speed over compression ratio. It is widely used in systems where compression must not become a CPU bottleneck — including Apache Doris's storage format and inter-node shuffle.
+
+## Tradeoff
+
+LZ4 compresses at ~500 MB/s and decompresses at ~2 GB/s on commodity hardware. Compression ratio is typically 1.5–3× — worse than zstd or gzip, but the raw throughput makes it the right choice for hot paths where the alternative is no compression at all.
+
+## In Doris
+
+Doris uses LZ4 as a default codec for column data and shuffle. For cold/archival data, zstd (higher ratio, slower) is selectable per column or per partition.
+
+<RelatedConcepts />
diff --git a/package.json b/package.json
@@ -73,6 +73,7 @@
         "@docusaurus/module-type-aliases": "3.6.3",
         "@tsconfig/docusaurus": "2.0.3",
         "gray-matter": "^4.0.3",
+        "js-yaml-loader": "^1.2.2",
         "typescript": "^5.2.2"
     },
     "resolutions": {