Infini-AI-Lab
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
Lines changed: 9 additions & 12 deletions
diff --git a/vortex_torch/flow/algorithms.py b/vortex_torch/flow/algorithms.py
Lines changed: 37 additions & 36 deletions
diff --git a/vortex_torch/indexer/matmul.py b/vortex_torch/indexer/matmul.py
Lines changed: 28 additions & 163 deletions
@@ -1,23 +1,20 @@
+/* Class signature: compact (smaller, not oversized). */
 dl.py.class > dt.sig {
-    font-size: 1.4em;             /* 放大字体 */   /* 淡蓝/灰底色，自行改颜色值 */
-    padding: 0.4.rem 0.8rem;       /* 给一点内边距，让块更像卡片 */
-    border-radius: 0.4rem;        /* 圆角 */
+    font-size: 0.9em;
+    padding: 0.3rem 0.6rem;
+    border-radius: 0.4rem;
 }
 
-
 dl.py.class .sig-name {
-    font-weight: 900;
-    font-size: 1.2em;
+    font-weight: 700;
+    font-size: 1em;
 }
 
-/* 调整类方法标题（def xxx）的整体字体大小 */
+/* Method signatures: a touch smaller than body text. */
 dl.py.method > dt.sig {
-    font-size: 1.2em !important;    /* 改成你想要的大小 */
+    font-size: 0.85em;
 }
 
-/* 进一步单独放大方法名 def xxx  */
 dl.py.method > dt.sig .sig-name {
-    font-size: 1.1em !important;
-    font-weight: 700;               /* 让方法名更突出，可选 */
+    font-weight: 600;
 }
-
@@ -102,11 +102,11 @@ class GQABlockSparseAttention(vFlow):
     r"""
     Grouped-query block-sparse routing with a **softmax over pages**.
 
-    Like :class:`BlockSparseAttention` it scores pages by query–centroid
-    similarity, but keeps every grouped-query head separate: each head turns
-    its per-page scores into a softmax distribution over pages, and a page's
-    final score is the **max** of that probability across heads. (Design akin
-    to the GQA sparse-attention formulation in arXiv:2502.11089.)
+    Each page keeps a centroid (the mean of its keys). Every grouped-query
+    head is scored against the centroids separately; each head's per-page
+    scores are turned into a softmax distribution over pages, and a page's
+    final score is the **max** of that probability across heads
+    (cf. the GQA sparse-attention formulation in arXiv:2502.11089).
 
     **Cache.** Per-page centroid :math:`c_p = \frac{1}{|p|}\sum_{k\in p} k`
     via :class:`CMean`.
@@ -265,11 +265,12 @@ class LServeSparseAttention(vFlow):
     r"""
     LSERVE: QUEST envelopes at **sub-block** granularity.
 
-    Sharpens :class:`GQAQuestSparseAttention` by splitting each page into
-    consecutive sub-blocks of :attr:`LSERVE_BLOCK_SIZE` tokens and keeping a
-    separate max/min envelope **per sub-block**. A page is ranked by its
-    single best-matching (head, sub-block) pair, so one relevant sub-region is
-    enough to select the page — a tighter bound than one envelope per page.
+    Each page is split into consecutive sub-blocks of :attr:`LSERVE_BLOCK_SIZE`
+    tokens, and a coordinate-wise max/min key envelope is kept **per
+    sub-block**. The envelopes give a cheap upper bound on the query–key dot
+    product within a sub-block; a page is ranked by its single best-matching
+    (head, sub-block) pair, so one relevant sub-region is enough to select the
+    page.
 
     **Cache.** :meth:`forward_cache` stores, for each of the
     :math:`n_b = \text{block\_size} / \text{LSERVE\_BLOCK\_SIZE}` sub-blocks
@@ -294,8 +295,8 @@ class LServeSparseAttention(vFlow):
 
     **Shapes.** ``q`` is ``[B, H_q, D]``; ``cache["max"]`` / ``cache["min"]``
     are ``[S, n_b, D]`` (indexer) / ``[B, n_b, D]`` (cache). With
-    ``block_size == LSERVE_BLOCK_SIZE`` (:math:`n_b = 1`) this reduces to
-    :class:`GQAQuestSparseAttention`.
+    ``block_size == LSERVE_BLOCK_SIZE`` there is one sub-block per page
+    (:math:`n_b = 1`), i.e. a single envelope over the whole page.
     """
     LSERVE_BLOCK_SIZE = 16
     def __init__(self):
@@ -354,13 +355,12 @@ def create_cache(self, block_size: int, head_dim: int):
 @register("lserve_centroid_sparse_attention")
 class LServeCentroidSparseAttention(vFlow):
     r"""
-    Centroid routing at LSERVE **sub-block** granularity.
+    Centroid routing at sub-block granularity.
 
-    Combines :class:`BlockSparseAttention`'s centroid routing with the
-    sub-block idea of :class:`LServeSparseAttention`: each page is split into
-    consecutive sub-blocks of :attr:`SUB_BLOCK_SIZE` tokens, a centroid is
-    kept **per sub-block**, and a page is ranked by its best-matching
-    sub-block — finer than collapsing the whole page into one centroid.
+    Each page is split into consecutive sub-blocks of :attr:`SUB_BLOCK_SIZE`
+    tokens, and a centroid (the mean of its keys) is kept **per sub-block**. A
+    page is ranked by the query's best match against any of its sub-block
+    centroids, so one relevant sub-region is enough to select the page.
 
     **Cache.** :meth:`forward_cache` stores, for each of the
     :math:`n_b = \text{block\_size} / \text{SUB\_BLOCK\_SIZE}` sub-blocks
@@ -381,8 +381,8 @@ class LServeCentroidSparseAttention(vFlow):
 
     **Shapes.** ``q`` is ``[B, H_q, D]``; ``cache["centroids"]`` is
     ``[S, n_b, D]`` (indexer) / ``[B, n_b, D]`` (cache). With
-    ``block_size == SUB_BLOCK_SIZE`` (:math:`n_b = 1`) this reduces to
-    :class:`BlockSparseAttention`.
+    ``block_size == SUB_BLOCK_SIZE`` there is one sub-block per page
+    (:math:`n_b = 1`), i.e. a single centroid over the whole page.
     """
     SUB_BLOCK_SIZE = 16
 
@@ -437,12 +437,12 @@ class MaskedQuestSparseAttention(vFlow):
     r"""
     QUEST routing with a feature-axis mask that drops low-signal channels.
 
-    Identical to :class:`GQAQuestSparseAttention`, but a :class:`MaskSlice`
-    zeroes the leading ``MASK_END`` feature coordinates of the QUEST bound
-    before the feature sum — a cheap, position-only way to exclude
-    low-signal channels (e.g. large-magnitude "sink" dimensions). Since
-    :class:`MaskSlice` is a pure position writer, no extra state is threaded
-    through ``ctx``.
+    Each page keeps coordinate-wise max and min key envelopes, which together
+    upper-bound the largest query–key dot product in the page. Before summing
+    over features, a :class:`MaskSlice` zeroes the leading ``MASK_END`` feature
+    coordinates of that bound — a cheap, position-only way to exclude
+    low-signal channels (e.g. large-magnitude "sink" dimensions). The mask is a
+    pure position writer, so no extra state is threaded through ``ctx``.
 
     **Cache.** Per-page key envelopes :math:`M_p = \max_{k\in p} k` and
     :math:`m_p = \min_{k\in p} k` via :class:`CMax` / :class:`CMin`.
@@ -523,9 +523,9 @@ class CenteredBlockSparseAttention(vFlow):
     r"""
     Centroid block-sparse routing with per-request **mean-centering**.
 
-    Scores pages by query–centroid similarity like
-    :class:`BlockSparseAttention`, then subtracts the per-request mean score
-    across pages before selection — so a page competes by how far *above
+    Each page keeps a centroid (the mean of its keys); pages are scored by
+    query–centroid similarity, and the per-request mean score across pages is
+    subtracted before selection — so a page competes by how far *above
     average* it is, not by raw similarity.
 
     **Cache.** Per-page centroid :math:`c_p = \frac{1}{|p|}\sum_{k\in p} k`
@@ -593,9 +593,10 @@ class RunningAvgBlockSparse(vFlow):
     Centroid block-sparse routing with a per-page **running score**
     (a :class:`Save` / :class:`Load` demo).
 
-    Like :class:`BlockSparseAttention`, but instead of scoring on the current
-    step alone it keeps an exponentially-decayed running score per page across
-    decode steps: pages that stay relevant accumulate, pages that fade decay.
+    Each page keeps a centroid (the mean of its keys). Instead of scoring on
+    the current step alone, the per-page query–centroid score is accumulated
+    into an exponentially-decayed running score across decode steps: pages that
+    stay relevant accumulate, pages that fade decay.
 
     **Cache.** Per-page centroid :math:`c_p` via :class:`CMean`; the persistent
     ``running_score`` is zero-initialised with :class:`CFill` when a page is
@@ -675,10 +676,10 @@ class VEnergyGatedCentroid(vFlow):
     r"""
     Centroid routing **gated by value-block energy**.
 
-    Scores a page by the query–centroid dot product like
-    :class:`BlockSparseAttention`, then multiplies by the page's mean value
-    magnitude (its "energy"): pages whose values carry little energy are muted
-    even when the key centroid aligns with the query.
+    Each page keeps a key centroid (the mean of its keys); a page is scored by
+    the query–centroid dot product multiplied by the page's mean value
+    magnitude (its "energy"), so pages whose values carry little energy are
+    muted even when the key centroid aligns with the query.
 
     **Cache.** :meth:`forward_cache` stores a per-page key centroid
     :math:`c_p` (:class:`CMean`) and the value energy — the mean :math:`L_2`
 
@@ -7,69 +7,17 @@
 
 class GeMV(vOp):
     r"""
-    General matrix-vector multiplication (GEMV) dispatcher.
-
-    This operator computes a *piecewise* batched matrix-vector product.
-    Let
-
-    .. math::
-
-        X \in \mathbb{R}^{B \times 1 \times D}, \qquad
-        Y \in \mathbb{R}^{S_{\text{pack}} \times 1 \times D},
-
-    where the ``S``-axis of :math:`Y` is a concatenation of batch-wise
-    segments
-
-    .. math::
-
-        S_{\text{pack}} = \sum_{i=0}^{B-1} S_i, \qquad
-        Y =
-        \begin{bmatrix}
-            Y_0 \\
-            Y_1 \\
-            \vdots \\
-            Y_{B-1}
-        \end{bmatrix},
-
-    with
-
-    .. math::
-
-        Y_i \in \mathbb{R}^{S_i \times 1 \times D}, \qquad
-        X_i = X[i, 0, :] \in \mathbb{R}^{1 \times D}.
-
-    For each batch index :math:`i \in \{0,\dots,B-1\}`, we define
-
-    .. math::
-
-        O_i = Y_i X_i^{\mathsf{T}} \in \mathbb{R}^{S_i \times 1 \times 1},
-
-    and the overall output is the concatenation
-
-    .. math::
-
-        O =
-        \begin{bmatrix}
-            O_0 \\
-            O_1 \\
-            \vdots \\
-            O_{B-1}
-        \end{bmatrix}
-        \in \mathbb{R}^{S_{\text{pack}} \times 1 \times 1}.
-
-    In the runtime, :math:`S_{\text{pack}}` is given by
-    ``ctx.max_num_pages``. Output format rule: ``BATCHED`` iff both
-    inputs are ``BATCHED`` (both have their ``S`` axis already collapsed
-    to 1), otherwise ``RAGGED``. Format compatibility is enforced by
-    the compiler's per-workload kernel.
-
-    Attributes
-    ----------
-    output_format : Optional[FORMAT]
-        The output tensor format as determined in :meth:`profile`.
-
-    output_buffer : Optional[torch.Tensor]
-        Preallocated output tensor buffer of shape ``[S_pack, 1, 1]``.
+    Per-request batched matrix–vector product, :math:`O = Y X^{\top}`.
+
+    :Math: with a batched query :math:`X\in\mathbb{R}^{B\times 1\times D}` and
+        packed pages :math:`Y\in\mathbb{R}^{S\times 1\times D}`, each page
+        :math:`s` of request :math:`i` scores as
+        :math:`O[s]=\langle Y[s],\,X[i]\rangle`, giving
+        :math:`O\in\mathbb{R}^{S\times 1\times 1}`.
+    :__init__: ``GeMV()`` — no arguments.
+    :__call__: ``o = op(x, y, ctx=ctx)`` — ``x`` is ``[B, 1, D]``, ``y`` is
+        ``[S, 1, D]`` (matching ``D``); returns ``o`` ``[S, 1, 1]``. Output is
+        ``BATCHED`` iff both inputs are, else ``RAGGED``.
     """
 
     def __init__(self):
@@ -79,19 +27,9 @@ def __init__(self):
         self.schedule = Schedule.W
     # ---------------- profile ----------------
     def profile(self, x: vTensor, y: vTensor, ctx: Context) -> vTensor:
-        r"""
-        Validate inputs, allocate the output buffer, and return a
-        :class:`vTensor` view.
-
-        The method enforces the logical shapes
-
-        - ``x``: ``[B, 1, D]``
-        - ``y``: ``[S_pack, 1, D]``
-
-        and checks that the last dimensions match. The output buffer is
-        allocated with shape ``[S_pack, 1, 1]``, where ``S_pack`` is taken
-        from the runtime context as ``ctx.max_num_pages``.
-        """
+        r"""Trace-time: validate ``x`` ``[B, 1, D]`` / ``y`` ``[S, 1, D]``,
+        register the op, and return a ``vTensor`` view of the ``[S, 1, 1]``
+        output (see the class docstring)."""
         prefix = self._prefix()
 
         # Type checks
@@ -137,51 +75,17 @@ def profile(self, x: vTensor, y: vTensor, ctx: Context) -> vTensor:
 # ------------------------------ GeMM ------------------------------ #
 class GeMM(vOp):
     r"""
-    General matrix-matrix multiplication (GeMM) dispatcher.
-
-    Logically this computes, for each logical ``S``-slice, a matrix-matrix
-    product
-
-    .. math::
-
-        O[s] = Y[s] X[s]^{\mathsf{T}}, \quad s = 0, \dots, S-1,
-
-    with slice-wise shapes
-
-    .. math::
-
-        X[s] \in \mathbb{R}^{N_x \times K}, \quad
-        Y[s] \in \mathbb{R}^{N_y \times K}, \quad
-        O[s] \in \mathbb{R}^{N_y \times N_x}.
-
-    In the packed 3D representation used by this dispatcher:
-
-    - ``Y`` has logical shape ``[S, N_y, K]``.
-    - ``X`` has logical shape ``[L_x, N_x, K]``, where the leading
-      dimension :math:`L_x` can represent **either**:
-
-      * a batch axis :math:`B` (when ``x_format == FORMAT.BATCHED``), or
-      * the same ``S`` axis as ``Y`` (when ``x_format`` is ragged/paged and
-        already laid out per-page).
-
-      This is why the code comments use ``X: [B/S, N_x, K]``: the first
-      dimension is interpreted as either a batch size :math:`B` or an
-      ``S``-like logical page index, depending on the format.
-
-    - The output tensor ``O`` has logical shape ``[S, N_y, N_x]``.
-
-    At runtime, the logical ``S`` is taken from ``ctx.max_num_pages``.
-    Output format rule: ``BATCHED`` iff both inputs are ``BATCHED``,
-    otherwise ``RAGGED``. Format compatibility is enforced by the
-    compiler's per-workload kernel.
-
-    Attributes
-    ----------
-    output_format : Optional[FORMAT]
-        The output tensor format as determined in :meth:`profile`.
-
-    output_buffer : Optional[torch.Tensor]
-        Preallocated output tensor buffer of shape ``[S, N_y, N_x]``.
+    Per-page matrix–matrix product, :math:`O[s] = Y[s]\,X[s]^{\top}`.
+
+    :Math: with :math:`Y\in\mathbb{R}^{S\times N_y\times K}` and
+        :math:`X\in\mathbb{R}^{(B\,\text{or}\,S)\times N_x\times K}`, per page
+        :math:`s` (a batched ``x`` contributes its owning request's slice):
+        :math:`O[s]=Y[s]\,X[s]^{\top}\in\mathbb{R}^{N_y\times N_x}` — i.e.
+        ``GeMM(x, y) = y xᵀ``. Output :math:`O\in\mathbb{R}^{S\times N_y\times N_x}`.
+    :__init__: ``GeMM()`` — no arguments.
+    :__call__: ``o = op(x, y, ctx=ctx)`` — ``x`` is ``[B|S, N_x, K]``, ``y`` is
+        ``[S, N_y, K]`` (matching ``K``); returns ``o`` ``[S, N_y, N_x]``.
+        Output is ``BATCHED`` iff both inputs are, else ``RAGGED``.
     """
 
     def __init__(self):
@@ -192,48 +96,9 @@ def __init__(self):
 
     # ---------------- profile ----------------
     def profile(self, x: vTensor, y: vTensor, ctx: Context) -> vTensor:
-        r"""
-        Validate inputs, allocate the output buffer, and return a
-        :class:`vTensor` view.
-
-        The method enforces that both inputs are rank-3 tensors and that the
-        inner dimension :math:`K` matches:
-
-        - ``x``: ``[B_or_S, N_x, K]``
-
-          *When* ``x_format == FORMAT.BATCHED``, the leading dimension is a
-          batch size :math:`B`. For ragged/paged formats, it may conceptually
-          coincide with :math:`S`.
-
-        - ``y``: ``[S, N_y, K]``
-
-        The output buffer is allocated with shape ``[S, N_y, N_x]``, where
-        ``S`` is taken from the runtime context as ``ctx.max_num_pages``.
-
-        Parameters
-        ----------
-        x : vTensor
-            Right-hand operand (transposed in the mathematical view), with
-            shape ``[B_or_S, N_x, K]``.
-
-        y : vTensor
-            Left-hand operand with shape ``[S, N_y, K]``.
-
-        ctx : Context
-            Execution context providing ``ctx.max_num_pages`` for the logical
-            ``S`` dimension and tracking auxiliary memory.
-
-        Returns
-        -------
-        vTensor
-            A ``vTensor`` view wrapping the allocated output buffer.
-
-        Raises
-        ------
-        AssertionError
-            If types are not ``vTensor``, ranks are not 3, or the inner
-            dimensions :math:`K` do not match.
-        """
+        r"""Trace-time: validate ``x`` ``[B|S, N_x, K]`` / ``y`` ``[S, N_y, K]``
+        (matching ``K``), register the op, and return a ``vTensor`` view of the
+        ``[S, N_y, N_x]`` output (see the class docstring)."""
         prefix = self._prefix()
 
         # Type checks