vllm.v1.kv_cache_interface ¶

ChunkedLocalAttentionSpec `dataclass` ¶

Bases: AttentionSpec

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True, kw_only=True)
class ChunkedLocalAttentionSpec(AttentionSpec):
    """Attention spec that additionally records a local-attention chunk size.

    Per-request KV usage is bounded by one chunk plus one scheduler batch,
    capped at the model length.
    """

    # Chunk size, in tokens (it is added to a token count below).
    attention_chunk_size: int

    def max_admission_blocks_per_request(
        self, max_num_batched_tokens: int, max_model_len: int
    ) -> int:
        """Return the per-request admission cap, expressed in blocks.

        This is the single source of truth shared by startup pool sizing
        (`max_memory_usage_bytes`) and the runtime admission gate, so any
        request admitted at startup can also be admitted at runtime.

        Args:
            max_num_batched_tokens: Tokens the scheduler may batch per step.
            max_model_len: Upper bound on the tokens a request can hold.

        Returns:
            Number of blocks needed for the bounded token count.
        """
        # While prefilling in chunks, a request keeps KV for at most one
        # chunk window on top of the tokens scheduled in the current batch.
        window_and_batch = self.attention_chunk_size + max_num_batched_tokens
        held_tokens = min(window_and_batch, max_model_len)
        return cdiv(held_tokens, self.block_size)

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        """Return the worst-case KV cache bytes for one request.

        Reads `max_model_len` and `max_num_batched_tokens` from
        `vllm_config` and converts the resulting block cap to bytes.
        """
        block_cap = self.max_admission_blocks_per_request(
            max_model_len=vllm_config.model_config.max_model_len,
            max_num_batched_tokens=(
                vllm_config.scheduler_config.max_num_batched_tokens
            ),
        )
        return block_cap * self.page_size_bytes

max_admission_blocks_per_request ¶

max_admission_blocks_per_request(
    max_num_batched_tokens: int, max_model_len: int
) -> int

Per-request admission cap, in blocks.

Single source of truth for both startup pool sizing (max_memory_usage_bytes) and the runtime admission gate, so requests admitted by startup can also be admitted at runtime.

Source code in vllm/v1/kv_cache_interface.py

def max_admission_blocks_per_request(
    self, max_num_batched_tokens: int, max_model_len: int
) -> int:
    """Return the per-request admission cap, expressed in blocks.

    This is the single source of truth shared by startup pool sizing
    (`max_memory_usage_bytes`) and the runtime admission gate, so any
    request admitted at startup can also be admitted at runtime.

    Args:
        max_num_batched_tokens: Tokens the scheduler may batch per step.
        max_model_len: Upper bound on the tokens a request can hold.

    Returns:
        Number of blocks needed for the bounded token count.
    """
    # While prefilling in chunks, a request keeps KV for at most one chunk
    # window on top of the tokens scheduled in the current batch.
    window_and_batch = self.attention_chunk_size + max_num_batched_tokens
    held_tokens = min(window_and_batch, max_model_len)
    return cdiv(held_tokens, self.block_size)

CrossAttentionSpec `dataclass` ¶

Bases: AttentionSpec

KV cache spec for cross-attention layers in encoder-decoder models.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True)
class CrossAttentionSpec(AttentionSpec):
    """
    KV cache spec for cross-attention layers in encoder-decoder models.
    """

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        """Return the bytes needed to cache the longest encoder input.

        The size is derived from
        `vllm_config.scheduler_config.max_num_encoder_input_tokens`,
        rounded up to whole blocks.
        """
        # Cross-attention caches encoder states, so the encoder length
        # (e.g., 1500 for Whisper) determines the size.
        encoder_tokens = vllm_config.scheduler_config.max_num_encoder_input_tokens
        encoder_blocks = cdiv(encoder_tokens, self.block_size)
        return encoder_blocks * self.page_size_bytes

FullAttentionSpec `dataclass` ¶

Bases: AttentionSpec

When the hybrid allocator is disabled and the model contains both full attention layers and sliding window attention layers, the sliding window attention layers are treated as full attention by the KV cache manager (blocks are allocated for all tokens) but are still computed as sliding window attention in the model runner. In this case, we use FullAttentionSpec and record the sliding window size.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True, kw_only=True)
class FullAttentionSpec(AttentionSpec):
    """
    Spec for layers that the KV cache manager handles as full attention.

    When the hybrid allocator is disabled and the model mixes full attention
    layers with sliding window attention layers, the sliding window layers
    are treated as full attention by the KV cache manager (blocks are
    allocated for all tokens) while the model runner still computes them as
    sliding window attention. In that case FullAttentionSpec is used and the
    sliding window size is recorded here.
    """

    # Value head size; a ``None`` default is replaced by ``head_size`` in
    # ``__post_init__``.
    head_size_v: int = None  # type: ignore[assignment]

    sliding_window: int | None = None
    """
    Default to None for not using sliding window attention.
    """
    # Chunk size of chunked local attention layers; None when unused.
    attention_chunk_size: int | None = None

    def __post_init__(self):
        """Fill in ``head_size_v`` from ``head_size`` when it was not given."""
        if self.head_size_v is not None:
            return
        # The dataclass is frozen, so regular attribute assignment is blocked.
        object.__setattr__(self, "head_size_v", self.head_size)

    @property
    def state_content_size_bytes(self) -> int:
        """Bytes of K plus V content, computed from head sizes and dtype.

        In NVFP4 mode the K and V packed dimensions must match and only one
        of them is counted.
        """
        if not self.kv_quant_mode.is_nvfp4:
            return (self.head_size + self.head_size_v) * get_dtype_size(self.dtype)
        packed_k = nvfp4_kv_cache_full_dim(self.head_size)
        packed_v = nvfp4_kv_cache_full_dim(self.head_size_v)
        assert packed_k == packed_v, (
            "nvfp4 with asymmetric K/V head sizes not yet supported"
        )
        return packed_k * get_dtype_size(self.dtype)

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        """Return the bytes needed to hold ``max_model_len`` tokens.

        With decode/prefill context parallelism enabled, the token count is
        first divided (rounding up) by the product of the two world sizes.
        """
        tokens = vllm_config.model_config.max_model_len
        parallel_config = vllm_config.parallel_config
        cp_world_size = (
            parallel_config.decode_context_parallel_size
            * parallel_config.prefill_context_parallel_size
        )
        # Note(hc): each context-parallel rank only needs to keep its own
        # share of the tokens locally.
        if cp_world_size > 1:
            tokens = cdiv(tokens, cp_world_size)
        return cdiv(tokens, self.block_size) * self.page_size_bytes

    @classmethod
    def merge_window_sizes(cls, window_sizes: set[int]) -> int | None:
        """Collapse a set of window sizes into a single value.

        Returns None for an empty set and the sole element for a singleton
        set (the element is popped from the given set).

        Raises:
            ValueError: If the set holds more than one distinct size.
        """
        if not window_sizes:
            return None
        if len(window_sizes) > 1:
            raise ValueError(
                "All attention layers in the same KV cache group must have the "
                "same window size."
            )
        return window_sizes.pop()

    @classmethod
    def merge(cls, specs: list[Self]) -> Self:
        """
        Combine several FullAttentionSpec objects into one spec of this class.

        The merged spec takes its common fields from the first spec, and its
        sliding window / chunk size from the unique non-None values found
        across all specs. Every spec must agree with the merged spec on all
        `AttentionSpec` fields, and sliding window and chunked local
        attention must not both be present.
        """
        for candidate in specs:
            assert isinstance(candidate, FullAttentionSpec), (
                "All attention layers in the same KV cache group must be FullAttentionSpec."
            )

        window_sizes = {
            candidate.sliding_window
            for candidate in specs
            if candidate.sliding_window is not None
        }
        chunk_sizes = {
            candidate.attention_chunk_size
            for candidate in specs
            if candidate.attention_chunk_size is not None
        }
        for candidate in specs:
            assert not isinstance(candidate, MLAAttentionSpec), (
                "MLAAttentionSpec should be merged in MLAAttentionSpec.merge"
            )

        head = specs[0]
        merged = cls(
            block_size=head.block_size,
            num_kv_heads=head.num_kv_heads,
            head_size=head.head_size,
            head_size_v=head.head_size_v,
            dtype=head.dtype,
            kv_quant_mode=head.kv_quant_mode,
            page_size_padded=head.page_size_padded,
            sliding_window=cls.merge_window_sizes(window_sizes),
            attention_chunk_size=cls.merge_window_sizes(chunk_sizes),
        )

        # Every input must match the merged spec on the base-class fields.
        shared_field_names = [f.name for f in fields(AttentionSpec)]
        for candidate in specs:
            for field_name in shared_field_names:
                assert getattr(candidate, field_name) == getattr(
                    merged, field_name
                ), (
                    "All attention layers in the same KV cache group must have "
                    "the same attention spec."
                )

        uses_window = merged.sliding_window is not None
        uses_chunks = merged.attention_chunk_size is not None
        assert not (uses_window and uses_chunks), (
            "Model with both sliding window layers and chunked local attention "
            "layers is not supported."
        )
        return merged

    @property
    def real_page_size_bytes(self) -> int:
        """Unpadded bytes of one page: block_size x heads x last dim x dtype."""
        if self.kv_quant_mode.is_nvfp4:
            # Packed layout per head: fp4 data + fp8 block scales.
            # fp4 data: head_size//2 bytes (2 fp4 values per byte)
            # fp8 block scale: head_size//16 bytes (1 scale per 16 elements)
            last_dim = nvfp4_kv_cache_full_dim(
                self.head_size
            ) + nvfp4_kv_cache_full_dim(self.head_size_v)
        else:
            last_dim = self.head_size + self.head_size_v
        return (
            self.block_size * self.num_kv_heads * last_dim * get_dtype_size(self.dtype)
        )

sliding_window `class-attribute` `instance-attribute` ¶

sliding_window: int | None = None

Defaults to None, meaning sliding window attention is not used.

merge `classmethod` ¶

merge(specs: list[Self]) -> Self

Merge a list of FullAttentionSpec objects into a single FullAttentionSpec object.

Source code in vllm/v1/kv_cache_interface.py

@classmethod
def merge(cls, specs: list[Self]) -> Self:
    """
    Combine several FullAttentionSpec objects into one spec of this class.

    The merged spec takes its common fields from the first spec, and its
    sliding window / chunk size from the unique non-None values found across
    all specs. Every spec must agree with the merged spec on all
    `AttentionSpec` fields, and sliding window and chunked local attention
    must not both be present.
    """
    for candidate in specs:
        assert isinstance(candidate, FullAttentionSpec), (
            "All attention layers in the same KV cache group must be FullAttentionSpec."
        )

    window_sizes = {
        candidate.sliding_window
        for candidate in specs
        if candidate.sliding_window is not None
    }
    chunk_sizes = {
        candidate.attention_chunk_size
        for candidate in specs
        if candidate.attention_chunk_size is not None
    }
    for candidate in specs:
        assert not isinstance(candidate, MLAAttentionSpec), (
            "MLAAttentionSpec should be merged in MLAAttentionSpec.merge"
        )

    head = specs[0]
    merged = cls(
        block_size=head.block_size,
        num_kv_heads=head.num_kv_heads,
        head_size=head.head_size,
        head_size_v=head.head_size_v,
        dtype=head.dtype,
        kv_quant_mode=head.kv_quant_mode,
        page_size_padded=head.page_size_padded,
        sliding_window=cls.merge_window_sizes(window_sizes),
        attention_chunk_size=cls.merge_window_sizes(chunk_sizes),
    )

    # Every input must match the merged spec on the base-class fields.
    shared_field_names = [f.name for f in fields(AttentionSpec)]
    for candidate in specs:
        for field_name in shared_field_names:
            assert getattr(candidate, field_name) == getattr(merged, field_name), (
                "All attention layers in the same KV cache group must have "
                "the same attention spec."
            )

    uses_window = merged.sliding_window is not None
    uses_chunks = merged.attention_chunk_size is not None
    assert not (uses_window and uses_chunks), (
        "Model with both sliding window layers and chunked local attention "
        "layers is not supported."
    )
    return merged

HiddenStateCacheSpec `dataclass` ¶

Bases: MLAAttentionSpec

Marker for hidden-state cache layers used by extract_hidden_states.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True, kw_only=True)
class HiddenStateCacheSpec(MLAAttentionSpec):
    """Marker for hidden-state cache layers used by extract_hidden_states.

    Declares no fields or methods of its own; everything is inherited from
    `MLAAttentionSpec`.
    """

KVCacheConfig `dataclass` ¶

The KV cache configuration of a model.

Source code in vllm/v1/kv_cache_interface.py

@dataclass
class KVCacheConfig:
    """
    The KV cache configuration of a model.
    """

    num_blocks: int
    """The number of KV cache blocks"""
    kv_cache_tensors: list[KVCacheTensor]
    """How should model runner initialize the KV cache tensors for each layer"""
    kv_cache_groups: list[KVCacheGroupSpec]
    """
    The kv cache groups of the model.
    For models with only one type of attention, there is only one group that
    contains all layers.
    For models with multiple types of attention, there will be multiple groups,
    see `_get_kv_cache_config_uniform_page_size` for more details.
    """

    @property
    def has_mamba_layers(self) -> bool:
        """Whether at least one KV cache group uses a `MambaSpec`."""
        for group in self.kv_cache_groups:
            if isinstance(group.kv_cache_spec, MambaSpec):
                return True
        return False

    @property
    def needs_kv_cache_zeroing(self) -> bool:
        """Whether the KV cache must be zeroed; true iff Mamba layers exist."""
        mamba_present = self.has_mamba_layers
        return mamba_present

kv_cache_groups `instance-attribute` ¶

kv_cache_groups: list[KVCacheGroupSpec]

The kv cache groups of the model. For models with only one type of attention, there is only one group that contains all layers. For models with multiple types of attention, there will be multiple groups, see _get_kv_cache_config_uniform_page_size for more details.

kv_cache_tensors `instance-attribute` ¶

kv_cache_tensors: list[KVCacheTensor]

How the model runner should initialize the KV cache tensors for each layer.

num_blocks `instance-attribute` ¶

num_blocks: int

The number of KV cache blocks

KVCacheGroupSpec `dataclass` ¶

Represents a group of model layers that share the same KV cache block table. These layers are regarded as one layer in the KV cache manager.

Source code in vllm/v1/kv_cache_interface.py

@dataclass
class KVCacheGroupSpec:
    """
    Represents a group of model layers that share the same KV cache block table.
    These layers are regarded as one layer in the KV cache manager.
    """

    # The names of the model layers that belong to this group.
    layer_names: list[str]
    # The single KV cache spec that describes this group to the KV cache
    # manager (the group is handled as one "manager layer").
    kv_cache_spec: KVCacheSpec
    # Whether this group contains EAGLE/MTP draft attention layers.
    # Defaults to False.
    is_eagle_group: bool = False

KVCacheLayout ¶

Bases: Enum

Physical layout descriptor for a KV cache group.

The logical shape is always `[L, B, H, N, <content>]` (RFC #42082). Each member's value is a stride permutation that maps logical axes to physical (memory) order.

Source code in vllm/v1/kv_cache_interface.py

class KVCacheLayout(Enum):
    """Physical layout descriptor for a KV cache group.

    The logical shape is always [L, B, H, N, <content>] (RFC #42082).
    Each member's value is a stride permutation that maps logical axes
    to physical (memory) order.
    """

    LBHNC = (0, 1, 2, 3, 4)  # [L, B, H, N, C] (identity)
    LBNHC = (0, 1, 3, 2, 4)  # [L, B, N, H, C]
    BLHNC = (1, 0, 2, 3, 4)  # [B, L, H, N, C]
    BHLNC = (1, 2, 0, 3, 4)  # [B, H, L, N, C]

    @property
    def stride_order(self) -> tuple[int, ...]:
        """The full 5D stride permutation, i.e. this member's value."""
        return self.value

    @property
    def layer_stride_order(self) -> tuple[int, ...]:
        """4D permutation [B, H, N, C] for per-layer tensors.

        Raises:
            ValueError: If this layout is not layer-compact.
        """
        if self.is_layer_compact:
            # Drop the L axis and shift the remaining axis indices down by one.
            return tuple(axis - 1 for axis in self.value if axis != _DIM_L)
        compact = [m.name for m in KVCacheLayout if m.is_layer_compact]
        raise ValueError(
            f"KVCacheLayout.{self.name} cannot produce a 4D"
            f" layer_stride_order because L is not outermost."
            f" Use a layer-compact layout: {compact}"
        )

    @property
    def is_layer_compact(self) -> bool:
        """True when the layer is compact; i.e. the L dimension is outermost."""
        entry_at_l = self.value[_DIM_L]
        return entry_at_l == 0

    @property
    def is_block_contiguous(self) -> bool:
        """True when [H, N, C] is contiguous within a block."""
        innermost_axes = self.value[-3:]
        return innermost_axes == (_DIM_H, _DIM_N, _DIM_C)

is_block_contiguous `property` ¶

is_block_contiguous: bool

True when [H, N, C] is contiguous within a block.

is_layer_compact `property` ¶

is_layer_compact: bool

True when the layer is compact; i.e. the L dimension is outermost.

layer_stride_order `property` ¶

layer_stride_order: tuple[int, ...]

4D permutation [B, H, N, C] for per-layer tensors.

KVCacheSpec `dataclass` ¶

A base class for specifying the KV cache format of one layer.

RFC #42082 standard vocabulary (properties, overridden by subclasses): `num_heads: int` — heads (1 if headless, e.g. MLA); `tokens_per_state: int` — -1 infinite (recurrent), 1 standard, N compressed; `state_content_size_bytes: int` — bytes per state per head.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True)
class KVCacheSpec:
    """
    Base class describing the KV cache format of a single layer.

    RFC #42082 standard vocabulary (properties, overridden by subclasses):
      num_heads: int          — heads (1 if headless, e.g. MLA)
      tokens_per_state: int   — -1 infinite (recurrent), 1 standard, N compressed
      state_content_size_bytes: int — bytes per state per head
    """

    block_size: int

    @property
    def num_heads(self) -> int:
        """Number of heads; not implemented in the base class."""
        raise NotImplementedError

    @property
    def tokens_per_state(self) -> int:
        """Tokens covered by one state; not implemented in the base class."""
        raise NotImplementedError

    @property
    def state_content_size_bytes(self) -> int:
        """Bytes per state per head; not implemented in the base class."""
        raise NotImplementedError

    @property
    def page_size_bytes(self) -> int:
        """
        Size in bytes of one page holding `block_size` tokens.

        Returns:
            The page size
        """
        raise NotImplementedError

    @property
    def storage_block_size(self) -> int:
        """Block size used for storage; the base class returns `block_size`."""
        return self.block_size

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        """
        Upper bound on the memory this KV cache can use, in bytes.

        Returns:
            The KV cache size in bytes
        """
        raise NotImplementedError

    def copy_with_new_block_size(self, block_size: int) -> Self:
        """
        Return a copy of this spec whose block size is `block_size`.
        """
        resized = replace(self, block_size=block_size)
        return resized

    @classmethod
    def merge(cls, specs: list[Self]) -> Self:
        """
        Collapse a list of identical KVCacheSpec objects into one deep copy.
        """
        reference = specs[0]
        for other in specs[1:]:
            assert other == reference, (
                "All layers in the same KV cache group must be the same."
            )
        return copy.deepcopy(reference)

page_size_bytes `property` ¶

page_size_bytes: int

The size of a page with block_size tokens in bytes.

Returns:

Type	Description
`int`	The page size

copy_with_new_block_size ¶

copy_with_new_block_size(block_size: int) -> Self

Create a new KVCacheSpec from self but replacing the block size.

Source code in vllm/v1/kv_cache_interface.py

def copy_with_new_block_size(self, block_size: int) -> Self:
    """
    Create a new KVCacheSpec from self but replacing the block size.
    """
    return replace(self, block_size=block_size)

max_memory_usage_bytes ¶

max_memory_usage_bytes(vllm_config: VllmConfig) -> int

The maximum possible memory usage of this KV cache in bytes.

Returns:

Type	Description
`int`	The KV cache size in bytes

Source code in vllm/v1/kv_cache_interface.py

def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
    """
    The maximum possible memory usage of this KV cache in bytes.

    This base implementation is a placeholder; the concrete spec classes in
    this file provide their own versions.

    Args:
        vllm_config: The vLLM configuration from which limits are read.

    Returns:
        The KV cache size in bytes

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

merge `classmethod` ¶

merge(specs: list[Self]) -> Self

Merge a list of KVCacheSpec objects into a single KVCacheSpec object.

Source code in vllm/v1/kv_cache_interface.py

@classmethod
def merge(cls, specs: list[Self]) -> Self:
    """
    Merge a list of KVCacheSpec objects into a single KVCacheSpec object.
    """
    assert all(spec == specs[0] for spec in specs[1:]), (
        "All layers in the same KV cache group must be the same."
    )
    return copy.deepcopy(specs[0])

KVCacheTensor `dataclass` ¶

One contiguous GPU allocation backing one or more layer slots.

shared_by[slot_idx] lists the layer names aliasing slot slot_idx. Layers in the same inner list belong to different groups (independent block tables) so their block-id namespaces never collide.

Source code in vllm/v1/kv_cache_interface.py

@dataclass
class KVCacheTensor:
    """One contiguous GPU allocation backing one or more layer slots.

    ``shared_by[slot_idx]`` lists the layer names aliasing slot ``slot_idx``.
    Layers in the same inner list belong to different groups (independent
    block tables) so their block-id namespaces never collide.
    """

    # Total size of the allocation, in bytes.
    size: int  # total size in bytes
    # Outer index is the slot; each inner list holds the names of the layers
    # that alias that slot.
    shared_by: list[list[str]]  # shared_by[slot_idx] = [layer_names]

KVQuantMode ¶

Bases: IntEnum

KV cache quantization mode.

Used by attention backends and kernels to dispatch quantization logic without string matching on kv_cache_dtype.

Source code in vllm/v1/kv_cache_interface.py

class KVQuantMode(IntEnum):
    """Quantization mode of the KV cache.

    Lets attention backends and kernels dispatch quantization logic on an
    enum value instead of string matching on ``kv_cache_dtype``.
    """

    NONE = 0
    FP8_PER_TENSOR = 1  # per-tensor scales (current fp8 path)
    INT8_PER_TOKEN_HEAD = 2  # per-token-head dynamic scales for int8
    FP8_PER_TOKEN_HEAD = 3  # per-token-head dynamic scales for fp8
    NVFP4 = 4  # packed fp4 data + fp8 block scales

    @property
    def is_per_token_head(self) -> bool:
        """Whether this mode uses per-token-head scales (int8 or fp8)."""
        return (
            self == KVQuantMode.INT8_PER_TOKEN_HEAD
            or self == KVQuantMode.FP8_PER_TOKEN_HEAD
        )

    @property
    def is_nvfp4(self) -> bool:
        """Whether this mode is the NVFP4 packed quantization mode."""
        return self is KVQuantMode.NVFP4

is_nvfp4 `property` ¶

is_nvfp4: bool

True for NVFP4 packed quantization mode.

is_per_token_head `property` ¶

is_per_token_head: bool

True for any per-token-head quantization mode.

SinkFullAttentionSpec `dataclass` ¶

Bases: FullAttentionSpec

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True)
class SinkFullAttentionSpec(FullAttentionSpec):
    """FullAttentionSpec that additionally carries a ``sink_len`` field."""

    # Sink length; None when not set.
    sink_len: int | None = None

    @classmethod
    def merge(cls, specs: list[Self]) -> Self:
        """
        Combine several specs into a single spec of this class.

        Same procedure as `FullAttentionSpec.merge`, except that `sink_len`
        is also carried over, taken from the first spec in the list.
        """
        for candidate in specs:
            assert isinstance(candidate, FullAttentionSpec), (
                "All attention layers in the same KV cache group must be FullAttentionSpec."
            )

        window_sizes = {
            candidate.sliding_window
            for candidate in specs
            if candidate.sliding_window is not None
        }
        chunk_sizes = {
            candidate.attention_chunk_size
            for candidate in specs
            if candidate.attention_chunk_size is not None
        }
        for candidate in specs:
            assert not isinstance(candidate, MLAAttentionSpec), (
                "MLAAttentionSpec should be merged in MLAAttentionSpec.merge"
            )

        head = specs[0]
        merged = cls(
            block_size=head.block_size,
            num_kv_heads=head.num_kv_heads,
            head_size=head.head_size,
            head_size_v=head.head_size_v,
            sink_len=head.sink_len,
            dtype=head.dtype,
            kv_quant_mode=head.kv_quant_mode,
            page_size_padded=head.page_size_padded,
            sliding_window=cls.merge_window_sizes(window_sizes),
            attention_chunk_size=cls.merge_window_sizes(chunk_sizes),
        )

        # Every input must match the merged spec on the base-class fields.
        shared_field_names = [f.name for f in fields(AttentionSpec)]
        for candidate in specs:
            for field_name in shared_field_names:
                assert getattr(candidate, field_name) == getattr(
                    merged, field_name
                ), (
                    "All attention layers in the same KV cache group must have "
                    "the same attention spec."
                )

        uses_window = merged.sliding_window is not None
        uses_chunks = merged.attention_chunk_size is not None
        assert not (uses_window and uses_chunks), (
            "Model with both sliding window layers and chunked local attention "
            "layers is not supported."
        )
        return merged

merge `classmethod` ¶

merge(specs: list[Self]) -> Self

Merge a list of FullAttentionSpec objects into a single FullAttentionSpec object.

Source code in vllm/v1/kv_cache_interface.py

@classmethod
def merge(cls, specs: list[Self]) -> Self:
    """
    Combine several specs into a single spec of this class.

    Common fields and `sink_len` are taken from the first spec; the sliding
    window and chunk size are the unique non-None values across all specs.
    Every spec must agree with the merged spec on all `AttentionSpec` fields.
    """
    for candidate in specs:
        assert isinstance(candidate, FullAttentionSpec), (
            "All attention layers in the same KV cache group must be FullAttentionSpec."
        )

    window_sizes = {
        candidate.sliding_window
        for candidate in specs
        if candidate.sliding_window is not None
    }
    chunk_sizes = {
        candidate.attention_chunk_size
        for candidate in specs
        if candidate.attention_chunk_size is not None
    }
    for candidate in specs:
        assert not isinstance(candidate, MLAAttentionSpec), (
            "MLAAttentionSpec should be merged in MLAAttentionSpec.merge"
        )

    head = specs[0]
    merged = cls(
        block_size=head.block_size,
        num_kv_heads=head.num_kv_heads,
        head_size=head.head_size,
        head_size_v=head.head_size_v,
        sink_len=head.sink_len,
        dtype=head.dtype,
        kv_quant_mode=head.kv_quant_mode,
        page_size_padded=head.page_size_padded,
        sliding_window=cls.merge_window_sizes(window_sizes),
        attention_chunk_size=cls.merge_window_sizes(chunk_sizes),
    )

    # Every input must match the merged spec on the base-class fields.
    shared_field_names = [f.name for f in fields(AttentionSpec)]
    for candidate in specs:
        for field_name in shared_field_names:
            assert getattr(candidate, field_name) == getattr(merged, field_name), (
                "All attention layers in the same KV cache group must have "
                "the same attention spec."
            )

    uses_window = merged.sliding_window is not None
    uses_chunks = merged.attention_chunk_size is not None
    assert not (uses_window and uses_chunks), (
        "Model with both sliding window layers and chunked local attention "
        "layers is not supported."
    )
    return merged

SlidingWindowMLASpec `dataclass` ¶

Bases: SlidingWindowSpec

Sliding window attention with MLA cache format.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True, kw_only=True)
class SlidingWindowMLASpec(SlidingWindowSpec):
    """Sliding window attention with MLA cache format."""

    cache_dtype_str: str | None = None
    # DeepseekV4-only: see MLAAttentionSpec.model_version.
    alignment: int | None = None  # Default to None for no padding.
    tokens_per_state: int = 1
    model_version: str | None = None

    def __post_init__(self):
        """Run the parent initialisation, then apply alignment padding."""
        super().__post_init__()
        _apply_alignment_padding(self)

    @property
    def state_content_size_bytes(self) -> int:
        """Bytes of one state: 584 for DeepseekV4, else head_size x dtype."""
        if self.model_version != "deepseek_v4":
            return self.head_size * get_dtype_size(self.dtype)
        return 584

    @property
    def storage_block_size(self) -> int:
        """Number of stored states per block: block_size // tokens_per_state."""
        states_per_block = self.block_size // self.tokens_per_state
        return states_per_block

    @property
    def real_page_size_bytes(self) -> int:
        """Unpadded bytes of one page, based on `storage_block_size`.

        Only `model_version` values of "deepseek_v4" and None are accepted.
        """
        version = self.model_version
        if version == "deepseek_v4":
            # DeepseekV4: 448B NoPE + 128B RoPE + 8B fp8 scale = 584B per token.
            return self.storage_block_size * 584
        assert version is None, (
            f"Unsupported model version: {self.model_version}"
        )
        return (
            self.storage_block_size
            * self.num_kv_heads
            * self.head_size
            * get_dtype_size(self.dtype)
        )

    @classmethod
    def merge(cls, specs: list[Self]) -> Self:
        """Combine several SlidingWindowMLASpec objects into one.

        All specs must share a single cache dtype string, tokens-per-state
        value, model version and sliding window size; the remaining fields
        are taken from the first spec.
        """
        for candidate in specs:
            assert isinstance(candidate, SlidingWindowMLASpec), (
                "All attention layers in the same KV cache group must be "
                "SlidingWindowMLASpec."
            )
        cache_dtype_str_set = {candidate.cache_dtype_str for candidate in specs}
        tokens_per_state_set = {candidate.tokens_per_state for candidate in specs}
        model_version_set = {candidate.model_version for candidate in specs}
        sliding_window_set = {candidate.sliding_window for candidate in specs}
        all_uniform = all(
            len(distinct_values) == 1
            for distinct_values in (
                cache_dtype_str_set,
                tokens_per_state_set,
                model_version_set,
                sliding_window_set,
            )
        )
        assert all_uniform, (
            "All attention layers in the same KV cache group must use the same "
            "quantization method, compress ratio, model version and sliding "
            "window size."
        )
        head = specs[0]
        return cls(
            block_size=head.block_size,
            num_kv_heads=head.num_kv_heads,
            head_size=head.head_size,
            dtype=head.dtype,
            page_size_padded=head.page_size_padded,
            sliding_window=sliding_window_set.pop(),
            cache_dtype_str=cache_dtype_str_set.pop(),
            tokens_per_state=tokens_per_state_set.pop(),
            model_version=model_version_set.pop(),
        )

SlidingWindowSpec `dataclass` ¶

Bases: AttentionSpec

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True, kw_only=True)
class SlidingWindowSpec(AttentionSpec):
    """Attention spec for sliding window layers with a fixed window size."""

    # Window size, in tokens (it is added to a token count below).
    sliding_window: int
    # Value head size; a ``None`` default is replaced by ``head_size`` in
    # ``__post_init__``.
    head_size_v: int = None  # type: ignore[assignment]

    def __post_init__(self):
        """Fill in ``head_size_v`` from ``head_size`` when it was not given."""
        if self.head_size_v is not None:
            return
        # The dataclass is frozen, so regular attribute assignment is blocked.
        object.__setattr__(self, "head_size_v", self.head_size)

    @property
    def real_page_size_bytes(self) -> int:
        """Unpadded bytes of one page: block_size x heads x last dim x dtype."""
        # Mirror ``FullAttentionSpec.real_page_size_bytes`` for NVFP4 KV cache.
        if self.kv_quant_mode.is_nvfp4:
            last_dim = nvfp4_kv_cache_full_dim(
                self.head_size
            ) + nvfp4_kv_cache_full_dim(self.head_size_v)
        else:
            last_dim = self.head_size + self.head_size_v
        return (
            self.block_size * self.num_kv_heads * last_dim * get_dtype_size(self.dtype)
        )

    def max_admission_blocks_per_request(
        self, max_num_batched_tokens: int, max_model_len: int
    ) -> int:
        """Return the per-request admission cap, expressed in blocks.

        This is the single source of truth shared by startup pool sizing
        (`max_memory_usage_bytes`) and the runtime admission gate. The blocks
        a request really holds plateau at this bound because
        `SlidingWindowManager.remove_skipped_blocks` runs from
        `allocate_slots` before each chunk's `get_num_blocks_to_allocate`.

        Args:
            max_num_batched_tokens: Tokens the scheduler may batch per step.
            max_model_len: Upper bound on the tokens a request can hold.

        Returns:
            Number of blocks for the bounded token count, plus one.
        """
        # While prefilling in chunks we keep KV for the last
        # `sliding_window-1` computed tokens plus the newly scheduled
        # tokens, and never more than `max_model_len`.
        window_and_batch = self.sliding_window - 1 + max_num_batched_tokens
        held_tokens = min(window_and_batch, max_model_len)
        # One extra block because the window need not start on a block
        # boundary. E.g. with block size 4, the 4-token window [CDEF] at the
        # end of a 6-token sequence spans two blocks: [XXCD][EF].
        return cdiv(held_tokens, self.block_size) + 1

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        """Return the worst-case KV cache bytes for one request.

        Requires a decode context parallel size of 1, then converts the
        per-request block cap to bytes.
        """
        assert vllm_config.parallel_config.decode_context_parallel_size == 1, (
            "DCP not support sliding window."
        )
        block_cap = self.max_admission_blocks_per_request(
            max_model_len=vllm_config.model_config.max_model_len,
            max_num_batched_tokens=(
                vllm_config.scheduler_config.max_num_batched_tokens
            ),
        )
        return block_cap * self.page_size_bytes

max_admission_blocks_per_request ¶

max_admission_blocks_per_request(
    max_num_batched_tokens: int, max_model_len: int
) -> int

Per-request admission cap, in blocks.

Single source of truth for both startup pool sizing (max_memory_usage_bytes) and the runtime admission gate. Per-request real-held blocks plateau at this bound because SlidingWindowManager.remove_skipped_blocks runs from allocate_slots before each chunk's get_num_blocks_to_allocate.

Source code in vllm/v1/kv_cache_interface.py

def max_admission_blocks_per_request(
    self, max_num_batched_tokens: int, max_model_len: int
) -> int:
    """Return the per-request admission cap, expressed in blocks.

    This is the single source of truth shared by startup pool sizing
    (`max_memory_usage_bytes`) and the runtime admission gate. The blocks a
    request really holds plateau at this bound because
    `SlidingWindowManager.remove_skipped_blocks` runs from `allocate_slots`
    before each chunk's `get_num_blocks_to_allocate`.

    Args:
        max_num_batched_tokens: Tokens the scheduler may batch per step.
        max_model_len: Upper bound on the tokens a request can hold.

    Returns:
        Number of blocks for the bounded token count, plus one.
    """
    # While prefilling in chunks we keep KV for the last `sliding_window-1`
    # computed tokens plus the newly scheduled tokens, and never more than
    # `max_model_len`.
    window_and_batch = self.sliding_window - 1 + max_num_batched_tokens
    held_tokens = min(window_and_batch, max_model_len)
    # One extra block because the window need not start on a block boundary.
    # E.g. with block size 4, the 4-token window [CDEF] at the end of a
    # 6-token sequence spans two blocks: [XXCD][EF].
    return cdiv(held_tokens, self.block_size) + 1

TQFullAttentionSpec `dataclass` ¶

Bases: FullAttentionSpec

FullAttentionSpec with TQ-aware page size.

Python equivalent of the C++ TQ4FullAttentionSpec. Overrides real_page_size_bytes to use TQ slot bytes instead of the raw head_size * dtype formula.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True, kw_only=True)
class TQFullAttentionSpec(FullAttentionSpec):
    """FullAttentionSpec whose page size is derived from the TQ slot size.

    Python equivalent of the C++ TQ4FullAttentionSpec. When `tq_slot_size`
    is positive, the size properties below are computed from TQ slot bytes
    rather than from the parent's raw head_size * dtype formula; otherwise
    the parent implementation is used unchanged.
    """

    # TQ slot size in bytes; a value <= 0 falls back to the parent's sizes.
    tq_slot_size: int = 0

    @property
    def state_content_size_bytes(self) -> int:
        """Return `tq_slot_size` when positive, else the parent's value."""
        if self.tq_slot_size <= 0:
            return super().state_content_size_bytes
        return self.tq_slot_size

    @property
    def real_page_size_bytes(self) -> int:
        """Return block_size * num_kv_heads * tq_slot_size when TQ is active.

        Falls back to the parent's value when `tq_slot_size` is not positive.
        """
        if self.tq_slot_size <= 0:
            return super().real_page_size_bytes
        slots_per_page = self.block_size * self.num_kv_heads
        return slots_per_page * self.tq_slot_size

    @classmethod
    def merge(cls, specs: list[Self]) -> Self:
        """Merge `specs` via the parent, then carry over the TQ slot size.

        Asserts that every spec shares the same `tq_slot_size`; the merged
        result takes the slot size of the first spec.
        """
        merged = super().merge(specs)
        # A set of at most one element means all slot sizes are identical.
        distinct_slot_sizes = {s.tq_slot_size for s in specs}
        assert len(distinct_slot_sizes) <= 1, (
            "All TQ layers in the same KV cache group must use the same tq_slot_size."
        )
        return replace(merged, tq_slot_size=specs[0].tq_slot_size)

UniformTypeKVCacheSpecs `dataclass` ¶

Bases: KVCacheSpec

A KV cache spec for multiple layers with the same type of attention. Here, "same type" means that the layers always need the same number of token slots. For example, sliding window attentions with different window sizes are not the same type and should not be merged into one UniformTypeKVCacheSpecs.

Source code in vllm/v1/kv_cache_interface.py

@dataclass(frozen=True)
class UniformTypeKVCacheSpecs(KVCacheSpec):
    """
    A KV cache spec for multiple layers with the same type of attention. Here,
    "same type" means the layers always need the same number of token slots.
    For example, sliding window attentions with different window sizes are not
    the same type and should not be merged into one UniformTypeKVCacheSpecs.
    """

    # Per-layer specs; keys are presumably layer names (TODO confirm).
    kv_cache_specs: dict[str, KVCacheSpec]

    @property
    def page_size_bytes(self) -> int:
        """Total page size: the sum of every contained spec's page size."""
        return sum(spec.page_size_bytes for spec in self.kv_cache_specs.values())

    def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int:
        """Return the maximum memory usage in bytes for all layers combined.

        Computed as the largest per-spec page count (see
        `max_memory_usage_pages`) multiplied by the combined page size.
        """
        return self.max_memory_usage_pages(vllm_config) * self.page_size_bytes

    @classmethod
    def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool:
        """
        Whether all layers have the same type of KV cache spec.

        Specs with differing block sizes are never uniform. Otherwise the
        type of the first spec decides which check is applied to all specs.

        Raises:
            NotImplementedError: if the first spec is of an unsupported type.
        """
        block_sizes = {spec.block_size for spec in kv_cache_specs.values()}
        if len(block_sizes) > 1:
            # Different block sizes, not uniform.
            return False
        one_spec = next(iter(kv_cache_specs.values()))
        # NOTE: Check subclasses before parent classes since isinstance()
        # returns True for subclasses.
        if isinstance(one_spec, SlidingWindowMLASpec):
            # SlidingWindowMLASpec is uniform if all specs are SlidingWindowMLASpec
            # with the same sliding_window size.
            return all(
                isinstance(spec, SlidingWindowMLASpec)
                and spec.sliding_window == one_spec.sliding_window
                for spec in kv_cache_specs.values()
            )
        elif isinstance(one_spec, FullAttentionSpec):
            return all(
                isinstance(spec, FullAttentionSpec) for spec in kv_cache_specs.values()
            )
        elif isinstance(one_spec, CrossAttentionSpec):
            return all(
                isinstance(spec, CrossAttentionSpec) for spec in kv_cache_specs.values()
            )
        elif isinstance(one_spec, SlidingWindowSpec):
            # Sliding windows must also agree on the window size.
            return all(
                isinstance(spec, SlidingWindowSpec)
                and spec.sliding_window == one_spec.sliding_window
                for spec in kv_cache_specs.values()
            )
        elif isinstance(one_spec, ChunkedLocalAttentionSpec):
            # Chunked local attention must also agree on the chunk size.
            return all(
                isinstance(spec, ChunkedLocalAttentionSpec)
                and spec.attention_chunk_size == one_spec.attention_chunk_size
                for spec in kv_cache_specs.values()
            )
        elif isinstance(one_spec, MambaSpec):
            # Mamba specs must also agree on the number of speculative blocks.
            return all(
                isinstance(spec, MambaSpec)
                and spec.num_speculative_blocks == one_spec.num_speculative_blocks
                for spec in kv_cache_specs.values()
            )
        else:
            # NOTE(Chen): Please add new branches for new KV cache spec types.
            raise NotImplementedError(
                f"Unsupported KV cache spec type: {type(one_spec)}"
            )

    @classmethod
    def from_specs(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> Self | None:
        """
        Return a UniformTypeKVCacheSpecs object if all layers have the same
        type of KV cache spec. Return None if not.

        The block size of the result is taken from the first spec.
        """
        if cls.is_uniform_type(kv_cache_specs):
            block_size = next(iter(kv_cache_specs.values())).block_size
            return cls(block_size=block_size, kv_cache_specs=kv_cache_specs)
        else:
            return None

    # NOTE: below util functions are only used by DeepseekV4 for now.
    def get_page_sizes(self) -> list[int]:
        """Return the distinct page sizes of the contained specs (unordered)."""
        return list({spec.page_size_bytes for spec in self.kv_cache_specs.values()})

    def get_num_layer_tuples(self) -> int:
        """Return how many specs share the most common page size."""
        return Counter(
            spec.page_size_bytes for spec in self.kv_cache_specs.values()
        ).most_common(1)[0][1]

    def max_memory_usage_pages(self, vllm_config: VllmConfig) -> int:
        """Return the largest page count any single contained spec may need.

        Each spec's maximum memory usage is divided by its own page size,
        rounding up, and the maximum over all specs is returned.
        """
        return max(
            cdiv(spec.max_memory_usage_bytes(vllm_config), spec.page_size_bytes)
            for spec in self.kv_cache_specs.values()
        )

from_specs `classmethod` ¶

from_specs(
    kv_cache_specs: dict[str, KVCacheSpec],
) -> Self | None

Return a UniformTypeKVCacheSpecs object if all layers have the same type of KV cache spec. Return None if not.

Source code in vllm/v1/kv_cache_interface.py

@classmethod
def from_specs(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> Self | None:
    """
    Build a UniformTypeKVCacheSpecs from per-layer specs when every layer
    uses the same type of KV cache spec; otherwise return None.

    The block size of the new object is copied from the first spec.
    """
    if not cls.is_uniform_type(kv_cache_specs):
        return None
    first_spec = next(iter(kv_cache_specs.values()))
    return cls(block_size=first_spec.block_size, kv_cache_specs=kv_cache_specs)

is_uniform_type `classmethod` ¶

is_uniform_type(
    kv_cache_specs: dict[str, KVCacheSpec],
) -> bool

Whether all layers have the same type of KV cache spec.

Source code in vllm/v1/kv_cache_interface.py

@classmethod
def is_uniform_type(cls, kv_cache_specs: dict[str, KVCacheSpec]) -> bool:
    """
    Report whether every layer uses the same type of KV cache spec.

    Specs that disagree on block size are never uniform. Otherwise the type
    of the first spec selects the check that all specs must pass.

    Raises:
        NotImplementedError: if the first spec is of an unsupported type.
    """
    specs = list(kv_cache_specs.values())
    if len({spec.block_size for spec in specs}) > 1:
        # Mixed block sizes can never be uniform.
        return False
    reference = next(iter(specs))

    def all_match(spec_type, attr=None):
        # True when every spec is a `spec_type` and, if `attr` is given,
        # has the same value for that attribute as the reference spec.
        return all(
            isinstance(spec, spec_type)
            and (attr is None or getattr(spec, attr) == getattr(reference, attr))
            for spec in specs
        )

    # NOTE: Subclasses are tested before their parent classes because
    # isinstance() also returns True for subclasses.
    if isinstance(reference, SlidingWindowMLASpec):
        # Uniform only if all specs are SlidingWindowMLASpec with the same
        # sliding_window size.
        return all_match(SlidingWindowMLASpec, "sliding_window")
    if isinstance(reference, FullAttentionSpec):
        return all_match(FullAttentionSpec)
    if isinstance(reference, CrossAttentionSpec):
        return all_match(CrossAttentionSpec)
    if isinstance(reference, SlidingWindowSpec):
        return all_match(SlidingWindowSpec, "sliding_window")
    if isinstance(reference, ChunkedLocalAttentionSpec):
        return all_match(ChunkedLocalAttentionSpec, "attention_chunk_size")
    if isinstance(reference, MambaSpec):
        return all_match(MambaSpec, "num_speculative_blocks")
    # NOTE(Chen): Please add new branches for new KV cache spec types.
    raise NotImplementedError(
        f"Unsupported KV cache spec type: {type(reference)}"
    )

compute_layer_kv_cache_shape_bytes ¶

compute_layer_kv_cache_shape_bytes(
    spec: KVCacheSpec,
    num_blocks: int,
    block_size: int | None = None,
) -> tuple[int, ...]

Return the 4D logical shape (B, H, N, C) where C is in bytes.

Source code in vllm/v1/kv_cache_interface.py

def compute_layer_kv_cache_shape_bytes(
    spec: KVCacheSpec,
    num_blocks: int,
    block_size: int | None = None,
) -> tuple[int, ...]:
    """Build the 4D logical shape ``(B, H, N, C)``, with C measured in bytes.

    An explicit ``block_size`` takes precedence; when it is ``None`` the
    spec's ``storage_block_size`` is used instead.
    """
    if block_size is None:
        effective_block_size = spec.storage_block_size
    else:
        effective_block_size = block_size
    # N: number of states per block, derived from the effective block size.
    states_per_block = num_states_for(effective_block_size, spec.tokens_per_state)
    return (
        num_blocks,
        spec.num_heads,
        states_per_block,
        spec.state_content_size_bytes,
    )

get_kv_quant_mode ¶

get_kv_quant_mode(kv_cache_dtype: str) -> KVQuantMode

Map a kv_cache_dtype string to a KVQuantMode.

Source code in vllm/v1/kv_cache_interface.py

def get_kv_quant_mode(kv_cache_dtype: str) -> KVQuantMode:
    """Translate a ``kv_cache_dtype`` string into its :class:`KVQuantMode`.

    Exact names are matched first; any other string starting with ``"fp8"``
    maps to per-tensor FP8, and everything else maps to ``KVQuantMode.NONE``.
    """
    exact_modes = (
        ("int8_per_token_head", KVQuantMode.INT8_PER_TOKEN_HEAD),
        ("fp8_per_token_head", KVQuantMode.FP8_PER_TOKEN_HEAD),
        ("nvfp4", KVQuantMode.NVFP4),
    )
    # Exact matches must win over the generic "fp8" prefix rule below.
    for dtype_name, mode in exact_modes:
        if kv_cache_dtype == dtype_name:
            return mode
    has_fp8_prefix = isinstance(kv_cache_dtype, str) and kv_cache_dtype.startswith(
        "fp8"
    )
    return KVQuantMode.FP8_PER_TENSOR if has_fp8_prefix else KVQuantMode.NONE

kv_cache_uses_per_token_head_scales ¶

kv_cache_uses_per_token_head_scales(
    kv_cache_dtype: str,
) -> bool

Return True if kv_cache_dtype needs per-token-head scales.

Source code in vllm/v1/kv_cache_interface.py

def kv_cache_uses_per_token_head_scales(kv_cache_dtype: str) -> bool:
    """Tell whether *kv_cache_dtype* requires per-token-head scales.

    The answer is the ``is_per_token_head`` flag of the quantization mode
    that ``get_kv_quant_mode`` resolves for the given dtype.
    """
    quant_mode = get_kv_quant_mode(kv_cache_dtype)
    return quant_mode.is_per_token_head

num_states_for ¶

num_states_for(
    block_size: int, tokens_per_state: int
) -> int

Derive num_states at allocation time (not part of the spec).

Source code in vllm/v1/kv_cache_interface.py

def num_states_for(block_size: int, tokens_per_state: int) -> int:
    """Compute num_states at allocation time (it is not stored on the spec).

    A ``tokens_per_state`` of -1 marks the recurrent case, which keeps a
    single state per block; otherwise the result is the floor division of
    ``block_size`` by ``tokens_per_state``.
    """
    is_recurrent = tokens_per_state == -1
    return 1 if is_recurrent else block_size // tokens_per_state

reshape_kv_cache ¶

reshape_kv_cache(
    raw: Tensor,
    spec: KVCacheSpec,
    num_blocks: int,
    num_layer_slots: int,
    layout: KVCacheLayout,
    block_size: int | None = None,
) -> list[Tensor]

View a flat int8 buffer as 4D [B, H, N, C] per-slot views.

Works for all KVCacheSpec subclasses. Shapes as int8 via compute_layer_kv_cache_shape_bytes, then reinterprets as spec.dtype.

Source code in vllm/v1/kv_cache_interface.py

def reshape_kv_cache(
    raw: torch.Tensor,
    spec: KVCacheSpec,
    num_blocks: int,
    num_layer_slots: int,
    layout: KVCacheLayout,
    block_size: int | None = None,
) -> list[torch.Tensor]:
    """Split a flat int8 buffer into per-slot 4D ``[B, H, N, C]`` views.

    Applies to every KVCacheSpec subclass: the buffer is first shaped in
    bytes (int8) using compute_layer_kv_cache_shape_bytes, and only then
    reinterpreted as ``spec.dtype`` when the spec defines one.

    Returns:
        One view per layer slot, i.e. ``num_layer_slots`` tensors.
    """
    dtype = getattr(spec, "dtype", None)
    per_layer_shape = compute_layer_kv_cache_shape_bytes(spec, num_blocks, block_size)
    logical_shape_bytes = (num_layer_slots, *per_layer_shape)

    # The physical shape is the 5 logical dims reordered by the layout's
    # stride order; `to_physical_axis[d]` is where logical dim `d` ends up.
    order = layout.stride_order
    physical_shape_bytes = tuple(logical_shape_bytes[dim] for dim in order)
    to_physical_axis = [order.index(dim) for dim in range(5)]

    padded_page_size = getattr(spec, "page_size_padded", None)
    if padded_page_size:
        # Start from the contiguous strides of the physical shape (computed
        # on the meta device, so no memory is allocated) and override the
        # stride of the `_DIM_B` axis (presumably the block axis - TODO
        # confirm) with the padded page size.
        contiguous_strides = torch.empty(physical_shape_bytes, device="meta").stride()
        padded_strides = list(contiguous_strides)
        padded_strides[to_physical_axis[_DIM_B]] = padded_page_size
        cache = torch.as_strided(
            raw, size=physical_shape_bytes, stride=tuple(padded_strides)
        )
    else:
        cache = raw.view(physical_shape_bytes)

    # Permute back so the dims appear in logical order again.
    cache_logical = cache.permute(*to_physical_axis)
    if dtype is not None:
        cache_logical = cache_logical.view(dtype)

    return [cache_logical[slot] for slot in range(num_layer_slots)]

vllm.v1.kv_cache_interface ¶

ChunkedLocalAttentionSpec dataclass ¶

max_admission_blocks_per_request ¶

CrossAttentionSpec dataclass ¶

FullAttentionSpec dataclass ¶

sliding_window class-attribute instance-attribute ¶

merge classmethod ¶

HiddenStateCacheSpec dataclass ¶

KVCacheConfig dataclass ¶

kv_cache_groups instance-attribute ¶

kv_cache_tensors instance-attribute ¶

num_blocks instance-attribute ¶

KVCacheGroupSpec dataclass ¶

KVCacheLayout ¶

is_block_contiguous property ¶

is_layer_compact property ¶

layer_stride_order property ¶

KVCacheSpec dataclass ¶

page_size_bytes property ¶

copy_with_new_block_size ¶

max_memory_usage_bytes ¶

merge classmethod ¶

KVCacheTensor dataclass ¶

KVQuantMode ¶

is_nvfp4 property ¶

is_per_token_head property ¶

SinkFullAttentionSpec dataclass ¶

merge classmethod ¶

SlidingWindowMLASpec dataclass ¶

SlidingWindowSpec dataclass ¶

max_admission_blocks_per_request ¶

TQFullAttentionSpec dataclass ¶

UniformTypeKVCacheSpecs dataclass ¶

from_specs classmethod ¶

is_uniform_type classmethod ¶

compute_layer_kv_cache_shape_bytes ¶

get_kv_quant_mode ¶

kv_cache_uses_per_token_head_scales ¶

num_states_for ¶

reshape_kv_cache ¶

ChunkedLocalAttentionSpec `dataclass` ¶

CrossAttentionSpec `dataclass` ¶

FullAttentionSpec `dataclass` ¶

sliding_window `class-attribute` `instance-attribute` ¶

merge `classmethod` ¶

HiddenStateCacheSpec `dataclass` ¶

KVCacheConfig `dataclass` ¶

kv_cache_groups `instance-attribute` ¶

kv_cache_tensors `instance-attribute` ¶

num_blocks `instance-attribute` ¶

KVCacheGroupSpec `dataclass` ¶

is_block_contiguous `property` ¶

is_layer_compact `property` ¶

layer_stride_order `property` ¶

KVCacheSpec `dataclass` ¶

page_size_bytes `property` ¶

merge `classmethod` ¶

KVCacheTensor `dataclass` ¶

is_nvfp4 `property` ¶

is_per_token_head `property` ¶

SinkFullAttentionSpec `dataclass` ¶

merge `classmethod` ¶

SlidingWindowMLASpec `dataclass` ¶

SlidingWindowSpec `dataclass` ¶

TQFullAttentionSpec `dataclass` ¶

UniformTypeKVCacheSpecs `dataclass` ¶

from_specs `classmethod` ¶

is_uniform_type `classmethod` ¶