vllm.v1.attention.ops.flashmla

_flashmla_C_AVAILABLE module-attribute

_flashmla_C_AVAILABLE = True

_flashmla_extension_C_AVAILABLE module-attribute

_flashmla_extension_C_AVAILABLE = True

logger module-attribute

logger = init_logger(__name__)

_is_flashmla_available

_is_flashmla_available() -> tuple[bool, str | None]
Source code in vllm/v1/attention/ops/flashmla.py
def _is_flashmla_available() -> tuple[bool, str | None]:
    if not _flashmla_C_AVAILABLE:
        return (
            False,
            "vllm._flashmla_C is not available, likely was not "
            "compiled due to insufficient nvcc version or a supported arch "
            "was not in the list of target arches to compile for.",
        )
    if not _flashmla_extension_C_AVAILABLE:
        return (
            False,
            "vllm._flashmla_extension_C is not available, likely "
            "was not compiled due to a build error.",
        )

    return True, None
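
A minimal sketch of how a caller might gate on this check before touching the FlashMLA ops (the fallback branch here is hypothetical, using only names defined in this module):

is_available, reason = _is_flashmla_available()
if not is_available:
    logger.warning("FlashMLA unavailable: %s", reason)
    # fall back to another attention backend (hypothetical path)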

_raise_flashmla_unavailable

_raise_flashmla_unavailable(*_args, **_kwargs)
Source code in vllm/v1/attention/ops/flashmla.py
def _raise_flashmla_unavailable(*_args, **_kwargs):
    _, reason = _is_flashmla_available()
    raise RuntimeError(reason or "FlashMLA is not available")

flash_mla_with_kvcache_fp8

flash_mla_with_kvcache_fp8(
    q: Tensor,
    k_cache: Tensor,
    block_table: Tensor,
    cache_seqlens: Tensor,
    head_dim_v: int,
    tile_scheduler_metadata: Tensor,
    num_splits: Tensor,
    softmax_scale: float | None = None,
    causal: bool = False,
    descale_q: Tensor | None = None,
    descale_k: Tensor | None = None,
) -> tuple[Tensor, Tensor]
Source code in vllm/v1/attention/ops/flashmla.py
def flash_mla_with_kvcache_fp8(
    q: torch.Tensor,
    k_cache: torch.Tensor,
    block_table: torch.Tensor,
    cache_seqlens: torch.Tensor,
    head_dim_v: int,
    tile_scheduler_metadata: torch.Tensor,
    num_splits: torch.Tensor,
    softmax_scale: float | None = None,
    causal: bool = False,
    descale_q: torch.Tensor | None = None,
    descale_k: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    if not _is_flashmla_available()[0]:
        _raise_flashmla_unavailable()
    if softmax_scale is None:
        softmax_scale = q.shape[-1] ** (-0.5)
    out, softmax_lse = torch.ops._flashmla_extension_C.fwd_kvcache_mla_fp8(
        q,
        k_cache,
        head_dim_v,
        cache_seqlens,
        block_table,
        softmax_scale,
        causal,
        tile_scheduler_metadata,
        num_splits,
        descale_q,
        descale_k,
    )
    return out, softmax_lse
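
When softmax_scale is omitted it defaults to q.shape[-1] ** (-0.5), i.e. 1/sqrt(head_dim) of the query. The tile_scheduler_metadata and num_splits arguments must come from get_mla_metadata_dense_fp8 (documented below); a combined usage sketch follows that listing.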

get_mla_metadata_dense_fp8

get_mla_metadata_dense_fp8(
    cache_seqlens: Tensor,
    num_q_tokens_per_head_k: int,
    num_heads_k: int,
) -> tuple[Tensor, Tensor]
Source code in vllm/v1/attention/ops/flashmla.py
def get_mla_metadata_dense_fp8(
    cache_seqlens: torch.Tensor,
    num_q_tokens_per_head_k: int,
    num_heads_k: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    if not _is_flashmla_available()[0]:
        _raise_flashmla_unavailable()
    return torch.ops._flashmla_extension_C.get_mla_decoding_metadata_dense_fp8(
        cache_seqlens,
        num_q_tokens_per_head_k,
        num_heads_k,
    )
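
A hedged end-to-end sketch of the FP8 decode path, combining the two ops above. All shapes, dtypes, and sizes are illustrative assumptions (576 = 512 latent + 64 RoPE dims is a common MLA layout), not requirements stated by this module:

import torch

# Hypothetical decode-time shapes (assumptions for illustration only):
batch, seq_len_q, num_heads_q, num_heads_k = 4, 1, 128, 1
head_dim, head_dim_v, block_size, num_blocks = 576, 512, 64, 1024

q = torch.randn(batch, seq_len_q, num_heads_q, head_dim,
                device="cuda", dtype=torch.bfloat16)  # dtype is an assumption
k_cache = torch.zeros(num_blocks, block_size, num_heads_k, head_dim,
                      device="cuda", dtype=torch.float8_e4m3fn)  # FP8 cache (assumption)
block_table = torch.zeros(batch, 128, device="cuda", dtype=torch.int32)
cache_seqlens = torch.full((batch,), 512, device="cuda", dtype=torch.int32)

# Metadata is computed once per batch of sequence lengths, then reused.
tile_scheduler_metadata, num_splits = get_mla_metadata_dense_fp8(
    cache_seqlens,
    seq_len_q * num_heads_q // num_heads_k,  # num_q_tokens_per_head_k (assumed formula)
    num_heads_k,
)

out, softmax_lse = flash_mla_with_kvcache_fp8(
    q, k_cache, block_table, cache_seqlens, head_dim_v,
    tile_scheduler_metadata, num_splits,
    causal=True,
)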

is_flashmla_dense_supported

is_flashmla_dense_supported() -> tuple[bool, str | None]

Return: is_supported_flag, unsupported_reason (optional).

Source code in vllm/v1/attention/ops/flashmla.py
def is_flashmla_dense_supported() -> tuple[bool, str | None]:
    """
    Return: is_supported_flag, unsupported_reason (optional).
    """
    is_available, maybe_reason = _is_flashmla_available()
    if not is_available:
        return False, maybe_reason
    if not current_platform.is_device_capability_family(90):
        return False, "FlashMLA Dense is only supported on Hopper devices."
    return True, None
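
A small sketch of how a backend selector might consume this check (the fallback choice is hypothetical):

supported, reason = is_flashmla_dense_supported()
if supported:
    backend = "FLASHMLA"  # dense FlashMLA decode path
else:
    logger.info("FlashMLA dense unsupported: %s", reason)
    backend = "TRITON_MLA"  # hypothetical fallback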

is_flashmla_sparse_supported

is_flashmla_sparse_supported() -> tuple[bool, str | None]

Return: is_supported_flag, unsupported_reason (optional).

Source code in vllm/v1/attention/ops/flashmla.py
def is_flashmla_sparse_supported() -> tuple[bool, str | None]:
    """
    Return: is_supported_flag, unsupported_reason (optional).
    """
    is_available, maybe_reason = _is_flashmla_available()
    if not is_available:
        return False, maybe_reason
    if not (
        current_platform.is_device_capability_family(90)
        or current_platform.is_device_capability_family(100)
    ):
        return (
            False,
            "FlashMLA Sparse is only supported on Hopper and Blackwell devices.",
        )
    return True, None
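
The sparse check differs from the dense one only in also accepting Blackwell (device capability family 100) alongside Hopper (family 90). A sketch that probes both variants; the preference order is illustrative, not vLLM's actual selection logic:

def pick_flashmla_variant() -> str | None:
    # Illustrative policy: prefer the sparse kernel, fall back to dense.
    sparse_ok, _ = is_flashmla_sparse_supported()
    if sparse_ok:
        return "sparse"
    dense_ok, _ = is_flashmla_dense_supported()
    if dense_ok:
        return "dense"
    return None  # neither variant usable on this device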