vllm.model_executor.layers.quantization.kernels.scaled_mm

Modules:

ScaledMMLinearKernel
aiter
cpu
cutlass
flashinfer
pytorch
rocm
triton

_KernelConfigT module-attribute

_KernelConfigT = TypeVar(
    "_KernelConfigT", bound=ScaledMMLinearLayerConfig
)

_KernelT module-attribute

_KernelT = TypeVar('_KernelT', bound=ScaledMMLinearKernel)

logger module-attribute

logger = init_logger(__name__)

choose_scaled_mm_linear_kernel

choose_scaled_mm_linear_kernel(
    config: _KernelConfigT,
    possible_kernels: dict[
        PlatformEnum, list[type[_KernelT]]
    ],
    compute_capability: int | None = None,
    force_kernel: type[_KernelT] | None = None,
) -> type[_KernelT]

Choose a _KernelT that can implement the given config for the given compute capability. Kernels are tried in list order and the first one that is both supported and able to implement the config is returned, so ordering each platform's list from most to least performant yields the best available kernel.

Parameters:

config (_KernelConfigT, required):
    Description of the linear layer to be implemented.

possible_kernels (dict[PlatformEnum, list[type[_KernelT]]], required):
    A dictionary mapping each platform to its list of possible kernels.

compute_capability (int | None, default None):
    The compute capability of the target device. If None, the compute
    capability is queried from current_platform.

force_kernel (type[_KernelT] | None, default None):
    An optional kernel to force, overriding possible_kernels if it can be
    implemented. If None, only the possible kernels are tried.

Raises:

ValueError: If no kernel can implement the given config.

Returns:

type[_KernelT]: The chosen kernel.

Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
def choose_scaled_mm_linear_kernel(
    config: _KernelConfigT,
    possible_kernels: dict[PlatformEnum, list[type[_KernelT]]],
    compute_capability: int | None = None,
    force_kernel: type[_KernelT] | None = None,
) -> type[_KernelT]:
    """
    Choose a _KernelT that can implement the given config for the
    given compute capability. Attempts to choose the best kernel in terms of
    performance.

    Args:
        config (_KernelConfigT): Description of the linear layer
            to be implemented.
        possible_kernels (dict[PlatformEnum, list[type[_KernelT]]]): A
            dictionary mapping each platform to its list of possible kernels.
        compute_capability (int | None): The compute capability of the
            target device; if None, uses `current_platform` to get the
            compute capability. Defaults to None.
        force_kernel (type[_KernelT] | None): An optional kernel to force,
            overriding the possible_kernels if it can be implemented. If
            None, only the possible kernels are tried.

    Raises:
        ValueError: If no kernel can implement the given config.

    Returns:
        _KernelT: Chosen kernel.
    """

    failure_reason_list = []

    if force_kernel is not None:
        can_implement, failure_reason = is_supported_and_can_implement_kernel(
            force_kernel, config, compute_capability
        )
        if can_implement:
            return force_kernel

        logger.info_once(
            "Tried to force %s, but the kernel couldn't be implemented",
            force_kernel.__name__,
            scope="global",
        )

    for kernel in possible_kernels[current_platform._enum]:
        is_supported_and_can_implement, failure_reason = (
            is_supported_and_can_implement_kernel(kernel, config, compute_capability)
        )
        if is_supported_and_can_implement:
            return kernel
        failure_reason_list.append(failure_reason)

    raise ValueError(
        "Failed to find a kernel that can implement the "
        "ScaledMM linear layer. Reasons: \n" + "\n".join(failure_reason_list)
    )
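
A self-contained sketch of the selection contract (the stub kernels below are hypothetical; real kernels subclass ScaledMMLinearKernel): each kernel exposes is_supported and can_implement classmethods returning an (ok, failure_reason) pair, and the first passing kernel in the current platform's list wins.

from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
    choose_scaled_mm_linear_kernel,
)
from vllm.platforms import current_platform


class FastButPicky:  # hypothetical stub
    @classmethod
    def is_supported(cls, compute_capability):
        return False, "requires a newer compute capability"

    @classmethod
    def can_implement(cls, config):
        return True, ""


class SlowButSafe:  # hypothetical stub
    @classmethod
    def is_supported(cls, compute_capability):
        return True, ""

    @classmethod
    def can_implement(cls, config):
        return True, ""


kernels = {current_platform._enum: [FastButPicky, SlowButSafe]}
# A real call passes a ScaledMMLinearLayerConfig; the stubs ignore it.
chosen = choose_scaled_mm_linear_kernel(config=None, possible_kernels=kernels)
assert chosen is SlowButSafe  # FastButPicky was rejected by is_supported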

init_fp8_linear_kernel

init_fp8_linear_kernel(
    activation_quant_key: QuantKey,
    weight_quant_key: QuantKey,
    out_dtype: torch.dtype,
    force_kernel: type[FP8ScaledMMLinearKernel] | None = None,
    module_name: str | None = None,
) -> FP8ScaledMMLinearKernel
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
def init_fp8_linear_kernel(
    activation_quant_key: QuantKey,
    weight_quant_key: QuantKey,
    out_dtype: torch.dtype,
    force_kernel: type[FP8ScaledMMLinearKernel] | None = None,
    module_name: str | None = None,
) -> FP8ScaledMMLinearKernel:
    scaled_mm_linear_kernel_config = FP8ScaledMMLinearLayerConfig(
        weight_quant_key=weight_quant_key,
        activation_quant_key=activation_quant_key,
        out_dtype=out_dtype,
    )

    kernel_type = choose_scaled_mm_linear_kernel(
        scaled_mm_linear_kernel_config, _POSSIBLE_FP8_KERNELS, force_kernel=force_kernel
    )

    if module_name:
        logger.info_once(
            "Selected %s for %s",
            kernel_type.__name__,
            module_name,
            scope="global",
        )

    return kernel_type(
        scaled_mm_linear_kernel_config,
        layer_param_names=["weight", "weight_scale", "input_scale", "input_scale_ub"],
    )
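
A hedged usage sketch follows; act_key and weight_key are placeholders for real QuantKey instances (built by the calling quantization method from the checkpoint's quant config), and the module name is illustrative.

import torch

from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
    init_fp8_linear_kernel,
)

kernel = init_fp8_linear_kernel(
    activation_quant_key=act_key,  # placeholder QuantKey (activations)
    weight_quant_key=weight_key,   # placeholder QuantKey (weights)
    out_dtype=torch.bfloat16,
    force_kernel=None,             # optionally pin a specific kernel class
    module_name="layers.0.mlp",    # illustrative; used only for logging
)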

init_int8_linear_kernel

init_int8_linear_kernel(
    is_channelwise: bool,
    is_static_input_scheme: bool,
    input_symmetric: bool,
    module_name: str,
) -> Int8ScaledMMLinearKernel
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
def init_int8_linear_kernel(
    is_channelwise: bool,
    is_static_input_scheme: bool,
    input_symmetric: bool,
    module_name: str,
) -> Int8ScaledMMLinearKernel:
    config = Int8ScaledMMLinearLayerConfig(
        is_channelwise=is_channelwise,
        is_static_input_scheme=is_static_input_scheme,
        input_symmetric=input_symmetric,
    )

    kernel_type = choose_scaled_mm_linear_kernel(
        config,
        _POSSIBLE_INT8_KERNELS,
    )

    logger.info_once(
        "Selected %s for %s",
        kernel_type.__name__,
        module_name,
        scope="global",
    )

    return kernel_type(
        config,
        layer_param_names=[
            "weight",
            "weight_scale",
            "input_scale",
            "input_zero_point",
            "azp_adj",
        ],
    )
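
Since the int8 path is configured with plain booleans, a usage sketch is direct (the module name is illustrative; which kernel is selected still depends on the platform).

from vllm.model_executor.layers.quantization.kernels.scaled_mm import (
    init_int8_linear_kernel,
)

# Per-channel weight scales, dynamic (per-token) activation scales,
# symmetric activation quantization.
kernel = init_int8_linear_kernel(
    is_channelwise=True,
    is_static_input_scheme=False,
    input_symmetric=True,
    module_name="layers.0.qkv_proj",  # illustrative; used only for logging
)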

is_supported_and_can_implement_kernel

is_supported_and_can_implement_kernel(
    kernel: type[_KernelT],
    config: _KernelConfigT,
    compute_capability: int | None,
) -> tuple[bool, str]
Source code in vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py
def is_supported_and_can_implement_kernel(
    kernel: type[_KernelT], config: _KernelConfigT, compute_capability: int | None
) -> tuple[bool, str]:
    # TODO: Fetch `VLLM_DISABLED_KERNELS` from vllm.envs instead.
    if kernel.__name__ in os.environ.get("VLLM_DISABLED_KERNELS", "").split(","):
        return False, f" {kernel.__name__} is disabled by environment variable"

    if compute_capability is None:
        _cc = current_platform.get_device_capability()
        if _cc is not None:
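            # Flatten (major, minor) into a single int, e.g. (9, 0) -> 90.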
            compute_capability = _cc[0] * 10 + _cc[1]

    is_supported, failure_reason = kernel.is_supported(compute_capability)
    if not is_supported:
        return False, f"{kernel.__name__} {failure_reason}."

    can_implement, failure_reason = kernel.can_implement(config)
    if not can_implement:
        return (
            False,
            f"{kernel.__name__} {failure_reason}.",
        )

    return True, ""
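
The environment-variable gate at the top means kernels can be opted out by class name without code changes; a short sketch (the kernel names are illustrative):

import os

# Comma-separated kernel class names (__name__ values) to skip. The names
# below are illustrative; real names are defined in the submodules above.
# The variable is read at call time, so set it before kernel selection runs.
os.environ["VLLM_DISABLED_KERNELS"] = "KernelA,KernelB"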