Skip to content

vllm.multimodal.media.audio

AudioEmbeddingMediaIO

Bases: MediaIO[Tensor]

Source code in vllm/multimodal/media/audio.py
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Media IO for audio embeddings serialized as ``torch`` tensors.

    Tensors are read with ``torch.load(..., weights_only=True)`` and always
    returned after a ``to_dense()`` call; encoding is delegated to
    ``tensor2base64``.
    """

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _load_dense(source) -> torch.Tensor:
        """Load a tensor from *source* and return the result of ``to_dense()``.

        *source* is handed to ``torch.load`` unchanged, so it is either the
        in-memory buffer built by ``load_bytes`` or the path given to
        ``load_file``.
        """
        # Sparse tensor integrity checks are switched on for the whole
        # load-and-densify sequence to prevent out-of-bounds writes from
        # maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(source, weights_only=True)
            return loaded.to_dense()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from raw bytes held in memory."""
        return self._load_dense(BytesIO(data))

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Decode base64 text and load the tensor it contains.

        ``media_type`` is accepted but not used by this implementation.
        """
        raw = pybase64.b64decode(data, validate=True)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from the file at *filepath*."""
        return self._load_dense(filepath)

    def encode_base64(self, media: torch.Tensor) -> str:
        """Encode *media* as a string using ``tensor2base64``."""
        return tensor2base64(media)

__init__

__init__() -> None
Source code in vllm/multimodal/media/audio.py
def __init__(self) -> None:
    """Initialize the loader by delegating to the base-class constructor."""
    # No state is set here; everything is left to the parent initializer.
    super().__init__()

encode_base64

encode_base64(media: Tensor) -> str
Source code in vllm/multimodal/media/audio.py
def encode_base64(self, media: torch.Tensor) -> str:
    """Encode *media* as a string using ``tensor2base64``.

    Args:
        media: Tensor to encode.

    Returns:
        Whatever ``tensor2base64`` produces for *media* (annotated ``str``).
    """
    encoded = tensor2base64(media)
    return encoded

load_base64

load_base64(media_type: str, data: str) -> Tensor
Source code in vllm/multimodal/media/audio.py
def load_base64(self, media_type: str, data: str) -> torch.Tensor:
    """Decode base64 text and load the tensor it contains.

    Args:
        media_type: Accepted but not used by this implementation.
        data: Base64 text; decoded with ``validate=True``.

    Returns:
        The tensor produced by ``self.load_bytes`` for the decoded bytes.
    """
    raw = pybase64.b64decode(data, validate=True)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> Tensor
Source code in vllm/multimodal/media/audio.py
def load_bytes(self, data: bytes) -> torch.Tensor:
    """Deserialize a tensor from raw bytes and return it densified.

    Args:
        data: Serialized tensor bytes, read through an in-memory buffer.

    Returns:
        The loaded tensor after ``to_dense()``.
    """
    # Sparse tensor integrity checks stay enabled for the load and the
    # densify step to prevent out-of-bounds writes from maliciously
    # crafted tensors.
    with torch.sparse.check_sparse_tensor_invariants():
        loaded = torch.load(BytesIO(data), weights_only=True)
        return loaded.to_dense()

load_file

load_file(filepath: Path) -> Tensor
Source code in vllm/multimodal/media/audio.py
def load_file(self, filepath: Path) -> torch.Tensor:
    """Deserialize a tensor from a file and return it densified.

    Args:
        filepath: Location of the serialized tensor; passed straight to
            ``torch.load``.

    Returns:
        The loaded tensor after ``to_dense()``.
    """
    # Sparse tensor integrity checks stay enabled for the load and the
    # densify step to prevent out-of-bounds writes from maliciously
    # crafted tensors.
    with torch.sparse.check_sparse_tensor_invariants():
        loaded = torch.load(filepath, weights_only=True)
        return loaded.to_dense()

AudioMediaIO

Bases: MediaIO[tuple[NDArray, float]]

Source code in vllm/multimodal/media/audio.py
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Media IO for audio handled as ``(samples, sample_rate)`` pairs.

    Decoding goes through ``librosa.load`` with ``sr=None`` (per the librosa
    documentation, this keeps the native sampling rate instead of
    resampling); encoding goes through ``soundfile.write``.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__()

        # Custom arguments coming from --media-io-kwargs for this modality.
        # They are kept on the instance so that underlying media loaders
        # (e.g. custom implementations) can consume them for flexible
        # control; nothing in this class reads them.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode audio from raw bytes held in memory."""
        stream = BytesIO(data)
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode base64 text and load the audio it contains.

        ``media_type`` is accepted but not used by this implementation.
        """
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode audio from the file at *filepath*."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Write ``(samples, sample_rate)`` as audio and base64-encode it.

        ``audio_format`` is forwarded to ``soundfile.write`` as ``format``.
        """
        # NOTE(review): the sample rate is annotated ``int`` here but
        # ``float`` in the loaders' return type -- confirm which is intended.
        samples, sample_rate = media

        with BytesIO() as sink:
            soundfile.write(sink, samples, sample_rate, format=audio_format)
            return base64.b64encode(sink.getvalue()).decode("utf-8")

kwargs instance-attribute

kwargs = kwargs

__init__

__init__(**kwargs) -> None
Source code in vllm/multimodal/media/audio.py
def __init__(self, **kwargs) -> None:
    """Initialize the base class and keep ``kwargs`` on the instance.

    Args:
        **kwargs: Arbitrary keyword arguments, stored unchanged as
            ``self.kwargs``.
    """
    super().__init__()

    # `kwargs` contains custom arguments from
    # --media-io-kwargs for this modality.
    # They can be passed to the underlying
    # media loaders (e.g. custom implementations)
    # for flexible control.
    self.kwargs = kwargs

encode_base64

encode_base64(
    media: tuple[NDArray, int], *, audio_format: str = "WAV"
) -> str
Source code in vllm/multimodal/media/audio.py
def encode_base64(
    self,
    media: tuple[npt.NDArray, int],
    *,
    audio_format: str = "WAV",
) -> str:
    """Write ``(samples, sample_rate)`` as audio and base64-encode it.

    Args:
        media: Pair of audio samples and sample rate, passed to
            ``soundfile.write`` in that order.
        audio_format: Forwarded to ``soundfile.write`` as ``format``.

    Returns:
        The written bytes, base64-encoded and decoded to a UTF-8 string.
    """
    samples, sample_rate = media

    # The audio is rendered into memory only; the bytes are encoded before
    # the buffer is closed on leaving the ``with`` block.
    with BytesIO() as sink:
        soundfile.write(sink, samples, sample_rate, format=audio_format)
        return base64.b64encode(sink.getvalue()).decode("utf-8")

load_base64

load_base64(
    media_type: str, data: str
) -> tuple[NDArray, float]
Source code in vllm/multimodal/media/audio.py
def load_base64(
    self,
    media_type: str,
    data: str,
) -> tuple[npt.NDArray, float]:
    """Decode base64 text and load the audio it contains.

    Args:
        media_type: Accepted but not used by this implementation.
        data: Base64 text, decoded with ``base64.b64decode`` defaults.

    Returns:
        Whatever ``self.load_bytes`` returns for the decoded bytes.
    """
    raw = base64.b64decode(data)
    return self.load_bytes(raw)

load_bytes

load_bytes(data: bytes) -> tuple[NDArray, float]
Source code in vllm/multimodal/media/audio.py
def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
    """Decode audio from raw bytes held in memory.

    Args:
        data: Encoded audio bytes, read through an in-memory buffer.

    Returns:
        The result of ``librosa.load`` called with ``sr=None``.
    """
    stream = BytesIO(data)
    return librosa.load(stream, sr=None)

load_file

load_file(filepath: Path) -> tuple[NDArray, float]
Source code in vllm/multimodal/media/audio.py
def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
    """Decode audio from the file at *filepath*.

    Args:
        filepath: Location of the audio file; passed straight to
            ``librosa.load``.

    Returns:
        The result of ``librosa.load`` called with ``sr=None``.
    """
    decoded = librosa.load(filepath, sr=None)
    return decoded