from dataclasses import dataclass
from datetime import datetime
from typing import Any, Literal, NotRequired, TypedDict, cast
from dateutil import parser
from deprecated import deprecated
from chariot import _apis
from chariot.models.enum import InferenceEngine
from chariot_api._openapi.models import (
OutputEnumerableIsvcSettingResponse,
)
__all__ = [
"GPUDict",
"InferenceServerSettingsDict",
"IsvcSetting",
"VLLMConfigurationDict",
"create_isvc_settings",
"get_inference_server_settings",
"get_isvc_settings",
"set_inference_server_settings",
]
class GPUDict(TypedDict):
"""The number and type of GPU's allocated to a model's inference server."""
product: str
count: int
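# Example (illustrative only; the ``product`` string below is an assumption and
# depends on the GPU types available in your cluster):
#
#     gpu: GPUDict = {"product": "NVIDIA-A100-SXM4-40GB", "count": 1}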
class VLLMConfigurationDict(TypedDict):
"""The configuration for the vLLM inference engine."""
bitsandbytes_4bit: NotRequired[bool]
enable_prefix_caching: NotRequired[bool]
max_model_length: NotRequired[int]
seed: NotRequired[int]
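# Example (illustrative only; the values below are assumptions, not recommended
# defaults):
#
#     vllm_config: VLLMConfigurationDict = {
#         "bitsandbytes_4bit": True,
#         "enable_prefix_caching": True,
#         "max_model_length": 4096,
#         "seed": 42,
#     }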
class InferenceServerSettingsDict(TypedDict):
"""Settings for a model's inference server."""
predictor_cpu: NotRequired[str]
"""Number of CPU cores allocated to the predictor. Must be a positive k8s quantity. Defaults to ``"1"``.
See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu for more detail."""
predictor_cpu_burstable: NotRequired[bool]
"""Whether the predictor can burst to using more CPU than requested. Defaults to False."""
predictor_memory: NotRequired[str]
"""Amount of memory allocated to the predictor. Must be a positive k8s quantity. Defaults to ``"4Gi"``.
See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory for more detail."""
predictor_ephemeral_storage: NotRequired[str | None]
"""Amount of ephemeral (disk) storage allocated to the predictor. Must be ``None`` or a positive k8s quantity. Defaults to ``None``.
If ``None``, no requests or limits are set.
See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#setting-requests-and-limits-for-local-ephemeral-storage for more detail."""
predictor_gpu: NotRequired[GPUDict | None]
"""Number and type of GPUs allocated to the predictor. Defaults to ``None``."""
predictor_min_replicas: NotRequired[int]
"""Minimum number of predictor replicas to scale down to. Must be ``>= 0`` and ``<= ReplicaLimit``. Defaults to ``0``."""
predictor_max_replicas: NotRequired[int]
"""Maximum number of predictor replicas to scale up to. Must be ``>= 1`` and ``<= ReplicaLimit``. Defaults to ``1``."""
predictor_scale_metric: NotRequired[Literal["concurrency", "rps"]]
"""Which metric to use for autoscaling the predictor. Defaults to ``"concurrency"``. Valid values:
- ``"concurrency"``: number of simultaneous requests to each replica.
- ``"rps"``: number of requests per second."""
predictor_scale_target: NotRequired[int]
"""Target value for autoscaling the predictor. Must be a positive integer. Defaults to ``5``."""
transformer_cpu: NotRequired[str]
"""Number of CPU cores allocated to the transformer. Must be a positive k8s quantity. Defaults to ``"1"``.
See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu for more detail."""
transformer_cpu_burstable: NotRequired[bool]
"""Whether the transformer can burst to using more CPU than requested. Defaults to False."""
transformer_memory: NotRequired[str]
"""Amount of memory allocated to the transformer. Must be a positive k8s quantity. Defaults to ``"2Gi"``.
See https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory for more detail."""
# NOTE(s.maddox): `transformer_gpu` is not applied to the pod by the backend, yet.
# transformer_gpu: NotRequired[GPUDict | None]
# """Number and type of GPUs allocated to the transformer. Defaults to ``None``."""
transformer_min_replicas: NotRequired[int]
"""Minimum number of transformer replicas to scale down to. Must be ``>= 0`` and ``<= ReplicaLimit``. Defaults to ``0``."""
transformer_max_replicas: NotRequired[int]
"""Maximum number of transformer replicas to scale up to. Must be ``>= 1`` and ``<= ReplicaLimit``. Defaults to ``1``."""
transformer_scale_metric: NotRequired[Literal["concurrency", "rps"]]
"""Which metric to use for autoscaling the transformer. Defaults to ``"concurrency"``.
- ``"concurrency"``: number of simultaneous requests to each replica.
- ``"rps"``: number of requests per second."""
transformer_scale_target: NotRequired[int]
"""Target value for autoscaling the transformer. Must be a positive integer. Defaults to ``20``."""
scale_down_delay_seconds: NotRequired[int]
"""The amount of time to wait after the scale metric falls below the scale target before\
scaling down if ``min_replicas`` has not been reached. Must be ``>= 0`` and ``<= 3600``. Defaults to ``600``."""
num_workers: NotRequired[int]
"""Number of workers to use for the predictor. Must be ``>= 1`` and ``<= 100``. Defaults to ``1``.
For artifact_type=Pytorch this value sets ``minWorkers``, ``maxWorkers``, ``default_workers_per_model``
to the specified value. See https://pytorch.org/serve/configuration.html for more detail.
For Chariot artifact types, sets the MLServer ``parallel_workers`` field to the specified value.
See https://mlserver.readthedocs.io/en/latest/user-guide/parallel-inference.html for more details."""
max_batch_size: NotRequired[int]
"""Maximum batch size for triggering a prediction. Must be ``> 0``. Defaults to ``1``."""
max_batch_delay_seconds: NotRequired[int | float]
"""Maximum batch delay in seconds for triggering a prediction. Must be ``>= 0``. Defaults to ``0``."""
inference_engine: NotRequired[InferenceEngine | None]
"""The inference engine to use. User selectable runtimes enable models to run under a different inference engine
than the artifact type. The model must have been converted to that runtime with models export first.
Passing nothing for this will result in running as the artifact type. Defaults to ``None``."""
huggingface_model_kwargs: NotRequired[dict | None]
"""Model keyword arguments to use for the Huggingface inference engine. Defaults to ``None``.
Only used when ``inference_engine="Huggingface"``."""
vllm_configuration: NotRequired[VLLMConfigurationDict | None]
"""The configuration for the vLLM inference engine. Defaults to ``None``.
Only used when ``inference_engine="vLLM"``."""
predictor_include_embedding_model: NotRequired[bool]
"""Whether to include the embedding model in the predictor. Defaults to ``False``."""
enable_inference_storage: NotRequired[bool]
"""Whether to store inferences. Defaults to ``False``."""
only_store_detections: NotRequired[bool]
"""Whether to store all inferences (``False``) or only inferences with detections (``True``). Defaults to ``False``. Ignored if ``enable_inference_storage`` is ``False``."""
positive_sampling_rate: NotRequired[float]
"""Rate at which inferences with classifications or detections will be stored. Must be ``>= 0`` and ``<= 1``. Defaults to ``0``.
A value of ``0.65`` means that there is a 65% chance that each inference with a classification or detection is stored."""
negative_sampling_rate: NotRequired[float]
"""Rate at which inferences without detections will be stored. Must be ``>= 0`` and ``<= 1``. Defaults to ``0``.
A value of ``0.65`` means that there is a 65% chance that each inference without a detection is stored."""
enable_metadata_extraction: NotRequired[bool]
"""Whether to enable metadata extraction when storing inferences. Defaults to ``False``."""
enable_data_storage: NotRequired[bool]
"""Whether to enable data (e.g. image) storage when storing inferences. Defaults to ``False``."""
enable_semantic_scoring: NotRequired[bool]
"""Whether to enable semantic scoring of each request. Defaults to ``False``.
Embeddings will automatically be computed if this is enabled."""
enable_ks_scoring: NotRequired[bool]
"""Whether to enable Kolmogorov–Smirnov scoring of each request. Defaults to ``False``.
Embeddings will automatically be computed if this is enabled."""
enable_cvm_scoring: NotRequired[bool]
"""Whether to enable Cramér–von Mises scoring of each request. Defaults to ``False``.
Embeddings will automatically be computed if this is enabled."""
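# Example (illustrative only; the resource quantities and GPU product name are
# assumptions and should be adjusted to your cluster and model):
#
#     settings: InferenceServerSettingsDict = {
#         "predictor_cpu": "2",
#         "predictor_memory": "8Gi",
#         "predictor_gpu": {"product": "NVIDIA-A100-SXM4-40GB", "count": 1},
#         "predictor_min_replicas": 0,
#         "predictor_max_replicas": 2,
#         "predictor_scale_metric": "concurrency",
#         "predictor_scale_target": 5,
#         "enable_inference_storage": True,
#         "positive_sampling_rate": 0.25,
#     }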
@_apis.login_required
def get_inference_server_settings(model_id: str) -> InferenceServerSettingsDict:
"""Get the current inference server settings for model ``model_id``.
Parameters
----------
model_id: str
The model to get inference server settings from.
Returns
-------
:class:`chariot.models.model.InferenceServerSettingsDict`
"""
resp = _apis.models.inference_server_settings_api.models_model_id_isvc_settings_get( # pyright: ignore [reportAttributeAccessIssue]
model_id
)
settings = cast(InferenceServerSettingsDict, {d.get("key"): d.get("value") for d in resp.data})
return settings
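# Usage sketch (assumes an authenticated session and an existing ``model_id``):
#
#     settings = get_inference_server_settings(model_id)
#     settings.get("predictor_cpu")   # e.g. "1"
#     settings.get("predictor_gpu")   # e.g. None, or {"product": ..., "count": ...}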
@_apis.login_required
def set_inference_server_settings(model_id: str, settings: InferenceServerSettingsDict):
"""Set inference server settings for model ``model_id``.
Parameters
----------
model_id: str
The model to set inference server settings on.
settings: :class:`chariot.models.model.InferenceServerSettingsDict`
Settings to apply to the inference server.
"""
_apis.models.inference_server_settings_api.models_model_id_isvc_settings_post( # pyright: ignore [reportAttributeAccessIssue]
model_id, [{"key": key, "value": value} for key, value in settings.items()]
)
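# Usage sketch (assumes an authenticated session; the dict is converted to the
# key/value payload shown above):
#
#     set_inference_server_settings(
#         model_id,
#         {"predictor_memory": "8Gi", "predictor_max_replicas": 2},
#     )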
## Deprecated classes and functions:
@dataclass
class IsvcSetting:
key: str
value: Any
user_id: str
since: datetime
until: datetime | None = None
def _dict_to_input_settings(settings_dict: dict[str, object]):
return [{"key": key, "value": value} for key, value in settings_dict.items()]
def _output_settings_to_list(
output_settings: OutputEnumerableIsvcSettingResponse,
) -> list[IsvcSetting]:
resp = [
IsvcSetting(
key=item.get("key", ""),
value=item.get("value"),
since=parser.parse(item.get("since")),
until=None if item.get("until") is None else parser.parse(item.get("until")),
user_id=item.get("user_id", ""),
)
for item in output_settings.data
]
return [r for r in resp if r.key != ""]
@_apis.login_required
@deprecated(
reason="The `get_isvc_settings` function is deprecated and will be removed in a future release. "
"Please use the `get_inference_server_settings` method on the `Model` object instead."
)
def get_isvc_settings(model_id: str, key: str | None = None) -> list[IsvcSetting]:
"""Get settings for the isvc of this model.
NOTE: This function is deprecated and will be removed in a future release.
Please use the :meth:`chariot.models.model.Model.get_inference_server_settings` method instead.
"""
return _output_settings_to_list(
_apis.models.inference_server_settings_api.models_model_id_isvc_settings_get(
model_id, key=key
)
)
@_apis.login_required
@deprecated(
reason="The `create_isvc_settings` property is deprecated and will be removed in a future release. "
"Please use the `set_inference_server_settings` method on the `Model` object instead."
)
def create_isvc_settings(model_id: str, settings: dict[str, Any]) -> list[IsvcSetting]:
"""Create settings for the isvc of this model.
NOTE: This function is deprecated and will be removed in a future release.
Please use the :meth:`chariot.models.model.Model.set_inference_server_settings` method instead.
"""
input_settings = _dict_to_input_settings(settings)
output_settings = _apis.models.inference_server_settings_api.models_model_id_isvc_settings_post(
model_id, input_settings
)
return _output_settings_to_list(output_settings)
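# Migration sketch (illustrative): replace the deprecated helpers with the
# functions defined above.
#
#     create_isvc_settings(model_id, {"predictor_cpu": "2"})            # deprecated
#     set_inference_server_settings(model_id, {"predictor_cpu": "2"})   # preferred
#
#     get_isvc_settings(model_id)              # deprecated
#     get_inference_server_settings(model_id)  # preferred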