# Source code for chariot.system_resources.resource

import decimal
import math
from dataclasses import dataclass

from chariot import _apis
from chariot_api._openapi.resources.exceptions import NotFoundException

from .quantity import parse_quantity

__all__ = [
    "get_available_nodes",
    "get_available_node_capacities",
    "get_node_gpu_type",
]


@dataclass
class NodeResources:
    cpu: float
    memory: str
    ephemeral_storage: str
    gpu_type: str


def convert_quantity_to_gi(q) -> str:
    """Format a k8s ``resource.Quantity`` value as whole gibibytes.

    The quantity is parsed with ``parse_quantity``, divided by 1024**3 and
    rounded down, then rendered with a ``Gi`` suffix (for example ``"15Gi"``)
    to match the values expected in training run configs.
    """
    bytes_per_gi = decimal.Decimal(1024**3)

    # Parse first so the division happens on the normalized numeric value.
    parsed = parse_quantity(q)
    whole_gi = math.floor(parsed / bytes_per_gi)
    return f"{whole_gi}Gi"


# TODO (nvogler) gpu should be merged into a resources
@_apis.login_required
def get_available_nodes() -> list:
    """Return the names of all nodes listed by the v2 nodes API."""
    # TODO (nvogler) determine what, if any, other data is useful here
    payload = _apis.resources.nodes_api.v2_nodes_get().to_dict()
    return [entry["name"] for entry in payload["data"]["nodes"]]
@_apis.login_required
def get_node_gpu_type(name: str) -> str | None:
    """Look up the GPU product for a node or a node group.

    ``name`` is first resolved as a node; its ``nvidia.com/gpu.product``
    label is returned (``None`` if the label is absent). If the nodes API
    raises ``NotFoundException``, ``name`` is resolved as a node group
    instead and the group's GPU product is returned, or ``None`` when the
    group has no GPU entry.
    """
    # name can belong to either a node or a node group
    try:
        node_payload = _apis.resources.nodes_api.v2_nodes_name_get(name).to_dict()
    except NotFoundException:
        # Not a node: fall back to the node-group endpoint.
        group = _apis.resources.gpus_api.v1_groups_name_get(name)
        gpu = group.data.gpu
        if gpu:
            return gpu.product
        return None

    labels = node_payload["data"]["node"]["labels"]
    return labels.get("nvidia.com/gpu.product")
def _has_allocatable_gpu(gpu_quantity) -> bool:
    """Return True when a reported GPU quantity is present and non-zero."""
    if gpu_quantity is None:
        # Key missing from the allocatable map: the node advertises no GPUs.
        return False
    if isinstance(gpu_quantity, str):
        # A quantity serialized as text must not count "0" (or blank) as a GPU.
        return gpu_quantity.strip() not in ("", "0")
    return gpu_quantity != 0


@_apis.login_required
def get_available_node_capacities() -> list:
    """Return the allocatable resources of every node as ``NodeResources``.

    For each entry returned by the v2 node capacities API:

    * ``cpu`` is the allocatable CPU quantity parsed to a float.
    * ``memory`` is the allocatable memory floored to whole Gi.
    * ``ephemeral_storage`` is passed through unchanged (``None`` if absent).
    * ``gpu_type`` is looked up via ``get_node_gpu_type`` only when the node
      reports a non-zero ``nvidia_comgpu`` quantity; otherwise it is ``""``.

    Returns:
        A list with one ``NodeResources`` per reported node.
    """
    response = _apis.resources.capacities_api.v2_nodes_capacities_get().to_dict()

    node_capacities = []
    for node in response["data"]["capacities"]:
        allocatable = node["allocatable"]

        # Previously a missing GPU entry (None != 0) was treated as "has GPU",
        # triggering a pointless lookup and yielding None instead of "".
        if _has_allocatable_gpu(allocatable.get("nvidia_comgpu")):
            gpu_type = get_node_gpu_type(node["name"])
        else:
            gpu_type = ""

        node_capacities.append(
            NodeResources(
                cpu=float(parse_quantity(allocatable["cpu"])),
                memory=convert_quantity_to_gi(allocatable["memory"]),
                ephemeral_storage=allocatable.get("ephemeral_storage"),
                gpu_type=gpu_type,
            )
        )
    return node_capacities