# Source code for chariot.system_resources.resource

import decimal
import math
from dataclasses import dataclass

from chariot import _apis
from chariot_api._openapi.resources.exceptions import NotFoundException

from .quantity import parse_quantity

__all__ = [
    "get_available_nodes",
    "get_available_node_capacities",
    "get_node_gpu_type",
]


@dataclass
class NodeResources:
    cpu: float
    memory: str
    ephemeral_storage: str
    gpu_type: str


def convert_quantity_to_gi(q) -> str:
    """Render a k8s resource.Quantity value as a whole number of ``Gi``.

    The value is parsed with ``parse_quantity``, divided by 1024**3 and
    rounded down, then suffixed with ``"Gi"`` -- the form expected by
    training run configs.
    """
    bytes_per_gi = decimal.Decimal(1024**3)
    whole_gi = math.floor(parse_quantity(q) / bytes_per_gi)
    return f"{whole_gi}Gi"


# TODO (nvogler) gpu should be merged into a resources

# [docs]
@_apis.login_required
def get_available_nodes() -> list:
    """Return the names of all nodes listed by the v2 nodes API.

    Only the ``name`` of each entry under ``data.nodes`` in the response is
    kept; every other field is discarded.
    """
    # TODO (nvogler) determine what, if any, other data is useful here
    nodes = _apis.resources.nodes_api.v2_nodes_get().to_dict()["data"]["nodes"]
    return [entry["name"] for entry in nodes]




# [docs]
@_apis.login_required
def get_node_gpu_type(name: str) -> str | None:
    """Look up the GPU product for a node or, failing that, a node group.

    ``name`` is first tried as a node name: the value of the node's
    ``nvidia.com/gpu.product`` label is returned (``None`` if the label is
    absent). If the nodes API raises ``NotFoundException``, ``name`` is
    retried as a node group and the group's GPU product is returned, or
    ``None`` when the group has no GPU entry.
    """
    try:
        node = _apis.resources.nodes_api.v2_nodes_name_get(name).to_dict()["data"]["node"]
        return node["labels"].get("nvidia.com/gpu.product")
    except NotFoundException:
        # Not a node -- fall back to treating the name as a node group.
        group = _apis.resources.gpus_api.v1_groups_name_get(name)
        if not group.data.gpu:
            return None
        return group.data.gpu.product




# [docs]
@_apis.login_required
def get_available_node_capacities() -> list[NodeResources]:
    """Return the allocatable resources of every node in the capacities API.

    One ``NodeResources`` is built per entry of ``data.capacities`` in the
    v2 node-capacities response:

    * ``cpu`` is the parsed quantity as a float,
    * ``memory`` is floored to whole ``Gi``,
    * ``ephemeral_storage`` is passed through as reported (``None`` if absent),
    * ``gpu_type`` is ``""`` for nodes without allocatable GPUs, otherwise
      the result of ``get_node_gpu_type`` (which may be ``None``).

    Returns:
        The per-node resources, in the order the API returned the nodes.
    """
    response = _apis.resources.capacities_api.v2_nodes_capacities_get().to_dict()

    node_capacities = []
    for node in response["data"]["capacities"]:
        allocatable = node["allocatable"]

        # A plain `!= 0` check is also true for a missing/None count and for
        # the string "0", which would trigger a pointless GPU lookup for
        # nodes that have no GPUs. Treat all of those as "no GPU".
        gpu_count = allocatable.get("nvidia_comgpu")
        if gpu_count in (None, 0, "0"):
            gpu_type = ""
        else:
            # One additional API request per GPU node.
            gpu_type = get_node_gpu_type(node["name"])

        node_capacities.append(
            NodeResources(
                cpu=float(parse_quantity(allocatable["cpu"])),
                memory=convert_quantity_to_gi(allocatable["memory"]),
                # Passed through unconverted, unlike memory.
                ephemeral_storage=allocatable.get("ephemeral_storage"),
                gpu_type=gpu_type,
            )
        )

    return node_capacities