import decimal
import math
from dataclasses import dataclass
from chariot import _apis
from chariot_api._openapi.resources.exceptions import NotFoundException
from .quantity import parse_quantity
__all__ = [
"get_available_nodes",
"get_available_node_capacities",
"get_node_gpu_type",
]
@dataclass
class NodeResources:
cpu: float
memory: str
ephemeral_storage: str
gpu_type: str
def convert_quantity_to_gi(q) -> str:
    """Render a k8s resource.Quantity as a whole-number ``Gi`` string.

    The quantity is parsed with ``parse_quantity``, divided by 1024**3 and
    rounded down, matching the values expected in training run configs.
    """
    # 1024**3 is exactly representable, so this equals Decimal(math.pow(1024, 3)).
    gi_divisor = decimal.Decimal(1024 ** 3)
    whole_gi = math.floor(parse_quantity(q) / gi_divisor)
    return f"{whole_gi}Gi"
# TODO (nvogler) gpu should be merged into a resources
[docs]
@_apis.login_required
def get_available_nodes() -> list:
    """Return the ``name`` of every node listed by the v2 nodes endpoint."""
    # TODO (nvogler) determine what, if any, other data is useful here
    nodes = _apis.resources.nodes_api.v2_nodes_get().to_dict()["data"]["nodes"]
    return [entry["name"] for entry in nodes]
[docs]
@_apis.login_required
def get_node_gpu_type(name: str) -> str | None:
    """Look up the GPU product for a node or a node group.

    ``name`` is first tried as a node name, and the product is read from the
    node's ``nvidia.com/gpu.product`` label. If that lookup raises
    ``NotFoundException``, ``name`` is looked up as a node group instead.

    Returns:
        The GPU product string, or ``None`` when the node has no such label
        or the group has no GPU entry.
    """
    try:
        node_info = _apis.resources.nodes_api.v2_nodes_name_get(name).to_dict()
        labels = node_info["data"]["node"]["labels"]
        return labels.get("nvidia.com/gpu.product")
    except NotFoundException:
        # Not a node: fall back to the node-group endpoint.
        group_gpu = _apis.resources.gpus_api.v1_groups_name_get(name).data.gpu
        if not group_gpu:
            return None
        return group_gpu.product
[docs]
@_apis.login_required
def get_available_node_capacities() -> list[NodeResources]:
    """Return the allocatable resources of every node with reported capacity.

    Fetches the v2 node capacities and converts each entry's ``allocatable``
    section into a ``NodeResources``. The GPU type is looked up with
    ``get_node_gpu_type`` only for nodes that report a non-zero GPU count;
    all other nodes get ``gpu_type=""``.

    Returns:
        One ``NodeResources`` per capacity entry, in API order.

    Raises:
        KeyError: if an entry lacks ``allocatable``, ``cpu`` or ``memory``.
    """
    response = _apis.resources.capacities_api.v2_nodes_capacities_get().to_dict()
    node_capacities = []
    for node in response["data"]["capacities"]:
        allocatable = node["allocatable"]

        # A missing/None count, 0 and the string "0" all mean "no GPU".
        # A plain `!= 0` check is true for None and "0" and would trigger a
        # needless per-node API lookup for GPU-less nodes.
        # NOTE(review): the wire type of this count (int vs. quantity string)
        # is not visible here -- both are handled; confirm against the API.
        gpu_count = allocatable.get("nvidia_comgpu")
        has_gpu = bool(gpu_count) and str(gpu_count) != "0"
        gpu_type = get_node_gpu_type(node["name"]) if has_gpu else ""

        node_capacities.append(
            NodeResources(
                cpu=float(parse_quantity(allocatable["cpu"])),
                memory=convert_quantity_to_gi(allocatable["memory"]),
                ephemeral_storage=allocatable.get("ephemeral_storage"),
                gpu_type=gpu_type,
            )
        )
    return node_capacities