-
Notifications
You must be signed in to change notification settings - Fork 1
SS-1380 MLFlow improvements #365
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
56a5905
9a4c839
7928ca4
4e4f983
64a70d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,6 @@ | ||
| import json | ||
| from datetime import datetime | ||
| from typing import Any, Optional | ||
| from typing import Any, Optional, Tuple | ||
|
|
||
| import regex as re | ||
| import requests | ||
|
|
@@ -10,6 +10,7 @@ | |
| from django.db import transaction | ||
| from django.forms.models import model_to_dict | ||
| from django.utils import timezone | ||
| from prometheus_client.parser import text_string_to_metric_families | ||
|
|
||
| from apps.constants import AppActionOrigin, HandleUpdateStatusResponseCode | ||
| from apps.types_.subdomain import SubdomainCandidateName | ||
|
|
@@ -756,3 +757,35 @@ def get_university_suffix_information(university_sufffix: str) -> str: | |
| } | ||
|
|
||
| return UNIVERSITY_NAMES.get(university_sufffix, university_sufffix) | ||
|
|
||
|
|
||
| def get_minio_usage(minio_service_name: str) -> Optional[Tuple[float, float]]: | ||
| metrics_url = f"http://{minio_service_name}/minio/v2/metrics/cluster" | ||
|
|
||
| try: | ||
| response = requests.get(metrics_url, timeout=5) | ||
| response.raise_for_status() | ||
| raw_metrics = response.text | ||
| except Exception as e: | ||
| logger.error(f"MinIO metrics fetch failed for {metrics_url}: {e}") | ||
| return None | ||
|
|
||
| # Helper to extract metric values | ||
| def get_metric_value(metric_name: str) -> float: | ||
| total = 0.0 | ||
| for family in text_string_to_metric_families(raw_metrics): | ||
| if family.name == metric_name: | ||
| total += sum(float(sample.value) for sample in family.samples) | ||
| return total | ||
|
|
||
| GIB_FACTOR = 1024**3 # 1 GiB in bytes | ||
|
|
||
| try: | ||
| used_bytes = get_metric_value("minio_cluster_usage_total_bytes") | ||
| total_bytes = get_metric_value("minio_cluster_capacity_usable_total_bytes") | ||
| except Exception as e: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here as well, I'd say and have
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks. Added it. |
||
| logger.error(f"MinIO metrics parsing failed: {e}") | ||
| return None | ||
|
|
||
| # Convert to GiB and round | ||
| return (round(used_bytes / GIB_FACTOR, 2), round(total_bytes / GIB_FACTOR, 2)) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ class MLFlowInstance(BaseAppInstance): | |
| objects = MlflowAppManager() | ||
| ACCESS_TYPES = (("project", "Project"),) | ||
| access = models.CharField(max_length=20, default="project", choices=ACCESS_TYPES) | ||
| upload_size = 1000 # MB | ||
| mlflow_upload_size = 1000 # MB | ||
|
||
|
|
||
| def __init__(self, *args, **kwargs): | ||
| super().__init__(*args, **kwargs) | ||
|
|
@@ -30,7 +30,7 @@ def get_k8s_values(self): | |
| "enabled": True, | ||
| "ingressClassName": "nginx", | ||
| "hostname": self.url.split("://")[1] if self.url is not None else self.url, | ||
| "clientMaxBodySize": f"{self.upload_size}M", | ||
| "annotations": {"nginx.ingress.kubernetes.io/proxy-body-size": f"{self.mlflow_upload_size}M"}, | ||
| }, | ||
| "podLabels": { | ||
| "type": "app", | ||
|
|
@@ -42,15 +42,15 @@ def get_k8s_values(self): | |
| "pdb": {"create": False}, | ||
| # This fixes this issue: | ||
| # https://mlflow.org/docs/2.21.3/tracking/server#handling-timeout-when-uploadingdownloading-large-artifacts | ||
| "extraArgs": {'--gunicorn-opts="--timeout=360"'}, | ||
| "extraArgs": ['--gunicorn-opts="--timeout=360"'], | ||
| } | ||
| k8s_values["run"] = { | ||
| "resources": { | ||
| "requests": {"cpu": "1", "memory": "1Gi", "ephemeral-storage": "1Gi"}, | ||
| "limits": {"cpu": "8", "memory": "16Gi", "ephemeral-storage": "30Gi"}, | ||
| } | ||
| } | ||
| k8s_values["minio"] = {"pdb": {"create": False}} | ||
| k8s_values["minio"] = {"pdb": {"create": False}, "metrics": {"enabled": True}} | ||
| k8s_values["postgresql"] = { | ||
| "primary": { | ||
| "pdb": {"create": False}, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be nice to catch an explicit requests exception
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. Added it here.