Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion apps/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
from datetime import datetime
from typing import Any, Optional
from typing import Any, Optional, Tuple

import regex as re
import requests
Expand All @@ -10,6 +10,7 @@
from django.db import transaction
from django.forms.models import model_to_dict
from django.utils import timezone
from prometheus_client.parser import text_string_to_metric_families

from apps.constants import AppActionOrigin, HandleUpdateStatusResponseCode
from apps.types_.subdomain import SubdomainCandidateName
Expand Down Expand Up @@ -756,3 +757,42 @@ def get_university_suffix_information(university_sufffix: str) -> str:
}

return UNIVERSITY_NAMES.get(university_sufffix, university_sufffix)


def get_minio_usage(minio_service_name: str) -> Optional[Tuple[float, float]]:
metrics_url = f"http://{minio_service_name}/minio/v2/metrics/cluster"

try:
response = requests.get(metrics_url, timeout=5)
response.raise_for_status()
raw_metrics = response.text

except requests.RequestException as e:
logger.error(f"MinIO metrics url get request failed for {metrics_url}: {e}")
return None
except Exception as e:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to catch the explicit requests exception here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Added it here.

logger.error(f"MinIO metrics fetch failed for {metrics_url}: {e}")
return None

# Helper to extract metric values
def get_metric_value(metric_name: str) -> float:
    """Return the summed sample values of the metric family named *metric_name*.

    The families are parsed from ``raw_metrics`` in the enclosing scope using
    ``text_string_to_metric_families``. Every family whose ``name`` equals
    *metric_name* contributes the sum of its samples; if no family matches,
    the result is ``0.0``.
    """
    accumulated = 0.0
    for metric_family in text_string_to_metric_families(raw_metrics):
        # Guard clause: ignore every family except the requested one.
        if metric_family.name != metric_name:
            continue
        sample_values = [float(entry.value) for entry in metric_family.samples]
        accumulated += sum(sample_values)
    return accumulated

GIB_FACTOR = 1024**3 # 1 GiB in bytes

try:
used_bytes = get_metric_value("minio_cluster_usage_total_bytes")
total_bytes = get_metric_value("minio_cluster_capacity_usable_total_bytes")
except ValueError as e:
logger.error(f"MinIO metrics value parsing failed: {e}")
return None
except Exception as e:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here as well, I'd say: catch the specific exception first and keep Exception as a fallback except clause.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Added it.

logger.error(f"MinIO metrics parsing failed: {e}")
return None

# Convert to GiB and round
return (round(used_bytes / GIB_FACTOR, 2), round(total_bytes / GIB_FACTOR, 2))
12 changes: 8 additions & 4 deletions apps/models/app_types/mlflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,16 @@ class MLFlowInstance(BaseAppInstance):
objects = MlflowAppManager()
ACCESS_TYPES = (("project", "Project"),)
access = models.CharField(max_length=20, default="project", choices=ACCESS_TYPES)
upload_size = 1000 # MB

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def save(self, *args, **kwargs):
# Only for new instances
if not self.pk:
self.upload_size = 1000
super().save(*args, **kwargs)

def get_k8s_values(self):
k8s_values = super().get_k8s_values()
k8s_values["commonLabels"] = {
Expand All @@ -30,7 +35,7 @@ def get_k8s_values(self):
"enabled": True,
"ingressClassName": "nginx",
"hostname": self.url.split("://")[1] if self.url is not None else self.url,
"clientMaxBodySize": f"{self.upload_size}M",
"annotations": {"nginx.ingress.kubernetes.io/proxy-body-size": f"{self.upload_size}M"},
},
"podLabels": {
"type": "app",
Expand All @@ -42,7 +47,6 @@ def get_k8s_values(self):
"pdb": {"create": False},
# This fixes this issue:
# https://mlflow.org/docs/2.21.3/tracking/server#handling-timeout-when-uploadingdownloading-large-artifacts
# bug fix SS-1481, 'TypeError: Object of type set is not JSON serializable', changing set to list
"extraArgs": ['--gunicorn-opts="--timeout=360"'],
}
k8s_values["run"] = {
Expand All @@ -51,7 +55,7 @@ def get_k8s_values(self):
"limits": {"cpu": "8", "memory": "16Gi", "ephemeral-storage": "30Gi"},
}
}
k8s_values["minio"] = {"pdb": {"create": False}}
k8s_values["minio"] = {"pdb": {"create": False}, "metrics": {"enabled": True}}
k8s_values["postgresql"] = {
"primary": {
"pdb": {"create": False},
Expand Down
18 changes: 17 additions & 1 deletion apps/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .helpers import (
create_instance_from_form,
generate_schema_org_compliant_app_metadata,
get_minio_usage,
)
from .models import BaseAppInstance
from .tasks import delete_resource
Expand Down Expand Up @@ -362,7 +363,22 @@ def get(self, request, project, app_slug, app_id):
).stdout
password = base64.b64decode(password).decode()

context = {"mlflow_username": username, "mlflow_password": password, "mlflow_url": instance.url}
minio_used_gib = minio_total_gib = minio_remaining_gib = None
if instance.get_app_status() == "Running":
result = get_minio_usage(f"{subdomain.subdomain}-minio")
if result is not None:
minio_used_gib, minio_total_gib = result
minio_remaining_gib = minio_total_gib - minio_used_gib

context = {
"mlflow_username": username,
"mlflow_password": password,
"mlflow_url": instance.url,
"minio_used_gib": minio_used_gib,
"minio_total_gib": minio_total_gib,
"minio_remaining_gib": minio_remaining_gib,
}

return render(request, self.template, context)


Expand Down
2 changes: 1 addition & 1 deletion fixtures/apps_fixtures.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
{
"fields": {
"category": "develop",
"chart": "oci://registry-1.docker.io/bitnamicharts/mlflow:2.5.2",
"chart": "oci://registry-1.docker.io/bitnamicharts/mlflow:5.0.4",
"created_on": "2025-02-12T21:34:37.815Z",
"description": "",
"name": "MLFlow",
Expand Down
31 changes: 31 additions & 0 deletions templates/apps/secrets_view.html
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,37 @@ <h4 class="h5 mt-4">How to Use These Credentials</h4>
href="https://mlflow.org/docs/latest/index.html" target="_blank">MLflow Documentation</a>.
</li>
</ol>

<hr>

<h1 class="h3 mb-3 card-title">Your MLflow Storage Details (in Gigabytes)</h1>

<p class="card-text">Remember to refresh this page later for the most up-to-date information.</p>
<p class="card-text">Retrieving storage metrics from the MLflow server can introduce some latency.</p>


<!-- Storage information Section -->
<div class="mb-3">
<label class="form-label">Total</label>
<div class="d-flex align-items-center gap-2">
<code class="p-2 bg-light rounded flex-grow-1" id="minio-total">{{ minio_total_gib }} GB</code>
</div>
</div>

<div class="mb-3">
<label class="form-label">Used</label>
<div class="d-flex align-items-center gap-2">
<code class="p-2 bg-light rounded flex-grow-1" id="minio-used">{{ minio_used_gib }} GB</code>
</div>
</div>

<div class="mb-3">
<label class="form-label">Remaining</label>
<div class="d-flex align-items-center gap-2">
<code class="p-2 bg-light rounded flex-grow-1" id="minio-remaining">{{ minio_remaining_gib }} GB</code>
</div>
</div>

</div>
</div>
</div>
Expand Down