"""Classifier discovery service.
Discovers available classifiers from the local filesystem (assets/models/).
Models are baked into the Docker image at build time from the GitLab Model
Registry — no runtime registry access is needed.
Usage:
    from ai4drpm.services.shared.classifier_discovery import get_classifier_discovery_service

    service = get_classifier_discovery_service()

    # List all available classifiers
    classifiers = service.discover_all()

    # Check if a specific classifier exists
    if service.exists("data_classifier"):
        info = service.get("data_classifier")
"""
import logging
import os
from dataclasses import dataclass
from typing import Optional
logger = logging.getLogger(__name__)
[docs]
@dataclass
class DiscoveredClassifier:
"""A classifier discovered from the local filesystem.
Attributes:
name: Classifier name (e.g., "data_classifier")
source: Discovery source - always "local"
latest_version: Always None (version info not available from filesystem)
"""
name: str
source: str # "local"
latest_version: str | None
[docs]
class ClassifierDiscoveryService:
    """Discovers available classifiers from the local filesystem.

    Scans the model directory (``assets/models`` by default) for model
    files. Models are baked into the Docker image at build time from the
    GitLab Model Registry.

    Supports two directory layouts:

    1. Subdirectory: {model_dir}/{name}/model.joblib + vectorizer.joblib
    2. Flat (legacy): {model_dir}/{name}_model.joblib + {name}_vectorizer.joblib

    A classifier is reported only when *both* artifacts exist as regular
    files. Classifier names are single path components: empty names, "."
    and "..", and names containing a path separator are never reported, so
    ``get()``/``exists()`` always agree with ``discover_all()``.
    """

    # Artifact names for the subdirectory layout.
    _MODEL_FILE = "model.joblib"
    _VECTORIZER_FILE = "vectorizer.joblib"
    # Filename suffixes for the flat (legacy) layout.
    _FLAT_MODEL_SUFFIX = "_model.joblib"
    _FLAT_VECTORIZER_SUFFIX = "_vectorizer.joblib"

    def __init__(self, model_dir: str = "assets/models") -> None:
        """Initialize the discovery service.

        Args:
            model_dir: Directory containing local model files
                (default: assets/models)
        """
        self.model_dir = model_dir

    def discover_all(self) -> list[DiscoveredClassifier]:
        """List all available classifiers from the local filesystem.

        Returns:
            List of DiscoveredClassifier objects sorted by name
        """
        return sorted(self._discover_from_filesystem(), key=lambda c: c.name)

    def exists(self, classifier_name: str) -> bool:
        """Check if a classifier exists locally.

        Args:
            classifier_name: Name of the classifier to check

        Returns:
            True if the classifier exists, False otherwise
        """
        return self.get(classifier_name) is not None

    def get(self, classifier_name: str) -> DiscoveredClassifier | None:
        """Get a single classifier's discovery info.

        Args:
            classifier_name: Name of the classifier to look up

        Returns:
            DiscoveredClassifier if found, None otherwise
        """
        return self._get_from_filesystem(classifier_name)

    @staticmethod
    def _is_valid_name(name: str) -> bool:
        """Return True if *name* is a single, non-special path component."""
        # basename() differs from the input as soon as it contains a path
        # separator (or a drive prefix on Windows), which would let a lookup
        # resolve outside the intended {model_dir}/{name} location.
        return name not in ("", ".", "..") and os.path.basename(name) == name

    @staticmethod
    def _local(name: str) -> DiscoveredClassifier:
        """Build the discovery record for a locally found classifier."""
        return DiscoveredClassifier(name=name, source="local", latest_version=None)

    def _has_subdir_layout(self, name: str) -> bool:
        """Return True if {model_dir}/{name}/ holds both artifact files."""
        subdir = os.path.join(self.model_dir, name)
        # isfile() rather than exists(): a directory that merely carries an
        # artifact's name is not a usable model.
        return all(
            os.path.isfile(os.path.join(subdir, artifact))
            for artifact in (self._MODEL_FILE, self._VECTORIZER_FILE)
        )

    def _has_flat_layout(self, name: str) -> bool:
        """Return True if both {name}_*.joblib files sit directly in model_dir."""
        return all(
            os.path.isfile(os.path.join(self.model_dir, name + suffix))
            for suffix in (self._FLAT_MODEL_SUFFIX, self._FLAT_VECTORIZER_SUFFIX)
        )

    def _discover_from_filesystem(self) -> list[DiscoveredClassifier]:
        """Discover classifiers from local filesystem.

        Scans the model directory for classifiers in two layouts:

        1. Subdirectory layout: {model_dir}/{name}/model.joblib + vectorizer.joblib
        2. Flat layout (legacy): {model_dir}/{name}_model.joblib + {name}_vectorizer.joblib

        Discovery is best-effort: any error is logged (with traceback) and
        whatever was collected up to that point is returned.

        Returns:
            List of DiscoveredClassifier objects from filesystem
        """
        classifiers: list[DiscoveredClassifier] = []
        try:
            if not os.path.exists(self.model_dir):
                logger.debug("Model directory does not exist: %s", self.model_dir)
                return classifiers

            # List the directory once so both passes see the same snapshot.
            entries = os.listdir(self.model_dir)
            discovered_names: set[str] = set()

            # 1. Subdirectory layout: {name}/model.joblib + {name}/vectorizer.joblib
            for entry in entries:
                if self._has_subdir_layout(entry):
                    classifiers.append(self._local(entry))
                    discovered_names.add(entry)
                    logger.debug("Discovered local classifier (subdir): %s", entry)

            # 2. Flat layout (legacy): {name}_model.joblib + {name}_vectorizer.joblib
            for entry in entries:
                if not entry.endswith(self._FLAT_MODEL_SUFFIX):
                    continue
                # Strip the suffix exactly once; str.replace() would also
                # remove earlier occurrences and yield a wrong name.
                name = entry.removesuffix(self._FLAT_MODEL_SUFFIX)
                # Skip the bare "_model.joblib" file (empty name) and names
                # already found via the subdirectory layout.
                if not name or name in discovered_names:
                    continue
                if self._has_flat_layout(name):
                    classifiers.append(self._local(name))
                    discovered_names.add(name)
                    logger.debug("Discovered local classifier (flat): %s", name)

            if classifiers:
                logger.info(
                    "Discovered %d classifiers from local filesystem",
                    len(classifiers),
                )
        except Exception as exc:
            # Deliberately broad: discovery must never crash its caller.
            logger.exception(
                "Failed to discover classifiers from filesystem: %s", exc
            )
        return classifiers

    def _get_from_filesystem(self, classifier_name: str) -> DiscoveredClassifier | None:
        """Get a specific classifier from the local filesystem.

        Checks both subdirectory layout and flat layout. Names that are not
        a single path component (empty, ".", "..", or containing a path
        separator) are treated as not found.

        Args:
            classifier_name: Name of the classifier

        Returns:
            DiscoveredClassifier if found locally, None otherwise
        """
        try:
            if not self._is_valid_name(classifier_name):
                return None
            # 1. Subdirectory layout, then 2. flat (legacy) layout.
            if self._has_subdir_layout(classifier_name) or self._has_flat_layout(
                classifier_name
            ):
                return self._local(classifier_name)
        except Exception as exc:
            # Deliberately broad: a lookup must never crash its caller.
            logger.exception(
                "Failed to check local classifier %s: %s", classifier_name, exc
            )
        return None
# =============================================================================
# Singleton Instance
# =============================================================================
# Module-level cache for the shared service instance. It stays None until
# get_classifier_discovery_service() creates the instance on first use, and
# reset_classifier_discovery_service() sets it back to None.
_discovery_service: Optional[ClassifierDiscoveryService] = None
[docs]
def get_classifier_discovery_service() -> ClassifierDiscoveryService:
    """Return the shared ClassifierDiscoveryService, creating it on first use.

    Returns:
        The module-wide ClassifierDiscoveryService instance; repeated calls
        return the same object until the singleton is reset.
    """
    global _discovery_service
    service = _discovery_service
    if service is None:
        # First request since import or reset: build with default settings
        # and remember it for subsequent callers.
        service = ClassifierDiscoveryService()
        _discovery_service = service
    return service
[docs]
def reset_classifier_discovery_service() -> None:
    """Drop the cached discovery service singleton (for testing).

    After this call, the next get_classifier_discovery_service() call
    constructs a fresh instance.
    """
    global _discovery_service
    # Clearing the module-level reference is all that is needed; the getter
    # re-creates the service lazily.
    _discovery_service = None