Source code for ai4drpm.services.shared.classifier_discovery

"""Classifier discovery service.

Discovers available classifiers from the local filesystem (assets/models/).
Models are baked into the Docker image at build time from the GitLab Model
Registry — no runtime registry access is needed.

Usage:
    from ai4drpm.services.shared.classifier_discovery import get_classifier_discovery_service
    
    service = get_classifier_discovery_service()
    
    # List all available classifiers
    classifiers = service.discover_all()
    
    # Check if a specific classifier exists
    if service.exists("data_classifier"):
        info = service.get("data_classifier")
"""

import logging
import os
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


[docs] @dataclass class DiscoveredClassifier: """A classifier discovered from the local filesystem. Attributes: name: Classifier name (e.g., "data_classifier") source: Discovery source - always "local" latest_version: Always None (version info not available from filesystem) """ name: str source: str # "local" latest_version: str | None
class ClassifierDiscoveryService:
    """Discovers available classifiers from the local filesystem.

    Scans the model directory (default: assets/models/) for model files.
    Models are baked into the Docker image at build time from the GitLab
    Model Registry.

    Supports two directory layouts:
        1. Subdirectory: {model_dir}/{name}/model.joblib + vectorizer.joblib
        2. Flat (legacy): {model_dir}/{name}_model.joblib +
           {name}_vectorizer.joblib

    A classifier only counts as present when BOTH its model file and its
    vectorizer file exist.
    """

    # File names used by the subdirectory layout.
    _MODEL_FILE = "model.joblib"
    _VECTORIZER_FILE = "vectorizer.joblib"
    # Suffixes used by the flat (legacy) layout.
    _FLAT_MODEL_SUFFIX = "_model.joblib"
    _FLAT_VECTORIZER_SUFFIX = "_vectorizer.joblib"

    def __init__(self, model_dir: str = "assets/models") -> None:
        """Initialize the discovery service.

        Args:
            model_dir: Directory containing local model files
                (default: assets/models)
        """
        self.model_dir = model_dir

    def discover_all(self) -> list[DiscoveredClassifier]:
        """List all available classifiers from the local filesystem.

        Returns:
            List of DiscoveredClassifier objects sorted by name
        """
        classifiers = self._discover_from_filesystem()
        return sorted(classifiers, key=lambda c: c.name)

    def exists(self, classifier_name: str) -> bool:
        """Check if a classifier exists locally.

        Args:
            classifier_name: Name of the classifier to check

        Returns:
            True if the classifier exists, False otherwise
        """
        return self.get(classifier_name) is not None

    def get(self, classifier_name: str) -> DiscoveredClassifier | None:
        """Get a single classifier's discovery info.

        Args:
            classifier_name: Name of the classifier to look up

        Returns:
            DiscoveredClassifier if found, None otherwise (including when
            the name is not a plain, single-component name)
        """
        return self._get_from_filesystem(classifier_name)

    @staticmethod
    def _is_valid_name(classifier_name: str) -> bool:
        """Return True if the name is a single, non-empty path component.

        Rejects "", ".", ".." and anything containing a path separator or
        drive prefix, so a lookup can never resolve outside the model
        directory and can only match names that directory scanning could
        also produce.
        """
        if classifier_name in ("", ".", ".."):
            return False
        return os.path.basename(classifier_name) == classifier_name

    def _has_subdir_layout(self, classifier_name: str) -> bool:
        """Check for {model_dir}/{name}/model.joblib + vectorizer.joblib."""
        subdir_path = os.path.join(self.model_dir, classifier_name)
        if not os.path.isdir(subdir_path):
            return False
        model_path = os.path.join(subdir_path, self._MODEL_FILE)
        vectorizer_path = os.path.join(subdir_path, self._VECTORIZER_FILE)
        return os.path.exists(model_path) and os.path.exists(vectorizer_path)

    def _has_flat_layout(self, classifier_name: str) -> bool:
        """Check for {name}_model.joblib + {name}_vectorizer.joblib."""
        model_path = os.path.join(
            self.model_dir, f"{classifier_name}{self._FLAT_MODEL_SUFFIX}"
        )
        vectorizer_path = os.path.join(
            self.model_dir, f"{classifier_name}{self._FLAT_VECTORIZER_SUFFIX}"
        )
        return os.path.exists(model_path) and os.path.exists(vectorizer_path)

    def _discover_from_filesystem(self) -> list[DiscoveredClassifier]:
        """Discover classifiers from local filesystem.

        Scans the model directory for classifiers in two layouts:
            1. Subdirectory layout: {model_dir}/{name}/model.joblib +
               vectorizer.joblib
            2. Flat layout (legacy): {model_dir}/{name}_model.joblib +
               {name}_vectorizer.joblib

        A name found in the subdirectory layout is not reported a second
        time from the flat layout. Errors are logged rather than raised;
        whatever was discovered before the error is still returned.

        Returns:
            List of DiscoveredClassifier objects from filesystem
        """
        classifiers: list[DiscoveredClassifier] = []

        try:
            if not os.path.exists(self.model_dir):
                logger.debug(f"Model directory does not exist: {self.model_dir}")
                return classifiers

            # List the directory once and reuse it for both layouts.
            entries = os.listdir(self.model_dir)
            discovered_names: set[str] = set()

            # 1. Subdirectory layout: {name}/model.joblib + {name}/vectorizer.joblib
            for entry in entries:
                if self._has_subdir_layout(entry):
                    classifiers.append(DiscoveredClassifier(
                        name=entry,
                        source="local",
                        latest_version=None,
                    ))
                    discovered_names.add(entry)
                    logger.debug(f"Discovered local classifier (subdir): {entry}")

            # 2. Flat layout (legacy): {name}_model.joblib + {name}_vectorizer.joblib
            for filename in entries:
                if not filename.endswith(self._FLAT_MODEL_SUFFIX):
                    continue

                # Strip only the trailing suffix; str.replace would also
                # remove an earlier occurrence inside the name itself.
                classifier_name = filename.removesuffix(self._FLAT_MODEL_SUFFIX)

                # Skip if already discovered from subdirectory
                if classifier_name in discovered_names:
                    continue

                # Skip degenerate names (e.g. a bare "_model.joblib" file)
                # that a lookup by name would refuse as well.
                if not self._is_valid_name(classifier_name):
                    continue

                if self._has_flat_layout(classifier_name):
                    classifiers.append(DiscoveredClassifier(
                        name=classifier_name,
                        source="local",
                        latest_version=None,
                    ))
                    discovered_names.add(classifier_name)
                    logger.debug(f"Discovered local classifier (flat): {classifier_name}")

            if classifiers:
                logger.info(
                    f"Discovered {len(classifiers)} classifiers from local filesystem"
                )

        except Exception as e:
            # Best-effort discovery: report the problem and return what we have.
            logger.error(f"Failed to discover classifiers from filesystem: {e}")

        return classifiers

    def _get_from_filesystem(self, classifier_name: str) -> DiscoveredClassifier | None:
        """Get a specific classifier from the local filesystem.

        Checks both subdirectory layout and flat layout. Names that are not
        a single path component (empty, ".", "..", or containing a path
        separator) are rejected, so the lookup cannot leave the model
        directory.

        Args:
            classifier_name: Name of the classifier

        Returns:
            DiscoveredClassifier if found locally, None otherwise
        """
        try:
            if not self._is_valid_name(classifier_name):
                logger.debug(f"Rejected invalid classifier name: {classifier_name!r}")
                return None

            # 1. Subdirectory layout, then 2. flat layout (legacy).
            if self._has_subdir_layout(classifier_name) or self._has_flat_layout(
                classifier_name
            ):
                return DiscoveredClassifier(
                    name=classifier_name,
                    source="local",
                    latest_version=None,
                )

        except Exception as e:
            # Best-effort lookup: a failed check is reported as "not found".
            logger.error(f"Failed to check local classifier {classifier_name}: {e}")

        return None
# ============================================================================= # Singleton Instance # ============================================================================= _discovery_service: Optional[ClassifierDiscoveryService] = None
def get_classifier_discovery_service() -> ClassifierDiscoveryService:
    """Return the shared classifier discovery service, creating it on first use.

    Returns:
        The module-wide ClassifierDiscoveryService instance. Repeated calls
        return the same object until the singleton is reset.
    """
    global _discovery_service

    # Already built: hand back the cached instance.
    if _discovery_service is not None:
        return _discovery_service

    # First call (or first call after a reset): build with default settings.
    _discovery_service = ClassifierDiscoveryService()
    return _discovery_service
def reset_classifier_discovery_service() -> None:
    """Drop the cached discovery service singleton (intended for tests).

    After this call the module-level reference is None, so the next call to
    get_classifier_discovery_service() constructs a fresh instance.
    """
    global _discovery_service
    _discovery_service = None