""" HITL-KG Configuration Module Centralized configuration for the entire system. """ import os from dataclasses import dataclass, field from typing import Dict, List, Optional, Any from pathlib import Path import yaml import logging logger = logging.getLogger(__name__) @dataclass class EmbeddingConfig: """Configuration for embedding-based search.""" model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" cache_dir: str = "./data/embeddings" dimension: int = 384 batch_size: int = 32 device: str = "cpu" # "cpu", "cuda", "mps" @dataclass class DatasetConfig: """Configuration for a single dataset.""" name: str source_type: str # "obo", "owl", "csv", "json" source_url: Optional[str] = None source_path: Optional[str] = None entity_category: str = "entity" cache_enabled: bool = True cache_max_age_days: int = 7 @dataclass class LLMConfig: """Configuration for LLM providers.""" provider: str = "local" # "openai", "local" model: str = "gpt-4o-mini" temperature: float = 0.7 max_tokens: int = 2048 api_key: Optional[str] = None @dataclass class AppConfig: """Main application configuration.""" # Paths data_dir: Path = field(default_factory=lambda: Path("./data")) cache_dir: Path = field(default_factory=lambda: Path("./data/cache")) sessions_dir: Path = field(default_factory=lambda: Path("./data/sessions")) # Server host: str = "0.0.0.0" port: int = 7860 debug: bool = False # Sessions session_max_age_hours: int = 24 session_cleanup_interval_minutes: int = 5 max_sessions: int = 1000 # Embeddings embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig) # LLM llm: LLMConfig = field(default_factory=LLMConfig) # Datasets datasets: List[DatasetConfig] = field(default_factory=list) # UI default_language: str = "en" supported_languages: List[str] = field( default_factory=lambda: ["en", "uk", "ru", "es", "de", "fr"] ) def __post_init__(self): """Ensure directories exist.""" for path in [self.data_dir, self.cache_dir, self.sessions_dir]: path.mkdir(parents=True, exist_ok=True) # Default medical datasets configuration DEFAULT_MEDICAL_DATASETS = [ DatasetConfig( name="disease_ontology", source_type="obo", source_url="https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/doid.obo", entity_category="disease", ), DatasetConfig( name="symptom_ontology", source_type="obo", source_url="https://raw.githubusercontent.com/DiseaseOntology/SymptomOntology/main/symp.obo", entity_category="symptom", ), ] def load_config(config_path: Optional[str] = None) -> AppConfig: """ Load configuration from file or use defaults. Priority: 1. Explicit config file path 2. Environment variable HITL_KG_CONFIG 3. ./config.yaml if exists 4. Default configuration """ config_file = None if config_path: config_file = Path(config_path) elif os.environ.get("HITL_KG_CONFIG"): config_file = Path(os.environ["HITL_KG_CONFIG"]) elif Path("./config.yaml").exists(): config_file = Path("./config.yaml") if config_file and config_file.exists(): try: with open(config_file) as f: data = yaml.safe_load(f) return _parse_config(data) except Exception as e: logger.warning(f"Failed to load config from {config_file}: {e}") # Return default config with medical datasets config = AppConfig() config.datasets = DEFAULT_MEDICAL_DATASETS # Override from environment if os.environ.get("OPENAI_API_KEY"): config.llm.api_key = os.environ["OPENAI_API_KEY"] config.llm.provider = "openai" if os.environ.get("PORT"): config.port = int(os.environ["PORT"]) return config def _parse_config(data: Dict[str, Any]) -> AppConfig: """Parse configuration dictionary into AppConfig.""" config = AppConfig() # Parse simple fields for key in ["host", "port", "debug", "default_language"]: if key in data: setattr(config, key, data[key]) # Parse paths for key in ["data_dir", "cache_dir", "sessions_dir"]: if key in data: setattr(config, key, Path(data[key])) # Parse embedding config if "embedding" in data: config.embedding = EmbeddingConfig(**data["embedding"]) # Parse LLM config if "llm" in data: config.llm = LLMConfig(**data["llm"]) # Parse datasets if "datasets" in data: config.datasets = [DatasetConfig(**ds) for ds in data["datasets"]] else: config.datasets = DEFAULT_MEDICAL_DATASETS return config # Global config instance (lazy loaded) _config: Optional[AppConfig] = None def get_config() -> AppConfig: """Get the global configuration instance.""" global _config if _config is None: _config = load_config() return _config