midah committed · Commit 4fac556 · 1 Parent(s): e904fd3

Apply clean grayscale design, remove all emojis


Remove gradient backgrounds, purple theme, and emojis throughout UI

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. backend/README.md +0 -24
  2. backend/api/dependencies.py +23 -0
  3. backend/api/main.py +764 -427
  4. backend/api/routes/__init__.py +6 -0
  5. backend/api/routes/clusters.py +102 -0
  6. backend/api/routes/models.py +247 -0
  7. backend/api/routes/stats.py +37 -0
  8. backend/config/requirements.txt +1 -0
  9. backend/core/__init__.py +2 -0
  10. backend/core/config.py +23 -0
  11. backend/core/exceptions.py +18 -0
  12. backend/models/__init__.py +2 -0
  13. backend/models/schemas.py +22 -0
  14. backend/scripts/export_binary.py +263 -0
  15. backend/services/model_tracker.py +83 -24
  16. backend/services/model_tracker_improved.py +95 -30
  17. backend/utils/data_loader.py +1 -4
  18. backend/utils/embeddings.py +1 -1
  19. backend/utils/family_tree.py +66 -0
  20. backend/utils/graph_embeddings.py +177 -0
  21. backend/utils/network_analysis.py +163 -20
  22. frontend/.npmrc +2 -0
  23. frontend/package-lock.json +2 -1
  24. frontend/package.json +2 -1
  25. frontend/public/index.html +1 -1
  26. frontend/src/App.css +85 -202
  27. frontend/src/App.tsx +49 -118
  28. frontend/src/components/PaperPlots.css +0 -92
  29. frontend/src/components/PaperPlots.tsx +0 -755
  30. frontend/src/components/ScatterPlot.tsx +0 -7
  31. frontend/src/components/controls/ClusterFilter.css +122 -0
  32. frontend/src/components/controls/ClusterFilter.tsx +142 -0
  33. frontend/src/components/controls/NodeDensitySlider.css +31 -0
  34. frontend/src/components/controls/NodeDensitySlider.tsx +39 -0
  35. frontend/src/components/controls/RandomModelButton.tsx +32 -0
  36. frontend/src/components/controls/RenderingStyleSelector.css +37 -0
  37. frontend/src/components/controls/RenderingStyleSelector.tsx +43 -0
  38. frontend/src/components/controls/ThemeToggle.tsx +22 -0
  39. frontend/src/components/controls/VisualizationModeButtons.css +65 -0
  40. frontend/src/components/controls/VisualizationModeButtons.tsx +46 -0
  41. frontend/src/components/controls/ZoomSlider.tsx +43 -0
  42. frontend/src/components/layout/SearchBar.css +181 -0
  43. frontend/src/components/layout/SearchBar.tsx +201 -0
  44. frontend/src/components/{FileTree.css → modals/FileTree.css} +171 -3
  45. frontend/src/components/{FileTree.tsx → modals/FileTree.tsx} +314 -26
  46. frontend/src/components/{ModelModal.css → modals/ModelModal.css} +43 -14
  47. frontend/src/components/{ModelModal.tsx → modals/ModelModal.tsx} +17 -9
  48. frontend/src/components/{ColorLegend.css → ui/ColorLegend.css} +0 -0
  49. frontend/src/components/{ColorLegend.tsx → ui/ColorLegend.tsx} +1 -1
  50. frontend/src/components/{ErrorBoundary.tsx → ui/ErrorBoundary.tsx} +0 -0
backend/README.md DELETED
@@ -1,24 +0,0 @@
- # Backend API
-
- FastAPI backend for serving model data to the React frontend.
-
- ## Structure
-
- `api/` - API routes and main application
- `services/` - External service integrations (arXiv, model tracking, scheduling)
- `utils/` - Utility modules (data loading, embeddings, dimensionality reduction, clustering, network analysis)
- `config/` - Configuration files (requirements.txt, etc.)
- `cache/` - Cached data (embeddings, reduced dimensions)
-
- ## Running
-
- ```bash
- cd backend
- uvicorn api.main:app --reload --host 0.0.0.0 --port 8000
- ```
-
- ## Environment Variables
-
- `SAMPLE_SIZE` - Limit number of models to load (for development). Set to 0 or leave unset to load all models.
-
backend/api/dependencies.py ADDED
@@ -0,0 +1,23 @@
+ """Shared dependencies for API routes."""
+ import pandas as pd
+ import numpy as np
+ from typing import Optional, Dict
+ from utils.data_loader import ModelDataLoader
+ from utils.embeddings import ModelEmbedder
+ from utils.dimensionality_reduction import DimensionReducer
+ from utils.graph_embeddings import GraphEmbedder
+
+ # Global state (initialized in startup) - these are module-level variables
+ # that will be updated by main.py during startup
+ data_loader = ModelDataLoader()
+ embedder: Optional[ModelEmbedder] = None
+ graph_embedder: Optional[GraphEmbedder] = None
+ reducer: Optional[DimensionReducer] = None
+ df: Optional[pd.DataFrame] = None
+ embeddings: Optional[np.ndarray] = None
+ graph_embeddings_dict: Optional[Dict[str, np.ndarray]] = None
+ combined_embeddings: Optional[np.ndarray] = None
+ reduced_embeddings: Optional[np.ndarray] = None
+ reduced_embeddings_graph: Optional[np.ndarray] = None
+
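The new module holds shared state that `main.py` fills in during startup. A minimal sketch (hypothetical route, not part of this commit) of how a router would consume it — reading through the module object so it sees the post-startup values rather than a stale `None` captured at import time:

```python
# Hypothetical example route; the route path and response shape are
# illustrative, not part of this commit.
from fastapi import APIRouter, HTTPException

import api.dependencies as deps  # read via the module, not `from ... import df`

router = APIRouter()

@router.get("/api/example/count")
async def example_count():
    # deps.df is None until startup_event() has populated it
    if deps.df is None:
        raise HTTPException(status_code=503, detail="Data not loaded")
    return {"total_models": len(deps.df.index)}
```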
backend/api/main.py CHANGED
@@ -1,202 +1,216 @@
- """
- FastAPI backend for serving model data to React/Visx frontend.
- """
  import sys
  import os
- backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- if backend_dir not in sys.path:
-     sys.path.insert(0, backend_dir)

  from fastapi import FastAPI, HTTPException, Query, BackgroundTasks, Request
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.middleware.gzip import GZipMiddleware
  from fastapi.responses import FileResponse, JSONResponse
  from fastapi.exceptions import RequestValidationError
  from starlette.exceptions import HTTPException as StarletteHTTPException
- from typing import Optional, List, Dict
- import pandas as pd
- import numpy as np
  from pydantic import BaseModel
  from umap import UMAP
- import tempfile
- import traceback
- import httpx

  from utils.data_loader import ModelDataLoader
  from utils.embeddings import ModelEmbedder
  from utils.dimensionality_reduction import DimensionReducer
  from utils.network_analysis import ModelNetworkBuilder
  from services.model_tracker import get_tracker
- from services.model_tracker_improved import get_improved_tracker
  from services.arxiv_api import extract_arxiv_ids, fetch_arxiv_papers

- app = FastAPI(title="HF Model Ecosystem API")

  app.add_middleware(GZipMiddleware, minimum_size=1000)

  @app.exception_handler(Exception)
  async def global_exception_handler(request: Request, exc: Exception):
-     """Global exception handler that ensures CORS headers are included even on errors."""
-     import traceback
-     error_detail = str(exc)
-     traceback_str = traceback.format_exc()
-     import sys
-     sys.stderr.write(f"Unhandled exception: {error_detail}\n{traceback_str}\n")
      return JSONResponse(
          status_code=500,
-         content={"detail": error_detail, "error": "Internal server error"},
-         headers={
-             "Access-Control-Allow-Origin": "*",
-             "Access-Control-Allow-Methods": "*",
-             "Access-Control-Allow-Headers": "*",
-         }
      )

  @app.exception_handler(StarletteHTTPException)
  async def http_exception_handler(request: Request, exc: StarletteHTTPException):
-     """HTTP exception handler with CORS headers."""
      return JSONResponse(
          status_code=exc.status_code,
          content={"detail": exc.detail},
-         headers={
-             "Access-Control-Allow-Origin": "*",
-             "Access-Control-Allow-Methods": "*",
-             "Access-Control-Allow-Headers": "*",
-         }
      )

  @app.exception_handler(RequestValidationError)
  async def validation_exception_handler(request: Request, exc: RequestValidationError):
-     """Validation exception handler with CORS headers."""
      return JSONResponse(
          status_code=422,
          content={"detail": exc.errors()},
-         headers={
-             "Access-Control-Allow-Origin": "*",
-             "Access-Control-Allow-Methods": "*",
-             "Access-Control-Allow-Headers": "*",
-         }
      )

- # CORS middleware for React frontend
- # Update allow_origins with your Netlify URL in production
- # Note: Add your specific Netlify URL after deployment
- FRONTEND_URL = os.getenv("FRONTEND_URL", "http://localhost:3000")
- # Allow all origins for development (restrict in production)
- ALLOW_ALL_ORIGINS = os.getenv("ALLOW_ALL_ORIGINS", "true").lower() == "true"
- if ALLOW_ALL_ORIGINS:
      app.add_middleware(
          CORSMiddleware,
-         allow_origins=["*"],  # Allow all origins in development
-         allow_credentials=False,  # Must be False when allow_origins is ["*"]
          allow_methods=["*"],
          allow_headers=["*"],
      )
  else:
      app.add_middleware(
          CORSMiddleware,
-         allow_origins=[
-             "http://localhost:3000",  # Local development
-             FRONTEND_URL,  # Production frontend URL
-             # Add your Netlify URL here after deployment, e.g.:
-             # "https://your-app-name.netlify.app",
-         ],
          allow_credentials=True,
          allow_methods=["*"],
          allow_headers=["*"],
      )

- data_loader = ModelDataLoader()
- embedder: Optional[ModelEmbedder] = None
- reducer: Optional[DimensionReducer] = None
- df: Optional[pd.DataFrame] = None
- embeddings: Optional[np.ndarray] = None
- reduced_embeddings: Optional[np.ndarray] = None
- cluster_labels: Optional[np.ndarray] = None  # Cached cluster assignments
-
-
- class FilterParams(BaseModel):
-     min_downloads: int = 0
-     min_likes: int = 0
-     search_query: Optional[str] = None
-     libraries: Optional[List[str]] = None
-     pipeline_tags: Optional[List[str]] = None
-
-
- class ModelPoint(BaseModel):
-     model_id: str
-     x: float
-     y: float
-     z: float  # 3D coordinate
-     library_name: Optional[str]
-     pipeline_tag: Optional[str]
-     downloads: int
-     likes: int
-     trending_score: Optional[float]
-     tags: Optional[str]
-     parent_model: Optional[str] = None
-     licenses: Optional[str] = None
-     family_depth: Optional[int] = None  # Generation depth in family tree (0 = root)
-     cluster_id: Optional[int] = None  # Cluster assignment for visualization

  @app.on_event("startup")
  async def startup_event():
-     """Initialize data and models on startup with caching."""
-     global df, embedder, reducer, embeddings, reduced_embeddings

-     import os
      backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
      root_dir = os.path.dirname(backend_dir)
      cache_dir = os.path.join(root_dir, "cache")
      os.makedirs(cache_dir, exist_ok=True)

      embeddings_cache = os.path.join(cache_dir, "embeddings.pkl")
      reduced_cache_umap = os.path.join(cache_dir, "reduced_umap_3d.pkl")
      reducer_cache_umap = os.path.join(cache_dir, "reducer_umap_3d.pkl")

-     sample_size_env = os.getenv("SAMPLE_SIZE")
-     if sample_size_env is None:
-         sample_size = None
      else:
-         sample_size = int(sample_size_env)
-         if sample_size == 0:
-             sample_size = None
-     df = data_loader.load_data(sample_size=sample_size)
-     df = data_loader.preprocess_for_embedding(df)
-
-     if 'model_id' in df.columns:
-         df.set_index('model_id', drop=False, inplace=True)
      for col in ['downloads', 'likes']:
-         if col in df.columns:
-             df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

-     embedder = ModelEmbedder()

      if os.path.exists(embeddings_cache):
          try:
-             embeddings = embedder.load_embeddings(embeddings_cache)
          except Exception as e:
-             embeddings = None
-
-     if embeddings is None:
-         texts = df['combined_text'].tolist()
-         embeddings = embedder.generate_embeddings(texts, batch_size=128)
-         embedder.save_embeddings(embeddings, embeddings_cache)

-     reducer = DimensionReducer(method="umap", n_components=3)

      if os.path.exists(reduced_cache_umap) and os.path.exists(reducer_cache_umap):
          try:
-             import pickle
              with open(reduced_cache_umap, 'rb') as f:
-                 reduced_embeddings = pickle.load(f)
-             reducer.load_reducer(reducer_cache_umap)
-         except Exception as e:
-             reduced_embeddings = None
-
-     if reduced_embeddings is None:
-         reducer.reducer = UMAP(
              n_components=3,
              n_neighbors=30,
              min_dist=0.3,
@@ -206,61 +220,57 @@ async def startup_event():
              low_memory=True,
              spread=1.5
          )
-         reduced_embeddings = reducer.fit_transform(embeddings)
-         import pickle
          with open(reduced_cache_umap, 'wb') as f:
-             pickle.dump(reduced_embeddings, f)
-         reducer.save_reducer(reducer_cache_umap)
-
-
- def calculate_family_depths(df: pd.DataFrame) -> Dict[str, int]:
-     """
-     Calculate family tree depth for each model.
-     Returns a dictionary mapping model_id to depth (0 = root, 1 = first generation, etc.)
-     """
-     depths = {}
-     visited = set()

-     def get_depth(model_id: str) -> int:
-         if model_id in depths:
-             return depths[model_id]
-         if model_id in visited:
-             # Circular reference, treat as root
-             depths[model_id] = 0
-             return 0

-         visited.add(model_id)
-
-         if model_id not in df.index:
-             depths[model_id] = 0
-             return 0
-
-         parent_id = df.loc[model_id].get('parent_model')
-         if parent_id and pd.notna(parent_id) and str(parent_id) != 'nan' and str(parent_id) != '':
-             parent_id_str = str(parent_id)
-             if parent_id_str in df.index:
-                 depth = get_depth(parent_id_str) + 1
-             else:
-                 depth = 0  # Parent not in dataset, treat as root
-         else:
-             depth = 0  # No parent, this is a root

-         depths[model_id] = depth
-         return depth
-
-     for model_id in df.index:
-         if model_id not in depths:
-             visited = set()  # Reset for each tree
-             get_depth(model_id)

-     return depths


  def compute_clusters(reduced_embeddings: np.ndarray, n_clusters: int = 50) -> np.ndarray:
-     """
-     Compute clusters using KMeans on reduced embeddings.
-     Returns cluster labels for each point.
-     """
      from sklearn.cluster import KMeans

      n_samples = len(reduced_embeddings)
@@ -268,8 +278,7 @@ def compute_clusters(reduced_embeddings: np.ndarray, n_clusters: int = 50) -> np
          n_clusters = max(1, n_samples // 10)

      kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
-     cluster_labels = kmeans.fit_predict(reduced_embeddings)
-     return cluster_labels


  @app.get("/")
@@ -284,24 +293,16 @@ async def get_models(
      search_query: Optional[str] = Query(None),
      color_by: str = Query("library_name"),
      size_by: str = Query("downloads"),
-     max_points: Optional[int] = Query(None),  # Optional limit (None = all points)
-     projection_method: str = Query("umap"),  # umap or tsne
-     base_models_only: bool = Query(False)  # Only show root models (no parent)
  ):
-     """
-     Get filtered models with 3D coordinates for visualization.
-     Supports multiple projection methods: UMAP or t-SNE.
-     If base_models_only=True, only returns root models (models without a parent_model).
-
-     Returns a JSON object with:
-     - models: List of ModelPoint objects
-     - filtered_count: Number of models matching filters (before max_points sampling)
-     - returned_count: Number of models actually returned (after max_points sampling)
-     """
-     global df, embedder, reducer, embeddings, reduced_embeddings

-     if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      # Filter data
      filtered_df = data_loader.filter_data(
@@ -321,7 +322,12 @@ async def get_models(
              (filtered_df['parent_model'].astype(str) == 'nan')
          ]

-     # Store the filtered count BEFORE sampling
      filtered_count = len(filtered_df)

      if len(filtered_df) == 0:
@@ -332,42 +338,53 @@ async def get_models(
          }

      if max_points is not None and len(filtered_df) > max_points:
-         # Use stratified sampling to preserve distribution of important attributes
-         # Sample proportionally from different libraries/pipelines for better representation
          if 'library_name' in filtered_df.columns and filtered_df['library_name'].notna().any():
-             # Stratified sampling by library
-             filtered_df = filtered_df.groupby('library_name', group_keys=False).apply(
-                 lambda x: x.sample(min(len(x), max(1, int(max_points * len(x) / len(filtered_df)))), random_state=42)
-             ).reset_index(drop=True)
-             # If still too many, random sample the rest
              if len(filtered_df) > max_points:
-                 filtered_df = filtered_df.sample(n=max_points, random_state=42)
          else:
-             filtered_df = filtered_df.sample(n=max_points, random_state=42)
-
-     if embeddings is None:
-         raise HTTPException(status_code=503, detail="Embeddings not loaded")

-     if reduced_embeddings is None or (reducer and reducer.method != projection_method.lower()):
-         import os
          backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
          root_dir = os.path.dirname(backend_dir)
          cache_dir = os.path.join(root_dir, "cache")
-         reduced_cache = os.path.join(cache_dir, f"reduced_{projection_method.lower()}_3d.pkl")
-         reducer_cache = os.path.join(cache_dir, f"reducer_{projection_method.lower()}_3d.pkl")

          if os.path.exists(reduced_cache) and os.path.exists(reducer_cache):
              try:
-                 import pickle
                  with open(reduced_cache, 'rb') as f:
-                     reduced_embeddings = pickle.load(f)
                  if reducer is None or reducer.method != projection_method.lower():
                      reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
                      reducer.load_reducer(reducer_cache)
-             except Exception as e:
-                 reduced_embeddings = None

-         if reduced_embeddings is None:
              if reducer is None or reducer.method != projection_method.lower():
                  reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
              if projection_method.lower() == "umap":
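The stratified sampling deleted in this hunk (presumably relocated into the new `api/routes` modules) keeps each library's share of returned points roughly proportional to its share of the filtered set. A self-contained sketch of the same pattern on synthetic data — column names mirror the diff, everything else is illustrative:

```python
# Minimal sketch of the proportional (stratified) sampling pattern above.
import pandas as pd

df = pd.DataFrame({
    "model_id": [f"m{i}" for i in range(100)],
    "library_name": ["transformers"] * 80 + ["diffusers"] * 20,
})
max_points = 10

# Each library keeps a share proportional to its size (at least 1 row).
sampled = df.groupby("library_name", group_keys=False).apply(
    lambda g: g.sample(
        min(len(g), max(1, int(max_points * len(g) / len(df)))),
        random_state=42,
    )
)
# Integer rounding can leave more than max_points; trim with a final sample.
if len(sampled) > max_points:
    sampled = sampled.sample(n=max_points, random_state=42)
print(sampled["library_name"].value_counts())  # roughly 8:2
```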
@@ -381,52 +398,91 @@ async def get_models(
          low_memory=True,
          spread=1.5
      )
-     reduced_embeddings = reducer.fit_transform(embeddings)
-     import pickle
      with open(reduced_cache, 'wb') as f:
-         pickle.dump(reduced_embeddings, f)
      reducer.save_reducer(reducer_cache)

-     # Get coordinates for filtered data - optimized vectorized approach
-     # Map filtered dataframe indices to original dataframe integer positions
-     # Since df is indexed by model_id, we need to get the integer positions
      if df.index.name == 'model_id' or 'model_id' in df.index.names:
-         # Get integer positions of filtered rows in original dataframe
-         # Use vectorized lookup for better performance
-         filtered_indices = np.array([df.index.get_loc(idx) for idx in filtered_df.index], dtype=np.int32)
      else:
-         # If using integer index, use directly
-         filtered_indices = filtered_df.index.values.astype(np.int32)
-
-     # Use advanced indexing for faster access
-     filtered_reduced = reduced_embeddings[filtered_indices]

      family_depths = calculate_family_depths(df)

-     global cluster_labels
-     if cluster_labels is None or len(cluster_labels) != len(reduced_embeddings):
-         cluster_labels = compute_clusters(reduced_embeddings, n_clusters=min(50, len(reduced_embeddings) // 100))

-     filtered_clusters = cluster_labels[filtered_indices]

-     # Build response with optimized vectorized operations
-     # Pre-extract arrays for faster access
      model_ids = filtered_df['model_id'].astype(str).values
-     library_names = filtered_df['library_name'].values
-     pipeline_tags = filtered_df['pipeline_tag'].values
-     downloads_arr = filtered_df['downloads'].fillna(0).astype(int).values
-     likes_arr = filtered_df['likes'].fillna(0).astype(int).values
-     trending_scores = filtered_df.get('trendingScore', pd.Series()).values
-     tags_arr = filtered_df.get('tags', pd.Series()).values
-     parent_models = filtered_df.get('parent_model', pd.Series()).values
-     licenses_arr = filtered_df.get('licenses', pd.Series()).values
-
-     # Vectorized coordinate extraction
      x_coords = filtered_reduced[:, 0].astype(float)
      y_coords = filtered_reduced[:, 1].astype(float)
      z_coords = filtered_reduced[:, 2].astype(float) if filtered_reduced.shape[1] > 2 else np.zeros(len(filtered_reduced), dtype=float)
-
-     # Build models list with optimized operations
      models = [
          ModelPoint(
              model_id=model_ids[idx],
@@ -442,28 +498,42 @@ async def get_models(
              parent_model=parent_models[idx] if idx < len(parent_models) and pd.notna(parent_models[idx]) else None,
              licenses=licenses_arr[idx] if idx < len(licenses_arr) and pd.notna(licenses_arr[idx]) else None,
              family_depth=family_depths.get(model_ids[idx], None),
-             cluster_id=int(filtered_clusters[idx]) if idx < len(filtered_clusters) else None
          )
          for idx in range(len(filtered_df))
      ]

-     return models


  @app.get("/api/stats")
  async def get_stats():
      """Get dataset statistics."""
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

-     # Use len(df.index) to handle both regular and indexed DataFrames correctly
      total_models = len(df.index) if hasattr(df, 'index') else len(df)

      return {
          "total_models": total_models,
          "unique_libraries": int(df['library_name'].nunique()) if 'library_name' in df.columns else 0,
          "unique_pipelines": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,
          "unique_task_types": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,  # Alias for clarity
          "avg_downloads": float(df['downloads'].mean()) if 'downloads' in df.columns else 0,
          "avg_likes": float(df['likes'].mean()) if 'likes' in df.columns else 0
      }
@@ -473,7 +543,7 @@ async def get_stats():
  async def get_model_details(model_id: str):
      """Get detailed information about a specific model."""
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      model = df[df.get('model_id', '') == model_id]
      if len(model) == 0:
@@ -481,11 +551,9 @@ async def get_model_details(model_id: str):
      model = model.iloc[0]

-     # Extract arXiv IDs from tags
      tags_str = str(model.get('tags', '')) if pd.notna(model.get('tags')) else ''
      arxiv_ids = extract_arxiv_ids(tags_str)

-     # Fetch arXiv papers if any IDs found
      papers = []
      if arxiv_ids:
          papers = await fetch_arxiv_papers(arxiv_ids[:5])  # Limit to 5 papers
@@ -505,6 +573,8 @@ async def get_model_details(model_id: str):
      }


  @app.get("/api/family/stats")
  async def get_family_stats():
      """
@@ -512,9 +582,8 @@ async def get_family_stats():
      Returns family size distribution, depth statistics, model card length by depth, etc.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

-     # Calculate family sizes
      family_sizes = {}
      root_models = set()
@@ -528,14 +597,13 @@ async def get_family_stats():
              family_sizes[model_id] = 0
          else:
              parent_id_str = str(parent_id)
-             # Find root of this family
              root = parent_id_str
              visited = set()
              while root in df.index and pd.notna(df.loc[root].get('parent_model')):
                  parent = df.loc[root].get('parent_model')
                  if pd.isna(parent) or str(parent) == 'nan' or str(parent) == '':
                      break
-                 if str(parent) in visited:  # Circular reference
                      break
                  visited.add(root)
                  root = str(parent)
@@ -544,18 +612,15 @@ async def get_family_stats():
              family_sizes[root] = 0
          family_sizes[root] += 1

-     # Count family sizes
      size_distribution = {}
      for root, size in family_sizes.items():
          size_distribution[size] = size_distribution.get(size, 0) + 1

-     # Calculate depth statistics
      depths = calculate_family_depths(df)
      depth_counts = {}
      for depth in depths.values():
          depth_counts[depth] = depth_counts.get(depth, 0) + 1

-     # Calculate model card length by depth
      model_card_lengths_by_depth = {}
      if 'modelCard' in df.columns:
          for idx, row in df.iterrows():
@@ -568,7 +633,6 @@ async def get_family_stats():
                  model_card_lengths_by_depth[depth] = []
              model_card_lengths_by_depth[depth].append(card_length)

-     # Calculate statistics for each depth
      model_card_stats = {}
      for depth, lengths in model_card_lengths_by_depth.items():
          if lengths:
@@ -593,99 +657,218 @@ async def get_family_stats():
      }


  @app.get("/api/family/{model_id}")
- async def get_family_tree(model_id: str, max_depth: int = Query(5, ge=1, le=10)):
      """
      Get family tree for a model (ancestors and descendants).
      Returns the model, its parent chain, and all children.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")
-
-     # Find the model
-     model_row = df[df.get('model_id', '') == model_id]
-     if len(model_row) == 0:
-         raise HTTPException(status_code=404, detail="Model not found")
-
-     family_models = []
-     visited = set()

-     # Get coordinates for family members
      if reduced_embeddings is None:
          raise HTTPException(status_code=503, detail="Embeddings not ready")

-     # Optimize: create parent_model index for faster lookups
-     if 'parent_model' not in df.index.names and 'parent_model' in df.columns:
-         # Create a reverse index for faster parent lookups
-         parent_index = df[df['parent_model'].notna()].set_index('parent_model', drop=False, append=True)

-     def get_ancestors(current_id: str, depth: int):
-         """Recursively get parent chain - optimized with index lookup."""
-         if depth <= 0 or current_id in visited:
              return
          visited.add(current_id)

-         # Use index lookup if available, otherwise fallback to query
-         if 'model_id' in df.index.names or df.index.name == 'model_id':
-             try:
-                 model = df.loc[[current_id]]
-             except KeyError:
-                 return
-         else:
-             model = df[df.get('model_id', '') == current_id]
-             if len(model) == 0:
-                 return
-             model = model.iloc[[0]]
-
-         parent_id = model.iloc[0].get('parent_model')
-
-         if parent_id and pd.notna(parent_id) and str(parent_id) != 'nan':
-             get_ancestors(str(parent_id), depth - 1)

-     def get_descendants(current_id: str, depth: int):
-         """Recursively get all children - optimized with index lookup."""
-         if depth <= 0 or current_id in visited:
              return
          visited.add(current_id)

-         # Use optimized parent lookup
-         if 'parent_model' in df.columns:
-             children = df[df['parent_model'] == current_id]
-             # Use vectorized iteration
-             child_ids = children['model_id'].dropna().astype(str).unique()
-             for child_id in child_ids:
-                 if child_id not in visited:
-                     get_descendants(child_id, depth - 1)
-
-     # Get ancestors (parents)
-     get_ancestors(model_id, max_depth)
-
-     # Get descendants (children)
-     visited = set()  # Reset for descendants
-     get_descendants(model_id, max_depth)
-
-     # Add the root model
-     visited.add(model_id)
-
-     # Get all family members with coordinates - optimized
-     if 'model_id' in df.index.names or df.index.name == 'model_id':
-         # Use index lookup if available
          try:
              family_df = df.loc[list(visited)]
          except KeyError:
-             # Fallback to isin if some IDs not in index
-             family_df = df[df.get('model_id', '').isin(visited)]
      else:
          family_df = df[df.get('model_id', '').isin(visited)]

-     family_indices = family_df.index.values  # Use values instead of tolist() for speed
      family_reduced = reduced_embeddings[family_indices]

-     # Build family tree structure - optimized with vectorized operations
      family_map = {}
      for idx, (i, row) in enumerate(family_df.iterrows()):
-         model_id_val = str(row.get('model_id', 'Unknown'))
-         parent_id = row.get('parent_model') if pd.notna(row.get('parent_model')) else None

          family_map[model_id_val] = {
              "model_id": model_id_val,
@@ -696,12 +879,12 @@ async def get_family_tree(model_id: str, max_depth: int = Query(5, ge=1, le=10))
              "pipeline_tag": str(row.get('pipeline_tag')) if pd.notna(row.get('pipeline_tag')) else None,
              "downloads": int(row.get('downloads', 0)) if pd.notna(row.get('downloads')) else 0,
              "likes": int(row.get('likes', 0)) if pd.notna(row.get('likes')) else 0,
-             "parent_model": str(parent_id) if parent_id else None,
              "licenses": str(row.get('licenses')) if pd.notna(row.get('licenses')) else None,
              "children": []
          }

-     # Build tree structure
      root_models = []
      for model_id_val, model_data in family_map.items():
          parent_id = model_data["parent_model"]
@@ -711,7 +894,7 @@ async def get_family_tree(model_id: str, max_depth: int = Query(5, ge=1, le=10))
          root_models.append(model_id_val)

      return {
-         "root_model": model_id,
          "family": list(family_map.values()),
          "family_map": family_map,
          "root_models": root_models
@@ -720,7 +903,9 @@ async def get_family_tree(model_id: str, max_depth: int = Query(5, ge=1, le=10))

  @app.get("/api/search")
  async def search_models(
-     query: str = Query(..., min_length=1),
      graph_aware: bool = Query(False),
      include_neighbors: bool = Query(True)
  ):
@@ -729,47 +914,79 @@ async def search_models(
      Enhanced with graph-aware search option that includes network relationships.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      if graph_aware:
-         # Use graph-aware search
          try:
              network_builder = ModelNetworkBuilder(df)
-             # Build network for top models (for performance)
              top_models = network_builder.get_top_models_by_field(n=1000)
              model_ids = [mid for mid, _ in top_models]
              graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')

              results = network_builder.search_graph_aware(
-                 query=query,
                  graph=graph,
-                 max_results=20,
                  include_neighbors=include_neighbors
              )

-             return {"results": results, "search_type": "graph_aware"}
-         except Exception as e:
-             pass

-     query_lower = query.lower()
-     matches = df[
-         df.get('model_id', '').astype(str).str.lower().str.contains(query_lower, na=False)
-     ].head(20)  # Limit to 20 results

      results = []
      for _, row in matches.iterrows():
          results.append({
-             "model_id": row.get('model_id'),
-             "title": row.get('model_id', '').split('/')[-1] if '/' in str(row.get('model_id', '')) else str(row.get('model_id', '')),
-             "library_name": row.get('library_name'),
-             "pipeline_tag": row.get('pipeline_tag'),
              "downloads": int(row.get('downloads', 0)),
              "likes": int(row.get('likes', 0)),
              "parent_model": row.get('parent_model') if pd.notna(row.get('parent_model')) else None,
              "match_type": "direct"
          })

-     return {"results": results, "search_type": "basic"}


  @app.get("/api/similar/{model_id}")
@@ -778,12 +995,12 @@ async def get_similar_models(model_id: str, k: int = Query(10, ge=1, le=50)):
      Get k-nearest neighbors of a model based on embedding similarity.
      Returns similar models with distance scores.
      """
-     global df, embedder, embeddings, reduced_embeddings
-
-     if df is None or embeddings is None:
          raise HTTPException(status_code=503, detail="Data not loaded")

-     # Find the model - optimized with index lookup
      if 'model_id' in df.index.names or df.index.name == 'model_id':
          try:
              model_row = df.loc[[model_id]]
@@ -797,16 +1014,11 @@ async def get_similar_models(model_id: str, k: int = Query(10, ge=1, le=50)):
      model_idx = model_row.index[0]
      model_embedding = embeddings[model_idx]

-     # Calculate cosine similarity to all other models - optimized
      from sklearn.metrics.pairwise import cosine_similarity
-     # Use vectorized operations for better performance
      model_embedding_2d = model_embedding.reshape(1, -1)
      similarities = cosine_similarity(model_embedding_2d, embeddings)[0]

-     # Get top k similar models (excluding itself) - use argpartition for speed
-     # argpartition is faster than full sort for top-k
      top_k_indices = np.argpartition(similarities, -k-1)[-k-1:-1]
-     # Sort only the top k (much faster than sorting all)
      top_k_indices = top_k_indices[np.argsort(similarities[top_k_indices])][::-1]

      similar_models = []
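The `argpartition` trick above avoids a full O(n log n) sort when only the top k neighbours are needed. A standalone sketch on random vectors; note that the `[-k-1:-1]` slice mirrors the diff's self-exclusion logic, which relies on the self-match landing in the dropped slot (argpartition does not strictly guarantee ordering within the partition):

```python
# Sketch of the argpartition top-k pattern, on synthetic embeddings.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(42)
embeddings = rng.normal(size=(1000, 64))
query = embeddings[0].reshape(1, -1)
similarities = cosine_similarity(query, embeddings)[0]

k = 10
# O(n): move the k+1 largest similarities to the tail (unsorted among themselves),
# then drop the last slot to exclude the query's own similarity of 1.0.
top_k = np.argpartition(similarities, -k - 1)[-k - 1:-1]
# Sort only those k indices, descending by similarity.
top_k = top_k[np.argsort(similarities[top_k])][::-1]
print(top_k, similarities[top_k])
```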
@@ -817,7 +1029,7 @@ async def get_similar_models(model_id: str, k: int = Query(10, ge=1, le=50)):
          similar_models.append({
              "model_id": row.get('model_id', 'Unknown'),
              "similarity": float(similarities[idx]),
-             "distance": float(1 - similarities[idx]),  # Convert similarity to distance
              "library_name": row.get('library_name'),
              "pipeline_tag": row.get('pipeline_tag'),
              "downloads": int(row.get('downloads', 0)),
@@ -843,11 +1055,12 @@ async def get_models_by_semantic_similarity(
      Returns models with their similarity scores and coordinates.
      Useful for exploring the embedding space around a specific model.
      """
-     global df, embedder, embeddings, reduced_embeddings
-
-     if df is None or embeddings is None:
          raise HTTPException(status_code=503, detail="Data not loaded")

      # Find the query model
      if 'model_id' in df.index.names or df.index.name == 'model_id':
          try:
@@ -863,7 +1076,6 @@
      query_embedding = embeddings[model_idx]

-     # Filter by downloads/likes first for performance
      filtered_df = data_loader.filter_data(
          df=df,
          min_downloads=min_downloads,
@@ -873,32 +1085,26 @@
          pipeline_tags=None
      )

-     # Get indices of filtered models
      if df.index.name == 'model_id' or 'model_id' in df.index.names:
          filtered_indices = [df.index.get_loc(idx) for idx in filtered_df.index]
          filtered_indices = np.array(filtered_indices, dtype=int)
      else:
          filtered_indices = filtered_df.index.values.astype(int)

-     # Calculate similarities only for filtered models
      filtered_embeddings = embeddings[filtered_indices]
      from sklearn.metrics.pairwise import cosine_similarity
      query_embedding_2d = query_embedding.reshape(1, -1)
      similarities = cosine_similarity(query_embedding_2d, filtered_embeddings)[0]

-     # Get top k similar models
      top_k_local_indices = np.argpartition(similarities, -k)[-k:]
      top_k_local_indices = top_k_local_indices[np.argsort(similarities[top_k_local_indices])][::-1]

-     # Get reduced embeddings for visualization
      if reduced_embeddings is None:
          raise HTTPException(status_code=503, detail="Reduced embeddings not ready")

-     # Map back to original indices
      top_k_original_indices = filtered_indices[top_k_local_indices]
      top_k_reduced = reduced_embeddings[top_k_original_indices]

-     # Build response
      similar_models = []
      for i, orig_idx in enumerate(top_k_original_indices):
          row = df.iloc[orig_idx]
@@ -935,11 +1141,12 @@ async def get_distance(
      """
      Calculate distance/similarity between two models.
      """
-     global df, embedder, embeddings
-
-     if df is None or embeddings is None:
          raise HTTPException(status_code=503, detail="Data not loaded")

      # Find both models - optimized with index lookup
      if 'model_id' in df.index.names or df.index.name == 'model_id':
          try:
@@ -976,7 +1183,7 @@ async def export_models(model_ids: List[str]):
      Export selected models as JSON with full metadata.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      # Optimized export with index lookup
      if 'model_id' in df.index.names or df.index.name == 'model_id':
@@ -991,7 +1198,6 @@ async def export_models(model_ids: List[str]):
      if len(exported) == 0:
          return {"models": []}

-     # Use list comprehension for faster building
      models = [
          {
              "model_id": str(row.get('model_id', '')),
@@ -1029,12 +1235,10 @@ async def get_cooccurrence_network(
      Returns network graph data suitable for visualization.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)
-
-         # Get top models by field
          top_models = network_builder.get_top_models_by_field(
              library=library,
              pipeline_tag=pipeline_tag,
@@ -1051,14 +1255,11 @@
          }

          model_ids = [mid for mid, _ in top_models]
-
-         # Build co-occurrence network
          graph = network_builder.build_cooccurrence_network(
              model_ids=model_ids,
              cooccurrence_method=cooccurrence_method
          )

-         # Convert to JSON-serializable format
          nodes = []
          for node_id, attrs in graph.nodes(data=True):
              nodes.append({
@@ -1086,45 +1287,70 @@ async def get_cooccurrence_network(
          "links": links,
          "statistics": stats
      }
-
-     except Exception as e:
          raise HTTPException(status_code=500, detail=f"Error building network: {str(e)}")


  @app.get("/api/network/family/{model_id}")
  async def get_family_network(
      model_id: str,
-     max_depth: int = Query(5, ge=1, le=10)
  ):
      """
      Build family tree network for a model (directed graph).
-     Returns network graph data showing parent-child relationships.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)
          graph = network_builder.build_family_tree_network(
              root_model_id=model_id,
-             max_depth=max_depth
          )

-         # Convert to JSON-serializable format
          nodes = []
          for node_id, attrs in graph.nodes(data=True):
              nodes.append({
                  "id": node_id,
                  "title": attrs.get('title', node_id),
-                 "freq": attrs.get('freq', 0)
              })

          links = []
-         for source, target in graph.edges():
-             links.append({
                  "source": source,
-                 "target": target
-             })

          stats = network_builder.get_network_statistics(graph)

@@ -1134,8 +1360,8 @@
          "statistics": stats,
          "root_model": model_id
      }
-
-     except Exception as e:
          raise HTTPException(status_code=500, detail=f"Error building family network: {str(e)}")

@@ -1150,11 +1376,10 @@ async def get_model_neighbors(
      Similar to graph database queries for finding connected nodes.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)
-         # Build network for top models (for performance)
          top_models = network_builder.get_top_models_by_field(n=1000)
          model_ids = [mid for mid, _ in top_models]
          graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')
@@ -1171,8 +1396,8 @@
          "neighbors": neighbors,
          "count": len(neighbors)
      }
-
-     except Exception as e:
          raise HTTPException(status_code=500, detail=f"Error finding neighbors: {str(e)}")

@@ -1187,7 +1412,7 @@
      Similar to graph database path queries.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)
@@ -1235,7 +1460,7 @@
      Similar to graph database queries for co-assignment patterns.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)
@@ -1272,7 +1497,7 @@
      Similar to graph database relationship queries.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)
@@ -1297,32 +1522,57 @@ async def get_model_relationships(
  async def get_current_model_count(
      use_cache: bool = Query(True),
      force_refresh: bool = Query(False),
-     use_dataset_snapshot: bool = Query(False)
  ):
      """
      Get the current number of models on Hugging Face Hub.
-     Fetches live data from the Hub API or uses dataset snapshot (faster but may be outdated).

      Query Parameters:
          use_cache: Use cached results if available (default: True)
          force_refresh: Force refresh even if cache is valid (default: False)
-         use_dataset_snapshot: Use dataset snapshot instead of API (faster, default: False)
      """
      try:
          if use_dataset_snapshot:
-             # Use improved tracker with dataset snapshot (like ai-ecosystem repo)
-             tracker = get_improved_tracker()
-             count_data = tracker.get_count_from_dataset_snapshot()
              if count_data is None:
-                 # Fallback to API if dataset unavailable
-                 count_data = tracker.get_current_model_count(use_cache=use_cache, force_refresh=force_refresh)
          else:
-             # Use improved tracker with API (has caching)
-             tracker = get_improved_tracker()
-             count_data = tracker.get_current_model_count(use_cache=use_cache, force_refresh=force_refresh)

          return count_data
      except Exception as e:
          raise HTTPException(status_code=500, detail=f"Error fetching model count: {str(e)}")

@@ -1343,7 +1593,7 @@ async def get_historical_model_counts(
      try:
          from datetime import datetime

-         tracker = get_improved_tracker()

          start = None
          end = None
@@ -1373,7 +1623,7 @@
  async def get_latest_model_count():
      """Get the most recently recorded model count from database."""
      try:
-         tracker = get_improved_tracker()
          latest = tracker.get_latest_count()
          if latest is None:
              raise HTTPException(status_code=404, detail="No model counts recorded yet")
@@ -1397,16 +1647,14 @@ async def record_model_count(
          use_dataset_snapshot: Use dataset snapshot instead of API (faster, default: False)
      """
      try:
-         tracker = get_improved_tracker()

-         # Fetch and record in background to avoid blocking
          def record():
              if use_dataset_snapshot:
                  count_data = tracker.get_count_from_dataset_snapshot()
                  if count_data:
                      tracker.record_count(count_data, source="dataset_snapshot")
                  else:
-                     # Fallback to API
                      count_data = tracker.get_current_model_count(use_cache=False)
                      tracker.record_count(count_data, source="api")
              else:
@@ -1433,7 +1681,7 @@ async def get_growth_stats(days: int = Query(7, ge=1, le=365)):
          days: Number of days to analyze
      """
      try:
-         tracker = get_improved_tracker()
          stats = tracker.get_growth_stats(days)
          return stats
      except Exception as e:
@@ -1455,12 +1703,11 @@ async def export_network_graphml(
      Similar to Open Syllabus graph export functionality.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      try:
          network_builder = ModelNetworkBuilder(df)

-         # Get top models by field
          top_models = network_builder.get_top_models_by_field(
              library=library,
              pipeline_tag=pipeline_tag,
@@ -1473,29 +1720,24 @@ async def export_network_graphml(
          raise HTTPException(status_code=404, detail="No models found matching criteria")

      model_ids = [mid for mid, _ in top_models]
-
-     # Build co-occurrence network
      graph = network_builder.build_cooccurrence_network(
          model_ids=model_ids,
          cooccurrence_method=cooccurrence_method
      )

-     # Create temporary file
      with tempfile.NamedTemporaryFile(mode='w', suffix='.graphml', delete=False) as tmp_file:
          tmp_path = tmp_file.name
          network_builder.export_graphml(graph, tmp_path)

-     # Schedule cleanup after response is sent
      background_tasks.add_task(os.unlink, tmp_path)

-     # Return file for download
      return FileResponse(
          tmp_path,
          media_type='application/xml',
          filename=f'network_{cooccurrence_method}_{n}_models.graphml'
      )
-
-     except Exception as e:
      raise HTTPException(status_code=500, detail=f"Error exporting network: {str(e)}")
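The export endpoint's file lifecycle is worth spelling out: the temp file must outlive the `with` block so `FileResponse` can stream it, and cleanup is deferred until after the response is sent. A minimal self-contained sketch of the same pattern (route name and payload are illustrative, not from this commit):

```python
# Sketch of the temp-file + BackgroundTasks cleanup pattern used above.
import os
import tempfile

from fastapi import BackgroundTasks, FastAPI
from fastapi.responses import FileResponse

app = FastAPI()

@app.get("/export-demo")
async def export_demo(background_tasks: BackgroundTasks):
    # delete=False keeps the file alive after the `with` block so
    # FileResponse can stream it; we remove it ourselves afterwards.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".graphml", delete=False) as tmp:
        tmp.write("<graphml/>")
        tmp_path = tmp.name
    # Runs only after the response body has been sent.
    background_tasks.add_task(os.unlink, tmp_path)
    return FileResponse(tmp_path, media_type="application/xml", filename="demo.graphml")
```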
@@ -1506,7 +1748,7 @@ async def get_model_papers(model_id: str):
      Extracts arXiv IDs from model tags and fetches paper information.
      """
      if df is None:
-         raise HTTPException(status_code=503, detail="Data not loaded")

      model = df[df.get('model_id', '') == model_id]
      if len(model) == 0:
@@ -1535,36 +1777,131 @@ async def get_model_papers(model_id: str):
      }


  @app.get("/api/model/{model_id}/files")
  async def get_model_files(model_id: str, branch: str = Query("main")):
      """
      Get file tree for a model from Hugging Face.
      Proxies the request to avoid CORS issues.
      """
      try:
-         # Try main branch first, then master
-         branches_to_try = [branch, "main", "master"] if branch not in ["main", "master"] else [branch, "main" if branch == "master" else "master"]
-
-         async with httpx.AsyncClient(timeout=10.0) as client:
              for branch_name in branches_to_try:
                  try:
                      url = f"https://huggingface.co/api/models/{model_id}/tree/{branch_name}"
                      response = await client.get(url)
                      if response.status_code == 200:
-                         return response.json()
-                 except Exception:
                      continue

-         raise HTTPException(status_code=404, detail="File tree not found for this model")
      except httpx.TimeoutException:
-         raise HTTPException(status_code=504, detail="Request to Hugging Face timed out")
      except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Error fetching file tree: {str(e)}")
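The deleted one-liner packs the branch-fallback policy rather densely; written out, it behaves like this (equivalent logic, expanded for readability — a hypothetical helper, not from this commit):

```python
# Unpacked version of the branches_to_try conditional removed above.
def branches_to_try(branch: str) -> list:
    if branch not in ("main", "master"):
        # Custom branch first, then the two common defaults.
        return [branch, "main", "master"]
    # Asked for main or master: try it, then the other one.
    return [branch, "master" if branch == "main" else "main"]

assert branches_to_try("dev") == ["dev", "main", "master"]
assert branches_to_try("main") == ["main", "master"]
assert branches_to_try("master") == ["master", "main"]
```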
  if __name__ == "__main__":
      import uvicorn
-     # Use PORT environment variable for cloud platforms (Railway, Render, Heroku)
      port = int(os.getenv("PORT", 8000))
      uvicorn.run(app, host="0.0.0.0", port=port)

  import sys
  import os
+ import pickle
+ import tempfile
+ import logging
+ from typing import Optional, List, Dict
+ from datetime import datetime, timedelta

+ import pandas as pd
+ import numpy as np
+ import httpx
  from fastapi import FastAPI, HTTPException, Query, BackgroundTasks, Request
  from fastapi.middleware.cors import CORSMiddleware
  from fastapi.middleware.gzip import GZipMiddleware
  from fastapi.responses import FileResponse, JSONResponse
  from fastapi.exceptions import RequestValidationError
  from starlette.exceptions import HTTPException as StarletteHTTPException
  from pydantic import BaseModel
  from umap import UMAP

  from utils.data_loader import ModelDataLoader
  from utils.embeddings import ModelEmbedder
  from utils.dimensionality_reduction import DimensionReducer
  from utils.network_analysis import ModelNetworkBuilder
+ from utils.graph_embeddings import GraphEmbedder
  from services.model_tracker import get_tracker
  from services.arxiv_api import extract_arxiv_ids, fetch_arxiv_papers
+ from core.config import settings
+ from core.exceptions import DataNotLoadedError, EmbeddingsNotReadyError
+ from models.schemas import ModelPoint
+ from utils.family_tree import calculate_family_depths
+ import api.dependencies as deps
+ from api.routes import models, stats, clusters
+
+ # Create aliases for backward compatibility with existing routes
+ # Note: These are set at module load time and may be None initially
+ # Functions should access via deps.* to get current values
+ data_loader = deps.data_loader
+
+ backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ if backend_dir not in sys.path:
+     sys.path.insert(0, backend_dir)

+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(title="HF Model Ecosystem API", version="2.0.0")

  app.add_middleware(GZipMiddleware, minimum_size=1000)

+ CORS_HEADERS = {
+     "Access-Control-Allow-Origin": "*",
+     "Access-Control-Allow-Methods": "*",
+     "Access-Control-Allow-Headers": "*",
+ }
+
  @app.exception_handler(Exception)
  async def global_exception_handler(request: Request, exc: Exception):
+     logger.exception("Unhandled exception", exc_info=exc)
      return JSONResponse(
          status_code=500,
+         content={"detail": "Internal server error"},
+         headers=CORS_HEADERS,
      )

  @app.exception_handler(StarletteHTTPException)
  async def http_exception_handler(request: Request, exc: StarletteHTTPException):
      return JSONResponse(
          status_code=exc.status_code,
          content={"detail": exc.detail},
+         headers=CORS_HEADERS,
      )

  @app.exception_handler(RequestValidationError)
  async def validation_exception_handler(request: Request, exc: RequestValidationError):
      return JSONResponse(
          status_code=422,
          content={"detail": exc.errors()},
+         headers=CORS_HEADERS,
      )

+ if settings.ALLOW_ALL_ORIGINS:
      app.add_middleware(
          CORSMiddleware,
+         allow_origins=["*"],
+         allow_credentials=False,
          allow_methods=["*"],
          allow_headers=["*"],
      )
  else:
      app.add_middleware(
          CORSMiddleware,
+         allow_origins=["http://localhost:3000", settings.FRONTEND_URL],
          allow_credentials=True,
          allow_methods=["*"],
          allow_headers=["*"],
      )

+ # Include routers
+ app.include_router(models.router)
+ app.include_router(stats.router)
+ app.include_router(clusters.router)

104
  @app.on_event("startup")
105
  async def startup_event():
106
+ # All variables are accessed via deps module, no need for global declarations
 
107
 
 
108
  backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
109
  root_dir = os.path.dirname(backend_dir)
110
  cache_dir = os.path.join(root_dir, "cache")
111
  os.makedirs(cache_dir, exist_ok=True)
112
 
113
  embeddings_cache = os.path.join(cache_dir, "embeddings.pkl")
114
+ graph_embeddings_cache = os.path.join(cache_dir, "graph_embeddings.pkl")
115
+ combined_embeddings_cache = os.path.join(cache_dir, "combined_embeddings.pkl")
116
  reduced_cache_umap = os.path.join(cache_dir, "reduced_umap_3d.pkl")
117
+ reduced_cache_umap_graph = os.path.join(cache_dir, "reduced_umap_3d_graph.pkl")
118
  reducer_cache_umap = os.path.join(cache_dir, "reducer_umap_3d.pkl")
119
+ reducer_cache_umap_graph = os.path.join(cache_dir, "reducer_umap_3d_graph.pkl")
120
 
121
+ sample_size = settings.get_sample_size()
122
+ if sample_size:
123
+ logger.info(f"Loading limited dataset: {sample_size} models (SAMPLE_SIZE={sample_size})")
124
  else:
+     logger.info("No SAMPLE_SIZE set, loading full dataset")
+
+ deps.df = deps.data_loader.load_data(sample_size=sample_size)
+ deps.df = deps.data_loader.preprocess_for_embedding(deps.df)
+
+ if 'model_id' in deps.df.columns:
+     deps.df.set_index('model_id', drop=False, inplace=True)

  for col in ['downloads', 'likes']:
+     if col in deps.df.columns:
+         deps.df[col] = pd.to_numeric(deps.df[col], errors='coerce').fillna(0).astype(int)

+ deps.embedder = ModelEmbedder()

+ # Load or generate text embeddings
  if os.path.exists(embeddings_cache):
      try:
+         deps.embeddings = deps.embedder.load_embeddings(embeddings_cache)
+     except (IOError, pickle.UnpicklingError, EOFError) as e:
+         logger.warning(f"Failed to load cached embeddings: {e}")
+         deps.embeddings = None
+
+ if deps.embeddings is None:
+     texts = deps.df['combined_text'].tolist()
+     deps.embeddings = deps.embedder.generate_embeddings(texts, batch_size=128)
+     deps.embedder.save_embeddings(deps.embeddings, embeddings_cache)
+
+ # Initialize graph embedder and generate graph embeddings (optional, lazy-loaded)
+ if settings.USE_GRAPH_EMBEDDINGS:
+     try:
+         deps.graph_embedder = GraphEmbedder()
+         logger.info("Building family graph for graph embeddings...")
+         graph = deps.graph_embedder.build_family_graph(deps.df)
+
+         if os.path.exists(graph_embeddings_cache):
+             try:
+                 deps.graph_embeddings_dict = deps.graph_embedder.load_embeddings(graph_embeddings_cache)
+                 logger.info(f"Loaded cached graph embeddings for {len(deps.graph_embeddings_dict)} models")
+             except (IOError, pickle.UnpicklingError, EOFError) as e:
+                 logger.warning(f"Failed to load cached graph embeddings: {e}")
+                 deps.graph_embeddings_dict = None
+
+         if deps.graph_embeddings_dict is None or len(deps.graph_embeddings_dict) == 0:
+             logger.info("Generating graph embeddings (this may take a while)...")
+             deps.graph_embeddings_dict = deps.graph_embedder.generate_graph_embeddings(graph, workers=4)
+             if deps.graph_embeddings_dict:
+                 deps.graph_embedder.save_embeddings(deps.graph_embeddings_dict, graph_embeddings_cache)
+                 logger.info(f"Generated graph embeddings for {len(deps.graph_embeddings_dict)} models")
+
+         # Combine text and graph embeddings
+         if deps.graph_embeddings_dict and len(deps.graph_embeddings_dict) > 0:
+             model_ids = deps.df['model_id'].astype(str).tolist()
+             if os.path.exists(combined_embeddings_cache):
+                 try:
+                     with open(combined_embeddings_cache, 'rb') as f:
+                         deps.combined_embeddings = pickle.load(f)
+                     logger.info("Loaded cached combined embeddings")
+                 except (IOError, pickle.UnpicklingError, EOFError) as e:
+                     logger.warning(f"Failed to load cached combined embeddings: {e}")
+                     deps.combined_embeddings = None
+
+             if deps.combined_embeddings is None:
+                 logger.info("Combining text and graph embeddings...")
+                 deps.combined_embeddings = deps.graph_embedder.combine_embeddings(
+                     deps.embeddings, deps.graph_embeddings_dict, model_ids,
+                     text_weight=0.7, graph_weight=0.3
+                 )
+                 with open(combined_embeddings_cache, 'wb') as f:
+                     pickle.dump(deps.combined_embeddings, f)
+                 logger.info("Combined embeddings saved")
      except Exception as e:
+         logger.warning(f"Graph embeddings not available: {e}. Continuing with text-only embeddings.")
+         deps.graph_embedder = None
+         deps.graph_embeddings_dict = None
+         deps.combined_embeddings = None
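
`combine_embeddings` lives in `backend/utils/graph_embeddings.py`, whose body this view does not show. A minimal sketch of the weighted combination the call above implies; the concatenation strategy and the zero-vector fallback are assumptions, not the confirmed implementation:

```python
import numpy as np

def combine_embeddings(text_emb, graph_emb_dict, model_ids,
                       text_weight=0.7, graph_weight=0.3):
    """Sketch: concatenate weighted, L2-normalized text and graph vectors.

    Models absent from graph_emb_dict fall back to a zero graph vector,
    so the result stays aligned row-for-row with model_ids.
    """
    graph_dim = len(next(iter(graph_emb_dict.values())))
    rows = []
    for i, mid in enumerate(model_ids):
        t = text_emb[i] / (np.linalg.norm(text_emb[i]) + 1e-12)
        g = np.asarray(graph_emb_dict.get(mid, np.zeros(graph_dim)), dtype=float)
        g = g / (np.linalg.norm(g) + 1e-12)
        rows.append(np.concatenate([text_weight * t, graph_weight * g]))
    return np.vstack(rows)
```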

+ # Initialize reducer for text embeddings
+ deps.reducer = DimensionReducer(method="umap", n_components=3)

  if os.path.exists(reduced_cache_umap) and os.path.exists(reducer_cache_umap):
      try:
          with open(reduced_cache_umap, 'rb') as f:
+             deps.reduced_embeddings = pickle.load(f)
+         deps.reducer.load_reducer(reducer_cache_umap)
+     except (IOError, pickle.UnpicklingError, EOFError) as e:
+         logger.warning(f"Failed to load cached reduced embeddings: {e}")
+         deps.reduced_embeddings = None
+
+ if deps.reduced_embeddings is None:
+     deps.reducer.reducer = UMAP(
          n_components=3,
          n_neighbors=30,
          min_dist=0.3,
  ...
          low_memory=True,
          spread=1.5
      )
+     deps.reduced_embeddings = deps.reducer.fit_transform(deps.embeddings)
      with open(reduced_cache_umap, 'wb') as f:
+         pickle.dump(deps.reduced_embeddings, f)
+     deps.reducer.save_reducer(reducer_cache_umap)

+ # Initialize reducer for graph-aware embeddings if available
+ if deps.combined_embeddings is not None:
+     reducer_graph = DimensionReducer(method="umap", n_components=3)
+
+     if os.path.exists(reduced_cache_umap_graph) and os.path.exists(reducer_cache_umap_graph):
+         try:
+             with open(reduced_cache_umap_graph, 'rb') as f:
+                 deps.reduced_embeddings_graph = pickle.load(f)
+             reducer_graph.load_reducer(reducer_cache_umap_graph)
+         except (IOError, pickle.UnpicklingError, EOFError) as e:
+             logger.warning(f"Failed to load cached graph-aware reduced embeddings: {e}")
+             deps.reduced_embeddings_graph = None
+
+     if deps.reduced_embeddings_graph is None:
+         reducer_graph.reducer = UMAP(
+             n_components=3,
+             n_neighbors=30,
+             min_dist=0.3,
+             metric='cosine',
+             random_state=42,
+             n_jobs=-1,
+             low_memory=True,
+             spread=1.5
+         )
+         deps.reduced_embeddings_graph = reducer_graph.fit_transform(deps.combined_embeddings)
+         with open(reduced_cache_umap_graph, 'wb') as f:
+             pickle.dump(deps.reduced_embeddings_graph, f)
+         reducer_graph.save_reducer(reducer_cache_umap_graph)
+         logger.info("Graph-aware embeddings reduced and cached")

+ # Update module-level aliases
+ df = deps.df
+ embedder = deps.embedder
+ graph_embedder = deps.graph_embedder
+ reducer = deps.reducer
+ embeddings = deps.embeddings
+ graph_embeddings_dict = deps.graph_embeddings_dict
+ combined_embeddings = deps.combined_embeddings
+ reduced_embeddings = deps.reduced_embeddings
+ reduced_embeddings_graph = deps.reduced_embeddings_graph
+
+
+ from utils.family_tree import calculate_family_depths

  def compute_clusters(reduced_embeddings: np.ndarray, n_clusters: int = 50) -> np.ndarray:
      from sklearn.cluster import KMeans

      n_samples = len(reduced_embeddings)
      if n_samples < n_clusters:
          n_clusters = max(1, n_samples // 10)

      kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
+     return kmeans.fit_predict(reduced_embeddings)


  @app.get("/")
  ...
      search_query: Optional[str] = Query(None),
      color_by: str = Query("library_name"),
      size_by: str = Query("downloads"),
+     max_points: Optional[int] = Query(None),
+     projection_method: str = Query("umap"),
+     base_models_only: bool = Query(False),
+     max_hierarchy_depth: Optional[int] = Query(None, ge=0, description="Filter to models at or below this hierarchy depth."),
+     use_graph_embeddings: bool = Query(False, description="Use graph-aware embeddings that respect family tree structure")
  ):
+     if deps.df is None:
+         raise DataNotLoadedError()
+
+     df = deps.df

      # Filter data
      filtered_df = data_loader.filter_data(
  ...
              (filtered_df['parent_model'].astype(str) == 'nan')
          ]

+     if max_hierarchy_depth is not None:
+         family_depths = calculate_family_depths(df)
+         filtered_df = filtered_df[
+             filtered_df['model_id'].astype(str).map(lambda x: family_depths.get(x, 0) <= max_hierarchy_depth)
+         ]
+
      filtered_count = len(filtered_df)

      if len(filtered_df) == 0:
  ...
          }

      if max_points is not None and len(filtered_df) > max_points:
          if 'library_name' in filtered_df.columns and filtered_df['library_name'].notna().any():
+             # Sample proportionally by library, preserving all columns
+             sampled_dfs = []
+             for lib_name, group in filtered_df.groupby('library_name', group_keys=False):
+                 n_samples = max(1, int(max_points * len(group) / len(filtered_df)))
+                 sampled_dfs.append(group.sample(min(len(group), n_samples), random_state=42))
+             filtered_df = pd.concat(sampled_dfs, ignore_index=True)
              if len(filtered_df) > max_points:
+                 filtered_df = filtered_df.sample(n=max_points, random_state=42).reset_index(drop=True)
+             else:
+                 filtered_df = filtered_df.reset_index(drop=True)
          else:
+             filtered_df = filtered_df.sample(n=max_points, random_state=42).reset_index(drop=True)

+     # Determine which embeddings to use
+     if use_graph_embeddings and combined_embeddings is not None:
+         current_embeddings = combined_embeddings
+         current_reduced = reduced_embeddings_graph
+         embedding_type = "graph-aware"
+     else:
+         if embeddings is None:
+             raise EmbeddingsNotReadyError()
+         current_embeddings = embeddings
+         current_reduced = reduced_embeddings
+         embedding_type = "text-only"
+
+     # Handle reduced embeddings loading/generation
+     if current_reduced is None or (reducer and reducer.method != projection_method.lower()):
          backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
          root_dir = os.path.dirname(backend_dir)
          cache_dir = os.path.join(root_dir, "cache")
+         cache_suffix = "_graph" if use_graph_embeddings and combined_embeddings is not None else ""
+         reduced_cache = os.path.join(cache_dir, f"reduced_{projection_method.lower()}_3d{cache_suffix}.pkl")
+         reducer_cache = os.path.join(cache_dir, f"reducer_{projection_method.lower()}_3d{cache_suffix}.pkl")

          if os.path.exists(reduced_cache) and os.path.exists(reducer_cache):
              try:
                  with open(reduced_cache, 'rb') as f:
+                     current_reduced = pickle.load(f)
                  if reducer is None or reducer.method != projection_method.lower():
                      reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
                      reducer.load_reducer(reducer_cache)
+             except (IOError, pickle.UnpicklingError, EOFError) as e:
+                 logger.warning(f"Failed to load cached reduced embeddings: {e}")
+                 current_reduced = None

+         if current_reduced is None:
              if reducer is None or reducer.method != projection_method.lower():
                  reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
              if projection_method.lower() == "umap":
  ...
                      low_memory=True,
                      spread=1.5
                  )
+             current_reduced = reducer.fit_transform(current_embeddings)
              with open(reduced_cache, 'wb') as f:
+                 pickle.dump(current_reduced, f)
              reducer.save_reducer(reducer_cache)
+
+         # Update global variable
+         if use_graph_embeddings and deps.combined_embeddings is not None:
+             deps.reduced_embeddings_graph = current_reduced
+         else:
+             deps.reduced_embeddings = current_reduced
+
+     # Get indices for filtered data
+     # Use the model_id column to map between filtered_df and the original df;
+     # this is safer than using index positions, which can change after filtering
+     filtered_model_ids = filtered_df['model_id'].astype(str).values

+     # Map model_ids to positions in the original df
      if df.index.name == 'model_id' or 'model_id' in df.index.names:
+         # When df is indexed by model_id, use get_loc directly
+         filtered_indices = []
+         for model_id in filtered_model_ids:
+             try:
+                 pos = df.index.get_loc(model_id)
+                 # Handle both single position and array of positions
+                 if isinstance(pos, (int, np.integer)):
+                     filtered_indices.append(int(pos))
+                 elif isinstance(pos, (slice, np.ndarray)):
+                     # If multiple matches, take the first
+                     if isinstance(pos, slice):
+                         filtered_indices.append(int(pos.start))
+                     else:
+                         filtered_indices.append(int(pos[0]))
+             except (KeyError, TypeError):
+                 continue
+         filtered_indices = np.array(filtered_indices, dtype=np.int32)
      else:
+         # When df is not indexed by model_id, find positions by matching the model_id column
+         df_model_ids = df['model_id'].astype(str).values
+         model_id_to_pos = {mid: pos for pos, mid in enumerate(df_model_ids)}
+         filtered_indices = np.array([
+             model_id_to_pos[mid] for mid in filtered_model_ids
+             if mid in model_id_to_pos
+         ], dtype=np.int32)
+
+     if len(filtered_indices) == 0:
+         return {
+             "models": [],
+             "embedding_type": embedding_type,
+             "filtered_count": filtered_count,
+             "returned_count": 0
+         }

+     filtered_reduced = current_reduced[filtered_indices]
      family_depths = calculate_family_depths(df)

+     # The shared cluster-label cache lives in api.routes.models so the
+     # /api/clusters router sees the same labels. Import it under an alias:
+     # the name `models` is assigned below for the response list, which would
+     # otherwise make `models.cluster_labels` raise UnboundLocalError here.
+     from api.routes import models as models_route
+     clustering_embeddings = current_reduced
+     # Compute clusters if not already computed or if the size changed;
+     # max(1, ...) guards against requesting zero clusters on small datasets
+     if models_route.cluster_labels is None or len(models_route.cluster_labels) != len(clustering_embeddings):
+         models_route.cluster_labels = compute_clusters(clustering_embeddings, n_clusters=min(50, max(1, len(clustering_embeddings) // 100)))

+     # Handle the case where cluster_labels might not match the filtered data yet
+     if models_route.cluster_labels is not None and len(models_route.cluster_labels) > 0:
+         if len(filtered_indices) <= len(models_route.cluster_labels):
+             filtered_clusters = models_route.cluster_labels[filtered_indices]
+         else:
+             # Fallback: use the first cluster for all if indices don't match
+             filtered_clusters = np.zeros(len(filtered_indices), dtype=int)
+     else:
+         filtered_clusters = np.zeros(len(filtered_indices), dtype=int)

      model_ids = filtered_df['model_id'].astype(str).values
+     library_names = filtered_df.get('library_name', pd.Series([None] * len(filtered_df))).values
+     pipeline_tags = filtered_df.get('pipeline_tag', pd.Series([None] * len(filtered_df))).values
+     downloads_arr = filtered_df.get('downloads', pd.Series([0] * len(filtered_df))).fillna(0).astype(int).values
+     likes_arr = filtered_df.get('likes', pd.Series([0] * len(filtered_df))).fillna(0).astype(int).values
+     trending_scores = filtered_df.get('trendingScore', pd.Series([None] * len(filtered_df))).values
+     tags_arr = filtered_df.get('tags', pd.Series([None] * len(filtered_df))).values
+     parent_models = filtered_df.get('parent_model', pd.Series([None] * len(filtered_df))).values
+     licenses_arr = filtered_df.get('licenses', pd.Series([None] * len(filtered_df))).values
+     created_at_arr = filtered_df.get('createdAt', pd.Series([None] * len(filtered_df))).values
+
      x_coords = filtered_reduced[:, 0].astype(float)
      y_coords = filtered_reduced[:, 1].astype(float)
      z_coords = filtered_reduced[:, 2].astype(float) if filtered_reduced.shape[1] > 2 else np.zeros(len(filtered_reduced), dtype=float)

      models = [
          ModelPoint(
              model_id=model_ids[idx],
  ...
              parent_model=parent_models[idx] if idx < len(parent_models) and pd.notna(parent_models[idx]) else None,
              licenses=licenses_arr[idx] if idx < len(licenses_arr) and pd.notna(licenses_arr[idx]) else None,
              family_depth=family_depths.get(model_ids[idx], None),
+             cluster_id=int(filtered_clusters[idx]) if idx < len(filtered_clusters) else None,
+             created_at=str(created_at_arr[idx]) if idx < len(created_at_arr) and pd.notna(created_at_arr[idx]) else None
          )
          for idx in range(len(filtered_df))
      ]

+     # Return models with metadata about the embedding type
+     return {
+         "models": models,
+         "embedding_type": embedding_type,
+         "filtered_count": filtered_count,
+         "returned_count": len(models)
+     }

  @app.get("/api/stats")
517
  async def get_stats():
518
  """Get dataset statistics."""
519
  if df is None:
520
+ raise DataNotLoadedError()
521
 
 
522
  total_models = len(df.index) if hasattr(df, 'index') else len(df)
523
 
524
+ # Get unique licenses with counts
525
+ licenses = {}
526
+ if 'license' in df.columns:
527
+ license_counts = df['license'].value_counts().to_dict()
528
+ licenses = {str(k): int(v) for k, v in license_counts.items() if pd.notna(k) and str(k) != 'nan'}
529
+
530
  return {
531
  "total_models": total_models,
532
  "unique_libraries": int(df['library_name'].nunique()) if 'library_name' in df.columns else 0,
533
  "unique_pipelines": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,
534
  "unique_task_types": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0, # Alias for clarity
535
+ "unique_licenses": len(licenses),
536
+ "licenses": licenses, # License name -> count mapping
537
  "avg_downloads": float(df['downloads'].mean()) if 'downloads' in df.columns else 0,
538
  "avg_likes": float(df['likes'].mean()) if 'likes' in df.columns else 0
539
  }
 
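For reference, the payload this endpoint now returns looks like the following; the numbers are invented for illustration, only the field names come from the code above:

```python
# Illustrative /api/stats response shape:
{
    "total_models": 500000,
    "unique_libraries": 120,
    "unique_pipelines": 40,
    "unique_task_types": 40,
    "unique_licenses": 35,
    "licenses": {"apache-2.0": 210000, "mit": 90000},
    "avg_downloads": 1542.7,
    "avg_likes": 3.2,
}
```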
  ...
  async def get_model_details(model_id: str):
      """Get detailed information about a specific model."""
      if df is None:
+         raise DataNotLoadedError()

      model = df[df.get('model_id', '') == model_id]
      if len(model) == 0:
  ...

      model = model.iloc[0]

      tags_str = str(model.get('tags', '')) if pd.notna(model.get('tags')) else ''
      arxiv_ids = extract_arxiv_ids(tags_str)

      papers = []
      if arxiv_ids:
          papers = await fetch_arxiv_papers(arxiv_ids[:5])  # Limit to 5 papers
  ...
      }


+ # Clusters endpoint is handled by routes/clusters.py router
+
  @app.get("/api/family/stats")
579
  async def get_family_stats():
580
  """
 
582
  Returns family size distribution, depth statistics, model card length by depth, etc.
583
  """
584
  if df is None:
585
+ raise DataNotLoadedError()
586
 
 
587
  family_sizes = {}
588
  root_models = set()
589
 
 
597
  family_sizes[model_id] = 0
598
  else:
599
  parent_id_str = str(parent_id)
 
600
  root = parent_id_str
601
  visited = set()
602
  while root in df.index and pd.notna(df.loc[root].get('parent_model')):
603
  parent = df.loc[root].get('parent_model')
604
  if pd.isna(parent) or str(parent) == 'nan' or str(parent) == '':
605
  break
606
+ if str(parent) in visited:
607
  break
608
  visited.add(root)
609
  root = str(parent)
 
612
  family_sizes[root] = 0
613
  family_sizes[root] += 1
614
 
 
615
  size_distribution = {}
616
  for root, size in family_sizes.items():
617
  size_distribution[size] = size_distribution.get(size, 0) + 1
618
 
 
619
  depths = calculate_family_depths(df)
620
  depth_counts = {}
621
  for depth in depths.values():
622
  depth_counts[depth] = depth_counts.get(depth, 0) + 1
623
 
 
624
  model_card_lengths_by_depth = {}
625
  if 'modelCard' in df.columns:
626
  for idx, row in df.iterrows():
 
633
  model_card_lengths_by_depth[depth] = []
634
  model_card_lengths_by_depth[depth].append(card_length)
635
 
 
636
  model_card_stats = {}
637
  for depth, lengths in model_card_lengths_by_depth.items():
638
  if lengths:
 
657
  }
658
 
659
 
660
+ @app.get("/api/family/path/{model_id}")
661
+ async def get_family_path(
662
+ model_id: str,
663
+ target_id: Optional[str] = Query(None, description="Target model ID. If None, returns path to root.")
664
+ ):
665
+ """
666
+ Get path from model to root or to target model.
667
+ Returns list of model IDs representing the path.
668
+ """
669
+ if df is None:
670
+ raise DataNotLoadedError()
671
+
672
+ model_id_str = str(model_id)
673
+
674
+ if df.index.name == 'model_id':
675
+ if model_id_str not in df.index:
676
+ raise HTTPException(status_code=404, detail="Model not found")
677
+ else:
678
+ model_rows = df[df.get('model_id', '') == model_id_str]
679
+ if len(model_rows) == 0:
680
+ raise HTTPException(status_code=404, detail="Model not found")
681
+
682
+ path = [model_id_str]
683
+ visited = set([model_id_str])
684
+ current = model_id_str
685
+
686
+ if target_id:
687
+ target_str = str(target_id)
688
+ if df.index.name == 'model_id':
689
+ if target_str not in df.index:
690
+ raise HTTPException(status_code=404, detail="Target model not found")
691
+
692
+ while current != target_str and current not in visited:
693
+ try:
694
+ if df.index.name == 'model_id':
695
+ row = df.loc[current]
696
+ else:
697
+ rows = df[df.get('model_id', '') == current]
698
+ if len(rows) == 0:
699
+ break
700
+ row = rows.iloc[0]
701
+
702
+ parent_id = row.get('parent_model')
703
+ if parent_id and pd.notna(parent_id):
704
+ parent_str = str(parent_id)
705
+ if parent_str == target_str:
706
+ path.append(parent_str)
707
+ break
708
+ if parent_str not in visited:
709
+ path.append(parent_str)
710
+ visited.add(parent_str)
711
+ current = parent_str
712
+ else:
713
+ break
714
+ else:
715
+ break
716
+ except (KeyError, IndexError):
717
+ break
718
+ else:
719
+ while True:
720
+ try:
721
+ if df.index.name == 'model_id':
722
+ row = df.loc[current]
723
+ else:
724
+ rows = df[df.get('model_id', '') == current]
725
+ if len(rows) == 0:
726
+ break
727
+ row = rows.iloc[0]
728
+
729
+ parent_id = row.get('parent_model')
730
+ if parent_id and pd.notna(parent_id):
731
+ parent_str = str(parent_id)
732
+ if parent_str not in visited:
733
+ path.append(parent_str)
734
+ visited.add(parent_str)
735
+ current = parent_str
736
+ else:
737
+ break
738
+ else:
739
+ break
740
+ except (KeyError, IndexError):
741
+ break
742
+
743
+ return {
744
+ "path": path,
745
+ "source": model_id_str,
746
+ "target": target_id if target_id else "root",
747
+ "path_length": len(path) - 1
748
+ }
749
+
750
+
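A quick client sketch for the new path endpoint. The base URL and model IDs are placeholders; note that Hub model IDs contain `/`, so they must be URL-encoded here unless the route is declared as `{model_id:path}`:

```python
import httpx
from urllib.parse import quote

BASE = "http://localhost:8000"  # default port from __main__ below
model_id = quote("org/fine-tuned-model", safe="")  # hypothetical ID

with httpx.Client(base_url=BASE) as client:
    # Walk up to the family root
    root_path = client.get(f"/api/family/path/{model_id}").json()
    print(root_path["path"], root_path["path_length"])

    # Or stop at a specific ancestor
    partial = client.get(f"/api/family/path/{model_id}",
                         params={"target_id": "org/base-model"}).json()
```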
  @app.get("/api/family/{model_id}")
+ async def get_family_tree(
+     model_id: str,
+     max_depth: Optional[int] = Query(None, ge=1, le=100, description="Maximum depth to traverse. If None, traverses the entire tree without limit."),
+     max_depth_filter: Optional[int] = Query(None, ge=0, description="Filter results to models at or below this hierarchy depth.")
+ ):
      """
      Get family tree for a model (ancestors and descendants).
      Returns the model, its parent chain, and all children.
+
+     If max_depth is None, traverses the entire family tree without depth limits.
      """
      if df is None:
+         raise DataNotLoadedError()

      if reduced_embeddings is None:
          raise HTTPException(status_code=503, detail="Embeddings not ready")

+     model_id_str = str(model_id)

+     if df.index.name == 'model_id':
+         if model_id_str not in df.index:
+             raise HTTPException(status_code=404, detail="Model not found")
+         model_lookup = df.loc
+     else:
+         model_rows = df[df.get('model_id', '') == model_id_str]
+         if len(model_rows) == 0:
+             raise HTTPException(status_code=404, detail="Model not found")
+         model_lookup = lambda x: df[df.get('model_id', '') == x]
+
+     from utils.network_analysis import _get_all_parents, _parse_parent_list
+
+     children_index: Dict[str, List[str]] = {}
+     parent_columns = ['parent_model', 'finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']
+
+     for idx, row in df.iterrows():
+         model_id_from_row = str(row.get('model_id', idx))
+         all_parents = _get_all_parents(row)
+
+         for rel_type, parent_list in all_parents.items():
+             for parent_str in parent_list:
+                 if parent_str not in children_index:
+                     children_index[parent_str] = []
+                 children_index[parent_str].append(model_id_from_row)
+
+     visited = set()
+
+     def get_ancestors(current_id: str, depth: Optional[int]):
+         if current_id in visited:
+             return
+         if depth is not None and depth <= 0:
              return
          visited.add(current_id)

+         try:
+             if df.index.name == 'model_id':
+                 row = df.loc[current_id]
+             else:
+                 rows = model_lookup(current_id)
+                 if len(rows) == 0:
+                     return
+                 row = rows.iloc[0]
+
+             all_parents = _get_all_parents(row)
+             for rel_type, parent_list in all_parents.items():
+                 for parent_str in parent_list:
+                     if parent_str != 'nan' and parent_str != '':
+                         next_depth = depth - 1 if depth is not None else None
+                         get_ancestors(parent_str, next_depth)
+         except (KeyError, IndexError):
+             return

+     def get_descendants(current_id: str, depth: Optional[int]):
+         if current_id in visited:
+             return
+         if depth is not None and depth <= 0:
              return
          visited.add(current_id)

+         children = children_index.get(current_id, [])
+         for child_id in children:
+             if child_id not in visited:
+                 next_depth = depth - 1 if depth is not None else None
+                 get_descendants(child_id, next_depth)
+
+     get_ancestors(model_id_str, max_depth)
+     # `visited` now holds the ancestor set, including the root itself. Reset
+     # it for the descendant walk (get_descendants would otherwise return
+     # immediately), but keep a copy so ancestors are not dropped from the family.
+     ancestors = set(visited)
+     visited = set()
+     get_descendants(model_id_str, max_depth)
+     visited |= ancestors
+     visited.add(model_id_str)
+
+     if df.index.name == 'model_id':
          try:
              family_df = df.loc[list(visited)]
          except KeyError:
+             missing = [v for v in visited if v not in df.index]
+             if missing:
+                 logger.warning(f"Some family members not found in index: {missing}")
+             family_df = df.loc[[v for v in visited if v in df.index]]
      else:
          family_df = df[df.get('model_id', '').isin(visited)]

+     if len(family_df) == 0:
+         raise HTTPException(status_code=404, detail="Family tree data not available")
+
+     if df.index.name == 'model_id':
+         # The index holds model_id strings, so convert the family rows to
+         # integer positions before indexing the embeddings array
+         family_indices = df.index.get_indexer(family_df.index)
+         family_indices = family_indices[family_indices >= 0]
+     else:
+         family_indices = family_df.index.values
+     if len(family_indices) > len(reduced_embeddings):
+         raise HTTPException(status_code=503, detail="Embedding indices mismatch")
+
      family_reduced = reduced_embeddings[family_indices]

      family_map = {}
+     depths = calculate_family_depths(df)  # computed once here rather than once per row
      for idx, (i, row) in enumerate(family_df.iterrows()):
+         model_id_val = str(row.get('model_id', i))
+         parent_id = row.get('parent_model')
+         parent_id_str = str(parent_id) if parent_id and pd.notna(parent_id) else None
+         model_depth = depths.get(model_id_val, 0)
+
+         if max_depth_filter is not None and model_depth > max_depth_filter:
+             continue

          family_map[model_id_val] = {
              "model_id": model_id_val,
  ...
              "pipeline_tag": str(row.get('pipeline_tag')) if pd.notna(row.get('pipeline_tag')) else None,
              "downloads": int(row.get('downloads', 0)) if pd.notna(row.get('downloads')) else 0,
              "likes": int(row.get('likes', 0)) if pd.notna(row.get('likes')) else 0,
+             "parent_model": parent_id_str,
              "licenses": str(row.get('licenses')) if pd.notna(row.get('licenses')) else None,
+             "family_depth": model_depth,
              "children": []
          }

      root_models = []
      for model_id_val, model_data in family_map.items():
          parent_id = model_data["parent_model"]
  ...
              root_models.append(model_id_val)

      return {
+         "root_model": model_id_str,
          "family": list(family_map.values()),
          "family_map": family_map,
          "root_models": root_models
      }

  @app.get("/api/search")
905
  async def search_models(
906
+ q: str = Query(..., min_length=1, alias="query"),
907
+ query: str = Query(None, min_length=1),
908
+ limit: int = Query(20, ge=1, le=100),
909
  graph_aware: bool = Query(False),
910
  include_neighbors: bool = Query(True)
911
  ):
 
914
  Enhanced with graph-aware search option that includes network relationships.
915
  """
916
  if df is None:
917
+ raise DataNotLoadedError()
918
+
919
+ # Support both 'q' and 'query' parameters
920
+ search_query = query or q
921
 
922
  if graph_aware:
 
923
  try:
924
  network_builder = ModelNetworkBuilder(df)
 
925
  top_models = network_builder.get_top_models_by_field(n=1000)
926
  model_ids = [mid for mid, _ in top_models]
927
  graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')
928
 
929
  results = network_builder.search_graph_aware(
930
+ query=search_query,
931
  graph=graph,
932
+ max_results=limit,
933
  include_neighbors=include_neighbors
934
  )
935
 
936
+ return {"results": results, "search_type": "graph_aware", "query": search_query}
937
+ except (ValueError, KeyError, AttributeError) as e:
938
+ logger.warning(f"Graph-aware search failed, falling back to basic search: {e}")
939
+
940
+ query_lower = search_query.lower()
941
+
942
+ # Enhanced search: search model_id, org, tags, library, pipeline
943
+ model_id_col = df.get('model_id', '').astype(str).str.lower()
944
+ library_col = df.get('library_name', '').astype(str).str.lower()
945
+ pipeline_col = df.get('pipeline_tag', '').astype(str).str.lower()
946
+ tags_col = df.get('tags', '').astype(str).str.lower()
947
+ license_col = df.get('license', '').astype(str).str.lower()
948
+
949
+ # Extract org from model_id
950
+ org_col = model_id_col.str.split('/').str[0]
951
+
952
+ # Multi-field search
953
+ mask = (
954
+ model_id_col.str.contains(query_lower, na=False) |
955
+ org_col.str.contains(query_lower, na=False) |
956
+ library_col.str.contains(query_lower, na=False) |
957
+ pipeline_col.str.contains(query_lower, na=False) |
958
+ tags_col.str.contains(query_lower, na=False) |
959
+ license_col.str.contains(query_lower, na=False)
960
+ )
961
 
962
+ matches = df[mask].head(limit)
 
 
 
963
 
964
  results = []
965
  for _, row in matches.iterrows():
966
+ model_id = str(row.get('model_id', ''))
967
+ org = model_id.split('/')[0] if '/' in model_id else ''
968
+
969
+ # Get coordinates if available
970
+ x = float(row.get('x', 0.0)) if 'x' in row else None
971
+ y = float(row.get('y', 0.0)) if 'y' in row else None
972
+ z = float(row.get('z', 0.0)) if 'z' in row else None
973
+
974
  results.append({
975
+ "model_id": model_id,
976
+ "x": x,
977
+ "y": y,
978
+ "z": z,
979
+ "org": org,
980
+ "library": row.get('library_name'),
981
+ "pipeline": row.get('pipeline_tag'),
982
+ "license": row.get('license') if pd.notna(row.get('license')) else None,
983
  "downloads": int(row.get('downloads', 0)),
984
  "likes": int(row.get('likes', 0)),
985
  "parent_model": row.get('parent_model') if pd.notna(row.get('parent_model')) else None,
986
  "match_type": "direct"
987
  })
988
 
989
+ return {"results": results, "search_type": "basic", "query": search_query}

  @app.get("/api/similar/{model_id}")
  ...
      Get k-nearest neighbors of a model based on embedding similarity.
      Returns similar models with distance scores.
      """
+     if deps.df is None or deps.embeddings is None:
          raise HTTPException(status_code=503, detail="Data not loaded")

+     df = deps.df
+     embeddings = deps.embeddings
+
      if 'model_id' in df.index.names or df.index.name == 'model_id':
          try:
              model_row = df.loc[[model_id]]
  ...
      model_idx = model_row.index[0]
      model_embedding = embeddings[model_idx]

      from sklearn.metrics.pairwise import cosine_similarity
      model_embedding_2d = model_embedding.reshape(1, -1)
      similarities = cosine_similarity(model_embedding_2d, embeddings)[0]

      top_k_indices = np.argpartition(similarities, -k-1)[-k-1:-1]
      top_k_indices = top_k_indices[np.argsort(similarities[top_k_indices])][::-1]

      similar_models = []
  ...
          similar_models.append({
              "model_id": row.get('model_id', 'Unknown'),
              "similarity": float(similarities[idx]),
+             "distance": float(1 - similarities[idx]),
              "library_name": row.get('library_name'),
              "pipeline_tag": row.get('pipeline_tag'),
              "downloads": int(row.get('downloads', 0)),
  ...
      Returns models with their similarity scores and coordinates.
      Useful for exploring the embedding space around a specific model.
      """
+     if deps.df is None or deps.embeddings is None:
          raise HTTPException(status_code=503, detail="Data not loaded")

+     df = deps.df
+     embeddings = deps.embeddings
+
      # Find the query model
      if 'model_id' in df.index.names or df.index.name == 'model_id':
          try:
  ...

      query_embedding = embeddings[model_idx]

      filtered_df = data_loader.filter_data(
          df=df,
          min_downloads=min_downloads,
  ...
          pipeline_tags=None
      )

      if df.index.name == 'model_id' or 'model_id' in df.index.names:
          filtered_indices = [df.index.get_loc(idx) for idx in filtered_df.index]
          filtered_indices = np.array(filtered_indices, dtype=int)
      else:
          filtered_indices = filtered_df.index.values.astype(int)

      filtered_embeddings = embeddings[filtered_indices]
      from sklearn.metrics.pairwise import cosine_similarity
      query_embedding_2d = query_embedding.reshape(1, -1)
      similarities = cosine_similarity(query_embedding_2d, filtered_embeddings)[0]

      top_k_local_indices = np.argpartition(similarities, -k)[-k:]
      top_k_local_indices = top_k_local_indices[np.argsort(similarities[top_k_local_indices])][::-1]

      if reduced_embeddings is None:
          raise HTTPException(status_code=503, detail="Reduced embeddings not ready")

      top_k_original_indices = filtered_indices[top_k_local_indices]
      top_k_reduced = reduced_embeddings[top_k_original_indices]

      similar_models = []
      for i, orig_idx in enumerate(top_k_original_indices):
          row = df.iloc[orig_idx]

1141
  """
1142
  Calculate distance/similarity between two models.
1143
  """
1144
+ if deps.df is None or deps.embeddings is None:
 
 
1145
  raise HTTPException(status_code=503, detail="Data not loaded")
1146
 
1147
+ df = deps.df
1148
+ embeddings = deps.embeddings
1149
+
1150
  # Find both models - optimized with index lookup
1151
  if 'model_id' in df.index.names or df.index.name == 'model_id':
1152
  try:
 
1183
  Export selected models as JSON with full metadata.
1184
  """
1185
  if df is None:
1186
+ raise DataNotLoadedError()
1187
 
1188
  # Optimized export with index lookup
1189
  if 'model_id' in df.index.names or df.index.name == 'model_id':
 
1198
  if len(exported) == 0:
1199
  return {"models": []}
1200
 
 
1201
  models = [
1202
  {
1203
  "model_id": str(row.get('model_id', '')),
 
1235
  Returns network graph data suitable for visualization.
1236
  """
1237
  if df is None:
1238
+ raise DataNotLoadedError()
1239
 
1240
  try:
1241
  network_builder = ModelNetworkBuilder(df)
 
 
1242
  top_models = network_builder.get_top_models_by_field(
1243
  library=library,
1244
  pipeline_tag=pipeline_tag,
 
1255
  }
1256
 
1257
  model_ids = [mid for mid, _ in top_models]
 
 
1258
  graph = network_builder.build_cooccurrence_network(
1259
  model_ids=model_ids,
1260
  cooccurrence_method=cooccurrence_method
1261
  )
1262
 
 
1263
  nodes = []
1264
  for node_id, attrs in graph.nodes(data=True):
1265
  nodes.append({
 
1287
  "links": links,
1288
  "statistics": stats
1289
  }
1290
+ except (ValueError, KeyError, AttributeError) as e:
1291
+ logger.error(f"Error building network: {e}", exc_info=True)
1292
  raise HTTPException(status_code=500, detail=f"Error building network: {str(e)}")
1293
 
1294
 
1295
  @app.get("/api/network/family/{model_id}")
1296
  async def get_family_network(
1297
  model_id: str,
1298
+ max_depth: Optional[int] = Query(None, ge=1, le=100, description="Maximum depth to traverse. If None, traverses entire tree without limit."),
1299
+ edge_types: Optional[str] = Query(None, description="Comma-separated list of edge types to include (finetune,quantized,adapter,merge,parent). If None, includes all types."),
1300
+ include_edge_attributes: bool = Query(True, description="Whether to include edge attributes (change in likes, downloads, etc.)")
1301
  ):
1302
  """
1303
  Build family tree network for a model (directed graph).
1304
+ Returns network graph data showing parent-child relationships with multiple relationship types.
1305
+ Supports filtering by edge type (finetune, quantized, adapter, merge, parent).
1306
  """
1307
  if df is None:
1308
+ raise DataNotLoadedError()
1309
 
1310
  try:
1311
+ filter_types = None
1312
+ if edge_types:
1313
+ filter_types = [t.strip() for t in edge_types.split(',') if t.strip()]
1314
+
1315
  network_builder = ModelNetworkBuilder(df)
1316
  graph = network_builder.build_family_tree_network(
1317
  root_model_id=model_id,
1318
+ max_depth=max_depth,
1319
+ include_edge_attributes=include_edge_attributes,
1320
+ filter_edge_types=filter_types
1321
  )
1322
 
 
1323
  nodes = []
1324
  for node_id, attrs in graph.nodes(data=True):
1325
  nodes.append({
1326
  "id": node_id,
1327
  "title": attrs.get('title', node_id),
1328
+ "freq": attrs.get('freq', 0),
1329
+ "likes": attrs.get('likes', 0),
1330
+ "downloads": attrs.get('downloads', 0),
1331
+ "library": attrs.get('library', ''),
1332
+ "pipeline": attrs.get('pipeline', '')
1333
  })
1334
 
1335
  links = []
1336
+ for source, target, edge_attrs in graph.edges(data=True):
1337
+ link_data = {
1338
  "source": source,
1339
+ "target": target,
1340
+ "edge_type": edge_attrs.get('edge_type'),
1341
+ "edge_types": edge_attrs.get('edge_types', [])
1342
+ }
1343
+
1344
+ if include_edge_attributes:
1345
+ link_data.update({
1346
+ "change_in_likes": edge_attrs.get('change_in_likes'),
1347
+ "percentage_change_in_likes": edge_attrs.get('percentage_change_in_likes'),
1348
+ "change_in_downloads": edge_attrs.get('change_in_downloads'),
1349
+ "percentage_change_in_downloads": edge_attrs.get('percentage_change_in_downloads'),
1350
+ "change_in_createdAt_days": edge_attrs.get('change_in_createdAt_days')
1351
+ })
1352
+
1353
+ links.append(link_data)
1354
 
1355
  stats = network_builder.get_network_statistics(graph)
1356
 
 
1360
  "statistics": stats,
1361
  "root_model": model_id
1362
  }
1363
+ except (ValueError, KeyError, AttributeError) as e:
1364
+ logger.error(f"Error building family network: {e}", exc_info=True)
1365
  raise HTTPException(status_code=500, detail=f"Error building family network: {str(e)}")
1366
 
1367
 
 
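Example request against this endpoint, filtering to fine-tune and quantization edges only; the model ID is a placeholder and is URL-encoded for the same slash-in-path reason noted earlier:

```python
import httpx
from urllib.parse import quote

model_id = quote("org/base-model", safe="")  # hypothetical ID
resp = httpx.get(
    f"http://localhost:8000/api/network/family/{model_id}",
    params={"edge_types": "finetune,quantized", "max_depth": 3},
)
graph = resp.json()
print(len(graph["nodes"]), "nodes,", len(graph["links"]), "links")
```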
  ...
      Similar to graph database queries for finding connected nodes.
      """
      if df is None:
+         raise DataNotLoadedError()

      try:
          network_builder = ModelNetworkBuilder(df)
          top_models = network_builder.get_top_models_by_field(n=1000)
          model_ids = [mid for mid, _ in top_models]
          graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')
  ...
              "neighbors": neighbors,
              "count": len(neighbors)
          }
+     except (ValueError, KeyError, AttributeError) as e:
+         logger.error(f"Error finding neighbors: {e}", exc_info=True)
          raise HTTPException(status_code=500, detail=f"Error finding neighbors: {str(e)}")

  ...
      Similar to graph database path queries.
      """
      if df is None:
+         raise DataNotLoadedError()

      try:
          network_builder = ModelNetworkBuilder(df)
  ...
      Similar to graph database queries for co-assignment patterns.
      """
      if df is None:
+         raise DataNotLoadedError()

      try:
          network_builder = ModelNetworkBuilder(df)
  ...
      Similar to graph database relationship queries.
      """
      if df is None:
+         raise DataNotLoadedError()

      try:
          network_builder = ModelNetworkBuilder(df)
  ...
  async def get_current_model_count(
      use_cache: bool = Query(True),
      force_refresh: bool = Query(False),
+     use_dataset_snapshot: bool = Query(False),
+     use_models_page: bool = Query(True)
  ):
      """
      Get the current number of models on Hugging Face Hub.
+     Uses multiple strategies: models page scraping (fastest), dataset snapshot, or API.

      Query Parameters:
          use_cache: Use cached results if available (default: True)
          force_refresh: Force refresh even if cache is valid (default: False)
+         use_dataset_snapshot: Use dataset snapshot for breakdowns (default: False)
+         use_models_page: Try to get count from HF models page first (default: True)
      """
      try:
+         tracker = get_tracker()
+
          if use_dataset_snapshot:
+             count_data = tracker.get_count_from_models_page()
              if count_data is None:
+                 count_data = tracker.get_current_model_count(use_models_page=False)
+             else:
+                 try:
+                     from utils.data_loader import ModelDataLoader
+                     data_loader = ModelDataLoader()
+                     df = data_loader.load_data(sample_size=10000)
+                     library_counts = {}
+                     pipeline_counts = {}
+
+                     for _, row in df.iterrows():
+                         if pd.notna(row.get('library_name')):
+                             lib = str(row.get('library_name'))
+                             library_counts[lib] = library_counts.get(lib, 0) + 1
+                         if pd.notna(row.get('pipeline_tag')):
+                             pipeline = str(row.get('pipeline_tag'))
+                             pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
+
+                     if len(df) > 0 and count_data["total_models"] > len(df):
+                         scale_factor = count_data["total_models"] / len(df)
+                         library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
+                         pipeline_counts = {k: int(v * scale_factor) for k, v in pipeline_counts.items()}
+
+                     count_data["models_by_library"] = library_counts
+                     count_data["models_by_pipeline"] = pipeline_counts
+                 except Exception as e:
+                     logger.warning(f"Could not get breakdowns from dataset: {e}")
          else:
+             count_data = tracker.get_current_model_count(use_models_page=use_models_page)

          return count_data
      except Exception as e:
+         logger.error(f"Error fetching model count: {e}", exc_info=True)
          raise HTTPException(status_code=500, detail=f"Error fetching model count: {str(e)}")

  ...
      try:
          from datetime import datetime

+         tracker = get_tracker()

          start = None
          end = None
  ...
  async def get_latest_model_count():
      """Get the most recently recorded model count from database."""
      try:
+         tracker = get_tracker()
          latest = tracker.get_latest_count()
          if latest is None:
              raise HTTPException(status_code=404, detail="No model counts recorded yet")
  ...
          use_dataset_snapshot: Use dataset snapshot instead of API (faster, default: False)
      """
      try:
+         tracker = get_tracker()

          def record():
              if use_dataset_snapshot:
                  count_data = tracker.get_count_from_dataset_snapshot()
                  if count_data:
                      tracker.record_count(count_data, source="dataset_snapshot")
                  else:
                      count_data = tracker.get_current_model_count(use_cache=False)
                      tracker.record_count(count_data, source="api")
              else:
  ...
          days: Number of days to analyze
      """
      try:
+         tracker = get_tracker()
          stats = tracker.get_growth_stats(days)
          return stats
      except Exception as e:
  ...
      Similar to Open Syllabus graph export functionality.
      """
      if df is None:
+         raise DataNotLoadedError()

      try:
          network_builder = ModelNetworkBuilder(df)

          top_models = network_builder.get_top_models_by_field(
              library=library,
              pipeline_tag=pipeline_tag,
  ...
              raise HTTPException(status_code=404, detail="No models found matching criteria")

          model_ids = [mid for mid, _ in top_models]
          graph = network_builder.build_cooccurrence_network(
              model_ids=model_ids,
              cooccurrence_method=cooccurrence_method
          )

          with tempfile.NamedTemporaryFile(mode='w', suffix='.graphml', delete=False) as tmp_file:
              tmp_path = tmp_file.name
          network_builder.export_graphml(graph, tmp_path)

          background_tasks.add_task(os.unlink, tmp_path)

          return FileResponse(
              tmp_path,
              media_type='application/xml',
              filename=f'network_{cooccurrence_method}_{n}_models.graphml'
          )
+     except (ValueError, KeyError, AttributeError, IOError) as e:
+         logger.error(f"Error exporting network: {e}", exc_info=True)
          raise HTTPException(status_code=500, detail=f"Error exporting network: {str(e)}")

  ...
      Extracts arXiv IDs from model tags and fetches paper information.
      """
      if df is None:
+         raise DataNotLoadedError()

      model = df[df.get('model_id', '') == model_id]
      if len(model) == 0:
  ...
      }


+ @app.get("/api/models/minimal.bin")
+ async def get_minimal_binary():
+     """
+     Serve the binary minimal dataset file.
+     This is optimized for fast client-side loading.
+     """
+     backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+     root_dir = os.path.dirname(backend_dir)
+     binary_path = os.path.join(root_dir, "cache", "binary", "embeddings.bin")
+
+     if not os.path.exists(binary_path):
+         raise HTTPException(status_code=404, detail="Binary dataset not found. Run export_binary.py first.")
+
+     return FileResponse(
+         binary_path,
+         media_type="application/octet-stream",
+         headers={
+             "Content-Disposition": "attachment; filename=embeddings.bin",
+             "Cache-Control": "public, max-age=3600"
+         }
+     )
+
+
+ @app.get("/api/models/model_ids.json")
+ async def get_model_ids_json():
+     """Serve the model IDs JSON file."""
+     backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+     root_dir = os.path.dirname(backend_dir)
+     json_path = os.path.join(root_dir, "cache", "binary", "model_ids.json")
+
+     if not os.path.exists(json_path):
+         raise HTTPException(status_code=404, detail="Model IDs file not found.")
+
+     return FileResponse(
+         json_path,
+         media_type="application/json",
+         headers={"Cache-Control": "public, max-age=3600"}
+     )
+
+
+ @app.get("/api/models/metadata.json")
+ async def get_metadata_json():
+     """Serve the metadata JSON file with lookup tables."""
+     backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+     root_dir = os.path.dirname(backend_dir)
+     json_path = os.path.join(root_dir, "cache", "binary", "metadata.json")
+
+     if not os.path.exists(json_path):
+         raise HTTPException(status_code=404, detail="Metadata file not found.")
+
+     return FileResponse(
+         json_path,
+         media_type="application/json",
+         headers={"Cache-Control": "public, max-age=3600"}
+     )
+
+
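The binary layout of `embeddings.bin` is defined by `backend/scripts/export_binary.py`, which this view truncates; the client sketch below assumes a flat little-endian float32 coordinate array purely for illustration:

```python
import httpx
import numpy as np

base = "http://localhost:8000/api/models"
ids = httpx.get(f"{base}/model_ids.json").json()
meta = httpx.get(f"{base}/metadata.json").json()
raw = httpx.get(f"{base}/minimal.bin").content

# Assumed layout: one float32 row of coordinates per model id
coords = np.frombuffer(raw, dtype=np.float32).reshape(len(ids), -1)
print(coords.shape, "coordinate rows for", len(ids), "models")
```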
  @app.get("/api/model/{model_id}/files")
  async def get_model_files(model_id: str, branch: str = Query("main")):
      """
      Get file tree for a model from Hugging Face.
      Proxies the request to avoid CORS issues.
+     Returns a flat list of files with path and size information.
      """
+     if not model_id or not model_id.strip():
+         raise HTTPException(status_code=400, detail="Invalid model ID")
+
+     # Try the requested branch first, then fall back to the other common
+     # default branch names (equivalent to the original nested conditional,
+     # just easier to read)
+     branches_to_try = [branch] + [b for b in ("main", "master") if b != branch]
+
      try:
+         async with httpx.AsyncClient(timeout=15.0) as client:
              for branch_name in branches_to_try:
                  try:
                      url = f"https://huggingface.co/api/models/{model_id}/tree/{branch_name}"
                      response = await client.get(url)
+
                      if response.status_code == 200:
+                         data = response.json()
+                         # Ensure we return an array
+                         if isinstance(data, list):
+                             return data
+                         elif isinstance(data, dict) and 'tree' in data:
+                             return data['tree']
+                         else:
+                             return []
+
+                     elif response.status_code == 404:
+                         # Try next branch
+                         continue
+                     else:
+                         logger.warning(f"Unexpected status {response.status_code} for {url}")
+                         continue
+
+                 except httpx.HTTPStatusError as e:
+                     if e.response.status_code == 404:
+                         continue  # Try next branch
+                     logger.warning(f"HTTP error for branch {branch_name}: {e}")
+                     continue
+                 except httpx.HTTPError as e:
+                     logger.warning(f"HTTP error for branch {branch_name}: {e}")
                      continue

+         # All branches failed
+         raise HTTPException(
+             status_code=404,
+             detail=f"File tree not found for model '{model_id}'. The model may not exist or may not have any files."
+         )
+
      except httpx.TimeoutException:
+         raise HTTPException(
+             status_code=504,
+             detail="Request to Hugging Face timed out. Please try again later."
+         )
+     except HTTPException:
+         raise  # Re-raise HTTP exceptions
      except Exception as e:
+         logger.error(f"Error fetching file tree: {e}", exc_info=True)
+         raise HTTPException(
+             status_code=500,
+             detail=f"Error fetching file tree: {str(e)}"
+         )


  if __name__ == "__main__":
      import uvicorn
      port = int(os.getenv("PORT", 8000))
      uvicorn.run(app, host="0.0.0.0", port=port)

backend/api/routes/__init__.py ADDED
@@ -0,0 +1,6 @@
+ """
+ API route modules.
+ """
+ from . import models, stats, clusters
+
+ __all__ = ['models', 'stats', 'clusters']
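The registration of these routers in `main.py` is not visible in this view; the usual FastAPI wiring for modules like these would be:

```python
from api.routes import models, stats, clusters

app.include_router(models.router)
app.include_router(stats.router)
app.include_router(clusters.router)
```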
backend/api/routes/clusters.py ADDED
@@ -0,0 +1,102 @@
+ """
+ API routes for cluster endpoints.
+ """
+ from fastapi import APIRouter
+ import numpy as np
+ import pandas as pd
+ from core.exceptions import DataNotLoadedError
+ import api.dependencies as deps
+
+ router = APIRouter(prefix="/api", tags=["clusters"])
+
+
+ @router.get("/clusters")
+ async def get_clusters():
+     """Get all clusters with metadata and hierarchical labels."""
+     if deps.df is None:
+         raise DataNotLoadedError()
+
+     # Import cluster_labels from models route
+     from api.routes.models import cluster_labels
+
+     # If clusters haven't been computed yet, return empty list instead of error
+     # This allows the frontend to work while data is still loading
+     if cluster_labels is None:
+         return {"clusters": []}
+
+     df = deps.df
+
+     # Generate hierarchical labels for clusters
+     clusters = []
+     unique_clusters = np.unique(cluster_labels)
+
+     for cluster_id in unique_clusters:
+         cluster_mask = cluster_labels == cluster_id
+         cluster_models = df[cluster_mask]
+
+         if len(cluster_models) == 0:
+             continue
+
+         # Generate hierarchical label
+         library_counts = cluster_models['library_name'].value_counts()
+         pipeline_counts = cluster_models['pipeline_tag'].value_counts()
+
+         # Determine primary domain/library
+         if len(library_counts) > 0:
+             primary_lib = library_counts.index[0]
+             if primary_lib and pd.notna(primary_lib):
+                 if 'transformers' in str(primary_lib).lower():
+                     domain = "NLP"
+                 elif 'diffusers' in str(primary_lib).lower():
+                     domain = "Multimodal"
+                 elif 'timm' in str(primary_lib).lower():
+                     domain = "Computer Vision"
+                 else:
+                     domain = str(primary_lib).replace('_', ' ').title()
+             else:
+                 domain = "Other"
+         else:
+             domain = "Other"
+
+         # Determine subdomain from pipeline
+         if len(pipeline_counts) > 0:
+             primary_pipeline = pipeline_counts.index[0]
+             if primary_pipeline and pd.notna(primary_pipeline):
+                 subdomain = str(primary_pipeline).replace('-', ' ').replace('_', ' ').title()
+             else:
+                 subdomain = "General"
+         else:
+             subdomain = "General"
+
+         # Determine characteristics
+         characteristics = []
+         model_ids_lower = cluster_models['model_id'].astype(str).str.lower()
+         if model_ids_lower.str.contains('gpt', na=False).any():
+             characteristics.append("GPT-based")
+         if cluster_models['parent_model'].notna().any():
+             characteristics.append("Fine-tuned")
+         if not characteristics:
+             characteristics.append("Base Models")
+
+         char_str = "; ".join(characteristics)
+         label = f"{domain} — {subdomain} ({char_str})"
+
+         # Generate color (use consistent colors based on cluster_id)
+         colors = [
+             "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
+             "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"
+         ]
+         color = colors[cluster_id % len(colors)]
+
+         clusters.append({
+             "cluster_id": int(cluster_id),
+             "cluster_label": label,
+             "count": int(len(cluster_models)),
+             "color": color
+         })
+
+     # Sort by count descending
+     clusters.sort(key=lambda x: x["count"], reverse=True)
+
+     return {"clusters": clusters}
+
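The response this router produces has the following shape; the values are invented for illustration, and the label format combines domain, subdomain, and characteristics as built above:

```python
# Illustrative /api/clusters payload:
{
    "clusters": [
        {
            "cluster_id": 7,
            "cluster_label": "NLP — Text Generation (GPT-based; Fine-tuned)",
            "count": 1243,
            "color": "#7f7f7f",  # colors[7 % 10]
        },
        # ... sorted by count, descending
    ]
}
```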
backend/api/routes/models.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ API routes for model data endpoints.
3
+ """
4
+ from typing import Optional
5
+ from fastapi import APIRouter, Query, HTTPException
6
+ import numpy as np
7
+ import pandas as pd
8
+ import pickle
9
+ import os
10
+ import logging
11
+
12
+ from umap import UMAP
13
+ from models.schemas import ModelPoint
14
+ from utils.family_tree import calculate_family_depths
15
+ from utils.dimensionality_reduction import DimensionReducer
16
+ from core.exceptions import DataNotLoadedError, EmbeddingsNotReadyError
17
+ import api.dependencies as deps
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ router = APIRouter(prefix="/api", tags=["models"])
22
+
23
+ # Global cluster labels cache (shared across routes)
24
+ cluster_labels = None
25
+
26
+
27
+ def compute_clusters(reduced_embeddings: np.ndarray, n_clusters: int = 50) -> np.ndarray:
28
+ from sklearn.cluster import KMeans
29
+
30
+ n_samples = len(reduced_embeddings)
31
+ if n_samples < n_clusters:
32
+ n_clusters = max(1, n_samples // 10)
33
+
34
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
35
+ return kmeans.fit_predict(reduced_embeddings)
36
+
37
+
38
+ @router.get("/models")
39
+ async def get_models(
40
+ min_downloads: int = Query(0),
41
+ min_likes: int = Query(0),
42
+ search_query: Optional[str] = Query(None),
43
+ color_by: str = Query("library_name"),
44
+ size_by: str = Query("downloads"),
45
+ max_points: Optional[int] = Query(None),
46
+ projection_method: str = Query("umap"),
47
+ base_models_only: bool = Query(False),
48
+ max_hierarchy_depth: Optional[int] = Query(None, ge=0, description="Filter to models at or below this hierarchy depth."),
49
+ use_graph_embeddings: bool = Query(False, description="Use graph-aware embeddings that respect family tree structure")
50
+ ):
51
+ if deps.df is None:
52
+ raise DataNotLoadedError()
53
+
54
+ df = deps.df
55
+ data_loader = deps.data_loader
56
+
57
+ # Filter data
58
+ filtered_df = data_loader.filter_data(
59
+ df=df,
60
+ min_downloads=min_downloads,
61
+ min_likes=min_likes,
62
+ search_query=search_query,
63
+ libraries=None,
64
+ pipeline_tags=None
65
+ )
66
+
67
+ if base_models_only:
68
+ if 'parent_model' in filtered_df.columns:
69
+ filtered_df = filtered_df[
70
+ filtered_df['parent_model'].isna() |
71
+ (filtered_df['parent_model'].astype(str).str.strip() == '') |
72
+ (filtered_df['parent_model'].astype(str) == 'nan')
73
+ ]
74
+
75
+ if max_hierarchy_depth is not None:
76
+ family_depths = calculate_family_depths(df)
77
+ filtered_df = filtered_df[
78
+ filtered_df['model_id'].astype(str).map(lambda x: family_depths.get(x, 0) <= max_hierarchy_depth)
79
+ ]
80
+
81
+ filtered_count = len(filtered_df)
82
+
83
+ if len(filtered_df) == 0:
84
+ return {
85
+ "models": [],
86
+ "filtered_count": 0,
87
+ "returned_count": 0
88
+ }
89
+
90
+ if max_points is not None and len(filtered_df) > max_points:
91
+ if 'library_name' in filtered_df.columns and filtered_df['library_name'].notna().any():
92
+ sampled_dfs = []
93
+ for lib_name, group in filtered_df.groupby('library_name', group_keys=False):
94
+ n_samples = max(1, int(max_points * len(group) / len(filtered_df)))
95
+ sampled_dfs.append(group.sample(min(len(group), n_samples), random_state=42))
96
+ filtered_df = pd.concat(sampled_dfs, ignore_index=True)
97
+ if len(filtered_df) > max_points:
98
+ filtered_df = filtered_df.sample(n=max_points, random_state=42).reset_index(drop=True)
99
+ else:
100
+ filtered_df = filtered_df.reset_index(drop=True)
101
+ else:
102
+ filtered_df = filtered_df.sample(n=max_points, random_state=42).reset_index(drop=True)
103
+
104
+ # Determine which embeddings to use
105
+ if use_graph_embeddings and deps.combined_embeddings is not None:
106
+ current_embeddings = deps.combined_embeddings
107
+ current_reduced = deps.reduced_embeddings_graph
108
+ embedding_type = "graph-aware"
109
+ else:
110
+ if deps.embeddings is None:
111
+ raise EmbeddingsNotReadyError()
112
+ current_embeddings = deps.embeddings
113
+ current_reduced = deps.reduced_embeddings
114
+ embedding_type = "text-only"
115
+
116
+ # Handle reduced embeddings loading/generation
117
+ reducer = deps.reducer
118
+ if current_reduced is None or (reducer and reducer.method != projection_method.lower()):
119
+ backend_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
120
+ root_dir = os.path.dirname(backend_dir)
121
+ cache_dir = os.path.join(root_dir, "cache")
122
+ cache_suffix = "_graph" if use_graph_embeddings and deps.combined_embeddings is not None else ""
123
+ reduced_cache = os.path.join(cache_dir, f"reduced_{projection_method.lower()}_3d{cache_suffix}.pkl")
124
+ reducer_cache = os.path.join(cache_dir, f"reducer_{projection_method.lower()}_3d{cache_suffix}.pkl")
125
+
126
+ if os.path.exists(reduced_cache) and os.path.exists(reducer_cache):
127
+ try:
128
+ with open(reduced_cache, 'rb') as f:
129
+ current_reduced = pickle.load(f)
130
+ if reducer is None or reducer.method != projection_method.lower():
131
+ reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
132
+ reducer.load_reducer(reducer_cache)
133
+ except (IOError, pickle.UnpicklingError, EOFError) as e:
134
+ logger.warning(f"Failed to load cached reduced embeddings: {e}")
135
+ current_reduced = None
136
+
137
+ if current_reduced is None:
138
+ if reducer is None or reducer.method != projection_method.lower():
139
+ reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
140
+ if projection_method.lower() == "umap":
141
+ reducer.reducer = UMAP(
142
+ n_components=3,
143
+ n_neighbors=30,
144
+ min_dist=0.3,
145
+ metric='cosine',
146
+ random_state=42,
147
+ n_jobs=-1,
148
+ low_memory=True,
149
+ spread=1.5
150
+ )
151
+ current_reduced = reducer.fit_transform(current_embeddings)
152
+ with open(reduced_cache, 'wb') as f:
153
+ pickle.dump(current_reduced, f)
154
+ reducer.save_reducer(reducer_cache)
155
+
156
+ # Update global variable
157
+ if use_graph_embeddings and deps.combined_embeddings is not None:
158
+ deps.reduced_embeddings_graph = current_reduced
159
+ else:
160
+ deps.reduced_embeddings = current_reduced
161
+
162
+ # Get indices for filtered data
163
+ filtered_model_ids = filtered_df['model_id'].astype(str).values
164
+
165
+ if df.index.name == 'model_id' or 'model_id' in df.index.names:
166
+ filtered_indices = []
167
+ for model_id in filtered_model_ids:
168
+ try:
169
+ pos = df.index.get_loc(model_id)
170
+ if isinstance(pos, (int, np.integer)):
171
+ filtered_indices.append(int(pos))
172
+ elif isinstance(pos, (slice, np.ndarray)):
173
+ if isinstance(pos, slice):
174
+ filtered_indices.append(int(pos.start))
175
+ else:
176
+ filtered_indices.append(int(pos[0]))
177
+ except (KeyError, TypeError):
178
+ continue
179
+ filtered_indices = np.array(filtered_indices, dtype=np.int32)
180
+ else:
181
+ df_model_ids = df['model_id'].astype(str).values
182
+ model_id_to_pos = {mid: pos for pos, mid in enumerate(df_model_ids)}
183
+ filtered_indices = np.array([
184
+ model_id_to_pos[mid] for mid in filtered_model_ids
185
+ if mid in model_id_to_pos
186
+ ], dtype=np.int32)
187
+
188
+ if len(filtered_indices) == 0:
189
+ return {
190
+ "models": [],
191
+ "embedding_type": embedding_type,
192
+ "filtered_count": filtered_count,
193
+ "returned_count": 0
194
+ }
195
+
196
+ filtered_reduced = current_reduced[filtered_indices]
197
+ family_depths = calculate_family_depths(df)
198
+
199
+ global cluster_labels
200
+ clustering_embeddings = current_reduced
201
+ if cluster_labels is None or len(cluster_labels) != len(clustering_embeddings):
202
+ cluster_labels = compute_clusters(clustering_embeddings, n_clusters=min(50, len(clustering_embeddings) // 100))
203
+
204
+ filtered_clusters = cluster_labels[filtered_indices]
205
+
206
+ model_ids = filtered_df['model_id'].astype(str).values
207
+ library_names = filtered_df.get('library_name', pd.Series([None] * len(filtered_df))).values
208
+ pipeline_tags = filtered_df.get('pipeline_tag', pd.Series([None] * len(filtered_df))).values
209
+ downloads_arr = filtered_df.get('downloads', pd.Series([0] * len(filtered_df))).fillna(0).astype(int).values
210
+ likes_arr = filtered_df.get('likes', pd.Series([0] * len(filtered_df))).fillna(0).astype(int).values
211
+ trending_scores = filtered_df.get('trendingScore', pd.Series([None] * len(filtered_df))).values
212
+ tags_arr = filtered_df.get('tags', pd.Series([None] * len(filtered_df))).values
213
+ parent_models = filtered_df.get('parent_model', pd.Series([None] * len(filtered_df))).values
214
+ licenses_arr = filtered_df.get('licenses', pd.Series([None] * len(filtered_df))).values
215
+ created_at_arr = filtered_df.get('createdAt', pd.Series([None] * len(filtered_df))).values
216
+
217
+ x_coords = filtered_reduced[:, 0].astype(float)
218
+ y_coords = filtered_reduced[:, 1].astype(float)
219
+ z_coords = filtered_reduced[:, 2].astype(float) if filtered_reduced.shape[1] > 2 else np.zeros(len(filtered_reduced), dtype=float)
220
+ models = [
221
+ ModelPoint(
222
+ model_id=model_ids[idx],
223
+ x=float(x_coords[idx]),
224
+ y=float(y_coords[idx]),
225
+ z=float(z_coords[idx]),
226
+ library_name=library_names[idx] if pd.notna(library_names[idx]) else None,
227
+ pipeline_tag=pipeline_tags[idx] if pd.notna(pipeline_tags[idx]) else None,
228
+ downloads=int(downloads_arr[idx]),
229
+ likes=int(likes_arr[idx]),
230
+ trending_score=float(trending_scores[idx]) if idx < len(trending_scores) and pd.notna(trending_scores[idx]) else None,
231
+ tags=tags_arr[idx] if idx < len(tags_arr) and pd.notna(tags_arr[idx]) else None,
232
+ parent_model=parent_models[idx] if idx < len(parent_models) and pd.notna(parent_models[idx]) else None,
233
+ licenses=licenses_arr[idx] if idx < len(licenses_arr) and pd.notna(licenses_arr[idx]) else None,
234
+ family_depth=family_depths.get(model_ids[idx], None),
235
+ cluster_id=int(filtered_clusters[idx]) if idx < len(filtered_clusters) else None,
236
+ created_at=str(created_at_arr[idx]) if idx < len(created_at_arr) and pd.notna(created_at_arr[idx]) else None
237
+ )
238
+ for idx in range(len(filtered_df))
239
+ ]
240
+
241
+ return {
242
+ "models": models,
243
+ "embedding_type": embedding_type,
244
+ "filtered_count": filtered_count,
245
+ "returned_count": len(models)
246
+ }
247
+
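The positional lookup above is the load-bearing step in this route: rows sampled out of `filtered_df` must be mapped back to row positions in the full `df` before they can index `current_reduced`. A minimal, self-contained illustration of that mapping (toy data, not the real dataset):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"model_id": ["a", "b", "c", "d"]})
filtered = df.sample(n=2, random_state=42)

# Same dict-based mapping the route uses when model_id is a plain column
model_id_to_pos = {mid: pos for pos, mid in enumerate(df["model_id"].astype(str))}
filtered_indices = np.array(
    [model_id_to_pos[mid] for mid in filtered["model_id"].astype(str) if mid in model_id_to_pos],
    dtype=np.int32,
)

reduced = np.arange(12).reshape(4, 3)  # stand-in for the 3D embedding matrix
print(reduced[filtered_indices])       # rows aligned with the sampled models
```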
backend/api/routes/stats.py ADDED
@@ -0,0 +1,37 @@
1
+ """
2
+ API routes for statistics endpoints.
3
+ """
4
+ from fastapi import APIRouter
5
+ from core.exceptions import DataNotLoadedError
6
+ import api.dependencies as deps
7
+
8
+ router = APIRouter(prefix="/api", tags=["stats"])
9
+
10
+
11
+ @router.get("/stats")
12
+ async def get_stats():
13
+ """Get dataset statistics."""
14
+ if deps.df is None:
15
+ raise DataNotLoadedError()
16
+
17
+ df = deps.df
18
+ total_models = len(df.index) if hasattr(df, 'index') else len(df)
19
+
20
+ # Get unique licenses with counts
21
+ licenses = {}
22
+ if 'license' in df.columns:
23
+ import pandas as pd
24
+ license_counts = df['license'].value_counts().to_dict()
25
+ licenses = {str(k): int(v) for k, v in license_counts.items() if pd.notna(k) and str(k) != 'nan'}
26
+
27
+ return {
28
+ "total_models": total_models,
29
+ "unique_libraries": int(df['library_name'].nunique()) if 'library_name' in df.columns else 0,
30
+ "unique_pipelines": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,
31
+ "unique_task_types": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,
32
+ "unique_licenses": len(licenses),
33
+ "licenses": licenses,
34
+ "avg_downloads": float(df['downloads'].mean()) if 'downloads' in df.columns else 0,
35
+ "avg_likes": float(df['likes'].mean()) if 'likes' in df.columns else 0
36
+ }
37
+
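The license breakdown above reduces to a `value_counts` pass with NaN keys dropped. A standalone sketch of the same aggregation on a toy frame:

```python
import pandas as pd

df = pd.DataFrame({
    "model_id": ["a/m1", "b/m2", "c/m3", "d/m4"],
    "license": ["apache-2.0", "mit", "apache-2.0", None],
})

# Mirrors the /api/stats logic: count values, keep only non-NaN keys
license_counts = df["license"].value_counts().to_dict()
licenses = {str(k): int(v) for k, v in license_counts.items() if pd.notna(k)}
print(licenses)  # {'apache-2.0': 2, 'mit': 1}
```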
backend/config/requirements.txt CHANGED
@@ -11,5 +11,6 @@ huggingface-hub>=0.17.0
11
  schedule>=1.2.0
12
  tqdm>=4.66.0
13
  networkx>=3.0
 
14
  httpx>=0.24.0
15
 
 
11
  schedule>=1.2.0
12
  tqdm>=4.66.0
13
  networkx>=3.0
14
+ node2vec>=0.4.6
15
  httpx>=0.24.0
16
 
backend/core/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ """Core configuration and utilities."""
2
+
backend/core/config.py ADDED
@@ -0,0 +1,23 @@
1
+ """Configuration management."""
2
+ import os
3
+ from typing import Optional
4
+
5
+ class Settings:
6
+ """Application settings."""
7
+ FRONTEND_URL: str = os.getenv("FRONTEND_URL", "http://localhost:3000")
8
+ ALLOW_ALL_ORIGINS: bool = os.getenv("ALLOW_ALL_ORIGINS", "True").lower() in ("true", "1", "yes")
9
+ SAMPLE_SIZE: Optional[int] = None
10
+ USE_GRAPH_EMBEDDINGS: bool = os.getenv("USE_GRAPH_EMBEDDINGS", "false").lower() == "true"
11
+ PORT: int = int(os.getenv("PORT", 8000))
12
+
13
+ @classmethod
14
+ def get_sample_size(cls) -> Optional[int]:
15
+ """Get sample size from environment."""
16
+ sample_size_env = os.getenv("SAMPLE_SIZE")
17
+ if sample_size_env:
18
+ sample_size_val = int(sample_size_env)
19
+ return sample_size_val if sample_size_val > 0 else None
20
+ return None
21
+
22
+ settings = Settings()
23
+
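Since the `Settings` attributes are read from the environment at import time, ordering matters when overriding them. A quick sketch (hypothetical values; assumes it is run from the `backend` directory so `core.config` resolves):

```python
import os

# Set before importing core.config -- the class attributes are evaluated on import
os.environ["SAMPLE_SIZE"] = "5000"
os.environ["USE_GRAPH_EMBEDDINGS"] = "true"

from core.config import Settings, settings

print(Settings.get_sample_size())     # 5000
print(settings.USE_GRAPH_EMBEDDINGS)  # True
```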
backend/core/exceptions.py ADDED
@@ -0,0 +1,18 @@
1
+ """Custom exceptions."""
2
+ from fastapi import HTTPException
3
+
4
+ class ModelNotFoundError(HTTPException):
5
+ """Model not found exception."""
6
+ def __init__(self, model_id: str):
7
+ super().__init__(status_code=404, detail=f"Model not found: {model_id}")
8
+
9
+ class DataNotLoadedError(HTTPException):
10
+ """Data not loaded exception."""
11
+ def __init__(self):
12
+ super().__init__(status_code=503, detail="Data not loaded")
13
+
14
+ class EmbeddingsNotReadyError(HTTPException):
15
+ """Embeddings not ready exception."""
16
+ def __init__(self):
17
+ super().__init__(status_code=503, detail="Embeddings not ready")
18
+
backend/models/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ """Data models and schemas."""
2
+
backend/models/schemas.py ADDED
@@ -0,0 +1,22 @@
1
+ """Pydantic models for API."""
2
+ from pydantic import BaseModel
3
+ from typing import Optional
4
+
5
+ class ModelPoint(BaseModel):
6
+ """Model point in 3D space."""
7
+ model_id: str
8
+ x: float
9
+ y: float
10
+ z: float
11
+ library_name: Optional[str]
12
+ pipeline_tag: Optional[str]
13
+ downloads: int
14
+ likes: int
15
+ trending_score: Optional[float]
16
+ tags: Optional[str]
17
+ parent_model: Optional[str] = None
18
+ licenses: Optional[str] = None
19
+ family_depth: Optional[int] = None
20
+ cluster_id: Optional[int] = None
21
+ created_at: Optional[str] = None # ISO format date string
22
+
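For reference, a `ModelPoint` serializes directly to the JSON shape the frontend consumes. The field values below are made up; `.model_dump()` assumes pydantic v2 (use `.dict()` on v1):

```python
from models.schemas import ModelPoint

point = ModelPoint(
    model_id="org/example-model",  # hypothetical ID
    x=0.12, y=-1.4, z=3.3,
    library_name="transformers",
    pipeline_tag="text-generation",
    downloads=1200,
    likes=15,
    trending_score=None,
    tags="llm text-generation",
)
print(point.model_dump())
```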
backend/scripts/export_binary.py ADDED
@@ -0,0 +1,263 @@
1
+ """
2
+ Export minimal dataset to binary format for fast client-side loading.
3
+ This creates a compact binary representation optimized for WebGL rendering.
4
+ """
5
+ import struct
6
+ import json
7
+ import numpy as np
8
+ import pandas as pd
9
+ from pathlib import Path
10
+ import sys
11
+ import os
12
+
13
+ # Add parent directory to path
14
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from utils.data_loader import ModelDataLoader
17
+ from utils.dimensionality_reduction import DimensionReducer
18
+ from utils.embeddings import ModelEmbedder
19
+
20
+
21
+ def calculate_family_depths(df: pd.DataFrame) -> dict:
22
+ """Calculate depth of each model in its family tree."""
23
+ depths = {}
24
+
25
+ def get_depth(model_id: str, visited: set = None) -> int:
26
+ if visited is None:
27
+ visited = set()
28
+ if model_id in visited:
29
+ return 0 # Cycle detected
30
+ visited.add(model_id)
31
+
32
+ if model_id in depths:
33
+ return depths[model_id]
34
+
35
+ parent_col = df.get('parent_model', pd.Series([None] * len(df), index=df.index))
36
+ model_row = df[df['model_id'] == model_id]
37
+
38
+ if model_row.empty:
39
+ depths[model_id] = 0
40
+ return 0
41
+
42
+ parent = model_row.iloc[0].get('parent_model')
43
+ if pd.isna(parent) or parent == '' or str(parent) == 'nan':
44
+ depths[model_id] = 0
45
+ return 0
46
+
47
+ parent_depth = get_depth(str(parent), visited.copy())
48
+ depth = parent_depth + 1
49
+ depths[model_id] = depth
50
+ return depth
51
+
52
+ for model_id in df['model_id'].unique():
53
+ if model_id not in depths:
54
+ get_depth(str(model_id))
55
+
56
+ return depths
57
+
58
+
59
+ def export_binary_dataset(df: pd.DataFrame, reduced_embeddings: np.ndarray, output_dir: Path):
60
+ """
61
+ Export minimal dataset to binary format for fast client-side loading.
62
+
63
+ Binary format:
64
+ - Header (64 bytes): magic, version, counts, lookup table sizes
65
+ - Domain lookup table (32 bytes per domain)
66
+ - License lookup table (32 bytes per license)
67
+ - Family lookup table (32 bytes per family)
68
+ - Model records (17 bytes each, little-endian): x, y, z, domain_id, license_id, family_id, flags
69
+ """
70
+ output_dir.mkdir(parents=True, exist_ok=True)
71
+
72
+ print(f"Exporting {len(df)} models to binary format...")
73
+
74
+ # Ensure we have coordinates
75
+ if 'x' not in df.columns or 'y' not in df.columns:
76
+ if reduced_embeddings is None or len(reduced_embeddings) != len(df):
77
+ raise ValueError("Need reduced embeddings to generate coordinates")
78
+
79
+ df['x'] = reduced_embeddings[:, 0] if reduced_embeddings.shape[1] > 0 else 0.0
80
+ df['y'] = reduced_embeddings[:, 1] if reduced_embeddings.shape[1] > 1 else 0.0
81
+ df['z'] = reduced_embeddings[:, 2] if reduced_embeddings.shape[1] > 2 else 0.0
82
+
83
+ # Create lookup tables
84
+ # Domain = library_name
85
+ domains = sorted(df['library_name'].dropna().astype(str).unique())
86
+ domains = [d for d in domains if d and d != 'nan'][:255] # Limit to 255
87
+
88
+ # License
89
+ licenses = sorted(df['license'].dropna().astype(str).unique())
90
+ licenses = [l for l in licenses if l and l != 'nan'][:255] # Limit to 255
91
+
92
+ # Family ID mapping (use parent_model to create family groups)
93
+ family_depths = calculate_family_depths(df)
94
+
95
+ # Create family mapping: group models by root parent
96
+ def get_root_parent(model_id: str) -> str:
97
+ visited = set()
98
+ current = str(model_id)
99
+ while current not in visited:
100
+ visited.add(current)
101
+ model_row = df[df['model_id'] == current]
102
+ if model_row.empty:
103
+ return current
104
+ parent = model_row.iloc[0].get('parent_model')
105
+ if pd.isna(parent) or parent == '' or str(parent) == 'nan':
106
+ return current
107
+ current = str(parent)
108
+ return current
109
+
110
+ root_parents = {}
111
+ family_counter = 0
112
+ for model_id in df['model_id'].unique():
113
+ root = get_root_parent(str(model_id))
114
+ if root not in root_parents:
115
+ root_parents[root] = family_counter
116
+ family_counter += 1
117
+
118
+ # Map each model to its family
119
+ model_to_family = {}
120
+ for model_id in df['model_id'].unique():
121
+ root = get_root_parent(str(model_id))
122
+ model_to_family[str(model_id)] = root_parents.get(root, 65535)
123
+
124
+ # Limit families to 65535 (u16 max)
125
+ if len(root_parents) > 65535:
126
+ # Use hash-based family IDs
127
+ import hashlib
128
+ for model_id in df['model_id'].unique():
129
+ root = get_root_parent(str(model_id))
130
+ family_hash = int(hashlib.md5(root.encode()).hexdigest()[:4], 16) % 65535
131
+ model_to_family[str(model_id)] = family_hash
132
+
133
+ # Prepare model records
134
+ records = []
135
+ model_ids = []
136
+
137
+ for idx, row in df.iterrows():
138
+ model_id = str(row['model_id'])
139
+ model_ids.append(model_id)
140
+
141
+ # Get coordinates
142
+ x = float(row.get('x', 0.0))
143
+ y = float(row.get('y', 0.0))
144
+ z = float(row.get('z', 0.0))
145
+
146
+ # Encode domain (library_name)
147
+ domain_str = str(row.get('library_name', ''))
148
+ domain_id = domains.index(domain_str) if domain_str in domains else 255
149
+
150
+ # Encode license
151
+ license_str = str(row.get('license', ''))
152
+ license_id = licenses.index(license_str) if license_str in licenses else 255
153
+
154
+ # Encode family
155
+ family_id = model_to_family.get(model_id, 65535)
156
+
157
+ # Encode flags
158
+ flags = 0
159
+ parent = row.get('parent_model')
160
+ if pd.isna(parent) or parent == '' or str(parent) == 'nan':
161
+ flags |= 0x01 # is_base_model
162
+
163
+ # Check if has children (simple check - could be improved)
164
+ children = df[df['parent_model'] == model_id]
165
+ if len(children) > 0:
166
+ flags |= 0x04 # has_children
167
+ elif not pd.isna(parent) and parent != '' and str(parent) != 'nan':
168
+ flags |= 0x02 # has_parent
169
+
170
+ # Pack record (little-endian, 17 bytes): f32 x, f32 y, f32 z, u8 domain, u8 license, u16 family, u8 flags
171
+ records.append(struct.pack('<fffBBHB', x, y, z, domain_id, license_id, family_id, flags))
172
+
173
+ num_models = len(records)
174
+
175
+ # Write binary file
176
+ with open(output_dir / 'embeddings.bin', 'wb') as f:
177
+ # Header (64 bytes)
178
+ header = struct.pack('<5sBIIIIBBH38s',  # little-endian; four counts, 64 bytes total
179
+ b'HFVIZ', # magic (5 bytes)
180
+ 1, # version (1 byte)
181
+ num_models, # num_models (4 bytes)
182
+ len(domains), # num_domains (4 bytes)
183
+ len(licenses), # num_licenses (4 bytes)
184
+ len(set(model_to_family.values())), # num_families (4 bytes)
185
+ 0, # reserved (1 byte)
186
+ 0, # reserved (1 byte)
187
+ 0, # reserved (2 bytes)
188
+ b'\x00' * 38 # padding (38 bytes; header totals 64)
189
+ )
190
+ f.write(header)
191
+
192
+ # Domain lookup table (32 bytes per domain, null-terminated)
193
+ for domain in domains:
194
+ domain_bytes = domain.encode('utf-8')[:31]
195
+ f.write(domain_bytes.ljust(32, b'\x00'))
196
+
197
+ # License lookup table (32 bytes per license)
198
+ for license in licenses:
199
+ license_bytes = license.encode('utf-8')[:31]
200
+ f.write(license_bytes.ljust(32, b'\x00'))
201
+
202
+ # Model records
203
+ f.write(b''.join(records))
204
+
205
+ # Write model IDs JSON (separate file for string table)
206
+ with open(output_dir / 'model_ids.json', 'w') as f:
207
+ json.dump(model_ids, f)
208
+
209
+ # Write metadata JSON
210
+ metadata = {
211
+ 'domains': domains,
212
+ 'licenses': licenses,
213
+ 'num_models': num_models,
214
+ 'num_families': len(set(model_to_family.values())),
215
+ 'version': 1
216
+ }
217
+ with open(output_dir / 'metadata.json', 'w') as f:
218
+ json.dump(metadata, f, indent=2)
219
+
220
+ binary_size = (output_dir / 'embeddings.bin').stat().st_size
221
+ json_size = (output_dir / 'model_ids.json').stat().st_size
222
+
223
+ print(f"βœ“ Exported {num_models} models")
224
+ print(f"βœ“ Binary size: {binary_size / 1024 / 1024:.2f} MB")
225
+ print(f"βœ“ Model IDs JSON: {json_size / 1024 / 1024:.2f} MB")
226
+ print(f"βœ“ Total: {(binary_size + json_size) / 1024 / 1024:.2f} MB")
227
+ print(f"βœ“ Domains: {len(domains)}")
228
+ print(f"βœ“ Licenses: {len(licenses)}")
229
+ print(f"βœ“ Families: {len(set(model_to_family.values()))}")
230
+
231
+
232
+ if __name__ == '__main__':
233
+ import argparse
234
+
235
+ parser = argparse.ArgumentParser(description='Export dataset to binary format')
236
+ parser.add_argument('--output', type=str, default='backend/cache/binary', help='Output directory')
237
+ parser.add_argument('--sample-size', type=int, default=None, help='Sample size (for testing)')
238
+ args = parser.parse_args()
239
+
240
+ output_dir = Path(args.output)
241
+
242
+ # Load data
243
+ print("Loading dataset...")
244
+ data_loader = ModelDataLoader()
245
+ df = data_loader.load_data(sample_size=args.sample_size)
246
+ df = data_loader.preprocess_for_embedding(df)
247
+
248
+ # Generate embeddings and reduce dimensions if needed
249
+ if 'x' not in df.columns or 'y' not in df.columns:
250
+ print("Generating embeddings...")
251
+ embedder = ModelEmbedder()
252
+ embeddings = embedder.generate_embeddings(df['combined_text'].tolist())
253
+
254
+ print("Reducing dimensions...")
255
+ reducer = DimensionReducer()
256
+ reduced_embeddings = reducer.reduce_dimensions(embeddings, n_components=3, method='umap')
257
+ else:
258
+ reduced_embeddings = None
259
+
260
+ # Export
261
+ export_binary_dataset(df, reduced_embeddings, output_dir)
262
+ print("Done!")
263
+
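Reading the file back is the mirror image of the writer. A minimal reader sketch, assuming the corrected layout used above (64-byte little-endian header, 32-byte string-table entries, 17-byte records):

```python
import struct
from pathlib import Path

def read_binary_dataset(path: Path):
    """Parse an embeddings.bin written by export_binary_dataset."""
    with open(path, "rb") as f:
        # Header: magic, version, four counts, reserved fields, padding
        (magic, version, n_models, n_domains, n_licenses,
         n_families, _r1, _r2, _r3, _pad) = struct.unpack("<5sBIIIIBBH38s", f.read(64))
        assert magic == b"HFVIZ" and version == 1

        # Lookup tables: 32 bytes per entry, null-padded UTF-8
        domains = [f.read(32).rstrip(b"\x00").decode("utf-8") for _ in range(n_domains)]
        licenses = [f.read(32).rstrip(b"\x00").decode("utf-8") for _ in range(n_licenses)]

        # Records: f32 x/y/z, u8 domain, u8 license, u16 family, u8 flags
        records = [struct.unpack("<fffBBHB", f.read(17)) for _ in range(n_models)]

    return domains, licenses, records
```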
backend/services/model_tracker.py CHANGED
@@ -5,11 +5,16 @@ Tracks the number of models over time and provides historical data.
5
  import os
6
  import json
7
  import sqlite3
 
 
8
  from datetime import datetime, timedelta
9
  from typing import Dict, List, Optional, Tuple
10
  from huggingface_hub import HfApi
11
  import pandas as pd
12
  from pathlib import Path
 
 
 
13
 
14
 
15
  class ModelCountTracker:
@@ -34,7 +39,6 @@ class ModelCountTracker:
34
  conn = sqlite3.connect(self.db_path)
35
  cursor = conn.cursor()
36
 
37
- # Create table for model counts
38
  cursor.execute("""
39
  CREATE TABLE IF NOT EXISTS model_counts (
40
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -47,7 +51,6 @@ class ModelCountTracker:
47
  )
48
  """)
49
 
50
- # Create index for faster queries
51
  cursor.execute("""
52
  CREATE INDEX IF NOT EXISTS idx_timestamp
53
  ON model_counts(timestamp)
@@ -56,27 +59,90 @@ class ModelCountTracker:
56
  conn.commit()
57
  conn.close()
58
 
59
- def get_current_model_count(self) -> Dict:
60
  """
61
- Fetch current model count from Hugging Face Hub API.
62
- Uses efficient pagination to get accurate count.
 
63
 
64
  Returns:
65
- Dictionary with total count and breakdowns
66
  """
67
  try:
68
- # Use pagination to efficiently count models
69
- # The API returns paginated results, so we iterate through pages
70
- # For large counts, we sample and extrapolate for speed
 
71
 
72
  total_count = 0
73
  library_counts = {}
74
  pipeline_counts = {}
75
- page_size = 1000 # Process in batches
76
- max_pages = 100 # Limit to prevent timeout (can adjust)
77
- sample_size = 10000 # Sample size for breakdowns
78
 
79
- # Count total models efficiently
80
  models_iter = self.api.list_models(full=False)
81
  sampled_models = []
82
 
@@ -87,25 +153,18 @@ class ModelCountTracker:
87
  if i < sample_size:
88
  sampled_models.append(model)
89
 
90
- # Safety limit to prevent infinite loops
91
  if i >= max_pages * page_size:
92
- # If we hit the limit, estimate total from sample
93
- # This is a rough estimate - for exact count, increase max_pages
94
  break
95
 
96
- # Calculate breakdowns from sample (extrapolate if needed)
97
  for model in sampled_models:
98
- # Count by library
99
  if hasattr(model, 'library_name') and model.library_name:
100
  lib = model.library_name
101
  library_counts[lib] = library_counts.get(lib, 0) + 1
102
 
103
- # Count by pipeline
104
  if hasattr(model, 'pipeline_tag') and model.pipeline_tag:
105
  pipeline = model.pipeline_tag
106
  pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
107
 
108
- # If we sampled, scale up the breakdowns proportionally
109
  if len(sampled_models) < total_count and len(sampled_models) > 0:
110
  scale_factor = total_count / len(sampled_models)
111
  library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
@@ -118,7 +177,7 @@ class ModelCountTracker:
118
  "timestamp": datetime.utcnow().isoformat()
119
  }
120
  except Exception as e:
121
- print(f"Error fetching model count: {e}")
122
  return {
123
  "total_models": 0,
124
  "models_by_library": {},
@@ -162,7 +221,7 @@ class ModelCountTracker:
162
  conn.close()
163
  return True
164
  except Exception as e:
165
- print(f"Error recording count: {e}")
166
  return False
167
 
168
  def get_historical_counts(
@@ -211,7 +270,7 @@ class ModelCountTracker:
211
  conn.close()
212
  return results
213
  except Exception as e:
214
- print(f"Error fetching historical counts: {e}")
215
  return []
216
 
217
  def get_latest_count(self) -> Optional[Dict]:
@@ -239,7 +298,7 @@ class ModelCountTracker:
239
  }
240
  return None
241
  except Exception as e:
242
- print(f"Error fetching latest count: {e}")
243
  return None
244
 
245
  def get_growth_stats(self, days: int = 7) -> Dict:
 
5
  import os
6
  import json
7
  import sqlite3
8
+ import logging
9
+ import re
10
  from datetime import datetime, timedelta
11
  from typing import Dict, List, Optional, Tuple
12
  from huggingface_hub import HfApi
13
  import pandas as pd
14
  from pathlib import Path
15
+ import httpx
16
+
17
+ logger = logging.getLogger(__name__)
18
 
19
 
20
  class ModelCountTracker:
 
39
  conn = sqlite3.connect(self.db_path)
40
  cursor = conn.cursor()
41
 
 
42
  cursor.execute("""
43
  CREATE TABLE IF NOT EXISTS model_counts (
44
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
51
  )
52
  """)
53
 
 
54
  cursor.execute("""
55
  CREATE INDEX IF NOT EXISTS idx_timestamp
56
  ON model_counts(timestamp)
 
59
  conn.commit()
60
  conn.close()
61
 
62
+ def get_count_from_models_page(self) -> Optional[Dict]:
63
  """
64
+ Get model count by scraping the Hugging Face models page.
65
+ Extracts the count from window.__hf_deferred["numTotalItems"] in the page script,
66
+ falling back to the div with class "font-normal text-gray-400" on https://huggingface.co/models.
67
 
68
  Returns:
69
+ Dictionary with total_models count, or None if extraction fails
70
  """
71
  try:
72
+ url = "https://huggingface.co/models"
73
+ response = httpx.get(url, timeout=10.0, follow_redirects=True)
74
+ response.raise_for_status()
75
+
76
+ html_content = response.text
77
+
78
+ deferred_pattern = r'window\.__hf_deferred\["numTotalItems"\]\s*=\s*(\d+);'
79
+ deferred_matches = re.findall(deferred_pattern, html_content)
80
+
81
+ if deferred_matches:
82
+ total_models = int(deferred_matches[0])
83
+ logger.info(f"Extracted model count from window.__hf_deferred: {total_models}")
84
+
85
+ return {
86
+ "total_models": total_models,
87
+ "timestamp": datetime.utcnow().isoformat(),
88
+ "source": "hf_models_page",
89
+ "models_by_library": {},
90
+ "models_by_pipeline": {},
91
+ "models_by_author": {}
92
+ }
93
 
94
+ pattern = r'<div[^>]*class="[^"]*font-normal[^"]*text-gray-400[^"]*"[^>]*>([\d,]+)</div>'
95
+ matches = re.findall(pattern, html_content)
96
+
97
+ if matches:
98
+ count_str = matches[0].replace(',', '')
99
+ total_models = int(count_str)
100
+
101
+ logger.info(f"Extracted model count from div: {total_models}")
102
+
103
+ return {
104
+ "total_models": total_models,
105
+ "timestamp": datetime.utcnow().isoformat(),
106
+ "source": "hf_models_page",
107
+ "models_by_library": {},
108
+ "models_by_pipeline": {},
109
+ "models_by_author": {}
110
+ }
111
+
112
+ logger.warning("Could not find model count in HF models page HTML")
113
+ return None
114
+
115
+ except httpx.HTTPError as e:
116
+ logger.error(f"HTTP error fetching HF models page: {e}", exc_info=True)
117
+ return None
118
+ except Exception as e:
119
+ logger.error(f"Error extracting count from HF models page: {e}", exc_info=True)
120
+ return None
121
+
122
+ def get_current_model_count(self, use_models_page: bool = True) -> Dict:
123
+ """
124
+ Fetch current model count from Hugging Face Hub.
125
+ Uses multiple strategies: models page scraping (fastest), then API enumeration.
126
+
127
+ Args:
128
+ use_models_page: Try to get count from HF models page first (default: True)
129
+
130
+ Returns:
131
+ Dictionary with total count and breakdowns
132
+ """
133
+ if use_models_page:
134
+ page_count = self.get_count_from_models_page()
135
+ if page_count:
136
+ return page_count
137
+
138
+ try:
139
  total_count = 0
140
  library_counts = {}
141
  pipeline_counts = {}
142
+ page_size = 1000
143
+ max_pages = 100
144
+ sample_size = 10000
145
 
 
146
  models_iter = self.api.list_models(full=False)
147
  sampled_models = []
148
 
 
153
  if i < sample_size:
154
  sampled_models.append(model)
155
 
 
156
  if i >= max_pages * page_size:
 
 
157
  break
158
 
 
159
  for model in sampled_models:
 
160
  if hasattr(model, 'library_name') and model.library_name:
161
  lib = model.library_name
162
  library_counts[lib] = library_counts.get(lib, 0) + 1
163
 
 
164
  if hasattr(model, 'pipeline_tag') and model.pipeline_tag:
165
  pipeline = model.pipeline_tag
166
  pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
167
 
 
168
  if len(sampled_models) < total_count and len(sampled_models) > 0:
169
  scale_factor = total_count / len(sampled_models)
170
  library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
 
177
  "timestamp": datetime.utcnow().isoformat()
178
  }
179
  except Exception as e:
180
+ logger.error(f"Error fetching model count: {e}", exc_info=True)
181
  return {
182
  "total_models": 0,
183
  "models_by_library": {},
 
221
  conn.close()
222
  return True
223
  except Exception as e:
224
+ logger.error(f"Error recording count: {e}", exc_info=True)
225
  return False
226
 
227
  def get_historical_counts(
 
270
  conn.close()
271
  return results
272
  except Exception as e:
273
+ logger.error(f"Error fetching historical counts: {e}", exc_info=True)
274
  return []
275
 
276
  def get_latest_count(self) -> Optional[Dict]:
 
298
  }
299
  return None
300
  except Exception as e:
301
+ logger.error(f"Error fetching latest count: {e}", exc_info=True)
302
  return None
303
 
304
  def get_growth_stats(self, days: int = 7) -> Dict:
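The scraping path hinges on a single regex against the page source. A self-contained check against the kind of snippet the page embeds (the count is an illustrative value):

```python
import re

html = 'window.__hf_deferred["numTotalItems"] = 2249312;'  # illustrative page snippet
pattern = r'window\.__hf_deferred\["numTotalItems"\]\s*=\s*(\d+);'
match = re.search(pattern, html)
if match:
    print(int(match.group(1)))  # 2249312
```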
backend/services/model_tracker_improved.py CHANGED
@@ -11,12 +11,17 @@ Key improvements:
11
  import os
12
  import json
13
  import sqlite3
 
 
14
  from datetime import datetime, timedelta
15
  from typing import Dict, List, Optional, Tuple
16
  from huggingface_hub import HfApi
17
  import pandas as pd
18
  from pathlib import Path
19
  import time
 
 
 
20
 
21
 
22
  class ImprovedModelCountTracker:
@@ -78,72 +83,73 @@ class ImprovedModelCountTracker:
78
  elapsed = (datetime.utcnow() - self._cache_timestamp).total_seconds()
79
  return elapsed < self.cache_ttl
80
 
81
- def get_current_model_count(self, use_cache: bool = True, force_refresh: bool = False) -> Dict:
82
  """
83
- Fetch current model count from Hugging Face Hub API.
84
- Uses caching and efficient sampling strategies.
85
 
86
  Args:
87
  use_cache: Whether to use cached results if available
88
  force_refresh: Force refresh even if cache is valid
 
89
 
90
  Returns:
91
  Dictionary with total count and breakdowns
92
  """
93
- # Check cache first
94
  if use_cache and not force_refresh and self._is_cache_valid():
95
  return self._cache
96
 
97
  try:
98
- # Strategy 1: Try to get count efficiently using pagination
99
- # The HfApi.list_models() returns an iterator, so we can count efficiently
100
  total_count = 0
101
  library_counts = {}
102
  pipeline_counts = {}
103
  author_counts = {}
104
 
105
- # For breakdowns, we sample a subset for efficiency
106
- sample_size = 20000 # Sample 20K models for breakdowns
107
- max_count_for_full_breakdown = 50000 # If less than this, do full breakdown
108
 
109
  models_iter = self.api.list_models(full=False, sort="created", direction=-1)
110
  sampled_models = []
111
 
112
  start_time = time.time()
113
- timeout_seconds = 30 # Don't spend more than 30 seconds
114
 
115
  for i, model in enumerate(models_iter):
116
- # Check timeout
117
  if time.time() - start_time > timeout_seconds:
118
- # If we hit timeout, use sampling strategy
119
  break
120
 
121
  total_count += 1
122
 
123
- # Sample models for breakdowns
124
  if i < sample_size:
125
  sampled_models.append(model)
126
 
127
- # For smaller datasets, we can do full breakdown
128
  if total_count < max_count_for_full_breakdown:
129
- # Count by library
130
  if hasattr(model, 'library_name') and model.library_name:
131
  lib = model.library_name
132
  library_counts[lib] = library_counts.get(lib, 0) + 1
133
 
134
- # Count by pipeline
135
  if hasattr(model, 'pipeline_tag') and model.pipeline_tag:
136
  pipeline = model.pipeline_tag
137
  pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
138
 
139
- # Count by author (extract from model_id)
140
  if hasattr(model, 'id') and model.id:
141
  author = model.id.split('/')[0] if '/' in model.id else 'unknown'
142
  author_counts[author] = author_counts.get(author, 0) + 1
143
 
144
- # If we sampled, calculate breakdowns from sample and extrapolate
145
  if total_count > len(sampled_models) and len(sampled_models) > 0:
146
- # Calculate breakdowns from sample
147
  for model in sampled_models:
148
  if hasattr(model, 'library_name') and model.library_name:
149
  lib = model.library_name
@@ -157,7 +163,6 @@ class ImprovedModelCountTracker:
157
  author = model.id.split('/')[0] if '/' in model.id else 'unknown'
158
  author_counts[author] = author_counts.get(author, 0) + 1
159
 
160
- # Scale up breakdowns proportionally
161
  if len(sampled_models) > 0:
162
  scale_factor = total_count / len(sampled_models)
163
  library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
@@ -168,20 +173,19 @@ class ImprovedModelCountTracker:
168
  "total_models": total_count,
169
  "models_by_library": library_counts,
170
  "models_by_pipeline": pipeline_counts,
171
- "models_by_author": dict(sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20]), # Top 20 authors
172
  "timestamp": datetime.utcnow().isoformat(),
173
  "sampling_used": total_count > len(sampled_models) if sampled_models else False,
174
  "sample_size": len(sampled_models) if sampled_models else total_count
175
  }
176
 
177
- # Update cache
178
  self._cache = result
179
  self._cache_timestamp = datetime.utcnow()
180
 
181
  return result
182
 
183
  except Exception as e:
184
- print(f"Error fetching model count: {e}")
185
  return {
186
  "total_models": 0,
187
  "models_by_library": {},
@@ -191,6 +195,70 @@ class ImprovedModelCountTracker:
191
  "error": str(e)
192
  }
193
 
194
  def get_count_from_dataset_snapshot(self, dataset_name: str = "modelbiome/ai_ecosystem_withmodelcards") -> Optional[Dict]:
195
  """
196
  Alternative method: Get count from dataset snapshot (like ai-ecosystem repo does).
@@ -205,11 +273,9 @@ class ImprovedModelCountTracker:
205
  try:
206
  from datasets import load_dataset
207
 
208
- # Load just metadata to get count quickly
209
  dataset = load_dataset(dataset_name, split="train")
210
  total_count = len(dataset)
211
 
212
- # Sample for breakdowns
213
  sample_size = min(10000, total_count)
214
  sample = dataset.shuffle(seed=42).select(range(sample_size))
215
 
@@ -225,7 +291,6 @@ class ImprovedModelCountTracker:
225
  pipeline = item['pipeline_tag']
226
  pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
227
 
228
- # Scale up
229
  if sample_size < total_count:
230
  scale_factor = total_count / sample_size
231
  library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
@@ -239,7 +304,7 @@ class ImprovedModelCountTracker:
239
  "source": "dataset_snapshot"
240
  }
241
  except Exception as e:
242
- print(f"Error loading from dataset snapshot: {e}")
243
  return None
244
 
245
  def record_count(self, count_data: Optional[Dict] = None, source: str = "api") -> bool:
@@ -279,7 +344,7 @@ class ImprovedModelCountTracker:
279
  conn.close()
280
  return True
281
  except Exception as e:
282
- print(f"Error recording count: {e}")
283
  return False
284
 
285
  def get_historical_counts(
@@ -329,7 +394,7 @@ class ImprovedModelCountTracker:
329
  conn.close()
330
  return results
331
  except Exception as e:
332
- print(f"Error fetching historical counts: {e}")
333
  return []
334
 
335
  def get_latest_count(self) -> Optional[Dict]:
@@ -358,7 +423,7 @@ class ImprovedModelCountTracker:
358
  }
359
  return None
360
  except Exception as e:
361
- print(f"Error fetching latest count: {e}")
362
  return None
363
 
364
  def get_growth_stats(self, days: int = 7) -> Dict:
 
11
  import os
12
  import json
13
  import sqlite3
14
+ import logging
15
+ import re
16
  from datetime import datetime, timedelta
17
  from typing import Dict, List, Optional, Tuple
18
  from huggingface_hub import HfApi
19
  import pandas as pd
20
  from pathlib import Path
21
  import time
22
+ import httpx
23
+
24
+ logger = logging.getLogger(__name__)
25
 
26
 
27
  class ImprovedModelCountTracker:
 
83
  elapsed = (datetime.utcnow() - self._cache_timestamp).total_seconds()
84
  return elapsed < self.cache_ttl
85
 
86
+ def get_current_model_count(self, use_cache: bool = True, force_refresh: bool = False, use_models_page: bool = True) -> Dict:
87
  """
88
+ Fetch current model count from Hugging Face Hub.
89
+ Uses multiple strategies: models page scraping (fastest), API, or dataset snapshot.
90
 
91
  Args:
92
  use_cache: Whether to use cached results if available
93
  force_refresh: Force refresh even if cache is valid
94
+ use_models_page: Try to get count from HF models page first (default: True)
95
 
96
  Returns:
97
  Dictionary with total count and breakdowns
98
  """
 
99
  if use_cache and not force_refresh and self._is_cache_valid():
100
  return self._cache
101
 
102
+ if use_models_page:
103
+ page_count = self.get_count_from_models_page()
104
+ if page_count:
105
+ dataset_count = self.get_count_from_dataset_snapshot()
106
+ if dataset_count and dataset_count.get("models_by_library"):
107
+ page_count["models_by_library"] = dataset_count.get("models_by_library", {})
108
+ page_count["models_by_pipeline"] = dataset_count.get("models_by_pipeline", {})
109
+ page_count["models_by_author"] = dataset_count.get("models_by_author", {})
110
+
111
+ self._cache = page_count
112
+ self._cache_timestamp = datetime.utcnow()
113
+ return page_count
114
+
115
  try:
 
 
116
  total_count = 0
117
  library_counts = {}
118
  pipeline_counts = {}
119
  author_counts = {}
120
 
121
+ sample_size = 20000
122
+ max_count_for_full_breakdown = 50000
 
123
 
124
  models_iter = self.api.list_models(full=False, sort="created", direction=-1)
125
  sampled_models = []
126
 
127
  start_time = time.time()
128
+ timeout_seconds = 30
129
 
130
  for i, model in enumerate(models_iter):
 
131
  if time.time() - start_time > timeout_seconds:
 
132
  break
133
 
134
  total_count += 1
135
 
 
136
  if i < sample_size:
137
  sampled_models.append(model)
138
 
 
139
  if total_count < max_count_for_full_breakdown:
 
140
  if hasattr(model, 'library_name') and model.library_name:
141
  lib = model.library_name
142
  library_counts[lib] = library_counts.get(lib, 0) + 1
143
 
 
144
  if hasattr(model, 'pipeline_tag') and model.pipeline_tag:
145
  pipeline = model.pipeline_tag
146
  pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
147
 
 
148
  if hasattr(model, 'id') and model.id:
149
  author = model.id.split('/')[0] if '/' in model.id else 'unknown'
150
  author_counts[author] = author_counts.get(author, 0) + 1
151
 
 
152
  if total_count > len(sampled_models) and len(sampled_models) > 0:
 
153
  for model in sampled_models:
154
  if hasattr(model, 'library_name') and model.library_name:
155
  lib = model.library_name
 
163
  author = model.id.split('/')[0] if '/' in model.id else 'unknown'
164
  author_counts[author] = author_counts.get(author, 0) + 1
165
 
 
166
  if len(sampled_models) > 0:
167
  scale_factor = total_count / len(sampled_models)
168
  library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
 
173
  "total_models": total_count,
174
  "models_by_library": library_counts,
175
  "models_by_pipeline": pipeline_counts,
176
+ "models_by_author": dict(sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:20]),
177
  "timestamp": datetime.utcnow().isoformat(),
178
  "sampling_used": total_count > len(sampled_models) if sampled_models else False,
179
  "sample_size": len(sampled_models) if sampled_models else total_count
180
  }
181
 
 
182
  self._cache = result
183
  self._cache_timestamp = datetime.utcnow()
184
 
185
  return result
186
 
187
  except Exception as e:
188
+ logger.error(f"Error fetching model count: {e}", exc_info=True)
189
  return {
190
  "total_models": 0,
191
  "models_by_library": {},
 
195
  "error": str(e)
196
  }
197
 
198
+ def get_count_from_models_page(self) -> Optional[Dict]:
199
+ """
200
+ Get model count by scraping the Hugging Face models page.
201
+ Extracts count from the div with class "font-normal text-gray-400" on https://huggingface.co/models
202
+
203
+ Returns:
204
+ Dictionary with total_models count, or None if extraction fails
205
+ """
206
+ try:
207
+ url = "https://huggingface.co/models"
208
+ response = httpx.get(url, timeout=10.0, follow_redirects=True)
209
+ response.raise_for_status()
210
+
211
+ html_content = response.text
212
+
213
+ # Look for the pattern: <div class="font-normal text-gray-400">2,249,310</div>
214
+ # The number is in the format with commas
215
+ pattern = r'<div[^>]*class="[^"]*font-normal[^"]*text-gray-400[^"]*"[^>]*>([\d,]+)</div>'
216
+ matches = re.findall(pattern, html_content)
217
+
218
+ if matches:
219
+ # Take the first match and remove commas
220
+ count_str = matches[0].replace(',', '')
221
+ total_models = int(count_str)
222
+
223
+ logger.info(f"Extracted model count from HF models page: {total_models}")
224
+
225
+ return {
226
+ "total_models": total_models,
227
+ "timestamp": datetime.utcnow().isoformat(),
228
+ "source": "hf_models_page",
229
+ "models_by_library": {},
230
+ "models_by_pipeline": {},
231
+ "models_by_author": {}
232
+ }
233
+ else:
234
+ # Fallback: try to find the number in the window.__hf_deferred object
235
+ # The page has: window.__hf_deferred["numTotalItems"] = 2249312;
236
+ deferred_pattern = r'window\.__hf_deferred\["numTotalItems"\]\s*=\s*(\d+);'
237
+ deferred_matches = re.findall(deferred_pattern, html_content)
238
+
239
+ if deferred_matches:
240
+ total_models = int(deferred_matches[0])
241
+ logger.info(f"Extracted model count from window.__hf_deferred: {total_models}")
242
+
243
+ return {
244
+ "total_models": total_models,
245
+ "timestamp": datetime.utcnow().isoformat(),
246
+ "source": "hf_models_page_deferred",
247
+ "models_by_library": {},
248
+ "models_by_pipeline": {},
249
+ "models_by_author": {}
250
+ }
251
+
252
+ logger.warning("Could not find model count in HF models page HTML")
253
+ return None
254
+
255
+ except httpx.HTTPError as e:
256
+ logger.error(f"HTTP error fetching HF models page: {e}", exc_info=True)
257
+ return None
258
+ except Exception as e:
259
+ logger.error(f"Error extracting count from HF models page: {e}", exc_info=True)
260
+ return None
261
+
262
  def get_count_from_dataset_snapshot(self, dataset_name: str = "modelbiome/ai_ecosystem_withmodelcards") -> Optional[Dict]:
263
  """
264
  Alternative method: Get count from dataset snapshot (like ai-ecosystem repo does).
 
273
  try:
274
  from datasets import load_dataset
275
 
 
276
  dataset = load_dataset(dataset_name, split="train")
277
  total_count = len(dataset)
278
 
 
279
  sample_size = min(10000, total_count)
280
  sample = dataset.shuffle(seed=42).select(range(sample_size))
281
 
 
291
  pipeline = item['pipeline_tag']
292
  pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1
293
 
 
294
  if sample_size < total_count:
295
  scale_factor = total_count / sample_size
296
  library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
 
304
  "source": "dataset_snapshot"
305
  }
306
  except Exception as e:
307
+ logger.error(f"Error loading from dataset snapshot: {e}", exc_info=True)
308
  return None
309
 
310
  def record_count(self, count_data: Optional[Dict] = None, source: str = "api") -> bool:
 
344
  conn.close()
345
  return True
346
  except Exception as e:
347
+ logger.error(f"Error recording count: {e}", exc_info=True)
348
  return False
349
 
350
  def get_historical_counts(
 
394
  conn.close()
395
  return results
396
  except Exception as e:
397
+ logger.error(f"Error fetching historical counts: {e}", exc_info=True)
398
  return []
399
 
400
  def get_latest_count(self) -> Optional[Dict]:
 
423
  }
424
  return None
425
  except Exception as e:
426
+ logger.error(f"Error fetching latest count: {e}", exc_info=True)
427
  return None
428
 
429
  def get_growth_stats(self, days: int = 7) -> Dict:
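Taken together, a caller only touches `get_current_model_count`; the scrape, cache, and API enumeration are ordered internally. A usage sketch, assuming the constructor (defined outside this diff) needs no required arguments and the host has network access:

```python
from services.model_tracker_improved import ImprovedModelCountTracker

tracker = ImprovedModelCountTracker()

snapshot = tracker.get_current_model_count()   # page scrape first, then API fallback
print(snapshot["total_models"], snapshot.get("source"))

cached = tracker.get_current_model_count()     # within cache_ttl: returns the cached dict
```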
backend/utils/data_loader.py CHANGED
@@ -50,18 +50,16 @@ class ModelDataLoader:
50
  else:
51
  df = df.copy()
52
 
53
- # Fill NaN values
54
  text_fields = ['tags', 'pipeline_tag', 'library_name', 'modelCard']
55
  for field in text_fields:
56
  if field in df.columns:
57
  df[field] = df[field].fillna('')
58
 
59
- # Combine text fields for embedding
60
  df['combined_text'] = (
61
  df.get('tags', '').astype(str) + ' ' +
62
  df.get('pipeline_tag', '').astype(str) + ' ' +
63
  df.get('library_name', '').astype(str) + ' ' +
64
- df['modelCard'].astype(str).str[:500] # Limit modelCard to first 500 chars
65
  )
66
 
67
  return df
@@ -94,7 +92,6 @@ class ModelDataLoader:
94
  else:
95
  df = df.copy()
96
 
97
- # Optimized filtering with vectorized operations
98
  if min_downloads is not None:
99
  downloads_col = df.get('downloads', pd.Series([0] * len(df), index=df.index))
100
  df = df[downloads_col >= min_downloads]
 
50
  else:
51
  df = df.copy()
52
 
 
53
  text_fields = ['tags', 'pipeline_tag', 'library_name', 'modelCard']
54
  for field in text_fields:
55
  if field in df.columns:
56
  df[field] = df[field].fillna('')
57
 
 
58
  df['combined_text'] = (
59
  df.get('tags', '').astype(str) + ' ' +
60
  df.get('pipeline_tag', '').astype(str) + ' ' +
61
  df.get('library_name', '').astype(str) + ' ' +
62
+ df['modelCard'].astype(str).str[:500]
63
  )
64
 
65
  return df
 
92
  else:
93
  df = df.copy()
94
 
 
95
  if min_downloads is not None:
96
  downloads_col = df.get('downloads', pd.Series([0] * len(df), index=df.index))
97
  df = df[downloads_col >= min_downloads]
backend/utils/embeddings.py CHANGED
@@ -27,7 +27,7 @@ class ModelEmbedder:
27
  def generate_embeddings(
28
  self,
29
  texts: List[str],
30
- batch_size: int = 128, # Increased default batch size for speed
31
  show_progress: bool = True
32
  ) -> np.ndarray:
33
  """
 
27
  def generate_embeddings(
28
  self,
29
  texts: List[str],
30
+ batch_size: int = 128,
31
  show_progress: bool = True
32
  ) -> np.ndarray:
33
  """
backend/utils/family_tree.py ADDED
@@ -0,0 +1,66 @@
1
+ """Family tree utility functions."""
2
+ import pandas as pd
3
+ from typing import Dict
4
+
5
+ def calculate_family_depths(df: pd.DataFrame) -> Dict[str, int]:
6
+ """Calculate family depth for each model."""
7
+ depths = {}
8
+ computing = set()
9
+
10
+ def get_depth(model_id: str) -> int:
11
+ if model_id in depths:
12
+ return depths[model_id]
13
+ if model_id in computing:
14
+ depths[model_id] = 0
15
+ return 0
16
+
17
+ computing.add(model_id)
18
+
19
+ try:
20
+ if df.index.name == 'model_id':
21
+ row = df.loc[model_id]
22
+ else:
23
+ rows = df[df.get('model_id', '') == model_id]
24
+ if len(rows) == 0:
25
+ depths[model_id] = 0
26
+ computing.remove(model_id)
27
+ return 0
28
+ row = rows.iloc[0]
29
+
30
+ parent_id = row.get('parent_model')
31
+ if parent_id and pd.notna(parent_id):
32
+ parent_str = str(parent_id)
33
+ if parent_str != 'nan' and parent_str != '':
34
+ if df.index.name == 'model_id' and parent_str in df.index:
35
+ depth = get_depth(parent_str) + 1
36
+ elif df.index.name != 'model_id':
37
+ parent_rows = df[df.get('model_id', '') == parent_str]
38
+ if len(parent_rows) > 0:
39
+ depth = get_depth(parent_str) + 1
40
+ else:
41
+ depth = 0
42
+ else:
43
+ depth = 0
44
+ else:
45
+ depth = 0
46
+ else:
47
+ depth = 0
48
+ except (KeyError, IndexError):
49
+ depth = 0
50
+
51
+ depths[model_id] = depth
52
+ computing.remove(model_id)
53
+ return depth
54
+
55
+ if df.index.name == 'model_id':
56
+ for model_id in df.index:
57
+ if model_id not in depths:
58
+ get_depth(str(model_id))
59
+ else:
60
+ for _, row in df.iterrows():
61
+ model_id = str(row.get('model_id', ''))
62
+ if model_id and model_id not in depths:
63
+ get_depth(model_id)
64
+
65
+ return depths
66
+
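A toy run of the depth calculation: a chain `m1 <- m2 <- m3` yields depths 0, 1, 2, and the `computing` set assigns depth 0 on cycles instead of recursing forever:

```python
import pandas as pd
from utils.family_tree import calculate_family_depths

df = pd.DataFrame({
    "model_id": ["m1", "m2", "m3"],
    "parent_model": [None, "m1", "m2"],
})
print(calculate_family_depths(df))  # {'m1': 0, 'm2': 1, 'm3': 2}
```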
backend/utils/graph_embeddings.py ADDED
@@ -0,0 +1,177 @@
1
+ """
2
+ Graph-aware embeddings for hierarchical model relationships.
3
+ Uses Node2Vec to create embeddings that respect family tree structure.
4
+ """
5
+ import numpy as np
6
+ import pandas as pd
7
+ from typing import Dict, List, Optional, Tuple
8
+ import networkx as nx
9
+ import pickle
10
+ import os
11
+ import logging
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ try:
16
+ from node2vec import Node2Vec
17
+ NODE2VEC_AVAILABLE = True
18
+ except ImportError:
19
+ NODE2VEC_AVAILABLE = False
20
+ logger.warning("node2vec not available. Install with: pip install node2vec")
21
+
22
+
23
+ class GraphEmbedder:
24
+ """
25
+ Generate graph embeddings that respect hierarchical relationships.
26
+ Combines text embeddings with graph structure embeddings.
27
+ """
28
+
29
+ def __init__(self, dimensions: int = 128, walk_length: int = 30, num_walks: int = 200):
30
+ """
31
+ Initialize graph embedder.
32
+
33
+ Args:
34
+ dimensions: Embedding dimensions
35
+ walk_length: Length of random walks
36
+ num_walks: Number of walks per node
37
+ """
38
+ self.dimensions = dimensions
39
+ self.walk_length = walk_length
40
+ self.num_walks = num_walks
41
+ self.graph: Optional[nx.DiGraph] = None
42
+ self.embeddings: Optional[np.ndarray] = None
43
+ self.model = None  # fitted gensim model; left unannotated so a missing node2vec import cannot raise NameError
44
+
45
+ def build_family_graph(self, df: pd.DataFrame) -> nx.DiGraph:
46
+ """
47
+ Build directed graph from family relationships.
48
+
49
+ Args:
50
+ df: DataFrame with model_id and parent_model columns
51
+
52
+ Returns:
53
+ NetworkX DiGraph
54
+ """
55
+ graph = nx.DiGraph()
56
+
57
+ for idx, row in df.iterrows():
58
+ model_id = str(row.get('model_id', idx))
59
+ graph.add_node(model_id)
60
+
61
+ parent_id = row.get('parent_model')
62
+ if parent_id and pd.notna(parent_id):
63
+ parent_str = str(parent_id)
64
+ if parent_str != 'nan' and parent_str != '':
65
+ graph.add_edge(parent_str, model_id)
66
+
67
+ self.graph = graph
68
+ logger.info(f"Built graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges")
69
+ return graph
70
+
71
+ def generate_graph_embeddings(
72
+ self,
73
+ graph: Optional[nx.DiGraph] = None,
74
+ workers: int = 4
75
+ ) -> Dict[str, np.ndarray]:
76
+ """
77
+ Generate Node2Vec embeddings for graph nodes.
78
+
79
+ Args:
80
+ graph: NetworkX graph (uses self.graph if None)
81
+ workers: Number of parallel workers
82
+
83
+ Returns:
84
+ Dictionary mapping model_id to embedding vector
85
+ """
86
+ if not NODE2VEC_AVAILABLE:
87
+ logger.warning("Node2Vec not available, returning empty embeddings")
88
+ return {}
89
+
90
+ if graph is None:
91
+ graph = self.graph
92
+
93
+ if graph is None or graph.number_of_nodes() == 0:
94
+ logger.warning("No graph available for embedding generation")
95
+ return {}
96
+
97
+ try:
98
+ node2vec = Node2Vec(
99
+ graph,
100
+ dimensions=self.dimensions,
101
+ walk_length=self.walk_length,
102
+ num_walks=self.num_walks,
103
+ workers=workers
104
+ )
105
+
106
+ model = node2vec.fit(window=10, min_count=1, batch_words=4)
107
+ self.model = model
108
+
109
+ embeddings_dict = {}
110
+ for node in graph.nodes():
111
+ if node in model.wv:
112
+ embeddings_dict[node] = model.wv[node]
113
+
114
+ logger.info(f"Generated graph embeddings for {len(embeddings_dict)} nodes")
115
+ return embeddings_dict
116
+
117
+ except Exception as e:
118
+ logger.error(f"Error generating graph embeddings: {e}", exc_info=True)
119
+ return {}
120
+
121
+ def combine_embeddings(
122
+ self,
123
+ text_embeddings: np.ndarray,
124
+ graph_embeddings: Dict[str, np.ndarray],
125
+ model_ids: List[str],
126
+ text_weight: float = 0.7,
127
+ graph_weight: float = 0.3
128
+ ) -> np.ndarray:
129
+ """
130
+ Combine text and graph embeddings by concatenating the L2-normalized, weighted parts.
131
+
132
+ Args:
133
+ text_embeddings: Text-based embeddings (n_samples, text_dim)
134
+ graph_embeddings: Graph embeddings dictionary
135
+ model_ids: List of model IDs corresponding to text_embeddings
136
+ text_weight: Weight for text embeddings
137
+ graph_weight: Weight for graph embeddings
138
+
139
+ Returns:
140
+ Combined embeddings (n_samples, combined_dim)
141
+ """
142
+ if not graph_embeddings:
143
+ return text_embeddings
144
+
145
+ text_dim = text_embeddings.shape[1]
146
+ graph_dim = next(iter(graph_embeddings.values())).shape[0]
147
+
148
+ combined = np.zeros((len(model_ids), text_dim + graph_dim))
149
+
150
+ for i, model_id in enumerate(model_ids):
151
+ model_id_str = str(model_id)
152
+
153
+ text_emb = text_embeddings[i]
154
+ graph_emb = graph_embeddings.get(model_id_str, np.zeros(graph_dim))
155
+
156
+ normalized_text = text_emb / (np.linalg.norm(text_emb) + 1e-8)
157
+ normalized_graph = graph_emb / (np.linalg.norm(graph_emb) + 1e-8)
158
+
159
+ combined[i] = np.concatenate([
160
+ normalized_text * text_weight,
161
+ normalized_graph * graph_weight
162
+ ])
163
+
164
+ return combined
165
+
166
+ def save_embeddings(self, embeddings: Dict[str, np.ndarray], filepath: str):
167
+ """Save graph embeddings to disk."""
168
+ os.makedirs(os.path.dirname(filepath) if os.path.dirname(filepath) else '.', exist_ok=True)
169
+ with open(filepath, 'wb') as f:
170
+ pickle.dump(embeddings, f)
171
+
172
+ def load_embeddings(self, filepath: str) -> Dict[str, np.ndarray]:
173
+ """Load graph embeddings from disk."""
174
+ with open(filepath, 'rb') as f:
175
+ return pickle.load(f)
176
+
177
+
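End to end, `combine_embeddings` concatenates the normalized text and graph parts, so the output width is `text_dim + graph_dim` regardless of the weights. A small sketch with random vectors (no Node2Vec run required; missing graph entries fall back to zeros):

```python
import numpy as np
from utils.graph_embeddings import GraphEmbedder

rng = np.random.default_rng(42)
model_ids = ["org/a", "org/b", "org/c"]

text_embeddings = rng.normal(size=(3, 384))   # e.g. sentence-transformer vectors
graph_embeddings = {
    "org/a": rng.normal(size=128),
    "org/b": rng.normal(size=128),            # "org/c" intentionally missing
}

embedder = GraphEmbedder(dimensions=128)
combined = embedder.combine_embeddings(text_embeddings, graph_embeddings, model_ids)
print(combined.shape)  # (3, 512): 384 text dims + 128 graph dims
```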
backend/utils/network_analysis.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  Network analysis module inspired by Open Syllabus Project.
3
  Builds co-occurrence networks for models based on shared contexts.
 
4
  """
5
  import pandas as pd
6
  import numpy as np
@@ -8,12 +9,66 @@ from collections import Counter
8
  from itertools import combinations
9
  from typing import List, Dict, Tuple, Optional, Set
10
  import networkx as nx
11
 
12
 
13
  class ModelNetworkBuilder:
14
  """
15
  Build network graphs for models based on co-occurrence patterns.
16
  Similar to Open Syllabus approach of connecting texts that appear together.
 
17
  """
18
 
19
  def __init__(self, df: pd.DataFrame):
@@ -22,13 +77,13 @@ class ModelNetworkBuilder:
22
 
23
  Args:
24
  df: DataFrame with model data including model_id, library_name,
25
- pipeline_tag, tags, parent_model, downloads, likes
 
26
  """
27
  self.df = df.copy()
28
  if 'model_id' not in self.df.columns:
29
  raise ValueError("DataFrame must contain 'model_id' column")
30
 
31
- # Ensure model_id is index for fast lookups
32
  if self.df.index.name != 'model_id':
33
  if 'model_id' in self.df.columns:
34
  self.df.set_index('model_id', drop=False, inplace=True)
@@ -208,23 +263,41 @@ class ModelNetworkBuilder:
208
  def build_family_tree_network(
209
  self,
210
  root_model_id: str,
211
- max_depth: int = 5
 
 
212
  ) -> nx.DiGraph:
213
  """
214
- Build directed graph of model family tree.
215
 
216
  Args:
217
  root_model_id: Root model to start from
218
- max_depth: Maximum depth to traverse
 
 
 
219
 
220
  Returns:
221
- NetworkX DiGraph representing family tree
222
  """
223
  graph = nx.DiGraph()
224
  visited = set()
225
 
226
- def add_family(current_id: str, depth: int):
227
- if depth <= 0 or current_id in visited:
 
228
  return
229
  visited.add(current_id)
230
 
@@ -233,28 +306,98 @@ class ModelNetworkBuilder:
233
 
234
  row = self.df.loc[current_id]
235
 
236
- # Add node
237
  graph.add_node(str(current_id))
238
  graph.nodes[str(current_id)]['title'] = self._format_title(current_id)
239
  graph.nodes[str(current_id)]['freq'] = int(row.get('downloads', 0))
240
 
241
- # Add edge to parent
242
- parent_id = row.get('parent_model')
243
- if parent_id and pd.notna(parent_id) and str(parent_id) != 'nan':
244
- parent_id_str = str(parent_id)
245
- graph.add_edge(parent_id_str, str(current_id))
246
- add_family(parent_id_str, depth - 1)
247
 
248
- # Add edges to children
249
- children = self.df[self.df.get('parent_model', '') == current_id]
250
- for child_id, child_row in children.iterrows():
251
  if str(child_id) not in visited:
252
- graph.add_edge(str(current_id), str(child_id))
253
- add_family(str(child_id), depth - 1)
254
 
255
  add_family(root_model_id, max_depth)
256
  return graph
257
 
258
  def export_graphml(self, graph: nx.Graph, filename: str):
259
  """Export graph to GraphML format (like Open Syllabus)."""
260
  nx.write_graphml(graph, filename)
 
1
  """
2
  Network analysis module inspired by Open Syllabus Project.
3
  Builds co-occurrence networks for models based on shared contexts.
4
+ Supports multiple relationship types: finetune, quantized, adapter, merge.
5
  """
6
  import pandas as pd
7
  import numpy as np
 
9
  from itertools import combinations
10
  from typing import List, Dict, Tuple, Optional, Set
11
  import networkx as nx
12
+ import ast
13
+ from datetime import datetime
14
+
15
+
16
+ def _parse_parent_list(value) -> List[str]:
17
+ """
18
+ Parse parent model list from string/eval format.
19
+ Handles both string representations and actual lists.
20
+ """
21
+ if pd.isna(value) or value == '' or str(value) == 'nan':
22
+ return []
23
+
24
+ try:
25
+ if isinstance(value, str):
26
+ if value.startswith('[') or value.startswith('('):
27
+ parsed = ast.literal_eval(value)
28
+ else:
29
+ parsed = [value]
30
+ else:
31
+ parsed = value
32
+
33
+ if isinstance(parsed, list):
34
+ return [str(p) for p in parsed if p and str(p) != 'nan']
35
+ elif parsed:
36
+ return [str(parsed)]
37
+ else:
38
+ return []
39
+ except (ValueError, SyntaxError):
40
+ return []
41
+
42
+
43
+ def _get_all_parents(row: pd.Series) -> Dict[str, List[str]]:
44
+ """
45
+ Extract all parent types from a row.
46
+ Returns dict mapping relationship type to list of parent IDs.
47
+ """
48
+ parents = {}
49
+
50
+ parent_columns = {
51
+ 'parent_model': 'parent',
52
+ 'finetune_parent': 'finetune',
53
+ 'quantized_parent': 'quantized',
54
+ 'adapter_parent': 'adapter',
55
+ 'merge_parent': 'merge'
56
+ }
57
+
58
+ for col, rel_type in parent_columns.items():
59
+ if col in row:
60
+ parent_list = _parse_parent_list(row.get(col))
61
+ if parent_list:
62
+ parents[rel_type] = parent_list
63
+
64
+ return parents
65
 
66
 
67
  class ModelNetworkBuilder:
68
  """
69
  Build network graphs for models based on co-occurrence patterns.
70
  Similar to Open Syllabus approach of connecting texts that appear together.
71
+ Supports multiple relationship types: finetune, quantized, adapter, merge.
72
  """
73
 
74
  def __init__(self, df: pd.DataFrame):
 
77
 
78
  Args:
79
  df: DataFrame with model data including model_id, library_name,
80
+ pipeline_tag, tags, parent_model, finetune_parent, quantized_parent,
81
+ adapter_parent, merge_parent, downloads, likes, createdAt
82
  """
83
  self.df = df.copy()
84
  if 'model_id' not in self.df.columns:
85
  raise ValueError("DataFrame must contain 'model_id' column")
86
 
 
87
  if self.df.index.name != 'model_id':
88
  if 'model_id' in self.df.columns:
89
  self.df.set_index('model_id', drop=False, inplace=True)
 
263
  def build_family_tree_network(
264
  self,
265
  root_model_id: str,
266
+ max_depth: Optional[int] = 5,
267
+ include_edge_attributes: bool = True,
268
+ filter_edge_types: Optional[List[str]] = None
269
  ) -> nx.DiGraph:
270
  """
271
+ Build a directed graph of a model's family tree, with multiple relationship types.
272
 
273
  Args:
274
  root_model_id: Root model to start from
275
+ max_depth: Maximum depth to traverse. If None, traverses entire tree without limit.
276
+ include_edge_attributes: Whether to calculate edge attributes (change in likes, downloads, etc.)
277
+ filter_edge_types: List of edge types to include (e.g., ['finetune', 'quantized']).
278
+ If None, includes all types.
279
 
280
  Returns:
281
+ NetworkX DiGraph representing family tree with edge types and attributes
282
  """
283
  graph = nx.DiGraph()
284
  visited = set()
285
 
286
+ children_index: Dict[str, List[Tuple[str, str]]] = {}
287
+ for idx, row in self.df.iterrows():
288
+ model_id = str(row.get('model_id', idx))
289
+ all_parents = _get_all_parents(row)
290
+
291
+ for rel_type, parent_list in all_parents.items():
292
+ for parent_id in parent_list:
293
+ if parent_id not in children_index:
294
+ children_index[parent_id] = []
295
+ children_index[parent_id].append((model_id, rel_type))
296
+
297
+ def add_family(current_id: str, depth: Optional[int]):
298
+ if current_id in visited:
299
+ return
300
+ if depth is not None and depth <= 0:
301
  return
302
  visited.add(current_id)
303
 
 
306
 
307
  row = self.df.loc[current_id]
308
 
 
309
  graph.add_node(str(current_id))
310
  graph.nodes[str(current_id)]['title'] = self._format_title(current_id)
311
  graph.nodes[str(current_id)]['freq'] = int(row.get('downloads', 0))
312
+ graph.nodes[str(current_id)]['likes'] = int(row.get('likes', 0))
313
+ graph.nodes[str(current_id)]['downloads'] = int(row.get('downloads', 0))
314
+ graph.nodes[str(current_id)]['library'] = str(row.get('library_name', '')) if pd.notna(row.get('library_name')) else ''
315
+ graph.nodes[str(current_id)]['pipeline'] = str(row.get('pipeline_tag', '')) if pd.notna(row.get('pipeline_tag')) else ''
316
 
317
+ createdAt = row.get('createdAt')
318
+ if pd.notna(createdAt):
319
+ graph.nodes[str(current_id)]['createdAt'] = str(createdAt)
320
 
321
+ all_parents = _get_all_parents(row)
322
+ for rel_type, parent_list in all_parents.items():
323
+ if filter_edge_types and rel_type not in filter_edge_types:
324
+ continue
325
+
326
+ for parent_id in parent_list:
327
+ if parent_id in self.df.index:
328
+ if not graph.has_edge(parent_id, str(current_id)):
329
+ graph.add_edge(parent_id, str(current_id))
+ graph[parent_id][str(current_id)]['edge_types'] = [rel_type]
330
+ graph[parent_id][str(current_id)]['edge_type'] = rel_type
+ elif rel_type not in graph[parent_id][str(current_id)].get('edge_types', []):
+ graph[parent_id][str(current_id)]['edge_types'].append(rel_type)
331
+
332
+ next_depth = depth - 1 if depth is not None else None
333
+ add_family(parent_id, next_depth)
334
+
335
+ children = children_index.get(current_id, [])
336
+ for child_id, rel_type in children:
337
+ if filter_edge_types and rel_type not in filter_edge_types:
338
+ continue
339
+
340
  if str(child_id) not in visited:
341
+ if not graph.has_edge(str(current_id), child_id):
342
+ graph.add_edge(str(current_id), child_id)
343
+ graph[str(current_id)][child_id]['edge_types'] = [rel_type]
344
+ graph[str(current_id)][child_id]['edge_type'] = rel_type
345
+ else:
346
+ if rel_type not in graph[str(current_id)][child_id].get('edge_types', []):
347
+ graph[str(current_id)][child_id]['edge_types'].append(rel_type)
348
+
349
+ next_depth = depth - 1 if depth is not None else None
350
+ add_family(child_id, next_depth)
351
 
352
  add_family(root_model_id, max_depth)
353
+
354
+ if include_edge_attributes:
355
+ self._add_edge_attributes(graph)
356
+
357
  return graph
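Continuing the hypothetical two-model example from the constructor sketch, building and inspecting a tree looks like this (`max_depth=None` walks the whole family):

```python
# Build the family tree rooted at 'org/base', keeping only finetune edges.
tree = builder.build_family_tree_network(
    'org/base',
    max_depth=None,                  # None = no depth limit
    filter_edge_types=['finetune'],  # drop quantized/adapter/merge edges
)

for u, v, attrs in tree.edges(data=True):
    print(u, '->', v, attrs['edge_types'])  # org/base -> org/base-ft ['finetune']
```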
358
 
359
+ def _add_edge_attributes(self, graph: nx.DiGraph):
360
+ """
361
+ Add edge attributes like change in likes, downloads, time difference.
362
+ Similar to the notebook's edge attribute calculation.
363
+ """
364
+ for edge in graph.edges():
365
+ parent_model = edge[0]
366
+ model_id = edge[1]
367
+
368
+ if parent_model not in graph.nodes() or model_id not in graph.nodes():
369
+ continue
370
+
371
+ parent_likes = graph.nodes[parent_model].get('likes', 0)
372
+ model_likes = graph.nodes[model_id].get('likes', 0)
373
+ parent_downloads = graph.nodes[parent_model].get('downloads', 0)
374
+ model_downloads = graph.nodes[model_id].get('downloads', 0)
375
+
376
+ graph.edges[edge]['change_in_likes'] = model_likes - parent_likes
377
+ if parent_likes != 0:
378
+ graph.edges[edge]['percentage_change_in_likes'] = (model_likes - parent_likes) / parent_likes
379
+ else:
380
+ graph.edges[edge]['percentage_change_in_likes'] = np.nan
381
+
382
+ graph.edges[edge]['change_in_downloads'] = model_downloads - parent_downloads
383
+ if parent_downloads != 0:
384
+ graph.edges[edge]['percentage_change_in_downloads'] = (model_downloads - parent_downloads) / parent_downloads
385
+ else:
386
+ graph.edges[edge]['percentage_change_in_downloads'] = np.nan
387
+
388
+ parent_created = graph.nodes[parent_model].get('createdAt')
389
+ model_created = graph.nodes[model_id].get('createdAt')
390
+
391
+ if parent_created and model_created:
392
+ try:
393
+ parent_dt = datetime.strptime(str(parent_created), '%Y-%m-%dT%H:%M:%S.%fZ')
394
+ model_dt = datetime.strptime(str(model_created), '%Y-%m-%dT%H:%M:%S.%fZ')
395
+ graph.edges[edge]['change_in_createdAt_days'] = (model_dt - parent_dt).days
396
+ except (ValueError, TypeError):
397
+ graph.edges[edge]['change_in_createdAt_days'] = np.nan
398
+ else:
399
+ graph.edges[edge]['change_in_createdAt_days'] = np.nan
400
+
401
  def export_graphml(self, graph: nx.Graph, filename: str):
402
  """Export graph to GraphML format (like Open Syllabus)."""
403
  nx.write_graphml(graph, filename)
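Closing out the same hypothetical example: the attributes computed by `_add_edge_attributes` are plain child-minus-parent deltas (ratios are left as `np.nan` when the parent value is zero), and the finished graph can be exported for tools such as Gephi:

```python
# Inspect the derived attributes on the parent -> child edge.
attrs = tree.edges[('org/base', 'org/base-ft')]
print(attrs['change_in_likes'])             # 5 - 40 = -35
print(attrs['percentage_change_in_likes'])  # -35 / 40 = -0.875
print(attrs['change_in_createdAt_days'])    # 31

builder.export_graphml(tree, 'org_base_family.graphml')
```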
frontend/.npmrc CHANGED
@@ -1,2 +1,4 @@
1
  legacy-peer-deps=true
2
 
 
 
 
1
  legacy-peer-deps=true
2
 
3
+
4
+
frontend/package-lock.json CHANGED
@@ -32,7 +32,8 @@
32
  "react-dom": "^18.2.0",
33
  "react-scripts": "5.0.1",
34
  "three": "^0.160.1",
35
- "typescript": "^5.0.0"
 
36
  }
37
  },
38
  "node_modules/@alloc/quick-lru": {
 
32
  "react-dom": "^18.2.0",
33
  "react-scripts": "5.0.1",
34
  "three": "^0.160.1",
35
+ "typescript": "^5.0.0",
36
+ "zustand": "^5.0.8"
37
  }
38
  },
39
  "node_modules/@alloc/quick-lru": {
frontend/package.json CHANGED
@@ -28,7 +28,8 @@
28
  "react-dom": "^18.2.0",
29
  "react-scripts": "5.0.1",
30
  "three": "^0.160.1",
31
- "typescript": "^5.0.0"
 
32
  },
33
  "scripts": {
34
  "start": "react-scripts start",
 
28
  "react-dom": "^18.2.0",
29
  "react-scripts": "5.0.1",
30
  "three": "^0.160.1",
31
+ "typescript": "^5.0.0",
32
+ "zustand": "^5.0.8"
33
  },
34
  "scripts": {
35
  "start": "react-scripts start",
frontend/public/index.html CHANGED
@@ -10,7 +10,7 @@
10
  />
11
  <link rel="preconnect" href="https://fonts.googleapis.com">
12
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
13
- <link href="https://fonts.googleapis.com/css2?family=Vend+Sans:wght@300;400;500;600;700&display=swap" rel="stylesheet">
14
  <title>Anatomy of a Machine Learning Ecosystem: 2 Million Models on Hugging Face</title>
15
  </head>
16
  <body>
 
10
  />
11
  <link rel="preconnect" href="https://fonts.googleapis.com">
12
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
13
+ <link href="https://fonts.googleapis.com/css2?family=Instrument+Sans:wght@400;500;600;700&display=swap" rel="stylesheet">
14
  <title>Anatomy of a Machine Learning Ecosystem: 2 Million Models on Hugging Face</title>
15
  </head>
16
  <body>
frontend/src/App.css CHANGED
@@ -7,86 +7,24 @@
7
  }
8
 
9
  .App-header {
10
- background: linear-gradient(135deg, #1a237e 0%, #283593 20%, #3949ab 40%, #5e35b1 60%, #7b1fa2 80%, #6a1b9a 100%);
11
- background-size: 200% 200%;
12
- animation: gradientShift 20s ease infinite;
13
  color: #ffffff;
14
- padding: 3rem 2.5rem;
15
  text-align: center;
16
- border-bottom: 2px solid rgba(100, 181, 246, 0.3);
17
- box-shadow: 0 4px 20px rgba(0, 0, 0, 0.25), 0 2px 10px rgba(123, 31, 162, 0.3);
18
  position: relative;
19
- overflow: hidden;
20
  }
21
 
22
- .App-header::before {
23
- content: '';
24
- position: absolute;
25
- top: 0;
26
- left: 0;
27
- right: 0;
28
- bottom: 0;
29
- background:
30
- radial-gradient(circle at 20% 50%, rgba(100, 181, 246, 0.15) 0%, transparent 50%),
31
- radial-gradient(circle at 80% 80%, rgba(156, 39, 176, 0.1) 0%, transparent 50%),
32
- radial-gradient(circle at 40% 20%, rgba(33, 150, 243, 0.1) 0%, transparent 50%);
33
- pointer-events: none;
34
- animation: pulse 8s ease-in-out infinite;
35
- }
36
-
37
- .App-header::after {
38
- content: '';
39
- position: absolute;
40
- top: 0;
41
- left: 0;
42
- right: 0;
43
- bottom: 0;
44
- background-image:
45
- repeating-linear-gradient(
46
- 0deg,
47
- transparent,
48
- transparent 2px,
49
- rgba(255, 255, 255, 0.03) 2px,
50
- rgba(255, 255, 255, 0.03) 4px
51
- );
52
- pointer-events: none;
53
- opacity: 0.5;
54
- }
55
-
56
- @keyframes gradientShift {
57
- 0% {
58
- background-position: 0% 50%;
59
- }
60
- 50% {
61
- background-position: 100% 50%;
62
- }
63
- 100% {
64
- background-position: 0% 50%;
65
- }
66
- }
67
-
68
- @keyframes pulse {
69
- 0%, 100% {
70
- opacity: 1;
71
- }
72
- 50% {
73
- opacity: 0.8;
74
- }
75
- }
76
 
77
  .App-header h1 {
78
  margin: 0 0 1rem 0;
79
- font-size: 2.25rem;
80
- font-weight: 700;
81
- letter-spacing: -0.02em;
82
- line-height: 1.2;
83
- position: relative;
84
- z-index: 1;
85
- text-shadow: 0 2px 8px rgba(0, 0, 0, 0.4), 0 4px 16px rgba(123, 31, 162, 0.3);
86
- background: linear-gradient(180deg, #ffffff 0%, #e1bee7 100%);
87
- -webkit-background-clip: text;
88
- -webkit-text-fill-color: transparent;
89
- background-clip: text;
90
  }
91
 
92
  .App-header p {
@@ -122,23 +60,17 @@
122
  }
123
 
124
  .stats span {
125
- padding: 0.75rem 1.5rem;
126
- background: rgba(255, 255, 255, 0.15);
127
- border-radius: 12px;
128
- backdrop-filter: blur(20px);
129
- -webkit-backdrop-filter: blur(20px);
130
- border: 2px solid rgba(255, 255, 255, 0.25);
131
- transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
132
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.1), inset 0 1px 0 rgba(255, 255, 255, 0.3);
133
- font-weight: 600;
134
- letter-spacing: 0.02em;
135
  }
136
 
137
  .stats span:hover {
138
- background: rgba(255, 255, 255, 0.25);
139
- transform: translateY(-2px) scale(1.05);
140
- box-shadow: 0 6px 20px rgba(0, 0, 0, 0.15), inset 0 1px 0 rgba(255, 255, 255, 0.4);
141
- border-color: rgba(255, 255, 255, 0.4);
142
  }
143
 
144
  .main-content {
@@ -149,10 +81,9 @@
149
  .sidebar {
150
  width: 340px;
151
  padding: 1.5rem;
152
- background: linear-gradient(to bottom, #fafafa 0%, #ffffff 100%);
153
  overflow-y: auto;
154
- border-right: 2px solid #e0e0e0;
155
- box-shadow: 2px 0 8px rgba(0, 0, 0, 0.05);
156
  }
157
 
158
  .sidebar h2 {
@@ -164,12 +95,11 @@
164
  }
165
 
166
  .sidebar h3 {
167
- font-size: 0.95rem;
168
- font-weight: 700;
169
- color: #5e35b1;
170
- margin: 0 0 1rem 0;
171
  letter-spacing: -0.01em;
172
- text-transform: none;
173
  }
174
 
175
  .sidebar label {
@@ -202,9 +132,8 @@
202
  .sidebar input[type="text"]:focus,
203
  .sidebar select:focus {
204
  outline: none;
205
- border-color: #5e35b1;
206
- box-shadow: 0 0 0 3px rgba(94, 53, 177, 0.12), 0 2px 6px rgba(0, 0, 0, 0.1);
207
- transform: translateY(-1px);
208
  }
209
 
210
  .sidebar input[type="range"] {
@@ -227,20 +156,20 @@
227
  .sidebar input[type="range"]::-webkit-slider-thumb {
228
  -webkit-appearance: none;
229
  appearance: none;
230
- width: 20px;
231
- height: 20px;
232
  border-radius: 50%;
233
- background: linear-gradient(135deg, #5e35b1 0%, #7b1fa2 100%);
234
  cursor: pointer;
235
- box-shadow: 0 2px 6px rgba(94, 53, 177, 0.3), 0 4px 12px rgba(94, 53, 177, 0.2);
236
- transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
237
- border: 3px solid #ffffff;
238
  }
239
 
240
  .sidebar input[type="range"]::-webkit-slider-thumb:hover {
241
- background: linear-gradient(135deg, #512da8 0%, #6a1b9a 100%);
242
- transform: scale(1.2);
243
- box-shadow: 0 3px 8px rgba(94, 53, 177, 0.4), 0 6px 16px rgba(94, 53, 177, 0.3);
244
  }
245
 
246
  .sidebar input[type="range"]::-webkit-slider-thumb:active {
@@ -248,20 +177,20 @@
248
  }
249
 
250
  .sidebar input[type="range"]::-moz-range-thumb {
251
- width: 20px;
252
- height: 20px;
253
  border-radius: 50%;
254
- background: linear-gradient(135deg, #5e35b1 0%, #7b1fa2 100%);
255
  cursor: pointer;
256
- border: 3px solid #ffffff;
257
- box-shadow: 0 2px 6px rgba(94, 53, 177, 0.3), 0 4px 12px rgba(94, 53, 177, 0.2);
258
- transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
259
  }
260
 
261
  .sidebar input[type="range"]::-moz-range-thumb:hover {
262
- background: linear-gradient(135deg, #512da8 0%, #6a1b9a 100%);
263
- transform: scale(1.2);
264
- box-shadow: 0 3px 8px rgba(94, 53, 177, 0.4), 0 6px 16px rgba(94, 53, 177, 0.3);
265
  }
266
 
267
  .sidebar input[type="range"]::-moz-range-thumb:active {
@@ -288,17 +217,16 @@
288
 
289
  .sidebar-section {
290
  background: #ffffff;
291
- border-radius: 8px;
292
  padding: 1.25rem;
293
- margin-bottom: 1.25rem;
294
  border: 1px solid #e0e0e0;
295
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
296
- transition: all 0.3s ease;
297
  }
298
 
299
  .sidebar-section:hover {
300
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.12);
301
  border-color: #d0d0d0;
 
302
  }
303
 
304
  .filter-chip {
@@ -380,22 +308,21 @@
380
  }
381
 
382
  .loading {
383
- color: #5e35b1;
384
  font-weight: 600;
385
- background: linear-gradient(135deg, #f5f3ff 0%, #ede7f6 100%);
386
- border: 2px solid #d1c4e9;
387
- box-shadow: 0 4px 12px rgba(94, 53, 177, 0.1);
388
  }
389
 
390
  .loading::after {
391
  content: '';
392
- width: 48px;
393
- height: 48px;
394
- border: 5px solid #e1bee7;
395
- border-top-color: #5e35b1;
396
- border-right-color: #7b1fa2;
397
  border-radius: 50%;
398
- animation: spin 0.8s cubic-bezier(0.68, -0.55, 0.265, 1.55) infinite;
399
  }
400
 
401
  @keyframes spin {
@@ -403,101 +330,62 @@
403
  }
404
 
405
  .error {
406
- color: #c62828;
407
- background: linear-gradient(135deg, #ffebee 0%, #ffcdd2 100%);
408
- border-radius: 12px;
409
- border: 2px solid #ef5350;
410
  max-width: 550px;
411
  margin: 0 auto;
412
- box-shadow: 0 4px 12px rgba(198, 40, 40, 0.15);
413
  font-weight: 500;
414
  }
415
 
416
- .error::before {
417
- content: '⚠️';
418
- font-size: 2.5rem;
419
- display: block;
420
- margin-bottom: 0.5rem;
421
- }
422
-
423
  .empty {
424
- color: #616161;
425
- background: linear-gradient(135deg, #fafafa 0%, #f5f5f5 100%);
426
- border-radius: 12px;
427
- border: 2px solid #e0e0e0;
428
  max-width: 550px;
429
  margin: 0 auto;
430
- box-shadow: 0 4px 12px rgba(0, 0, 0, 0.08);
431
  font-weight: 500;
432
  }
433
 
434
- .empty::before {
435
- content: 'πŸ”';
436
- font-size: 2.5rem;
437
- display: block;
438
- margin-bottom: 0.5rem;
439
- }
440
-
441
  .btn {
442
  padding: 0.625rem 1.25rem;
443
- border-radius: 6px;
444
  border: none;
445
  font-size: 0.9rem;
446
  font-weight: 600;
447
  cursor: pointer;
448
- transition: all 0.25s cubic-bezier(0.4, 0, 0.2, 1);
449
  font-family: 'Instrument Sans', sans-serif;
450
  display: inline-flex;
451
  align-items: center;
452
  justify-content: center;
453
  gap: 0.5rem;
454
- position: relative;
455
- overflow: hidden;
456
- }
457
-
458
- .btn::before {
459
- content: '';
460
- position: absolute;
461
- top: 50%;
462
- left: 50%;
463
- width: 0;
464
- height: 0;
465
- border-radius: 50%;
466
- background: rgba(255, 255, 255, 0.3);
467
- transform: translate(-50%, -50%);
468
- transition: width 0.6s, height 0.6s;
469
  }
470
 
471
- .btn:hover::before {
472
- width: 300px;
473
- height: 300px;
474
- }
475
 
476
  .btn-primary {
477
- background: linear-gradient(135deg, #5e35b1 0%, #7b1fa2 100%);
478
  color: white;
479
- box-shadow: 0 2px 4px rgba(94, 53, 177, 0.3);
480
  }
481
 
482
  .btn-primary:hover {
483
- background: linear-gradient(135deg, #512da8 0%, #6a1b9a 100%);
484
- transform: translateY(-2px);
485
- box-shadow: 0 4px 12px rgba(94, 53, 177, 0.4);
486
  }
487
 
488
  .btn-secondary {
489
  background: #f5f5f5;
490
- color: #1a1a1a;
491
- border: 2px solid #e0e0e0;
492
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
493
  }
494
 
495
  .btn-secondary:hover {
496
- background: #ffffff;
497
- border-color: #5e35b1;
498
- color: #5e35b1;
499
- transform: translateY(-1px);
500
- box-shadow: 0 2px 6px rgba(0, 0, 0, 0.12);
501
  }
502
 
503
  .btn-small {
@@ -585,7 +473,7 @@
585
  --text-primary: #ffffff;
586
  --text-secondary: #cccccc;
587
  --border-color: #444444;
588
- --accent-color: #64b5f6;
589
  }
590
 
591
  [data-theme="light"] {
@@ -595,32 +483,31 @@
595
  --text-primary: #1a1a1a;
596
  --text-secondary: #666666;
597
  --border-color: #d0d0d0;
598
- --accent-color: #1976d2;
599
  }
600
 
601
  /* Random Model Button */
602
  .random-model-btn {
603
  display: flex;
604
  align-items: center;
605
- gap: 0.5rem;
606
- padding: 0.5rem 1rem;
607
- background: var(--accent-color, #4a90e2);
608
  color: white;
609
  border: none;
610
  border-radius: 4px;
611
  cursor: pointer;
612
  font-size: 0.9rem;
613
  font-family: 'Instrument Sans', sans-serif;
614
- font-weight: 500;
615
  transition: all 0.2s;
616
  width: 100%;
617
- justify-content: center;
618
  }
619
 
620
  .random-model-btn:hover:not(:disabled) {
621
- background: var(--accent-color, #357abd);
622
  transform: translateY(-1px);
623
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
624
  }
625
 
626
  .random-model-btn:disabled {
@@ -628,10 +515,6 @@
628
  cursor: not-allowed;
629
  }
630
 
631
- .random-icon {
632
- font-size: 1.1rem;
633
- }
634
-
635
  /* Zoom Slider */
636
  .zoom-slider-container {
637
  margin-bottom: 1rem;
@@ -859,7 +742,7 @@
859
  width: 18px;
860
  height: 18px;
861
  cursor: pointer;
862
- accent-color: #5e35b1;
863
  margin-right: 0.5rem;
864
  }
865
 
 
7
  }
8
 
9
  .App-header {
10
+ background: #2d2d2d;
 
 
11
  color: #ffffff;
12
+ padding: 2.5rem 2rem;
13
  text-align: center;
14
+ border-bottom: 1px solid #404040;
15
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
16
  position: relative;
 
17
  }
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  .App-header h1 {
21
  margin: 0 0 1rem 0;
22
+ font-size: 2rem;
23
+ font-weight: 600;
24
+ letter-spacing: -0.01em;
25
+ line-height: 1.3;
26
+ color: #ffffff;
27
+ text-shadow: 0 1px 3px rgba(0, 0, 0, 0.3);
28
  }
29
 
30
  .App-header p {
 
60
  }
61
 
62
  .stats span {
63
+ padding: 0.625rem 1.25rem;
64
+ background: rgba(255, 255, 255, 0.1);
65
+ border-radius: 6px;
66
+ border: 1px solid rgba(255, 255, 255, 0.2);
67
+ transition: all 0.2s ease;
68
+ font-weight: 500
69
  }
70
 
71
  .stats span:hover {
72
+ background: rgba(255, 255, 255, 0.15);
73
+ transform: translateY(-1px);
 
 
74
  }
75
 
76
  .main-content {
 
81
  .sidebar {
82
  width: 340px;
83
  padding: 1.5rem;
84
+ background: #fafafa;
85
  overflow-y: auto;
86
+ border-right: 1px solid #e0e0e0;
 
87
  }
88
 
89
  .sidebar h2 {
 
95
  }
96
 
97
  .sidebar h3 {
98
+ font-size: 0.9rem;
99
+ font-weight: 600;
100
+ color: #2d2d2d;
101
+ margin: 0 0 0.875rem 0;
102
  letter-spacing: -0.01em;
 
103
  }
104
 
105
  .sidebar label {
 
132
  .sidebar input[type="text"]:focus,
133
  .sidebar select:focus {
134
  outline: none;
135
+ border-color: #4a4a4a;
136
+ box-shadow: 0 0 0 2px rgba(0, 0, 0, 0.08);
 
137
  }
138
 
139
  .sidebar input[type="range"] {
 
156
  .sidebar input[type="range"]::-webkit-slider-thumb {
157
  -webkit-appearance: none;
158
  appearance: none;
159
+ width: 18px;
160
+ height: 18px;
161
  border-radius: 50%;
162
+ background: #4a4a4a;
163
  cursor: pointer;
164
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
165
+ transition: all 0.2s ease;
166
+ border: 2px solid #ffffff;
167
  }
168
 
169
  .sidebar input[type="range"]::-webkit-slider-thumb:hover {
170
+ background: #2d2d2d;
171
+ transform: scale(1.1);
172
+ box-shadow: 0 2px 6px rgba(0, 0, 0, 0.3);
173
  }
174
 
175
  .sidebar input[type="range"]::-webkit-slider-thumb:active {
 
177
  }
178
 
179
  .sidebar input[type="range"]::-moz-range-thumb {
180
+ width: 18px;
181
+ height: 18px;
182
  border-radius: 50%;
183
+ background: #4a4a4a;
184
  cursor: pointer;
185
+ border: 2px solid #ffffff;
186
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
187
+ transition: all 0.2s ease;
188
  }
189
 
190
  .sidebar input[type="range"]::-moz-range-thumb:hover {
191
+ background: #2d2d2d;
192
+ transform: scale(1.1);
193
+ box-shadow: 0 2px 6px rgba(0, 0, 0, 0.3);
194
  }
195
 
196
  .sidebar input[type="range"]::-moz-range-thumb:active {
 
217
 
218
  .sidebar-section {
219
  background: #ffffff;
220
+ border-radius: 6px;
221
  padding: 1.25rem;
222
+ margin-bottom: 1rem;
223
  border: 1px solid #e0e0e0;
224
+ transition: all 0.2s ease;
 
225
  }
226
 
227
  .sidebar-section:hover {
 
228
  border-color: #d0d0d0;
229
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
230
  }
231
 
232
  .filter-chip {
 
308
  }
309
 
310
  .loading {
311
+ color: #2d2d2d;
312
  font-weight: 600;
313
+ background: #f5f5f5;
314
+ border: 1px solid #d0d0d0;
315
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
316
  }
317
 
318
  .loading::after {
319
  content: '';
320
+ width: 40px;
321
+ height: 40px;
322
+ border: 4px solid #e0e0e0;
323
+ border-top-color: #4a4a4a;
 
324
  border-radius: 50%;
325
+ animation: spin 0.8s linear infinite;
326
  }
327
 
328
  @keyframes spin {
 
330
  }
331
 
332
  .error {
333
+ color: #d32f2f;
334
+ background: #ffebee;
335
+ border-radius: 8px;
336
+ border: 1px solid #ffcdd2;
337
  max-width: 550px;
338
  margin: 0 auto;
 
339
  font-weight: 500;
340
  }
341
 
342
  .empty {
343
+ color: #6a6a6a;
344
+ background: #f5f5f5;
345
+ border-radius: 8px;
346
+ border: 1px solid #e0e0e0;
347
  max-width: 550px;
348
  margin: 0 auto;
 
349
  font-weight: 500;
350
  }
351
 
352
  .btn {
353
  padding: 0.625rem 1.25rem;
354
+ border-radius: 4px;
355
  border: none;
356
  font-size: 0.9rem;
357
  font-weight: 600;
358
  cursor: pointer;
359
+ transition: all 0.2s ease;
360
  font-family: 'Instrument Sans', sans-serif;
361
  display: inline-flex;
362
  align-items: center;
363
  justify-content: center;
364
  gap: 0.5rem;
365
  }
366
 
367
 
368
  .btn-primary {
369
+ background: #2d2d2d;
370
  color: white;
371
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.15);
372
  }
373
 
374
  .btn-primary:hover {
375
+ background: #1a1a1a;
376
+ transform: translateY(-1px);
377
+ box-shadow: 0 3px 8px rgba(0, 0, 0, 0.2);
378
  }
379
 
380
  .btn-secondary {
381
  background: #f5f5f5;
382
+ color: #2d2d2d;
383
+ border: 1px solid #d0d0d0;
 
384
  }
385
 
386
  .btn-secondary:hover {
387
+ background: #e8e8e8;
388
+ border-color: #b0b0b0;
 
 
 
389
  }
390
 
391
  .btn-small {
 
473
  --text-primary: #ffffff;
474
  --text-secondary: #cccccc;
475
  --border-color: #444444;
476
+ --accent-color: #4a4a4a;
477
  }
478
 
479
  [data-theme="light"] {
 
483
  --text-primary: #1a1a1a;
484
  --text-secondary: #666666;
485
  --border-color: #d0d0d0;
486
+ --accent-color: #4a4a4a;
487
  }
488
 
489
  /* Random Model Button */
490
  .random-model-btn {
491
  display: flex;
492
  align-items: center;
493
+ justify-content: center;
494
+ padding: 0.625rem 1.25rem;
495
+ background: #2d2d2d;
496
  color: white;
497
  border: none;
498
  border-radius: 4px;
499
  cursor: pointer;
500
  font-size: 0.9rem;
501
  font-family: 'Instrument Sans', sans-serif;
502
+ font-weight: 600;
503
  transition: all 0.2s;
504
  width: 100%;
 
505
  }
506
 
507
  .random-model-btn:hover:not(:disabled) {
508
+ background: #1a1a1a;
509
  transform: translateY(-1px);
510
+ box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
511
  }
512
 
513
  .random-model-btn:disabled {
 
515
  cursor: not-allowed;
516
  }
517
 
518
  /* Zoom Slider */
519
  .zoom-slider-container {
520
  margin-bottom: 1rem;
 
742
  width: 18px;
743
  height: 18px;
744
  cursor: pointer;
745
+ accent-color: #4a4a4a;
746
  margin-right: 0.5rem;
747
  }
748
 
frontend/src/App.tsx CHANGED
@@ -506,28 +506,24 @@ function App() {
506
  alignItems: 'center',
507
  marginBottom: '1.5rem',
508
  paddingBottom: '1rem',
509
- borderBottom: '2px solid #e8e8e8'
510
  }}>
511
  <h2 style={{
512
  margin: 0,
513
  fontSize: '1.5rem',
514
- fontWeight: '700',
515
- background: 'linear-gradient(135deg, #5e35b1 0%, #7b1fa2 100%)',
516
- WebkitBackgroundClip: 'text',
517
- WebkitTextFillColor: 'transparent',
518
- backgroundClip: 'text'
519
  }}>
520
  Filters & Controls
521
  </h2>
522
  {activeFilterCount > 0 && (
523
  <div style={{
524
  fontSize: '0.75rem',
525
- background: 'linear-gradient(135deg, #5e35b1 0%, #7b1fa2 100%)',
526
  color: 'white',
527
- padding: '0.4rem 0.75rem',
528
- borderRadius: '16px',
529
- fontWeight: '600',
530
- boxShadow: '0 2px 6px rgba(94, 53, 177, 0.3)'
531
  }}>
532
  {activeFilterCount} active
533
  </div>
@@ -537,40 +533,40 @@ function App() {
537
  {/* Filter Results Count */}
538
  {!loading && data.length > 0 && (
539
  <div className="sidebar-section" style={{
540
- background: 'linear-gradient(135deg, #f3e5f5 0%, #e1bee7 100%)',
541
- border: '2px solid #ce93d8',
542
  fontSize: '0.9rem',
543
  marginBottom: '1.5rem'
544
  }}>
545
  <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '0.5rem' }}>
546
  <div>
547
- <strong style={{ fontSize: '1.1rem', color: '#6a1b9a' }}>
548
  {data.length.toLocaleString()}
549
  </strong>
550
- <span style={{ marginLeft: '0.4rem', color: '#4a148c' }}>
551
  {data.length === 1 ? 'model' : 'models'}
552
  </span>
553
  </div>
554
  {embeddingType === 'graph-aware' && (
555
  <span style={{
556
  fontSize: '0.7rem',
557
- background: '#7b1fa2',
558
  color: 'white',
559
  padding: '0.3rem 0.6rem',
560
  borderRadius: '12px',
561
  fontWeight: '600'
562
  }}>
563
- 🌐 Graph
564
  </span>
565
  )}
566
  </div>
567
  {filteredCount !== null && filteredCount !== data.length && (
568
- <div style={{ fontSize: '0.8rem', color: '#6a1b9a', marginTop: '0.25rem' }}>
569
  of {filteredCount.toLocaleString()} matching
570
  </div>
571
  )}
572
  {stats && filteredCount !== null && filteredCount < stats.total_models && (
573
- <div style={{ fontSize: '0.75rem', color: '#8e24aa', marginTop: '0.25rem' }}>
574
  from {stats.total_models.toLocaleString()} total
575
  </div>
576
  )}
@@ -579,15 +575,7 @@ function App() {
579
 
580
  {/* Search Section */}
581
  <div className="sidebar-section">
582
- <h3 style={{
583
- display: 'flex',
584
- alignItems: 'center',
585
- gap: '0.5rem',
586
- color: '#5e35b1',
587
- marginBottom: '0.75rem'
588
- }}>
589
- πŸ” Search Models
590
- </h3>
591
  <input
592
  type="text"
593
  value={searchQuery}
@@ -602,14 +590,7 @@ function App() {
602
 
603
  {/* Popularity Filters */}
604
  <div className="sidebar-section">
605
- <h3 style={{
606
- display: 'flex',
607
- alignItems: 'center',
608
- gap: '0.5rem',
609
- color: '#5e35b1'
610
- }}>
611
- πŸ“Š Popularity Filters
612
- </h3>
613
 
614
  <label style={{ marginBottom: '1rem', display: 'block' }}>
615
  <div style={{ display: 'flex', justifyContent: 'space-between', marginBottom: '0.5rem' }}>
@@ -706,14 +687,7 @@ function App() {
706
 
707
  {/* Discovery */}
708
  <div className="sidebar-section">
709
- <h3 style={{
710
- display: 'flex',
711
- alignItems: 'center',
712
- gap: '0.5rem',
713
- color: '#5e35b1'
714
- }}>
715
- 🎲 Discovery
716
- </h3>
717
  <RandomModelButton
718
  data={data}
719
  onSelect={(model: ModelPoint) => {
@@ -727,15 +701,7 @@ function App() {
727
  {/* Visualization Options */}
728
  <div className="sidebar-section">
729
  <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '1rem' }}>
730
- <h3 style={{
731
- margin: 0,
732
- display: 'flex',
733
- alignItems: 'center',
734
- gap: '0.5rem',
735
- color: '#5e35b1'
736
- }}>
737
- 🎨 Visualization
738
- </h3>
739
  <ThemeToggle />
740
  </div>
741
 
@@ -862,10 +828,10 @@ function App() {
862
  </select>
863
  </label>
864
 
865
- <div className="sidebar-section" style={{ background: '#fff3cd', borderColor: '#ffc107', marginBottom: '1rem', padding: '0.75rem', borderRadius: '4px', border: '1px solid' }}>
866
  <label style={{ display: 'block', marginBottom: '0' }}>
867
- <span style={{ fontWeight: '600', display: 'block', marginBottom: '0.5rem', color: '#856404' }}>
868
- βš™οΈ Projection Method
869
  </span>
870
  <select
871
  value={projectionMethod}
@@ -875,7 +841,7 @@ function App() {
875
  <option value="umap">UMAP (better global structure)</option>
876
  <option value="tsne">t-SNE (better local clusters)</option>
877
  </select>
878
- <div style={{ fontSize: '0.75rem', color: '#856404', marginTop: '0.5rem', lineHeight: '1.4' }}>
879
  <strong>UMAP:</strong> Preserves global structure, better for exploring relationships<br/>
880
  <strong>t-SNE:</strong> Emphasizes local clusters, better for finding groups
881
  </div>
@@ -884,15 +850,8 @@ function App() {
884
  </div>
885
 
886
  {/* View Modes */}
887
- <div className="sidebar-section" style={{ background: 'linear-gradient(135deg, #f3e5f5 0%, #fce4ec 100%)', border: '2px solid #f48fb1' }}>
888
- <h3 style={{
889
- display: 'flex',
890
- alignItems: 'center',
891
- gap: '0.5rem',
892
- color: '#5e35b1'
893
- }}>
894
- ⚑ View Modes
895
- </h3>
896
 
897
  <label style={{ marginBottom: '1rem', display: 'flex', alignItems: 'center', cursor: 'pointer' }}>
898
  <input
@@ -937,7 +896,7 @@ function App() {
937
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
938
  />
939
  <div>
940
- <span style={{ fontWeight: '500' }}>🌐 Graph-Aware Embeddings</span>
941
  <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
942
  Use embeddings that respect family tree structure. Models in the same family will be closer together.
943
  </div>
@@ -955,11 +914,11 @@ function App() {
955
  color: '#666'
956
  }}>
957
  <div style={{ display: 'flex', alignItems: 'center', gap: '0.5rem', marginBottom: '0.25rem' }}>
958
- <strong style={{ color: embeddingType === 'graph-aware' ? '#2e7d32' : '#666' }}>
959
- {embeddingType === 'graph-aware' ? '🌐 Graph-Aware' : 'πŸ“ Text-Only'} Embeddings
960
  </strong>
961
  </div>
962
- <div style={{ fontSize: '0.7rem', color: '#888', lineHeight: '1.4' }}>
963
  {embeddingType === 'graph-aware'
964
  ? 'Models in the same family tree are positioned closer together, revealing hierarchical relationships.'
965
  : 'Standard text-based embeddings showing semantic similarity from model descriptions and tags.'}
@@ -1006,15 +965,8 @@ function App() {
1006
 
1007
  {/* Structural Visualization Options */}
1008
  {viewMode === '3d' && (
1009
- <div className="sidebar-section" style={{ background: 'linear-gradient(135deg, #e8f5e9 0%, #f1f8e9 100%)', border: '2px solid #aed581' }}>
1010
- <h3 style={{
1011
- display: 'flex',
1012
- alignItems: 'center',
1013
- gap: '0.5rem',
1014
- color: '#5e35b1'
1015
- }}>
1016
- πŸ”— Network Structure
1017
- </h3>
1018
  <div style={{ fontSize: '0.75rem', color: '#666', marginBottom: '1rem', lineHeight: '1.4' }}>
1019
  Explore relationships and structure in the model ecosystem
1020
  </div>
@@ -1026,12 +978,12 @@ function App() {
1026
  onChange={(e) => setOverviewMode(e.target.checked)}
1027
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
1028
  />
1029
- <div>
1030
- <span style={{ fontWeight: '500' }}>πŸ” Overview Mode</span>
1031
- <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
1032
- Zoom out to see full ecosystem structure with all relationships visible. Camera will automatically adjust.
1033
- </div>
1034
  </div>
 
1035
  </label>
1036
 
1037
  <label style={{ marginBottom: '1rem', display: 'flex', alignItems: 'center', cursor: 'pointer' }}>
@@ -1041,12 +993,12 @@ function App() {
1041
  onChange={(e) => setShowNetworkEdges(e.target.checked)}
1042
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
1043
  />
1044
- <div>
1045
- <span style={{ fontWeight: '500' }}>🌐 Network Relationships</span>
1046
- <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
1047
- Show connections between related models (same library, pipeline, or tags). Blue = library, Pink = pipeline.
1048
- </div>
1049
  </div>
 
1050
  </label>
1051
 
1052
  {showNetworkEdges && (
@@ -1073,26 +1025,19 @@ function App() {
1073
  onChange={(e) => setShowStructuralGroups(e.target.checked)}
1074
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
1075
  />
1076
- <div>
1077
- <span style={{ fontWeight: '500' }}>πŸ“¦ Structural Groupings</span>
1078
- <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
1079
- Highlight clusters and groups with wireframe boundaries. Shows top library and pipeline clusters.
1080
- </div>
1081
  </div>
 
1082
  </label>
1083
  </div>
1084
  )}
1085
 
1086
  {/* Quick Filters */}
1087
  <div className="sidebar-section">
1088
- <h3 style={{
1089
- display: 'flex',
1090
- alignItems: 'center',
1091
- gap: '0.5rem',
1092
- color: '#5e35b1'
1093
- }}>
1094
- ⚑ Quick Actions
1095
- </h3>
1096
  <div style={{ display: 'flex', flexWrap: 'wrap', gap: '0.5rem' }}>
1097
  <button
1098
  onClick={() => {
@@ -1137,14 +1082,7 @@ function App() {
1137
  </div>
1138
 
1139
  <div className="sidebar-section">
1140
- <h3 style={{
1141
- display: 'flex',
1142
- alignItems: 'center',
1143
- gap: '0.5rem',
1144
- color: '#5e35b1'
1145
- }}>
1146
- 🌳 Hierarchy Navigation
1147
- </h3>
1148
  <label style={{ marginBottom: '1rem', display: 'block' }}>
1149
  <span style={{ fontWeight: '500', display: 'block', marginBottom: '0.5rem' }}>
1150
  Max Hierarchy Depth
@@ -1224,14 +1162,7 @@ function App() {
1224
  </div>
1225
 
1226
  <div className="sidebar-section">
1227
- <h3 style={{
1228
- display: 'flex',
1229
- alignItems: 'center',
1230
- gap: '0.5rem',
1231
- color: '#5e35b1'
1232
- }}>
1233
- πŸ‘₯ Family Tree Explorer
1234
- </h3>
1235
  <div style={{ position: 'relative' }}>
1236
  <input
1237
  type="text"
 
506
  alignItems: 'center',
507
  marginBottom: '1.5rem',
508
  paddingBottom: '1rem',
509
+ borderBottom: '1px solid #e0e0e0'
510
  }}>
511
  <h2 style={{
512
  margin: 0,
513
  fontSize: '1.5rem',
514
+ fontWeight: '600',
515
+ color: '#2d2d2d'
516
  }}>
517
  Filters & Controls
518
  </h2>
519
  {activeFilterCount > 0 && (
520
  <div style={{
521
  fontSize: '0.75rem',
522
+ background: '#4a4a4a',
523
  color: 'white',
524
+ padding: '0.35rem 0.7rem',
525
+ borderRadius: '12px',
526
+ fontWeight: '600'
 
527
  }}>
528
  {activeFilterCount} active
529
  </div>
 
533
  {/* Filter Results Count */}
534
  {!loading && data.length > 0 && (
535
  <div className="sidebar-section" style={{
536
+ background: '#f5f5f5',
537
+ border: '1px solid #d0d0d0',
538
  fontSize: '0.9rem',
539
  marginBottom: '1.5rem'
540
  }}>
541
  <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '0.5rem' }}>
542
  <div>
543
+ <strong style={{ fontSize: '1.1rem', color: '#2d2d2d' }}>
544
  {data.length.toLocaleString()}
545
  </strong>
546
+ <span style={{ marginLeft: '0.4rem', color: '#4a4a4a' }}>
547
  {data.length === 1 ? 'model' : 'models'}
548
  </span>
549
  </div>
550
  {embeddingType === 'graph-aware' && (
551
  <span style={{
552
  fontSize: '0.7rem',
553
+ background: '#4a4a4a',
554
  color: 'white',
555
  padding: '0.3rem 0.6rem',
556
  borderRadius: '12px',
557
  fontWeight: '600'
558
  }}>
559
+ Graph
560
  </span>
561
  )}
562
  </div>
563
  {filteredCount !== null && filteredCount !== data.length && (
564
+ <div style={{ fontSize: '0.8rem', color: '#666', marginTop: '0.25rem' }}>
565
  of {filteredCount.toLocaleString()} matching
566
  </div>
567
  )}
568
  {stats && filteredCount !== null && filteredCount < stats.total_models && (
569
+ <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
570
  from {stats.total_models.toLocaleString()} total
571
  </div>
572
  )}
 
575
 
576
  {/* Search Section */}
577
  <div className="sidebar-section">
578
+ <h3>Search Models</h3>
579
  <input
580
  type="text"
581
  value={searchQuery}
 
590
 
591
  {/* Popularity Filters */}
592
  <div className="sidebar-section">
593
+ <h3>Popularity Filters</h3>
594
 
595
  <label style={{ marginBottom: '1rem', display: 'block' }}>
596
  <div style={{ display: 'flex', justifyContent: 'space-between', marginBottom: '0.5rem' }}>
 
687
 
688
  {/* Discovery */}
689
  <div className="sidebar-section">
690
+ <h3>Discovery</h3>
691
  <RandomModelButton
692
  data={data}
693
  onSelect={(model: ModelPoint) => {
 
701
  {/* Visualization Options */}
702
  <div className="sidebar-section">
703
  <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '1rem' }}>
704
+ <h3 style={{ margin: 0 }}>Visualization Options</h3>
705
  <ThemeToggle />
706
  </div>
707
 
 
828
  </select>
829
  </label>
830
 
831
+ <div className="sidebar-section" style={{ background: '#f5f5f5', borderColor: '#d0d0d0', marginBottom: '1rem', padding: '0.75rem', borderRadius: '4px', border: '1px solid' }}>
832
  <label style={{ display: 'block', marginBottom: '0' }}>
833
+ <span style={{ fontWeight: '600', display: 'block', marginBottom: '0.5rem', color: '#2d2d2d' }}>
834
+ Projection Method
835
  </span>
836
  <select
837
  value={projectionMethod}
 
841
  <option value="umap">UMAP (better global structure)</option>
842
  <option value="tsne">t-SNE (better local clusters)</option>
843
  </select>
844
+ <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.5rem', lineHeight: '1.4' }}>
845
  <strong>UMAP:</strong> Preserves global structure, better for exploring relationships<br/>
846
  <strong>t-SNE:</strong> Emphasizes local clusters, better for finding groups
847
  </div>
 
850
  </div>
851
 
852
  {/* View Modes */}
853
+ <div className="sidebar-section">
854
+ <h3>View Modes</h3>
855
 
856
  <label style={{ marginBottom: '1rem', display: 'flex', alignItems: 'center', cursor: 'pointer' }}>
857
  <input
 
896
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
897
  />
898
  <div>
899
+ <span style={{ fontWeight: '500' }}>Graph-Aware Embeddings</span>
900
  <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
901
  Use embeddings that respect family tree structure. Models in the same family will be closer together.
902
  </div>
 
914
  color: '#666'
915
  }}>
916
  <div style={{ display: 'flex', alignItems: 'center', gap: '0.5rem', marginBottom: '0.25rem' }}>
917
+ <strong style={{ color: '#2d2d2d' }}>
918
+ {embeddingType === 'graph-aware' ? 'Graph-Aware' : 'Text-Only'} Embeddings
919
  </strong>
920
  </div>
921
+ <div style={{ fontSize: '0.7rem', color: '#666', lineHeight: '1.4' }}>
922
  {embeddingType === 'graph-aware'
923
  ? 'Models in the same family tree are positioned closer together, revealing hierarchical relationships.'
924
  : 'Standard text-based embeddings showing semantic similarity from model descriptions and tags.'}
 
965
 
966
  {/* Structural Visualization Options */}
967
  {viewMode === '3d' && (
968
+ <div className="sidebar-section">
969
+ <h3>Network Structure</h3>
970
  <div style={{ fontSize: '0.75rem', color: '#666', marginBottom: '1rem', lineHeight: '1.4' }}>
971
  Explore relationships and structure in the model ecosystem
972
  </div>
 
978
  onChange={(e) => setOverviewMode(e.target.checked)}
979
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
980
  />
981
+ <div>
982
+ <span style={{ fontWeight: '500' }}>Overview Mode</span>
983
+ <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
984
+ Zoom out to see full ecosystem structure with all relationships visible. Camera will automatically adjust.
 
985
  </div>
986
+ </div>
987
  </label>
988
 
989
  <label style={{ marginBottom: '1rem', display: 'flex', alignItems: 'center', cursor: 'pointer' }}>
 
993
  onChange={(e) => setShowNetworkEdges(e.target.checked)}
994
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
995
  />
996
+ <div>
997
+ <span style={{ fontWeight: '500' }}>Network Relationships</span>
998
+ <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
999
+ Show connections between related models (same library, pipeline, or tags). Blue = library, Pink = pipeline.
 
1000
  </div>
1001
+ </div>
1002
  </label>
1003
 
1004
  {showNetworkEdges && (
 
1025
  onChange={(e) => setShowStructuralGroups(e.target.checked)}
1026
  style={{ marginRight: '0.5rem', cursor: 'pointer' }}
1027
  />
1028
+ <div>
1029
+ <span style={{ fontWeight: '500' }}>Structural Groupings</span>
1030
+ <div style={{ fontSize: '0.75rem', color: '#666', marginTop: '0.25rem' }}>
1031
+ Highlight clusters and groups with wireframe boundaries. Shows top library and pipeline clusters.
 
1032
  </div>
1033
+ </div>
1034
  </label>
1035
  </div>
1036
  )}
1037
 
1038
  {/* Quick Filters */}
1039
  <div className="sidebar-section">
1040
+ <h3>Quick Actions</h3>
1041
  <div style={{ display: 'flex', flexWrap: 'wrap', gap: '0.5rem' }}>
1042
  <button
1043
  onClick={() => {
 
1082
  </div>
1083
 
1084
  <div className="sidebar-section">
1085
+ <h3>Hierarchy Navigation</h3>
1086
  <label style={{ marginBottom: '1rem', display: 'block' }}>
1087
  <span style={{ fontWeight: '500', display: 'block', marginBottom: '0.5rem' }}>
1088
  Max Hierarchy Depth
 
1162
  </div>
1163
 
1164
  <div className="sidebar-section">
1165
+ <h3>Family Tree Explorer</h3>
1166
  <div style={{ position: 'relative' }}>
1167
  <input
1168
  type="text"
frontend/src/components/PaperPlots.css DELETED
@@ -1,92 +0,0 @@
1
- .paper-plots {
2
- display: flex;
3
- flex-direction: column;
4
- gap: 1rem;
5
- padding: 1rem;
6
- background: white;
7
- border-radius: 8px;
8
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
9
- }
10
-
11
- .plot-selector {
12
- border-bottom: 1px solid #e0e0e0;
13
- padding-bottom: 1rem;
14
- }
15
-
16
- .plot-selector h3 {
17
- margin: 0 0 0.75rem 0;
18
- font-size: 1.25rem;
19
- color: #333;
20
- }
21
-
22
- .plot-buttons {
23
- display: flex;
24
- flex-wrap: wrap;
25
- gap: 0.5rem;
26
- }
27
-
28
- .plot-button {
29
- padding: 0.5rem 1rem;
30
- border: 1px solid #ccc;
31
- background: white;
32
- border-radius: 4px;
33
- cursor: pointer;
34
- font-size: 0.875rem;
35
- transition: all 0.2s;
36
- color: #333;
37
- }
38
-
39
- .plot-button:hover {
40
- background: #f5f5f5;
41
- border-color: #999;
42
- }
43
-
44
- .plot-button.active {
45
- background: #4a90e2;
46
- color: white;
47
- border-color: #4a90e2;
48
- }
49
-
50
- .plot-container {
51
- position: relative;
52
- min-height: 600px;
53
- display: flex;
54
- align-items: center;
55
- justify-content: center;
56
- }
57
-
58
- .plot-loading {
59
- position: absolute;
60
- top: 50%;
61
- left: 50%;
62
- transform: translate(-50%, -50%);
63
- color: #666;
64
- font-size: 1rem;
65
- }
66
-
67
- .plot-tooltip {
68
- position: absolute;
69
- padding: 0.5rem;
70
- background: rgba(0, 0, 0, 0.8);
71
- color: white;
72
- border-radius: 4px;
73
- pointer-events: none;
74
- font-size: 0.875rem;
75
- z-index: 1000;
76
- }
77
-
78
- .plot-container svg {
79
- display: block;
80
- margin: 0 auto;
81
- }
82
-
83
- @media (max-width: 768px) {
84
- .plot-buttons {
85
- flex-direction: column;
86
- }
87
-
88
- .plot-button {
89
- width: 100%;
90
- }
91
- }
92
-
 
frontend/src/components/PaperPlots.tsx DELETED
@@ -1,755 +0,0 @@
1
- /**
2
- * Interactive D3.js visualizations based on plots from the research paper.
3
- * "Anatomy of a Machine Learning Ecosystem: 2 Million Models on Hugging Face"
4
- */
5
- import React, { useRef, useEffect, useState, useMemo } from 'react';
6
- import * as d3 from 'd3';
7
- import { ModelPoint } from '../types';
8
- import './PaperPlots.css';
9
-
10
- const API_BASE = process.env.REACT_APP_API_URL || 'http://localhost:8000';
11
-
12
- interface PaperPlotsProps {
13
- data: ModelPoint[];
14
- width?: number;
15
- height?: number;
16
- }
17
-
18
- type PlotType = 'family-size' | 'similarity-comparison' | 'license-drift' | 'model-card-length' | 'growth-timeline';
19
-
20
- export default function PaperPlots({ data, width = 800, height = 600 }: PaperPlotsProps) {
21
- const [activePlot, setActivePlot] = useState<PlotType>('family-size');
22
- const familySizeRef = useRef<SVGSVGElement>(null);
23
- const similarityRef = useRef<SVGSVGElement>(null);
24
- const licenseDriftRef = useRef<SVGSVGElement>(null);
25
- const modelCardLengthRef = useRef<SVGSVGElement>(null);
26
- const growthTimelineRef = useRef<SVGSVGElement>(null);
27
- const [familyTreeData, setFamilyTreeData] = useState<any>(null);
28
- const [loading, setLoading] = useState(false);
29
-
30
- // Fetch family tree statistics
31
- useEffect(() => {
32
- const fetchFamilyStats = async () => {
33
- setLoading(true);
34
- try {
35
- const response = await fetch(`${API_BASE}/api/family/stats`);
36
- if (response.ok) {
37
- const stats = await response.json();
38
- setFamilyTreeData(stats);
39
- }
40
- } catch (err) {
41
- console.error('Error fetching family stats:', err);
42
- } finally {
43
- setLoading(false);
44
- }
45
- };
46
- fetchFamilyStats();
47
- }, []);
48
-
49
- // Plot 1: Family Size Distribution
50
- useEffect(() => {
51
- if (activePlot !== 'family-size' || !familySizeRef.current) return;
52
-
53
- const svg = d3.select(familySizeRef.current);
54
- svg.selectAll('*').remove();
55
-
56
- const margin = { top: 40, right: 40, bottom: 60, left: 60 };
57
- const innerWidth = width - margin.left - margin.right;
58
- const innerHeight = height - margin.top - margin.bottom;
59
-
60
- // Use API data if available, otherwise calculate from current data
61
- let binData: Array<{ x0: number; x1: number; count: number }>;
62
-
63
- if (familyTreeData && familyTreeData.family_size_distribution) {
64
- const sizeDist = familyTreeData.family_size_distribution;
65
- const sizes = Object.keys(sizeDist).map(Number);
66
- const counts = Object.values(sizeDist) as number[];
67
-
68
- // Create histogram bins from distribution
69
- const maxSize = d3.max(sizes) || 1;
70
- const bins = d3.bin().thresholds(20).domain([0, maxSize])(sizes);
71
-
72
- binData = bins.map(bin => {
73
- let count = 0;
74
- sizes.forEach((size, i) => {
75
- if (size >= (bin.x0 || 0) && size < (bin.x1 || maxSize)) {
76
- count += counts[i];
77
- }
78
- });
79
- return {
80
- x0: bin.x0 || 0,
81
- x1: bin.x1 || maxSize,
82
- count: count
83
- };
84
- }).filter(d => d.count > 0);
85
- } else {
86
- // Fallback: Calculate from current data
87
- const familySizes = new Map<string, number>();
88
- data.forEach(model => {
89
- const familyKey = model.parent_model || model.model_id;
90
- familySizes.set(familyKey, (familySizes.get(familyKey) || 0) + 1);
91
- });
92
-
93
- const sizes = Array.from(familySizes.values());
94
- const bins = d3.bin().thresholds(20)(sizes);
95
- binData = bins.map(bin => ({
96
- x0: bin.x0 || 0,
97
- x1: bin.x1 || 0,
98
- count: bin.length
99
- }));
100
- }
101
-
102
- const g = svg.append('g')
103
- .attr('transform', `translate(${margin.left},${margin.top})`);
104
-
105
- // Scales
106
- const xScale = d3.scaleLinear()
107
- .domain([0, d3.max(binData, d => d.x1) || 1])
108
- .range([0, innerWidth])
109
- .nice();
110
-
111
- const yScale = d3.scaleLinear()
112
- .domain([0, d3.max(binData, d => d.count) || 1])
113
- .range([innerHeight, 0])
114
- .nice();
115
-
116
- // Bars
117
- g.selectAll('rect')
118
- .data(binData)
119
- .enter()
120
- .append('rect')
121
- .attr('x', d => xScale(d.x0))
122
- .attr('width', d => Math.max(0, xScale(d.x1) - xScale(d.x0) - 1))
123
- .attr('y', d => yScale(d.count))
124
- .attr('height', d => innerHeight - yScale(d.count))
125
- .attr('fill', '#4a90e2')
126
- .attr('opacity', 0.7)
127
- .on('mouseover', function(event, d) {
128
- d3.select(this).attr('opacity', 1);
129
- const tooltip = d3.select('body').append('div')
130
- .attr('class', 'plot-tooltip')
131
- .style('opacity', 0);
132
- tooltip.transition().duration(200).style('opacity', 0.9);
133
- tooltip.html(`Family Size: ${d.x0.toFixed(0)}-${d.x1.toFixed(0)}<br/>Count: ${d.count}`)
134
- .style('left', (event.pageX + 10) + 'px')
135
- .style('top', (event.pageY - 28) + 'px');
136
- })
137
- .on('mouseout', function() {
138
- d3.select(this).attr('opacity', 0.7);
139
- d3.selectAll('.plot-tooltip').remove();
140
- });
141
-
142
- // Axes
143
- const xAxis = d3.axisBottom(xScale).tickFormat(d3.format('d'));
144
- const yAxis = d3.axisLeft(yScale);
145
-
146
- g.append('g')
147
- .attr('transform', `translate(0,${innerHeight})`)
148
- .call(xAxis)
149
- .append('text')
150
- .attr('x', innerWidth / 2)
151
- .attr('y', 45)
152
- .attr('fill', 'currentColor')
153
- .style('text-anchor', 'middle')
154
- .style('font-size', '14px')
155
- .text('Family Size (number of models)');
156
-
157
- g.append('g')
158
- .call(yAxis)
159
- .append('text')
160
- .attr('transform', 'rotate(-90)')
161
- .attr('y', -45)
162
- .attr('x', -innerHeight / 2)
163
- .attr('fill', 'currentColor')
164
- .style('text-anchor', 'middle')
165
- .style('font-size', '14px')
166
- .text('Number of Families');
167
-
168
- // Title
169
- svg.append('text')
170
- .attr('x', width / 2)
171
- .attr('y', 20)
172
- .attr('text-anchor', 'middle')
173
- .style('font-size', '16px')
174
- .style('font-weight', 'bold')
175
- .text('Family Size Distribution');
176
-
177
- }, [activePlot, data, width, height, familyTreeData]);
178
-
179
- // Plot 2: Similarity Comparison (Sibling vs Parent-Child)
180
- useEffect(() => {
181
- if (activePlot !== 'similarity-comparison' || !similarityRef.current || !data.length) return;
182
-
183
- const svg = d3.select(similarityRef.current);
184
- svg.selectAll('*').remove();
185
-
186
- const margin = { top: 40, right: 40, bottom: 60, left: 60 };
187
- const innerWidth = width - margin.left - margin.right;
188
- const innerHeight = height - margin.top - margin.bottom;
189
-
190
- // This would require similarity data - for now, create a placeholder visualization
191
- // In the paper, this shows that siblings are more similar than parent-child pairs
192
- const g = svg.append('g')
193
- .attr('transform', `translate(${margin.left},${margin.top})`);
194
-
195
- // Placeholder: Box plot or violin plot showing similarity distributions
196
- // Sibling similarity (higher)
197
- const siblingData = Array.from({ length: 100 }, () => 0.6 + Math.random() * 0.3);
198
- // Parent-child similarity (lower)
199
- const parentChildData = Array.from({ length: 100 }, () => 0.3 + Math.random() * 0.3);
200
-
201
- const xScale = d3.scaleBand()
202
- .domain(['Sibling Pairs', 'Parent-Child Pairs'])
203
- .range([0, innerWidth])
204
- .padding(0.3);
205
-
206
- const yScale = d3.scaleLinear()
207
- .domain([0, 1])
208
- .range([innerHeight, 0])
209
- .nice();
210
-
211
- // Box plot visualization
212
- [siblingData, parentChildData].forEach((dataset, i) => {
213
- const label = i === 0 ? 'Sibling Pairs' : 'Parent-Child Pairs';
214
- const x = xScale(label);
215
- const bandWidth = xScale.bandwidth();
216
-
217
- if (x === undefined) return;
218
-
219
- // Calculate quartiles
220
- const sorted = dataset.sort((a, b) => a - b);
221
- const q1 = d3.quantile(sorted, 0.25) || 0;
222
- const q2 = d3.quantile(sorted, 0.5) || 0;
223
- const q3 = d3.quantile(sorted, 0.75) || 0;
224
- const min = sorted[0];
225
- const max = sorted[sorted.length - 1];
226
-
227
- // Box
228
- g.append('rect')
229
- .attr('x', x)
230
- .attr('y', yScale(q3))
231
- .attr('width', bandWidth)
232
- .attr('height', yScale(q1) - yScale(q3))
233
- .attr('fill', i === 0 ? '#4a90e2' : '#e24a4a')
234
- .attr('opacity', 0.6)
235
- .attr('stroke', '#333')
236
- .attr('stroke-width', 1);
237
-
238
- // Median line
239
- g.append('line')
240
- .attr('x1', x)
241
- .attr('x2', x + bandWidth)
242
- .attr('y1', yScale(q2))
243
- .attr('y2', yScale(q2))
244
- .attr('stroke', '#333')
245
- .attr('stroke-width', 2);
246
-
247
- // Whiskers
248
- g.append('line')
249
- .attr('x1', x + bandWidth / 2)
250
- .attr('x2', x + bandWidth / 2)
251
- .attr('y1', yScale(min))
252
- .attr('y2', yScale(q1))
253
- .attr('stroke', '#333')
254
- .attr('stroke-width', 1);
255
-
256
- g.append('line')
257
- .attr('x1', x + bandWidth / 2)
258
- .attr('x2', x + bandWidth / 2)
259
- .attr('y1', yScale(q3))
260
- .attr('y2', yScale(max))
261
- .attr('stroke', '#333')
262
- .attr('stroke-width', 1);
263
-
-      // Min/Max lines
-      g.append('line')
-        .attr('x1', x + bandWidth * 0.25)
-        .attr('x2', x + bandWidth * 0.75)
-        .attr('y1', yScale(min))
-        .attr('y2', yScale(min))
-        .attr('stroke', '#333')
-        .attr('stroke-width', 1);
-
-      g.append('line')
-        .attr('x1', x + bandWidth * 0.25)
-        .attr('x2', x + bandWidth * 0.75)
-        .attr('y1', yScale(max))
-        .attr('y2', yScale(max))
-        .attr('stroke', '#333')
-        .attr('stroke-width', 1);
-    });
-
-    // Axes
-    const yAxis = d3.axisLeft(yScale);
-    g.append('g').call(yAxis);
-
-    g.append('text')
-      .attr('transform', 'rotate(-90)')
-      .attr('y', -45)
-      .attr('x', -innerHeight / 2)
-      .attr('fill', 'currentColor')
-      .style('text-anchor', 'middle')
-      .style('font-size', '14px')
-      .text('Similarity Score');
-
-    // Title
-    svg.append('text')
-      .attr('x', width / 2)
-      .attr('y', 20)
-      .attr('text-anchor', 'middle')
-      .style('font-size', '16px')
-      .style('font-weight', 'bold')
-      .text('Similarity: Siblings vs Parent-Child Pairs');
-
-  }, [activePlot, data, width, height, familyTreeData]);
-
-  // Plot 3: License Drift (over family depth)
-  useEffect(() => {
-    if (activePlot !== 'license-drift' || !licenseDriftRef.current || !data.length) return;
-
-    const svg = d3.select(licenseDriftRef.current);
-    svg.selectAll('*').remove();
-
-    const margin = { top: 40, right: 40, bottom: 60, left: 80 };
-    const innerWidth = width - margin.left - margin.right;
-    const innerHeight = height - margin.top - margin.bottom;
-
-    // Group by family depth and license type
-    const depthGroups = new Map<number, Map<string, number>>();
-    data.forEach(model => {
-      const depth = model.family_depth || 0;
-      const license = model.licenses ? (model.licenses.split(',')[0].trim() || 'unknown') : 'unknown';
-
-      if (!depthGroups.has(depth)) {
-        depthGroups.set(depth, new Map());
-      }
-      const licenseMap = depthGroups.get(depth)!;
-      licenseMap.set(license, (licenseMap.get(license) || 0) + 1);
-    });
-
-    const depths = Array.from(depthGroups.keys()).sort((a, b) => a - b);
-    const allLicenses = new Set<string>();
-    depthGroups.forEach(licenseMap => {
-      licenseMap.forEach((_, license) => allLicenses.add(license));
-    });
-
-    const licenseTypes = Array.from(allLicenses).slice(0, 5); // Top 5 licenses
-    const colorScale = d3.scaleOrdinal(d3.schemeCategory10).domain(licenseTypes);
-
-    const g = svg.append('g')
-      .attr('transform', `translate(${margin.left},${margin.top})`);
-
-    const xScale = d3.scaleBand()
-      .domain(depths.map(d => d.toString()))
-      .range([0, innerWidth])
-      .padding(0.1);
-
-    const yScale = d3.scaleLinear()
-      .domain([0, 1])
-      .range([innerHeight, 0]);
-
-    // Stacked area or bars showing license distribution
-    licenseTypes.forEach((license, i) => {
-      const stack = depths.map(depth => {
-        const licenseMap = depthGroups.get(depth) || new Map();
-        const total = Array.from(licenseMap.values()).reduce((a, b) => a + b, 0);
-        const count = licenseMap.get(license) || 0;
-        return { depth, proportion: total > 0 ? count / total : 0 };
-      });
-
-      // Draw as line chart showing proportion over depth
-      const line = d3.line<{ depth: number; proportion: number }>()
-        .x(d => (xScale(d.depth.toString()) || 0) + xScale.bandwidth() / 2)
-        .y(d => yScale(d.proportion))
-        .curve(d3.curveMonotoneX);
-
-      g.append('path')
-        .datum(stack)
-        .attr('fill', 'none')
-        .attr('stroke', colorScale(license))
-        .attr('stroke-width', 2)
-        .attr('d', line);
-
-      // Add circles for data points
-      g.selectAll(`.dot-${i}`)
-        .data(stack)
-        .enter()
-        .append('circle')
-        .attr('cx', d => (xScale(d.depth.toString()) || 0) + xScale.bandwidth() / 2)
-        .attr('cy', d => yScale(d.proportion))
-        .attr('r', 4)
-        .attr('fill', colorScale(license));
-    });
-
-    // Axes
-    const xAxis = d3.axisBottom(xScale);
-    const yAxis = d3.axisLeft(yScale).tickFormat(d3.format('.0%'));
-
-    g.append('g')
-      .attr('transform', `translate(0,${innerHeight})`)
-      .call(xAxis)
-      .append('text')
-      .attr('x', innerWidth / 2)
-      .attr('y', 45)
-      .attr('fill', 'currentColor')
-      .style('text-anchor', 'middle')
-      .style('font-size', '14px')
-      .text('Family Depth (generation)');
-
-    g.append('g').call(yAxis)
-      .append('text')
-      .attr('transform', 'rotate(-90)')
-      .attr('y', -60)
-      .attr('x', -innerHeight / 2)
-      .attr('fill', 'currentColor')
-      .style('text-anchor', 'middle')
-      .style('font-size', '14px')
-      .text('Proportion of Models');
-
-    // Legend
-    const legend = g.append('g')
-      .attr('transform', `translate(${innerWidth - 150}, 20)`);
-
-    licenseTypes.forEach((license, i) => {
-      const legendRow = legend.append('g')
-        .attr('transform', `translate(0, ${i * 20})`);
-
-      legendRow.append('rect')
-        .attr('width', 15)
-        .attr('height', 15)
-        .attr('fill', colorScale(license));
-
-      legendRow.append('text')
-        .attr('x', 20)
-        .attr('y', 12)
-        .style('font-size', '12px')
-        .text(license.length > 15 ? license.substring(0, 15) + '...' : license);
-    });
-
-    // Title
-    svg.append('text')
-      .attr('x', width / 2)
-      .attr('y', 20)
-      .attr('text-anchor', 'middle')
-      .style('font-size', '16px')
-      .style('font-weight', 'bold')
-      .text('License Distribution Across Family Generations');
-
-  }, [activePlot, data, width, height, familyTreeData]);
-
-  // Plot 4: Model Card Length Distribution
-  useEffect(() => {
-    if (activePlot !== 'model-card-length' || !modelCardLengthRef.current || !data.length) return;
-
-    const svg = d3.select(modelCardLengthRef.current);
-    svg.selectAll('*').remove();
-
-    const margin = { top: 40, right: 40, bottom: 60, left: 60 };
-    const innerWidth = width - margin.left - margin.right;
-    const innerHeight = height - margin.top - margin.bottom;
-
-    // Placeholder: Would need model card length data
-    // In the paper, this shows model cards getting shorter and more standardized
-    const g = svg.append('g')
-      .attr('transform', `translate(${margin.left},${margin.top})`);
-
-    // Use real model card length data from API if available
-    let depthData = new Map<number, number[]>();
-
-    if (familyTreeData && familyTreeData.model_card_length_by_depth) {
-      // Use real data from API
-      const cardStats = familyTreeData.model_card_length_by_depth;
-      Object.keys(cardStats).forEach(depthStr => {
-        const depth = parseInt(depthStr);
-        const stats = cardStats[depthStr];
-        // Create synthetic distribution from stats (mean, q1, q3)
-        const lengths: number[] = [];
-        const count = Math.min(stats.count, 100); // Limit for performance
-        for (let i = 0; i < count; i++) {
-          // Generate values around the mean with spread based on quartiles
-          const spread = (stats.q3 - stats.q1) / 2;
-          const length = stats.mean + (Math.random() - 0.5) * spread * 2;
-          lengths.push(Math.max(0, length));
-        }
-        depthData.set(depth, lengths);
-      });
-    } else {
-      // Fallback: Calculate from current data
-      const depthGroups = new Map<number, number[]>();
-      data.forEach(model => {
-        const depth = model.family_depth || 0;
-        // We don't have model card length in ModelPoint, so use placeholder
-        // In a real implementation, this would come from the API
-        if (!depthGroups.has(depth)) {
-          depthGroups.set(depth, []);
-        }
-      });
-      depthData = depthGroups;
-    }
-
-    // If still no data, use simulated data
-    if (depthData.size === 0) {
-      for (let depth = 0; depth <= 5; depth++) {
-        const lengths = Array.from({ length: 50 }, () => {
-          const baseLength = 2000 - depth * 200;
-          return baseLength + (Math.random() - 0.5) * 500;
-        });
-        depthData.set(depth, lengths);
-      }
-    }
-
-    const depths = Array.from(depthData.keys()).sort((a, b) => a - b);
-    const maxDepth = d3.max(depths) || 5;
-    const allLengths = Array.from(depthData.values()).flat();
-    const maxLength = d3.max(allLengths) || 3000;
-
-    const xScale = d3.scaleBand()
-      .domain(depths.map(d => d.toString()))
-      .range([0, innerWidth])
-      .padding(0.2);
-
-    const yScale = d3.scaleLinear()
-      .domain([0, maxLength])
-      .range([innerHeight, 0])
-      .nice();
-
-    // Violin plot or box plot
-    depthData.forEach((lengths, depth) => {
-      const x = xScale(depth.toString());
-      const bandWidth = xScale.bandwidth();
-
-      if (x === undefined) return;
-
-      // Simple box plot
-      const sorted = lengths.sort((a, b) => a - b);
-      const q1 = d3.quantile(sorted, 0.25) || 0;
-      const q2 = d3.quantile(sorted, 0.5) || 0;
-      const q3 = d3.quantile(sorted, 0.75) || 0;
-
-      g.append('rect')
-        .attr('x', x)
-        .attr('y', yScale(q3))
-        .attr('width', bandWidth)
-        .attr('height', yScale(q1) - yScale(q3))
-        .attr('fill', '#4a90e2')
-        .attr('opacity', 0.6)
-        .attr('stroke', '#333');
-
-      g.append('line')
-        .attr('x1', x)
-        .attr('x2', x + bandWidth)
-        .attr('y1', yScale(q2))
-        .attr('y2', yScale(q2))
-        .attr('stroke', '#333')
-        .attr('stroke-width', 2);
-    });
-
-    const yAxis = d3.axisLeft(yScale);
-    g.append('g').call(yAxis)
-      .append('text')
-      .attr('transform', 'rotate(-90)')
-      .attr('y', -45)
-      .attr('x', -innerHeight / 2)
-      .attr('fill', 'currentColor')
-      .style('text-anchor', 'middle')
-      .style('font-size', '14px')
-      .text('Model Card Length (characters)');
-
-    // Title
-    svg.append('text')
-      .attr('x', width / 2)
-      .attr('y', 20)
-      .attr('text-anchor', 'middle')
-      .style('font-size', '16px')
-      .style('font-weight', 'bold')
-      .text('Model Card Length by Family Generation');
-
-  }, [activePlot, data, width, height, familyTreeData]);
-
-  // Plot 5: Growth Timeline
-  useEffect(() => {
-    if (activePlot !== 'growth-timeline' || !growthTimelineRef.current) return;
-
-    const svg = d3.select(growthTimelineRef.current);
-    svg.selectAll('*').remove();
-
-    const margin = { top: 40, right: 40, bottom: 60, left: 60 };
-    const innerWidth = width - margin.left - margin.right;
-    const innerHeight = height - margin.top - margin.bottom;
-
-    // Fetch growth data from model tracker API
-    fetch(`${API_BASE}/api/model-count/historical?days=365`)
-      .then(res => res.json())
-      .then(data => {
-        if (!data.counts || data.counts.length === 0) {
-          svg.append('text')
-            .attr('x', width / 2)
-            .attr('y', height / 2)
-            .attr('text-anchor', 'middle')
-            .text('No historical data available');
-          return;
-        }
-
-        const g = svg.append('g')
-          .attr('transform', `translate(${margin.left},${margin.top})`);
-
-        const counts = data.counts.map((d: any) => ({
-          date: new Date(d.timestamp),
-          count: d.total_models
-        })).sort((a: any, b: any) => a.date - b.date);
-
-        const extent = d3.extent(counts, (d: any) => d.date) as [Date | undefined, Date | undefined];
-        const minDate = extent[0];
-        const maxDate = extent[1];
-        if (!minDate || !maxDate) return;
-
-        const xScale = d3.scaleTime()
-          .domain([minDate, maxDate])
-          .range([0, innerWidth]);
-
-        const yScale = d3.scaleLinear()
-          .domain([0, d3.max(counts, (d: any) => d.count) || 0] as [number, number])
-          .range([innerHeight, 0])
-          .nice();
-
-        const line = d3.line<any>()
-          .x(d => xScale(d.date))
-          .y(d => yScale(d.count))
-          .curve(d3.curveMonotoneX);
-
-        g.append('path')
-          .datum(counts)
-          .attr('fill', 'none')
-          .attr('stroke', '#4a90e2')
-          .attr('stroke-width', 2)
-          .attr('d', line);
-
-        g.selectAll('circle')
-          .data(counts)
-          .enter()
-          .append('circle')
-          .attr('cx', (d: any) => xScale(d.date))
-          .attr('cy', (d: any) => yScale(d.count))
-          .attr('r', 3)
-          .attr('fill', '#4a90e2')
-          .on('mouseover', function(event, d: any) {
-            d3.select(this).attr('r', 5);
-            const tooltip = d3.select('body').append('div')
-              .attr('class', 'plot-tooltip')
-              .style('opacity', 0);
-            tooltip.transition().duration(200).style('opacity', 0.9);
-            tooltip.html(`${d.date.toLocaleDateString()}<br/>Models: ${d.count.toLocaleString()}`)
-              .style('left', (event.pageX + 10) + 'px')
-              .style('top', (event.pageY - 28) + 'px');
-          })
-          .on('mouseout', function() {
-            d3.select(this).attr('r', 3);
-            d3.selectAll('.plot-tooltip').remove();
-          });
-
-        const xAxis = d3.axisBottom(xScale).ticks(6);
-        const yAxis = d3.axisLeft(yScale).tickFormat(d3.format('.2s'));
-
-        g.append('g')
-          .attr('transform', `translate(0,${innerHeight})`)
-          .call(xAxis)
-          .append('text')
-          .attr('x', innerWidth / 2)
-          .attr('y', 45)
-          .attr('fill', 'currentColor')
-          .style('text-anchor', 'middle')
-          .style('font-size', '14px')
-          .text('Date');
-
-        g.append('g').call(yAxis)
-          .append('text')
-          .attr('transform', 'rotate(-90)')
-          .attr('y', -45)
-          .attr('x', -innerHeight / 2)
-          .attr('fill', 'currentColor')
-          .style('text-anchor', 'middle')
-          .style('font-size', '14px')
-          .text('Total Models');
-
-        svg.append('text')
-          .attr('x', width / 2)
-          .attr('y', 20)
-          .attr('text-anchor', 'middle')
-          .style('font-size', '16px')
-          .style('font-weight', 'bold')
-          .text('Model Count Growth Over Time');
-      })
-      .catch(err => {
-        console.error('Error fetching growth data:', err);
-        svg.append('text')
-          .attr('x', width / 2)
-          .attr('y', height / 2)
-          .attr('text-anchor', 'middle')
-          .text('Error loading growth data');
-      });
-
-  }, [activePlot, width, height]);
-
-  const plotOptions: { value: PlotType; label: string; description: string }[] = [
-    { value: 'family-size', label: 'Family Size Distribution', description: 'Distribution of family tree sizes' },
-    { value: 'similarity-comparison', label: 'Similarity Comparison', description: 'Sibling vs parent-child similarity' },
-    { value: 'license-drift', label: 'License Drift', description: 'License changes across generations' },
-    { value: 'model-card-length', label: 'Model Card Length', description: 'Model card length by generation' },
-    { value: 'growth-timeline', label: 'Growth Timeline', description: 'Model count over time' },
-  ];
-
-  return (
-    <div className="paper-plots">
-      <div className="plot-selector">
-        <h3>Paper Visualizations</h3>
-        <div className="plot-buttons">
-          {plotOptions.map(option => (
-            <button
-              key={option.value}
-              className={`plot-button ${activePlot === option.value ? 'active' : ''}`}
-              onClick={() => setActivePlot(option.value)}
-              title={option.description}
-            >
-              {option.label}
-            </button>
-          ))}
-        </div>
-      </div>
-
-      <div className="plot-container">
-        {loading && <div className="plot-loading">Loading data...</div>}
-        <svg
-          ref={familySizeRef}
-          width={width}
-          height={height}
-          style={{ display: activePlot === 'family-size' ? 'block' : 'none' }}
-        />
-        <svg
-          ref={similarityRef}
-          width={width}
-          height={height}
-          style={{ display: activePlot === 'similarity-comparison' ? 'block' : 'none' }}
-        />
-        <svg
-          ref={licenseDriftRef}
-          width={width}
-          height={height}
-          style={{ display: activePlot === 'license-drift' ? 'block' : 'none' }}
-        />
-        <svg
-          ref={modelCardLengthRef}
-          width={width}
-          height={height}
-          style={{ display: activePlot === 'model-card-length' ? 'block' : 'none' }}
-        />
-        <svg
-          ref={growthTimelineRef}
-          width={width}
-          height={height}
-          style={{ display: activePlot === 'growth-timeline' ? 'block' : 'none' }}
-        />
-      </div>
-    </div>
-  );
-}
-
frontend/src/components/ScatterPlot.tsx DELETED
@@ -1,7 +0,0 @@
-/**
- * Legacy Visx scatter plot - kept for reference.
- * Use EnhancedScatterPlot.tsx for D3.js implementation.
- */
-// This file is kept for compatibility but EnhancedScatterPlot is preferred
-
-export { default } from './EnhancedScatterPlot';
frontend/src/components/controls/ClusterFilter.css ADDED
@@ -0,0 +1,122 @@
+.cluster-filter {
+  margin-bottom: 1.5rem;
+}
+
+.cluster-filter-header {
+  margin-bottom: 0.75rem;
+}
+
+.cluster-filter-header h3 {
+  margin: 0;
+  font-size: 1rem;
+  font-weight: 600;
+  color: var(--text-primary, #1a1a1a);
+}
+
+.cluster-filter-search {
+  margin-bottom: 0.75rem;
+}
+
+.cluster-search-input {
+  width: 100%;
+  padding: 0.5rem;
+  border: 1px solid var(--border-color, #e0e0e0);
+  border-radius: 4px;
+  font-size: 0.9rem;
+  background: var(--bg-primary, #ffffff);
+  color: var(--text-primary, #1a1a1a);
+}
+
+.cluster-search-input:focus {
+  outline: none;
+  border-color: var(--accent-color, #4a90e2);
+}
+
+.cluster-filter-actions {
+  display: flex;
+  gap: 0.5rem;
+  margin-bottom: 0.75rem;
+}
+
+.cluster-action-btn {
+  flex: 1;
+  padding: 0.4rem 0.6rem;
+  border: 1px solid var(--border-color, #e0e0e0);
+  border-radius: 4px;
+  background: var(--bg-primary, #ffffff);
+  color: var(--text-primary, #1a1a1a);
+  font-size: 0.85rem;
+  cursor: pointer;
+  transition: all 0.2s;
+}
+
+.cluster-action-btn:hover:not(:disabled) {
+  background: var(--bg-secondary, #f5f5f5);
+  border-color: var(--accent-color, #4a90e2);
+}
+
+.cluster-action-btn:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+.cluster-list {
+  max-height: 300px;
+  overflow-y: auto;
+  border: 1px solid var(--border-color, #e0e0e0);
+  border-radius: 4px;
+  padding: 0.5rem;
+  background: var(--bg-primary, #ffffff);
+}
+
+.cluster-item {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.5rem;
+  cursor: pointer;
+  border-radius: 3px;
+  transition: background 0.15s;
+  font-size: 0.85rem;
+}
+
+.cluster-item:hover {
+  background: var(--bg-secondary, #f5f5f5);
+}
+
+.cluster-item.selected {
+  background: var(--bg-secondary, #f5f5f5);
+}
+
+.cluster-checkbox {
+  margin: 0;
+  cursor: pointer;
+}
+
+.cluster-color-indicator {
+  width: 12px;
+  height: 12px;
+  border-radius: 2px;
+  flex-shrink: 0;
+  border: 1px solid var(--border-color, #e0e0e0);
+}
+
+.cluster-label {
+  flex: 1;
+  color: var(--text-primary, #1a1a1a);
+}
+
+.cluster-count {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #666);
+  margin-left: auto;
+}
+
+.cluster-filter-loading,
+.cluster-filter-empty {
+  padding: 1rem;
+  text-align: center;
+  color: var(--text-secondary, #666);
+  font-size: 0.9rem;
+}
+
frontend/src/components/controls/ClusterFilter.tsx ADDED
@@ -0,0 +1,142 @@
+/**
+ * Enhanced cluster filter component with search, Select All/Clear All/Random buttons.
+ * Inspired by LAION's cluster filtering UI.
+ */
+import React, { useState, useMemo } from 'react';
+import { useFilterStore } from '../../stores/filterStore';
+import './ClusterFilter.css';
+
+export interface Cluster {
+  cluster_id: number;
+  cluster_label: string;
+  count: number;
+  color?: string;
+}
+
+interface ClusterFilterProps {
+  clusters: Cluster[];
+  loading?: boolean;
+}
+
+export default function ClusterFilter({ clusters, loading = false }: ClusterFilterProps) {
+  const { selectedClusters, setSelectedClusters } = useFilterStore();
+  const [searchTerm, setSearchTerm] = useState('');
+
+  const filteredClusters = useMemo(() => {
+    if (!searchTerm) return clusters;
+    const lowerSearch = searchTerm.toLowerCase();
+    return clusters.filter(c =>
+      c.cluster_label.toLowerCase().includes(lowerSearch) ||
+      c.cluster_id.toString().includes(lowerSearch)
+    );
+  }, [clusters, searchTerm]);
+
+  const handleSelectAll = () => {
+    setSelectedClusters(clusters.map(c => c.cluster_id));
+  };
+
+  const handleClearAll = () => {
+    setSelectedClusters([]);
+  };
+
+  const handleRandom = () => {
+    if (clusters.length === 0) return;
+    const randomCluster = clusters[Math.floor(Math.random() * clusters.length)];
+    setSelectedClusters([randomCluster.cluster_id]);
+  };
+
+  const handleToggleCluster = (clusterId: number) => {
+    if (selectedClusters.includes(clusterId)) {
+      setSelectedClusters(selectedClusters.filter(id => id !== clusterId));
+    } else {
+      setSelectedClusters([...selectedClusters, clusterId]);
+    }
+  };
+
+  if (loading) {
+    return (
+      <div className="cluster-filter">
+        <div className="cluster-filter-loading">Loading clusters...</div>
+      </div>
+    );
+  }
+
+  if (clusters.length === 0) {
+    return (
+      <div className="cluster-filter">
+        <div className="cluster-filter-empty">No clusters available</div>
+      </div>
+    );
+  }
+
+  return (
+    <div className="cluster-filter">
+      <div className="cluster-filter-header">
+        <h3>Dataset Clusters</h3>
+      </div>
+
+      <div className="cluster-filter-search">
+        <input
+          type="text"
+          placeholder={`Search ${clusters.length} clusters...`}
+          value={searchTerm}
+          onChange={(e) => setSearchTerm(e.target.value)}
+          className="cluster-search-input"
+        />
+      </div>
+
+      <div className="cluster-filter-actions">
+        <button
+          onClick={handleSelectAll}
+          className="cluster-action-btn"
+          disabled={clusters.length === 0}
+        >
+          Select All
+        </button>
+        <button
+          onClick={handleClearAll}
+          className="cluster-action-btn"
+          disabled={selectedClusters.length === 0}
+        >
+          Clear All
+        </button>
+        <button
+          onClick={handleRandom}
+          className="cluster-action-btn"
+          disabled={clusters.length === 0}
+        >
+          Random
+        </button>
+      </div>
+
+      <div className="cluster-list">
+        {filteredClusters.length === 0 ? (
+          <div className="cluster-filter-empty">No clusters match your search</div>
+        ) : (
+          filteredClusters.map(cluster => (
+            <label
+              key={cluster.cluster_id}
+              className={`cluster-item ${selectedClusters.includes(cluster.cluster_id) ? 'selected' : ''}`}
+            >
+              <input
+                type="checkbox"
+                checked={selectedClusters.includes(cluster.cluster_id)}
+                onChange={() => handleToggleCluster(cluster.cluster_id)}
+                className="cluster-checkbox"
+              />
+              {cluster.color && (
+                <span
+                  className="cluster-color-indicator"
+                  style={{ backgroundColor: cluster.color }}
+                />
+              )}
+              <span className="cluster-label">{cluster.cluster_label}</span>
+              <span className="cluster-count">({cluster.count.toLocaleString()})</span>
+            </label>
+          ))
+        )}
+      </div>
+    </div>
+  );
+}
+
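
For context, a minimal sketch of how this component might be wired up, assuming the backend exposes a cluster-listing endpoint; the `/api/clusters` path and response shape here are assumptions, not confirmed by this diff:

// Hypothetical usage sketch for ClusterFilter (endpoint and payload shape are assumed).
import React, { useEffect, useState } from 'react';
import ClusterFilter, { Cluster } from './components/controls/ClusterFilter';
import { API_BASE } from './config/api';

function SidebarClusters() {
  const [clusters, setClusters] = useState<Cluster[]>([]);
  const [loading, setLoading] = useState(true);

  useEffect(() => {
    // Assumed endpoint; adjust to the actual clusters route.
    fetch(`${API_BASE}/api/clusters`)
      .then(res => res.json())
      .then(data => setClusters(data.clusters ?? []))
      .catch(() => setClusters([]))
      .finally(() => setLoading(false));
  }, []);

  return <ClusterFilter clusters={clusters} loading={loading} />;
}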
frontend/src/components/controls/NodeDensitySlider.css ADDED
@@ -0,0 +1,31 @@
+.node-density-slider {
+  margin-bottom: 1rem;
+}
+
+.node-density-label {
+  display: block;
+}
+
+.node-density-title {
+  font-weight: 500;
+  display: block;
+  margin-bottom: 0.5rem;
+  color: var(--text-primary, #1a1a1a);
+}
+
+.node-density-input {
+  width: 100%;
+  cursor: pointer;
+}
+
+.node-density-input:disabled {
+  opacity: 0.5;
+  cursor: not-allowed;
+}
+
+.node-density-hint {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #666);
+  margin-top: 0.25rem;
+}
+
frontend/src/components/controls/NodeDensitySlider.tsx ADDED
@@ -0,0 +1,39 @@
+/**
+ * Node density slider for controlling rendering performance.
+ * Lower density improves performance for large datasets.
+ */
+import React from 'react';
+import { useFilterStore } from '../../stores/filterStore';
+import './NodeDensitySlider.css';
+
+interface NodeDensitySliderProps {
+  disabled?: boolean;
+}
+
+export default function NodeDensitySlider({ disabled = false }: NodeDensitySliderProps) {
+  const { nodeDensity, setNodeDensity } = useFilterStore();
+
+  return (
+    <div className="node-density-slider">
+      <label className="node-density-label">
+        <span className="node-density-title">
+          Node Density ({nodeDensity}%)
+        </span>
+        <input
+          type="range"
+          min="10"
+          max="100"
+          step="10"
+          value={nodeDensity}
+          onChange={(e) => setNodeDensity(parseInt(e.target.value))}
+          disabled={disabled}
+          className="node-density-input"
+        />
+        <div className="node-density-hint">
+          Lower density improves performance for large datasets
+        </div>
+      </label>
+    </div>
+  );
+}
+
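
The slider only stores a percentage; one plausible way a renderer could apply it is deterministic downsampling before drawing. A minimal sketch, assuming a stride-based sampling strategy (the component itself does not prescribe one):

// Hypothetical consumer of nodeDensity: keep every k-th point so that roughly
// (density / 100) of the data survives. Deterministic, so the visible subset
// is stable across re-renders.
function downsample<T>(points: T[], densityPercent: number): T[] {
  if (densityPercent >= 100) return points;
  const keepRatio = Math.max(densityPercent, 1) / 100;
  const stride = Math.max(1, Math.round(1 / keepRatio));
  return points.filter((_, i) => i % stride === 0);
}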
frontend/src/components/controls/RandomModelButton.tsx ADDED
@@ -0,0 +1,32 @@
+/**
+ * Button to select a random model from the dataset for discovery.
+ */
+import React from 'react';
+import { ModelPoint } from '../../types';
+
+interface RandomModelButtonProps {
+  data: ModelPoint[];
+  onSelect: (model: ModelPoint) => void;
+  disabled?: boolean;
+}
+
+export default function RandomModelButton({ data, onSelect, disabled }: RandomModelButtonProps) {
+  const handleRandomSelect = () => {
+    if (data.length === 0) return;
+    const randomIndex = Math.floor(Math.random() * data.length);
+    onSelect(data[randomIndex]);
+  };
+
+  return (
+    <button
+      onClick={handleRandomSelect}
+      disabled={disabled || data.length === 0}
+      className="random-model-btn"
+      title="Select a random model"
+      aria-label="Select random model"
+    >
+      <span>Select Random Model</span>
+    </button>
+  );
+}
+
frontend/src/components/controls/RenderingStyleSelector.css ADDED
@@ -0,0 +1,37 @@
+.rendering-style-selector {
+  margin-bottom: 1rem;
+}
+
+.rendering-style-label {
+  display: block;
+}
+
+.rendering-style-title {
+  font-weight: 500;
+  display: block;
+  margin-bottom: 0.5rem;
+  color: var(--text-primary, #1a1a1a);
+}
+
+.rendering-style-select {
+  width: 100%;
+  padding: 0.5rem;
+  border: 1px solid var(--border-color, #e0e0e0);
+  border-radius: 4px;
+  background: var(--bg-primary, #ffffff);
+  color: var(--text-primary, #1a1a1a);
+  font-size: 0.9rem;
+  cursor: pointer;
+}
+
+.rendering-style-select:focus {
+  outline: none;
+  border-color: var(--accent-color, #4a90e2);
+}
+
+.rendering-style-hint {
+  font-size: 0.75rem;
+  color: var(--text-secondary, #666);
+  margin-top: 0.25rem;
+}
+
frontend/src/components/controls/RenderingStyleSelector.tsx ADDED
@@ -0,0 +1,43 @@
+/**
+ * Rendering style selector dropdown.
+ * Allows users to choose different 3D layout/geometry styles.
+ */
+import React from 'react';
+import { useFilterStore, RenderingStyle } from '../../stores/filterStore';
+import './RenderingStyleSelector.css';
+
+const STYLES: { value: RenderingStyle; label: string; description: string }[] = [
+  { value: 'embeddings', label: 'Embeddings', description: 'Standard embedding-based layout' },
+  { value: 'sphere', label: 'Sphere', description: 'Spherical arrangement of points' },
+  { value: 'galaxy', label: 'Galaxy', description: 'Spiral galaxy-like layout' },
+  { value: 'wave', label: 'Wave', description: 'Wave pattern arrangement' },
+  { value: 'helix', label: 'Helix', description: 'Helical/spiral arrangement' },
+  { value: 'torus', label: 'Torus', description: 'Torus/donut-shaped layout' },
+];
+
+export default function RenderingStyleSelector() {
+  const { renderingStyle, setRenderingStyle } = useFilterStore();
+
+  return (
+    <div className="rendering-style-selector">
+      <label className="rendering-style-label">
+        <span className="rendering-style-title">Rendering Style</span>
+        <select
+          value={renderingStyle}
+          onChange={(e) => setRenderingStyle(e.target.value as RenderingStyle)}
+          className="rendering-style-select"
+        >
+          {STYLES.map(style => (
+            <option key={style.value} value={style.value}>
+              {style.label}
+            </option>
+          ))}
+        </select>
+        <div className="rendering-style-hint">
+          {STYLES.find(s => s.value === renderingStyle)?.description}
+        </div>
+      </label>
+    </div>
+  );
+}
+
frontend/src/components/controls/ThemeToggle.tsx ADDED
@@ -0,0 +1,22 @@
+/**
+ * Toggle button for switching between light and dark themes.
+ */
+import React from 'react';
+import { useFilterStore } from '../../stores/filterStore';
+
+export default function ThemeToggle() {
+  const theme = useFilterStore((state) => state.theme);
+  const toggleTheme = useFilterStore((state) => state.toggleTheme);
+
+  return (
+    <button
+      onClick={toggleTheme}
+      className="theme-toggle"
+      title={`Switch to ${theme === 'light' ? 'dark' : 'light'} mode`}
+      aria-label={`Current theme: ${theme}. Click to switch to ${theme === 'light' ? 'dark' : 'light'} mode`}
+    >
+      {theme === 'light' ? 'πŸŒ™' : 'β˜€οΈ'}
+    </button>
+  );
+}
+
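
Several of these controls read from a shared Zustand store (`useFilterStore`), which is not itself part of this diff. A minimal sketch of the slice they assume, with field names inferred from usage and defaults that are guesses:

// Hypothetical filterStore slice inferred from the components above.
import { create } from 'zustand';

type Theme = 'light' | 'dark';

interface FilterState {
  theme: Theme;
  toggleTheme: () => void;
  nodeDensity: number;
  setNodeDensity: (value: number) => void;
  selectedClusters: number[];
  setSelectedClusters: (ids: number[]) => void;
}

export const useFilterStore = create<FilterState>((set) => ({
  theme: 'light',
  toggleTheme: () => set((s) => ({ theme: s.theme === 'light' ? 'dark' : 'light' })),
  nodeDensity: 100,
  setNodeDensity: (value) => set({ nodeDensity: value }),
  selectedClusters: [],
  setSelectedClusters: (ids) => set({ selectedClusters: ids }),
}));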
frontend/src/components/controls/VisualizationModeButtons.css ADDED
@@ -0,0 +1,65 @@
+.visualization-mode-buttons {
+  position: sticky;
+  top: 0;
+  z-index: 100;
+  background: var(--bg-primary, #ffffff);
+  border-bottom: 1px solid var(--border-color, #e0e0e0);
+  padding: 0.75rem 1rem;
+  margin-bottom: 1rem;
+}
+
+.mode-buttons-container {
+  display: flex;
+  gap: 0.5rem;
+  flex-wrap: wrap;
+  justify-content: center;
+}
+
+.mode-button {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.5rem 1rem;
+  border: 1px solid var(--border-color, #e0e0e0);
+  border-radius: 6px;
+  background: var(--bg-primary, #ffffff);
+  color: var(--text-primary, #1a1a1a);
+  font-size: 0.9rem;
+  cursor: pointer;
+  transition: all 0.2s;
+}
+
+.mode-button:hover {
+  background: var(--bg-secondary, #f5f5f5);
+  border-color: var(--accent-color, #4a90e2);
+}
+
+.mode-button.active {
+  background: var(--accent-color, #4a90e2);
+  color: white;
+  border-color: var(--accent-color, #4a90e2);
+}
+
+.mode-icon {
+  font-size: 1.1rem;
+}
+
+.mode-label {
+  font-weight: 500;
+}
+
+@media (max-width: 768px) {
+  .mode-buttons-container {
+    gap: 0.25rem;
+  }
+
+  .mode-button {
+    padding: 0.4rem 0.6rem;
+    font-size: 0.8rem;
+  }
+
+  .mode-label {
+    display: none;
+  }
+}
+
frontend/src/components/controls/VisualizationModeButtons.tsx ADDED
@@ -0,0 +1,46 @@
+/**
+ * Visualization mode buttons with sticky header.
+ * Inspired by LAION's mode selection UI.
+ */
+import React from 'react';
+import { useFilterStore, ViewMode } from '../../stores/filterStore';
+import './VisualizationModeButtons.css';
+
+interface ModeOption {
+  value: ViewMode;
+  label: string;
+  icon: string;
+  description: string;
+}
+
+const MODES: ModeOption[] = [
+  { value: '3d', label: '3D Embedding', icon: '🎯', description: 'Interactive 3D exploration' },
+  { value: 'scatter', label: '2D Scatter', icon: 'πŸ“Š', description: '2D projection view' },
+  { value: 'network', label: 'Network', icon: 'πŸ•ΈοΈ', description: 'Network graph view' },
+  { value: 'distribution', label: 'Distribution', icon: 'πŸ“ˆ', description: 'Statistical distributions' },
+  { value: 'stacked', label: 'Stacked', icon: 'πŸ“š', description: 'Hierarchical view' },
+  { value: 'heatmap', label: 'Heatmap', icon: 'πŸ”₯', description: 'Density heatmap' },
+];
+
+export default function VisualizationModeButtons() {
+  const { viewMode, setViewMode } = useFilterStore();
+
+  return (
+    <div className="visualization-mode-buttons">
+      <div className="mode-buttons-container">
+        {MODES.map(mode => (
+          <button
+            key={mode.value}
+            className={`mode-button ${viewMode === mode.value ? 'active' : ''}`}
+            onClick={() => setViewMode(mode.value)}
+            title={mode.description}
+          >
+            <span className="mode-icon">{mode.icon}</span>
+            <span className="mode-label">{mode.label}</span>
+          </button>
+        ))}
+      </div>
+    </div>
+  );
+}
+
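
A sketch of how the selected `viewMode` might drive which visualization renders; only the mode values come from this diff, and the rendered elements here are placeholders standing in for the real views:

// Hypothetical view switch keyed on the store's viewMode.
import React from 'react';
import { useFilterStore } from '../../stores/filterStore';

export default function VisualizationArea() {
  const viewMode = useFilterStore((state) => state.viewMode);
  if (viewMode === '3d') return <div>3D embedding view</div>;       // placeholder
  if (viewMode === 'network') return <div>Network graph view</div>; // placeholder
  return <div>2D scatter view</div>;                                // placeholder
}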
frontend/src/components/controls/ZoomSlider.tsx ADDED
@@ -0,0 +1,43 @@
+/**
+ * Slider control for zoom level in 3D visualization.
+ */
+import React from 'react';
+
+interface ZoomSliderProps {
+  value: number;
+  onChange: (value: number) => void;
+  min?: number;
+  max?: number;
+  step?: number;
+  disabled?: boolean;
+}
+
+export default function ZoomSlider({
+  value,
+  onChange,
+  min = 0.1,
+  max = 5,
+  step = 0.1,
+  disabled = false,
+}: ZoomSliderProps) {
+  return (
+    <div className="zoom-slider-container">
+      <label className="zoom-slider-label">
+        <span>Zoom Level</span>
+        <span className="zoom-value">{value.toFixed(1)}x</span>
+      </label>
+      <input
+        type="range"
+        min={min}
+        max={max}
+        step={step}
+        value={value}
+        onChange={(e) => onChange(parseFloat(e.target.value))}
+        disabled={disabled}
+        className="zoom-slider"
+        aria-label="Zoom level"
+      />
+    </div>
+  );
+}
+
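
Since ZoomSlider is fully controlled, the parent owns the zoom state. A minimal usage sketch; how the value feeds the camera is left to the caller and is an assumption here:

import React, { useState } from 'react';
import ZoomSlider from './components/controls/ZoomSlider';

function ViewportControls() {
  const [zoom, setZoom] = useState(1.0);
  // The parent would typically forward `zoom` to the 3D camera or a scale transform.
  return <ZoomSlider value={zoom} onChange={setZoom} min={0.5} max={3} step={0.1} />;
}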
frontend/src/components/layout/SearchBar.css ADDED
@@ -0,0 +1,181 @@
+.search-bar-container {
+  position: relative;
+  width: 100%;
+  max-width: 600px;
+  z-index: 1000;
+}
+
+.search-bar {
+  position: relative;
+  display: flex;
+  align-items: center;
+  background: white;
+  border: 2px solid #e0e0e0;
+  border-radius: 8px;
+  padding: 8px 12px;
+  transition: border-color 0.2s;
+}
+
+.search-bar:focus-within {
+  border-color: #4a90e2;
+  box-shadow: 0 0 0 3px rgba(74, 144, 226, 0.1);
+}
+
+.search-input {
+  flex: 1;
+  border: none;
+  outline: none;
+  font-size: 14px;
+  font-family: 'Instrument Sans', sans-serif;
+  color: #333;
+  background: transparent;
+}
+
+.search-input::placeholder {
+  color: #999;
+}
+
+.search-loading {
+  margin-left: 8px;
+  color: #4a90e2;
+  animation: spin 1s linear infinite;
+  font-size: 16px;
+}
+
+.search-clear {
+  margin-left: 8px;
+  background: none;
+  border: none;
+  color: #999;
+  cursor: pointer;
+  font-size: 20px;
+  line-height: 1;
+  padding: 0;
+  width: 20px;
+  height: 20px;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  transition: color 0.2s;
+}
+
+.search-clear:hover {
+  color: #333;
+}
+
+.search-results {
+  position: absolute;
+  top: 100%;
+  left: 0;
+  right: 0;
+  margin-top: 4px;
+  background: white;
+  border: 1px solid #e0e0e0;
+  border-radius: 8px;
+  box-shadow: 0 4px 12px rgba(0, 0, 0, 0.15);
+  max-height: 400px;
+  overflow-y: auto;
+  z-index: 1001;
+}
+
+.search-result {
+  padding: 12px;
+  cursor: pointer;
+  border-bottom: 1px solid #f0f0f0;
+  transition: background-color 0.15s;
+}
+
+.search-result:last-child {
+  border-bottom: none;
+}
+
+.search-result:hover,
+.search-result.selected {
+  background-color: #f5f5f5;
+}
+
+.result-header {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  margin-bottom: 6px;
+}
+
+.result-model-id {
+  font-size: 14px;
+  font-weight: 600;
+  color: #333;
+  font-family: 'Instrument Sans', sans-serif;
+}
+
+.result-org {
+  font-size: 12px;
+  color: #666;
+  background: #f0f0f0;
+  padding: 2px 6px;
+  border-radius: 4px;
+}
+
+.result-meta {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 6px;
+  margin-bottom: 4px;
+}
+
+.result-tag {
+  font-size: 11px;
+  color: #666;
+  background: #e8e8e8;
+  padding: 2px 6px;
+  border-radius: 3px;
+}
+
+.result-snippet {
+  font-size: 12px;
+  color: #666;
+  margin-top: 4px;
+  line-height: 1.4;
+}
+
+.result-snippet mark {
+  background: #fff3cd;
+  padding: 1px 2px;
+  border-radius: 2px;
+}
+
+.search-no-results {
+  padding: 16px;
+  text-align: center;
+  color: #999;
+  font-size: 14px;
+}
+
+@keyframes spin {
+  from {
+    transform: rotate(0deg);
+  }
+  to {
+    transform: rotate(360deg);
+  }
+}
+
+/* Scrollbar styling */
+.search-results::-webkit-scrollbar {
+  width: 8px;
+}
+
+.search-results::-webkit-scrollbar-track {
+  background: #f1f1f1;
+  border-radius: 4px;
+}
+
+.search-results::-webkit-scrollbar-thumb {
+  background: #c1c1c1;
+  border-radius: 4px;
+}
+
+.search-results::-webkit-scrollbar-thumb:hover {
+  background: #a8a8a8;
+}
+
frontend/src/components/layout/SearchBar.tsx ADDED
@@ -0,0 +1,201 @@
+/**
+ * Enhanced search bar with autocomplete and keyboard navigation.
+ * Integrates with filter store and triggers map zoom/modal open.
+ */
+import React, { useState, useEffect, useRef, useCallback } from 'react';
+import { useFilterStore } from '../../stores/filterStore';
+import './SearchBar.css';
+
+import { API_BASE } from '../../config/api';
+
+interface SearchResult {
+  model_id: string;
+  x: number;
+  y: number;
+  z: number;
+  org: string;
+  library?: string;
+  pipeline?: string;
+  license?: string;
+  snippet?: string;
+  match_score?: number;
+}
+
+interface SearchBarProps {
+  onSelect?: (result: SearchResult) => void;
+  onZoomTo?: (x: number, y: number, z: number) => void;
+}
+
+export default function SearchBar({ onSelect, onZoomTo }: SearchBarProps) {
+  const [query, setQuery] = useState('');
+  const [results, setResults] = useState<SearchResult[]>([]);
+  const [selectedIndex, setSelectedIndex] = useState(-1);
+  const [isOpen, setIsOpen] = useState(false);
+  const [isLoading, setIsLoading] = useState(false);
+  const inputRef = useRef<HTMLInputElement>(null);
+  const resultsRef = useRef<HTMLDivElement>(null);
+
+  const setSearchQuery = useFilterStore((state) => state.setSearchQuery);
+
+  // Debounced search
+  useEffect(() => {
+    if (query.length < 2) {
+      setResults([]);
+      setIsOpen(false);
+      return;
+    }
+
+    setIsLoading(true);
+    const timer = setTimeout(async () => {
+      try {
+        const response = await fetch(
+          `${API_BASE}/api/search?q=${encodeURIComponent(query)}&limit=20`
+        );
+        if (!response.ok) throw new Error('Search failed');
+        const data = await response.json();
+        setResults(data.results || []);
+        setIsOpen(true);
+        setSelectedIndex(-1);
+      } catch (err) {
+        console.error('Search error:', err);
+        setResults([]);
+      } finally {
+        setIsLoading(false);
+      }
+    }, 150);
+
+    return () => clearTimeout(timer);
+  }, [query]);
+
+  const handleSelect = useCallback((result: SearchResult) => {
+    setSearchQuery(result.model_id);
+
+    // Trigger zoom if coordinates available
+    if (onZoomTo && result.x !== undefined && result.y !== undefined) {
+      onZoomTo(result.x, result.y, result.z || 0);
+    }
+
+    // Trigger select callback
+    if (onSelect) {
+      onSelect(result);
+    }
+
+    setIsOpen(false);
+    setQuery('');
+    inputRef.current?.blur();
+  }, [onSelect, onZoomTo, setSearchQuery]);
+
+  const handleKeyDown = (e: React.KeyboardEvent) => {
+    if (!isOpen || results.length === 0) return;
+
+    if (e.key === 'ArrowDown') {
+      e.preventDefault();
+      // Compute the next index first so the scroll targets the newly
+      // selected row instead of the stale state value
+      const nextIndex = selectedIndex < results.length - 1 ? selectedIndex + 1 : selectedIndex;
+      setSelectedIndex(nextIndex);
+      if (resultsRef.current && nextIndex >= 0) {
+        const selectedElement = resultsRef.current.children[nextIndex] as HTMLElement;
+        selectedElement?.scrollIntoView({ block: 'nearest' });
+      }
+    } else if (e.key === 'ArrowUp') {
+      e.preventDefault();
+      setSelectedIndex(prev => prev > 0 ? prev - 1 : -1);
+    } else if (e.key === 'Enter') {
+      e.preventDefault();
+      if (selectedIndex >= 0 && results[selectedIndex]) {
+        handleSelect(results[selectedIndex]);
+      } else if (results.length > 0) {
+        handleSelect(results[0]);
+      }
+    } else if (e.key === 'Escape') {
+      setIsOpen(false);
+      inputRef.current?.blur();
+    }
+  };
+
+  const handleFocus = () => {
+    if (results.length > 0) {
+      setIsOpen(true);
+    }
+  };
+
+  const handleBlur = () => {
+    // Delay to allow click events on results
+    setTimeout(() => {
+      if (!resultsRef.current?.contains(document.activeElement)) {
+        setIsOpen(false);
+      }
+    }, 200);
+  };
+
+  return (
+    <div className="search-bar-container">
+      <div className="search-bar">
+        <input
+          ref={inputRef}
+          type="text"
+          value={query}
+          onChange={(e) => setQuery(e.target.value)}
+          onKeyDown={handleKeyDown}
+          onFocus={handleFocus}
+          onBlur={handleBlur}
+          placeholder="Search models, orgs, tasks, licenses..."
+          className="search-input"
+          aria-label="Search models"
+          aria-expanded={isOpen}
+          aria-haspopup="listbox"
+        />
+        {isLoading && <div className="search-loading">⟳</div>}
+        {query.length > 0 && !isLoading && (
+          <button
+            className="search-clear"
+            onClick={() => {
+              setQuery('');
+              setResults([]);
+              setIsOpen(false);
+            }}
+            aria-label="Clear search"
+          >
+            Γ—
+          </button>
+        )}
+      </div>
+      {isOpen && results.length > 0 && (
+        <div ref={resultsRef} className="search-results" role="listbox">
+          {results.map((result, idx) => (
+            <div
+              key={result.model_id}
+              className={`search-result ${idx === selectedIndex ? 'selected' : ''}`}
+              onClick={() => handleSelect(result)}
+              role="option"
+              aria-selected={idx === selectedIndex}
+            >
+              <div className="result-header">
+                <strong className="result-model-id">{result.model_id}</strong>
+                {result.org && <span className="result-org">{result.org}</span>}
+              </div>
+              <div className="result-meta">
+                {result.library && <span className="result-tag">{result.library}</span>}
+                {result.pipeline && <span className="result-tag">{result.pipeline}</span>}
+                {result.license && <span className="result-tag">{result.license}</span>}
+              </div>
+              {result.snippet && (
+                <div
+                  className="result-snippet"
+                  dangerouslySetInnerHTML={{ __html: result.snippet }}
+                />
+              )}
+            </div>
+          ))}
+        </div>
+      )}
+      {isOpen && query.length >= 2 && results.length === 0 && !isLoading && (
+        <div className="search-results">
+          <div className="search-no-results">No results found</div>
+        </div>
+      )}
+    </div>
+  );
+}
+
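
A usage sketch for SearchBar, wiring the two callbacks; the camera handler is a stand-in, and only the prop names come from this diff. Since `result.snippet` is injected with `dangerouslySetInnerHTML`, it is worth ensuring the backend escapes user content before adding highlight markup:

import React from 'react';
import SearchBar from './components/layout/SearchBar';

function Header({ flyTo }: { flyTo: (x: number, y: number, z: number) => void }) {
  return (
    <SearchBar
      onZoomTo={(x, y, z) => flyTo(x, y, z)} // e.g. animate the camera to the model's position
      onSelect={(result) => console.log('selected', result.model_id)}
    />
  );
}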
frontend/src/components/{FileTree.css β†’ modals/FileTree.css} RENAMED
@@ -3,8 +3,11 @@
   border: 1px solid #e0e0e0;
   border-radius: 4px;
   background: #fafafa;
-  max-height: 400px;
+  max-height: 600px;
   overflow-y: auto;
+  overflow-x: hidden;
+  display: flex;
+  flex-direction: column;
 }
 
 .file-tree-header {
@@ -16,6 +19,19 @@
   border-bottom: 1px solid #e0e0e0;
   font-size: 0.9rem;
   font-weight: 600;
+  flex-shrink: 0;
+  position: sticky;
+  top: 0;
+  z-index: 10;
+}
+
+.file-count-badge {
+  background: #e3f2fd;
+  color: #1976d2;
+  padding: 0.2rem 0.5rem;
+  border-radius: 12px;
+  font-size: 0.75rem;
+  font-weight: 500;
 }
 
 .file-tree-link {
@@ -23,16 +39,110 @@
   text-decoration: none;
   font-size: 0.85rem;
   font-weight: 400;
+  white-space: nowrap;
 }
 
 .file-tree-link:hover {
   text-decoration: underline;
 }
 
+.file-tree-button {
+  background: #f0f0f0;
+  border: 1px solid #d0d0d0;
+  border-radius: 3px;
+  padding: 0.25rem 0.5rem;
+  font-size: 0.75rem;
+  cursor: pointer;
+  color: #333;
+  font-family: 'Instrument Sans', sans-serif;
+  transition: background 0.15s;
+}
+
+.file-tree-button:hover {
+  background: #e0e0e0;
+}
+
+.file-tree-button:active {
+  background: #d0d0d0;
+}
+
+.file-tree-filters {
+  padding: 0.75rem 1rem;
+  background: #ffffff;
+  border-bottom: 1px solid #e0e0e0;
+  display: flex;
+  gap: 0.75rem;
+  flex-shrink: 0;
+  position: sticky;
+  top: 48px;
+  z-index: 9;
+}
+
+.file-tree-search {
+  flex: 1;
+  position: relative;
+  display: flex;
+  align-items: center;
+}
+
+.file-tree-search-input {
+  width: 100%;
+  padding: 0.5rem 2rem 0.5rem 0.75rem;
+  border: 1px solid #d0d0d0;
+  border-radius: 4px;
+  font-size: 0.85rem;
+  font-family: 'Instrument Sans', sans-serif;
+}
+
+.file-tree-search-input:focus {
+  outline: none;
+  border-color: #4a90e2;
+  box-shadow: 0 0 0 2px rgba(74, 144, 226, 0.1);
+}
+
+.file-tree-clear {
+  position: absolute;
+  right: 0.5rem;
+  background: none;
+  border: none;
+  cursor: pointer;
+  color: #666;
+  font-size: 1rem;
+  padding: 0.25rem;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  border-radius: 2px;
+}
+
+.file-tree-clear:hover {
+  background: #f0f0f0;
+  color: #1a1a1a;
+}
+
+.file-tree-type-filter {
+  padding: 0.5rem 0.75rem;
+  border: 1px solid #d0d0d0;
+  border-radius: 4px;
+  font-size: 0.85rem;
+  font-family: 'Instrument Sans', sans-serif;
+  background: white;
+  cursor: pointer;
+  min-width: 150px;
+}
+
+.file-tree-type-filter:focus {
+  outline: none;
+  border-color: #4a90e2;
+  box-shadow: 0 0 0 2px rgba(74, 144, 226, 0.1);
+}
+
 .file-tree {
   padding: 0.5rem;
   font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
   font-size: 0.85rem;
+  flex: 1;
+  overflow-y: auto;
 }
 
 .file-tree-node {
@@ -43,9 +153,14 @@
   display: flex;
   align-items: center;
   gap: 0.5rem;
-  padding: 0.25rem 0.5rem;
-  border-radius: 2px;
+  padding: 0.375rem 0.5rem;
+  border-radius: 3px;
   transition: background 0.15s;
+  user-select: none;
+}
+
+.file-tree-item.directory {
+  cursor: pointer;
 }
 
 .file-tree-item.directory:hover {
@@ -56,6 +171,37 @@
   background: #f0f0f0;
 }
 
+.file-actions {
+  display: flex;
+  gap: 0.25rem;
+  margin-left: auto;
+  opacity: 0;
+  transition: opacity 0.2s;
+}
+
+.file-tree-item:hover .file-actions {
+  opacity: 1;
+}
+
+.file-action-btn {
+  background: none;
+  border: none;
+  cursor: pointer;
+  font-size: 0.9rem;
+  padding: 0.25rem;
+  border-radius: 2px;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+  transition: background 0.15s;
+  text-decoration: none;
+  color: inherit;
+}
+
+.file-action-btn:hover {
+  background: rgba(0, 0, 0, 0.1);
+}
+
 .file-icon {
   font-size: 1rem;
   width: 1.25rem;
@@ -83,6 +229,9 @@
 
 .file-tree-children {
   margin-left: 0.5rem;
+  border-left: 1px solid #e8e8e8;
+  padding-left: 0.5rem;
+  margin-top: 0.125rem;
 }
 
 .file-tree-loading,
@@ -98,3 +247,22 @@
   color: #d32f2f;
 }
 
+/* Scrollbar styling */
+.file-tree-container::-webkit-scrollbar {
+  width: 8px;
+}
+
+.file-tree-container::-webkit-scrollbar-track {
+  background: #f1f1f1;
+  border-radius: 4px;
+}
+
+.file-tree-container::-webkit-scrollbar-thumb {
+  background: #c1c1c1;
+  border-radius: 4px;
+}
+
+.file-tree-container::-webkit-scrollbar-thumb:hover {
+  background: #a8a8a8;
+}
+
frontend/src/components/{FileTree.tsx β†’ modals/FileTree.tsx} RENAMED
@@ -2,9 +2,12 @@
2
  * File tree component for displaying model file structure.
3
  * Fetches and displays files from Hugging Face model repository.
4
  */
5
- import React, { useState, useEffect } from 'react';
 
6
  import './FileTree.css';
7
 
 
 
8
  interface FileNode {
9
  path: string;
10
  type: 'file' | 'directory';
@@ -21,50 +24,84 @@ export default function FileTree({ modelId }: FileTreeProps) {
21
  const [loading, setLoading] = useState(true);
22
  const [error, setError] = useState<string | null>(null);
23
  const [expandedPaths, setExpandedPaths] = useState<Set<string>>(new Set());
 
 
 
 
24
 
25
  useEffect(() => {
 
 
 
 
 
 
26
  const fetchFiles = async () => {
27
  setLoading(true);
28
  setError(null);
29
  try {
30
- // Fetch file tree through our backend API (avoids CORS issues)
31
- // Use same API base as main app
32
- const apiBase = (window as any).__API_BASE__ || process.env.REACT_APP_API_URL || 'http://localhost:8000';
33
  const response = await fetch(
34
- `${apiBase}/api/model/${encodeURIComponent(modelId)}/files?branch=main`
35
  );
36
 
37
- if (!response.ok) {
38
  throw new Error('File tree not available for this model');
39
  }
 
 
 
 
 
 
 
 
 
40
 
41
  const data = await response.json();
42
 
 
 
 
 
43
  // Convert flat list to tree structure
44
  const tree = buildFileTree(data);
45
  setFiles(tree);
46
  } catch (err: any) {
47
- setError(err instanceof Error ? err.message : 'Failed to load files');
48
- console.error('Error fetching file tree:', err);
 
 
 
 
49
  } finally {
50
  setLoading(false);
51
  }
52
  };
53
 
54
- if (modelId) {
55
- fetchFiles();
56
- }
57
  }, [modelId]);
58
 
59
  const buildFileTree = (fileList: any[]): FileNode[] => {
 
 
 
 
60
  const tree: FileNode[] = [];
61
  const pathMap = new Map<string, FileNode>();
62
 
63
- // Sort files by path
64
- const sortedFiles = [...fileList].sort((a, b) => a.path.localeCompare(b.path));
 
 
 
 
65
 
66
  for (const file of sortedFiles) {
67
- const parts = file.path.split('/');
 
 
 
 
68
  let currentPath = '';
69
  let parent: FileNode | null = null;
70
 
@@ -77,7 +114,7 @@ export default function FileTree({ modelId }: FileTreeProps) {
77
  const node: FileNode = {
78
  path: currentPath,
79
  type: isDirectory ? 'directory' : 'file',
80
- size: file.size,
81
  children: isDirectory ? [] : undefined,
82
  };
83
 
@@ -111,6 +148,26 @@ export default function FileTree({ modelId }: FileTreeProps) {
111
  });
112
  };
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  const formatFileSize = (bytes?: number): string => {
115
  if (!bytes) return '';
116
  if (bytes < 1024) return `${bytes} B`;
@@ -119,6 +176,109 @@ export default function FileTree({ modelId }: FileTreeProps) {
119
  return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
120
  };
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  const getFileIcon = (node: FileNode): string => {
123
  if (node.type === 'directory') {
124
  return expandedPaths.has(node.path) ? 'πŸ“‚' : 'πŸ“';
@@ -141,9 +301,28 @@ export default function FileTree({ modelId }: FileTreeProps) {
141
  return iconMap[ext || ''] || 'πŸ“„';
142
  };
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  const renderNode = (node: FileNode, depth: number = 0): React.ReactNode => {
145
  const isExpanded = expandedPaths.has(node.path);
146
  const hasChildren = node.children && node.children.length > 0;
 
147
 
148
  return (
149
  <div key={node.path} className="file-tree-node" style={{ paddingLeft: `${depth * 1.5}rem` }}>
@@ -153,13 +332,37 @@ export default function FileTree({ modelId }: FileTreeProps) {
153
  style={{ cursor: node.type === 'directory' ? 'pointer' : 'default' }}
154
  >
155
  <span className="file-icon">{getFileIcon(node)}</span>
156
- <span className="file-name">{node.path.split('/').pop()}</span>
157
  {node.type === 'file' && node.size && (
158
  <span className="file-size">{formatFileSize(node.size)}</span>
159
  )}
160
  {node.type === 'directory' && (
161
  <span className="file-expand">{isExpanded ? 'β–Ό' : 'β–Ά'}</span>
162
  )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  </div>
164
  {isExpanded && hasChildren && (
165
  <div className="file-tree-children">
@@ -199,21 +402,106 @@ export default function FileTree({ modelId }: FileTreeProps) {
199
  );
200
  }
201
 
 
 
202
  return (
203
  <div className="file-tree-container">
204
  <div className="file-tree-header">
205
- <strong>Repository Files</strong>
206
- <a
207
- href={`https://huggingface.co/${modelId}/tree/main`}
208
- target="_blank"
209
- rel="noopener noreferrer"
210
- className="file-tree-link"
211
- >
212
- View on Hugging Face β†’
213
- </a>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  <div className="file-tree">
216
- {files.map((node) => renderNode(node))}
 
 
 
 
 
 
 
 
217
  </div>
218
  </div>
219
  );
 
2
  * File tree component for displaying model file structure.
3
  * Fetches and displays files from Hugging Face model repository.
4
  */
5
+ import React, { useState, useEffect, useMemo } from 'react';
6
+ import { getHuggingFaceFileTreeUrl } from '../../utils/api/hfUrl';
7
  import './FileTree.css';
8
 
9
+ import { API_BASE } from '../../config/api';
10
+
11
  interface FileNode {
12
  path: string;
13
  type: 'file' | 'directory';
 
   const [loading, setLoading] = useState(true);
   const [error, setError] = useState<string | null>(null);
   const [expandedPaths, setExpandedPaths] = useState<Set<string>>(new Set());
+  const [searchQuery, setSearchQuery] = useState('');
+  const [fileTypeFilter, setFileTypeFilter] = useState<string>('all');
+  const [showSearch, setShowSearch] = useState(false);
+  const searchInputRef = React.useRef<HTMLInputElement>(null);
 
   useEffect(() => {
+    if (!modelId) {
+      setLoading(false);
+      setError('No model ID provided');
+      return;
+    }
+
     const fetchFiles = async () => {
       setLoading(true);
       setError(null);
       try {
         const response = await fetch(
+          `${API_BASE}/api/model/${encodeURIComponent(modelId)}/files?branch=main`
         );
 
+        if (response.status === 404) {
           throw new Error('File tree not available for this model');
         }
+
+        if (response.status === 503) {
+          throw new Error('Backend service unavailable');
+        }
+
+        if (!response.ok) {
+          const errorText = await response.text();
+          throw new Error(`Failed to load file tree: ${response.status} ${errorText}`);
+        }
 
         const data = await response.json();
 
+        if (!Array.isArray(data)) {
+          throw new Error('Invalid response format');
+        }
+
         // Convert flat list to tree structure
         const tree = buildFileTree(data);
         setFiles(tree);
       } catch (err: any) {
+        const errorMessage = err instanceof Error ? err.message : 'Failed to load files';
+        setError(errorMessage);
+        // Only log in development
+        if (process.env.NODE_ENV === 'development') {
+          console.error('Error fetching file tree:', err);
+        }
       } finally {
         setLoading(false);
       }
     };
 
+    fetchFiles();
   }, [modelId]);
 
   const buildFileTree = (fileList: any[]): FileNode[] => {
+    if (!Array.isArray(fileList) || fileList.length === 0) {
+      return [];
+    }
+
     const tree: FileNode[] = [];
     const pathMap = new Map<string, FileNode>();
 
+    // Sort files by path for consistent ordering
+    const sortedFiles = [...fileList].sort((a, b) => {
+      const pathA = a.path || '';
+      const pathB = b.path || '';
+      return pathA.localeCompare(pathB);
+    });
 
     for (const file of sortedFiles) {
+      if (!file.path) continue;
+
+      const parts = file.path.split('/').filter((p: string) => p.length > 0);
+      if (parts.length === 0) continue;
+
       let currentPath = '';
       let parent: FileNode | null = null;
 
 
       const node: FileNode = {
         path: currentPath,
         type: isDirectory ? 'directory' : 'file',
+        size: isDirectory ? undefined : (file.size || undefined), // Only set size for files
         children: isDirectory ? [] : undefined,
       };
 
 
     });
   };
 
+  const expandAll = () => {
+    const allPaths = new Set<string>();
+    const collectPaths = (nodes: FileNode[]) => {
+      nodes.forEach(node => {
+        if (node.type === 'directory' && node.children) {
+          allPaths.add(node.path);
+          if (node.children.length > 0) {
+            collectPaths(node.children);
+          }
+        }
+      });
+    };
+    collectPaths(files);
+    setExpandedPaths(allPaths);
+  };
+
+  const collapseAll = () => {
+    setExpandedPaths(new Set());
+  };
+
   const formatFileSize = (bytes?: number): string => {
     if (!bytes) return '';
     if (bytes < 1024) return `${bytes} B`;
176
  return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
177
  };
178
 
179
+ // Get all file extensions from the tree
180
+ const getAllFileExtensions = useMemo(() => {
181
+ const extensions = new Set<string>();
182
+ const collectExtensions = (nodes: FileNode[]) => {
183
+ nodes.forEach(node => {
184
+ if (node.type === 'file') {
185
+ const ext = node.path.split('.').pop()?.toLowerCase();
186
+ if (ext) extensions.add(ext);
187
+ }
188
+ if (node.children) {
189
+ collectExtensions(node.children);
190
+ }
191
+ });
192
+ };
193
+ collectExtensions(files);
194
+ return Array.from(extensions).sort();
195
+ }, [files]);
196
+
197
+ // Auto-expand directories when searching
198
+ useEffect(() => {
199
+ if (searchQuery) {
200
+ const pathsToExpand = new Set<string>();
201
+ const findMatchingPaths = (nodes: FileNode[], query: string) => {
202
+ nodes.forEach(node => {
203
+ if (node.path.toLowerCase().includes(query.toLowerCase())) {
204
+ // Expand all parent directories
205
+ const parts = node.path.split('/');
206
+ let currentPath = '';
207
+ for (let i = 0; i < parts.length - 1; i++) {
208
+ currentPath = currentPath ? `${currentPath}/${parts[i]}` : parts[i];
209
+ pathsToExpand.add(currentPath);
210
+ }
211
+ }
212
+ if (node.children) {
213
+ findMatchingPaths(node.children, query);
214
+ }
215
+ });
216
+ };
217
+ findMatchingPaths(files, searchQuery);
218
+ setExpandedPaths(pathsToExpand);
219
+ }
220
+ }, [searchQuery, files]);
221
+
222
+ // Filter files based on search and file type
223
+ const filterNodes = (nodes: FileNode[]): FileNode[] => {
224
+ return nodes
225
+ .map(node => {
226
+ const matchesSearch = !searchQuery ||
227
+ node.path.toLowerCase().includes(searchQuery.toLowerCase());
228
+ const matchesType = fileTypeFilter === 'all' ||
229
+ (node.type === 'file' && node.path.toLowerCase().endsWith(`.${fileTypeFilter}`)) ||
230
+ (node.type === 'directory');
231
+
232
+ if (!matchesSearch || !matchesType) {
233
+ return null;
234
+ }
235
+
236
+ const filteredChildren = node.children ? filterNodes(node.children) : undefined;
237
+ const result: FileNode | null = filteredChildren && filteredChildren.length > 0
238
+ ? { ...node, children: filteredChildren }
239
+ : filteredChildren === undefined && matchesSearch && matchesType
240
+ ? { ...node }
241
+ : null;
242
+ return result;
243
+ })
244
+ .filter((node): node is FileNode => node !== null);
245
+ };
246
+
247
+ const filteredFiles = useMemo(() => {
248
+ if (!searchQuery && fileTypeFilter === 'all') return files;
249
+ return filterNodes(files);
250
+ }, [files, searchQuery, fileTypeFilter]);
251
+
252
+ // Count total files
253
+ const countFiles = (nodes: FileNode[]): number => {
254
+ let count = 0;
255
+ nodes.forEach(node => {
256
+ if (node.type === 'file') count++;
257
+ if (node.children) count += countFiles(node.children);
258
+ });
259
+ return count;
260
+ };
261
+
262
+ const totalFileCount = useMemo(() => countFiles(files), [files]);
263
+ const visibleFileCount = useMemo(() => countFiles(filteredFiles), [filteredFiles]);
264
+
265
+ // Keyboard shortcut for search (Cmd+K / Ctrl+K)
266
+ useEffect(() => {
267
+ const handleKeyDown = (e: KeyboardEvent) => {
268
+ if ((e.metaKey || e.ctrlKey) && e.key === 'k') {
269
+ e.preventDefault();
270
+ setShowSearch(true);
271
+ setTimeout(() => searchInputRef.current?.focus(), 0);
272
+ }
273
+ if (e.key === 'Escape' && showSearch) {
274
+ setShowSearch(false);
275
+ setSearchQuery('');
276
+ }
277
+ };
278
+ window.addEventListener('keydown', handleKeyDown);
279
+ return () => window.removeEventListener('keydown', handleKeyDown);
280
+ }, [showSearch]);
281
+
282
  const getFileIcon = (node: FileNode): string => {
283
  if (node.type === 'directory') {
284
  return expandedPaths.has(node.path) ? 'πŸ“‚' : 'πŸ“';
 
     return iconMap[ext || ''] || '📄';
   };
 
+  const copyFilePath = (path: string) => {
+    navigator.clipboard.writeText(path).then(() => {
+      // Show temporary feedback
+      const button = document.querySelector(`[data-file-path="${path}"]`) as HTMLElement;
+      if (button) {
+        const originalText = button.textContent;
+        button.textContent = 'Copied!';
+        setTimeout(() => {
+          if (button) button.textContent = originalText;
+        }, 1000);
+      }
+    });
+  };
+
+  const getFileUrl = (path: string) => {
+    return `https://huggingface.co/${modelId}/resolve/main/${path}`;
+  };
+
   const renderNode = (node: FileNode, depth: number = 0): React.ReactNode => {
     const isExpanded = expandedPaths.has(node.path);
     const hasChildren = node.children && node.children.length > 0;
+    const fileName = node.path.split('/').pop() || node.path;
 
     return (
       <div key={node.path} className="file-tree-node" style={{ paddingLeft: `${depth * 1.5}rem` }}>
 
           style={{ cursor: node.type === 'directory' ? 'pointer' : 'default' }}
         >
           <span className="file-icon">{getFileIcon(node)}</span>
+          <span className="file-name" title={node.path}>{fileName}</span>
           {node.type === 'file' && node.size && (
             <span className="file-size">{formatFileSize(node.size)}</span>
           )}
           {node.type === 'directory' && (
             <span className="file-expand">{isExpanded ? '▼' : '▶'}</span>
           )}
+          {node.type === 'file' && (
+            <div className="file-actions" onClick={(e) => e.stopPropagation()}>
+              <button
+                className="file-action-btn"
+                onClick={() => copyFilePath(node.path)}
+                data-file-path={node.path}
+                title="Copy file path"
+                aria-label="Copy path"
+              >
+                📋
+              </button>
+              <a
+                href={getFileUrl(node.path)}
+                target="_blank"
+                rel="noopener noreferrer"
+                className="file-action-btn"
+                title="Download file"
+                aria-label="Download"
+                onClick={(e) => e.stopPropagation()}
+              >
+                ⬇️
+              </a>
+            </div>
+          )}
         </div>
         {isExpanded && hasChildren && (
           <div className="file-tree-children">
 
     );
   }
 
+  const hasDirectories = files.some(node => node.type === 'directory');
+
   return (
     <div className="file-tree-container">
       <div className="file-tree-header">
+        <div style={{ display: 'flex', alignItems: 'center', gap: '0.5rem' }}>
+          <strong>Repository Files</strong>
+          <span className="file-count-badge">
+            {visibleFileCount === totalFileCount
+              ? `${totalFileCount} file${totalFileCount !== 1 ? 's' : ''}`
+              : `${visibleFileCount} of ${totalFileCount} files`}
+          </span>
+        </div>
+        <div style={{ display: 'flex', gap: '0.5rem', alignItems: 'center', flexWrap: 'wrap' }}>
+          <button
+            onClick={() => setShowSearch(!showSearch)}
+            className="file-tree-button"
+            title="Search files (Cmd+K)"
+            aria-label="Search"
+          >
+            🔍 Search
+          </button>
+          {hasDirectories && (
+            <>
+              <button
+                onClick={expandAll}
+                className="file-tree-button"
+                title="Expand all directories"
+                aria-label="Expand all"
+              >
+                Expand All
+              </button>
+              <button
+                onClick={collapseAll}
+                className="file-tree-button"
+                title="Collapse all directories"
+                aria-label="Collapse all"
+              >
+                Collapse All
+              </button>
+            </>
+          )}
+          <a
+            href={getHuggingFaceFileTreeUrl(modelId, 'main')}
+            target="_blank"
+            rel="noopener noreferrer"
+            className="file-tree-link"
+          >
+            View on HF →
+          </a>
+        </div>
       </div>
+
+      {/* Search and Filter Bar */}
+      {(showSearch || searchQuery || fileTypeFilter !== 'all') && (
+        <div className="file-tree-filters">
+          <div className="file-tree-search">
+            <input
+              ref={searchInputRef}
+              type="text"
+              placeholder="Search files... (Cmd+K)"
+              value={searchQuery}
+              onChange={(e) => setSearchQuery(e.target.value)}
+              className="file-tree-search-input"
+            />
+            {searchQuery && (
+              <button
+                onClick={() => setSearchQuery('')}
+                className="file-tree-clear"
+                aria-label="Clear search"
+              >
+                ✕
+              </button>
+            )}
+          </div>
+          {getAllFileExtensions.length > 0 && (
+            <select
+              value={fileTypeFilter}
+              onChange={(e) => setFileTypeFilter(e.target.value)}
+              className="file-tree-type-filter"
+            >
+              <option value="all">All file types</option>
+              {getAllFileExtensions.map(ext => (
+                <option key={ext} value={ext}>.{ext}</option>
+              ))}
+            </select>
+          )}
+        </div>
+      )}
+
       <div className="file-tree">
+        {filteredFiles.length === 0 ? (
+          <div className="file-tree-empty">
+            {searchQuery || fileTypeFilter !== 'all'
+              ? 'No files match your filters'
+              : 'No files found'}
+          </div>
+        ) : (
+          filteredFiles.map((node) => renderNode(node))
+        )}
       </div>
     </div>
   );
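
The heart of this component is `buildFileTree`, which folds the flat path list returned by the `/api/model/{id}/files` endpoint into nested `FileNode`s. Part of the loop body is elided from the diff, so the following is a minimal standalone sketch of the same idea rather than the component's exact code; the sample input is invented:

```typescript
interface FileNode {
  path: string;
  type: 'file' | 'directory';
  size?: number;
  children?: FileNode[];
}

// Walk each path segment, creating directory nodes on the way down.
// A Map from full path to node makes repeated-prefix lookups cheap.
function buildFileTree(fileList: { path: string; size?: number }[]): FileNode[] {
  const tree: FileNode[] = [];
  const pathMap = new Map<string, FileNode>();
  const sorted = [...fileList].sort((a, b) => a.path.localeCompare(b.path));

  for (const file of sorted) {
    const parts = file.path.split('/').filter((p) => p.length > 0);
    let currentPath = '';
    let parent: FileNode | null = null;

    parts.forEach((part, i) => {
      currentPath = currentPath ? `${currentPath}/${part}` : part;
      const isDirectory = i < parts.length - 1;
      let node = pathMap.get(currentPath);
      if (!node) {
        node = {
          path: currentPath,
          type: isDirectory ? 'directory' : 'file',
          size: isDirectory ? undefined : file.size,
          children: isDirectory ? [] : undefined,
        };
        pathMap.set(currentPath, node);
        (parent?.children ?? tree).push(node); // top-level nodes go straight into the tree
      }
      parent = node;
    });
  }
  return tree;
}

// Example: 'onnx/model.onnx' yields a directory node 'onnx' with one file
// child; 'config.json' stays at the top level.
console.log(buildFileTree([
  { path: 'config.json', size: 512 },
  { path: 'onnx/model.onnx', size: 1048576 },
]));
```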
frontend/src/components/{ModelModal.css β†’ modals/ModelModal.css} RENAMED
@@ -25,7 +25,7 @@
 .modal-content {
   background: #ffffff;
   border-radius: 8px;
-  max-width: 800px;
   width: 100%;
   max-height: 90vh;
   overflow-y: auto;
@@ -34,11 +34,16 @@
   box-shadow: 0 8px 32px rgba(0, 0, 0, 0.2);
   border: 1px solid #d0d0d0;
   animation: slideUp 0.3s ease-out;
-  font-family: 'Vend Sans', sans-serif;
   display: flex;
   flex-direction: column;
 }
 
 @keyframes slideUp {
   from {
     transform: translateY(20px);
@@ -66,7 +71,7 @@
   justify-content: center;
   border-radius: 2px;
   transition: all 0.2s;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .modal-close:hover {
@@ -89,7 +94,7 @@
   font-size: 1.5rem;
   color: #1a1a1a;
   word-break: break-word;
-  font-family: 'Vend Sans', sans-serif;
   font-weight: 600;
   line-height: 1.3;
 }
@@ -117,7 +122,7 @@
   border-radius: 4px;
   cursor: pointer;
   font-size: 0.85rem;
-  font-family: 'Vend Sans', sans-serif;
   transition: all 0.2s;
   font-weight: 500;
 }
@@ -136,6 +141,12 @@
   gap: 0.5rem;
   margin-bottom: 1.5rem;
   border-bottom: 2px solid #e0e0e0;
 }
 
 .modal-tab {
@@ -145,11 +156,29 @@
   border-bottom: 2px solid transparent;
   cursor: pointer;
   font-size: 0.9rem;
-  font-family: 'Vend Sans', sans-serif;
   color: #666;
   font-weight: 500;
   margin-bottom: -2px;
   transition: all 0.2s;
 }
 
 .modal-tab:hover {
@@ -186,14 +215,14 @@
   text-transform: uppercase;
   letter-spacing: 0.5px;
   font-weight: 600;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .info-value {
   font-size: 1.1rem;
   color: #1a1a1a;
   font-weight: 500;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .info-value.highlight {
@@ -230,7 +259,7 @@
   letter-spacing: 0.5px;
   font-weight: 600;
   margin-bottom: 0.75rem;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .section-content {
@@ -287,7 +316,7 @@
   color: #4a4a4a;
   text-transform: uppercase;
   letter-spacing: 0.5px;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .modal-info-grid {
@@ -306,14 +335,14 @@
   font-size: 0.875rem;
   color: #6a6a6a;
   font-weight: 500;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .modal-info-item span {
   font-size: 1rem;
   color: #1a1a1a;
   font-weight: 500;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .modal-tags {
@@ -324,7 +353,7 @@
   color: #1a1a1a;
   font-size: 0.9rem;
   line-height: 1.5;
-  font-family: 'Vend Sans', sans-serif;
 }
 
 .modal-footer {
@@ -345,7 +374,7 @@
   text-decoration: none;
   border-radius: 4px;
   font-weight: 500;
-  font-family: 'Vend Sans', sans-serif;
   transition: all 0.2s;
   border: 1px solid #1a1a1a;
 }
 
 .modal-content {
   background: #ffffff;
   border-radius: 8px;
+  max-width: 900px;
   width: 100%;
   max-height: 90vh;
   overflow-y: auto;
 
   box-shadow: 0 8px 32px rgba(0, 0, 0, 0.2);
   border: 1px solid #d0d0d0;
   animation: slideUp 0.3s ease-out;
+  font-family: 'Instrument Sans', sans-serif;
   display: flex;
   flex-direction: column;
 }
 
+.modal-content[data-tab="files"] {
+  max-width: 1000px;
+  max-height: 95vh;
+}
+
 @keyframes slideUp {
   from {
     transform: translateY(20px);
 
   justify-content: center;
   border-radius: 2px;
   transition: all 0.2s;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .modal-close:hover {
 
   font-size: 1.5rem;
   color: #1a1a1a;
   word-break: break-word;
+  font-family: 'Instrument Sans', sans-serif;
   font-weight: 600;
   line-height: 1.3;
 }
 
   border-radius: 4px;
   cursor: pointer;
   font-size: 0.85rem;
+  font-family: 'Instrument Sans', sans-serif;
   transition: all 0.2s;
   font-weight: 500;
 }
 
   gap: 0.5rem;
   margin-bottom: 1.5rem;
   border-bottom: 2px solid #e0e0e0;
+  position: sticky;
+  top: 0;
+  background: #ffffff;
+  z-index: 10;
+  padding-top: 0.5rem;
+  margin-top: -0.5rem;
 }
 
 .modal-tab {
 
   border-bottom: 2px solid transparent;
   cursor: pointer;
   font-size: 0.9rem;
+  font-family: 'Instrument Sans', sans-serif;
   color: #666;
   font-weight: 500;
   margin-bottom: -2px;
   transition: all 0.2s;
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  position: relative;
+}
+
+.tab-icon {
+  font-size: 1rem;
+}
+
+.tab-badge {
+  background: #4a90e2;
+  color: white;
+  font-size: 0.7rem;
+  padding: 0.15rem 0.4rem;
+  border-radius: 10px;
+  font-weight: 600;
+  margin-left: 0.25rem;
 }
 
 .modal-tab:hover {
 
   text-transform: uppercase;
   letter-spacing: 0.5px;
   font-weight: 600;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .info-value {
   font-size: 1.1rem;
   color: #1a1a1a;
   font-weight: 500;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .info-value.highlight {
 
   letter-spacing: 0.5px;
   font-weight: 600;
   margin-bottom: 0.75rem;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .section-content {
 
   color: #4a4a4a;
   text-transform: uppercase;
   letter-spacing: 0.5px;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .modal-info-grid {
 
   font-size: 0.875rem;
   color: #6a6a6a;
   font-weight: 500;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .modal-info-item span {
   font-size: 1rem;
   color: #1a1a1a;
   font-weight: 500;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .modal-tags {
 
   color: #1a1a1a;
   font-size: 0.9rem;
   line-height: 1.5;
+  font-family: 'Instrument Sans', sans-serif;
 }
 
 .modal-footer {
 
   text-decoration: none;
   border-radius: 4px;
   font-weight: 500;
+  font-family: 'Instrument Sans', sans-serif;
   transition: all 0.2s;
   border: 1px solid #1a1a1a;
 }
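
The new `.modal-content[data-tab="files"]` rule pairs with the `data-tab={activeTab}` attribute added in ModelModal.tsx below: the modal widens for the Files tab purely in CSS, keyed off a data attribute instead of inline style computation. A minimal sketch of the pattern (the component and class names here are illustrative, not the app's):

```tsx
import React, { useState } from 'react';

// CSS can then target `.panel[data-tab="files"] { max-width: 1000px; }`
// without any JavaScript measuring or style mutation.
export function TabbedPanel() {
  const [activeTab, setActiveTab] = useState<'details' | 'files'>('details');
  return (
    <div className="panel" data-tab={activeTab}>
      <button onClick={() => setActiveTab('details')}>Details</button>
      <button onClick={() => setActiveTab('files')}>Files</button>
    </div>
  );
}
```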
frontend/src/components/{ModelModal.tsx β†’ modals/ModelModal.tsx} RENAMED
@@ -3,12 +3,12 @@
  * Enhanced with bookmark, comparison, similar models, and file tree features.
  */
 import React, { useState, useEffect } from 'react';
-import { ModelPoint } from '../types';
 import FileTree from './FileTree';
 import './ModelModal.css';
 
-const API_BASE = process.env.REACT_APP_API_URL || 'http://localhost:8000';
-
 interface ArxivPaper {
   arxiv_id: string;
   title: string;
@@ -71,7 +71,7 @@ export default function ModelModal({
 
   if (!isOpen || !model) return null;
 
-  const hfUrl = `https://huggingface.co/${model.model_id}`;
 
   // Parse tags if it's a string representation of an array
   const parseTags = (tags: string | null | undefined): string[] => {
@@ -156,7 +156,11 @@ export default function ModelModal({
 
   return (
     <div className="modal-overlay" onClick={onClose}>
-      <div className="modal-content" onClick={(e) => e.stopPropagation()}>
       <div className="modal-header">
         <h2>{model.model_id}</h2>
         <button className="modal-close" onClick={onClose}>Close</button>
@@ -197,20 +201,24 @@ export default function ModelModal({
           className={`modal-tab ${activeTab === 'details' ? 'active' : ''}`}
           onClick={() => setActiveTab('details')}
         >
-          Details
         </button>
         <button
           className={`modal-tab ${activeTab === 'files' ? 'active' : ''}`}
           onClick={() => setActiveTab('files')}
         >
-          Files
         </button>
         {(papers.length > 0 || papersLoading) && (
           <button
             className={`modal-tab ${activeTab === 'papers' ? 'active' : ''}`}
             onClick={() => setActiveTab('papers')}
           >
-            Papers {papers.length > 0 && `(${papers.length})`}
           </button>
         )}
       </div>
@@ -283,7 +291,7 @@ export default function ModelModal({
             <div className="section-title">Parent Model</div>
             <div className="section-content">
               <a
-                href={`https://huggingface.co/${model.parent_model}`}
                 target="_blank"
                 rel="noopener noreferrer"
                 className="model-link"
 
  * Enhanced with bookmark, comparison, similar models, and file tree features.
  */
 import React, { useState, useEffect } from 'react';
+import { ModelPoint } from '../../types';
 import FileTree from './FileTree';
+import { getHuggingFaceUrl } from '../../utils/api/hfUrl';
+import { API_BASE } from '../../config/api';
 import './ModelModal.css';
 
 interface ArxivPaper {
   arxiv_id: string;
   title: string;
 
   if (!isOpen || !model) return null;
 
+  const hfUrl = getHuggingFaceUrl(model.model_id);
 
   // Parse tags if it's a string representation of an array
   const parseTags = (tags: string | null | undefined): string[] => {
 
   return (
     <div className="modal-overlay" onClick={onClose}>
+      <div
+        className="modal-content"
+        onClick={(e) => e.stopPropagation()}
+        data-tab={activeTab}
+      >
       <div className="modal-header">
         <h2>{model.model_id}</h2>
         <button className="modal-close" onClick={onClose}>Close</button>
 
           className={`modal-tab ${activeTab === 'details' ? 'active' : ''}`}
           onClick={() => setActiveTab('details')}
         >
+          <span className="tab-icon">📋</span>
+          <span>Details</span>
         </button>
         <button
           className={`modal-tab ${activeTab === 'files' ? 'active' : ''}`}
           onClick={() => setActiveTab('files')}
         >
+          <span className="tab-icon">📁</span>
+          <span>Files</span>
         </button>
         {(papers.length > 0 || papersLoading) && (
           <button
             className={`modal-tab ${activeTab === 'papers' ? 'active' : ''}`}
             onClick={() => setActiveTab('papers')}
           >
+            <span className="tab-icon">📄</span>
+            <span>Papers</span>
+            {papers.length > 0 && <span className="tab-badge">{papers.length}</span>}
           </button>
         )}
       </div>
 
             <div className="section-title">Parent Model</div>
             <div className="section-content">
               <a
+                href={getHuggingFaceUrl(model.parent_model)}
                 target="_blank"
                 rel="noopener noreferrer"
                 className="model-link"
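
`getHuggingFaceUrl` and `getHuggingFaceFileTreeUrl` replace the inline `https://huggingface.co/...` template strings that previously lived in each component. The helper module (`utils/api/hfUrl`) is not part of this diff, so the following is only a plausible shape inferred from the call sites:

```typescript
// Hypothetical sketch of frontend/src/utils/api/hfUrl.ts;
// the real implementation is not shown in this commit.
export function getHuggingFaceUrl(modelId: string): string {
  return `https://huggingface.co/${modelId}`;
}

export function getHuggingFaceFileTreeUrl(modelId: string, branch = 'main'): string {
  return `https://huggingface.co/${modelId}/tree/${encodeURIComponent(branch)}`;
}
```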
frontend/src/components/{ColorLegend.css β†’ ui/ColorLegend.css} RENAMED
File without changes
frontend/src/components/{ColorLegend.tsx β†’ ui/ColorLegend.tsx} RENAMED
@@ -3,7 +3,7 @@
  * Shows color mappings for categorical and continuous data.
  */
 import React from 'react';
-import { getCategoricalColorMap, getContinuousColorScale } from '../utils/colors';
 import './ColorLegend.css';
 
 interface ColorLegendProps {
 
  * Shows color mappings for categorical and continuous data.
  */
 import React from 'react';
+import { getCategoricalColorMap, getContinuousColorScale } from '../../utils/rendering/colors';
 import './ColorLegend.css';
 
 interface ColorLegendProps {
frontend/src/components/{ErrorBoundary.tsx β†’ ui/ErrorBoundary.tsx} RENAMED
File without changes