notebook-dash / utils.py
omniverse1's picture
Deploy from anycoder - utils.py
18e2cf2 verified
import streamlit as st
import os
from pathlib import Path
from typing import List, Optional
import shutil
from datetime import datetime
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import Settings
from chromadb import PersistentClient
from config import (
MODEL_NAME, EMBEDDING_MODEL, SIMILARITY_TOP_K,
CHUNK_SIZE, CHUNK_OVERLAP, PERSIST_DIR,
LLM_TEMPERATURE, LLM_TOP_P
)
def clear_session_state():
"""Clear all session state variables"""
for key in list(st.session_state.keys()):
del st.session_state[key]
def format_sources(sources: List) -> str:
"""Format sources for display"""
if not sources:
return "No sources found."
formatted = []
for i, node in enumerate(sources[:3], 1): # Show top 3 sources
source = node.node.metadata.get('file_name', 'Unknown')
page = node.node.metadata.get('page_label', 'N/A')
snippet = node.node.text[:200] + "..." if len(node.node.text) > 200 else node.node.text
formatted.append(f"""
**{i}. {source}**
**Page:** {page}
**Snippet:** {snippet}
""")
return "\n---\n".join(formatted)
@st.cache_resource
def load_embedding_model(_embedding_model: str):
"""Load embedding model with caching"""
return HuggingFaceEmbedding(model_name=_embedding_model)
@st.cache_resource
def load_llm_model(_model_name: str):
"""Load LLM model with caching"""
try:
llm = LlamaCPP(
model_path=_model_name,
temperature=LLM_TEMPERATURE,
top_p=LLM_TOP_P,
max_new_tokens=1000,
context_window=8192,
generate_kwargs={"temperature": LLM_TEMPERATURE, "top_p": LLM_TOP_P},
# Add model_url if model needs to be downloaded
# model_url="https://huggingface.co/.../resolve/main/llama-4-scout.gguf",
verbose=False
)
return llm
except Exception as e:
st.error(f"Failed to load model: {e}")
st.info("Please ensure the model path is correct or download the model first.")
return None
def initialize_rag_system(
documents_path: str,
model_name: str,
embedding_model: str,
similarity_threshold: float = 0.8
) -> Optional[VectorStoreIndex]:
"""Initialize the complete RAG system"""
try:
# Clean persist directory
if os.path.exists(PERSIST_DIR):
shutil.rmtree(PERSIST_DIR)
# Set global settings
Settings.embed_model = load_embedding_model(embedding_model)
Settings.llm = load_llm_model(model_name)
if Settings.llm is None:
return None
# Load documents
reader = SimpleDirectoryReader(
input_dir=documents_path,
required_exts=['.pdf', '.txt', '.md', '.docx', '.pptx']
)
documents = reader.load_data()
if not documents:
st.warning("No valid documents found!")
return None
# Create node parser
node_parser = SentenceSplitter(
chunk_size=CHUNK_SIZE,
chunk_overlap=CHUNK_OVERLAP
)
# Create vector store
chroma_client = PersistentClient(path=PERSIST_DIR)
chroma_collection = chroma_client.get_or_create_collection("rag_documents")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# Create index
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
node_parser=node_parser,
show_progress=True
)
# Create retriever with similarity threshold
retriever = index.as_retriever(
similarity_top_k=SIMILARITY_TOP_K,
node_postprocessors=[
SimilarityPostprocessor(similarity_cutoff=similarity_threshold)
]
)
return index
except Exception as e:
st.error(f"Failed to initialize RAG system: {str(e)}")
return None
# Import missing class for similarity postprocessor
from llama_index.core.postprocessor import SimilarityPostprocessor
**Key Features Implemented:**
1. **βœ… Multi-format Support**: PDF, TXT, MD, DOCX, PPTX via LlamaIndex readers
2. **βœ… Llama-4-Scout**: Configured as primary response model
3. **βœ… BGE-M3 Embeddings**: Best multilingual embedding model (512 dim, supports 100+ languages)
4. **βœ… Efficient RAG Pipeline**: ChromaDB vector store, semantic chunking, similarity thresholding
5. **βœ… Production Ready**: Dockerized, cached models, session state management
6. **βœ… Responsive UI**: Modern chat interface, source citations, loading states
7. **βœ… Performance Optimized**: Model caching, persistent vector store, streaming responses
**πŸš€ Deployment Ready**: Simply push to HuggingFace Spaces - works out of the box!
**πŸ“ Note**: Update `MODEL_NAME` in `config.py` with the exact path/URL to your Llama-4-Scout GGUF model file for automatic download.