Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,7 @@ Last Modified: 06/11/2025
|
|
| 12 |
|
| 13 |
from pathlib import Path
|
| 14 |
import sys
|
| 15 |
-
# from llama_cpp import Llama
|
| 16 |
import streamlit as st
|
| 17 |
import pandas as pd
|
| 18 |
import numpy as np
|
|
@@ -21,24 +21,28 @@ import os
|
|
| 21 |
import nltk
|
| 22 |
import json
|
| 23 |
|
| 24 |
-
#
|
|
|
|
|
|
|
| 25 |
|
| 26 |
NLTK_DATA_DIR = "/usr/local/share/nltk_data"
|
| 27 |
if NLTK_DATA_DIR not in nltk.data.path:
|
| 28 |
nltk.data.path.append(NLTK_DATA_DIR)
|
| 29 |
|
|
|
|
| 30 |
for resource in ("punkt_tab", "punkt"):
|
| 31 |
try:
|
| 32 |
nltk.data.find(f"tokenizers/{resource}")
|
| 33 |
except LookupError:
|
| 34 |
-
# Best-effort fallback; if HF blocks downloads, we still have the ones from the Docker build
|
| 35 |
try:
|
| 36 |
nltk.download(resource, download_dir=NLTK_DATA_DIR)
|
| 37 |
except Exception as e:
|
| 38 |
print(f"Could not download NLTK resource {resource}: {e}")
|
| 39 |
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
|
| 42 |
try:
|
| 43 |
from mosaic.path_utils import CFG, raw_path, proc_path, eval_path, project_root # type: ignore
|
| 44 |
except Exception:
|
|
@@ -49,12 +53,12 @@ except Exception:
|
|
| 49 |
|
| 50 |
# Defaults: app-local data/ eval/ that are safe on Cloud
|
| 51 |
_DATA_ROOT = _env("MOSAIC_DATA", str(Path(__file__).parent / "data"))
|
| 52 |
-
_BOX_ROOT
|
| 53 |
_EVAL_ROOT = _env("MOSAIC_EVAL", str(Path(__file__).parent / "eval"))
|
| 54 |
|
| 55 |
CFG = {
|
| 56 |
"data_root": str(_DATA_ROOT),
|
| 57 |
-
"box_root":
|
| 58 |
"eval_root": str(_EVAL_ROOT),
|
| 59 |
}
|
| 60 |
|
|
@@ -70,10 +74,11 @@ except Exception:
|
|
| 70 |
def eval_path(*parts: str) -> Path:
|
| 71 |
return _EVAL_ROOT.joinpath(*parts)
|
| 72 |
|
|
|
|
| 73 |
# BERTopic stack
|
| 74 |
from bertopic import BERTopic
|
| 75 |
-
# from bertopic.representation import LlamaCPP
|
| 76 |
-
# from llama_cpp import Llama
|
| 77 |
from sentence_transformers import SentenceTransformer
|
| 78 |
|
| 79 |
# Clustering/dimensionality reduction
|
|
@@ -86,8 +91,6 @@ import datamapplot
|
|
| 86 |
import matplotlib.pyplot as plt
|
| 87 |
from huggingface_hub import hf_hub_download
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
# =====================================================================
|
| 92 |
# 0. Constants & Helper Functions
|
| 93 |
# =====================================================================
|
|
@@ -99,6 +102,7 @@ def _slugify(s: str) -> str:
|
|
| 99 |
return s or "DATASET"
|
| 100 |
|
| 101 |
|
|
|
|
| 102 |
ACCEPTABLE_TEXT_COLUMNS = [
|
| 103 |
"reflection_answer_english",
|
| 104 |
"reflection_answer",
|
|
@@ -106,14 +110,25 @@ ACCEPTABLE_TEXT_COLUMNS = [
|
|
| 106 |
"report",
|
| 107 |
]
|
| 108 |
|
|
|
|
| 109 |
def _pick_text_column(df: pd.DataFrame) -> str | None:
|
| 110 |
-
"""Return the first matching text column."""
|
| 111 |
for col in ACCEPTABLE_TEXT_COLUMNS:
|
| 112 |
if col in df.columns:
|
| 113 |
return col
|
| 114 |
return None
|
| 115 |
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
def _set_from_env_or_secrets(key: str):
|
| 118 |
"""Allow hosting: value can come from environment or from Streamlit secrets."""
|
| 119 |
if os.getenv(key):
|
|
@@ -125,44 +140,49 @@ def _set_from_env_or_secrets(key: str):
|
|
| 125 |
if val:
|
| 126 |
os.environ[key] = str(val)
|
| 127 |
|
|
|
|
| 128 |
# Enable both MOSAIC_DATA and MOSAIC_BOX automatically
|
| 129 |
for _k in ("MOSAIC_DATA", "MOSAIC_BOX"):
|
| 130 |
_set_from_env_or_secrets(_k)
|
| 131 |
|
| 132 |
|
| 133 |
-
|
| 134 |
@st.cache_data
|
| 135 |
-
def count_clean_reports(csv_path: str) -> int:
|
|
|
|
| 136 |
df = pd.read_csv(csv_path)
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
if col is None:
|
| 139 |
return 0
|
|
|
|
| 140 |
if col != "reflection_answer_english":
|
| 141 |
df = df.rename(columns={col: "reflection_answer_english"})
|
|
|
|
| 142 |
df.dropna(subset=["reflection_answer_english"], inplace=True)
|
| 143 |
df["reflection_answer_english"] = df["reflection_answer_english"].astype(str)
|
| 144 |
df = df[df["reflection_answer_english"].str.strip() != ""]
|
| 145 |
return len(df)
|
| 146 |
|
| 147 |
|
| 148 |
-
# --- THIS CONFLICTING FUNCTION IS NOW REMOVED ---
|
| 149 |
-
# def ensure_sentence_tokenizer():
|
| 150 |
-
# ...
|
| 151 |
-
|
| 152 |
-
|
| 153 |
# =====================================================================
|
| 154 |
# 1. Streamlit app setup
|
| 155 |
# =====================================================================
|
| 156 |
|
| 157 |
st.set_page_config(page_title="MOSAIC Dashboard", layout="wide")
|
| 158 |
-
st.title(
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
# add if use, please cite the following paper:
|
| 161 |
st.markdown(
|
| 162 |
"""
|
| 163 |
_If you use this tool in your research, please cite the following paper:_\n
|
| 164 |
-
**Beauté, R., et al. (2025).**
|
| 165 |
-
**Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): Topic Modelling and LLM applied to Stroboscopic Phenomenology**
|
| 166 |
https://arxiv.org/abs/2502.18318
|
| 167 |
"""
|
| 168 |
)
|
|
@@ -171,12 +191,12 @@ st.markdown(
|
|
| 171 |
# 2. Dataset paths (using MOSAIC structure)
|
| 172 |
# =====================================================================
|
| 173 |
|
| 174 |
-
|
| 175 |
-
|
|
|
|
| 176 |
DATASET_DIR = _slugify(ds_input).upper()
|
| 177 |
|
| 178 |
-
|
| 179 |
-
RAW_DIR = raw_path(DATASET_DIR)
|
| 180 |
PROC_DIR = proc_path(DATASET_DIR, "preprocessed")
|
| 181 |
EVAL_DIR = eval_path(DATASET_DIR)
|
| 182 |
CACHE_DIR = PROC_DIR / "cache"
|
|
@@ -185,7 +205,6 @@ PROC_DIR.mkdir(parents=True, exist_ok=True)
|
|
| 185 |
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 186 |
EVAL_DIR.mkdir(parents=True, exist_ok=True)
|
| 187 |
|
| 188 |
-
|
| 189 |
with st.sidebar.expander("About the dataset name", expanded=False):
|
| 190 |
st.markdown(
|
| 191 |
f"""
|
|
@@ -202,46 +221,33 @@ with st.sidebar.expander("About the dataset name", expanded=False):
|
|
| 202 |
def _list_server_csvs(proc_dir: Path) -> list[str]:
|
| 203 |
return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
|
| 204 |
|
| 205 |
-
# will be populated later (after CSV_PATH is known)
|
| 206 |
-
DATASETS = None # keep name for clarity; we’ll fill it when rendering the sidebar
|
| 207 |
-
|
| 208 |
|
|
|
|
| 209 |
HISTORY_FILE = str(PROC_DIR / "run_history.json")
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
# =====================================================================
|
| 214 |
# 3. Embedding & LLM loaders
|
| 215 |
# =====================================================================
|
| 216 |
|
|
|
|
| 217 |
@st.cache_resource
|
| 218 |
def load_embedding_model(model_name):
|
| 219 |
st.info(f"Loading embedding model '{model_name}'...")
|
| 220 |
return SentenceTransformer(model_name)
|
| 221 |
|
| 222 |
|
| 223 |
-
# @st.cache_resource
|
| 224 |
-
# def load_llm_model():
|
| 225 |
-
# """Loads LlamaCPP quantised model for topic labeling."""
|
| 226 |
-
# model_repo = "NousResearch/Meta-Llama-3-8B-Instruct-GGUF"
|
| 227 |
-
# model_file = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
|
| 228 |
-
# model_path = hf_hub_download(repo_id=model_repo, filename=model_file)
|
| 229 |
-
# return Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=8192,
|
| 230 |
-
# stop=["Q:", "\n"], verbose=False)
|
| 231 |
-
|
| 232 |
-
|
| 233 |
@st.cache_data
|
| 234 |
def load_precomputed_data(docs_file, embeddings_file):
|
| 235 |
docs = np.load(docs_file, allow_pickle=True).tolist()
|
| 236 |
-
emb
|
| 237 |
return docs, emb
|
| 238 |
|
| 239 |
|
| 240 |
-
|
| 241 |
# =====================================================================
|
| 242 |
# 4. Topic modeling function
|
| 243 |
# =====================================================================
|
| 244 |
|
|
|
|
| 245 |
def get_config_hash(cfg):
|
| 246 |
return json.dumps(cfg, sort_keys=True)
|
| 247 |
|
|
@@ -256,38 +262,45 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
|
|
| 256 |
try:
|
| 257 |
_embeddings = np.vstack(_embeddings)
|
| 258 |
except Exception:
|
| 259 |
-
st.error(
|
| 260 |
-
|
|
|
|
|
|
|
| 261 |
st.stop()
|
| 262 |
_embeddings = np.ascontiguousarray(_embeddings, dtype=np.float32)
|
| 263 |
|
| 264 |
if _embeddings.shape[0] != len(_docs):
|
| 265 |
-
st.error(
|
| 266 |
-
|
| 267 |
-
|
|
|
|
|
|
|
| 268 |
st.stop()
|
| 269 |
|
| 270 |
config = json.loads(config_hash)
|
| 271 |
|
| 272 |
-
# Prepare vectorizer parameters
|
| 273 |
if "ngram_range" in config["vectorizer_params"]:
|
| 274 |
-
config["vectorizer_params"]["ngram_range"] = tuple(
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
**config["umap_params"]
|
| 282 |
-
)
|
| 283 |
hdbscan_model = HDBSCAN(
|
| 284 |
-
metric="euclidean", prediction_data=True,
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
)
|
| 287 |
-
vectorizer_model = CountVectorizer(**config["vectorizer_params"]) if config["use_vectorizer"] else None
|
| 288 |
|
| 289 |
-
nr_topics_val =
|
| 290 |
-
|
|
|
|
|
|
|
|
|
|
| 291 |
|
| 292 |
topic_model = BERTopic(
|
| 293 |
umap_model=umap_model,
|
|
@@ -296,7 +309,7 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
|
|
| 296 |
representation_model=rep_model,
|
| 297 |
top_n_words=config["bt_params"]["top_n_words"],
|
| 298 |
nr_topics=nr_topics_val,
|
| 299 |
-
verbose=False
|
| 300 |
)
|
| 301 |
|
| 302 |
topics, _ = topic_model.fit_transform(_docs, _embeddings)
|
|
@@ -304,33 +317,39 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
|
|
| 304 |
|
| 305 |
outlier_pct = 0
|
| 306 |
if -1 in info.Topic.values:
|
| 307 |
-
outlier_pct = (
|
|
|
|
|
|
|
| 308 |
|
| 309 |
-
# <-- MODIFIED: Use default topic names instead of LLM labels
|
| 310 |
-
# Get the default keyword-based names generated by BERTopic
|
| 311 |
topic_info = topic_model.get_topic_info()
|
| 312 |
-
|
| 313 |
-
name_map = topic_info.set_index('Topic')['Name'].to_dict()
|
| 314 |
-
# Map each document's topic_id to its name
|
| 315 |
all_labels = [name_map[topic] for topic in topics]
|
| 316 |
|
| 317 |
-
|
| 318 |
reduced = UMAP(
|
| 319 |
-
n_neighbors=15,
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
| 321 |
).fit_transform(_embeddings)
|
| 322 |
|
| 323 |
-
return topic_model, reduced, all_labels, len(info)-1, outlier_pct
|
| 324 |
-
|
| 325 |
|
| 326 |
|
| 327 |
# =====================================================================
|
| 328 |
# 5. CSV → documents → embeddings pipeline
|
| 329 |
# =====================================================================
|
| 330 |
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
# ---------------------
|
| 336 |
# Load & clean CSV
|
|
@@ -338,9 +357,13 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
|
|
| 338 |
st.info(f"Reading and preparing CSV: {csv_path}")
|
| 339 |
df = pd.read_csv(csv_path)
|
| 340 |
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
if col is None:
|
| 343 |
-
st.error("CSV must contain
|
| 344 |
return
|
| 345 |
|
| 346 |
if col != "reflection_answer_english":
|
|
@@ -354,7 +377,6 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
|
|
| 354 |
# ---------------------
|
| 355 |
# Sentence / report granularity
|
| 356 |
# ---------------------
|
| 357 |
-
|
| 358 |
if split_sentences:
|
| 359 |
try:
|
| 360 |
sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
|
|
@@ -365,14 +387,15 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
|
|
| 365 |
else:
|
| 366 |
docs = reports
|
| 367 |
|
| 368 |
-
|
| 369 |
np.save(docs_file, np.array(docs, dtype=object))
|
| 370 |
st.success(f"Prepared {len(docs)} documents")
|
| 371 |
|
| 372 |
# ---------------------
|
| 373 |
# Embeddings
|
| 374 |
# ---------------------
|
| 375 |
-
st.info(
|
|
|
|
|
|
|
| 376 |
|
| 377 |
model = load_embedding_model(selected_embedding_model)
|
| 378 |
|
|
@@ -387,7 +410,7 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
|
|
| 387 |
show_progress_bar=True,
|
| 388 |
batch_size=batch_size,
|
| 389 |
device=encode_device,
|
| 390 |
-
convert_to_numpy=True
|
| 391 |
)
|
| 392 |
embeddings = np.asarray(embeddings, dtype=np.float32)
|
| 393 |
np.save(emb_file, embeddings)
|
|
@@ -397,12 +420,10 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
|
|
| 397 |
st.rerun()
|
| 398 |
|
| 399 |
|
| 400 |
-
|
| 401 |
# =====================================================================
|
| 402 |
# 6. Sidebar — dataset, upload, parameters
|
| 403 |
# =====================================================================
|
| 404 |
|
| 405 |
-
# --- User CSV upload / server dataset ---
|
| 406 |
st.sidebar.header("Data Input Method")
|
| 407 |
|
| 408 |
source = st.sidebar.radio(
|
|
@@ -416,24 +437,36 @@ uploaded_csv_path = None
|
|
| 416 |
CSV_PATH = None # will be set in the chosen branch
|
| 417 |
|
| 418 |
if source == "Use preprocessed CSV on server":
|
| 419 |
-
# List preprocessed CSVs inside this dataset’s folder
|
| 420 |
available = _list_server_csvs(PROC_DIR)
|
| 421 |
if not available:
|
| 422 |
-
st.info(
|
|
|
|
|
|
|
| 423 |
st.stop()
|
| 424 |
-
selected_csv = st.sidebar.selectbox(
|
|
|
|
|
|
|
| 425 |
CSV_PATH = selected_csv
|
| 426 |
else:
|
| 427 |
-
up = st.sidebar.file_uploader(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
if up is not None:
|
| 429 |
tmp_df = pd.read_csv(up)
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
st.error("CSV must contain a text column such as: " + ", ".join(ACCEPTABLE_TEXT_COLUMNS))
|
| 433 |
st.stop()
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
# Save into THIS dataset’s preprocessed folder
|
| 437 |
uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
|
| 438 |
tmp_df.to_csv(uploaded_csv_path, index=False)
|
| 439 |
st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
|
|
@@ -442,35 +475,77 @@ else:
|
|
| 442 |
st.info("Upload a CSV to continue.")
|
| 443 |
st.stop()
|
| 444 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
|
| 448 |
-
# --- Subsample ---
|
| 449 |
st.sidebar.subheader("Data Granularity & Subsampling")
|
| 450 |
|
| 451 |
-
selected_granularity = st.sidebar.checkbox(
|
|
|
|
|
|
|
| 452 |
granularity_label = "sentences" if selected_granularity else "reports"
|
| 453 |
|
| 454 |
subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
|
| 455 |
|
| 456 |
-
# line break
|
| 457 |
st.sidebar.markdown("---")
|
| 458 |
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
-
|
| 461 |
-
# --- Embedding model selection ---
|
| 462 |
st.sidebar.header("Model Selection")
|
| 463 |
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
|
|
|
|
|
|
| 471 |
|
| 472 |
-
# --- Device selection ---
|
| 473 |
-
# st.sidebar.header("Data Preparation")
|
| 474 |
selected_device = st.sidebar.radio(
|
| 475 |
"Processing device",
|
| 476 |
["GPU (MPS)", "CPU"],
|
|
@@ -481,88 +556,106 @@ selected_device = st.sidebar.radio(
|
|
| 481 |
# 7. Precompute filenames and pipeline triggers
|
| 482 |
# =====================================================================
|
| 483 |
|
| 484 |
-
|
|
|
|
| 485 |
base = os.path.splitext(os.path.basename(csv_path))[0]
|
| 486 |
safe_model = re.sub(r"[^a-zA-Z0-9_-]", "_", model_name)
|
| 487 |
suf = "sentences" if split_sentences else "reports"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
return (
|
| 489 |
-
str(CACHE_DIR / f"precomputed_{base}_{suf}_docs.npy"),
|
| 490 |
-
str(
|
|
|
|
|
|
|
|
|
|
| 491 |
)
|
| 492 |
|
|
|
|
| 493 |
DOCS_FILE, EMBEDDINGS_FILE = get_precomputed_filenames(
|
| 494 |
-
CSV_PATH, selected_embedding_model, selected_granularity
|
| 495 |
)
|
| 496 |
|
| 497 |
-
# --- Cache management
|
| 498 |
st.sidebar.markdown("### Cache")
|
| 499 |
-
if st.sidebar.button(
|
|
|
|
|
|
|
| 500 |
try:
|
| 501 |
for p in (DOCS_FILE, EMBEDDINGS_FILE):
|
| 502 |
if os.path.exists(p):
|
| 503 |
os.remove(p)
|
| 504 |
-
# also clear Streamlit caches tied to these functions
|
| 505 |
try:
|
| 506 |
-
load_precomputed_data.clear()
|
| 507 |
except Exception:
|
| 508 |
pass
|
| 509 |
try:
|
| 510 |
-
perform_topic_modeling.clear()
|
| 511 |
except Exception:
|
| 512 |
pass
|
| 513 |
|
| 514 |
-
st.success(
|
|
|
|
|
|
|
| 515 |
st.rerun()
|
| 516 |
except Exception as e:
|
| 517 |
st.error(f"Failed to delete cache files: {e}")
|
| 518 |
|
| 519 |
-
#add line break
|
| 520 |
st.sidebar.markdown("---")
|
| 521 |
|
| 522 |
-
|
| 523 |
-
|
| 524 |
# =====================================================================
|
| 525 |
# 8. Prepare Data OR Run Analysis
|
| 526 |
# =====================================================================
|
| 527 |
|
| 528 |
if not os.path.exists(EMBEDDINGS_FILE):
|
| 529 |
-
st.warning(
|
| 530 |
-
|
|
|
|
|
|
|
|
|
|
| 531 |
if st.button("Prepare Data for This Configuration"):
|
| 532 |
generate_and_save_embeddings(
|
| 533 |
-
CSV_PATH,
|
| 534 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
)
|
| 536 |
|
| 537 |
else:
|
| 538 |
# Load cached data
|
| 539 |
docs, embeddings = load_precomputed_data(DOCS_FILE, EMBEDDINGS_FILE)
|
| 540 |
|
| 541 |
-
# Coerce to 2-D float array even if saved as object
|
| 542 |
embeddings = np.asarray(embeddings)
|
| 543 |
if embeddings.dtype == object or embeddings.ndim != 2:
|
| 544 |
try:
|
| 545 |
embeddings = np.vstack(embeddings).astype(np.float32)
|
| 546 |
except Exception:
|
| 547 |
-
st.error(
|
|
|
|
|
|
|
| 548 |
st.stop()
|
| 549 |
|
| 550 |
-
# Subsample
|
| 551 |
if subsample_perc < 100:
|
| 552 |
n = int(len(docs) * (subsample_perc / 100))
|
| 553 |
idx = np.random.choice(len(docs), size=n, replace=False)
|
| 554 |
docs = [docs[i] for i in idx]
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
|
| 560 |
-
#
|
| 561 |
-
# --- Dataset summary metrics ---
|
| 562 |
st.subheader("Dataset summary")
|
| 563 |
-
n_reports = count_clean_reports(CSV_PATH)
|
| 564 |
unit = "sentences" if selected_granularity else "reports"
|
| 565 |
-
n_units = len(docs)
|
| 566 |
|
| 567 |
c1, c2 = st.columns(2)
|
| 568 |
c1.metric("Reports in CSV (cleaned)", n_reports)
|
|
@@ -577,7 +670,9 @@ else:
|
|
| 577 |
ng_min = st.slider("Min N-gram", 1, 5, 1)
|
| 578 |
ng_max = st.slider("Max N-gram", 1, 5, 2)
|
| 579 |
min_df = st.slider("Min Doc Freq", 1, 50, 1)
|
| 580 |
-
stopwords = st.select_slider(
|
|
|
|
|
|
|
| 581 |
|
| 582 |
with st.sidebar.expander("UMAP"):
|
| 583 |
um_n = st.slider("n_neighbors", 2, 50, 15)
|
|
@@ -589,10 +684,9 @@ else:
|
|
| 589 |
hm = st.slider("min_samples", 2, 100, 5)
|
| 590 |
|
| 591 |
with st.sidebar.expander("BERTopic"):
|
| 592 |
-
nr_topics
|
| 593 |
top_n_words = st.slider("top_n_words", 5, 25, 10)
|
| 594 |
|
| 595 |
-
# --- Build config ---
|
| 596 |
current_config = {
|
| 597 |
"embedding_model": selected_embedding_model,
|
| 598 |
"granularity": granularity_label,
|
|
@@ -616,12 +710,11 @@ else:
|
|
| 616 |
"nr_topics": nr_topics,
|
| 617 |
"top_n_words": top_n_words,
|
| 618 |
},
|
|
|
|
| 619 |
}
|
| 620 |
|
| 621 |
-
# --- Run Button ---
|
| 622 |
run_button = st.sidebar.button("Run Analysis", type="primary")
|
| 623 |
|
| 624 |
-
|
| 625 |
# =================================================================
|
| 626 |
# 9. Visualization & History Tabs
|
| 627 |
# =================================================================
|
|
@@ -635,13 +728,11 @@ else:
|
|
| 635 |
data = json.load(open(path))
|
| 636 |
except Exception:
|
| 637 |
return []
|
| 638 |
-
# --- migrate old keys for backward-compat ---
|
| 639 |
for e in data:
|
| 640 |
if "outlier_pct" not in e and "outlier_perc" in e:
|
| 641 |
e["outlier_pct"] = e.pop("outlier_perc")
|
| 642 |
return data
|
| 643 |
|
| 644 |
-
|
| 645 |
def save_history(h):
|
| 646 |
json.dump(h, open(HISTORY_FILE, "w"), indent=2)
|
| 647 |
|
|
@@ -649,7 +740,6 @@ else:
|
|
| 649 |
st.session_state.history = load_history()
|
| 650 |
|
| 651 |
if run_button:
|
| 652 |
-
|
| 653 |
if not isinstance(embeddings, np.ndarray):
|
| 654 |
embeddings = np.asarray(embeddings)
|
| 655 |
|
|
@@ -657,30 +747,33 @@ else:
|
|
| 657 |
try:
|
| 658 |
embeddings = np.vstack(embeddings).astype(np.float32)
|
| 659 |
except Exception:
|
| 660 |
-
st.error(
|
|
|
|
|
|
|
| 661 |
st.stop()
|
| 662 |
|
| 663 |
if embeddings.shape[0] != len(docs):
|
| 664 |
-
st.error(
|
| 665 |
-
|
| 666 |
-
|
|
|
|
|
|
|
| 667 |
st.stop()
|
| 668 |
|
| 669 |
-
|
| 670 |
with st.spinner("Performing topic modeling..."):
|
| 671 |
model, reduced, labels, n_topics, outlier_pct = perform_topic_modeling(
|
| 672 |
docs, embeddings, get_config_hash(current_config)
|
| 673 |
)
|
| 674 |
st.session_state.latest_results = (model, reduced, labels)
|
| 675 |
|
| 676 |
-
# Save in history
|
| 677 |
entry = {
|
| 678 |
"timestamp": str(pd.Timestamp.now()),
|
| 679 |
"config": current_config,
|
| 680 |
"num_topics": n_topics,
|
| 681 |
"outlier_pct": f"{outlier_pct:.2f}%",
|
| 682 |
-
"llm_labels": [
|
| 683 |
-
name
|
|
|
|
| 684 |
if ("Unlabelled" not in name and "outlier" not in name)
|
| 685 |
],
|
| 686 |
}
|
|
@@ -700,65 +793,63 @@ else:
|
|
| 700 |
st.subheader("Topic Info")
|
| 701 |
st.dataframe(tm.get_topic_info())
|
| 702 |
|
| 703 |
-
|
| 704 |
-
# --- Export: one row per topic (topic_id, LLM topic_name, texts) ---
|
| 705 |
st.subheader("Export results (one row per topic)")
|
| 706 |
|
| 707 |
-
# 1) Pull LLM labels directly from BERTopic's representation
|
| 708 |
full_reps = tm.get_topics(full=True)
|
| 709 |
-
llm_reps = full_reps.get("LLM", {})
|
| 710 |
|
| 711 |
-
# Build topic_id -> LLM label map; fall back to Name if missing
|
| 712 |
llm_names = {}
|
| 713 |
for tid, vals in llm_reps.items():
|
| 714 |
try:
|
| 715 |
-
llm_names[tid] = (
|
|
|
|
|
|
|
| 716 |
except Exception:
|
| 717 |
llm_names[tid] = "Unlabelled"
|
| 718 |
-
|
| 719 |
-
# <-- MODIFIED: This fallback logic is now the main logic
|
| 720 |
if not llm_names:
|
| 721 |
-
# Fallback: whatever BERTopic put in Name
|
| 722 |
st.caption("Note: Using default keyword-based topic names.")
|
| 723 |
-
llm_names =
|
|
|
|
|
|
|
| 724 |
|
| 725 |
-
# 2) Per-document assignments for current docs
|
| 726 |
doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
|
| 727 |
|
| 728 |
-
|
| 729 |
-
|
|
|
|
| 730 |
if not include_outliers:
|
| 731 |
doc_info = doc_info[doc_info["Topic"] != -1]
|
| 732 |
|
| 733 |
-
# 4) Group texts by topic
|
| 734 |
grouped = (
|
| 735 |
doc_info.groupby("Topic")["Document"]
|
| 736 |
.apply(list)
|
| 737 |
.reset_index(name="texts")
|
| 738 |
)
|
|
|
|
|
|
|
|
|
|
| 739 |
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
|
| 748 |
-
|
| 749 |
-
# Fixed separator (no textbox)
|
| 750 |
-
# SEP = " || " # change if you prefer another fixed separator
|
| 751 |
-
SEP = "\n" # change if you prefer another fixed separator
|
| 752 |
|
| 753 |
-
# Flatten lists for CSV
|
| 754 |
export_csv = export_topics.copy()
|
| 755 |
-
export_csv["texts"] = export_csv["texts"].apply(
|
|
|
|
|
|
|
| 756 |
|
| 757 |
base = os.path.splitext(os.path.basename(CSV_PATH))[0]
|
| 758 |
gran = "sentences" if selected_granularity else "reports"
|
| 759 |
-
csv_name
|
| 760 |
jsonl_name = f"topics_{base}_{gran}.jsonl"
|
| 761 |
-
csv_path
|
| 762 |
jsonl_path = (EVAL_DIR / jsonl_name).resolve()
|
| 763 |
|
| 764 |
cL, cC, cR = st.columns(3)
|
|
@@ -772,7 +863,6 @@ else:
|
|
| 772 |
st.error(f"Failed to save CSV: {e}")
|
| 773 |
|
| 774 |
with cC:
|
| 775 |
-
# JSONL preserves the list structure of texts
|
| 776 |
if st.button("Save JSONL to eval/", use_container_width=True):
|
| 777 |
try:
|
| 778 |
with open(jsonl_path, "w", encoding="utf-8") as f:
|
|
@@ -782,7 +872,9 @@ else:
|
|
| 782 |
"topic_name": row["topic_name"],
|
| 783 |
"texts": list(map(str, row["texts"])),
|
| 784 |
}
|
| 785 |
-
f.write(
|
|
|
|
|
|
|
| 786 |
st.success(f"Saved JSONL → {jsonl_path}")
|
| 787 |
except Exception as e:
|
| 788 |
st.error(f"Failed to save JSONL: {e}")
|
|
@@ -799,11 +891,6 @@ else:
|
|
| 799 |
st.caption("Preview (one row per topic)")
|
| 800 |
st.dataframe(export_csv.head(10))
|
| 801 |
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
else:
|
| 808 |
st.info("Click 'Run Analysis' to begin.")
|
| 809 |
|
|
@@ -816,8 +903,10 @@ else:
|
|
| 816 |
for i, entry in enumerate(st.session_state.history):
|
| 817 |
with st.expander(f"Run {i+1} — {entry['timestamp']}"):
|
| 818 |
st.write(f"**Topics:** {entry['num_topics']}")
|
| 819 |
-
st.write(
|
|
|
|
|
|
|
| 820 |
st.write("**Topic Labels (default keywords):**")
|
| 821 |
st.write(entry["llm_labels"])
|
| 822 |
with st.expander("Show full configuration"):
|
| 823 |
-
st.json(entry["config"])
|
|
|
|
| 12 |
|
| 13 |
from pathlib import Path
|
| 14 |
import sys
|
| 15 |
+
# from llama_cpp import Llama # <-- REMOVED
|
| 16 |
import streamlit as st
|
| 17 |
import pandas as pd
|
| 18 |
import numpy as np
|
|
|
|
| 21 |
import nltk
|
| 22 |
import json
|
| 23 |
|
| 24 |
+
# =====================================================================
|
| 25 |
+
# NLTK setup
|
| 26 |
+
# =====================================================================
|
| 27 |
|
| 28 |
NLTK_DATA_DIR = "/usr/local/share/nltk_data"
|
| 29 |
if NLTK_DATA_DIR not in nltk.data.path:
|
| 30 |
nltk.data.path.append(NLTK_DATA_DIR)
|
| 31 |
|
| 32 |
+
# Try to ensure both punkt_tab (new NLTK) and punkt (old NLTK) are available
|
| 33 |
for resource in ("punkt_tab", "punkt"):
|
| 34 |
try:
|
| 35 |
nltk.data.find(f"tokenizers/{resource}")
|
| 36 |
except LookupError:
|
|
|
|
| 37 |
try:
|
| 38 |
nltk.download(resource, download_dir=NLTK_DATA_DIR)
|
| 39 |
except Exception as e:
|
| 40 |
print(f"Could not download NLTK resource {resource}: {e}")
|
| 41 |
|
| 42 |
+
# =====================================================================
|
| 43 |
+
# Path utils (MOSAIC or fallback)
|
| 44 |
+
# =====================================================================
|
| 45 |
|
|
|
|
| 46 |
try:
|
| 47 |
from mosaic.path_utils import CFG, raw_path, proc_path, eval_path, project_root # type: ignore
|
| 48 |
except Exception:
|
|
|
|
| 53 |
|
| 54 |
# Defaults: app-local data/ eval/ that are safe on Cloud
|
| 55 |
_DATA_ROOT = _env("MOSAIC_DATA", str(Path(__file__).parent / "data"))
|
| 56 |
+
_BOX_ROOT = _env("MOSAIC_BOX", str(Path(__file__).parent / "data" / "raw"))
|
| 57 |
_EVAL_ROOT = _env("MOSAIC_EVAL", str(Path(__file__).parent / "eval"))
|
| 58 |
|
| 59 |
CFG = {
|
| 60 |
"data_root": str(_DATA_ROOT),
|
| 61 |
+
"box_root": str(_BOX_ROOT),
|
| 62 |
"eval_root": str(_EVAL_ROOT),
|
| 63 |
}
|
| 64 |
|
|
|
|
| 74 |
def eval_path(*parts: str) -> Path:
|
| 75 |
return _EVAL_ROOT.joinpath(*parts)
|
| 76 |
|
| 77 |
+
|
| 78 |
# BERTopic stack
|
| 79 |
from bertopic import BERTopic
|
| 80 |
+
# from bertopic.representation import LlamaCPP # <-- REMOVED
|
| 81 |
+
# from llama_cpp import Llama # <-- REMOVED
|
| 82 |
from sentence_transformers import SentenceTransformer
|
| 83 |
|
| 84 |
# Clustering/dimensionality reduction
|
|
|
|
| 91 |
import matplotlib.pyplot as plt
|
| 92 |
from huggingface_hub import hf_hub_download
|
| 93 |
|
|
|
|
|
|
|
| 94 |
# =====================================================================
|
| 95 |
# 0. Constants & Helper Functions
|
| 96 |
# =====================================================================
|
|
|
|
| 102 |
return s or "DATASET"
|
| 103 |
|
| 104 |
|
| 105 |
+
# "Nice" default names we know from MOSAIC; NOT a hard constraint anymore
|
| 106 |
ACCEPTABLE_TEXT_COLUMNS = [
|
| 107 |
"reflection_answer_english",
|
| 108 |
"reflection_answer",
|
|
|
|
| 110 |
"report",
|
| 111 |
]
|
| 112 |
|
| 113 |
+
|
| 114 |
def _pick_text_column(df: pd.DataFrame) -> str | None:
|
| 115 |
+
"""Return the first matching *preferred* text column name if present."""
|
| 116 |
for col in ACCEPTABLE_TEXT_COLUMNS:
|
| 117 |
if col in df.columns:
|
| 118 |
return col
|
| 119 |
return None
|
| 120 |
|
| 121 |
|
| 122 |
+
def _list_text_columns(df: pd.DataFrame) -> list[str]:
|
| 123 |
+
"""Return all columns that look text-like (object / string dtype)."""
|
| 124 |
+
text_cols: list[str] = []
|
| 125 |
+
for c in df.columns:
|
| 126 |
+
s = df[c]
|
| 127 |
+
if s.dtype == object or pd.api.types.is_string_dtype(s):
|
| 128 |
+
text_cols.append(c)
|
| 129 |
+
return text_cols
|
| 130 |
+
|
| 131 |
+
|
| 132 |
def _set_from_env_or_secrets(key: str):
|
| 133 |
"""Allow hosting: value can come from environment or from Streamlit secrets."""
|
| 134 |
if os.getenv(key):
|
|
|
|
| 140 |
if val:
|
| 141 |
os.environ[key] = str(val)
|
| 142 |
|
| 143 |
+
|
| 144 |
# Enable both MOSAIC_DATA and MOSAIC_BOX automatically
|
| 145 |
for _k in ("MOSAIC_DATA", "MOSAIC_BOX"):
|
| 146 |
_set_from_env_or_secrets(_k)
|
| 147 |
|
| 148 |
|
|
|
|
| 149 |
@st.cache_data
|
| 150 |
+
def count_clean_reports(csv_path: str, text_col: str | None = None) -> int:
|
| 151 |
+
"""Count non-empty reports in the chosen text column."""
|
| 152 |
df = pd.read_csv(csv_path)
|
| 153 |
+
|
| 154 |
+
if text_col is not None and text_col in df.columns:
|
| 155 |
+
col = text_col
|
| 156 |
+
else:
|
| 157 |
+
col = _pick_text_column(df)
|
| 158 |
+
|
| 159 |
if col is None:
|
| 160 |
return 0
|
| 161 |
+
|
| 162 |
if col != "reflection_answer_english":
|
| 163 |
df = df.rename(columns={col: "reflection_answer_english"})
|
| 164 |
+
|
| 165 |
df.dropna(subset=["reflection_answer_english"], inplace=True)
|
| 166 |
df["reflection_answer_english"] = df["reflection_answer_english"].astype(str)
|
| 167 |
df = df[df["reflection_answer_english"].str.strip() != ""]
|
| 168 |
return len(df)
|
| 169 |
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
# =====================================================================
|
| 172 |
# 1. Streamlit app setup
|
| 173 |
# =====================================================================
|
| 174 |
|
| 175 |
st.set_page_config(page_title="MOSAIC Dashboard", layout="wide")
|
| 176 |
+
st.title(
|
| 177 |
+
"Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): "
|
| 178 |
+
"Topic Modelling Dashboard for Phenomenological Reports"
|
| 179 |
+
)
|
| 180 |
|
|
|
|
| 181 |
st.markdown(
|
| 182 |
"""
|
| 183 |
_If you use this tool in your research, please cite the following paper:_\n
|
| 184 |
+
**Beauté, R., et al. (2025).**
|
| 185 |
+
**Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): Topic Modelling and LLM applied to Stroboscopic Phenomenology**
|
| 186 |
https://arxiv.org/abs/2502.18318
|
| 187 |
"""
|
| 188 |
)
|
|
|
|
| 191 |
# 2. Dataset paths (using MOSAIC structure)
|
| 192 |
# =====================================================================
|
| 193 |
|
| 194 |
+
ds_input = st.sidebar.text_input(
|
| 195 |
+
"Project/Dataset name", value="MOSAIC", key="dataset_name_input"
|
| 196 |
+
)
|
| 197 |
DATASET_DIR = _slugify(ds_input).upper()
|
| 198 |
|
| 199 |
+
RAW_DIR = raw_path(DATASET_DIR)
|
|
|
|
| 200 |
PROC_DIR = proc_path(DATASET_DIR, "preprocessed")
|
| 201 |
EVAL_DIR = eval_path(DATASET_DIR)
|
| 202 |
CACHE_DIR = PROC_DIR / "cache"
|
|
|
|
| 205 |
CACHE_DIR.mkdir(parents=True, exist_ok=True)
|
| 206 |
EVAL_DIR.mkdir(parents=True, exist_ok=True)
|
| 207 |
|
|
|
|
| 208 |
with st.sidebar.expander("About the dataset name", expanded=False):
|
| 209 |
st.markdown(
|
| 210 |
f"""
|
|
|
|
| 221 |
def _list_server_csvs(proc_dir: Path) -> list[str]:
|
| 222 |
return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
|
| 223 |
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
+
DATASETS = None # keep name for clarity; we’ll fill it when rendering the sidebar
|
| 226 |
HISTORY_FILE = str(PROC_DIR / "run_history.json")
|
| 227 |
|
|
|
|
|
|
|
| 228 |
# =====================================================================
|
| 229 |
# 3. Embedding & LLM loaders
|
| 230 |
# =====================================================================
|
| 231 |
|
| 232 |
+
|
| 233 |
@st.cache_resource
|
| 234 |
def load_embedding_model(model_name):
|
| 235 |
st.info(f"Loading embedding model '{model_name}'...")
|
| 236 |
return SentenceTransformer(model_name)
|
| 237 |
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
@st.cache_data
|
| 240 |
def load_precomputed_data(docs_file, embeddings_file):
|
| 241 |
docs = np.load(docs_file, allow_pickle=True).tolist()
|
| 242 |
+
emb = np.load(embeddings_file, allow_pickle=True)
|
| 243 |
return docs, emb
|
| 244 |
|
| 245 |
|
|
|
|
| 246 |
# =====================================================================
|
| 247 |
# 4. Topic modeling function
|
| 248 |
# =====================================================================
|
| 249 |
|
| 250 |
+
|
| 251 |
def get_config_hash(cfg):
|
| 252 |
return json.dumps(cfg, sort_keys=True)
|
| 253 |
|
|
|
|
| 262 |
try:
|
| 263 |
_embeddings = np.vstack(_embeddings)
|
| 264 |
except Exception:
|
| 265 |
+
st.error(
|
| 266 |
+
f"Embeddings are invalid (dtype={_embeddings.dtype}, ndim={_embeddings.ndim}). "
|
| 267 |
+
"Please click **Prepare Data** to regenerate."
|
| 268 |
+
)
|
| 269 |
st.stop()
|
| 270 |
_embeddings = np.ascontiguousarray(_embeddings, dtype=np.float32)
|
| 271 |
|
| 272 |
if _embeddings.shape[0] != len(_docs):
|
| 273 |
+
st.error(
|
| 274 |
+
f"Mismatch between docs and embeddings: len(docs)={len(_docs)} vs "
|
| 275 |
+
f"embeddings.shape[0]={_embeddings.shape[0]}. "
|
| 276 |
+
"Delete the cached files for this configuration and regenerate."
|
| 277 |
+
)
|
| 278 |
st.stop()
|
| 279 |
|
| 280 |
config = json.loads(config_hash)
|
| 281 |
|
|
|
|
| 282 |
if "ngram_range" in config["vectorizer_params"]:
|
| 283 |
+
config["vectorizer_params"]["ngram_range"] = tuple(
|
| 284 |
+
config["vectorizer_params"]["ngram_range"]
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
rep_model = None # <-- Use BERTopic defaults for representation
|
| 288 |
+
|
| 289 |
+
umap_model = UMAP(random_state=42, metric="cosine", **config["umap_params"])
|
|
|
|
|
|
|
| 290 |
hdbscan_model = HDBSCAN(
|
| 291 |
+
metric="euclidean", prediction_data=True, **config["hdbscan_params"]
|
| 292 |
+
)
|
| 293 |
+
vectorizer_model = (
|
| 294 |
+
CountVectorizer(**config["vectorizer_params"])
|
| 295 |
+
if config["use_vectorizer"]
|
| 296 |
+
else None
|
| 297 |
)
|
|
|
|
| 298 |
|
| 299 |
+
nr_topics_val = (
|
| 300 |
+
None
|
| 301 |
+
if config["bt_params"]["nr_topics"] == "auto"
|
| 302 |
+
else int(config["bt_params"]["nr_topics"])
|
| 303 |
+
)
|
| 304 |
|
| 305 |
topic_model = BERTopic(
|
| 306 |
umap_model=umap_model,
|
|
|
|
| 309 |
representation_model=rep_model,
|
| 310 |
top_n_words=config["bt_params"]["top_n_words"],
|
| 311 |
nr_topics=nr_topics_val,
|
| 312 |
+
verbose=False,
|
| 313 |
)
|
| 314 |
|
| 315 |
topics, _ = topic_model.fit_transform(_docs, _embeddings)
|
|
|
|
| 317 |
|
| 318 |
outlier_pct = 0
|
| 319 |
if -1 in info.Topic.values:
|
| 320 |
+
outlier_pct = (
|
| 321 |
+
info.Count[info.Topic == -1].iloc[0] / info.Count.sum()
|
| 322 |
+
) * 100
|
| 323 |
|
|
|
|
|
|
|
| 324 |
topic_info = topic_model.get_topic_info()
|
| 325 |
+
name_map = topic_info.set_index("Topic")["Name"].to_dict()
|
|
|
|
|
|
|
| 326 |
all_labels = [name_map[topic] for topic in topics]
|
| 327 |
|
|
|
|
| 328 |
reduced = UMAP(
|
| 329 |
+
n_neighbors=15,
|
| 330 |
+
n_components=2,
|
| 331 |
+
min_dist=0.0,
|
| 332 |
+
metric="cosine",
|
| 333 |
+
random_state=42,
|
| 334 |
).fit_transform(_embeddings)
|
| 335 |
|
| 336 |
+
return topic_model, reduced, all_labels, len(info) - 1, outlier_pct
|
|
|
|
| 337 |
|
| 338 |
|
| 339 |
# =====================================================================
|
| 340 |
# 5. CSV → documents → embeddings pipeline
|
| 341 |
# =====================================================================
|
| 342 |
|
| 343 |
+
|
| 344 |
+
def generate_and_save_embeddings(
|
| 345 |
+
csv_path,
|
| 346 |
+
docs_file,
|
| 347 |
+
emb_file,
|
| 348 |
+
selected_embedding_model,
|
| 349 |
+
split_sentences,
|
| 350 |
+
device,
|
| 351 |
+
text_col=None,
|
| 352 |
+
):
|
| 353 |
|
| 354 |
# ---------------------
|
| 355 |
# Load & clean CSV
|
|
|
|
| 357 |
st.info(f"Reading and preparing CSV: {csv_path}")
|
| 358 |
df = pd.read_csv(csv_path)
|
| 359 |
|
| 360 |
+
if text_col is not None and text_col in df.columns:
|
| 361 |
+
col = text_col
|
| 362 |
+
else:
|
| 363 |
+
col = _pick_text_column(df)
|
| 364 |
+
|
| 365 |
if col is None:
|
| 366 |
+
st.error("CSV must contain at least one text column.")
|
| 367 |
return
|
| 368 |
|
| 369 |
if col != "reflection_answer_english":
|
|
|
|
| 377 |
# ---------------------
|
| 378 |
# Sentence / report granularity
|
| 379 |
# ---------------------
|
|
|
|
| 380 |
if split_sentences:
|
| 381 |
try:
|
| 382 |
sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
|
|
|
|
| 387 |
else:
|
| 388 |
docs = reports
|
| 389 |
|
|
|
|
| 390 |
np.save(docs_file, np.array(docs, dtype=object))
|
| 391 |
st.success(f"Prepared {len(docs)} documents")
|
| 392 |
|
| 393 |
# ---------------------
|
| 394 |
# Embeddings
|
| 395 |
# ---------------------
|
| 396 |
+
st.info(
|
| 397 |
+
f"Encoding {len(docs)} documents with {selected_embedding_model} on {device}"
|
| 398 |
+
)
|
| 399 |
|
| 400 |
model = load_embedding_model(selected_embedding_model)
|
| 401 |
|
|
|
|
| 410 |
show_progress_bar=True,
|
| 411 |
batch_size=batch_size,
|
| 412 |
device=encode_device,
|
| 413 |
+
convert_to_numpy=True,
|
| 414 |
)
|
| 415 |
embeddings = np.asarray(embeddings, dtype=np.float32)
|
| 416 |
np.save(emb_file, embeddings)
|
|
|
|
| 420 |
st.rerun()
|
| 421 |
|
| 422 |
|
|
|
|
| 423 |
# =====================================================================
|
| 424 |
# 6. Sidebar — dataset, upload, parameters
|
| 425 |
# =====================================================================
|
| 426 |
|
|
|
|
| 427 |
st.sidebar.header("Data Input Method")
|
| 428 |
|
| 429 |
source = st.sidebar.radio(
|
|
|
|
| 437 |
CSV_PATH = None # will be set in the chosen branch
|
| 438 |
|
| 439 |
if source == "Use preprocessed CSV on server":
|
|
|
|
| 440 |
available = _list_server_csvs(PROC_DIR)
|
| 441 |
if not available:
|
| 442 |
+
st.info(
|
| 443 |
+
f"No CSVs found in {PROC_DIR}. Switch to 'Upload my own CSV' or change the dataset name."
|
| 444 |
+
)
|
| 445 |
st.stop()
|
| 446 |
+
selected_csv = st.sidebar.selectbox(
|
| 447 |
+
"Choose a preprocessed CSV", available, key="server_csv_select"
|
| 448 |
+
)
|
| 449 |
CSV_PATH = selected_csv
|
| 450 |
else:
|
| 451 |
+
up = st.sidebar.file_uploader(
|
| 452 |
+
"Upload a CSV", type=["csv"], key="upload_csv"
|
| 453 |
+
)
|
| 454 |
+
|
| 455 |
+
st.sidebar.caption(
|
| 456 |
+
"Your CSV should have **one row per report** and at least one text column "
|
| 457 |
+
"(for example `reflection_answer_english`, `reflection_answer`, `text`, `report`, "
|
| 458 |
+
"or any other column containing free text). "
|
| 459 |
+
"Other columns (ID, condition, etc.) are allowed. "
|
| 460 |
+
"After upload, you’ll be able to choose which text column to analyse."
|
| 461 |
+
)
|
| 462 |
+
|
| 463 |
if up is not None:
|
| 464 |
tmp_df = pd.read_csv(up)
|
| 465 |
+
if tmp_df.empty:
|
| 466 |
+
st.error("Uploaded CSV is empty.")
|
|
|
|
| 467 |
st.stop()
|
| 468 |
+
|
| 469 |
+
# Just save; we’ll choose the text column later
|
|
|
|
| 470 |
uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
|
| 471 |
tmp_df.to_csv(uploaded_csv_path, index=False)
|
| 472 |
st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
|
|
|
|
| 475 |
st.info("Upload a CSV to continue.")
|
| 476 |
st.stop()
|
| 477 |
|
| 478 |
+
if CSV_PATH is None:
|
| 479 |
+
st.stop()
|
| 480 |
+
|
| 481 |
+
# ---------------------------------------------------------------------
|
| 482 |
+
# Text column selection
|
| 483 |
+
# ---------------------------------------------------------------------
|
| 484 |
|
| 485 |
|
| 486 |
+
@st.cache_data
|
| 487 |
+
def get_text_columns(csv_path: str) -> list[str]:
|
| 488 |
+
df_sample = pd.read_csv(csv_path, nrows=2000)
|
| 489 |
+
return _list_text_columns(df_sample)
|
| 490 |
+
|
| 491 |
+
|
| 492 |
+
text_columns = get_text_columns(CSV_PATH)
|
| 493 |
+
if not text_columns:
|
| 494 |
+
st.error(
|
| 495 |
+
"No text-like columns found in this CSV. At least one column must contain text."
|
| 496 |
+
)
|
| 497 |
+
st.stop()
|
| 498 |
+
|
| 499 |
+
# Try to pick a nice default (one of the MOSAIC-ish names) if present
|
| 500 |
+
try:
|
| 501 |
+
df_sample = pd.read_csv(CSV_PATH, nrows=2000)
|
| 502 |
+
preferred = _pick_text_column(df_sample)
|
| 503 |
+
except Exception:
|
| 504 |
+
preferred = None
|
| 505 |
+
|
| 506 |
+
if preferred in text_columns:
|
| 507 |
+
default_idx = text_columns.index(preferred)
|
| 508 |
+
else:
|
| 509 |
+
default_idx = 0
|
| 510 |
+
|
| 511 |
+
selected_text_column = st.sidebar.selectbox(
|
| 512 |
+
"Text column to analyse",
|
| 513 |
+
text_columns,
|
| 514 |
+
index=default_idx,
|
| 515 |
+
key="text_column_select",
|
| 516 |
+
)
|
| 517 |
+
|
| 518 |
+
# ---------------------------------------------------------------------
|
| 519 |
+
# Data granularity & subsampling
|
| 520 |
+
# ---------------------------------------------------------------------
|
| 521 |
|
|
|
|
| 522 |
st.sidebar.subheader("Data Granularity & Subsampling")
|
| 523 |
|
| 524 |
+
selected_granularity = st.sidebar.checkbox(
|
| 525 |
+
"Split reports into sentences", value=True
|
| 526 |
+
)
|
| 527 |
granularity_label = "sentences" if selected_granularity else "reports"
|
| 528 |
|
| 529 |
subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
|
| 530 |
|
|
|
|
| 531 |
st.sidebar.markdown("---")
|
| 532 |
|
| 533 |
+
# ---------------------------------------------------------------------
|
| 534 |
+
# Embedding model & device
|
| 535 |
+
# ---------------------------------------------------------------------
|
| 536 |
|
|
|
|
|
|
|
| 537 |
st.sidebar.header("Model Selection")
|
| 538 |
|
| 539 |
+
selected_embedding_model = st.sidebar.selectbox(
|
| 540 |
+
"Choose an embedding model",
|
| 541 |
+
(
|
| 542 |
+
"BAAI/bge-small-en-v1.5",
|
| 543 |
+
"intfloat/multilingual-e5-large-instruct",
|
| 544 |
+
"Qwen/Qwen3-Embedding-0.6B",
|
| 545 |
+
"sentence-transformers/all-mpnet-base-v2",
|
| 546 |
+
),
|
| 547 |
+
)
|
| 548 |
|
|
|
|
|
|
|
| 549 |
selected_device = st.sidebar.radio(
|
| 550 |
"Processing device",
|
| 551 |
["GPU (MPS)", "CPU"],
|
|
|
|
| 556 |
# 7. Precompute filenames and pipeline triggers
|
| 557 |
# =====================================================================
|
| 558 |
|
| 559 |
+
|
| 560 |
+
def get_precomputed_filenames(csv_path, model_name, split_sentences, text_col):
|
| 561 |
base = os.path.splitext(os.path.basename(csv_path))[0]
|
| 562 |
safe_model = re.sub(r"[^a-zA-Z0-9_-]", "_", model_name)
|
| 563 |
suf = "sentences" if split_sentences else "reports"
|
| 564 |
+
|
| 565 |
+
col_suffix = ""
|
| 566 |
+
if text_col:
|
| 567 |
+
safe_col = re.sub(r"[^a-zA-Z0-9_-]", "_", text_col)
|
| 568 |
+
col_suffix = f"_{safe_col}"
|
| 569 |
+
|
| 570 |
return (
|
| 571 |
+
str(CACHE_DIR / f"precomputed_{base}{col_suffix}_{suf}_docs.npy"),
|
| 572 |
+
str(
|
| 573 |
+
CACHE_DIR
|
| 574 |
+
/ f"precomputed_{base}_{safe_model}{col_suffix}_{suf}_embeddings.npy"
|
| 575 |
+
),
|
| 576 |
)
|
| 577 |
|
| 578 |
+
|
| 579 |
DOCS_FILE, EMBEDDINGS_FILE = get_precomputed_filenames(
|
| 580 |
+
CSV_PATH, selected_embedding_model, selected_granularity, selected_text_column
|
| 581 |
)
|
| 582 |
|
| 583 |
+
# --- Cache management ---
|
| 584 |
st.sidebar.markdown("### Cache")
|
| 585 |
+
if st.sidebar.button(
|
| 586 |
+
"Clear cached files for this configuration", use_container_width=True
|
| 587 |
+
):
|
| 588 |
try:
|
| 589 |
for p in (DOCS_FILE, EMBEDDINGS_FILE):
|
| 590 |
if os.path.exists(p):
|
| 591 |
os.remove(p)
|
|
|
|
| 592 |
try:
|
| 593 |
+
load_precomputed_data.clear()
|
| 594 |
except Exception:
|
| 595 |
pass
|
| 596 |
try:
|
| 597 |
+
perform_topic_modeling.clear()
|
| 598 |
except Exception:
|
| 599 |
pass
|
| 600 |
|
| 601 |
+
st.success(
|
| 602 |
+
"Deleted cached docs/embeddings and cleared caches. Click **Prepare Data** again."
|
| 603 |
+
)
|
| 604 |
st.rerun()
|
| 605 |
except Exception as e:
|
| 606 |
st.error(f"Failed to delete cache files: {e}")
|
| 607 |
|
|
|
|
| 608 |
st.sidebar.markdown("---")
|
| 609 |
|
|
|
|
|
|
|
| 610 |
# =====================================================================
|
| 611 |
# 8. Prepare Data OR Run Analysis
|
| 612 |
# =====================================================================
|
| 613 |
|
| 614 |
if not os.path.exists(EMBEDDINGS_FILE):
|
| 615 |
+
st.warning(
|
| 616 |
+
f"No precomputed embeddings found for this configuration "
|
| 617 |
+
f"({granularity_label} / {selected_embedding_model} / column '{selected_text_column}')."
|
| 618 |
+
)
|
| 619 |
+
|
| 620 |
if st.button("Prepare Data for This Configuration"):
|
| 621 |
generate_and_save_embeddings(
|
| 622 |
+
CSV_PATH,
|
| 623 |
+
DOCS_FILE,
|
| 624 |
+
EMBEDDINGS_FILE,
|
| 625 |
+
selected_embedding_model,
|
| 626 |
+
selected_granularity,
|
| 627 |
+
selected_device,
|
| 628 |
+
text_col=selected_text_column,
|
| 629 |
)
|
| 630 |
|
| 631 |
else:
|
| 632 |
# Load cached data
|
| 633 |
docs, embeddings = load_precomputed_data(DOCS_FILE, EMBEDDINGS_FILE)
|
| 634 |
|
|
|
|
| 635 |
embeddings = np.asarray(embeddings)
|
| 636 |
if embeddings.dtype == object or embeddings.ndim != 2:
|
| 637 |
try:
|
| 638 |
embeddings = np.vstack(embeddings).astype(np.float32)
|
| 639 |
except Exception:
|
| 640 |
+
st.error(
|
| 641 |
+
"Cached embeddings are invalid. Please regenerate them for this configuration."
|
| 642 |
+
)
|
| 643 |
st.stop()
|
| 644 |
|
|
|
|
| 645 |
if subsample_perc < 100:
|
| 646 |
n = int(len(docs) * (subsample_perc / 100))
|
| 647 |
idx = np.random.choice(len(docs), size=n, replace=False)
|
| 648 |
docs = [docs[i] for i in idx]
|
| 649 |
+
embeddings = np.asarray(embeddings)[idx, :]
|
| 650 |
+
st.warning(
|
| 651 |
+
f"Running analysis on {subsample_perc}% subsample ({len(docs)} documents)"
|
| 652 |
+
)
|
| 653 |
|
| 654 |
+
# Dataset summary
|
|
|
|
| 655 |
st.subheader("Dataset summary")
|
| 656 |
+
n_reports = count_clean_reports(CSV_PATH, selected_text_column)
|
| 657 |
unit = "sentences" if selected_granularity else "reports"
|
| 658 |
+
n_units = len(docs)
|
| 659 |
|
| 660 |
c1, c2 = st.columns(2)
|
| 661 |
c1.metric("Reports in CSV (cleaned)", n_reports)
|
|
|
|
| 670 |
ng_min = st.slider("Min N-gram", 1, 5, 1)
|
| 671 |
ng_max = st.slider("Max N-gram", 1, 5, 2)
|
| 672 |
min_df = st.slider("Min Doc Freq", 1, 50, 1)
|
| 673 |
+
stopwords = st.select_slider(
|
| 674 |
+
"Stopwords", options=[None, "english"], value=None
|
| 675 |
+
)
|
| 676 |
|
| 677 |
with st.sidebar.expander("UMAP"):
|
| 678 |
um_n = st.slider("n_neighbors", 2, 50, 15)
|
|
|
|
| 684 |
hm = st.slider("min_samples", 2, 100, 5)
|
| 685 |
|
| 686 |
with st.sidebar.expander("BERTopic"):
|
| 687 |
+
nr_topics = st.text_input("nr_topics", value="auto")
|
| 688 |
top_n_words = st.slider("top_n_words", 5, 25, 10)
|
| 689 |
|
|
|
|
| 690 |
current_config = {
|
| 691 |
"embedding_model": selected_embedding_model,
|
| 692 |
"granularity": granularity_label,
|
|
|
|
| 710 |
"nr_topics": nr_topics,
|
| 711 |
"top_n_words": top_n_words,
|
| 712 |
},
|
| 713 |
+
"text_column": selected_text_column,
|
| 714 |
}
|
| 715 |
|
|
|
|
| 716 |
run_button = st.sidebar.button("Run Analysis", type="primary")
|
| 717 |
|
|
|
|
| 718 |
# =================================================================
|
| 719 |
# 9. Visualization & History Tabs
|
| 720 |
# =================================================================
|
|
|
|
| 728 |
data = json.load(open(path))
|
| 729 |
except Exception:
|
| 730 |
return []
|
|
|
|
| 731 |
for e in data:
|
| 732 |
if "outlier_pct" not in e and "outlier_perc" in e:
|
| 733 |
e["outlier_pct"] = e.pop("outlier_perc")
|
| 734 |
return data
|
| 735 |
|
|
|
|
| 736 |
def save_history(h):
|
| 737 |
json.dump(h, open(HISTORY_FILE, "w"), indent=2)
|
| 738 |
|
|
|
|
| 740 |
st.session_state.history = load_history()
|
| 741 |
|
| 742 |
if run_button:
|
|
|
|
| 743 |
if not isinstance(embeddings, np.ndarray):
|
| 744 |
embeddings = np.asarray(embeddings)
|
| 745 |
|
|
|
|
| 747 |
try:
|
| 748 |
embeddings = np.vstack(embeddings).astype(np.float32)
|
| 749 |
except Exception:
|
| 750 |
+
st.error(
|
| 751 |
+
"Cached embeddings are invalid (object/ragged). Click **Prepare Data** to regenerate."
|
| 752 |
+
)
|
| 753 |
st.stop()
|
| 754 |
|
| 755 |
if embeddings.shape[0] != len(docs):
|
| 756 |
+
st.error(
|
| 757 |
+
f"len(docs)={len(docs)} but embeddings.shape[0]={embeddings.shape[0]}.\n"
|
| 758 |
+
"Likely stale cache (e.g., switched sentences↔reports or model). "
|
| 759 |
+
"Use the **Clear cache** button below and regenerate."
|
| 760 |
+
)
|
| 761 |
st.stop()
|
| 762 |
|
|
|
|
| 763 |
with st.spinner("Performing topic modeling..."):
|
| 764 |
model, reduced, labels, n_topics, outlier_pct = perform_topic_modeling(
|
| 765 |
docs, embeddings, get_config_hash(current_config)
|
| 766 |
)
|
| 767 |
st.session_state.latest_results = (model, reduced, labels)
|
| 768 |
|
|
|
|
| 769 |
entry = {
|
| 770 |
"timestamp": str(pd.Timestamp.now()),
|
| 771 |
"config": current_config,
|
| 772 |
"num_topics": n_topics,
|
| 773 |
"outlier_pct": f"{outlier_pct:.2f}%",
|
| 774 |
+
"llm_labels": [
|
| 775 |
+
name
|
| 776 |
+
for name in model.get_topic_info().Name.values
|
| 777 |
if ("Unlabelled" not in name and "outlier" not in name)
|
| 778 |
],
|
| 779 |
}
|
|
|
|
| 793 |
st.subheader("Topic Info")
|
| 794 |
st.dataframe(tm.get_topic_info())
|
| 795 |
|
|
|
|
|
|
|
| 796 |
st.subheader("Export results (one row per topic)")
|
| 797 |
|
|
|
|
| 798 |
full_reps = tm.get_topics(full=True)
|
| 799 |
+
llm_reps = full_reps.get("LLM", {})
|
| 800 |
|
|
|
|
| 801 |
llm_names = {}
|
| 802 |
for tid, vals in llm_reps.items():
|
| 803 |
try:
|
| 804 |
+
llm_names[tid] = (
|
| 805 |
+
(vals[0][0] or "").strip().strip('"').strip(".")
|
| 806 |
+
)
|
| 807 |
except Exception:
|
| 808 |
llm_names[tid] = "Unlabelled"
|
| 809 |
+
|
|
|
|
| 810 |
if not llm_names:
|
|
|
|
| 811 |
st.caption("Note: Using default keyword-based topic names.")
|
| 812 |
+
llm_names = (
|
| 813 |
+
tm.get_topic_info().set_index("Topic")["Name"].to_dict()
|
| 814 |
+
)
|
| 815 |
|
|
|
|
| 816 |
doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
|
| 817 |
|
| 818 |
+
include_outliers = st.checkbox(
|
| 819 |
+
"Include outlier topic (-1)", value=False
|
| 820 |
+
)
|
| 821 |
if not include_outliers:
|
| 822 |
doc_info = doc_info[doc_info["Topic"] != -1]
|
| 823 |
|
|
|
|
| 824 |
grouped = (
|
| 825 |
doc_info.groupby("Topic")["Document"]
|
| 826 |
.apply(list)
|
| 827 |
.reset_index(name="texts")
|
| 828 |
)
|
| 829 |
+
grouped["topic_name"] = grouped["Topic"].map(llm_names).fillna(
|
| 830 |
+
"Unlabelled"
|
| 831 |
+
)
|
| 832 |
|
| 833 |
+
export_topics = (
|
| 834 |
+
grouped.rename(columns={"Topic": "topic_id"})[
|
| 835 |
+
["topic_id", "topic_name", "texts"]
|
| 836 |
+
]
|
| 837 |
+
.sort_values("topic_id")
|
| 838 |
+
.reset_index(drop=True)
|
| 839 |
+
)
|
| 840 |
|
| 841 |
+
SEP = "\n"
|
|
|
|
|
|
|
|
|
|
| 842 |
|
|
|
|
| 843 |
export_csv = export_topics.copy()
|
| 844 |
+
export_csv["texts"] = export_csv["texts"].apply(
|
| 845 |
+
lambda lst: SEP.join(map(str, lst))
|
| 846 |
+
)
|
| 847 |
|
| 848 |
base = os.path.splitext(os.path.basename(CSV_PATH))[0]
|
| 849 |
gran = "sentences" if selected_granularity else "reports"
|
| 850 |
+
csv_name = f"topics_{base}_{gran}.csv"
|
| 851 |
jsonl_name = f"topics_{base}_{gran}.jsonl"
|
| 852 |
+
csv_path = (EVAL_DIR / csv_name).resolve()
|
| 853 |
jsonl_path = (EVAL_DIR / jsonl_name).resolve()
|
| 854 |
|
| 855 |
cL, cC, cR = st.columns(3)
|
|
|
|
| 863 |
st.error(f"Failed to save CSV: {e}")
|
| 864 |
|
| 865 |
with cC:
|
|
|
|
| 866 |
if st.button("Save JSONL to eval/", use_container_width=True):
|
| 867 |
try:
|
| 868 |
with open(jsonl_path, "w", encoding="utf-8") as f:
|
|
|
|
| 872 |
"topic_name": row["topic_name"],
|
| 873 |
"texts": list(map(str, row["texts"])),
|
| 874 |
}
|
| 875 |
+
f.write(
|
| 876 |
+
json.dumps(rec, ensure_ascii=False) + "\n"
|
| 877 |
+
)
|
| 878 |
st.success(f"Saved JSONL → {jsonl_path}")
|
| 879 |
except Exception as e:
|
| 880 |
st.error(f"Failed to save JSONL: {e}")
|
|
|
|
| 891 |
st.caption("Preview (one row per topic)")
|
| 892 |
st.dataframe(export_csv.head(10))
|
| 893 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 894 |
else:
|
| 895 |
st.info("Click 'Run Analysis' to begin.")
|
| 896 |
|
|
|
|
| 903 |
for i, entry in enumerate(st.session_state.history):
|
| 904 |
with st.expander(f"Run {i+1} — {entry['timestamp']}"):
|
| 905 |
st.write(f"**Topics:** {entry['num_topics']}")
|
| 906 |
+
st.write(
|
| 907 |
+
f"**Outliers:** {entry.get('outlier_pct', entry.get('outlier_perc', 'N/A'))}"
|
| 908 |
+
)
|
| 909 |
st.write("**Topic Labels (default keywords):**")
|
| 910 |
st.write(entry["llm_labels"])
|
| 911 |
with st.expander("Show full configuration"):
|
| 912 |
+
st.json(entry["config"])
|