romybeaute committed (verified)
Commit 76c91ce · 1 Parent(s): ba0ad54

Update app.py

Files changed (1)
  1. app.py +283 -194
app.py CHANGED
@@ -12,7 +12,7 @@ Last Modified: 06/11/2025
12
 
13
  from pathlib import Path
14
  import sys
15
- # from llama_cpp import Llama # <-- REMOVED
16
  import streamlit as st
17
  import pandas as pd
18
  import numpy as np
@@ -21,24 +21,28 @@ import os
21
  import nltk
22
  import json
23
 
24
- # from mosaic.path_utils import CFG, raw_path, proc_path, eval_path, project_root
 
 
25
 
26
  NLTK_DATA_DIR = "/usr/local/share/nltk_data"
27
  if NLTK_DATA_DIR not in nltk.data.path:
28
  nltk.data.path.append(NLTK_DATA_DIR)
29
 
 
30
  for resource in ("punkt_tab", "punkt"):
31
  try:
32
  nltk.data.find(f"tokenizers/{resource}")
33
  except LookupError:
34
- # Best-effort fallback; if HF blocks downloads, we still have the ones from the Docker build
35
  try:
36
  nltk.download(resource, download_dir=NLTK_DATA_DIR)
37
  except Exception as e:
38
  print(f"Could not download NLTK resource {resource}: {e}")
39
 
 
 
 
40
 
41
-
42
  try:
43
  from mosaic.path_utils import CFG, raw_path, proc_path, eval_path, project_root # type: ignore
44
  except Exception:
@@ -49,12 +53,12 @@ except Exception:
49
 
50
  # Defaults: app-local data/ eval/ that are safe on Cloud
51
  _DATA_ROOT = _env("MOSAIC_DATA", str(Path(__file__).parent / "data"))
52
- _BOX_ROOT = _env("MOSAIC_BOX", str(Path(__file__).parent / "data" / "raw"))
53
  _EVAL_ROOT = _env("MOSAIC_EVAL", str(Path(__file__).parent / "eval"))
54
 
55
  CFG = {
56
  "data_root": str(_DATA_ROOT),
57
- "box_root": str(_BOX_ROOT),
58
  "eval_root": str(_EVAL_ROOT),
59
  }
60
 
@@ -70,10 +74,11 @@ except Exception:
70
  def eval_path(*parts: str) -> Path:
71
  return _EVAL_ROOT.joinpath(*parts)
72
 
 
73
  # BERTopic stack
74
  from bertopic import BERTopic
75
- # from bertopic.representation import LlamaCPP # <-- REMOVED
76
- # from llama_cpp import Llama # <-- REMOVED
77
  from sentence_transformers import SentenceTransformer
78
 
79
  # Clustering/dimensionality reduction
@@ -86,8 +91,6 @@ import datamapplot
86
  import matplotlib.pyplot as plt
87
  from huggingface_hub import hf_hub_download
88
 
89
-
90
-
91
  # =====================================================================
92
  # 0. Constants & Helper Functions
93
  # =====================================================================
@@ -99,6 +102,7 @@ def _slugify(s: str) -> str:
99
  return s or "DATASET"
100
 
101
 
 
102
  ACCEPTABLE_TEXT_COLUMNS = [
103
  "reflection_answer_english",
104
  "reflection_answer",
@@ -106,14 +110,25 @@ ACCEPTABLE_TEXT_COLUMNS = [
106
  "report",
107
  ]
108
 
 
109
  def _pick_text_column(df: pd.DataFrame) -> str | None:
110
- """Return the first matching text column."""
111
  for col in ACCEPTABLE_TEXT_COLUMNS:
112
  if col in df.columns:
113
  return col
114
  return None
115
 
116
 
117
  def _set_from_env_or_secrets(key: str):
118
  """Allow hosting: value can come from environment or from Streamlit secrets."""
119
  if os.getenv(key):
@@ -125,44 +140,49 @@ def _set_from_env_or_secrets(key: str):
125
  if val:
126
  os.environ[key] = str(val)
127
 
 
128
  # Enable both MOSAIC_DATA and MOSAIC_BOX automatically
129
  for _k in ("MOSAIC_DATA", "MOSAIC_BOX"):
130
  _set_from_env_or_secrets(_k)
131
 
132
 
133
-
134
  @st.cache_data
135
- def count_clean_reports(csv_path: str) -> int:
 
136
  df = pd.read_csv(csv_path)
137
- col = _pick_text_column(df)
 
 
 
 
 
138
  if col is None:
139
  return 0
 
140
  if col != "reflection_answer_english":
141
  df = df.rename(columns={col: "reflection_answer_english"})
 
142
  df.dropna(subset=["reflection_answer_english"], inplace=True)
143
  df["reflection_answer_english"] = df["reflection_answer_english"].astype(str)
144
  df = df[df["reflection_answer_english"].str.strip() != ""]
145
  return len(df)
146
 
147
 
148
- # --- THIS CONFLICTING FUNCTION IS NOW REMOVED ---
149
- # def ensure_sentence_tokenizer():
150
- # ...
151
-
152
-
153
  # =====================================================================
154
  # 1. Streamlit app setup
155
  # =====================================================================
156
 
157
  st.set_page_config(page_title="MOSAIC Dashboard", layout="wide")
158
- st.title("Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): Topic Modelling Dashboard for Phenomenological Reports")
 
 
 
159
 
160
- # add if use, please cite the following paper:
161
  st.markdown(
162
  """
163
  _If you use this tool in your research, please cite the following paper:_\n
164
- **Beauté, R., et al. (2025).**
165
- **Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): Topic Modelling and LLM applied to Stroboscopic Phenomenology**
166
  https://arxiv.org/abs/2502.18318
167
  """
168
  )
@@ -171,12 +191,12 @@ st.markdown(
171
  # 2. Dataset paths (using MOSAIC structure)
172
  # =====================================================================
173
 
174
- # --- Choose dataset/project name (drives folder names) ---
175
- ds_input = st.sidebar.text_input("Project/Dataset name", value="MOSAIC", key="dataset_name_input")
 
176
  DATASET_DIR = _slugify(ds_input).upper()
177
 
178
-
179
- RAW_DIR = raw_path(DATASET_DIR)
180
  PROC_DIR = proc_path(DATASET_DIR, "preprocessed")
181
  EVAL_DIR = eval_path(DATASET_DIR)
182
  CACHE_DIR = PROC_DIR / "cache"
@@ -185,7 +205,6 @@ PROC_DIR.mkdir(parents=True, exist_ok=True)
185
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
186
  EVAL_DIR.mkdir(parents=True, exist_ok=True)
187
 
188
-
189
  with st.sidebar.expander("About the dataset name", expanded=False):
190
  st.markdown(
191
  f"""
@@ -202,46 +221,33 @@ with st.sidebar.expander("About the dataset name", expanded=False):
202
  def _list_server_csvs(proc_dir: Path) -> list[str]:
203
  return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
204
 
205
- # will be populated later (after CSV_PATH is known)
206
- DATASETS = None # keep name for clarity; we’ll fill it when rendering the sidebar
207
-
208
 
 
209
  HISTORY_FILE = str(PROC_DIR / "run_history.json")
210
 
211
-
212
-
213
  # =====================================================================
214
  # 3. Embedding & LLM loaders
215
  # =====================================================================
216
 
 
217
  @st.cache_resource
218
  def load_embedding_model(model_name):
219
  st.info(f"Loading embedding model '{model_name}'...")
220
  return SentenceTransformer(model_name)
221
 
222
 
223
- # @st.cache_resource
224
- # def load_llm_model():
225
- # """Loads LlamaCPP quantised model for topic labeling."""
226
- # model_repo = "NousResearch/Meta-Llama-3-8B-Instruct-GGUF"
227
- # model_file = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
228
- # model_path = hf_hub_download(repo_id=model_repo, filename=model_file)
229
- # return Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=8192,
230
- # stop=["Q:", "\n"], verbose=False)
231
-
232
-
233
  @st.cache_data
234
  def load_precomputed_data(docs_file, embeddings_file):
235
  docs = np.load(docs_file, allow_pickle=True).tolist()
236
- emb = np.load(embeddings_file, allow_pickle=True)
237
  return docs, emb
238
 
239
 
240
-
241
  # =====================================================================
242
  # 4. Topic modeling function
243
  # =====================================================================
244
 
 
245
  def get_config_hash(cfg):
246
  return json.dumps(cfg, sort_keys=True)
247
 
@@ -256,38 +262,45 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
256
  try:
257
  _embeddings = np.vstack(_embeddings)
258
  except Exception:
259
- st.error(f"Embeddings are invalid (dtype={_embeddings.dtype}, ndim={_embeddings.ndim}). "
260
- "Please click **Prepare Data** to regenerate.")
 
 
261
  st.stop()
262
  _embeddings = np.ascontiguousarray(_embeddings, dtype=np.float32)
263
 
264
  if _embeddings.shape[0] != len(_docs):
265
- st.error(f"Mismatch between docs and embeddings: len(docs)={len(_docs)} vs "
266
- f"embeddings.shape[0]={_embeddings.shape[0]}. "
267
- "Delete the cached files for this configuration and regenerate.")
 
 
268
  st.stop()
269
 
270
  config = json.loads(config_hash)
271
 
272
- # Prepare vectorizer parameters
273
  if "ngram_range" in config["vectorizer_params"]:
274
- config["vectorizer_params"]["ngram_range"] = tuple(config["vectorizer_params"]["ngram_range"])
275
-
276
- # <-- MODIFIED: Use BERTopic's default representation instead of LLM
277
- rep_model = None
278
-
279
- umap_model = UMAP(
280
- random_state=42, metric="cosine",
281
- **config["umap_params"]
282
- )
283
  hdbscan_model = HDBSCAN(
284
- metric="euclidean", prediction_data=True,
285
- **config["hdbscan_params"]
 
 
 
 
286
  )
287
- vectorizer_model = CountVectorizer(**config["vectorizer_params"]) if config["use_vectorizer"] else None
288
 
289
- nr_topics_val = None if config["bt_params"]["nr_topics"] == "auto" \
290
- else int(config["bt_params"]["nr_topics"])
 
 
 
291
 
292
  topic_model = BERTopic(
293
  umap_model=umap_model,
@@ -296,7 +309,7 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
296
  representation_model=rep_model,
297
  top_n_words=config["bt_params"]["top_n_words"],
298
  nr_topics=nr_topics_val,
299
- verbose=False
300
  )
301
 
302
  topics, _ = topic_model.fit_transform(_docs, _embeddings)
@@ -304,33 +317,39 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
304
 
305
  outlier_pct = 0
306
  if -1 in info.Topic.values:
307
- outlier_pct = (info.Count[info.Topic == -1].iloc[0] / info.Count.sum()) * 100
 
 
308
 
309
- # <-- MODIFIED: Use default topic names instead of LLM labels
310
- # Get the default keyword-based names generated by BERTopic
311
  topic_info = topic_model.get_topic_info()
312
- # Create a map of {topic_id: "topic_name", ...}
313
- name_map = topic_info.set_index('Topic')['Name'].to_dict()
314
- # Map each document's topic_id to its name
315
  all_labels = [name_map[topic] for topic in topics]
316
 
317
-
318
  reduced = UMAP(
319
- n_neighbors=15, n_components=2, min_dist=0.0,
320
- metric="cosine", random_state=42
 
 
 
321
  ).fit_transform(_embeddings)
322
 
323
- return topic_model, reduced, all_labels, len(info)-1, outlier_pct
324
-
325
 
326
 
327
  # =====================================================================
328
  # 5. CSV → documents → embeddings pipeline
329
  # =====================================================================
330
 
331
- def generate_and_save_embeddings(csv_path, docs_file, emb_file,
332
- selected_embedding_model,
333
- split_sentences, device):
334
 
335
  # ---------------------
336
  # Load & clean CSV
@@ -338,9 +357,13 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
338
  st.info(f"Reading and preparing CSV: {csv_path}")
339
  df = pd.read_csv(csv_path)
340
 
341
- col = _pick_text_column(df)
 
 
 
 
342
  if col is None:
343
- st.error("CSV must contain one of: " + ", ".join(ACCEPTABLE_TEXT_COLUMNS))
344
  return
345
 
346
  if col != "reflection_answer_english":
@@ -354,7 +377,6 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
354
  # ---------------------
355
  # Sentence / report granularity
356
  # ---------------------
357
-
358
  if split_sentences:
359
  try:
360
  sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
@@ -365,14 +387,15 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
365
  else:
366
  docs = reports
367
 
368
-
369
  np.save(docs_file, np.array(docs, dtype=object))
370
  st.success(f"Prepared {len(docs)} documents")
371
 
372
  # ---------------------
373
  # Embeddings
374
  # ---------------------
375
- st.info(f"Encoding {len(docs)} documents with {selected_embedding_model} on {device}")
 
 
376
 
377
  model = load_embedding_model(selected_embedding_model)
378
 
@@ -387,7 +410,7 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
387
  show_progress_bar=True,
388
  batch_size=batch_size,
389
  device=encode_device,
390
- convert_to_numpy=True
391
  )
392
  embeddings = np.asarray(embeddings, dtype=np.float32)
393
  np.save(emb_file, embeddings)
@@ -397,12 +420,10 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
397
  st.rerun()
398
 
399
 
400
-
401
  # =====================================================================
402
  # 6. Sidebar — dataset, upload, parameters
403
  # =====================================================================
404
 
405
- # --- User CSV upload / server dataset ---
406
  st.sidebar.header("Data Input Method")
407
 
408
  source = st.sidebar.radio(
@@ -416,24 +437,36 @@ uploaded_csv_path = None
416
  CSV_PATH = None # will be set in the chosen branch
417
 
418
  if source == "Use preprocessed CSV on server":
419
- # List preprocessed CSVs inside this dataset’s folder
420
  available = _list_server_csvs(PROC_DIR)
421
  if not available:
422
- st.info(f"No CSVs found in {PROC_DIR}. Switch to 'Upload my own CSV' or change the dataset name.")
 
 
423
  st.stop()
424
- selected_csv = st.sidebar.selectbox("Choose a preprocessed CSV", available, key="server_csv_select")
 
 
425
  CSV_PATH = selected_csv
426
  else:
427
- up = st.sidebar.file_uploader("Upload a CSV", type=["csv"], key="upload_csv")
 
428
  if up is not None:
429
  tmp_df = pd.read_csv(up)
430
- col = _pick_text_column(tmp_df)
431
- if col is None:
432
- st.error("CSV must contain a text column such as: " + ", ".join(ACCEPTABLE_TEXT_COLUMNS))
433
  st.stop()
434
- if col != "reflection_answer_english":
435
- tmp_df = tmp_df.rename(columns={col: "reflection_answer_english"})
436
- # Save into THIS dataset’s preprocessed folder
437
  uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
438
  tmp_df.to_csv(uploaded_csv_path, index=False)
439
  st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
@@ -442,35 +475,77 @@ else:
442
  st.info("Upload a CSV to continue.")
443
  st.stop()
444
 
 
445
 
446
 
447
 
448
- # --- Subsample ---
449
  st.sidebar.subheader("Data Granularity & Subsampling")
450
 
451
- selected_granularity = st.sidebar.checkbox("Split reports into sentences", value=True)
 
 
452
  granularity_label = "sentences" if selected_granularity else "reports"
453
 
454
  subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
455
 
456
- # line break
457
  st.sidebar.markdown("---")
458
 
 
 
 
459
 
460
-
461
- # --- Embedding model selection ---
462
  st.sidebar.header("Model Selection")
463
 
464
- # Default is now the small model to prevent OOM crash on start
465
- selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
466
- "BAAI/bge-small-en-v1.5",
467
- "intfloat/multilingual-e5-large-instruct",
468
- "Qwen/Qwen3-Embedding-0.6B",
469
- "sentence-transformers/all-mpnet-base-v2",
470
- ))
 
 
471
 
472
- # --- Device selection ---
473
- # st.sidebar.header("Data Preparation")
474
  selected_device = st.sidebar.radio(
475
  "Processing device",
476
  ["GPU (MPS)", "CPU"],
@@ -481,88 +556,106 @@ selected_device = st.sidebar.radio(
481
  # 7. Precompute filenames and pipeline triggers
482
  # =====================================================================
483
 
484
- def get_precomputed_filenames(csv_path, model_name, split_sentences):
 
485
  base = os.path.splitext(os.path.basename(csv_path))[0]
486
  safe_model = re.sub(r"[^a-zA-Z0-9_-]", "_", model_name)
487
  suf = "sentences" if split_sentences else "reports"
 
 
 
 
 
 
488
  return (
489
- str(CACHE_DIR / f"precomputed_{base}_{suf}_docs.npy"),
490
- str(CACHE_DIR / f"precomputed_{base}_{safe_model}_{suf}_embeddings.npy"),
 
 
 
491
  )
492
 
 
493
  DOCS_FILE, EMBEDDINGS_FILE = get_precomputed_filenames(
494
- CSV_PATH, selected_embedding_model, selected_granularity
495
  )
496
 
497
- # --- Cache management (after DOCS_FILE / EMBEDDINGS_FILE exist) ---
498
  st.sidebar.markdown("### Cache")
499
- if st.sidebar.button("Clear cached files for this configuration", use_container_width=True):
 
 
500
  try:
501
  for p in (DOCS_FILE, EMBEDDINGS_FILE):
502
  if os.path.exists(p):
503
  os.remove(p)
504
- # also clear Streamlit caches tied to these functions
505
  try:
506
- load_precomputed_data.clear() # st.cache_data func
507
  except Exception:
508
  pass
509
  try:
510
- perform_topic_modeling.clear() # st.cache_data func
511
  except Exception:
512
  pass
513
 
514
- st.success("Deleted cached docs/embeddings and cleared caches. Click **Prepare Data** again.")
 
 
515
  st.rerun()
516
  except Exception as e:
517
  st.error(f"Failed to delete cache files: {e}")
518
 
519
- #add line break
520
  st.sidebar.markdown("---")
521
 
522
-
523
-
524
  # =====================================================================
525
  # 8. Prepare Data OR Run Analysis
526
  # =====================================================================
527
 
528
  if not os.path.exists(EMBEDDINGS_FILE):
529
- st.warning(f"No precomputed embeddings found for this configuration ({granularity_label} / {selected_embedding_model}).")
530
-
 
 
 
531
  if st.button("Prepare Data for This Configuration"):
532
  generate_and_save_embeddings(
533
- CSV_PATH, DOCS_FILE, EMBEDDINGS_FILE,
534
- selected_embedding_model, selected_granularity, selected_device
 
 
 
 
 
535
  )
536
 
537
  else:
538
  # Load cached data
539
  docs, embeddings = load_precomputed_data(DOCS_FILE, EMBEDDINGS_FILE)
540
 
541
- # Coerce to 2-D float array even if saved as object
542
  embeddings = np.asarray(embeddings)
543
  if embeddings.dtype == object or embeddings.ndim != 2:
544
  try:
545
  embeddings = np.vstack(embeddings).astype(np.float32)
546
  except Exception:
547
- st.error("Cached embeddings are invalid. Please regenerate them for this configuration.")
 
 
548
  st.stop()
549
 
550
- # Subsample
551
  if subsample_perc < 100:
552
  n = int(len(docs) * (subsample_perc / 100))
553
  idx = np.random.choice(len(docs), size=n, replace=False)
554
  docs = [docs[i] for i in idx]
555
- # embeddings = embeddings[idx]
556
- embeddings = np.asarray(embeddings)
557
- embeddings = embeddings[idx, :] # keep it 2-D
558
- st.warning(f"Running analysis on {subsample_perc}% subsample ({len(docs)} documents)")
559
 
560
- # st.metric("Documents to Analyze", len(docs), granularity_label)
561
- # --- Dataset summary metrics ---
562
  st.subheader("Dataset summary")
563
- n_reports = count_clean_reports(CSV_PATH) # total cleaned reports in CSV
564
  unit = "sentences" if selected_granularity else "reports"
565
- n_units = len(docs) # actual units analyzed
566
 
567
  c1, c2 = st.columns(2)
568
  c1.metric("Reports in CSV (cleaned)", n_reports)
@@ -577,7 +670,9 @@ else:
577
  ng_min = st.slider("Min N-gram", 1, 5, 1)
578
  ng_max = st.slider("Max N-gram", 1, 5, 2)
579
  min_df = st.slider("Min Doc Freq", 1, 50, 1)
580
- stopwords = st.select_slider("Stopwords", options=[None, "english"], value=None)
 
 
581
 
582
  with st.sidebar.expander("UMAP"):
583
  um_n = st.slider("n_neighbors", 2, 50, 15)
@@ -589,10 +684,9 @@ else:
589
  hm = st.slider("min_samples", 2, 100, 5)
590
 
591
  with st.sidebar.expander("BERTopic"):
592
- nr_topics = st.text_input("nr_topics", value="auto")
593
  top_n_words = st.slider("top_n_words", 5, 25, 10)
594
 
595
- # --- Build config ---
596
  current_config = {
597
  "embedding_model": selected_embedding_model,
598
  "granularity": granularity_label,
@@ -616,12 +710,11 @@ else:
616
  "nr_topics": nr_topics,
617
  "top_n_words": top_n_words,
618
  },
 
619
  }
620
 
621
- # --- Run Button ---
622
  run_button = st.sidebar.button("Run Analysis", type="primary")
623
 
624
-
625
  # =================================================================
626
  # 9. Visualization & History Tabs
627
  # =================================================================
@@ -635,13 +728,11 @@ else:
635
  data = json.load(open(path))
636
  except Exception:
637
  return []
638
- # --- migrate old keys for backward-compat ---
639
  for e in data:
640
  if "outlier_pct" not in e and "outlier_perc" in e:
641
  e["outlier_pct"] = e.pop("outlier_perc")
642
  return data
643
 
644
-
645
  def save_history(h):
646
  json.dump(h, open(HISTORY_FILE, "w"), indent=2)
647
 
@@ -649,7 +740,6 @@ else:
649
  st.session_state.history = load_history()
650
 
651
  if run_button:
652
-
653
  if not isinstance(embeddings, np.ndarray):
654
  embeddings = np.asarray(embeddings)
655
 
@@ -657,30 +747,33 @@ else:
657
  try:
658
  embeddings = np.vstack(embeddings).astype(np.float32)
659
  except Exception:
660
- st.error("Cached embeddings are invalid (object/ragged). Click **Prepare Data** to regenerate.")
 
 
661
  st.stop()
662
 
663
  if embeddings.shape[0] != len(docs):
664
- st.error(f"len(docs)={len(docs)} but embeddings.shape[0]={embeddings.shape[0]}.\n"
665
- "Likely stale cache (e.g., switched sentences↔reports or model). "
666
- "Use the **Clear cache** button below and regenerate.")
 
 
667
  st.stop()
668
 
669
-
670
  with st.spinner("Performing topic modeling..."):
671
  model, reduced, labels, n_topics, outlier_pct = perform_topic_modeling(
672
  docs, embeddings, get_config_hash(current_config)
673
  )
674
  st.session_state.latest_results = (model, reduced, labels)
675
 
676
- # Save in history
677
  entry = {
678
  "timestamp": str(pd.Timestamp.now()),
679
  "config": current_config,
680
  "num_topics": n_topics,
681
  "outlier_pct": f"{outlier_pct:.2f}%",
682
- "llm_labels": [ # <-- This will use the fallback logic in the export section
683
- name for name in model.get_topic_info().Name.values
 
684
  if ("Unlabelled" not in name and "outlier" not in name)
685
  ],
686
  }
@@ -700,65 +793,63 @@ else:
700
  st.subheader("Topic Info")
701
  st.dataframe(tm.get_topic_info())
702
 
703
-
704
- # --- Export: one row per topic (topic_id, LLM topic_name, texts) ---
705
  st.subheader("Export results (one row per topic)")
706
 
707
- # 1) Pull LLM labels directly from BERTopic's representation
708
  full_reps = tm.get_topics(full=True)
709
- llm_reps = full_reps.get("LLM", {}) # {topic_id: [(label, score), ...], ...}
710
 
711
- # Build topic_id -> LLM label map; fall back to Name if missing
712
  llm_names = {}
713
  for tid, vals in llm_reps.items():
714
  try:
715
- llm_names[tid] = (vals[0][0] or "").strip().strip('"').strip(".")
 
 
716
  except Exception:
717
  llm_names[tid] = "Unlabelled"
718
-
719
- # <-- MODIFIED: This fallback logic is now the main logic
720
  if not llm_names:
721
- # Fallback: whatever BERTopic put in Name
722
  st.caption("Note: Using default keyword-based topic names.")
723
- llm_names = tm.get_topic_info().set_index("Topic")["Name"].to_dict()
 
 
724
 
725
- # 2) Per-document assignments for current docs
726
  doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
727
 
728
- # 3) Optionally remove outliers
729
- include_outliers = st.checkbox("Include outlier topic (-1)", value=False)
 
730
  if not include_outliers:
731
  doc_info = doc_info[doc_info["Topic"] != -1]
732
 
733
- # 4) Group texts by topic
734
  grouped = (
735
  doc_info.groupby("Topic")["Document"]
736
  .apply(list)
737
  .reset_index(name="texts")
738
  )
 
 
 
739
 
740
- # 5) Attach LLM names
741
- grouped["topic_name"] = grouped["Topic"].map(llm_names).fillna("Unlabelled")
742
-
743
- # 6) Reorder/rename columns
744
- export_topics = grouped.rename(columns={"Topic": "topic_id"})[
745
- ["topic_id", "topic_name", "texts"]
746
- ].sort_values("topic_id").reset_index(drop=True)
747
 
748
- # ---- CSV + JSONL outputs ----
749
- # Fixed separator (no textbox)
750
- # SEP = " || " # change if you prefer another fixed separator
751
- SEP = "\n" # change if you prefer another fixed separator
752
 
753
- # Flatten lists for CSV
754
  export_csv = export_topics.copy()
755
- export_csv["texts"] = export_csv["texts"].apply(lambda lst: SEP.join(map(str, lst)))
 
 
756
 
757
  base = os.path.splitext(os.path.basename(CSV_PATH))[0]
758
  gran = "sentences" if selected_granularity else "reports"
759
- csv_name = f"topics_{base}_{gran}.csv"
760
  jsonl_name = f"topics_{base}_{gran}.jsonl"
761
- csv_path = (EVAL_DIR / csv_name).resolve()
762
  jsonl_path = (EVAL_DIR / jsonl_name).resolve()
763
 
764
  cL, cC, cR = st.columns(3)
@@ -772,7 +863,6 @@ else:
772
  st.error(f"Failed to save CSV: {e}")
773
 
774
  with cC:
775
- # JSONL preserves the list structure of texts
776
  if st.button("Save JSONL to eval/", use_container_width=True):
777
  try:
778
  with open(jsonl_path, "w", encoding="utf-8") as f:
@@ -782,7 +872,9 @@ else:
782
  "topic_name": row["topic_name"],
783
  "texts": list(map(str, row["texts"])),
784
  }
785
- f.write(json.dumps(rec, ensure_ascii=False) + "\n")
 
 
786
  st.success(f"Saved JSONL → {jsonl_path}")
787
  except Exception as e:
788
  st.error(f"Failed to save JSONL: {e}")
@@ -799,11 +891,6 @@ else:
799
  st.caption("Preview (one row per topic)")
800
  st.dataframe(export_csv.head(10))
801
 
802
-
803
-
804
-
805
-
806
-
807
  else:
808
  st.info("Click 'Run Analysis' to begin.")
809
 
@@ -816,8 +903,10 @@ else:
816
  for i, entry in enumerate(st.session_state.history):
817
  with st.expander(f"Run {i+1} — {entry['timestamp']}"):
818
  st.write(f"**Topics:** {entry['num_topics']}")
819
- st.write(f"**Outliers:** {entry.get('outlier_pct', entry.get('outlier_perc', 'N/A'))}")
 
 
820
  st.write("**Topic Labels (default keywords):**")
821
  st.write(entry["llm_labels"])
822
  with st.expander("Show full configuration"):
823
- st.json(entry["config"])
 
12
 
13
  from pathlib import Path
14
  import sys
15
+ # from llama_cpp import Llama # <-- REMOVED
16
  import streamlit as st
17
  import pandas as pd
18
  import numpy as np
 
21
  import nltk
22
  import json
23
 
24
+ # =====================================================================
25
+ # NLTK setup
26
+ # =====================================================================
27
 
28
  NLTK_DATA_DIR = "/usr/local/share/nltk_data"
29
  if NLTK_DATA_DIR not in nltk.data.path:
30
  nltk.data.path.append(NLTK_DATA_DIR)
31
 
32
+ # Try to ensure both punkt_tab (new NLTK) and punkt (old NLTK) are available
33
  for resource in ("punkt_tab", "punkt"):
34
  try:
35
  nltk.data.find(f"tokenizers/{resource}")
36
  except LookupError:
 
37
  try:
38
  nltk.download(resource, download_dir=NLTK_DATA_DIR)
39
  except Exception as e:
40
  print(f"Could not download NLTK resource {resource}: {e}")
41
 
42
+ # =====================================================================
43
+ # Path utils (MOSAIC or fallback)
44
+ # =====================================================================
45
 
 
46
  try:
47
  from mosaic.path_utils import CFG, raw_path, proc_path, eval_path, project_root # type: ignore
48
  except Exception:
 
53
 
54
  # Defaults: app-local data/ eval/ that are safe on Cloud
55
  _DATA_ROOT = _env("MOSAIC_DATA", str(Path(__file__).parent / "data"))
56
+ _BOX_ROOT = _env("MOSAIC_BOX", str(Path(__file__).parent / "data" / "raw"))
57
  _EVAL_ROOT = _env("MOSAIC_EVAL", str(Path(__file__).parent / "eval"))
58
 
59
  CFG = {
60
  "data_root": str(_DATA_ROOT),
61
+ "box_root": str(_BOX_ROOT),
62
  "eval_root": str(_EVAL_ROOT),
63
  }
64
 
 
74
  def eval_path(*parts: str) -> Path:
75
  return _EVAL_ROOT.joinpath(*parts)
76
 
77
+
78
  # BERTopic stack
79
  from bertopic import BERTopic
80
+ # from bertopic.representation import LlamaCPP # <-- REMOVED
81
+ # from llama_cpp import Llama # <-- REMOVED
82
  from sentence_transformers import SentenceTransformer
83
 
84
  # Clustering/dimensionality reduction
 
91
  import matplotlib.pyplot as plt
92
  from huggingface_hub import hf_hub_download
93
 
 
 
94
  # =====================================================================
95
  # 0. Constants & Helper Functions
96
  # =====================================================================
 
102
  return s or "DATASET"
103
 
104
 
105
+ # "Nice" default names we know from MOSAIC; NOT a hard constraint anymore
106
  ACCEPTABLE_TEXT_COLUMNS = [
107
  "reflection_answer_english",
108
  "reflection_answer",
 
110
  "report",
111
  ]
112
 
113
+
114
  def _pick_text_column(df: pd.DataFrame) -> str | None:
115
+ """Return the first matching *preferred* text column name if present."""
116
  for col in ACCEPTABLE_TEXT_COLUMNS:
117
  if col in df.columns:
118
  return col
119
  return None
120
 
121
 
+ def _list_text_columns(df: pd.DataFrame) -> list[str]:
+     """Return all columns that look text-like (object / string dtype)."""
+     text_cols: list[str] = []
+     for c in df.columns:
+         s = df[c]
+         if s.dtype == object or pd.api.types.is_string_dtype(s):
+             text_cols.append(c)
+     return text_cols
+
+
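To make the behaviour of `_list_text_columns` concrete, here is a small hedged demo on a toy DataFrame (the column names are invented for the example):

```python
import pandas as pd

def list_text_columns(df: pd.DataFrame) -> list[str]:
    # Same idea as _list_text_columns above: keep object/string-dtype columns.
    return [
        c for c in df.columns
        if df[c].dtype == object or pd.api.types.is_string_dtype(df[c])
    ]

demo = pd.DataFrame({
    "participant_id": [1, 2],                                   # numeric -> excluded
    "reflection_answer_english": ["I saw light", "Colours moved"],
    "condition": ["strobe", "control"],                         # also text-like
})
print(list_text_columns(demo))   # ['reflection_answer_english', 'condition']
```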
132
  def _set_from_env_or_secrets(key: str):
133
  """Allow hosting: value can come from environment or from Streamlit secrets."""
134
  if os.getenv(key):
 
140
  if val:
141
  os.environ[key] = str(val)
142
 
143
+
144
  # Enable both MOSAIC_DATA and MOSAIC_BOX automatically
145
  for _k in ("MOSAIC_DATA", "MOSAIC_BOX"):
146
  _set_from_env_or_secrets(_k)
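A minimal sketch of the env-or-secrets resolution that `_set_from_env_or_secrets` performs, runnable outside Streamlit; the dict below simply stands in for `st.secrets`:

```python
import os

FAKE_SECRETS = {"MOSAIC_DATA": "/srv/mosaic/data"}   # stand-in for st.secrets

def set_from_env_or_secrets(key: str, secrets=FAKE_SECRETS) -> None:
    # An explicit environment variable wins; otherwise fall back to the secret store.
    if os.getenv(key):
        return
    val = secrets.get(key)
    if val:
        os.environ[key] = str(val)

set_from_env_or_secrets("MOSAIC_DATA")
print(os.environ.get("MOSAIC_DATA"))   # "/srv/mosaic/data" unless already set
```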
147
 
148
 
 
149
  @st.cache_data
+ def count_clean_reports(csv_path: str, text_col: str | None = None) -> int:
+     """Count non-empty reports in the chosen text column."""
      df = pd.read_csv(csv_path)
+
+     if text_col is not None and text_col in df.columns:
+         col = text_col
+     else:
+         col = _pick_text_column(df)
+
      if col is None:
          return 0
+
      if col != "reflection_answer_english":
          df = df.rename(columns={col: "reflection_answer_english"})
+
      df.dropna(subset=["reflection_answer_english"], inplace=True)
      df["reflection_answer_english"] = df["reflection_answer_english"].astype(str)
      df = df[df["reflection_answer_english"].str.strip() != ""]
      return len(df)
169
 
170
 
 
 
 
 
 
171
  # =====================================================================
172
  # 1. Streamlit app setup
173
  # =====================================================================
174
 
175
  st.set_page_config(page_title="MOSAIC Dashboard", layout="wide")
176
+ st.title(
177
+ "Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): "
178
+ "Topic Modelling Dashboard for Phenomenological Reports"
179
+ )
180
 
 
181
  st.markdown(
182
  """
183
  _If you use this tool in your research, please cite the following paper:_\n
184
+ **Beauté, R., et al. (2025).**
185
+ **Mapping of Subjective Accounts into Interpreted Clusters (MOSAIC): Topic Modelling and LLM applied to Stroboscopic Phenomenology**
186
  https://arxiv.org/abs/2502.18318
187
  """
188
  )
 
191
  # 2. Dataset paths (using MOSAIC structure)
192
  # =====================================================================
193
 
194
+ ds_input = st.sidebar.text_input(
195
+ "Project/Dataset name", value="MOSAIC", key="dataset_name_input"
196
+ )
197
  DATASET_DIR = _slugify(ds_input).upper()
198
 
199
+ RAW_DIR = raw_path(DATASET_DIR)
 
200
  PROC_DIR = proc_path(DATASET_DIR, "preprocessed")
201
  EVAL_DIR = eval_path(DATASET_DIR)
202
  CACHE_DIR = PROC_DIR / "cache"
 
205
  CACHE_DIR.mkdir(parents=True, exist_ok=True)
206
  EVAL_DIR.mkdir(parents=True, exist_ok=True)
207
 
 
208
  with st.sidebar.expander("About the dataset name", expanded=False):
209
  st.markdown(
210
  f"""
 
221
  def _list_server_csvs(proc_dir: Path) -> list[str]:
222
  return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
223
 
 
 
 
224
 
225
+ DATASETS = None # keep name for clarity; we’ll fill it when rendering the sidebar
226
  HISTORY_FILE = str(PROC_DIR / "run_history.json")
227
 
 
 
228
  # =====================================================================
229
  # 3. Embedding & LLM loaders
230
  # =====================================================================
231
 
232
+
233
  @st.cache_resource
234
  def load_embedding_model(model_name):
235
  st.info(f"Loading embedding model '{model_name}'...")
236
  return SentenceTransformer(model_name)
237
 
238
 
239
  @st.cache_data
240
  def load_precomputed_data(docs_file, embeddings_file):
241
  docs = np.load(docs_file, allow_pickle=True).tolist()
242
+ emb = np.load(embeddings_file, allow_pickle=True)
243
  return docs, emb
244
 
245
 
 
246
  # =====================================================================
247
  # 4. Topic modeling function
248
  # =====================================================================
249
 
250
+
251
  def get_config_hash(cfg):
252
  return json.dumps(cfg, sort_keys=True)
253
 
 
262
  try:
263
  _embeddings = np.vstack(_embeddings)
264
  except Exception:
265
+ st.error(
266
+ f"Embeddings are invalid (dtype={_embeddings.dtype}, ndim={_embeddings.ndim}). "
267
+ "Please click **Prepare Data** to regenerate."
268
+ )
269
  st.stop()
270
  _embeddings = np.ascontiguousarray(_embeddings, dtype=np.float32)
271
 
272
  if _embeddings.shape[0] != len(_docs):
273
+ st.error(
274
+ f"Mismatch between docs and embeddings: len(docs)={len(_docs)} vs "
275
+ f"embeddings.shape[0]={_embeddings.shape[0]}. "
276
+ "Delete the cached files for this configuration and regenerate."
277
+ )
278
  st.stop()
279
 
280
  config = json.loads(config_hash)
281
 
 
282
  if "ngram_range" in config["vectorizer_params"]:
283
+ config["vectorizer_params"]["ngram_range"] = tuple(
284
+ config["vectorizer_params"]["ngram_range"]
285
+ )
286
+
287
+ rep_model = None # <-- Use BERTopic defaults for representation
288
+
289
+ umap_model = UMAP(random_state=42, metric="cosine", **config["umap_params"])
 
 
290
  hdbscan_model = HDBSCAN(
291
+ metric="euclidean", prediction_data=True, **config["hdbscan_params"]
292
+ )
293
+ vectorizer_model = (
294
+ CountVectorizer(**config["vectorizer_params"])
295
+ if config["use_vectorizer"]
296
+ else None
297
  )
 
298
 
299
+ nr_topics_val = (
300
+ None
301
+ if config["bt_params"]["nr_topics"] == "auto"
302
+ else int(config["bt_params"]["nr_topics"])
303
+ )
304
 
305
  topic_model = BERTopic(
306
  umap_model=umap_model,
 
309
  representation_model=rep_model,
310
  top_n_words=config["bt_params"]["top_n_words"],
311
  nr_topics=nr_topics_val,
312
+ verbose=False,
313
  )
314
 
315
  topics, _ = topic_model.fit_transform(_docs, _embeddings)
 
317
 
318
  outlier_pct = 0
319
  if -1 in info.Topic.values:
320
+ outlier_pct = (
321
+ info.Count[info.Topic == -1].iloc[0] / info.Count.sum()
322
+ ) * 100
323
 
 
 
324
  topic_info = topic_model.get_topic_info()
325
+ name_map = topic_info.set_index("Topic")["Name"].to_dict()
 
 
326
  all_labels = [name_map[topic] for topic in topics]
327
 
 
328
  reduced = UMAP(
329
+ n_neighbors=15,
330
+ n_components=2,
331
+ min_dist=0.0,
332
+ metric="cosine",
333
+ random_state=42,
334
  ).fit_transform(_embeddings)
335
 
336
+ return topic_model, reduced, all_labels, len(info) - 1, outlier_pct
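For orientation, a minimal hedged sketch of the same pattern in isolation: fit BERTopic on precomputed embeddings with explicit UMAP/HDBSCAN models. The toy corpus and parameter values are placeholders, not the dashboard's defaults.

```python
# Sketch only: BERTopic on precomputed embeddings (all values illustrative).
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

docs = ["bright flashes of colour", "geometric patterns everywhere",
        "a deep sense of calm", "felt very relaxed and warm"] * 10

embedder = SentenceTransformer("BAAI/bge-small-en-v1.5")
embeddings = embedder.encode(docs, convert_to_numpy=True).astype(np.float32)

topic_model = BERTopic(
    umap_model=UMAP(n_neighbors=5, n_components=2, metric="cosine", random_state=42),
    hdbscan_model=HDBSCAN(min_cluster_size=5, metric="euclidean", prediction_data=True),
    top_n_words=10,
    verbose=False,
)
topics, _ = topic_model.fit_transform(docs, embeddings)
print(topic_model.get_topic_info().head())
```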
 
337
 
338
 
339
  # =====================================================================
340
  # 5. CSV → documents → embeddings pipeline
341
  # =====================================================================
342
 
343
+
+ def generate_and_save_embeddings(
+     csv_path,
+     docs_file,
+     emb_file,
+     selected_embedding_model,
+     split_sentences,
+     device,
+     text_col=None,
+ ):
353
 
354
  # ---------------------
355
  # Load & clean CSV
 
357
  st.info(f"Reading and preparing CSV: {csv_path}")
358
  df = pd.read_csv(csv_path)
359
 
360
+     if text_col is not None and text_col in df.columns:
+         col = text_col
+     else:
+         col = _pick_text_column(df)
+
      if col is None:
+         st.error("CSV must contain at least one text column.")
367
  return
368
 
369
  if col != "reflection_answer_english":
 
377
  # ---------------------
378
  # Sentence / report granularity
379
  # ---------------------
 
380
  if split_sentences:
381
  try:
382
  sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
 
387
  else:
388
  docs = reports
389
 
 
390
  np.save(docs_file, np.array(docs, dtype=object))
391
  st.success(f"Prepared {len(docs)} documents")
392
 
393
  # ---------------------
394
  # Embeddings
395
  # ---------------------
396
+ st.info(
397
+ f"Encoding {len(docs)} documents with {selected_embedding_model} on {device}"
398
+ )
399
 
400
  model = load_embedding_model(selected_embedding_model)
401
 
 
410
  show_progress_bar=True,
411
  batch_size=batch_size,
412
  device=encode_device,
413
+ convert_to_numpy=True,
414
  )
415
  embeddings = np.asarray(embeddings, dtype=np.float32)
416
  np.save(emb_file, embeddings)
 
420
  st.rerun()
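As a reference point, a hedged sketch of the prepare-data step in isolation: encode a handful of documents, cast to float32, and cache both docs and embeddings as `.npy` files (file names here are examples):

```python
# Sketch: encode documents and cache docs + embeddings as .npy files.
import numpy as np
from sentence_transformers import SentenceTransformer

docs = ["I saw pulsing geometric shapes.", "Time seemed to slow down."]
model = SentenceTransformer("BAAI/bge-small-en-v1.5")

emb = model.encode(docs, batch_size=32, convert_to_numpy=True, show_progress_bar=False)
emb = np.asarray(emb, dtype=np.float32)

np.save("example_docs.npy", np.array(docs, dtype=object))   # example file names
np.save("example_embeddings.npy", emb)

# Reloading mirrors load_precomputed_data above.
docs_back = np.load("example_docs.npy", allow_pickle=True).tolist()
emb_back = np.load("example_embeddings.npy")
assert emb_back.shape[0] == len(docs_back)
```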
421
 
422
 
 
423
  # =====================================================================
424
  # 6. Sidebar — dataset, upload, parameters
425
  # =====================================================================
426
 
 
427
  st.sidebar.header("Data Input Method")
428
 
429
  source = st.sidebar.radio(
 
437
  CSV_PATH = None # will be set in the chosen branch
438
 
439
  if source == "Use preprocessed CSV on server":
 
440
  available = _list_server_csvs(PROC_DIR)
441
  if not available:
442
+ st.info(
443
+ f"No CSVs found in {PROC_DIR}. Switch to 'Upload my own CSV' or change the dataset name."
444
+ )
445
  st.stop()
446
+ selected_csv = st.sidebar.selectbox(
447
+ "Choose a preprocessed CSV", available, key="server_csv_select"
448
+ )
449
  CSV_PATH = selected_csv
450
  else:
+     up = st.sidebar.file_uploader(
+         "Upload a CSV", type=["csv"], key="upload_csv"
+     )
+
+     st.sidebar.caption(
+         "Your CSV should have **one row per report** and at least one text column "
+         "(for example `reflection_answer_english`, `reflection_answer`, `text`, `report`, "
+         "or any other column containing free text). "
+         "Other columns (ID, condition, etc.) are allowed. "
+         "After upload, you’ll be able to choose which text column to analyse."
+     )
+
463
  if up is not None:
464
  tmp_df = pd.read_csv(up)
465
+ if tmp_df.empty:
466
+ st.error("Uploaded CSV is empty.")
 
467
  st.stop()
468
+
469
+ # Just save; we’ll choose the text column later
 
470
  uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
471
  tmp_df.to_csv(uploaded_csv_path, index=False)
472
  st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
 
475
  st.info("Upload a CSV to continue.")
476
  st.stop()
477
 
+ if CSV_PATH is None:
+     st.stop()
+
+ # ---------------------------------------------------------------------
+ # Text column selection
+ # ---------------------------------------------------------------------


+ @st.cache_data
+ def get_text_columns(csv_path: str) -> list[str]:
+     df_sample = pd.read_csv(csv_path, nrows=2000)
+     return _list_text_columns(df_sample)
+
+
+ text_columns = get_text_columns(CSV_PATH)
+ if not text_columns:
+     st.error(
+         "No text-like columns found in this CSV. At least one column must contain text."
+     )
+     st.stop()
+
+ # Try to pick a nice default (one of the MOSAIC-ish names) if present
+ try:
+     df_sample = pd.read_csv(CSV_PATH, nrows=2000)
+     preferred = _pick_text_column(df_sample)
+ except Exception:
+     preferred = None
+
+ if preferred in text_columns:
+     default_idx = text_columns.index(preferred)
+ else:
+     default_idx = 0
+
+ selected_text_column = st.sidebar.selectbox(
+     "Text column to analyse",
+     text_columns,
+     index=default_idx,
+     key="text_column_select",
+ )
517
+
518
+ # ---------------------------------------------------------------------
519
+ # Data granularity & subsampling
520
+ # ---------------------------------------------------------------------
521
 
 
522
  st.sidebar.subheader("Data Granularity & Subsampling")
523
 
524
+ selected_granularity = st.sidebar.checkbox(
525
+ "Split reports into sentences", value=True
526
+ )
527
  granularity_label = "sentences" if selected_granularity else "reports"
528
 
529
  subsample_perc = st.sidebar.slider("Data sampling (%)", 10, 100, 100, 5)
530
 
 
531
  st.sidebar.markdown("---")
532
 
533
+ # ---------------------------------------------------------------------
534
+ # Embedding model & device
535
+ # ---------------------------------------------------------------------
536
 
 
 
537
  st.sidebar.header("Model Selection")
538
 
539
+ selected_embedding_model = st.sidebar.selectbox(
540
+ "Choose an embedding model",
541
+ (
542
+ "BAAI/bge-small-en-v1.5",
543
+ "intfloat/multilingual-e5-large-instruct",
544
+ "Qwen/Qwen3-Embedding-0.6B",
545
+ "sentence-transformers/all-mpnet-base-v2",
546
+ ),
547
+ )
548
 
 
 
549
  selected_device = st.sidebar.radio(
550
  "Processing device",
551
  ["GPU (MPS)", "CPU"],
 
556
  # 7. Precompute filenames and pipeline triggers
557
  # =====================================================================
558
 
559
+
+ def get_precomputed_filenames(csv_path, model_name, split_sentences, text_col):
      base = os.path.splitext(os.path.basename(csv_path))[0]
      safe_model = re.sub(r"[^a-zA-Z0-9_-]", "_", model_name)
      suf = "sentences" if split_sentences else "reports"
+
+     col_suffix = ""
+     if text_col:
+         safe_col = re.sub(r"[^a-zA-Z0-9_-]", "_", text_col)
+         col_suffix = f"_{safe_col}"
+
      return (
+         str(CACHE_DIR / f"precomputed_{base}{col_suffix}_{suf}_docs.npy"),
+         str(
+             CACHE_DIR
+             / f"precomputed_{base}_{safe_model}{col_suffix}_{suf}_embeddings.npy"
+         ),
      )
577
 
578
+
579
  DOCS_FILE, EMBEDDINGS_FILE = get_precomputed_filenames(
580
+ CSV_PATH, selected_embedding_model, selected_granularity, selected_text_column
581
  )
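To illustrate the cache-key scheme, a standalone sketch that mirrors the naming logic above for a hypothetical CSV, model, and text column (directory and file names are examples only):

```python
# Sketch: per-configuration cache file names (illustrative, not the app's CACHE_DIR).
import re
from pathlib import Path

def cache_names(csv_path, model_name, split_sentences, text_col, cache_dir=Path("cache")):
    base = Path(csv_path).stem
    safe_model = re.sub(r"[^a-zA-Z0-9_-]", "_", model_name)
    safe_col = re.sub(r"[^a-zA-Z0-9_-]", "_", text_col) if text_col else ""
    col_suffix = f"_{safe_col}" if safe_col else ""
    suf = "sentences" if split_sentences else "reports"
    return (
        cache_dir / f"precomputed_{base}{col_suffix}_{suf}_docs.npy",
        cache_dir / f"precomputed_{base}_{safe_model}{col_suffix}_{suf}_embeddings.npy",
    )

docs_f, emb_f = cache_names(
    "uploaded.csv", "BAAI/bge-small-en-v1.5", True, "reflection_answer_english"
)
print(docs_f)   # cache/precomputed_uploaded_reflection_answer_english_sentences_docs.npy
print(emb_f)
```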
582
 
583
+ # --- Cache management ---
584
  st.sidebar.markdown("### Cache")
585
+ if st.sidebar.button(
586
+ "Clear cached files for this configuration", use_container_width=True
587
+ ):
588
  try:
589
  for p in (DOCS_FILE, EMBEDDINGS_FILE):
590
  if os.path.exists(p):
591
  os.remove(p)
 
592
  try:
593
+ load_precomputed_data.clear()
594
  except Exception:
595
  pass
596
  try:
597
+ perform_topic_modeling.clear()
598
  except Exception:
599
  pass
600
 
601
+ st.success(
602
+ "Deleted cached docs/embeddings and cleared caches. Click **Prepare Data** again."
603
+ )
604
  st.rerun()
605
  except Exception as e:
606
  st.error(f"Failed to delete cache files: {e}")
607
 
 
608
  st.sidebar.markdown("---")
609
 
 
 
610
  # =====================================================================
611
  # 8. Prepare Data OR Run Analysis
612
  # =====================================================================
613
 
614
  if not os.path.exists(EMBEDDINGS_FILE):
615
+ st.warning(
616
+ f"No precomputed embeddings found for this configuration "
617
+ f"({granularity_label} / {selected_embedding_model} / column '{selected_text_column}')."
618
+ )
619
+
620
  if st.button("Prepare Data for This Configuration"):
621
  generate_and_save_embeddings(
622
+ CSV_PATH,
623
+ DOCS_FILE,
624
+ EMBEDDINGS_FILE,
625
+ selected_embedding_model,
626
+ selected_granularity,
627
+ selected_device,
628
+ text_col=selected_text_column,
629
  )
630
 
631
  else:
632
  # Load cached data
633
  docs, embeddings = load_precomputed_data(DOCS_FILE, EMBEDDINGS_FILE)
634
 
 
635
  embeddings = np.asarray(embeddings)
636
  if embeddings.dtype == object or embeddings.ndim != 2:
637
  try:
638
  embeddings = np.vstack(embeddings).astype(np.float32)
639
  except Exception:
640
+ st.error(
641
+ "Cached embeddings are invalid. Please regenerate them for this configuration."
642
+ )
643
  st.stop()
644
 
 
645
  if subsample_perc < 100:
646
  n = int(len(docs) * (subsample_perc / 100))
647
  idx = np.random.choice(len(docs), size=n, replace=False)
648
  docs = [docs[i] for i in idx]
649
+         embeddings = np.asarray(embeddings)[idx, :]
+         st.warning(
+             f"Running analysis on {subsample_perc}% subsample ({len(docs)} documents)"
+         )
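The key point in this branch is that documents and embeddings must be subsampled with the same indices so rows stay aligned; a minimal sketch (the seed is only for the demo, the app samples without one):

```python
# Sketch: subsample documents and their embeddings with one shared index set.
import numpy as np

docs = [f"report {i}" for i in range(100)]
embeddings = np.random.rand(100, 384).astype(np.float32)

rng = np.random.default_rng(0)                      # seeded only for this demo
idx = rng.choice(len(docs), size=30, replace=False)

docs_sub = [docs[i] for i in idx]
emb_sub = embeddings[idx, :]                        # stays 2-D and aligned with docs_sub

assert emb_sub.shape[0] == len(docs_sub)
```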
653
 
654
+ # Dataset summary
 
655
  st.subheader("Dataset summary")
656
+ n_reports = count_clean_reports(CSV_PATH, selected_text_column)
657
  unit = "sentences" if selected_granularity else "reports"
658
+ n_units = len(docs)
659
 
660
  c1, c2 = st.columns(2)
661
  c1.metric("Reports in CSV (cleaned)", n_reports)
 
670
  ng_min = st.slider("Min N-gram", 1, 5, 1)
671
  ng_max = st.slider("Max N-gram", 1, 5, 2)
672
  min_df = st.slider("Min Doc Freq", 1, 50, 1)
673
+ stopwords = st.select_slider(
674
+ "Stopwords", options=[None, "english"], value=None
675
+ )
676
 
677
  with st.sidebar.expander("UMAP"):
678
  um_n = st.slider("n_neighbors", 2, 50, 15)
 
684
  hm = st.slider("min_samples", 2, 100, 5)
685
 
686
  with st.sidebar.expander("BERTopic"):
687
+ nr_topics = st.text_input("nr_topics", value="auto")
688
  top_n_words = st.slider("top_n_words", 5, 25, 10)
689
 
 
690
  current_config = {
691
  "embedding_model": selected_embedding_model,
692
  "granularity": granularity_label,
 
710
  "nr_topics": nr_topics,
711
  "top_n_words": top_n_words,
712
  },
713
+ "text_column": selected_text_column,
714
  }
715
 
 
716
  run_button = st.sidebar.button("Run Analysis", type="primary")
717
 
 
718
  # =================================================================
719
  # 9. Visualization & History Tabs
720
  # =================================================================
 
728
  data = json.load(open(path))
729
  except Exception:
730
  return []
 
731
  for e in data:
732
  if "outlier_pct" not in e and "outlier_perc" in e:
733
  e["outlier_pct"] = e.pop("outlier_perc")
734
  return data
735
 
 
736
  def save_history(h):
737
  json.dump(h, open(HISTORY_FILE, "w"), indent=2)
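For reference, a self-contained sketch of the run-history round-trip, including the `outlier_perc` → `outlier_pct` key migration handled in `load_history` (the file name is an example):

```python
# Sketch: JSON run history with backward-compatible key migration.
import json
import os

HISTORY = "example_run_history.json"   # example file name

def save_history(entries):
    with open(HISTORY, "w") as f:
        json.dump(entries, f, indent=2)

def load_history():
    if not os.path.exists(HISTORY):
        return []
    try:
        with open(HISTORY) as f:
            data = json.load(f)
    except Exception:
        return []
    for e in data:
        if "outlier_pct" not in e and "outlier_perc" in e:   # migrate old key
            e["outlier_pct"] = e.pop("outlier_perc")
    return data

save_history([{"timestamp": "2025-01-01", "outlier_perc": "12.00%"}])
print(load_history())   # [{'timestamp': '2025-01-01', 'outlier_pct': '12.00%'}]
```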
738
 
 
740
  st.session_state.history = load_history()
741
 
742
  if run_button:
 
743
  if not isinstance(embeddings, np.ndarray):
744
  embeddings = np.asarray(embeddings)
745
 
 
747
  try:
748
  embeddings = np.vstack(embeddings).astype(np.float32)
749
  except Exception:
750
+ st.error(
751
+ "Cached embeddings are invalid (object/ragged). Click **Prepare Data** to regenerate."
752
+ )
753
  st.stop()
754
 
755
  if embeddings.shape[0] != len(docs):
756
+ st.error(
757
+ f"len(docs)={len(docs)} but embeddings.shape[0]={embeddings.shape[0]}.\n"
758
+ "Likely stale cache (e.g., switched sentences↔reports or model). "
759
+ "Use the **Clear cache** button below and regenerate."
760
+ )
761
  st.stop()
762
 
 
763
  with st.spinner("Performing topic modeling..."):
764
  model, reduced, labels, n_topics, outlier_pct = perform_topic_modeling(
765
  docs, embeddings, get_config_hash(current_config)
766
  )
767
  st.session_state.latest_results = (model, reduced, labels)
768
 
 
769
  entry = {
770
  "timestamp": str(pd.Timestamp.now()),
771
  "config": current_config,
772
  "num_topics": n_topics,
773
  "outlier_pct": f"{outlier_pct:.2f}%",
774
+ "llm_labels": [
775
+ name
776
+ for name in model.get_topic_info().Name.values
777
  if ("Unlabelled" not in name and "outlier" not in name)
778
  ],
779
  }
 
793
  st.subheader("Topic Info")
794
  st.dataframe(tm.get_topic_info())
795
 
 
 
796
  st.subheader("Export results (one row per topic)")
797
 
 
798
  full_reps = tm.get_topics(full=True)
799
+ llm_reps = full_reps.get("LLM", {})
800
 
 
801
  llm_names = {}
802
  for tid, vals in llm_reps.items():
803
  try:
804
+ llm_names[tid] = (
805
+ (vals[0][0] or "").strip().strip('"').strip(".")
806
+ )
807
  except Exception:
808
  llm_names[tid] = "Unlabelled"
809
+
 
810
  if not llm_names:
 
811
  st.caption("Note: Using default keyword-based topic names.")
812
+ llm_names = (
813
+ tm.get_topic_info().set_index("Topic")["Name"].to_dict()
814
+ )
815
 
 
816
  doc_info = tm.get_document_info(docs)[["Document", "Topic"]]
817
 
818
+ include_outliers = st.checkbox(
819
+ "Include outlier topic (-1)", value=False
820
+ )
821
  if not include_outliers:
822
  doc_info = doc_info[doc_info["Topic"] != -1]
823
 
 
824
  grouped = (
825
  doc_info.groupby("Topic")["Document"]
826
  .apply(list)
827
  .reset_index(name="texts")
828
  )
829
+ grouped["topic_name"] = grouped["Topic"].map(llm_names).fillna(
830
+ "Unlabelled"
831
+ )
832
 
833
+ export_topics = (
834
+ grouped.rename(columns={"Topic": "topic_id"})[
835
+ ["topic_id", "topic_name", "texts"]
836
+ ]
837
+ .sort_values("topic_id")
838
+ .reset_index(drop=True)
839
+ )
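A toy hedged example of the one-row-per-topic shape built here (topic ids and names are invented for the demo):

```python
# Sketch: collapse per-document topic assignments into one row per topic.
import pandas as pd

doc_info = pd.DataFrame({
    "Document": ["saw colours", "shapes everywhere", "felt calm", "deep relaxation"],
    "Topic": [0, 0, 1, 1],
})
names = {0: "0_colours_shapes", 1: "1_calm_relaxation"}   # invented labels

grouped = (
    doc_info.groupby("Topic")["Document"]
    .apply(list)
    .reset_index(name="texts")
)
grouped["topic_name"] = grouped["Topic"].map(names).fillna("Unlabelled")

export = (
    grouped.rename(columns={"Topic": "topic_id"})[["topic_id", "topic_name", "texts"]]
    .sort_values("topic_id")
    .reset_index(drop=True)
)
print(export)
```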
840
 
841
+ SEP = "\n"
 
 
 
842
 
 
843
  export_csv = export_topics.copy()
844
+ export_csv["texts"] = export_csv["texts"].apply(
845
+ lambda lst: SEP.join(map(str, lst))
846
+ )
847
 
848
  base = os.path.splitext(os.path.basename(CSV_PATH))[0]
849
  gran = "sentences" if selected_granularity else "reports"
850
+ csv_name = f"topics_{base}_{gran}.csv"
851
  jsonl_name = f"topics_{base}_{gran}.jsonl"
852
+ csv_path = (EVAL_DIR / csv_name).resolve()
853
  jsonl_path = (EVAL_DIR / jsonl_name).resolve()
854
 
855
  cL, cC, cR = st.columns(3)
 
863
  st.error(f"Failed to save CSV: {e}")
864
 
865
  with cC:
 
866
  if st.button("Save JSONL to eval/", use_container_width=True):
867
  try:
868
  with open(jsonl_path, "w", encoding="utf-8") as f:
 
872
  "topic_name": row["topic_name"],
873
  "texts": list(map(str, row["texts"])),
874
  }
875
+ f.write(
876
+ json.dumps(rec, ensure_ascii=False) + "\n"
877
+ )
878
  st.success(f"Saved JSONL → {jsonl_path}")
879
  except Exception as e:
880
  st.error(f"Failed to save JSONL: {e}")
 
891
  st.caption("Preview (one row per topic)")
892
  st.dataframe(export_csv.head(10))
893
 
894
  else:
895
  st.info("Click 'Run Analysis' to begin.")
896
 
 
903
  for i, entry in enumerate(st.session_state.history):
904
  with st.expander(f"Run {i+1} — {entry['timestamp']}"):
905
  st.write(f"**Topics:** {entry['num_topics']}")
906
+ st.write(
907
+ f"**Outliers:** {entry.get('outlier_pct', entry.get('outlier_perc', 'N/A'))}"
908
+ )
909
  st.write("**Topic Labels (default keywords):**")
910
  st.write(entry["llm_labels"])
911
  with st.expander("Show full configuration"):
912
+ st.json(entry["config"])