romybeaute committed on
Commit
f9eb985
·
verified ·
1 Parent(s): c1a0591

Clear old cache files and name the saved CSV after the uploaded file + download all sentences for every topic

Browse files
Files changed (1) hide show
  1. app.py +56 -26
app.py CHANGED
@@ -101,6 +101,24 @@ def _slugify(s: str) -> str:
101
  s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
102
  return s or "DATASET"
103
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  # "Nice" default names we know from MOSAIC; NOT a hard constraint anymore
106
  ACCEPTABLE_TEXT_COLUMNS = [
@@ -459,24 +477,6 @@ else:
459
  "After upload, you’ll be able to choose which text column to analyse."
460
  )
461
 
462
- # if up is not None:
463
- # tmp_df = pd.read_csv(up)
464
- # if tmp_df.empty:
465
- # st.error("Uploaded CSV is empty.")
466
- # st.stop()
467
-
468
- # if up is not None:
469
- # try:
470
- # # Try loading as standard UTF-8
471
- # tmp_df = pd.read_csv(up)
472
- # except UnicodeDecodeError:
473
- # # If that fails (e.g., Excel/Windows CSV), try ISO-8859-1 (Latin-1)
474
- # up.seek(0) # Reset file pointer to the beginning
475
- # tmp_df = pd.read_csv(up, encoding='ISO-8859-1')
476
-
477
- # if tmp_df.empty:
478
- # st.error("Uploaded CSV is empty.")
479
- # st.stop()
480
 
481
  if up is not None:
482
  # List of encodings to try:
@@ -508,8 +508,18 @@ else:
508
  # Optional: Print which encoding worked to the logs (for your info)
509
  print(f"Successfully loaded CSV using {success_encoding} encoding.")
510
 
511
- # Just save; we’ll choose the text column later
512
- uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
 
 
 
 
 
 
 
 
 
 
513
  tmp_df.to_csv(uploaded_csv_path, index=False)
514
  st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
515
  CSV_PATH = uploaded_csv_path
@@ -930,16 +940,36 @@ else:
930
  st.error(f"Failed to save JSONL: {e}")
931
 
932
  with cR:
 
 
 
 
 
 
 
 
 
 
 
 
933
  st.download_button(
934
- "Download CSV",
935
- data=export_csv.to_csv(index=False).encode("utf-8-sig"),
936
- file_name=csv_name,
937
  mime="text/csv",
938
  use_container_width=True,
 
939
  )
940
-
941
- st.caption("Preview (one row per topic)")
942
- # st.dataframe(export_csv.head(10))
 
 
 
 
 
 
 
943
  st.dataframe(export_csv)
944
 
945
  else:
 
101
  s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
102
  return s or "DATASET"
103
 
104
def _cleanup_old_cache(current_slug: str) -> None:
    """Delete precomputed ``.npy`` cache files that do not match a dataset slug.

    Every file in ``CACHE_DIR`` matching ``precomputed_*.npy`` whose file name
    does not contain ``current_slug`` is removed. Deletion is best-effort: a
    file that cannot be removed is reported on stdout and skipped, so one
    failure never aborts the cleanup.

    Args:
        current_slug: Slug of the dataset currently in use. Files whose name
            contains this string are kept.
    """
    if not CACHE_DIR.exists():
        return

    removed_count = 0
    for p in CACHE_DIR.glob("precomputed_*.npy"):
        # NOTE(review): this is a plain substring test, so a slug that is
        # contained in another dataset's slug (e.g. "data" vs "data_v2") keeps
        # that other dataset's files too. Tightening it requires knowing the
        # exact cache file naming scheme -- confirm against the code that
        # writes these files.
        if current_slug in p.name:
            continue
        try:
            p.unlink()
        except OSError as e:
            # Only filesystem errors are expected from unlink(); anything else
            # is a programming error and should surface.
            print(f"Error deleting {p.name}: {e}")
        else:
            removed_count += 1

    if removed_count > 0:
        print(f"Auto-cleanup: Removed {removed_count} old cache files.")
122
 
123
  # "Nice" default names we know from MOSAIC; NOT a hard constraint anymore
124
  ACCEPTABLE_TEXT_COLUMNS = [
 
477
  "After upload, you’ll be able to choose which text column to analyse."
478
  )
479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
  if up is not None:
482
  # List of encodings to try:
 
508
  # Optional: Print which encoding worked to the logs (for your info)
509
  print(f"Successfully loaded CSV using {success_encoding} encoding.")
510
 
511
+ # # Just save; we’ll choose the text column later
512
+ # uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
513
+ # tmp_df.to_csv(uploaded_csv_path, index=False)
514
+ # st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
515
+ # CSV_PATH = uploaded_csv_path
516
+
517
+ # FIX: Use the original filename to avoid cache collisions
518
+ # We sanitize the name to be safe for file systems
519
+ safe_filename = _slugify(os.path.splitext(up.name)[0])
520
+ _cleanup_old_cache(safe_filename)
521
+ uploaded_csv_path = str((PROC_DIR / f"{safe_filename}.csv").resolve())
522
+
523
  tmp_df.to_csv(uploaded_csv_path, index=False)
524
  st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
525
  CSV_PATH = uploaded_csv_path
 
940
  st.error(f"Failed to save JSONL: {e}")
941
 
942
  with cR:
943
+
944
+ # Create a Long Format DataFrame (One row per sentence)
945
+ # This ensures NO text is hidden due to Excel cell limits
946
+ long_format_df = doc_info.copy()
947
+ long_format_df["Topic Name"] = long_format_df["Topic"].map(llm_names).fillna("Unlabelled")
948
+
949
+ # Reorder columns for clarity
950
+ long_format_df = long_format_df[["Topic", "Topic Name", "Document"]]
951
+
952
+ # Define filename
953
+ long_csv_name = f"all_sentences_{base}_{gran}.csv"
954
+
955
  st.download_button(
956
+ "Download All Sentences (Long Format)",
957
+ data=long_format_df.to_csv(index=False).encode("utf-8-sig"),
958
+ file_name=long_csv_name,
959
  mime="text/csv",
960
  use_container_width=True,
961
+ help="Download a CSV with one row per sentence. Best for checking exactly which sentences belong to which topic."
962
  )
963
+
964
+ # st.download_button(
965
+ # "Download CSV",
966
+ # data=export_csv.to_csv(index=False).encode("utf-8-sig"),
967
+ # file_name=csv_name,
968
+ # mime="text/csv",
969
+ # use_container_width=True,
970
+ # )
971
+
972
+ # st.caption("Preview (one row per topic)")
973
  st.dataframe(export_csv)
974
 
975
  else: