Clear stale cache and name the saved CSV after the uploaded file + download all topic sentences
app.py CHANGED
@@ -101,6 +101,24 @@ def _slugify(s: str) -> str:
     s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
     return s or "DATASET"
 
+def _cleanup_old_cache(current_slug: str):
+    """Deletes precomputed .npy files that do not match the current dataset slug."""
+    if not CACHE_DIR.exists():
+        return
+
+    removed_count = 0
+    # Iterate over all precomputed files
+    for p in CACHE_DIR.glob("precomputed_*.npy"):
+        # If the file belongs to a different dataset (doesn't contain the new slug)
+        if current_slug not in p.name:
+            try:
+                p.unlink()  # Delete file
+                removed_count += 1
+            except Exception as e:
+                print(f"Error deleting {p.name}: {e}")
+
+    if removed_count > 0:
+        print(f"Auto-cleanup: Removed {removed_count} old cache files.")
 
 # "Nice" default names we know from MOSAIC; NOT a hard constraint anymore
 ACCEPTABLE_TEXT_COLUMNS = [
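
Note that the slug test is a substring match on the file name, not an exact match, so a slug that is a prefix of another (e.g. "survey" vs "survey_2024") would also keep the other dataset's files. A minimal sketch of the rule with made-up file names (the `precomputed_*.npy` pattern is from the diff; the slugs are illustrative only):

```python
# Illustrative only: which files would _cleanup_old_cache("survey_2024") keep?
for name in ["precomputed_survey_2024_emb.npy", "precomputed_interviews_emb.npy"]:
    print(f"{name}: {'kept' if 'survey_2024' in name else 'deleted'}")
# -> precomputed_survey_2024_emb.npy: kept
# -> precomputed_interviews_emb.npy: deleted
```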
@@ -459,24 +477,6 @@ else:
         "After upload, you’ll be able to choose which text column to analyse."
     )
 
-    # if up is not None:
-    #     tmp_df = pd.read_csv(up)
-    #     if tmp_df.empty:
-    #         st.error("Uploaded CSV is empty.")
-    #         st.stop()
-
-    # if up is not None:
-    #     try:
-    #         # Try loading as standard UTF-8
-    #         tmp_df = pd.read_csv(up)
-    #     except UnicodeDecodeError:
-    #         # If that fails (e.g., Excel/Windows CSV), try ISO-8859-1 (Latin-1)
-    #         up.seek(0)  # Reset file pointer to the beginning
-    #         tmp_df = pd.read_csv(up, encoding='ISO-8859-1')
-
-    #     if tmp_df.empty:
-    #         st.error("Uploaded CSV is empty.")
-    #         st.stop()
 
     if up is not None:
         # List of encodings to try:
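
The hunk cuts off at the `# List of encodings to try:` comment, so the fallback loop itself sits outside the diff. A minimal sketch of the pattern that would produce `tmp_df` and `success_encoding` (the encoding list and loop body here are assumptions, not the file's actual code; `up` and `st` come from the surrounding app):

```python
import pandas as pd

encodings_to_try = ["utf-8", "utf-8-sig", "ISO-8859-1", "cp1252"]  # assumed list
tmp_df, success_encoding = None, None
for enc in encodings_to_try:
    try:
        up.seek(0)  # rewind the upload buffer before each attempt
        tmp_df = pd.read_csv(up, encoding=enc)
        success_encoding = enc
        break
    except (UnicodeDecodeError, pd.errors.ParserError):
        continue

if tmp_df is None or tmp_df.empty:
    st.error("Uploaded CSV is empty or could not be decoded.")
    st.stop()
```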
@@ -508,8 +508,18 @@ else:
         # Optional: Print which encoding worked to the logs (for your info)
         print(f"Successfully loaded CSV using {success_encoding} encoding.")
 
-        # Just save; we’ll choose the text column later
-        uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
+        # # Just save; we’ll choose the text column later
+        # uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
+        # tmp_df.to_csv(uploaded_csv_path, index=False)
+        # st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
+        # CSV_PATH = uploaded_csv_path
+
+        # FIX: Use the original filename to avoid cache collisions
+        # We sanitize the name to be safe for file systems
+        safe_filename = _slugify(os.path.splitext(up.name)[0])
+        _cleanup_old_cache(safe_filename)
+        uploaded_csv_path = str((PROC_DIR / f"{safe_filename}.csv").resolve())
+
         tmp_df.to_csv(uploaded_csv_path, index=False)
         st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
         CSV_PATH = uploaded_csv_path
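
Before this change every upload was written to the same `uploaded.csv`, so a second dataset silently reused cache entries keyed to the first; naming the saved CSV after the slugified original filename, and purging cache files for other slugs, closes that hole. Only the tail of `_slugify` is visible in the first hunk, but assuming no further transformations above the `re.sub`, an upload maps like this (the filename is made up):

```python
import os
import re

def _slugify(s: str) -> str:  # tail as shown in the hunk; any earlier steps omitted
    s = re.sub(r"[^A-Za-z0-9._-]+", "_", s)
    return s or "DATASET"

stem = os.path.splitext("My Survey (2024).csv")[0]
print(_slugify(stem))  # -> My_Survey_2024_
```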
@@ -930,16 +940,36 @@ else:
             st.error(f"Failed to save JSONL: {e}")
 
         with cR:
+
+            # Create a Long Format DataFrame (One row per sentence)
+            # This ensures NO text is hidden due to Excel cell limits
+            long_format_df = doc_info.copy()
+            long_format_df["Topic Name"] = long_format_df["Topic"].map(llm_names).fillna("Unlabelled")
+
+            # Reorder columns for clarity
+            long_format_df = long_format_df[["Topic", "Topic Name", "Document"]]
+
+            # Define filename
+            long_csv_name = f"all_sentences_{base}_{gran}.csv"
+
             st.download_button(
-                "Download CSV",
-                data=export_csv.to_csv(index=False).encode("utf-8-sig"),
-                file_name=csv_name,
+                "Download All Sentences (Long Format)",
+                data=long_format_df.to_csv(index=False).encode("utf-8-sig"),
+                file_name=long_csv_name,
                 mime="text/csv",
                 use_container_width=True,
+                help="Download a CSV with one row per sentence. Best for checking exactly which sentences belong to which topic."
             )
-
-
-
+
+            # st.download_button(
+            #     "Download CSV",
+            #     data=export_csv.to_csv(index=False).encode("utf-8-sig"),
+            #     file_name=csv_name,
+            #     mime="text/csv",
+            #     use_container_width=True,
+            # )
+
+            # st.caption("Preview (one row per topic)")
             st.dataframe(export_csv)
 
     else:
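
For reference, the new export is one row per sentence, with the topic id and its label repeated on every row; any topic with no entry in `llm_names` (e.g. BERTopic-style `-1` outliers) falls through to "Unlabelled" via `fillna`. A toy run with made-up data (the `Topic`/`Document` column names follow `doc_info` as used above):

```python
import pandas as pd

# Made-up stand-ins for doc_info and llm_names.
doc_info = pd.DataFrame({
    "Topic": [0, 0, 1, -1],
    "Document": ["Sentence A", "Sentence B", "Sentence C", "Sentence D"],
})
llm_names = {0: "Billing issues", 1: "Login problems"}

long_df = doc_info.copy()
long_df["Topic Name"] = long_df["Topic"].map(llm_names).fillna("Unlabelled")
print(long_df[["Topic", "Topic Name", "Document"]])
#    Topic      Topic Name    Document
# 0      0  Billing issues  Sentence A
# 1      0  Billing issues  Sentence B
# 2      1  Login problems  Sentence C
# 3     -1      Unlabelled  Sentence D
```

Encoding the download as `utf-8-sig` writes a byte-order mark, which is what lets Excel detect UTF-8 and render accented characters correctly when the file is opened directly.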