Update app.py
app.py CHANGED
@@ -26,10 +26,17 @@ import json
 NLTK_DATA_DIR = "/usr/local/share/nltk_data"
 if NLTK_DATA_DIR not in nltk.data.path:
     nltk.data.path.append(NLTK_DATA_DIR)
-
-
-
-
+
+for resource in ("punkt_tab", "punkt"):
+    try:
+        nltk.data.find(f"tokenizers/{resource}")
+    except LookupError:
+        # Best-effort fallback; if HF blocks downloads, we still have the ones from the Docker build
+        try:
+            nltk.download(resource, download_dir=NLTK_DATA_DIR)
+        except Exception as e:
+            print(f"Could not download NLTK resource {resource}: {e}")
+


 try:
@@ -348,17 +355,17 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
     # Sentence / report granularity
     # ---------------------

-    # --- THIS BLOCK IS NOW MODIFIED ---
     if split_sentences:
         try:
             sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
             docs = [s for s in sentences if len(s.split()) > 2]
-        except LookupError:
-            st.error("NLTK
+        except LookupError as e:
+            st.error(f"NLTK tokenizer data not found: {e}")
             st.stop()
     else:
         docs = reports

+
     np.save(docs_file, np.array(docs, dtype=object))
     st.success(f"Prepared {len(docs)} documents")

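For a quick check outside the Space, the startup pattern added in the first hunk can be run as a standalone script. This is a minimal sketch, assuming nltk is installed and NLTK_DATA_DIR is writable; the sample sentence and the print statements are illustrative only and are not part of app.py.

# Standalone sketch of the resource check added in this commit (not part of app.py).
import nltk

NLTK_DATA_DIR = "/usr/local/share/nltk_data"  # same path as in the diff
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Verify the punkt tokenizers are resolvable; otherwise attempt a best-effort download.
for resource in ("punkt_tab", "punkt"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
        print(f"{resource}: already available")
    except LookupError:
        try:
            nltk.download(resource, download_dir=NLTK_DATA_DIR)
        except Exception as e:
            print(f"Could not download NLTK resource {resource}: {e}")

# With the tokenizer data in place, sent_tokenize no longer raises LookupError,
# which is what the second hunk relies on when split_sentences is enabled.
print(nltk.sent_tokenize("First sentence. Second sentence."))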