romybeaute committed on
Commit
ba0ad54
·
verified ·
1 Parent(s): 5ca5039

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -7
app.py CHANGED
@@ -26,10 +26,17 @@ import json
26
  NLTK_DATA_DIR = "/usr/local/share/nltk_data"
27
  if NLTK_DATA_DIR not in nltk.data.path:
28
  nltk.data.path.append(NLTK_DATA_DIR)
29
- try:
30
- nltk.data.find("tokenizers/punkt")
31
- except LookupError:
32
- nltk.download("punkt", download_dir=NLTK_DATA_DIR)
 
 
 
 
 
 
 
33
 
34
 
35
  try:
@@ -348,17 +355,17 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
348
  # Sentence / report granularity
349
  # ---------------------
350
 
351
- # --- THIS BLOCK IS NOW MODIFIED ---
352
  if split_sentences:
353
  try:
354
  sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
355
  docs = [s for s in sentences if len(s.split()) > 2]
356
- except LookupError:
357
- st.error("NLTK 'punkt' data not found! This is a build error.")
358
  st.stop()
359
  else:
360
  docs = reports
361
 
 
362
  np.save(docs_file, np.array(docs, dtype=object))
363
  st.success(f"Prepared {len(docs)} documents")
364
 
 
26
  NLTK_DATA_DIR = "/usr/local/share/nltk_data"
27
  if NLTK_DATA_DIR not in nltk.data.path:
28
  nltk.data.path.append(NLTK_DATA_DIR)
29
+
30
+ for resource in ("punkt_tab", "punkt"):
31
+ try:
32
+ nltk.data.find(f"tokenizers/{resource}")
33
+ except LookupError:
34
+ # Best-effort fallback; if HF blocks downloads, we still have the ones from the Docker build
35
+ try:
36
+ nltk.download(resource, download_dir=NLTK_DATA_DIR)
37
+ except Exception as e:
38
+ print(f"Could not download NLTK resource {resource}: {e}")
39
+
40
 
41
 
42
  try:
 
355
  # Sentence / report granularity
356
  # ---------------------
357
 
 
358
  if split_sentences:
359
  try:
360
  sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
361
  docs = [s for s in sentences if len(s.split()) > 2]
362
+ except LookupError as e:
363
+ st.error(f"NLTK tokenizer data not found: {e}")
364
  st.stop()
365
  else:
366
  docs = reports
367
 
368
+
369
  np.save(docs_file, np.array(docs, dtype=object))
370
  st.success(f"Prepared {len(docs)} documents")
371