NIKKI77 committed on
Commit
af56a8e
·
1 Parent(s): 48fba8f

Fix NLTK punkt_tab by setting NLTK_DATA and pre-downloading

Browse files
Files changed (1) hide show
  1. Dockerfile +21 -14
Dockerfile CHANGED
@@ -4,33 +4,40 @@ ENV DEBIAN_FRONTEND=noninteractive PIP_NO_CACHE_DIR=1
4
  WORKDIR /app
5
  ENV PYTHONPATH=/app/backend:$PYTHONPATH
6
 
7
- # OpenMP runtime (prevents missing libgomp)
8
  RUN apt-get update && apt-get install -y libgomp1 && rm -rf /var/lib/apt/lists/*
9
 
10
- # Writable caches + sane threads
11
  ENV HF_HOME=/app/.cache \
12
- XDG_CACHE_HOME=/app/.cache \
 
13
  SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence-transformers \
14
- OMP_NUM_THREADS=1
15
- RUN mkdir -p /app/.cache && chmod -R 777 /app/.cache
 
 
 
 
 
 
16
 
17
  COPY . .
18
 
19
- # Install GPU-ready deps
20
  RUN pip install -r requirements.txt
21
 
22
- # Preload spaCy + NLTK data so runtime doesn't download
23
  RUN python -m spacy download en_core_web_sm
24
  RUN python - <<'PY'
25
- import nltk
26
- nltk.download('punkt')
27
- nltk.download('punkt_tab') # NEW: required by NLTK 3.9+
28
- nltk.download('wordnet')
29
- nltk.download('omw-1.4')
30
  PY
31
 
32
- # HF Spaces uses port 7860
33
  EXPOSE 7860
34
 
35
- # Single worker + a few threads = nicer on GPU VRAM
36
  CMD ["gunicorn","-w","1","-k","gthread","--threads","4","-b","0.0.0.0:7860","backend.app:app"]
 
4
  WORKDIR /app
5
  ENV PYTHONPATH=/app/backend:$PYTHONPATH
6
 
7
+ # OpenMP runtime
8
  RUN apt-get update && apt-get install -y libgomp1 && rm -rf /var/lib/apt/lists/*
9
 
10
+ # Writable caches + thread caps + NLTK data path
11
  ENV HF_HOME=/app/.cache \
12
+ TRANSFORMERS_CACHE=/app/.cache/transformers \
13
+ HUGGINGFACE_HUB_CACHE=/app/.cache/huggingface \
14
  SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence-transformers \
15
+ XDG_CACHE_HOME=/app/.cache \
16
+ NLTK_DATA=/usr/local/nltk_data \
17
+ OMP_NUM_THREADS=1 \
18
+ OPENBLAS_NUM_THREADS=1 \
19
+ MKL_NUM_THREADS=1 \
20
+ NUMEXPR_NUM_THREADS=1
21
+
22
+ RUN mkdir -p /app/.cache "$NLTK_DATA" && chmod -R 777 /app/.cache "$NLTK_DATA"
23
 
24
  COPY . .
25
 
26
+ # Install deps (CUDA 12.1 torch per requirements.txt)
27
  RUN pip install -r requirements.txt
28
 
29
+ # Preload spaCy + NLTK data into the path NLTK will search
30
  RUN python -m spacy download en_core_web_sm
31
  RUN python - <<'PY'
32
+ import os, nltk
33
+ path = os.environ.get("NLTK_DATA", "/usr/local/nltk_data")
34
+ for pkg in ["punkt", "punkt_tab", "wordnet", "omw-1.4"]:
35
+     nltk.download(pkg, download_dir=path)
36
+ print("Downloaded NLTK data to:", path)
37
  PY
38
 
39
+ # HF Spaces port
40
  EXPOSE 7860
41
 
42
+ # Single worker + a few threads
43
  CMD ["gunicorn","-w","1","-k","gthread","--threads","4","-b","0.0.0.0:7860","backend.app:app"]