NIKKI77 committed
Commit e60c7a5 · 1 Parent(s): af56a8e

Fix NLTK punkt_tab by setting NLTK_DATA and pre-downloading

Files changed (1):
  1. Dockerfile +29 -15
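Context for the fix: recent NLTK (3.9+) resolves sentence tokenization through the punkt_tab resource, which the Space could not find at runtime. A minimal sanity check, my sketch rather than part of the commit, run inside the built image with the NLTK_DATA path this Dockerfile sets:

    import nltk

    # NLTK_DATA=/usr/local/nltk_data is exported in the image, so
    # nltk.data.path already contains it on import.
    print(nltk.data.path)

    # Raises LookupError if punkt_tab was not baked in at build time.
    print(nltk.data.find("tokenizers/punkt_tab/english/"))

    # sent_tokenize is the call that triggers the punkt_tab lookup.
    from nltk.tokenize import sent_tokenize
    print(sent_tokenize("First sentence. Second sentence."))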
Dockerfile CHANGED
@@ -4,40 +4,54 @@ ENV DEBIAN_FRONTEND=noninteractive PIP_NO_CACHE_DIR=1
 WORKDIR /app
 ENV PYTHONPATH=/app/backend:$PYTHONPATH
 
-# OpenMP runtime
+# OpenMP + caches
 RUN apt-get update && apt-get install -y libgomp1 && rm -rf /var/lib/apt/lists/*
 
-# Writable caches + thread caps + NLTK data path
+# Writable caches + NLTK path + thread caps
 ENV HF_HOME=/app/.cache \
-    TRANSFORMERS_CACHE=/app/.cache/transformers \
     HUGGINGFACE_HUB_CACHE=/app/.cache/huggingface \
     SENTENCE_TRANSFORMERS_HOME=/app/.cache/sentence-transformers \
     XDG_CACHE_HOME=/app/.cache \
     NLTK_DATA=/usr/local/nltk_data \
-    OMP_NUM_THREADS=1 \
-    OPENBLAS_NUM_THREADS=1 \
-    MKL_NUM_THREADS=1 \
-    NUMEXPR_NUM_THREADS=1
-
+    OMP_NUM_THREADS=1 OPENBLAS_NUM_THREADS=1 MKL_NUM_THREADS=1 NUMEXPR_NUM_THREADS=1
 RUN mkdir -p /app/.cache "$NLTK_DATA" && chmod -R 777 /app/.cache "$NLTK_DATA"
 
 COPY . .
 
-# Install deps (CUDA 12.1 torch per requirements.txt)
+# Install deps (CUDA 12.1 wheel for Torch is in requirements.txt)
 RUN pip install -r requirements.txt
 
-# Preload spaCy + NLTK data into the path NLTK will search
+# Preload spaCy + NLTK
 RUN python -m spacy download en_core_web_sm
 RUN python - <<'PY'
 import os, nltk
 path = os.environ.get("NLTK_DATA", "/usr/local/nltk_data")
-for pkg in ["punkt", "punkt_tab", "wordnet", "omw-1.4"]:
+for pkg in ("punkt","punkt_tab","wordnet","omw-1.4"):
     nltk.download(pkg, download_dir=path)
-print("Downloaded NLTK data to:", path)
+print("Downloaded NLTK to", path)
+PY
+
+# ---------- PRE-DOWNLOAD MODELS TO AVOID BOOT TIMEOUT ----------
+RUN python - <<'PY'
+import os
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+from sentence_transformers import SentenceTransformer
+
+cache = os.environ.get("HF_HOME", "/app/.cache")
+
+# Oliver Guhr punctuation model
+m_punc = "oliverguhr/fullstop-punctuation-multilang-large"
+AutoTokenizer.from_pretrained(m_punc, cache_dir=cache)
+AutoModelForTokenClassification.from_pretrained(m_punc, cache_dir=cache)
+
+# SBERT encoder
+m_sbert = "sentence-transformers/all-MiniLM-L6-v2"
+SentenceTransformer(m_sbert, cache_folder=os.path.join(cache, "sentence-transformers"))
+print("Pre-downloaded:", m_punc, "and", m_sbert)
 PY
+# ---------------------------------------------------------------
 
-# HF Spaces port
 EXPOSE 7860
 
-# Single worker + a few threads
-CMD ["gunicorn","-w","1","-k","gthread","--threads","4","-b","0.0.0.0:7860","backend.app:app"]
+# Give startup more time (models/GPU init) and keep single worker
+CMD ["gunicorn","-w","1","-k","gthread","--threads","4","--timeout","300","-b","0.0.0.0:7860","backend.app:app"]