# =============================================================================
# ADUC-SDR Video Suite — High-Perf Diffusers for 8× L40S (SM 8.9)
# CUDA 12.8 | PyTorch 2.8.0+cu128 | Ubuntu 22.04
# =============================================================================
FROM nvidia/cuda:12.8.0-devel-ubuntu22.04

LABEL maintainer="Carlos Rodrigues dos Santos & Development Partner"
LABEL description="High-performance Diffusers stack with FA2/SDPA, 8×L40S"
LABEL version="4.4.0"
LABEL cuda_version="12.8.0"
LABEL python_version="3.10"
LABEL pytorch_version="2.8.0+cu128"
LABEL gpu_optimized_for="8x_NVIDIA_L40S"

# ---------------- Core env & caches ----------------
ENV DEBIAN_FRONTEND=noninteractive TZ=UTC LANG=C.UTF-8 LC_ALL=C.UTF-8 \
    PYTHONUNBUFFERED=1 PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 PIP_DISABLE_PIP_VERSION_CHECK=1

# GPU/Compute
ENV NVIDIA_VISIBLE_DEVICES=all
ENV TORCH_CUDA_ARCH_LIST="8.9"
ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
ENV CUDA_DEVICE_MAX_CONNECTIONS=32

# Threads
ENV OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 MAX_JOBS=160

# Alloc/caches
ENV PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512,garbage_collection_threshold:0.8
ENV CUDA_LAUNCH_BLOCKING=0 CUDA_CACHE_MAXSIZE=2147483648 CUDA_CACHE_DISABLE=0

# App paths
ENV APP_HOME=/app
WORKDIR $APP_HOME

# Persistent data and caches in /data
ENV HF_HOME=/data/.cache/huggingface
ENV TORCH_HOME=/data/.cache/torch
ENV HF_DATASETS_CACHE=/data/.cache/datasets
ENV TRANSFORMERS_CACHE=/data/.cache/transformers
ENV DIFFUSERS_CACHE=/data/.cache/diffusers
ENV HF_HUB_ENABLE_HF_TRANSFER=1
ENV TOKENIZERS_PARALLELISM=false

# Create non-root user and data dirs early, fix ownership
RUN useradd -m -u 1000 -s /bin/bash appuser && \
    mkdir -p /data /data/models \
             /data/.cache/huggingface /data/.cache/torch \
             /data/.cache/datasets /data/.cache/transformers /data/.cache/diffusers && \
    chown -R appuser:appuser /data

# Models live in /data/models and are visible at /app/models
ENV MODELS_DIR=/data/models
RUN ln -sf /data/models /app/models

# ---------------- System & Python ----------------
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential gosu tree cmake git git-lfs curl wget ffmpeg ninja-build \
    python3.10 python3.10-dev python3.10-distutils python3-pip \
    ca-certificates libglib2.0-0 libgl1 \
 && apt-get clean && rm -rf /var/lib/apt/lists/*

RUN ln -sf /usr/bin/python3.10 /usr/bin/python3 && \
    ln -sf /usr/bin/python3.10 /usr/bin/python && \
    python3 -m pip install --upgrade pip

# ---------------- PyTorch cu128 (pinned) ----------------
RUN pip install --index-url https://download.pytorch.org/whl/cu128 \
    "torch==2.8.0+cu128" "torchvision==0.23.0+cu128" "torchaudio==2.8.0+cu128"
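
# Optional sanity check (a minimal sketch; uncomment if desired). Importing torch
# needs no GPU at build time, so this only verifies the cu128 wheel resolved.
# RUN python3 -c "import torch; print(torch.__version__, torch.version.cuda)"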

# ---------------- Toolchain, Triton, FA2 (no bnb build) ----------------
RUN pip install packaging ninja cmake pybind11 scikit-build cython hf_transfer "numpy>=1.24.4"

# Triton 3.x (no triton.ops)
RUN pip uninstall -y triton || true && \
    pip install -v --no-build-isolation triton==3.4.0
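
# Optional check (assumption: importing triton at build time works without a GPU);
# uncomment to fail the build early if the Triton wheel is broken.
# RUN python3 -c "import triton; print(triton.__version__)"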


# FlashAttention 2.8.x
RUN pip install flash-attn==2.8.3 --no-build-isolation || \
    pip install flash-attn==2.8.2 --no-build-isolation || \
    pip install flash-attn==2.8.1 --no-build-isolation || \
    pip install flash-attn==2.8.0.post2 --no-build-isolation
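
# Optional check; flash-attn loads its CUDA extension at import, so this is left
# commented in case the build host lacks a compatible driver/toolkit.
# RUN python3 -c "import flash_attn; print(flash_attn.__version__)"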

# ---------------- App dependencies ----------------
COPY requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# bitsandbytes (latest); pin a specific version here if CUDA/PTX mismatches appear
RUN pip install --upgrade bitsandbytes
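
# Optional diagnostic (assumes a visible GPU, so it is not run at build time):
# bitsandbytes ships a self-check module that prints its detected CUDA setup.
# RUN python3 -m bitsandbytes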

# Custom .whl (Apex + dropout_layer_norm)
RUN echo "Installing custom wheels..." && \
    pip install --no-cache-dir \
      "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/apex-0.1-cp310-cp310-linux_x86_64.whl" \
      "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/dropout_layer_norm-0.1-cp310-cp310-linux_x86_64.whl"

# ====================================================================
# q8_kernels wheel; optional LTX-Video below (enable if needed; ensure wheel ABI)
RUN pip install --no-cache-dir \
   "https://huggingface.co/euIaxs22/Aduc-sdr/resolve/main/q8_kernels-0.0.5-cp310-cp310-linux_x86_64.whl"
# RUN git clone https://github.com/Lightricks/LTX-Video.git /data/LTX-Video && \
#     cd /data/LTX-Video && python -m pip install -e .[inference]
# ====================================================================

# Scripts and app
COPY info.sh /app/info.sh
COPY builder.sh /app/builder.sh
COPY start.sh /app/start.sh
COPY entrypoint.sh /app/entrypoint.sh

# Copy the rest of the source last for better caching
COPY . .

# Permissions on app tree
RUN chown -R appuser:appuser /app /data && \
    chmod 0755 /app/entrypoint.sh /app/start.sh /app/info.sh /app/builder.sh

VOLUME /data

# ---------------- Entry ----------------
USER appuser
ENTRYPOINT ["/app/entrypoint.sh"]
CMD ["/app/start.sh"]