ming committed on
Commit a36f560
1 Parent(s): df75294

Switch V4 to GPU INT4 quantization with Qwen-1.5B

- Upgraded transformers to 4.44+ and accelerate to 0.33+
- Added bitsandbytes for 4-bit NF4 quantization on GPU
- Removed torchao dependency (causes HF Spaces errors)
- Added TRANSFORMERS_NO_TORCHAO=1 to prevent import errors
- Updated to use Qwen/Qwen2.5-1.5B-Instruct model
- GPU: 4-bit NF4 quantization via bitsandbytes
- CPU fallback: FP32 + dynamic INT8 quantization (a condensed sketch of the GPU/CPU selection logic follows below)
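
A condensed sketch of the selection logic this commit introduces (the full version is in the structured_summarizer.py diff below). The model ID is hardcoded here for illustration; the service actually reads the model ID, cache directory, and quantization flag from its settings object:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"  # illustrative; the service uses settings.v4_model_id

if torch.cuda.is_available():
    # GPU path: 4-bit NF4 weights via bitsandbytes, bfloat16 compute
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", quantization_config=quant_config
    )
else:
    # CPU fallback: load in FP32, then quantize Linear layers to dynamic INT8
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
    model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )

model.eval()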

Dockerfile CHANGED
@@ -1,14 +1,17 @@
- # Hugging Face Spaces compatible Dockerfile - V2 Only
  FROM python:3.9-slim

- # Set environment variables for V2-only deployment
  ENV PYTHONDONTWRITEBYTECODE=1 \
      PYTHONUNBUFFERED=1 \
      PYTHONPATH=/app \
      ENABLE_V1_WARMUP=false \
-     ENABLE_V2_WARMUP=true \
-     HF_MODEL_ID=sshleifer/distilbart-cnn-6-6 \
-     HF_HOME=/tmp/huggingface

  # Set work directory
  WORKDIR /app
@@ -38,5 +41,5 @@ EXPOSE 7860
  HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
      CMD curl -f http://localhost:7860/health || exit 1

- # Simple startup - V2 model will download during warmup
  CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
+ # Hugging Face Spaces compatible Dockerfile - V4 GPU INT4
  FROM python:3.9-slim

+ # Set environment variables for V4 GPU deployment
  ENV PYTHONDONTWRITEBYTECODE=1 \
      PYTHONUNBUFFERED=1 \
      PYTHONPATH=/app \
      ENABLE_V1_WARMUP=false \
+     ENABLE_V2_WARMUP=false \
+     ENABLE_V4_WARMUP=true \
+     V4_MODEL_ID=Qwen/Qwen2.5-1.5B-Instruct \
+     V4_ENABLE_QUANTIZATION=true \
+     HF_HOME=/tmp/huggingface \
+     TRANSFORMERS_NO_TORCHAO=1

  # Set work directory
  WORKDIR /app

  HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
      CMD curl -f http://localhost:7860/health || exit 1

+ # Simple startup - V4 model will download during warmup (with GPU INT4 quantization)
  CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
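
The V4_* variables above are consumed through the application's settings object (settings.v4_model_id, settings.v4_enable_quantization, settings.hf_cache_dir in the service below). A minimal sketch of how such fields could be declared with pydantic-settings, which is already in requirements.txt; the class name and defaults are illustrative, since the real config module is not part of this diff:

from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # Env vars map case-insensitively, so V4_MODEL_ID and V4_ENABLE_QUANTIZATION
    # from the Dockerfile ENV block populate these fields.
    v4_model_id: str = "Qwen/Qwen2.5-1.5B-Instruct"
    v4_enable_quantization: bool = True
    # HF_HOME is read by huggingface_hub itself; this default is shown for illustration only.
    hf_cache_dir: str = "/tmp/huggingface"

settings = Settings()
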
app/services/structured_summarizer.py CHANGED
@@ -1,5 +1,5 @@
  """
- V4 Structured Summarization Service using Phi-3 and TextIteratorStreamer.
  """

  import asyncio
@@ -23,12 +23,20 @@ except ImportError:
      TRANSFORMERS_AVAILABLE = False
      logger.warning("Transformers library not available. V4 endpoints will be disabled.")


  class StructuredSummarizer:
-     """Service for streaming structured summarization using Phi-3."""

      def __init__(self):
-         """Initialize the Phi-3 model and tokenizer."""
          self.tokenizer: Optional[AutoTokenizer] = None
          self.model: Optional[AutoModelForCausalLM] = None

@@ -46,40 +54,82 @@ class StructuredSummarizer:
              trust_remote_code=True,
          )

-         # Load model first (without quantization)
-         self.model = AutoModelForCausalLM.from_pretrained(
-             settings.v4_model_id,
-             torch_dtype=torch.float32, # Base dtype for CPU
-             device_map="auto",
-             cache_dir=settings.hf_cache_dir,
-             trust_remote_code=True,
-         )

-         # Apply post-loading quantization if enabled
-         quantization_enabled = False
-         if settings.v4_enable_quantization:
-             try:
-                 logger.info("Applying INT8 dynamic quantization to V4 model...")
-                 # Quantize all Linear layers to INT8
-                 self.model = torch.quantization.quantize_dynamic(
-                     self.model, {torch.nn.Linear}, dtype=torch.qint8
-                 )
-                 quantization_enabled = True
-                 logger.info(" INT8 dynamic quantization applied successfully")
-             except Exception as quant_error:
-                 logger.warning(
-                     f"⚠️ Quantization failed: {quant_error}. Using FP32 model instead."
-                 )
-                 quantization_enabled = False

          # Set model to eval mode
          self.model.eval()

          logger.info("✅ V4 model initialized successfully")
          logger.info(f" Model ID: {settings.v4_model_id}")
-         logger.info(
-             f" Quantization: {'INT8 (~4GB)' if quantization_enabled else 'None (FP32, ~15GB)'}"
-         )
          logger.info(f" Model device: {next(self.model.parameters()).device}")
          logger.info(f" Torch dtype: {next(self.model.parameters()).dtype}")
 
  """
+ V4 Structured Summarization Service using Qwen-1.5B.
  """

  import asyncio

      TRANSFORMERS_AVAILABLE = False
      logger.warning("Transformers library not available. V4 endpoints will be disabled.")

+ # Try bitsandbytes 4-bit config
+ try:
+     from transformers import BitsAndBytesConfig
+
+     HAS_BITSANDBYTES = True
+ except ImportError:
+     HAS_BITSANDBYTES = False
+

  class StructuredSummarizer:
+     """Service for streaming structured summarization using Qwen-1.5B."""

      def __init__(self):
+         """Initialize the Qwen model and tokenizer with GPU/INT4 when possible."""
          self.tokenizer: Optional[AutoTokenizer] = None
          self.model: Optional[AutoModelForCausalLM] = None

              trust_remote_code=True,
          )

+         # Decide device / quantization strategy
+         use_cuda = torch.cuda.is_available()
+         quantization_desc = "None"
+
+         if use_cuda:
+             logger.info("CUDA is available. Using GPU for V4 model.")
+         else:
+             logger.info("CUDA is NOT available. V4 model will run on CPU.")
+
+         # ------------------------------------------------------------------
+         # Preferred path: 4-bit NF4 on GPU via bitsandbytes
+         # ------------------------------------------------------------------
+         if (
+             use_cuda
+             and getattr(settings, "v4_enable_quantization", True)
+             and HAS_BITSANDBYTES
+         ):
+             logger.info("Applying 4-bit NF4 quantization (bitsandbytes) to V4 model...")
+             quant_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.bfloat16,
+                 bnb_4bit_quant_type="nf4",
+                 bnb_4bit_use_double_quant=True,
+             )
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 settings.v4_model_id,
+                 device_map="auto",
+                 quantization_config=quant_config,
+                 cache_dir=settings.hf_cache_dir,
+                 trust_remote_code=True,
+             )
+             quantization_desc = "4-bit NF4 (bitsandbytes, GPU)"
+
+         else:
+             # ------------------------------------------------------------------
+             # Fallback path:
+             #   - GPU without bitsandbytes -> FP16
+             #   - CPU -> FP32 + optional dynamic INT8
+             # ------------------------------------------------------------------
+             base_dtype = torch.float16 if use_cuda else torch.float32
+             logger.info(
+                 "Loading V4 model without 4-bit bitsandbytes. "
+                 f"Base dtype: {base_dtype}"
+             )
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 settings.v4_model_id,
+                 torch_dtype=base_dtype,
+                 device_map="auto" if use_cuda else None,
+                 cache_dir=settings.hf_cache_dir,
+                 trust_remote_code=True,
+             )
+
+             # Optional dynamic INT8 quantization on CPU
+             if getattr(settings, "v4_enable_quantization", True) and not use_cuda:
+                 try:
+                     logger.info("Applying dynamic INT8 quantization to V4 model on CPU...")
+                     self.model = torch.quantization.quantize_dynamic(
+                         self.model, {torch.nn.Linear}, dtype=torch.qint8
+                     )
+                     quantization_desc = "INT8 dynamic (CPU)"
+                 except Exception as quant_error:
+                     logger.warning(
+                         f"⚠️ CPU INT8 quantization failed: {quant_error}. Using base dtype instead."
+                     )
+                     quantization_desc = f"None ({base_dtype})"
+             else:
+                 quantization_desc = f"None ({base_dtype})"

          # Set model to eval mode
          self.model.eval()

          logger.info("✅ V4 model initialized successfully")
          logger.info(f" Model ID: {settings.v4_model_id}")
+         logger.info(f" Quantization: {quantization_desc}")
          logger.info(f" Model device: {next(self.model.parameters()).device}")
          logger.info(f" Torch dtype: {next(self.model.parameters()).dtype}")
 
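The streaming side of the service is not touched by this commit; the pre-V4 docstring mentioned TextIteratorStreamer, so the sketch below only illustrates how Qwen2.5-1.5B-Instruct is typically streamed with transformers. The prompt and generation parameters are placeholders, not values taken from the service:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

messages = [{"role": "user", "content": "Summarize: FastAPI is a Python web framework ..."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# generate() blocks, so it runs in a worker thread while the streamer is consumed.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs={"input_ids": input_ids, "streamer": streamer, "max_new_tokens": 256},
).start()

for chunk in streamer:
    print(chunk, end="", flush=True)  # the service would yield these chunks to the client
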
requirements.txt CHANGED
@@ -13,13 +13,13 @@ pydantic-settings>=2.0.0,<3.0.0
  python-dotenv>=0.19.0,<1.0.0

  # Transformers for fast summarization
- transformers>=4.41.0,<5.0.0 # Updated for Phi-3 support (V4)
  torch>=2.0.0,<3.0.0
  sentencepiece>=0.1.99,<0.3.0
- accelerate>=0.20.0,<1.0.0
- einops>=0.6.0,<1.0.0 # Required for Phi-3 architecture (V4)
  scipy>=1.10.0,<2.0.0 # Often needed for unquantized models (V4)
- torchao>=0.6.0 # CPU-optimized INT8 quantization for V4 (reduces memory 73%)

  # Testing
  pytest>=7.0.0,<8.0.0
 
  python-dotenv>=0.19.0,<1.0.0

  # Transformers for fast summarization
+ transformers>=4.44.0,<5.0.0 # Updated for Qwen-1.5B support (V4)
  torch>=2.0.0,<3.0.0
  sentencepiece>=0.1.99,<0.3.0
+ accelerate>=0.33.0,<1.0.0 # Required for GPU quantization (V4)
+ bitsandbytes>=0.44.0 # 4-bit NF4 quantization for GPU (V4)
+ einops>=0.6.0,<1.0.0 # Required for model architecture (V4)
  scipy>=1.10.0,<2.0.0 # Often needed for unquantized models (V4)

  # Testing
  pytest>=7.0.0,<8.0.0