ming committed on
Commit a36f560
1 Parent(s): df75294

Switch V4 to GPU INT4 quantization with Qwen-1.5B

- Upgraded transformers to 4.44+ and accelerate to 0.33+
- Added bitsandbytes for 4-bit NF4 quantization on GPU
- Removed torchao dependency (causes HF Spaces errors)
- Added TRANSFORMERS_NO_TORCHAO=1 to prevent import errors
- Updated to use Qwen/Qwen2.5-1.5B-Instruct model
- GPU: 4-bit NF4 quantization via bitsandbytes
- CPU fallback: FP32 + dynamic INT8 quantization (a condensed sketch of the GPU/CPU selection logic follows below)
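
A condensed sketch of the selection logic this commit introduces (the full version is in the structured_summarizer.py diff below). The model ID is hardcoded here for illustration; the service actually reads the model ID, cache directory, and quantization flag from its settings object:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"  # illustrative; the service uses settings.v4_model_id

if torch.cuda.is_available():
    # GPU path: 4-bit NF4 weights via bitsandbytes, bfloat16 compute
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, device_map="auto", quantization_config=quant_config
    )
else:
    # CPU fallback: load in FP32, then quantize Linear layers to dynamic INT8
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
    model = torch.quantization.quantize_dynamic(
        model, {torch.nn.Linear}, dtype=torch.qint8
    )

model.eval()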

Dockerfile CHANGED
@@ -1,14 +1,17 @@
- # Hugging Face Spaces compatible Dockerfile - V2 Only
  FROM python:3.9-slim

- # Set environment variables for V2-only deployment
  ENV PYTHONDONTWRITEBYTECODE=1 \
      PYTHONUNBUFFERED=1 \
      PYTHONPATH=/app \
      ENABLE_V1_WARMUP=false \
-     ENABLE_V2_WARMUP=true \
-     HF_MODEL_ID=sshleifer/distilbart-cnn-6-6 \
-     HF_HOME=/tmp/huggingface

  # Set work directory
  WORKDIR /app
@@ -38,5 +41,5 @@ EXPOSE 7860
  HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
      CMD curl -f http://localhost:7860/health || exit 1

- # Simple startup - V2 model will download during warmup
  CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
 
+ # Hugging Face Spaces compatible Dockerfile - V4 GPU INT4
  FROM python:3.9-slim

+ # Set environment variables for V4 GPU deployment
  ENV PYTHONDONTWRITEBYTECODE=1 \
      PYTHONUNBUFFERED=1 \
      PYTHONPATH=/app \
      ENABLE_V1_WARMUP=false \
+     ENABLE_V2_WARMUP=false \
+     ENABLE_V4_WARMUP=true \
+     V4_MODEL_ID=Qwen/Qwen2.5-1.5B-Instruct \
+     V4_ENABLE_QUANTIZATION=true \
+     HF_HOME=/tmp/huggingface \
+     TRANSFORMERS_NO_TORCHAO=1

  # Set work directory
  WORKDIR /app

  HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
      CMD curl -f http://localhost:7860/health || exit 1

+ # Simple startup - V4 model will download during warmup (with GPU INT4 quantization)
  CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
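
The V4_* variables above are consumed through the application's settings object (settings.v4_model_id, settings.v4_enable_quantization, settings.hf_cache_dir in the service below). A minimal sketch of how such fields could be declared with pydantic-settings, which is already in requirements.txt; the class name and defaults are illustrative, since the real config module is not part of this diff:

from pydantic_settings import BaseSettings

class Settings(BaseSettings):
    # Env vars map case-insensitively, so V4_MODEL_ID and V4_ENABLE_QUANTIZATION
    # from the Dockerfile ENV block populate these fields.
    v4_model_id: str = "Qwen/Qwen2.5-1.5B-Instruct"
    v4_enable_quantization: bool = True
    # HF_HOME is read by huggingface_hub itself; this default is shown for illustration only.
    hf_cache_dir: str = "/tmp/huggingface"

settings = Settings()
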
app/services/structured_summarizer.py CHANGED
@@ -1,5 +1,5 @@
  """
- V4 Structured Summarization Service using Phi-3 and TextIteratorStreamer.
  """

  import asyncio
@@ -23,12 +23,20 @@ except ImportError:
      TRANSFORMERS_AVAILABLE = False
      logger.warning("Transformers library not available. V4 endpoints will be disabled.")


  class StructuredSummarizer:
-     """Service for streaming structured summarization using Phi-3."""

      def __init__(self):
-         """Initialize the Phi-3 model and tokenizer."""
          self.tokenizer: Optional[AutoTokenizer] = None
          self.model: Optional[AutoModelForCausalLM] = None

@@ -46,40 +54,82 @@ class StructuredSummarizer:
              trust_remote_code=True,
          )

-         # Load model first (without quantization)
-         self.model = AutoModelForCausalLM.from_pretrained(
-             settings.v4_model_id,
-             torch_dtype=torch.float32, # Base dtype for CPU
-             device_map="auto",
-             cache_dir=settings.hf_cache_dir,
-             trust_remote_code=True,
-         )

-         # Apply post-loading quantization if enabled
-         quantization_enabled = False
-         if settings.v4_enable_quantization:
-             try:
-                 logger.info("Applying INT8 dynamic quantization to V4 model...")
-                 # Quantize all Linear layers to INT8
-                 self.model = torch.quantization.quantize_dynamic(
-                     self.model, {torch.nn.Linear}, dtype=torch.qint8
-                 )
-                 quantization_enabled = True
-                 logger.info(" INT8 dynamic quantization applied successfully")
-             except Exception as quant_error:
-                 logger.warning(
-                     f"⚠️ Quantization failed: {quant_error}. Using FP32 model instead."
-                 )
-                 quantization_enabled = False

          # Set model to eval mode
          self.model.eval()

          logger.info("✅ V4 model initialized successfully")
          logger.info(f" Model ID: {settings.v4_model_id}")
-         logger.info(
-             f" Quantization: {'INT8 (~4GB)' if quantization_enabled else 'None (FP32, ~15GB)'}"
-         )
          logger.info(f" Model device: {next(self.model.parameters()).device}")
          logger.info(f" Torch dtype: {next(self.model.parameters()).dtype}")
 
  """
+ V4 Structured Summarization Service using Qwen-1.5B.
  """

  import asyncio

      TRANSFORMERS_AVAILABLE = False
      logger.warning("Transformers library not available. V4 endpoints will be disabled.")

+ # Try bitsandbytes 4-bit config
+ try:
+     from transformers import BitsAndBytesConfig
+
+     HAS_BITSANDBYTES = True
+ except ImportError:
+     HAS_BITSANDBYTES = False
+

  class StructuredSummarizer:
+     """Service for streaming structured summarization using Qwen-1.5B."""

      def __init__(self):
+         """Initialize the Qwen model and tokenizer with GPU/INT4 when possible."""
          self.tokenizer: Optional[AutoTokenizer] = None
          self.model: Optional[AutoModelForCausalLM] = None

              trust_remote_code=True,
          )

+         # Decide device / quantization strategy
+         use_cuda = torch.cuda.is_available()
+         quantization_desc = "None"
+
+         if use_cuda:
+             logger.info("CUDA is available. Using GPU for V4 model.")
+         else:
+             logger.info("CUDA is NOT available. V4 model will run on CPU.")
+
+         # ------------------------------------------------------------------
+         # Preferred path: 4-bit NF4 on GPU via bitsandbytes
+         # ------------------------------------------------------------------
+         if (
+             use_cuda
+             and getattr(settings, "v4_enable_quantization", True)
+             and HAS_BITSANDBYTES
+         ):
+             logger.info("Applying 4-bit NF4 quantization (bitsandbytes) to V4 model...")
+             quant_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.bfloat16,
+                 bnb_4bit_quant_type="nf4",
+                 bnb_4bit_use_double_quant=True,
+             )
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 settings.v4_model_id,
+                 device_map="auto",
+                 quantization_config=quant_config,
+                 cache_dir=settings.hf_cache_dir,
+                 trust_remote_code=True,
+             )
+             quantization_desc = "4-bit NF4 (bitsandbytes, GPU)"
+
+         else:
+             # ------------------------------------------------------------------
+             # Fallback path:
+             #   - GPU without bitsandbytes -> FP16
+             #   - CPU -> FP32 + optional dynamic INT8
+             # ------------------------------------------------------------------
+             base_dtype = torch.float16 if use_cuda else torch.float32
+             logger.info(
+                 "Loading V4 model without 4-bit bitsandbytes. "
+                 f"Base dtype: {base_dtype}"
+             )
+
+             self.model = AutoModelForCausalLM.from_pretrained(
+                 settings.v4_model_id,
+                 torch_dtype=base_dtype,
+                 device_map="auto" if use_cuda else None,
+                 cache_dir=settings.hf_cache_dir,
+                 trust_remote_code=True,
+             )
+
+             # Optional dynamic INT8 quantization on CPU
+             if getattr(settings, "v4_enable_quantization", True) and not use_cuda:
+                 try:
+                     logger.info("Applying dynamic INT8 quantization to V4 model on CPU...")
+                     self.model = torch.quantization.quantize_dynamic(
+                         self.model, {torch.nn.Linear}, dtype=torch.qint8
+                     )
+                     quantization_desc = "INT8 dynamic (CPU)"
+                 except Exception as quant_error:
+                     logger.warning(
+                         f"⚠️ CPU INT8 quantization failed: {quant_error}. Using base dtype instead."
+                     )
+                     quantization_desc = f"None ({base_dtype})"
+             else:
+                 quantization_desc = f"None ({base_dtype})"

          # Set model to eval mode
          self.model.eval()

          logger.info("✅ V4 model initialized successfully")
          logger.info(f" Model ID: {settings.v4_model_id}")
+         logger.info(f" Quantization: {quantization_desc}")
          logger.info(f" Model device: {next(self.model.parameters()).device}")
          logger.info(f" Torch dtype: {next(self.model.parameters()).dtype}")
 
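The streaming side of the service is not touched by this commit; the pre-V4 docstring mentioned TextIteratorStreamer, so the sketch below only illustrates how Qwen2.5-1.5B-Instruct is typically streamed with transformers. The prompt and generation parameters are placeholders, not values taken from the service:

from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

messages = [{"role": "user", "content": "Summarize: FastAPI is a Python web framework ..."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# generate() blocks, so it runs in a worker thread while the streamer is consumed.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs={"input_ids": input_ids, "streamer": streamer, "max_new_tokens": 256},
).start()

for chunk in streamer:
    print(chunk, end="", flush=True)  # the service would yield these chunks to the client
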
requirements.txt CHANGED
@@ -13,13 +13,13 @@ pydantic-settings>=2.0.0,<3.0.0
  python-dotenv>=0.19.0,<1.0.0

  # Transformers for fast summarization
- transformers>=4.41.0,<5.0.0 # Updated for Phi-3 support (V4)
  torch>=2.0.0,<3.0.0
  sentencepiece>=0.1.99,<0.3.0
- accelerate>=0.20.0,<1.0.0
- einops>=0.6.0,<1.0.0 # Required for Phi-3 architecture (V4)
  scipy>=1.10.0,<2.0.0 # Often needed for unquantized models (V4)
- torchao>=0.6.0 # CPU-optimized INT8 quantization for V4 (reduces memory 73%)

  # Testing
  pytest>=7.0.0,<8.0.0
 
  python-dotenv>=0.19.0,<1.0.0

  # Transformers for fast summarization
+ transformers>=4.44.0,<5.0.0 # Updated for Qwen-1.5B support (V4)
  torch>=2.0.0,<3.0.0
  sentencepiece>=0.1.99,<0.3.0
+ accelerate>=0.33.0,<1.0.0 # Required for GPU quantization (V4)
+ bitsandbytes>=0.44.0 # 4-bit NF4 quantization for GPU (V4)
+ einops>=0.6.0,<1.0.0 # Required for model architecture (V4)
  scipy>=1.10.0,<2.0.0 # Often needed for unquantized models (V4)

  # Testing
  pytest>=7.0.0,<8.0.0