Update app.py
Added environment variables at top to force XFormers backend
Updated max_model_len from 40960 → 16384 (T4 stability)
Added T4-specific vLLM flags
app.py (CHANGED)
@@ -2,28 +2,40 @@ import gradio as gr
 import re
 from vllm import LLM, SamplingParams
 import spaces
+import os
+
+# Force XFormers backend for T4 GPU compatibility (prevent Triton compilation errors)
+os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
+os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
 
 class VibeThinkerVLLM:
     def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
         self.model_path = model_path
         print("Loading model with vLLM... This may take a minute.")
 
-        # T4 GPU compatible - using float16
+        # T4 GPU compatible - using float16 with XFormers backend
         self.model = LLM(
             model=self.model_path,
             dtype="float16",
-            gpu_memory_utilization=0.
-            max_model_len=40960,
-            trust_remote_code=True
+            gpu_memory_utilization=0.85,
+            max_model_len=16384,  # Reduced for T4 stability
+            trust_remote_code=True,
+            enforce_eager=True,  # Disable CUDA graphs to save memory
+            disable_custom_all_reduce=True,  # Prevent Triton compilation issues
+            enable_prefix_caching=False,  # Disable prefix caching (causes Triton issues on T4)
+            max_num_seqs=1,  # Process one sequence at a time for stability
         )
 
         print(f"Model loaded successfully with vLLM!")
-        print(f"Using dtype: float16 (T4 GPU compatible)")
+        print(f"Using dtype: float16 with XFormers backend (T4 GPU compatible)")
 
     @spaces.GPU
     def infer_text(self, prompt, temperature=0.6, max_tokens=8192, top_p=0.95):
         """Generate response with vLLM for faster inference"""
 
+        # Ensure max_tokens doesn't exceed model capacity
+        max_tokens = min(max_tokens, 16384)
+
         messages = [
             {"role": "user", "content": prompt}
         ]

@@ -295,7 +307,7 @@ with gr.Blocks(
     gr.Markdown("""
     # 🧠 VibeThinker-1.5B: Advanced Reasoning Interface
 
-    **⚡ Powered by vLLM** for
+    **⚡ Powered by vLLM + XFormers** for 10x faster inference on T4 GPU!
 
     ### ✨ Features:
     - 🤔 **Collapsible Thinking Sections** - Explore the model's reasoning process

@@ -328,11 +340,11 @@ with gr.Blocks(
 
     max_tokens_slider = gr.Slider(
         minimum=512,
-        maximum=
+        maximum=16384,  # Reduced for T4 stability
         value=8192,
         step=512,
         label="📏 Max Tokens",
-        info="Model supports up to
+        info="Model supports up to 16,384 tokens (T4 optimized)"
     )
 
     top_p_slider = gr.Slider(
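The diff truncates infer_text just after the messages list is built, so the actual generation call is not visible above. As a rough sketch only, here is how that step typically looks with vLLM's chat API, reusing the temperature/top_p/max_tokens parameters from the method signature; the generate helper below is illustrative, not the Space's actual code:

# Illustrative sketch only: the generation step that the truncated diff omits.
# LLM.chat applies the model's chat template, then samples with SamplingParams.
from vllm import LLM, SamplingParams

def generate(model: LLM, prompt: str, temperature: float = 0.6,
             max_tokens: int = 8192, top_p: float = 0.95) -> str:
    # Clamp to the reduced 16,384-token window, mirroring the diff's guard
    max_tokens = min(max_tokens, 16384)
    messages = [{"role": "user", "content": prompt}]
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
    )
    outputs = model.chat(messages, sampling_params)
    return outputs[0].outputs[0].text

One detail worth noting: the two VLLM_* variables are written to os.environ before the LLM engine is constructed, and vLLM reads them at engine initialization, so setting them in the Space's environment settings instead of in code should behave the same.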