VladBoyko committed on
Commit 3b97453 · verified · Parent: 02f680e

Update app.py


- Added environment variables at the top to force the XFormers attention backend (sketched below)
- Reduced max_model_len from 40960 to 16384 (T4 stability)
- Added T4-specific vLLM flags (enforce_eager, disable_custom_all_reduce, enable_prefix_caching=False, max_num_seqs=1)
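For context, the override amounts to the pattern below. This is a minimal sketch, not the full app: the env-var names and engine flags are taken from the diff, the toy prompt is purely illustrative, and the variables are set before the vllm import as the conservative ordering (the engine reads them when it is constructed).

```python
import os

# Force the XFormers attention backend; FlashAttention is unavailable on T4.
os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

from vllm import LLM, SamplingParams

llm = LLM(
    model="WeiboAI/VibeThinker-1.5B",
    dtype="float16",        # T4 (compute capability 7.5) lacks bfloat16
    max_model_len=16384,
    enforce_eager=True,     # skip CUDA graph capture to save memory
    trust_remote_code=True,
)
out = llm.generate(["2+2="], SamplingParams(temperature=0.6, max_tokens=16))
print(out[0].outputs[0].text)
```

XFormers is the sensible fallback here because FlashAttention requires Ampere-class (SM 8.0+) GPUs, while the T4 is SM 7.5.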

Files changed (1): app.py (+20 -8)
app.py CHANGED

```diff
@@ -2,28 +2,40 @@ import gradio as gr
 import re
 from vllm import LLM, SamplingParams
 import spaces
+import os
+
+# Force XFormers backend for T4 GPU compatibility (prevent Triton compilation errors)
+os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
+os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
 
 class VibeThinkerVLLM:
     def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
         self.model_path = model_path
         print("Loading model with vLLM... This may take a minute.")
 
-        # T4 GPU compatible - using float16
+        # T4 GPU compatible - using float16 with XFormers backend
         self.model = LLM(
             model=self.model_path,
             dtype="float16",
-            gpu_memory_utilization=0.9,
-            max_model_len=40960,
-            trust_remote_code=True
+            gpu_memory_utilization=0.85,
+            max_model_len=16384,  # Reduced for T4 stability
+            trust_remote_code=True,
+            enforce_eager=True,  # Disable CUDA graphs to save memory
+            disable_custom_all_reduce=True,  # Prevent Triton compilation issues
+            enable_prefix_caching=False,  # Disable prefix caching (causes Triton issues on T4)
+            max_num_seqs=1,  # Process one sequence at a time for stability
         )
 
         print(f"Model loaded successfully with vLLM!")
-        print(f"Using dtype: float16 (T4 GPU compatible)")
+        print(f"Using dtype: float16 with XFormers backend (T4 GPU compatible)")
 
     @spaces.GPU
     def infer_text(self, prompt, temperature=0.6, max_tokens=8192, top_p=0.95):
         """Generate response with vLLM for faster inference"""
 
+        # Ensure max_tokens doesn't exceed model capacity
+        max_tokens = min(max_tokens, 16384)
+
         messages = [
             {"role": "user", "content": prompt}
         ]
@@ -295,7 +307,7 @@ with gr.Blocks(
     gr.Markdown("""
     # 🧠 VibeThinker-1.5B: Advanced Reasoning Interface
 
-    **⚡ Powered by vLLM** for 10-20x faster inference on T4 GPU!
+    **⚡ Powered by vLLM + XFormers** for 10x faster inference on T4 GPU!
 
     ### ✨ Features:
     - 🤔 **Collapsible Thinking Sections** - Explore the model's reasoning process
@@ -328,11 +340,11 @@ with gr.Blocks(
 
     max_tokens_slider = gr.Slider(
         minimum=512,
-        maximum=40960,
+        maximum=16384,  # Reduced for T4 stability
         value=8192,
         step=512,
         label="📏 Max Tokens",
-        info="Model supports up to 40,960 tokens"
+        info="Model supports up to 16,384 tokens (T4 optimized)"
     )
 
     top_p_slider = gr.Slider(
```
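One caveat the new `max_tokens = min(max_tokens, 16384)` clamp leaves open: in vLLM the prompt and the generated tokens share the `max_model_len` budget, so a long prompt can still trip a context-length error even with the clamp in place. A hypothetical tightening, not part of this commit, would budget generation against what the prompt leaves over, using vLLM's `get_tokenizer()` accessor for the underlying HF tokenizer:

```python
# Hypothetical refinement inside infer_text (not in this commit):
# clamp generation to the context left over after the prompt.
tokenizer = self.model.get_tokenizer()
prompt_len = len(tokenizer.encode(prompt))  # rough count; the chat template adds a few more tokens
max_tokens = max(1, min(max_tokens, 16384 - prompt_len))
```

Since `max_num_seqs=1` already serializes requests, the remaining failure mode on a T4 Space is length overflow rather than memory pressure, which is what a prompt-aware clamp like this would address.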