VladBoyko committed on
Commit 3b97453 · verified · Parent: 02f680e

Update app.py


- Added environment variables at the top to force the XFormers attention backend (sketched below)
- Reduced max_model_len from 40960 to 16384 (T4 stability)
- Added T4-specific vLLM flags (enforce_eager, disable_custom_all_reduce, enable_prefix_caching=False, max_num_seqs=1)
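For context, the override amounts to the pattern below. This is a minimal sketch, not the full app: the env-var names and engine flags are taken from the diff, the toy prompt is purely illustrative, and the variables are set before the vllm import as the conservative ordering (the engine reads them when it is constructed).

```python
import os

# Force the XFormers attention backend; FlashAttention is unavailable on T4.
os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

from vllm import LLM, SamplingParams

llm = LLM(
    model="WeiboAI/VibeThinker-1.5B",
    dtype="float16",        # T4 (compute capability 7.5) lacks bfloat16
    max_model_len=16384,
    enforce_eager=True,     # skip CUDA graph capture to save memory
    trust_remote_code=True,
)
out = llm.generate(["2+2="], SamplingParams(temperature=0.6, max_tokens=16))
print(out[0].outputs[0].text)
```

XFormers is the sensible fallback here because FlashAttention requires Ampere-class (SM 8.0+) GPUs, while the T4 is SM 7.5.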

Files changed (1): app.py (+20 -8)
app.py CHANGED

```diff
@@ -2,28 +2,40 @@ import gradio as gr
 import re
 from vllm import LLM, SamplingParams
 import spaces
+import os
+
+# Force XFormers backend for T4 GPU compatibility (prevent Triton compilation errors)
+os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"
+os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
 
 class VibeThinkerVLLM:
     def __init__(self, model_path="WeiboAI/VibeThinker-1.5B"):
         self.model_path = model_path
         print("Loading model with vLLM... This may take a minute.")
 
-        # T4 GPU compatible - using float16
+        # T4 GPU compatible - using float16 with XFormers backend
         self.model = LLM(
             model=self.model_path,
             dtype="float16",
-            gpu_memory_utilization=0.9,
-            max_model_len=40960,
-            trust_remote_code=True
+            gpu_memory_utilization=0.85,
+            max_model_len=16384,  # Reduced for T4 stability
+            trust_remote_code=True,
+            enforce_eager=True,  # Disable CUDA graphs to save memory
+            disable_custom_all_reduce=True,  # Prevent Triton compilation issues
+            enable_prefix_caching=False,  # Disable prefix caching (causes Triton issues on T4)
+            max_num_seqs=1,  # Process one sequence at a time for stability
         )
 
         print(f"Model loaded successfully with vLLM!")
-        print(f"Using dtype: float16 (T4 GPU compatible)")
+        print(f"Using dtype: float16 with XFormers backend (T4 GPU compatible)")
 
     @spaces.GPU
     def infer_text(self, prompt, temperature=0.6, max_tokens=8192, top_p=0.95):
         """Generate response with vLLM for faster inference"""
 
+        # Ensure max_tokens doesn't exceed model capacity
+        max_tokens = min(max_tokens, 16384)
+
         messages = [
             {"role": "user", "content": prompt}
         ]
@@ -295,7 +307,7 @@ with gr.Blocks(
     gr.Markdown("""
     # 🧠 VibeThinker-1.5B: Advanced Reasoning Interface
 
-    **⚡ Powered by vLLM** for 10-20x faster inference on T4 GPU!
+    **⚡ Powered by vLLM + XFormers** for 10x faster inference on T4 GPU!
 
     ### ✨ Features:
     - 🤔 **Collapsible Thinking Sections** - Explore the model's reasoning process
@@ -328,11 +340,11 @@ with gr.Blocks(
 
     max_tokens_slider = gr.Slider(
         minimum=512,
-        maximum=40960,
+        maximum=16384,  # Reduced for T4 stability
         value=8192,
         step=512,
         label="📏 Max Tokens",
-        info="Model supports up to 40,960 tokens"
+        info="Model supports up to 16,384 tokens (T4 optimized)"
     )
 
     top_p_slider = gr.Slider(
```
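One caveat the new `max_tokens = min(max_tokens, 16384)` clamp leaves open: in vLLM the prompt and the generated tokens share the `max_model_len` budget, so a long prompt can still trip a context-length error even with the clamp in place. A hypothetical tightening, not part of this commit, would budget generation against what the prompt leaves over, using vLLM's `get_tokenizer()` accessor for the underlying HF tokenizer:

```python
# Hypothetical refinement inside infer_text (not in this commit):
# clamp generation to the context left over after the prompt.
tokenizer = self.model.get_tokenizer()
prompt_len = len(tokenizer.encode(prompt))  # rough count; the chat template adds a few more tokens
max_tokens = max(1, min(max_tokens, 16384 - prompt_len))
```

Since `max_num_seqs=1` already serializes requests, the remaining failure mode on a T4 Space is length overflow rather than memory pressure, which is what a prompt-aware clamp like this would address.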