leejunhyeok committed
Commit fbfb8df · verified · 1 Parent(s): 390323a

add logitprocessor (#5)


- add logitprocessor (bc61b509afcba4c8bd3b7e6d63a5536433c1b030)
- Update README.md (519788840b2fad3b94ee4d2e6da976777e9673fd)
- Update README.md (47d28325cd0a7ff61a30f437ddabca4af5a142ab)
- Update README.md (3bcafecb45378ab073e22fdbb84c2afe595917da)
- remove unused dependency in logitproc (bd52fd36761b58eb1112dfc9d9214625003c2ee5)
- Update README.md (73e65dd8d9a4e2b9c053014ac56d05efe33e8d71)
- Update logit_processors/logit_.py (f19aee39ef4030eaa17c46f381b1865c33d9f4a7)
- update min value to logit dtype minval (5b8a4315f28d80f7dac9e8a29076a620e4dda7a3)
- Update logit_processors/logit_.py (4ea8a0479b708ee22d53ff8e3cd0e20bcb6e1979)
- change n-gram and ratio check order (d63f78d6b693b382085be092f911d758d04526be)
- Update README.md (c2ca4055343c7cd7e67dbc07a80b59eee08ce90f)

README.md CHANGED
@@ -43,4 +43,57 @@ This is a reasoning enhanced version of **Motif-2-12.7B-Instruct**. Detailed inf
 |LiveCodeBench v5 <br> (2024.10 - 2025.2)|-|50.03|65|
 |LiveCodeBench v5 |0-shot, CoT|61.66|60.1|
 |HumanEval|0-shot|93.2|93.2|
- |**Average**|-|**75.45**|**79.71**|
+ |**Average**|-|**75.45**|**79.71**|
+
+ ## How to use with vLLM
+ The [PR](https://github.com/vllm-project/vllm/pull/27396) adding support for the Motif model to the official vLLM package is currently under review.
+ In the meantime, to use our model with vLLM, please use the following container [image](https://github.com/motiftechnologies/vllm/pkgs/container/vllm).
+ Our model supports a sequence length of up to 64K tokens.
+ ```bash
+ # Run the vLLM API server
+ VLLM_ATTENTION_BACKEND=DIFFERENTIAL_FLASH_ATTN \
+ vllm serve Motif-Technologies/Motif-2-12.7B-Reasoning \
+ --trust-remote-code \
+ --max-model-len 65536 \
+ --tensor-parallel-size 8
+
+ # Send a request with curl
+ curl http://localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "What is the capital city of South Korea?"}
+ ],
+ "temperature": 0.6
+ }'
+ ```
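+
+ The server exposes an OpenAI-compatible API, so requests can also be sent from Python. Below is a minimal sketch using the `openai` client (installed with `pip install openai`); the question is only an example:
+ ```python
+ from openai import OpenAI
+
+ # Point the client at the local vLLM server; the API key is required by the client but unused.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ response = client.chat.completions.create(
+     model="Motif-Technologies/Motif-2-12.7B-Reasoning",
+     messages=[
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": "What is the capital city of South Korea?"},
+     ],
+     temperature=0.6,
+ )
+ print(response.choices[0].message.content)
+ ```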
+
+ ## How to use advanced vLLM options
+ For maximum performance, we highly recommend using the options below.
+ - ```--compilation_config '{"full_cuda_graph": true}'```: activates CUDA [full graph capture](https://docs.vllm.ai/en/stable/design/cuda_graphs/#cudagraphmodes)
+ - ```--rope-scaling '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":65536}'```: applies [YaRN](https://arxiv.org/abs/2309.00071) to support a 128K context length
+ - ```--enable-auto-tool-choice --tool-call-parser hermes```: enables [tool calling](https://docs.vllm.ai/en/latest/features/tool_calling/)
+ - ```--logits-processors logit_:WrappedPerReqLogitsProcessor```: enables a ratio-based thinking budget and repetition-based auto-stop. The model is guided to think for about ```(model_max_len - input_prompt_len) * VLLM_THINK_BUDGET_RATIO``` tokens and uses the rest of the context window to generate the response (see the sketch after this list)
+ - ```--reasoning-parser deepseek_r1```: parses [reasoning outputs](https://docs.vllm.ai/en/latest/features/reasoning_outputs/)
+
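+ To illustrate the budget arithmetic, here is a minimal sketch using the same constants as the serving command below (the prompt length is an arbitrary example value):
+ ```python
+ # Sketch of the thinking-budget split implemented by the logits processor.
+ model_max_len = 131072      # --max-model-len
+ ratio = 0.95                # VLLM_THINK_BUDGET_RATIO
+ min_answer_budget = 4096    # floor for the final answer, as in logit_.py
+ input_prompt_len = 2048     # example prompt length
+
+ # Tokens reserved for the answer after </think>.
+ response_budget = max(min_answer_budget, int((model_max_len - input_prompt_len) * (1 - ratio)))
+ # Tokens the model may spend thinking before </think> is forced.
+ think_budget = model_max_len - input_prompt_len - response_budget
+ print(response_budget, think_budget)  # 6451 122573
+ ```
+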
+ ```bash
+ # Download the bundled logits processor files from the model repo
+ pip install -U "huggingface_hub[cli]"
+ hf download Motif-Technologies/Motif-2-12.7B-Reasoning \
+ --include "logit_processors/*" \
+ --local-dir ./
+
+ # Make the logit_ module importable, then serve with the recommended options
+ export PYTHONPATH="$PWD/logit_processors"
+ VLLM_ATTENTION_BACKEND=DIFFERENTIAL_FLASH_ATTN \
+ VLLM_THINK_BUDGET_RATIO=0.95 \
+ vllm serve Motif-Technologies/Motif-2-12.7B-Reasoning \
+ --trust-remote-code \
+ --compilation_config '{"full_cuda_graph": true}' \
+ --rope-scaling '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":65536}' \
+ --max-model-len 131072 \
+ --tensor-parallel-size 8 \
+ --enable-auto-tool-choice \
+ --tool-call-parser hermes \
+ --logits-processors logit_:WrappedPerReqLogitsProcessor \
+ --reasoning-parser deepseek_r1
+ ```
logit_processors/__init__.py ADDED
File without changes
logit_processors/logit_.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ from collections import Counter
+ from typing import List, Optional
+
+ import torch
+ from vllm.config import VllmConfig
+ from vllm.sampling_params import SamplingParams
+ from vllm.v1.sample.logits_processor import (
+     AdapterLogitsProcessor,
+     RequestLogitsProcessor,
+ )
+
+ # Repetition detection: collect CHUNK_SIZE-token n-grams at every WINDOW_SIZE-token offset
+ # and force </think> once any n-gram repeats more than MAX_REPETITION_COUNT times.
+ CHUNK_SIZE = 16384
+ WINDOW_SIZE = 256
+ MAX_REPETITION_COUNT = 7
+
+
+ class ThinkLogitsProcessor:
+     def __init__(self, think_end_token: int = 219406, max_len: int = 131072, ratio: float = 0.95):
+         self.think_end_token = think_end_token
+         self.min_answer_budget = 4096
+         self.max_len = max_len
+         self.ratio = ratio
+         self.interval = 4096
+
+     def find_repeated_ngrams(self, input_ids, n=512):
+         """
+         input_ids: list of integer tokens
+         n: n-gram size
+         returns dict of {ngram_tuple: count} for n-grams repeated more than MAX_REPETITION_COUNT times
+         """
+         ngrams = [tuple(input_ids[i:i + n]) for i in range(0, len(input_ids) - n + 1, WINDOW_SIZE)]
+         freq = Counter(ngrams)
+         return {ng: c for ng, c in freq.items() if c > MAX_REPETITION_COUNT}
+
+     def __call__(
+         self,
+         prompt_token_ids: List[int],
+         past_token_ids: List[int],
+         logits: torch.Tensor,
+     ) -> torch.Tensor:
+         if self.think_end_token not in past_token_ids:
+             # Ratio-based thinking budget
+             tokens_since_think = len(past_token_ids)
+
+             response_budget = max(self.min_answer_budget, int((self.max_len - len(prompt_token_ids)) * (1 - self.ratio)))
+             remaining_budget = self.max_len - len(prompt_token_ids) - response_budget - tokens_since_think
+
+             if remaining_budget <= 0:
+                 # Thinking budget exhausted: mask every token except </think>
+                 logits = torch.full_like(logits, torch.finfo(logits.dtype).min)
+                 logits[self.think_end_token] = 1.0
+
+             # N-gram repetition check, run every `interval` generated tokens
+             elif len(past_token_ids) % self.interval == 0:
+                 # If repetition is detected, force </think>
+                 if self.find_repeated_ngrams(past_token_ids, n=CHUNK_SIZE):
+                     # Set all other logits to the dtype minimum except for </think>
+                     logits = torch.full_like(logits, torch.finfo(logits.dtype).min)
+                     logits[self.think_end_token] = 1.0
+
+         return logits
+
+
+ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+     def __init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool):
+         super().__init__(vllm_config, device, is_pin_memory)
+         self.model_max_len = vllm_config.model_config.max_model_len
+         assert self.model_max_len, "specify --max-model-len when using the think-budget logits processor"
+         self.ratio = float(os.environ.get("VLLM_THINK_BUDGET_RATIO", "0.0"))
+         assert 0 < self.ratio <= 1, "set the env var VLLM_THINK_BUDGET_RATIO to a value in (0.0, 1.0] when using the think-budget logits processor"
+
+     def is_argmax_invariant(self) -> bool:
+         return False
+
+     def new_req_logits_processor(
+         self,
+         params: SamplingParams,
+     ) -> Optional[RequestLogitsProcessor]:
+         return ThinkLogitsProcessor(max_len=self.model_max_len, ratio=self.ratio)
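
The added processor can also be sanity-checked outside a running server. A minimal sketch (assuming vLLM is installed so that `logit_` imports, with `PYTHONPATH` pointing at `logit_processors/` as in the README; the token IDs and lengths below are dummy values):

```python
import torch
from logit_ import ThinkLogitsProcessor

# Small illustrative setup: 100 prompt tokens, ratio 0.5, 8K context.
proc = ThinkLogitsProcessor(max_len=8192, ratio=0.5)
prompt = list(range(100))
logits = torch.zeros(219407)  # vocab must be large enough to index </think> (id 219406)

# 3,500 generated tokens: still within the thinking budget, logits pass through unchanged.
within_budget = proc(prompt, list(range(3500)), logits.clone())
print(int(within_budget.argmax()) == proc.think_end_token)  # False

# 4,000 generated tokens: budget exhausted, every logit except </think> is masked.
over_budget = proc(prompt, list(range(4000)), logits.clone())
print(int(over_budget.argmax()) == proc.think_end_token)  # True
```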