leejunhyeok committed
Commit fbfb8df · verified · 1 Parent(s): 390323a

add logitprocessor (#5)


- add logitprocessor (bc61b509afcba4c8bd3b7e6d63a5536433c1b030)
- Update README.md (519788840b2fad3b94ee4d2e6da976777e9673fd)
- Update README.md (47d28325cd0a7ff61a30f437ddabca4af5a142ab)
- Update README.md (3bcafecb45378ab073e22fdbb84c2afe595917da)
- remove unused dependency in logitproc (bd52fd36761b58eb1112dfc9d9214625003c2ee5)
- Update README.md (73e65dd8d9a4e2b9c053014ac56d05efe33e8d71)
- Update logit_processors/logit_.py (f19aee39ef4030eaa17c46f381b1865c33d9f4a7)
- update min value to logit dtype minval (5b8a4315f28d80f7dac9e8a29076a620e4dda7a3)
- Update logit_processors/logit_.py (4ea8a0479b708ee22d53ff8e3cd0e20bcb6e1979)
- change n-gram and ratio check order (d63f78d6b693b382085be092f911d758d04526be)
- Update README.md (c2ca4055343c7cd7e67dbc07a80b59eee08ce90f)

README.md CHANGED
@@ -43,4 +43,57 @@ This is a reasoning enhanced version of **Motif-2-12.7B-Instruct**. Detailed inf
 |LiveCodeBench v5 <br> (2024.10 - 2025.2)|-|50.03|65|
 |LiveCodeBench v5 |0-shot, CoT|61.66|60.1|
 |HumanEval|0-shot|93.2|93.2|
- |**Average**|-|**75.45**|**79.71**|
+ |**Average**|-|**75.45**|**79.71**|
+
+ ## How to use with vLLM
+ The [PR](https://github.com/vllm-project/vllm/pull/27396) adding support for the Motif model to the official vLLM package is currently under review.
+ In the meantime, to use our model with vLLM, please use the following container [image](https://github.com/motiftechnologies/vllm/pkgs/container/vllm).
+ Our model supports a sequence length of up to 64K tokens.
+ ```bash
+ # Run the vLLM API server
+ VLLM_ATTENTION_BACKEND=DIFFERENTIAL_FLASH_ATTN \
+ vllm serve Motif-Technologies/Motif-2-12.7B-Reasoning \
+ --trust-remote-code \
+ --max-model-len 65536 \
+ --tensor-parallel-size 8
+
+ # Send a request with curl
+ curl http://localhost:8000/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": "What is the capital city of South Korea?"}
+ ],
+ "temperature": 0.6
+ }'
+ ```
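+
+ The server exposes an OpenAI-compatible API, so requests can also be sent from Python. Below is a minimal sketch using the `openai` client (installed with `pip install openai`); the question is only an example:
+ ```python
+ from openai import OpenAI
+
+ # Point the client at the local vLLM server; the API key is required by the client but unused.
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+ response = client.chat.completions.create(
+     model="Motif-Technologies/Motif-2-12.7B-Reasoning",
+     messages=[
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": "What is the capital city of South Korea?"},
+     ],
+     temperature=0.6,
+ )
+ print(response.choices[0].message.content)
+ ```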
+
+ ## How to use advanced vLLM options
+ For maximum performance, we highly recommend using the options below.
+ - ```--compilation_config '{"full_cuda_graph": true}'```: activates CUDA [full graph capture](https://docs.vllm.ai/en/stable/design/cuda_graphs/#cudagraphmodes)
+ - ```--rope-scaling '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":65536}'```: applies [YaRN](https://arxiv.org/abs/2309.00071) to support a 128K context length
+ - ```--enable-auto-tool-choice --tool-call-parser hermes```: enables [tool calling](https://docs.vllm.ai/en/latest/features/tool_calling/)
+ - ```--logits-processors logit_:WrappedPerReqLogitsProcessor```: enables a ratio-based thinking budget and repetition-based auto-stop. The model is guided to think for about ```(model_max_len - input_prompt_len) * VLLM_THINK_BUDGET_RATIO``` tokens and uses the rest of the context window to generate the response (see the sketch after this list)
+ - ```--reasoning-parser deepseek_r1```: parses [reasoning outputs](https://docs.vllm.ai/en/latest/features/reasoning_outputs/)
+
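+ To illustrate the budget arithmetic, here is a minimal sketch using the same constants as the serving command below (the prompt length is an arbitrary example value):
+ ```python
+ # Sketch of the thinking-budget split implemented by the logits processor.
+ model_max_len = 131072      # --max-model-len
+ ratio = 0.95                # VLLM_THINK_BUDGET_RATIO
+ min_answer_budget = 4096    # floor for the final answer, as in logit_.py
+ input_prompt_len = 2048     # example prompt length
+
+ # Tokens reserved for the answer after </think>.
+ response_budget = max(min_answer_budget, int((model_max_len - input_prompt_len) * (1 - ratio)))
+ # Tokens the model may spend thinking before </think> is forced.
+ think_budget = model_max_len - input_prompt_len - response_budget
+ print(response_budget, think_budget)  # 6451 122573
+ ```
+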
+ ```bash
+ # Download the bundled logits processor files from the model repo
+ pip install -U "huggingface_hub[cli]"
+ hf download Motif-Technologies/Motif-2-12.7B-Reasoning \
+ --include "logit_processors/*" \
+ --local-dir ./
+
+ # Make the logit_ module importable, then serve with the recommended options
+ export PYTHONPATH="$PWD/logit_processors"
+ VLLM_ATTENTION_BACKEND=DIFFERENTIAL_FLASH_ATTN \
+ VLLM_THINK_BUDGET_RATIO=0.95 \
+ vllm serve Motif-Technologies/Motif-2-12.7B-Reasoning \
+ --trust-remote-code \
+ --compilation_config '{"full_cuda_graph": true}' \
+ --rope-scaling '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":65536}' \
+ --max-model-len 131072 \
+ --tensor-parallel-size 8 \
+ --enable-auto-tool-choice \
+ --tool-call-parser hermes \
+ --logits-processors logit_:WrappedPerReqLogitsProcessor \
+ --reasoning-parser deepseek_r1
+ ```
logit_processors/__init__.py ADDED
File without changes
logit_processors/logit_.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ from collections import Counter
+ from typing import List, Optional
+
+ import torch
+ from vllm.config import VllmConfig
+ from vllm.sampling_params import SamplingParams
+ from vllm.v1.sample.logits_processor import (
+     AdapterLogitsProcessor,
+     RequestLogitsProcessor,
+ )
+
+ # Repetition detection: collect CHUNK_SIZE-token n-grams at every WINDOW_SIZE-token offset
+ # and force </think> once any n-gram repeats more than MAX_REPETITION_COUNT times.
+ CHUNK_SIZE = 16384
+ WINDOW_SIZE = 256
+ MAX_REPETITION_COUNT = 7
+
+
+ class ThinkLogitsProcessor:
+     def __init__(self, think_end_token: int = 219406, max_len: int = 131072, ratio: float = 0.95):
+         self.think_end_token = think_end_token
+         self.min_answer_budget = 4096
+         self.max_len = max_len
+         self.ratio = ratio
+         self.interval = 4096
+
+     def find_repeated_ngrams(self, input_ids, n=512):
+         """
+         input_ids: list of integer tokens
+         n: n-gram size
+         returns dict of {ngram_tuple: count} for n-grams repeated more than MAX_REPETITION_COUNT times
+         """
+         ngrams = [tuple(input_ids[i:i + n]) for i in range(0, len(input_ids) - n + 1, WINDOW_SIZE)]
+         freq = Counter(ngrams)
+         return {ng: c for ng, c in freq.items() if c > MAX_REPETITION_COUNT}
+
+     def __call__(
+         self,
+         prompt_token_ids: List[int],
+         past_token_ids: List[int],
+         logits: torch.Tensor,
+     ) -> torch.Tensor:
+         if self.think_end_token not in past_token_ids:
+             # Ratio-based thinking budget
+             tokens_since_think = len(past_token_ids)
+
+             response_budget = max(self.min_answer_budget, int((self.max_len - len(prompt_token_ids)) * (1 - self.ratio)))
+             remaining_budget = self.max_len - len(prompt_token_ids) - response_budget - tokens_since_think
+
+             if remaining_budget <= 0:
+                 # Thinking budget exhausted: mask every token except </think>
+                 logits = torch.full_like(logits, torch.finfo(logits.dtype).min)
+                 logits[self.think_end_token] = 1.0
+
+             # N-gram repetition check, run every `interval` generated tokens
+             elif len(past_token_ids) % self.interval == 0:
+                 # If repetition is detected, force </think>
+                 if self.find_repeated_ngrams(past_token_ids, n=CHUNK_SIZE):
+                     # Set all other logits to the dtype minimum except for </think>
+                     logits = torch.full_like(logits, torch.finfo(logits.dtype).min)
+                     logits[self.think_end_token] = 1.0
+
+         return logits
+
+
+ class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+     def __init__(self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool):
+         super().__init__(vllm_config, device, is_pin_memory)
+         self.model_max_len = vllm_config.model_config.max_model_len
+         assert self.model_max_len, "specify --max-model-len when using the think-budget logits processor"
+         self.ratio = float(os.environ.get("VLLM_THINK_BUDGET_RATIO", "0.0"))
+         assert 0 < self.ratio <= 1, "set the env var VLLM_THINK_BUDGET_RATIO to a value in (0.0, 1.0] when using the think-budget logits processor"
+
+     def is_argmax_invariant(self) -> bool:
+         return False
+
+     def new_req_logits_processor(
+         self,
+         params: SamplingParams,
+     ) -> Optional[RequestLogitsProcessor]:
+         return ThinkLogitsProcessor(max_len=self.model_max_len, ratio=self.ratio)
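
The added processor can also be sanity-checked outside a running server. A minimal sketch (assuming vLLM is installed so that `logit_` imports, with `PYTHONPATH` pointing at `logit_processors/` as in the README; the token IDs and lengths below are dummy values):

```python
import torch
from logit_ import ThinkLogitsProcessor

# Small illustrative setup: 100 prompt tokens, ratio 0.5, 8K context.
proc = ThinkLogitsProcessor(max_len=8192, ratio=0.5)
prompt = list(range(100))
logits = torch.zeros(219407)  # vocab must be large enough to index </think> (id 219406)

# 3,500 generated tokens: still within the thinking budget, logits pass through unchanged.
within_budget = proc(prompt, list(range(3500)), logits.clone())
print(int(within_budget.argmax()) == proc.think_end_token)  # False

# 4,000 generated tokens: budget exhausted, every logit except </think> is masked.
over_budget = proc(prompt, list(range(4000)), logits.clone())
print(int(over_budget.argmax()) == proc.think_end_token)  # True
```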