import torch

from typing import Any, Dict, List, Union

from PIL import Image
from transformers import AutoImageProcessor, AutoTokenizer
from transformers.processing_utils import BatchFeature, ProcessorMixin

PLACEHOLDER = "<|media_placeholder|>"


class OpenCUAProcessor(ProcessorMixin):
    """
    Lightweight processor that pairs the repo's custom TikTokenV3 tokenizer
    with Qwen2VLImageProcessor and exposes media token ids for vLLM.

    We intentionally keep __call__ minimal because vLLM doesn't require
    a full HF Processor pipeline at init time; it just needs the class
    to load cleanly and provide chat templating & media bookkeeping.
    """

    # ProcessorMixin expects `attributes` to name sub-processor objects with
    # their own save/load logic; scalar settings (token ids, merge size,
    # placeholder strings) are plain instance attributes set in __init__.
    attributes = ["image_processor", "tokenizer"]

    def __init__(
        self,
        image_processor,
        tokenizer,
        image_token_id: int = 151667,
        video_token_id: int = 151668,
        merge_size: int = 2,
        **kwargs,
    ):
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Token ids vLLM uses to map placeholder positions to media inputs.
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        # A single textual placeholder is shared by images and videos.
        self.image_token = PLACEHOLDER
        self.video_token = PLACEHOLDER

        # Prefer the image processor's own merge_size when it defines one.
        self.merge_size = getattr(image_processor, "merge_size", merge_size)

        # Surface the tokenizer's chat template for apply_chat_template.
        self.chat_template = getattr(tokenizer, "chat_template", None)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        trust = kwargs.get("trust_remote_code", True)

        # Prefer the repo's custom TikTokenV3 tokenizer; fall back to
        # AutoTokenizer when the module isn't importable.
        try:
            from tokenization_opencua import TikTokenV3

            tok = TikTokenV3.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )
        except Exception:
            tok = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )

        imgproc = AutoImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust,
        )

        # Keep these defaults in sync with __init__ (151667/151668).
        image_token_id = kwargs.pop("image_token_id", 151667)
        video_token_id = kwargs.pop("video_token_id", 151668)

        return cls(
            imgproc,
            tok,
            image_token_id=image_token_id,
            video_token_id=video_token_id,
            **kwargs,
        )

    def apply_chat_template(
        self,
        messages: List[Dict[str, Any]],
        **kwargs,
    ) -> Union[str, List[int]]:
        """
        Delegate to the tokenizer's chat template. Returns a rendered string
        or token ids depending on the ``tokenize`` kwarg.
        """
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    def __call__(self, *args, **kwargs) -> BatchFeature:
        # Minimal stub: vLLM does its own tokenization and multimodal
        # processing, so a well-formed BatchFeature is all that's needed.
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)

    def prepare_vllm_inputs(
        self,
        messages: List[Dict[str, Any]],
        images: Union[Image.Image, Any, List[Union[Image.Image, Any]]],
        add_generation_prompt: bool = True,
    ):
        """
        Render the chat template and expand each media placeholder to the
        number of visual tokens its image occupies after spatial merging.
        """
        text = self.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
        )
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc.get("image_grid_thw", []))
        merge = getattr(self, "merge_size", 2)

        # Each image yields t*h*w patches, collapsed by merge_size**2 during
        # spatial merging (e.g. a 1x28x28 grid with merge_size 2 becomes
        # 784 // 4 = 196 tokens). Expanding through a temporary marker keeps
        # str.replace(..., 1) from re-matching placeholders just inserted
        # for an earlier image.
        for thw in grid:
            num = int((thw[0] * thw[1] * thw[2]) // (merge**2))
            text = text.replace(PLACEHOLDER, "<|tmp|>" * max(1, num), 1)
        text = text.replace("<|tmp|>", PLACEHOLDER)

        return text, images
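

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): shows how prepare_vllm_inputs is meant to
# feed vLLM. Assumes a local OpenCUA checkpoint and a vLLM build with
# multimodal support; MODEL_PATH and "screenshot.png" are hypothetical
# placeholders, not files shipped with this repo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from vllm import LLM, SamplingParams

    MODEL_PATH = "path/to/opencua-checkpoint"  # hypothetical local path

    processor = OpenCUAProcessor.from_pretrained(MODEL_PATH)
    image = Image.open("screenshot.png")  # hypothetical input image

    messages = [
        {
            "role": "user",
            "content": f"{PLACEHOLDER}\nDescribe what is on the screen.",
        }
    ]
    prompt, media = processor.prepare_vllm_inputs(messages, image)

    # vLLM accepts a dict prompt carrying multi_modal_data alongside the text.
    llm = LLM(model=MODEL_PATH, trust_remote_code=True)
    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": media}},
        SamplingParams(max_tokens=256),
    )
    print(outputs[0].outputs[0].text)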