# processing_opencua.py
import torch
from typing import List, Dict, Any, Union
from PIL import Image # noqa: F401
from transformers.processing_utils import ProcessorMixin, BatchFeature
from transformers import AutoTokenizer, AutoImageProcessor
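# String placeholder that stands in for an image/video in the chat-templated
# prompt; prepare_vllm_inputs() repeats it so the placeholder count matches
# each image's patch grid.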
PLACEHOLDER = "<|media_placeholder|>"
class OpenCUAProcessor(ProcessorMixin):
    """
    Lightweight processor that pairs the repo's custom TikTokenV3 tokenizer
    with Qwen2VLImageProcessor and exposes media token ids for vLLM.

    We intentionally keep __call__ minimal because vLLM doesn't require
    a full HF Processor pipeline at init time; it just needs the class
    to load cleanly and provide chat templating & media bookkeeping.
    """

    attributes = [
        "image_processor",
        "tokenizer",
        "image_token_id",
        "video_token_id",
        "merge_size",
        "image_token",
        "video_token",
    ]
    def __init__(
        self,
        image_processor,
        tokenizer,
        image_token_id: int = 151667,  # match your config.json
        video_token_id: int = 151668,  # match your config.json
        merge_size: int = 2,
        **kwargs,
    ):
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        # Media token ids (used by vLLM profiling & grids)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        # String placeholders (kept for template-time substitution)
        self.image_token = PLACEHOLDER
        self.video_token = PLACEHOLDER
        # Use the value baked into the image processor when available
        self.merge_size = getattr(image_processor, "merge_size", merge_size)
        # Pass through chat template if tokenizer carries one
        self.chat_template = getattr(tokenizer, "chat_template", None)
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        # Ensure we can import local custom code
        trust = kwargs.get("trust_remote_code", True)
        # Prefer the repo's TikTokenV3; fall back to AutoTokenizer if needed
        try:
            from tokenization_opencua import TikTokenV3
            tok = TikTokenV3.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )
        except Exception:
            tok = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )
        # Load the Qwen2VLImageProcessor as declared by preprocessor_config.json
        imgproc = AutoImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust,
        )
        # Allow overrides of the media token ids via kwargs (rare).
        # Defaults mirror __init__ so both paths agree with config.json.
        image_token_id = kwargs.pop("image_token_id", 151667)
        video_token_id = kwargs.pop("video_token_id", 151668)
        return cls(
            imgproc,
            tok,
            image_token_id=image_token_id,
            video_token_id=video_token_id,
            **kwargs,
        )
    def apply_chat_template(
        self,
        messages: List[Dict[str, Any]],
        **kwargs,
    ) -> Union[str, List[int]]:
        """
        Delegate to the tokenizer's chat template. Returns a string or token
        ids depending on the `tokenize` kwarg.
        """
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    # Minimal callable to satisfy HF/vLLM if the processor is ever invoked.
    def __call__(self, *args, **kwargs) -> BatchFeature:
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)
    # Helper for your own client code: expand PLACEHOLDER count to match the image grid.
    def prepare_vllm_inputs(
        self,
        messages: List[Dict[str, Any]],
        images: Union[Image.Image, Any, List[Union[Image.Image, Any]]],
        add_generation_prompt: bool = True,
    ):
        text = self.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
        )
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc.get("image_grid_thw", []))
        merge = getattr(self, "merge_size", 2)
        # Each image expands to (T * H * W) // merge**2 placeholder tokens.
        for thw in grid:
            num = int((thw[0] * thw[1] * thw[2]) // (merge ** 2))
            text = text.replace(PLACEHOLDER, PLACEHOLDER * max(1, num), 1)
        return text, images
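

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the processor). It shows one way the
# helper above could feed vLLM's offline API. The checkpoint path, message
# schema, and sampling settings below are assumptions; adapt them to the
# repo's actual chat template and your environment.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from PIL import Image as PILImage
    from vllm import LLM, SamplingParams

    MODEL_PATH = "./GTA1-32B-vllm"  # hypothetical local checkpoint directory

    processor = OpenCUAProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

    # Stand-in screenshot; replace with a real image.
    image = PILImage.new("RGB", (1920, 1080))

    # The exact message structure depends on the repo's chat template; this is
    # only a guess that places one media placeholder before the instruction.
    messages = [{"role": "user", "content": PLACEHOLDER + "\nClick the Save button."}]

    # Expand the placeholder to match the image grid, then hand off to vLLM.
    prompt, media = processor.prepare_vllm_inputs(messages, image)

    llm = LLM(model=MODEL_PATH, trust_remote_code=True)
    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": media}},
        SamplingParams(temperature=0.0, max_tokens=256),
    )
    print(outputs[0].outputs[0].text)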