# processing_opencua.py
import torch
from typing import List, Dict, Any, Union
from PIL import Image
from transformers.processing_utils import ProcessorMixin, BatchFeature
from transformers import AutoTokenizer, AutoImageProcessor

PLACEHOLDER = "<|media_placeholder|>"

class OpenCUAProcessor(ProcessorMixin):
    """
    Lightweight processor that pairs the repo's custom TikTokenV3 tokenizer
    with Qwen2VLImageProcessor and exposes media token ids for vLLM.

    We intentionally keep __call__ minimal because vLLM doesn't require
    a full HF Processor pipeline at init time; it just needs the class
    to load cleanly and provide chat templating & media bookkeeping.
    """
    # ProcessorMixin expects `attributes` to list only the sub-processors it
    # knows how to save/load; scalar fields such as the media token ids stay
    # as plain instance attributes set in __init__.
    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor,
        tokenizer,
        image_token_id: int = 151667,   # match your config.json
        video_token_id: int = 151668,   # match your config.json
        merge_size: int = 2,
        **kwargs,
    ):
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Media token ids (used by vLLM profiling & grids)
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        # String placeholders (kept for template-time substitution)
        self.image_token = PLACEHOLDER
        self.video_token = PLACEHOLDER

        # Use the value baked into the image processor when available
        self.merge_size = getattr(image_processor, "merge_size", merge_size)

        # Pass through chat template if tokenizer carries one
        self.chat_template = getattr(tokenizer, "chat_template", None)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        # Ensure we can import the repo's custom code; pop the flag so it is
        # not forwarded again through **kwargs below.
        trust = kwargs.pop("trust_remote_code", True)

        # Prefer the repo's TikTokenV3; fall back to AutoTokenizer if needed
        try:
            from tokenization_opencua import TikTokenV3
            tok = TikTokenV3.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )
        except Exception:
            tok = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )

        # Load the Qwen2VLImageProcessor as declared by preprocessor_config.json
        imgproc = AutoImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust,
        )

        # Allow overrides of the media token ids via kwargs (rare); the
        # defaults mirror __init__ so both load paths stay consistent.
        image_token_id = kwargs.pop("image_token_id", 151667)
        video_token_id = kwargs.pop("video_token_id", 151668)

        return cls(
            imgproc,
            tok,
            image_token_id=image_token_id,
            video_token_id=video_token_id,
            **kwargs,
        )

    def apply_chat_template(
        self,
        messages: List[Dict[str, Any]],
        **kwargs
    ) -> Union[str, List[int]]:
        """
        Delegate to tokenizer's chat template. Supports both str and ids via kwargs.
        """
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    # Minimal callable to satisfy HF/VLLM if Processor is ever invoked.
    def __call__(self, *args, **kwargs) -> BatchFeature:
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)

    # Helper for your own client code: expand PLACEHOLDER count to match image grid.
    def prepare_vllm_inputs(
        self,
        messages: List[Dict[str, Any]],
        images: Union[Image.Image, Any, List[Union[Image.Image, Any]]],
        add_generation_prompt: bool = True,
    ):
        text = self.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
        )
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc.get("image_grid_thw", []))
        merge = getattr(self, "merge_size", 2)

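        # Worked example (hypothetical numbers): a grid of THW = (1, 78, 52)
        # with merge_size = 2 expands one placeholder into
        # 1 * 78 * 52 // 4 = 1014 copies.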
        # Each image's placeholder expands to prod(THW) // merge**2 copies.
        # Expand through a sentinel so copies already emitted for one image
        # are not matched again when expanding the next image.
        sentinel = "<|media_expanded|>"
        for thw in grid:
            num = int(thw[0] * thw[1] * thw[2]) // (merge ** 2)
            text = text.replace(PLACEHOLDER, sentinel * max(1, num), 1)
        text = text.replace(sentinel, PLACEHOLDER)

        return text, images
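

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the processor). It assumes the
# checkpoint directory contains this file plus tokenization_opencua.py, that
# the repo's chat template passes string content through verbatim, and that
# the model path below is hypothetical.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    processor = OpenCUAProcessor.from_pretrained(
        "path/to/opencua-checkpoint",   # hypothetical local checkpoint
        trust_remote_code=True,
    )

    messages = [
        {"role": "user", "content": f"{PLACEHOLDER}\nDescribe this screenshot."},
    ]
    screenshot = Image.new("RGB", (1092, 728))  # stand-in for a real screenshot

    prompt, images = processor.prepare_vllm_inputs(messages, screenshot)
    print(prompt.count(PLACEHOLDER), "placeholder tokens in the prompt")

    # The expanded prompt and the raw PIL image(s) can then be handed to vLLM,
    # e.g. llm.generate({"prompt": prompt, "multi_modal_data": {"image": images}}).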