import torch

from typing import Any, Dict, List, Union

from PIL import Image
from transformers import AutoImageProcessor, AutoTokenizer
from transformers.processing_utils import BatchFeature, ProcessorMixin

PLACEHOLDER = "<|media_placeholder|>"


class OpenCUAProcessor(ProcessorMixin):
    """
    Lightweight processor that pairs the repo's custom TikTokenV3 tokenizer
    with Qwen2VLImageProcessor and exposes media token ids for vLLM.

    We intentionally keep __call__ minimal because vLLM doesn't require
    a full HF Processor pipeline at init time; it just needs the class
    to load cleanly and provide chat templating & media bookkeeping.
    """

    # ProcessorMixin expects `attributes` to name sub-processor objects with
    # their own save/load logic; scalar settings (token ids, merge size,
    # placeholder strings) are plain instance attributes set in __init__.
    attributes = ["image_processor", "tokenizer"]

    def __init__(
        self,
        image_processor,
        tokenizer,
        image_token_id: int = 151667,
        video_token_id: int = 151668,
        merge_size: int = 2,
        **kwargs,
    ):
        self.image_processor = image_processor
        self.tokenizer = tokenizer

        # Token ids vLLM uses to map placeholder positions to media inputs.
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id

        # A single textual placeholder is shared by images and videos.
        self.image_token = PLACEHOLDER
        self.video_token = PLACEHOLDER

        # Prefer the image processor's own merge_size when it defines one.
        self.merge_size = getattr(image_processor, "merge_size", merge_size)

        # Surface the tokenizer's chat template for apply_chat_template.
        self.chat_template = getattr(tokenizer, "chat_template", None)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        trust = kwargs.get("trust_remote_code", True)

        # Prefer the repo's custom TikTokenV3 tokenizer; fall back to
        # AutoTokenizer when the module isn't importable.
        try:
            from tokenization_opencua import TikTokenV3

            tok = TikTokenV3.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )
        except Exception:
            tok = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path,
                trust_remote_code=trust,
            )

        imgproc = AutoImageProcessor.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust,
        )

        # Keep these defaults in sync with __init__ (151667/151668).
        image_token_id = kwargs.pop("image_token_id", 151667)
        video_token_id = kwargs.pop("video_token_id", 151668)

        return cls(
            imgproc,
            tok,
            image_token_id=image_token_id,
            video_token_id=video_token_id,
            **kwargs,
        )

    def apply_chat_template(
        self,
        messages: List[Dict[str, Any]],
        **kwargs,
    ) -> Union[str, List[int]]:
        """
        Delegate to the tokenizer's chat template. Returns a rendered string
        or token ids depending on the ``tokenize`` kwarg.
        """
        return self.tokenizer.apply_chat_template(messages, **kwargs)

    def __call__(self, *args, **kwargs) -> BatchFeature:
        # Minimal stub: vLLM does its own tokenization and multimodal
        # processing, so a well-formed BatchFeature is all that's needed.
        data = {"input_ids": torch.zeros(1, 1, dtype=torch.long)}
        return BatchFeature(data=data)

    def prepare_vllm_inputs(
        self,
        messages: List[Dict[str, Any]],
        images: Union[Image.Image, Any, List[Union[Image.Image, Any]]],
        add_generation_prompt: bool = True,
    ):
        """
        Render the chat template and expand each media placeholder to the
        number of visual tokens its image occupies after spatial merging.
        """
        text = self.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
        )
        proc = self.image_processor(images=images, return_tensors="pt")
        grid = torch.as_tensor(proc.get("image_grid_thw", []))
        merge = getattr(self, "merge_size", 2)

        # Each image yields t*h*w patches, collapsed by merge_size**2 during
        # spatial merging (e.g. a 1x28x28 grid with merge_size 2 becomes
        # 784 // 4 = 196 tokens). Expanding through a temporary marker keeps
        # str.replace(..., 1) from re-matching placeholders just inserted
        # for an earlier image.
        for thw in grid:
            num = int((thw[0] * thw[1] * thw[2]) // (merge**2))
            text = text.replace(PLACEHOLDER, "<|tmp|>" * max(1, num), 1)
        text = text.replace("<|tmp|>", PLACEHOLDER)

        return text, images
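

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): shows how prepare_vllm_inputs is meant to
# feed vLLM. Assumes a local OpenCUA checkpoint and a vLLM build with
# multimodal support; MODEL_PATH and "screenshot.png" are hypothetical
# placeholders, not files shipped with this repo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from vllm import LLM, SamplingParams

    MODEL_PATH = "path/to/opencua-checkpoint"  # hypothetical local path

    processor = OpenCUAProcessor.from_pretrained(MODEL_PATH)
    image = Image.open("screenshot.png")  # hypothetical input image

    messages = [
        {
            "role": "user",
            "content": f"{PLACEHOLDER}\nDescribe what is on the screen.",
        }
    ]
    prompt, media = processor.prepare_vllm_inputs(messages, image)

    # vLLM accepts a dict prompt carrying multi_modal_data alongside the text.
    llm = LLM(model=MODEL_PATH, trust_remote_code=True)
    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": media}},
        SamplingParams(max_tokens=256),
    )
    print(outputs[0].outputs[0].text)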