{
  "architectures": [
    "Qwen2VLVAEForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "vision_start_token_id": 151652,
  "vision_end_token_id": 151653,
  "vision_token_id": 151654,
  "image_token_id": 151655,
  "video_token_id": 151656,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2_vl_vae",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.41.2",
  "use_cache": true,
  "use_sliding_window": false,
  "vision_config": {
    "in_channels": 12,
    "patch_size": 2,
    "hidden_size": 3584,
    "vae_path": "genmo/mochi-1-preview",
    "vae_subfolder": "vae",
    "vae_config": {
      "_class_name": "AutoencoderKLMochi",
      "_diffusers_version": "0.32.0.dev0",
      "act_fn": "silu",
      "add_attention_block": [
        false,
        true,
        true,
        true,
        true
      ],
      "decoder_block_out_channels": [
        128,
        256,
        512,
        768
      ],
      "encoder_block_out_channels": [
        64,
        128,
        256,
        384
      ],
      "in_channels": 15,
      "latent_channels": 12,
      "latents_mean": [
        -0.06730895953510081,
        -0.038011381506090416,
        -0.07477820912866141,
        -0.05565264470995561,
        0.012767231469026969,
        -0.04703542746246419,
        0.043896967884726704,
        -0.09346305707025976,
        -0.09918314763016893,
        -0.008729793427399178,
        -0.011931556316503654,
        -0.0321993391887285
      ],
      "latents_std": [
        0.9263795028493863,
        0.9248894543193766,
        0.9393059390890617,
        0.959253732819592,
        0.8244560132752793,
        0.917259975397747,
        0.9294154431013696,
        1.3720942357788521,
        0.881393668867029,
        0.9168315692124348,
        0.9185249279345552,
        0.9274757570805041
      ],
      "layers_per_block": [
        3,
        3,
        4,
        6,
        3
      ],
      "out_channels": 3,
      "scaling_factor": 1.0,
      "spatial_expansions": [
        2,
        2,
        2
      ],
      "temporal_expansions": [
        1,
        2,
        3
      ]
    }
  },
  "rope_scaling": {
    "type": "mrope",
    "mrope_section": [
      16,
      24,
      24
    ]
  },
  "vocab_size": 152064
}