DeepSpeedEngine(
  (module): Qwen2_5_VLForConditionalGeneration(
    (visual): Qwen2_5_VisionTransformerPretrainedModel(
      (patch_embed): Qwen2_5_VisionPatchEmbed(
        (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
      )
      (rotary_pos_emb): Qwen2_5_VisionRotaryEmbedding()
      (blocks): ModuleList(
        (0-15): 16 x Qwen2_5_VLVisionBlock(
          (norm1): Qwen2RMSNorm((0,), eps=1e-06)
          (norm2): Qwen2RMSNorm((0,), eps=1e-06)
          (attn): Qwen2_5_VLVisionSdpaAttention(
            (qkv): Linear(in_features=1280, out_features=3840, bias=True)
            (proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (mlp): Qwen2_5_VLMLP(
            (gate_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (up_proj): Linear(in_features=1280, out_features=3420, bias=True)
            (down_proj): Linear(in_features=3420, out_features=1280, bias=True)
            (act_fn): SiLU()
          )
        )
      )
      (merger): Qwen2_5_VLPatchMerger(
        (ln_q): Qwen2RMSNorm((0,), eps=1e-06)
        (mlp): Sequential(
          (0): Linear(in_features=5120, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=2048, bias=True)
        )
      )
    )
    (model): Qwen2_5_VLModel(
      (embed_tokens): Embedding(151936, 2048)
      (layers): ModuleList(
        (0-7): 8 x Qwen2_5_VLDecoderLayer(
          (self_attn): Qwen2_5_VLSdpaAttention(
            (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
            (k_proj): Linear(in_features=2048, out_features=256, bias=True)
            (v_proj): Linear(in_features=2048, out_features=256, bias=True)
            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
            (rotary_emb): Qwen2_5_VLRotaryEmbedding()
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=2048, out_features=11008, bias=False)
            (up_proj): Linear(in_features=2048, out_features=11008, bias=False)
            (down_proj): Linear(in_features=11008, out_features=2048, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((0,), eps=1e-06)
        )
      )
      (norm): Qwen2RMSNorm((0,), eps=1e-06)
      (rotary_emb): Qwen2_5_VLRotaryEmbedding()
    )
    (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
  )
)
