[DistributedDataParallel( (module): Float16Module( (module): VLMModel( (image_encoder): VisionModel( (encoder): Qwen2VLViT( (patch_embed): PatchEmbed( (proj): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False) ) (rotary_pos_emb): VisionRotaryEmbedding() (blocks): Qwen2VLVisionTransformerBlock( (layers): ModuleList( (0-15): 16 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): Qwen2vlVitSelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) ) ) (projector): MultimodalProjector( (layernorm): RMSNorm() (encoder): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (text_decoder): MMGPTModel( (embedding): LanguageModelEmbedding( (word_embeddings): VocabParallelEmbedding() (embedding_dropout): Dropout(p=0.0, inplace=False) ) (rotary_pos_emb): Qwen2VLRotaryEmbedding_llm() (decoder): TransformerBlock( (layers): ModuleList( (0-7): 8 x TransformerLayer( (input_layernorm): RMSNorm() (self_attention): Qwen2vlSelfAttention( (core_attention): DotProductAttention( (scale_mask_softmax): FusedScaleMaskSoftmax() (attention_dropout): Dropout(p=0.0, inplace=False) ) (linear_proj): RowParallelLinear() (linear_qkv): ColumnParallelLinear() (q_layernorm): IdentityOp() (k_layernorm): IdentityOp() ) (pre_cross_attn_layernorm): IdentityOp() (cross_attention): IdentityOp() (cross_attn_bda): IdentityFuncOp() (pre_mlp_layernorm): RMSNorm() (mlp): MLP( (linear_fc1): ColumnParallelLinear() (linear_fc2): RowParallelLinear() ) ) ) (final_layernorm): RMSNorm() ) (output_layer): ColumnParallelLinear() ) ) ) )]