
    jL                    x    d Z ddlmZ ddlmZ ddlZddlmZ ddlmZm	Z	 ddl
mZ  G d d	ej                  ZdS )
zGProvides utility to combine a vision backbone with a language backbone.    )annotations)copyN)
SDPBackendsdpa_kernel   )Sam3DualViTDetNeckc                  d     e Zd ZdZ	 	 	 	 dd fd
Z	 	 dddZddZddZddZddgfd dZ	 xZ
S )!SAM3VLBackbonezThis backbone combines a vision backbone and a language backbone without fusion. As such it is more of a
    convenience wrapper to handle the two backbones together.

    It adds support for activation checkpointing and compilation.
    Fr   visualr   compile_visualboolact_ckpt_whole_vision_backbone act_ckpt_whole_language_backbonec                    t                                                       |rt          j        |          n|| _        || _        || _        || _        || _        dS )zInitialize the backbone combiner.

        :param visual: The vision backbone to use
        :param text: The text encoder to use
        N)	super__init__torchcompilevision_backbonelanguage_backbonescalpr   r   )selfr   textr   r   r   r   	__class__s          l/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/vl_combiner.pyr   zSAM3VLBackbone.__init__   s]     	LZ3f5=3H3H3H`f!%
.L+0P---    Nsamplestorch.Tensorcaptions	list[str]input_boxesadditional_textlist[str] | Nonec                    |                      |          }|                    |                     |||                     |S )a  Forward pass of the backbone combiner.

        :param samples: The input images
        :param captions: The input captions
        :param input_boxes: If the text contains place-holders for boxes, this
            parameter contains the tensor containing their spatial features
        :param additional_text: This can be used to encode some additional text
            (different from the captions) in the same forward of the backbone
        :return: Output dictionary with the following keys:
            - vision_features: The output of the vision backbone
            - language_features: The output of the language backbone
            - language_mask: The attention mask of the language backbone
            - vision_pos_enc: The positional encoding of the vision backbone
            - (optional) additional_text_features: The output of the language
                backbone for the additional text
            - (optional) additional_text_mask: The attention mask of the
                language backbone for the additional text
        )forward_imageupdateforward_text)r   r   r   r!   r"   outputs         r   forwardzSAM3VLBackbone.forward/   s@    2 ##G,,d''+OOPPPr   c                $   | j                             |          \  }}}}| j        dk    rD|d| j                  |d| j                  }}|"| |d| j                  |d| j                  }}d}|||d         }|||d}|d         }||||dS )zHForward pass of the vision backbone and get both SAM3 and SAM2 features.r   Nvision_featuresvision_pos_encbackbone_fpn)r-   r.   r/   sam2_backbone_out)r   r)   r   )	r   r   sam3_featuressam3_possam2_featuressam2_possam2_outputsam2_srcsam3_srcs	            r   r%   zSAM3VLBackbone.forward_imageL   s     <@;O;W;WX_;`;`8x:>> mm,DJ;' $M (X-A!-TZK-0]
{]+  (
 $)=$R(H#+"* - K !$'&)!,	
 
 	
r   c                0   | j                             |          }|d         }| j         j        
J d            | j                             || j         j                  \  }}| j        dk    r |d| j                  |d| j                  }}|d         ||dS )z>Forward pass of the vision backbone to get SAM2 features only.r+   NzSAM2 neck is not available.r   r,   )r   trunk
sam2_convssam_forward_feature_levelsr   )r   r   xsxr3   r4   s         r   forward_image_sam2z!SAM3VLBackbone.forward_image_sam2n   s    !''00rF#.::<Y:::"&"6"Q"QRSUYUiUt"u"ux:>> mm,DJ;' $M  -R0&)
 
 	
r   c                *   i }t          |          }|||z  }t          t          j        t          j        t          j        g          5  |                     ||          \  }}}ddd           n# 1 swxY w Y   |:|ddt          |           df         |d<   |t          |           d         |d<   |dddt          |          f         }|dt          |                   }|dddt          |          f         }||d<   ||d<   ||d<   |S )z!Forward pass of the text encoder.Nadditional_text_featuresadditional_text_masklanguage_featureslanguage_masklanguage_embeds)r   r   r   MATHEFFICIENT_ATTENTIONFLASH_ATTENTIONr   len)	r   r   r!   r"   r(   text_to_encodetext_attention_masktext_memorytext_embedss	            r   r'   zSAM3VLBackbone.forward_text   s    h& o-N*/:+I:Kefgg 	p 	p<@<R<RSacn<o<o9k	p 	p 	p 	p 	p 	p 	p 	p 	p 	p 	p 	p 	p 	p 	p &1<QQQ_AUAU@U@W@W=W1XF-.-@#oBVBVAVAXAX-YF)*!!!!_s8}}_"451/CMM/B!!!!_s8}}_"45&1"#"5$/ !s   	A00A47A4i  imgsz	list[int]c                :    | j                             |           dS )z+Set the image size for the vision backbone.N)r   	set_imgsz)r   rM   s     r   rP   zSAM3VLBackbone.set_imgsz   s    &&u-----r   )FFFr   )r   r   r   r   r   r   r   r   )NN)r   r   r   r    r!   r   r"   r#   )r   r   )rM   rN   )__name__
__module____qualname____doc__r   r)   r%   r>   r'   rP   __classcell__)r   s   @r   r
   r
      s           %/416Q Q Q Q Q Q Q4 %),0    : 
  
  
  
D
 
 
 
*   6 -1$< . . . . . . . . .r   r
   )rT   
__future__r   r   r   torch.nnnntorch.nn.attentionr   r   necksr   Moduler
    r   r   <module>r]      s   
 N M " " " " " "              6 6 6 6 6 6 6 6 % % % % % %N. N. N. N. N.RY N. N. N. N. N.r   