
    j                       d dl mZ d dlZd dlmc mZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZmZ dZ G d dej                  Z G d dej        j                  Z G d de          ZdS )    )annotationsN)nn)trunc_normal_)MLP)LOGGER   )SAM2TwoWayTransformerTwoWayTransformer)MaskDecoderSAM2MaskDecoder)ImageEncoderViTPromptEncoder)get_1d_sine_peselect_closest_cond_framesg      c                  @     e Zd ZU dZdZded<   	 	 dd fdZd Z xZS )SAMModela  Segment Anything Model (SAM) for object segmentation tasks.

    This class combines image encoders, prompt encoders, and mask decoders to predict object masks from images and input
    prompts.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
        prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
        mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
        pixel_mean (torch.Tensor): Mean values for normalizing pixels in the input image.
        pixel_std (torch.Tensor): Standard deviation values for normalizing pixels in the input image.

    Methods:
        set_imgsz: Set image size to make model compatible with different image sizes.

    Examples:
        >>> image_encoder = ImageEncoderViT(...)
        >>> prompt_encoder = PromptEncoder(...)
        >>> mask_decoder = MaskDecoder(...)
        >>> sam_model = SAMModel(image_encoder, prompt_encoder, mask_decoder)
        >>> # Further usage depends on SAMPredictor class

    Notes:
        All forward() operations are implemented in the SAMPredictor class.
            floatmask_thresholdg33333^@gR]@gRY@g(\2M@g(\L@g     L@image_encoderr   prompt_encoderr   mask_decoderr   
pixel_meanlist[float]	pixel_stdreturnNonec                j   t                                                       || _        || _        || _        |                     dt          j        |                              ddd          d           |                     dt          j        |                              ddd          d           dS )a  Initialize the SAMModel class to predict object masks from an image and input prompts.

        Args:
            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
            pixel_mean (list[float]): Mean values for normalizing pixels in the input image.
            pixel_std (list[float]): Standard deviation values for normalizing pixels in the input image.

        Notes:
            All forward() operations moved to SAMPredictor.
        r   r   Fr   N)	super__init__r   r   r   register_buffertorchTensorview)selfr   r   r   r   r   	__class__s         g/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/modules/sam.pyr#   zSAMModel.__init__7   s    ( 	*,(\5<
+C+C+H+HQPQ+R+RTYZZZ[%,y*A*A*F*Fr1a*P*PRWXXXXX    c                    t          | j        d          r| j                            |           || j        _        d |D             | j        _        |d         | j        _        dS )CSet image size to make model compatible with different image sizes.	set_imgszc                    g | ]}|d z  S )    .0xs     r*   
<listcomp>z&SAMModel.set_imgsz.<locals>.<listcomp>W   s    3K3K3KAG3K3K3Kr+   r   N)hasattrr   r.   r   input_image_sizeimage_embedding_sizeimg_sizer(   imgszs     r*   r.   zSAMModel.set_imgszR   sg    4%{33 	0((////4,3K3KU3K3K3K0&+Ah###r+   )r   r   )r   r   r   r   r   r   r   r   r   r   r   r   )	__name__
__module____qualname____doc__r   __annotations__r#   r.   __classcell__r)   s   @r*   r   r      s          6  N #<!8Y Y Y Y Y Y Y6/ / / / / / /r+   r   c                      e Zd ZU dZdZded<   	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*d+ fdZed             Zd Z	d Z
	 	 	 	 d,dZd-dZd.dZd/d Z	 d0d!Zd" Zd# Zd$ Z	 	 	 d1d%Zd& Zed'             Zd0d(Zd) Z xZS )2	SAM2Modela  SAM2Model class for Segment Anything Model 2 with memory-based video object segmentation capabilities.

    This class extends the functionality of SAM to handle video sequences, incorporating memory mechanisms for temporal
    consistency and efficient tracking of objects across frames.

    Attributes:
        mask_threshold (float): Threshold value for mask prediction.
        image_encoder (ImageEncoderViT): Visual encoder for extracting image features.
        memory_attention (nn.Module): Module for attending to memory features.
        memory_encoder (nn.Module): Encoder for generating memory representations.
        num_maskmem (int): Number of accessible memory frames.
        image_size (int): Size of input images.
        backbone_stride (int): Stride of the backbone network output.
        sam_prompt_embed_dim (int): Dimension of SAM prompt embeddings.
        sam_image_embedding_size (int): Size of SAM image embeddings.
        sam_prompt_encoder (PromptEncoder): Encoder for processing input prompts.
        sam_mask_decoder (SAM2MaskDecoder): Decoder for generating object masks.
        obj_ptr_proj (nn.Module): Projection layer for object pointers.
        obj_ptr_tpos_proj (nn.Module): Projection for temporal positional encoding in object pointers.
        hidden_dim (int): Hidden dimension of the model.
        mem_dim (int): Memory dimension for encoding features.
        use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
        use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
        max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder cross-attention.
        add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers.
        proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
            encoding in object pointers.
        use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in temporal positional encoding.
        only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
            evaluation.
        pred_obj_scores (bool): Whether to predict if there is an object in the frame.
        pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
        fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
        soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
        use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
        no_obj_embed_spatial (torch.Tensor | None): No-object embedding for spatial frames.
        max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
        directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the first
            frame.
        multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial conditioning
            frames.
        multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
        multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
        multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
        iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
        memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
        non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in memory
            encoder during evaluation.
        sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
        sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
        binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames with
            clicks during evaluation.
        use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM prompt
            encoder and mask decoder on frames with mask input.

    Methods:
        forward_image: Process image batch through encoder to extract multi-level features.
        track_step: Perform a single tracking step, updating object masks and memory features.
        set_binarize: Set binarize for VideoPredictor.
        set_imgsz: Set image size to make model compatible with different image sizes.

    Examples:
        >>> model = SAM2Model(image_encoder, memory_attention, memory_encoder)
        >>> image_batch = torch.rand(1, 3, 512, 512)
        >>> features = model.forward_image(image_batch)
        >>> track_results = model.track_step(0, True, features, None, None, None, {})
    r   r   r         r0         ?Fr!   r   TNuse_multimask_token_for_obj_ptrboolpred_obj_scorespred_obj_scores_mlpfixed_no_obj_ptrsoft_no_obj_ptruse_mlp_for_obj_ptr_projno_obj_embed_spatialcompile_image_encoderc$                   t                                                       || _        || _        |rdnd| _        || _        || _        |r(t          j        	                    dddd          | _
        || _        |r|sJ || _        || _        || _        || _        |j        | _        || _        | j        | _        t)          | j        d          r;t)          | j        j        d          r!| j        j        j        j        d         | _        || _        t          j                            t          j        |dd| j                            | _        t9          | j        d	           t          j                            t          j        dd| j                            | _        t          j                            t          j        dd| j                            | _        t9          | j        d	           t9          | j        d	           || _        || _         || _!        |	| _"        || _#        || _$        |
| _%        || _&        || _'        || _(        || _)        || _*        || _+        || _,        || _-        |"| _.        || _/        || _0        || _1        || _2        | j1        r| j/        sJ | j        sJ | j/        rY| j        rRt          j                            t          j        d| j                            | _3        t9          | j3        d	           | | _4        d
| _5        |!rRt          j                            t          j        d| j                            | _5        t9          | j5        d	           | 6                                 || _7        d| _8        |#rBts          j:        d           t          j;        | j        j<        ddd          | j        _<        d
S d
S )a  Initialize the SAM2Model for video object segmentation with memory-based tracking.

        Args:
            image_encoder (nn.Module): Visual encoder for extracting image features.
            memory_attention (nn.Module): Module for attending to memory features.
            memory_encoder (nn.Module): Encoder for generating memory representations.
            num_maskmem (int): Number of accessible memory frames.
            image_size (int): Size of input images.
            backbone_stride (int): Stride of the image backbone output.
            sigmoid_scale_for_mem_enc (float): Scale factor for mask sigmoid probability.
            sigmoid_bias_for_mem_enc (float): Bias factor for mask sigmoid probability.
            binarize_mask_from_pts_for_mem_enc (bool): Whether to binarize sigmoid mask logits on interacted frames with
                clicks during evaluation.
            use_mask_input_as_output_without_sam (bool): Whether to directly output the input mask without using SAM
                prompt encoder and mask decoder on frames with mask input.
            max_cond_frames_in_attn (int): Maximum number of conditioning frames to participate in memory attention.
            directly_add_no_mem_embed (bool): Whether to directly add no-memory embedding to image feature on the first
                frame.
            use_high_res_features_in_sam (bool): Whether to use high-resolution feature maps in the SAM mask decoder.
            multimask_output_in_sam (bool): Whether to output multiple masks for the first click on initial conditioning
                frames.
            multimask_min_pt_num (int): Minimum number of clicks to use multimask output in SAM.
            multimask_max_pt_num (int): Maximum number of clicks to use multimask output in SAM.
            multimask_output_for_tracking (bool): Whether to use multimask output for tracking.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask tokens for object pointers.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid to restrict IoU prediction to [0-1].
            memory_temporal_stride_for_eval (int): Memory bank's temporal stride during evaluation.
            non_overlap_masks_for_mem_enc (bool): Whether to apply non-overlapping constraints on object masks in memory
                encoder during evaluation.
            use_obj_ptrs_in_encoder (bool): Whether to cross-attend to object pointers from other frames in the encoder.
            max_obj_ptrs_in_encoder (int): Maximum number of object pointers from other frames in encoder
                cross-attention.
            add_tpos_enc_to_obj_ptrs (bool): Whether to add temporal positional encoding to object pointers in the
                encoder.
            proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                encoding in object pointers.
            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance in the temporal positional encoding
                in the object pointers.
            only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past during
                evaluation.
            pred_obj_scores (bool): Whether to predict if there is an object in the frame.
            pred_obj_scores_mlp (bool): Whether to use an MLP to predict object scores.
            fixed_no_obj_ptr (bool): Whether to have a fixed no-object pointer when there is no object present.
            soft_no_obj_ptr (bool): Whether to mix in no-object pointer softly for easier recovery and error mitigation.
            use_mlp_for_obj_ptr_proj (bool): Whether to use MLP for object pointer projection.
            no_obj_embed_spatial (bool): Whether to add no-object embedding to spatial frames.
            sam_mask_decoder_extra_args (dict | None): Extra arguments for constructing the SAM mask decoder.
            compile_image_encoder (bool): Whether to compile the image encoder for faster inference.
           r      )kernel_sizestrideout_projweightr   g{Gz?)stdNTzFImage encoder compilation is enabled. First forward pass will be slow.zmax-autotuneF)mode	fullgraphdynamic)=r"   r#   r   use_high_res_features_in_samnum_feature_levelsuse_obj_ptrs_in_encodermax_obj_ptrs_in_encoderr%   r   Conv2dmask_downsampleadd_tpos_enc_to_obj_ptrsproj_tpos_enc_in_obj_ptrsuse_signed_tpos_enc_to_obj_ptrs"only_obj_ptrs_in_the_past_for_evalmemory_attentiond_model
hidden_dimmemory_encodermem_dimr6   rV   rW   shapenum_maskmem	Parameterzerosmaskmem_tpos_encr   no_mem_embedno_mem_pos_encdirectly_add_no_mem_embedsigmoid_scale_for_mem_encsigmoid_bias_for_mem_enc"binarize_mask_from_pts_for_mem_encnon_overlap_masks_for_mem_encmemory_temporal_stride_for_eval$use_mask_input_as_output_without_sammultimask_output_in_sammultimask_min_pt_nummultimask_max_pt_nummultimask_output_for_trackingrH   iou_prediction_use_sigmoid
image_sizebackbone_stridesam_mask_decoder_extra_argsrJ   rK   rL   rM   
no_obj_ptrrN   rO   _build_sam_headsmax_cond_frames_in_attn!add_all_frames_to_correct_as_condr   infocompileforward%r(   r   rf   ri   rl   r~   r   rs   rt   ru   rx   r   rr   r\   ry   rz   r{   r|   rH   r}   rw   rv   r^   r_   rb   rc   rd   re   rJ   rK   rL   rM   rN   rO   r   rP   r)   s%                                       r*   r#   zSAM2Model.__init__   s   n 	 +,H)'C"J!!'>$'>$" 	R $)8??1aQq?#Q#QD (@%$ 	,++++)B&/N,2T/ !1*2 -4&
33 	H@S@\^f8g8g 	H.7>DQGDL& % 2 25;{AqRVR^3_3_ ` `d+6666!H..u{1a/Q/QRR#h00Q4?1S1STTd'T2222d)t4444)B& *C&(@%2T/-J*/N, 5Y1'>$$8!$8!-J*/N,*D' %.+F(.#6  0.  	0''''//// 	5D$@ 	5#h00Q1P1PQQDO$/t4444(@%$(! 	?(-(:(:5;q$,;W;W(X(XD%$3>>>>'>$15. ! 	K`aaa)."*#	* * *D&&&	 	r+   c                N    t          |                                           j        S )z=Return the device on which the model's parameters are stored.)next
parametersdevicer(   s    r*   r   zSAM2Model.deviceY  s     DOO%%&&--r+   c                     t          d          )zWProcess image and prompt inputs to generate object masks and scores in video sequences.zPlease use the corresponding methods in SAM2VideoPredictor for inference.See notebooks/video_predictor_example.ipynb for an example.)NotImplementedError)r(   argskwargss      r*   r   zSAM2Model.forward^  s    !J
 
 	
r+   c                "   | j         | _        | j        | j        z  | _        t          | j        | j        | j        f| j        | j        fd          | _        t          ddt          d| j        dd          | j        dd| j	        | j
        | j        | j        | j        d	
| j        pi | _        | j        r]t"          j                            | j         | j                   | _        | j        r&t-          | j         | j         | j         d          | _        n#t"          j                                        | _        | j        r1t"          j                            | j         | j                  | _        d
S t"          j                                        | _        d
S )zMBuild SAM-style prompt encoder and mask decoder for image segmentation tasks.r0   )	embed_dimr8   r7   mask_in_chansrR            depthembedding_dimmlp_dim	num_heads   
num_multimask_outputstransformertransformer_dimiou_head_depthiou_head_hidden_dimuse_high_res_featuresr}   rJ   rK   rH   Nr1   )rh   sam_prompt_embed_dimr~   r   sam_image_embedding_sizer   sam_prompt_encoderr   r	   r\   r}   rJ   rK   rH   r   sam_mask_decoderr^   r%   r   Linearobj_ptr_projrN   r   Identityrc   rj   obj_ptr_tpos_projr   s    r*   r   zSAM2Model._build_sam_headse  s   $(O!(,4;O(O% #0/--" #ot?#
 #
 #
 !0 !
"#-"7	   !5 #"&"C'+'F 0 $ 8,0,P!
 !
  /52!!
 !
$ ' 	4 % Q QD, ^$'$/[\$]$]! % 1 1 3 3D) 	9 &+X__T_dl%S%SD"""%*X%6%6%8%8D"""r+   c           	     v   |j         d         }|j        }|                    d          | j        k    sJ |                    d          | j        k    sJ |                    d          | j        k    sJ |5|d         }|d         }	|j         d         |k    r|	j         d         |k    sJ nAt          j        |dd||j                  }t          j        |dt
          j	        |	           }	|t          |j                   d
k    r|j         dd         |dfk    sJ |j         dd         | j        j        k    r<t          j        |                    |j                  | j        j        ddd          }
n|}
nd}
|                     ||	fd|
          \  }}|                     || j                                        |||d|          \  }}}}| j        r,|dk    }t          j        |ddddf         |t(                    }t          j        || j        | j        fdd          }|dddf         }|rt          j        |d          }t          j        ||          }|||f                             d          }|||f                             d          }|                    d          dk    r
|||f         }n||}}|                     |          }| j        rR| j        r|                                }n|                    |j                  }| j        r||z  }|d|z
  | j        z  z   }|||||||fS )a[
  Forward pass through SAM prompt encoders and mask heads.

        This method processes image features and optional point/mask inputs to generate object masks and scores.

        Args:
            backbone_features (torch.Tensor): Image features with shape (B, C, H, W).
            point_inputs (dict[str, torch.Tensor] | None): Dictionary containing point prompts with keys 'point_coords'
                (Tensor of shape (B, P, 2) with float32 dtype, containing absolute pixel-unit coordinates in (x, y)
                format for P input points) and 'point_labels' (Tensor of shape (B, P) with int32 dtype, where 1 means
                positive clicks, 0 means negative clicks, and -1 means padding).
            mask_inputs (torch.Tensor | None): Mask of shape (B, 1, H*16, W*16), float or bool, with the same spatial
                size as the image.
            high_res_features (list[torch.Tensor] | None): List of two feature maps with shapes (B, C, 4*H, 4*W) and (B,
                C, 2*H, 2*W) respectively, used as high-resolution feature maps for SAM decoder.
            multimask_output (bool): If True, output 3 candidate masks and their IoU estimates; if False, output only 1
                mask and its IoU estimate.

        Returns:
            low_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
            high_res_multimasks (torch.Tensor): Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
            ious (torch.Tensor): Tensor of shape (B, M) with estimated IoU for each output mask.
            low_res_masks (torch.Tensor): Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
            high_res_masks (torch.Tensor): Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
            obj_ptr (torch.Tensor): Tensor of shape (B, C) with object pointer vector for the output mask.
            object_score_logits (torch.Tensor): Tensor of shape (B, 1) with object score logits.

        Examples:
            >>> backbone_features = torch.rand(1, 256, 32, 32)
            >>> point_inputs = {"point_coords": torch.rand(1, 2, 2), "point_labels": torch.tensor([[1, 0]])}
            >>> mask_inputs = torch.rand(1, 1, 512, 512)
            >>> results = model._forward_sam_heads(backbone_features, point_inputs, mask_inputs)
            >>> (
            ...     low_res_multimasks,
            ...     high_res_multimasks,
            ...     ious,
            ...     low_res_masks,
            ...     high_res_masks,
            ...     obj_ptr,
            ...     object_score_logits,
            ... ) = results
        r   r   r   rR   Npoint_coordspoint_labelsr   dtype)r   r   rS   FbilinearTsizealign_cornersrY   	antialias)pointsboxesmasks)image_embeddingsimage_pesparse_prompt_embeddingsdense_prompt_embeddingsmultimask_outputrepeat_imagehigh_res_features)r   rY   r   r!   dimr   )rk   r   r   r   r   r%   rn   r   onesint32lenr   mask_input_sizeFinterpolatetor   get_dense_perJ   whereNO_OBJ_SCOREr~   argmaxarange	unsqueezer   rM   sigmoidrL   r   )r(   backbone_featurespoint_inputsmask_inputsr   r   Br   sam_point_coordssam_point_labelssam_mask_promptsparse_embeddingsdense_embeddingslow_res_multimasksioussam_output_tokensobject_score_logitsis_obj_appearinghigh_res_multimaskssam_output_tokenbest_iou_inds
batch_indslow_res_maskshigh_res_masksobj_ptrlambda_is_obj_appearings                             r*   _forward_sam_headszSAM2Model._forward_sam_heads  s   b #A&") %%a((D,EEEEE %%a((D,IIIII %%a((D,IIIII #+N;+N;#)!,116F6LQ6OST6T6T6TT6T  %{1a6IZI`aaa %
1au{6 R R RR " {())Q..;3DRaR3HQPQF3R3R3RR %)@)PPP"#-NN#4#:;;0@"'#"# # # #. #O.2.E.E$&67! /F /
 /
++
 LPK`K`.,99;;%6$4-/ La L
 L
HD"35H  	p2Q6 "'-=aaatm-LN`bn!o!o  m/4?3	
 
 
 -QQQT2 		T!L2666Ma777J.z=/HISSTUVVM0]1JKUUVWXXN %%a((1,,#4Z5N#O ,>@S>M ##$455 		P# M*=*E*E*G*G''*:*=*=gm*L*L'$ <1G;%<!< OOG
 	
r+   c                \   d\  }}|                                 }||z  |z   }t          j        ||                    d          dz  |                    d          dz  fddd          }|                    |j        d	         d
                                           }	| j        r||-t          j        |j        d	         | j	        |j
                  }
nK|                     ||                     |                    |j                            |          \  }}}}}}
}t          j        |                    d
                                           dk    d
          }|d         }|                                 }||z  |z   }| j        r| j        r||
z  }
|
d
|z
  | j        z  z   }
|||	|||
|fS )zFProcess mask inputs directly as output, bypassing SAM encoder/decoder.)g      4@      $r   rS   r!   Fr   Tr   r   r   Nr   )r   r   r   r   r   ).N)r   r   r   r   new_onesrk   r^   r%   rn   rh   r   r   ra   r   r   anyflattenrJ   rL   r   )r(   r   r   r   	out_scaleout_biasmask_inputs_floatr   r   r   r   _r   r   r   s                  r*   _use_mask_as_outputzSAM2Model._use_mask_as_output(  s    *	8'--//*Y6A %%b))Q.0C0CB0G0G10LM
 
 
 ##K$5a$8!<<BBDD+ 		/@/HL]Lek+"3A"6P[PbcccGG )-(?(?"3 001B1E1EFWF]1^1^__"3 )@ ) )%Aq!Q7A !9[%8%8%;%;%A%A%C%Cc%IqQQQ+I6"2"8"8":":'*AAHL 	P$ <1G;%<!< OOG 
 	
r+   	img_batchtorch.Tensorc                    |                      |          }| j        r^| j                            |d         d                   |d         d<   | j                            |d         d                   |d         d<   |S zRProcess image batch through encoder to extract multi-level features for SAM model.backbone_fpnr   r   )r   r\   r   conv_s0conv_s1r(   r   backbone_outs      r*   forward_imagezSAM2Model.forward_imageW  s    )))44, 	m /3.C.K.KLYgLhijLk.l.lL(+.2.C.K.KLYgLhijLk.l.lL(+r+   c                   dk    r-i |fd|d         D             fd|d         D             d}t          |d                   t          |d                   k    sJ t          |d                   | j        k    sJ |d         | j         d         }|d         | j         d         }d |D             }d	 |D             }d
 |D             }||||fS )zZPrepare and flatten visual features from the image backbone output for further processing.r   c                @    g | ]}|                     d d d           S r!   expand)r3   featbatchs     r*   r5   z8SAM2Model._prepare_backbone_features.<locals>.<listcomp>f  s+     i i iDUBB!?!? i i ir+   r   c                @    g | ]}|                     d d d           S r   r  )r3   posr  s     r*   r5   z8SAM2Model._prepare_backbone_features.<locals>.<listcomp>g  s+    "k"k"kS3::eRR#@#@"k"k"kr+   vision_pos_enc)r   r  Nc                B    g | ]}|j         d          |j         d         fS )r   r!   )rk   r2   s     r*   r5   z8SAM2Model._prepare_backbone_features.<locals>.<listcomp>o  s)    LLLQqwr{AGBK0LLLr+   c                b    g | ],}|                     d                               d dd          -S r   r   r   r   permuter2   s     r*   r5   z8SAM2Model._prepare_backbone_features.<locals>.<listcomp>q  s4    LLL!		!,,Q155LLLr+   c                b    g | ],}|                     d                               d dd          -S r
  r  r2   s     r*   r5   z8SAM2Model._prepare_backbone_features.<locals>.<listcomp>r  s4    VVVqQYYq\\11!Q::VVVr+   )r   r]   )r(   r   r  feature_mapsvision_pos_embeds
feat_sizesvision_featss     `    r*   _prepare_backbone_featuresz$SAM2Model._prepare_backbone_featuresa  s7   199 i i i iLYgLh i i i"k"k"k"kLYiLj"k"k"k  L
 </00CEU8V4W4WWWWW</00D4KKKKK#N3T5L4L4N4NO()9:D<S;S;U;UVLL:KLLL
LL|LLLVVDUVVV\+<jHHr+   c	                4   + |d                              d          }	 j        }
|d         \  }}|d         j        } j        dk    r3|d                             ddd                              |	|
||          S d}rdnd+|sg g }}t          |d                   dk    sJ |d         }t          | j                  \  }}d |	                                D             } j
        rdn j        }t          d j                  D ]} j        |z
  }|dk    rr|z   n|z
  }n1sdz
  |z  |z  }||dz
  |z  z
  }ndz    |z   |z  }||dz
  |z  z   }|d                             |d          }||                    |d          }|                    ||f           |D ]\  }}||d	                             ||j        d
k              }|                    |                    d                              ddd                     |d         d                             |          }|                    d                              ddd          }| j         j        |z
  dz
           z   }|                    |            j        rt)          | j                  } j
        s) j        r"fd|                                D             }n|} +fd|                                D             }t          d|          D ]m} r| z   n| z
  }!|!dk     s||!|k    r nP|d                             |!|                    |!d                    }||                    | |d         f           n|rt1          | \  }"}#t3          j        |#d          }$ j        r|dz
  }% j        r|
n j        }&t3          j        |"||d         j                  }'tA          |'|%z  |&          }' !                    |'          }'|'"                    d          #                    d|	 j                  }'n)|$$                    t          |"          |	 j                  }' j        |
k     rp|$%                    d|	|
 j        z   j                  }$|$                    dddd                              dd          }$|'&                    |
 j        z  d          }'|                    |$           |                    |'           |$j'        d         }nd}n j(        r?|d          j)        z   }(|(                    ddd                              |	|
||          }(|(S  j)        #                    d|	 j                  g} j*        #                    d|	 j                  g}t3          j+        |d          })t3          j+        |d          }* ,                    |||)|*|          }(|(                    ddd                              |	|
||          }(|(S )zePrepare memory-conditioned features by fusing current frame's visual features with previous memories.r!   r   r   r   cond_frame_outputsc                    g | ]}d |fS )r   r1   )r3   outs     r*   r5   zBSAM2Model._prepare_memory_conditioned_features.<locals>.<listcomp>  s    RRRC3xRRRr+   non_cond_frame_outputsNmaskmem_featurescuda)r   non_blockingmaskmem_pos_encr   c                :    i | ]\  }}r|k    n|k    ||S r1   r1   )r3   tr  	frame_idxtrack_in_reverses      r*   
<dictcomp>zBSAM2Model._prepare_memory_conditioned_features.<locals>.<dictcomp>  sD     ( ( ("As.>( NNNANN 3DRNNr+   c                h    g | ].\  }}j         r|z
  z  nt          |z
            |d          f/S )r   )rd   abs)r3   r  r  r  r(   tpos_sign_muls      r*   r5   zBSAM2Model._prepare_memory_conditioned_features.<locals>.<listcomp>  s`           3  $C4Y]m;;!$Y]!3!3I     r+   r   r   r   rR   )currcurr_posmemory
memory_posnum_obj_ptr_tokens)-r   rh   r   rl   r  r'   r   r   r   valuestrainingrw   rangegetappendr   typer   ro   r^   minr_   re   itemszipr%   stackrb   rc   rj   tensorr   r   r   r   r  	new_zerosreshaperepeat_interleaverk   rr   rp   rq   catrf   ),r(   r  is_init_cond_framecurrent_vision_featscurrent_vision_pos_embedsr  output_dict
num_framesr  r   CHWr   r(  to_cat_memoryto_cat_memory_pos_embedcond_outputsselected_cond_outputsunselected_cond_outputst_pos_and_prevsrt_post_relprev_frame_idxr  prevfeatsmaskmem_encr_   ptr_cond_outputspos_and_ptrst_diffr  pos_list	ptrs_listobj_ptrs
t_diff_maxtpos_dimobj_pospix_feat_with_memr&  memory_pos_embedr#  s,   ``      `                                  @r*   $_prepare_memory_conditioned_featuresz.SAM2Model._prepare_memory_conditioned_featuresu  sA    !$))!,,O"~1%b)0 q  '+33Aq!<<AA!Q1MMM.5A! |	W572M {#7899A====&';<L=W<)E> >:!#: SR3H3O3O3Q3QRRRO
 ]L(LAq$"233 5 5(50A:::J%aY%6%6PY\aPaNN) F (11}&:a%?N%3uqyAo%ENN *3Q'71'<%=%AN%3uqyAo%EN!":;??PTUU; 255ndKKC&&s|4444. < <t< /0336PVP[_ePe3ff$$U]]1%5%5%=%=aA%F%FGGG"#45b9<<F<KK)11!44<<Q1EE)D,A$BRUZBZ]^B^,__'..{;;;; + 9+*-j$:V*W*W' } =)P =( ( ( ( (&;&A&A&C&C( ( ($$ (=$            #3"8"8":":      $A'>?? F FF.>V	F**IPVDVA1uu!7AOO%&>?CCAG^GbGbcdfjGkGkllC$++VS^,DEEE +*-|*<'Hi${9!<<<H 4 U%<q%@
(,(F#X11DL"',xNbceNfNl"m"m"m"0:1E8"T"T"T"&"8"8"A"A")"3"3A"6"6"="=b!T\"R"R"*"4"4S]]At|"T"T|a''#+#3#3B1;Ldl#[#[#+#3#3Aq!Q#?#?#G#G1#M#M")";";A<MST";"U"U!((222+227;;;)1):&&)*& - )$8$<t?P$P!$5$=$=aA$F$F$K$KAqRSUV$W$W!(( ".55aDLIIJM'+':'A'A!Q'U'U&V# =a000 9%<!DDD 11%.'1 2 
 
 .55aA>>CCAq!QOO  r+   c                   |d                              d          }| j        }|d         \  }}	|d                             ddd                              ||||	          }
| j        r| j        s|                     |          }| j        o|}|r&| j        s|dk                        |
j	                  }nt          j        |          }| j        dk    r
|| j        z  }| j        dk    r
|| j        z   }|                     |
|d          }|d	         }| j        A|dk                                    }|d|d         z
   | j        d         j        |j         z  z  }||d         fS )zXEncode frame features and masks into a new memory representation for video segmentation.r!   r   r   r   rG   r   T)skip_mask_sigmoidvision_featuresN.NNr  )r   rh   r  r'   rv   r*  "_apply_non_overlapping_constraintsru   r   r   r%   r   rs   rt   ri   rO   r   r  rk   )r(   r9  r  pred_masks_high_resr   is_mask_from_ptsr   r=  r>  r?  pix_featbinarizemask_for_memmaskmem_outr  r   s                   r*   _encode_new_memoryzSAM2Model._encode_new_memory  s    !$))!,,O"~1'+33Aq!<<AA!Q1MM- 	_dm 	_ #'"I"IJ]"^"^:O?O 	>DM 	>/!377GGLL !=)<==L)S00'$*HHL(C//'$*GGL))(LTX)YY&'89 $0 3a 7>>@@%5o%F!F K$JcK&,K. !. .  -=!>>>r+   c           
        t          |          dk    r+d t          |dd         |dd                   D             }nd}|V| j        rO|d                             ddd          } |j        d| j        g|d         R  }|                     |||          }no|                     |||dd         |dd         |dd         ||	|
          }|||J |}|                     ||          }| 	                    |||||          }|||fS )	hPerform a single tracking step, updating object masks and memory features based on current frame inputs.r   c                    g | ]O\  }} |                     d dd          j        |                    d           |                    d          g|R  PS )r   r   r   )r  r'   r   )r3   r4   ss      r*   r5   z)SAM2Model._track_step.<locals>.<listcomp>U  sg     ! ! !Aq (		!Q""'q		166!99AqAAA! ! !r+   Nr!   r   r   )r  r8  r9  r:  r  r;  r<  r  )r   r   r   r   r   )
r   r1  rx   r  r'   rh   r   rX  _use_multimaskr   )r(   r  r8  r9  r:  r  r   r   r;  r<  r  prev_sam_mask_logitsr   r`  sam_outputsr   s                   r*   _track_stepzSAM2Model._track_stepD  s     #$$q((! ! 4SbS 9:crc?KK! ! !
 !%"t'P" ,B/771a@@H$x}RJ:b>JJJH22;J[\\KK @@##5%9"##%>*CBCC*H%bcc?'%!1 A 	 	H $/#/K4G4GG2#223E|TT11"*)'"3!1 2  K -x77r+   c                    |r6| j         dk    r+|                     |||||du          \  }}	||d<   |	|d<   dS d|d<   d|d<   dS )z^Run memory encoder on predicted mask to encode it into a new memory feature for future frames.r   N)r9  r  r^  r   r_  r  r  )rl   rd  )
r(   r9  r  r   run_mem_encoderr   r   current_outr  r  s
             r*   _encode_memory_in_outputz"SAM2Model._encode_memory_in_output~  s      	2t/!33040G0G%9%$2$7".d": 1H 1 1-o /?K*+-<K)***.2K*+-1K)***r+   c                    |                      |||||||||	|
|          \  }}}|\  }}}}}}}|||d}| j        s||d<   |                     |||||||           |S )rf  )
pred_masksr^  r   r   )rl  r*  rp  )r(   r  r8  r9  r:  r  r   r   r;  r<  r  rn  rj  rk  r   r   r   r   r   ro  s                       r*   
track_stepzSAM2Model.track_step  s    , !,, % 
 
Q P[L1a9L (#1
 

 } 	E 2EK-. 	%% 	
 	
 	
 r+   c                    |dn|d                              d          }| j        o"|p| j        o| j        |cxk    o
| j        k    nc S )zaDetermine whether to use multiple mask outputs in the SAM head based on configuration and inputs.Nr   r   r   )r   ry   r|   rz   r{   )r(   r8  r   num_ptss       r*   ri  zSAM2Model._use_multimask  sk    #+!!n1M1R1RST1U1U( T#It'IT*gRRRR9RRRRR	
r+   c                   | j         d         }|dk    r| S | j        }t          j        | dd          }t          j        ||          dddddf         }||k    }t          j        || t          j        | d                    } | S )	z\Apply non-overlapping constraints to masks, keeping the highest scoring object per location.r   r   T)r   keepdimr   Nr   max)rk   r   r%   r   r   r   clamp)rr  
batch_sizer   max_obj_indsbatch_obj_indskeeps         r*   r]  z,SAM2Model._apply_non_overlapping_constraints  s      %a(
??"|JAtDDDj@@@D$PTATU~- [z5;zu3U3U3UVV
r+   c                    || _         dS )z Set binarize for VideoPredictor.N)ru   )r(   ra  s     r*   set_binarizezSAM2Model.set_binarize  s    2:///r+   c                     t           j        d          r j                            |           |d          _        | j        _         fd|D              j        _         fd|D              j        _         j         j        z   _	        dS )r-   r.   r   c                $    g | ]}|j         z  S r1   r   r3   r4   r(   s     r*   r5   z'SAM2Model.set_imgsz.<locals>.<listcomp>  s.     8
 8
 8
*+A%%8
 8
 8
r+   c                *    g | ]}|j         z  d z  S )rS   r  r  s     r*   r5   z'SAM2Model.set_imgsz.<locals>.<listcomp>  s3     3
 3
 3
./A%%)3
 3
 3
r+   N)
r6   r   r.   r~   r   r7   r8   r   r   r   r:   s   ` r*   r.   zSAM2Model.set_imgsz  s    4%{33 	0((///(3808
 8
 8
 8
/48
 8
 8
43
 3
 3
 3
383
 3
 3
/ )-4;O(O%%%r+   ) rE   rF   r0   rG   r   FFr!   FFFr   r   FFFr   FFr0   TFFFFFFFFFNFrH   rI   rJ   rI   rK   rI   rL   rI   rM   rI   rN   rI   rO   rI   rP   rI   )NNNF)NNr   r   )r   )F)FTN)r<   r=   r>   r?   r   r@   r#   propertyr   r   r   r   r   r   r  rX  rd  rl  rp  rs  ri  staticmethodr]  r  r.   rA   rB   s   @r*   rD   rD   [   s5        C CJ  N "%!$+0-2 ""'%* %&+05#(()&+ % "!%"'(-+0 %$)!& %).%*$(&+It t t t t t tl . . X.
 
 
-9 -9 -9d R
 R
 R
 R
h-
 -
 -
 -
^   I I I I: b! b! b! b!H)? )? )?V88 88 88t2 2 2H  !': : : :x
 
 
   \"; ; ; ;P P P P P P Pr+   rD   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd  fdZd!dZd" fdZed#d            Zd Z	 xZ
S )$	SAM3ModelfSAM3Model class for Segment Anything Model 3 with memory-based video object segmentation capabilities.rE        r   r   Fr!   r0   TNrH   rI   rJ   rK   rL   rM   rN   rO   rP   c$                p    t                      j        g |||||||||	|
|||||||||||||||||||||| |!|"|#R   t          d	dt          d| j        dd          | j        dd| j        | j        | j        | j        | j	        d
| j
        pi | _        dS )
r  rR   r   r   r   r   r   r   Nr1   )r"   r#   r   r
   r   r\   r}   rJ   rK   rH   r   r   r   s%                                       r*   r#   zSAM3Model.__init__  s   N 	 $	
$	
$	
 $	
 	$	

 $	
 $	
 &$	
 %$	
 /$	
 1$	
 $$	
 &$	
 )$	
 $$	
 !$	
  !!$	
" *#$	
$ ,%$	
& ''$	
( ,)$	
* *+$	
, $-$	
. $/$	
0 %1$	
2 &3$	
4 ,5$	
6 /7$	
8 9$	
:  ;$	
< =$	
> ?$	
@ %A$	
B !C$	
D (E$	
F "G$	
 $	
 $	
 $	
J !0 !
"#)"7	   !5 #"&"C'+'F 0 $ 8,0,P!
 !
  /52!!
 !
r+   r   r   c                   | j                             |          }| j        r^| j                            |d         d                   |d         d<   | j                            |d         d                   |d         d<   |S r   )r   forward_image_sam2r\   r   r   r   r   s      r*   r   zSAM3Model.forward_imagec  s    )<<YGG, 	m /3.C.K.KLYgLhijLk.l.lL(+.2.C.K.KLYgLhijLk.l.lL(+r+   r;   tuple[int, int]c                    t                                          |           d |D             | j        j        _        dS )z6Set the image size for the model and mask downsampler.c                    g | ]
}|d z  dz  S )r  r0   r1   )r3   r   s     r*   r5   z'SAM3Model.set_imgsz.<locals>.<listcomp>p  s!    =`=`=`RVdbj2o=`=`=`r+   N)r"   r.   ri   mask_downsamplerinterpol_size)r(   r;   r)   s     r*   r.   zSAM3Model.set_imgszm  s>    %   =`=`Z_=`=`=`,:::r+   333333?c                :   | dk                         d          }|dk                         d          }t          j        |d          }||z  }||k    }|d                             |           }t          j        || t          j        | d                    }|S )	zXSuppress masks that shrink in area after applying pixelwise non-overlapping constraints.r   )r!   r   r   rG   )r/  r\  r   rx  )sumr%   rz  	expand_asr   )	rr  new_pred_masksshrink_thresholdarea_before
area_after
area_ratior~  	keep_maskpred_masks_afters	            r*   _suppress_shrinked_masksz"SAM3Model._suppress_shrinked_masksr  s     "A~**x*88$q(--(-;;
k+3777+-
--)33J??	 ;y*ek*Z_>`>`>`aar+   c                \    |                      |          }|                     ||          }|S )zThis function suppresses masks that shrink in area after applying pixelwise non-overlapping constraints. Note
        that the final output can still be overlapping.
        )r]  r  )r(   rr  !pixel_level_non_overlapping_maskss      r*   "_suppress_object_pw_area_shrinkagez,SAM3Model._suppress_object_pw_area_shrinkage~  s7    
 -1,S,ST^,_,_) 22:?`aa
r+   ) rE   r  r  r   r   FFr!   FFFr   r   FFFr   FFr0   TFFFFFFFFFNFr  r  )r;   r  )r  )r<   r=   r>   r?   r#   r   r.   r  r  r  rA   rB   s   @r*   r  r    s       pp "#!"+0-2 ""'%* %&+05#(()&+ % "!%"'(-+0 %$)!& %).%*$(&+I]
 ]
 ]
 ]
 ]
 ]
 ]
~   a a a a a a
 	  	  	  \	 	 	 	 	 	 	 	r+   r  )
__future__r   r%   torch.nn.functionalr   
functionalr   torch.nn.initr   ultralytics.nn.modulesr   ultralytics.utilsr   blocksr	   r
   decodersr   r   encodersr   r   utilsr   r   r   Moduler   rD   r  r1   r+   r*   <module>r     s   # " " " " "                 ' ' ' ' ' ' & & & & & & $ $ $ $ $ $ < < < < < < < < 2 2 2 2 2 2 2 2 4 4 4 4 4 4 4 4 = = = = = = = = ?/ ?/ ?/ ?/ ?/ry ?/ ?/ ?/DcP cP cP cP cP cP cP cPLF F F F F	 F F F F Fr+   