
    ja                        d dl mZ d dlZd dlmZ d dlmZmZ  G d dej                  Z G d dej                  Z	dS )	    )annotationsN)nn)MLPLayerNorm2dc                  H     e Zd ZdZdej        ddfd fdZddZddZ xZ	S )MaskDecodera  Decoder module for generating masks and their associated quality scores using a transformer architecture.

    This class predicts masks given image and prompt embeddings, utilizing a transformer to process the inputs and
    generate mask predictions along with their quality scores.

    Attributes:
        transformer_dim (int): Channel dimension for the transformer module.
        transformer (nn.Module): Transformer module used for mask prediction.
        num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
        iou_token (nn.Embedding): Embedding for the IoU token.
        num_mask_tokens (int): Number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for the mask tokens.
        output_upscaling (nn.Sequential): Neural network sequence for upscaling the output.
        output_hypernetworks_mlps (nn.ModuleList): Hypernetwork MLPs for generating masks.
        iou_prediction_head (nn.Module): MLP for predicting mask quality.

    Methods:
        forward: Predict masks given image and prompt embeddings.
        predict_masks: Internal method for mask prediction.

    Examples:
        >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
        >>> masks, iou_pred = decoder(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, multimask_output=True
        ... )
        >>> print(f"Predicted masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
          transformer_diminttransformer	nn.Modulenum_multimask_outputs
activationtype[nn.Module]iou_head_depthiou_head_hidden_dimreturnNonec                   t                                                       | _        || _        || _        t          j        d          | _        |dz   | _        t          j        | j                  | _	        t          j
        t          j        dz  dd          t          dz             |            t          j        dz  dz  dd           |                      | _        t          j        fdt          | j                  D                       | _        t#          || j        |          | _        dS )a  Initialize the MaskDecoder module for generating masks and their associated quality scores.

        Args:
            transformer_dim (int): Channel dimension for the transformer module.
            transformer (nn.Module): Transformer module used for mask prediction.
            num_multimask_outputs (int): Number of masks to predict for disambiguating masks.
            activation (type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
                 kernel_sizestride   c                :    g | ]}t          d z  d          S r   r	   r   .0_r   s     l/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/modules/decoders.py
<listcomp>z(MaskDecoder.__init__.<locals>.<listcomp>M   -    qqqPQS/?a3GKKqqq    N)super__init__r   r   r   r   	Embedding	iou_tokennum_mask_tokensmask_tokens
SequentialConvTranspose2dr   output_upscaling
ModuleListrangeoutput_hypernetworks_mlpsr   iou_prediction_head)selfr   r   r   r   r   r   	__class__s    `     r$   r)   zMaskDecoder.__init__(   s?   & 	.&%:"a994q8<(<oNN "10DRS\]^^^1,--JLL!3_5IWXabcccJLL!
 !
 *,qqqqUZ[_[oUpUpqqq*
 *
& $'8KTMacq#r#r   r'   image_embeddingstorch.Tensorimage_pesparse_prompt_embeddingsdense_prompt_embeddingsmultimask_outputbool!tuple[torch.Tensor, torch.Tensor]c                    |                      ||||          \  }}|rt          dd          nt          dd          }|dd|ddddf         }|dd|f         }||fS )a  Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder.
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings.
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes.
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs.
            multimask_output (bool): Whether to return multiple masks or a single mask.

        Returns:
            masks (torch.Tensor): Batched predicted masks.
            iou_pred (torch.Tensor): Batched predictions of mask quality.

        Examples:
            >>> decoder = MaskDecoder(transformer_dim=256, transformer=transformer_module)
            >>> image_emb = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_emb = torch.rand(1, 2, 256)
            >>> dense_emb = torch.rand(1, 256, 64, 64)
            >>> masks, iou_pred = decoder(image_emb, image_pe, sparse_emb, dense_emb, multimask_output=True)
            >>> print(f"Masks shape: {masks.shape}, IoU predictions shape: {iou_pred.shape}")
        )r7   r9   r:   r;   r   Nr   )predict_masksslice)	r5   r7   r9   r:   r;   r<   masksiou_pred
mask_slices	            r$   forwardzMaskDecoder.forwardR   s    < ,,-%=$;	 - 
 
x (8HU1d^^^U1a[[
aaaQQQ)*AAAzM*hr'   c                    t          j         j        j         j        j        gd          }|                    d                              |j        d         dd          }t          j        ||fd          }t          j        ||j        d         d          }||z   }t          j        ||j        d         d          }|j        \  }	}
}} 	                    |||          \  }}|dddddf         }|dddd j
        z   ddf         |                    dd                              |	|
||          }                     |          } fdt           j
                  D             }t          j        |d          }|j        \  }	}
}}||                    |	|
||z            z                      |	d||          }                     |          }||fS )z`Predict masks and quality scores using image and prompt embeddings via transformer architecture.r   dimr   Nr   c           	     V    g | ]%} j         |         d d |d d f                   &S Nr3   r"   imask_tokens_outr5   s     r$   r%   z-MaskDecoder.predict_masks.<locals>.<listcomp>   L     -
 -
 -
LM-D*1-oaaaAAAg.FGG-
 -
 -
r'   )torchcatr+   weightr-   	unsqueezeexpandshaperepeat_interleaver   r,   	transposeviewr0   r2   stackr4   )r5   r7   r9   r:   r;   output_tokenstokenssrcpos_srcbchwhsiou_token_outupscaled_embeddinghyper_in_listhyper_inrB   rC   rO   s   `                   @r$   r@   zMaskDecoder.predict_masks~   s    	4>#8$:J:Q"RXYZZZ%//2299:R:XYZ:[]_acddM+CD!LLL %&6QQOOO++)(FLOKKKY
1a ""388C111a7QQQQ)=%= >AB mmAq!!&&q!Q22!22377-
 -
 -
 -
 -
QVW[WkQlQl-
 -
 -
 ;}!444'-
1a.33Aq!a%@@@FFq"aQRSS ++M::hr'   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r7   r8   r9   r8   r:   r8   r;   r8   r<   r=   r   r>   )
r7   r8   r9   r8   r:   r8   r;   r8   r   r>   )
__name__
__module____qualname____doc__r   GELUr)   rE   r@   __classcell__r6   s   @r$   r   r      s         @ &'&(g#&(s (s (s (s (s (s (sT* * * *X% % % % % % % %r'   r   c                  l     e Zd ZdZdej        ddddddddddfd' fdZ	 d(d)d#Z	 d(d*d$Zd% Z	d& Z
 xZS )+SAM2MaskDecodera~
  Transformer-based decoder for predicting instance segmentation masks from image and prompt embeddings.

    This class extends the functionality of the MaskDecoder, incorporating additional features such as high-resolution
    feature processing, dynamic multimask output, and object score prediction.

    Attributes:
        transformer_dim (int): Channel dimension of the transformer.
        transformer (nn.Module): Transformer used to predict masks.
        num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
        iou_token (nn.Embedding): Embedding for IOU token.
        num_mask_tokens (int): Total number of mask tokens.
        mask_tokens (nn.Embedding): Embedding for mask tokens.
        pred_obj_scores (bool): Whether to predict object scores.
        obj_score_token (nn.Embedding): Embedding for object score token.
        use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
        output_upscaling (nn.Sequential): Upscaling layers for output.
        use_high_res_features (bool): Whether to use high-resolution features.
        conv_s0 (nn.Conv2d): Convolutional layer for high-resolution features (s0).
        conv_s1 (nn.Conv2d): Convolutional layer for high-resolution features (s1).
        output_hypernetworks_mlps (nn.ModuleList): List of MLPs for output hypernetworks.
        iou_prediction_head (MLP): MLP for IOU prediction.
        pred_obj_score_head (nn.Linear | MLP): Linear layer or MLP for object score prediction.
        dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
        dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
        dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.

    Methods:
        forward: Predict masks given image and prompt embeddings.
        predict_masks: Predict instance segmentation masks from image and prompt embeddings.
        _get_stability_scores: Compute mask stability scores based on IoU between thresholds.
        _dynamic_multimask_via_stability: Dynamically select the most stable mask output.

    Examples:
        >>> image_embeddings = torch.rand(1, 256, 64, 64)
        >>> image_pe = torch.rand(1, 256, 64, 64)
        >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
        >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
        >>> decoder = SAM2MaskDecoder(256, transformer)
        >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
        ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
        ... )
    r	   r
   Fg?g\(\?r   r   r   r   r   r   r   r   r   use_high_res_featuresr=   pred_obj_scorespred_obj_scores_mlpuse_multimask_token_for_obj_ptrr   r   c                   t                                                       | _        || _        || _        t          j        d          | _        |dz   | _        t          j        | j                  | _	        || _
        | j
        rt          j        d          | _        || _        t          j        t          j        dz  dd          t          dz             |            t          j        dz  dz  dd           |                      | _        || _        |r@t          j        dz  dd          | _        t          j        dz  dd          | _        t          j        fdt-          | j                  D                       | _        t1          || j        ||          | _        | j
        r3t          j        d          | _        |rt1          dd          | _        |	| _        |
| _        || _        d	S )
a  Initialize the SAM2MaskDecoder module for predicting instance segmentation masks.

        This decoder extends the functionality of MaskDecoder, incorporating additional features such as high-resolution
        feature processing, dynamic multimask output, and object score prediction.

        Args:
            transformer_dim (int): Channel dimension of the transformer.
            transformer (nn.Module): Transformer used to predict masks.
            num_multimask_outputs (int): Number of masks to predict when disambiguating masks.
            activation (type[nn.Module]): Type of activation to use when upscaling masks.
            iou_head_depth (int): Depth of the MLP used to predict mask quality.
            iou_head_hidden_dim (int): Hidden dimension of the MLP used to predict mask quality.
            use_high_res_features (bool): Whether to use high-resolution features.
            iou_prediction_use_sigmoid (bool): Whether to use sigmoid for IOU prediction.
            dynamic_multimask_via_stability (bool): Whether to use dynamic multimask via stability.
            dynamic_multimask_stability_delta (float): Delta value for dynamic multimask stability.
            dynamic_multimask_stability_thresh (float): Threshold for dynamic multimask stability.
            pred_obj_scores (bool): Whether to predict object scores.
            pred_obj_scores_mlp (bool): Whether to use MLP for object score prediction.
            use_multimask_token_for_obj_ptr (bool): Whether to use multimask token for object pointer.
        r   r   r   r   r   c                :    g | ]}t          d z  d          S r   r    r!   s     r$   r%   z,SAM2MaskDecoder.__init__.<locals>.<listcomp>  r&   r'   )sigmoidr	   N)r(   r)   r   r   r   r   r*   r+   r,   r-   rr   obj_score_tokenrt   r.   r/   r   r0   rq   Conv2dconv_s0conv_s1r1   r2   r3   r   r4   Linearpred_obj_score_headdynamic_multimask_via_stability!dynamic_multimask_stability_delta"dynamic_multimask_stability_thresh)r5   r   r   r   r   r   r   rq   iou_prediction_use_sigmoidr~   r   r   rr   rs   rt   r6   s    `             r$   r)   zSAM2MaskDecoder.__init__   s/   L 	.&%:"a994q8<(<oNN. 	D#%<?#C#CD /N, "10DRS\]^^^1,--JLL!3_5IWXabcccJLL!
 !
 &;"  	e9_o6JXYbcdddDL9_o6JXYbcdddDL)+qqqqUZ[_[oUpUpqqq*
 *
& $' .$
 $
 $
   	W')y!'D'DD$" W+.QRTU+V+V( 0O,1R.2T///r'   Nr7   r8   r9   r:   r;   r<   repeat_imagehigh_res_featureslist[torch.Tensor] | None=tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]c                x   |                      ||||||          \  }}	}
}|r#|ddddddddf         }|	ddddf         }	nJ| j        r!| j        s|                     ||	          \  }}	n"|ddddddddf         }|	ddddf         }	|r| j        r|
ddddf         }n|
ddddf         }||	||fS )a  Predict masks given image and prompt embeddings.

        Args:
            image_embeddings (torch.Tensor): Embeddings from the image encoder with shape (B, C, H, W).
            image_pe (torch.Tensor): Positional encoding with the shape of image_embeddings (B, C, H, W).
            sparse_prompt_embeddings (torch.Tensor): Embeddings of the points and boxes with shape (B, N, C).
            dense_prompt_embeddings (torch.Tensor): Embeddings of the mask inputs with shape (B, C, H, W).
            multimask_output (bool): Whether to return multiple masks or a single mask.
            repeat_image (bool): Flag to repeat the image embeddings.
            high_res_features (list[torch.Tensor] | None, optional): Optional high-resolution features.

        Returns:
            masks (torch.Tensor): Batched predicted masks with shape (B, N, H, W).
            iou_pred (torch.Tensor): Batched predictions of mask quality with shape (B, N).
            sam_tokens_out (torch.Tensor): Batched SAM token for mask output with shape (B, N, C).
            object_score_logits (torch.Tensor): Batched object score logits with shape (B, 1).

        Examples:
            >>> image_embeddings = torch.rand(1, 256, 64, 64)
            >>> image_pe = torch.rand(1, 256, 64, 64)
            >>> sparse_prompt_embeddings = torch.rand(1, 2, 256)
            >>> dense_prompt_embeddings = torch.rand(1, 256, 64, 64)
            >>> decoder = SAM2MaskDecoder(256, transformer)
            >>> masks, iou_pred, sam_tokens_out, obj_score_logits = decoder.forward(
            ...     image_embeddings, image_pe, sparse_prompt_embeddings, dense_prompt_embeddings, True, False
            ... )
        )r7   r9   r:   r;   r   r   Nr   r   )r@   r~   training _dynamic_multimask_via_stabilityrt   )r5   r7   r9   r:   r;   r<   r   r   rB   rC   rO   object_score_logitssam_tokens_outs                r$   rE   zSAM2MaskDecoder.forward)  s=   J AE@R@R-%=$;%/ AS A
 A
=x*=  	(!!!QRRAAA+&E122HH1 	($- 	("CCE8TTOE88!!!QqS!!!QQQ,'E1Q3'H 	5 D 	5,QQQU3NN -QQQ!V4Nh0CCCr'   c                *    d} j         r:t          j         j        j         j        j         j        j        gd          }d}n,t          j         j        j         j        j        gd          }|                    d                              |j	        d         dd          }t          j        ||fd          }	|r#t          j
        ||	j	        d         d          }
n |j	        d         |	j	        d         k    sJ |}
|
|z   }
|j	        d         dk    s
J d            t          j
        ||	j	        d         d          }|
j	        \  }}}}                     |
||	          \  }}
|dd|ddf         }|dd|dz   |dz    j        z   ddf         |
                    dd                              ||||          }
 j        r|                     |
          }nI j        \  }}}}}|\  }} | | ||
          |z                       } | ||          |z             } fdt#           j                  D             }t          j        |d          }|j	        \  }}}}||                    ||||z            z                      |d||          }                     |          } j         r+|dk    sJ                      |dddddf                   }n$d	|                    |j	        d         d          z  }|||fS )
zYPredict instance segmentation masks from image and prompt embeddings using a transformer.r   rG   r   rI   z@image_pe should have size 1 in batch dim (from `get_dense_pe()`)Nr   c           	     V    g | ]%} j         |         d d |d d f                   &S rK   rL   rM   s     r$   r%   z1SAM2MaskDecoder.predict_masks.<locals>.<listcomp>  rP   r'   g      $@)rr   rQ   rR   rx   rS   r+   r-   rT   rU   rV   rW   r   r,   rX   rY   rq   r0   r2   rZ   r4   r}   new_ones) r5   r7   r9   r:   r;   r   r   sr[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   dc1ln1act1dc2act2feat_s0feat_s1rf   rg   rB   rC   r   rO   s    `                              @r$   r@   zSAM2MaskDecoder.predict_masksm  s     	_!I(/N)$+
   M AA!It~'<d>N>U&V\]^^^M%//2299:R:XYZ:[]_acddM+CD!LLL  	#)*:FLOQRSSSCC#)!,Q????"C++~a A%%%'i%%%)(FLOKKKY
1a ""388C111a7QQQAQ1E)E FIJ mmAq!!&&q!Q22) 	I->-F!%!6!6s!;!;(,(=%CdC0GW!%cc##c((W*<&=&=!>!>!%cc*<&=&=&G!H!H-
 -
 -
 -
 -
QVW[WkQlQl-
 -
 -
 ;}!444'-
1a.33Aq!a%@@@FFq"aQRSS ++M:: 	Q6666"&":":2aaaAAAg;"G"G #'):):8>!;La)P)P"Ph1DDDr'   c                ,   |                     d          }t          j        || j        k    d                                          }t          j        || j         k    d                                          }t          j        |dk    ||z  d          S )zNCompute mask stability scores based on IoU between upper and lower thresholds.rI   rG   r   g      ?)flattenrQ   sumr   floatwhere)r5   mask_logitsarea_iarea_us       r$   _get_stability_scoresz%SAM2MaskDecoder._get_stability_scores  s    !))"--;)OOUWXXX^^``;$*P)PPVXYYY__aa{6A:v<<<r'   c                v   |ddddddddf         }|ddddf         }t          j        |d          }t          j        |j        d         |j                  }|||f         }|                    d          }|||f         }|                    d          }|ddddddddf         }	|ddddf         }
|                     |	          }|| j        k    }t          j        |d         	                    |	          |	|          }t          j        |	                    |
          |
|          }||fS )a  Dynamically select the most stable mask output based on stability scores and IoU predictions.

        This method is used when outputting a single mask. If the stability score from the current single-mask output
        (based on output token 0) falls below a threshold, it instead selects from multi-mask outputs (based on output
        tokens 1-3) the mask with the highest predicted IoU score. This ensures a valid mask for both clicking and
        tracking scenarios.

        Args:
            all_mask_logits (torch.Tensor): Logits for all predicted masks, shape (B, N, H, W) where B is batch size, N
                is number of masks (typically 4), and H, W are mask dimensions.
            all_iou_scores (torch.Tensor): Predicted IoU scores for all masks, shape (B, N).

        Returns:
            mask_logits_out (torch.Tensor): Selected mask logits, shape (B, 1, H, W).
            iou_scores_out (torch.Tensor): Selected IoU scores, shape (B, 1).

        Examples:
            >>> decoder = SAM2MaskDecoder(...)
            >>> all_mask_logits = torch.rand(2, 4, 256, 256)  # 2 images, 4 masks each
            >>> all_iou_scores = torch.rand(2, 4)
            >>> mask_logits, iou_scores = decoder._dynamic_multimask_via_stability(all_mask_logits, all_iou_scores)
            >>> print(mask_logits.shape, iou_scores.shape)
            torch.Size([2, 1, 256, 256]) torch.Size([2, 1])
        Nr   rI   rG   r   )device).NN)
rQ   argmaxarangerV   r   rT   r   r   r   	expand_as)r5   all_mask_logitsall_iou_scoresmultimask_logitsmultimask_iou_scoresbest_scores_inds
batch_indsbest_multimask_logitsbest_multimask_iou_scoressinglemask_logitssinglemask_iou_scoresstability_scores	is_stablemask_logits_outiou_scores_outs                  r$   r   z0SAM2MaskDecoder._dynamic_multimask_via_stability  s   4 +111abb!!!QQQ;7-aaae4 <(<"EEE\"6"<Q"?H]^^^
 0=M1M N 5 ? ? B B$8EU9U$V!$=$G$G$J$J! ,AAAqsAAAqqqL9 .qqq!A#v 6556GHH$(OO	  +o&001BCC!
 

  566!%
 

 ..r'   )r   r   r   r   r   r   r   r   r   r   r   r   rq   r=   rr   r=   rs   r=   rt   r=   r   r   rK   )r7   r8   r9   r8   r:   r8   r;   r8   r<   r=   r   r=   r   r   r   r   )r7   r8   r9   r8   r:   r8   r;   r8   r   r=   r   r   r   r   )rh   ri   rj   rk   r   rl   r)   rE   r@   r   r   rm   rn   s   @r$   rp   rp      s        ) )^ &'&(g#&&+#((-*.+/ %$)05UU UU UU UU UU UU UU~ 8<BD BD BD BD BDV 8<EE EE EE EE EEN= = =4/ 4/ 4/ 4/ 4/ 4/ 4/r'   rp   )

__future__r   rQ   r   ultralytics.nn.modulesr   r   Moduler   rp    r'   r$   <module>r      s    # " " " " "        3 3 3 3 3 3 3 3X X X X X") X X XvI/ I/ I/ I/ I/bi I/ I/ I/ I/ I/r'   