
    /jsD                         d dl Z d dlmZ d dlZd dlmZ d dlmZ de j        fdZ	dde
fdZ G d	 d
          Z G d dej                  ZdS )    N)_get_clones)	xywh2xyxymaskc                     |                                  t          j        |                                  d          d         k                                    S )zGiven a padding mask (following pytorch convention, 1s for padded values), returns whether the padding is on the
    right or not.
    dimr   )longtorchsortall)r   s    r/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/geometry_encoders.pyis_right_paddedr      s?     IIKK5:diikkr:::1==BBDDD    Freturn_indexc                    | j         \  }}}|j         \  }}}||                     d          cxk    rM|                    d          cxk    r3|                    d          cxk    r|                    d          k    sn J ||                     d          cxk    r|                    d          k    sn J ||                    d          k    sJ ||                    d          k    sJ t          j        t	          |          d           t          j        t	          |          d           |                     d          }	|                     d          }
|	|
z   }||z   }t          j        ||j                  d                             |d          |dddf         k    }t          j	        |||f|j        |j
        	          }| |d|ddddf<   t          j        ||j                  dddf                             d|          }||	d         z   }|                    d|dddddf                             dd|          |          }|r|||fS ||fS )
ab  
    Concatenates two right-padded sequences, such that the resulting sequence
    is contiguous and also right-padded.

    Following pytorch's convention, tensors are sequence first, and the mask are
    batch first, with 1s for padded values.

    :param seq1: A tensor of shape (seq1_length, batch_size, hidden_size).
    :param mask1: A tensor of shape (batch_size, seq1_length).
    :param seq2: A tensor of shape (seq2_length, batch_size,  hidden_size).
    :param mask2: A tensor of shape (batch_size, seq2_length).
    :param return_index: If True, also returns the index of the ids of the element of seq2
        in the concatenated sequence. This can be used to retrieve the elements of seq2
    :return: A tuple (concatenated_sequence, concatenated_mask) if return_index is False,
        otherwise (concatenated_sequence, concatenated_mask, index).
       r      zMask is not right paddedr   r   )deviceNr   dtype)shapesizer   _assertr   sumaranger   repeatzerosr   scatterexpand)seq1mask1seq2mask2r   seq1_length
batch_sizehidden_sizeseq2_lengthactual_seq1_lengthsactual_seq2_lengthsfinal_lengths
max_lengthconcatenated_maskconcatenated_sequenceindexs                   r   concat_padded_sequencesr0      s   " ,0:(K[+/:(K[1WWWW1WWWWAWWWW%**UV--WWWWWW$))A,,6666$))A,,666666%**Q--''''%**Q--''''	M/%((*DEEE	M/%((*DEEE!6,,2,..!6,,2,..'*==M{*JZ444T:AA*aPPTabcbcbceibiTjj 
 "KZ(MVZVaimisttt04,;,111,-
 LT[999!!!T'BII!ZXXE'--E199!U111aaa:=N=U=UVXZ\^i=j=jlpqq ?$&7>> "333r   c                   "    e Zd ZdZddZddZdS )PromptaD  Utility class to manipulate geometric prompts.

    We expect the sequences in pytorch convention, that is sequence first, batch second The dimensions are expected as
    follows: box_embeddings shape: N_boxes x B x C_box box_mask shape: B x N_boxes. Can be None if nothing is masked out
    point_embeddings shape: N_points x B x C_point point_mask shape: B x N_points. Can be None if nothing is masked out
    mask_embeddings shape: N_masks x B x 1 x H_mask x W_mask mask_mask shape: B x N_masks. Can be None if nothing is
    masked out

    We also store positive/negative labels. These tensors are also stored batch-first If they are None, we'll assume
    positive labels everywhere box_labels: long tensor of shape N_boxes x B point_labels: long tensor of shape N_points
    x B mask_labels: long tensor of shape N_masks x B
    Nc                    |d| _         d| _        d| _        dS |j        d         }|j        d         }|j        }|"t          j        |||t
          j                  }|"t          j        |||t
          j	                  }t          |j        dd                   ||gk    sJ d| d| d|j                     |j        d	         d
k    sJ d|j        d	                      t          |j                  ||gk    sJ d| d| d|j                     t          |j                  ||gk    sJ d| d| d|j                     |j        |k    sJ d| d|j                     |j        |k    sJ d| d|j                     |j        |k    sJ d| d|j                     || _         || _        || _        dS )zInitialize the Prompt object.Nr   r   r   r   z.Wrong dimension for box embeddings. Expected [z, z	, *] got r      z3Expected box embeddings to have 4 coordinates, got z(Wrong dimension for box mask. Expected [z] got z*Wrong dimension for box labels. Expected [z(Expected box embeddings to be on device , got z"Expected box mask to be on device z$Expected box labels to be on device )box_embeddings
box_labelsbox_maskr   r   r   onesr
   r   boollist)selfr6   r8   r7   box_seq_lenbsr   s          r   __init__zPrompt.__init__Y   s[    !"&D"DO DMF %*1-!!$& KF%*UUUJ{2{6TTTH N(!,--+r1BBBBo[ooBooYgYmoo CBB #B'1,,,\.BVWYBZ\\ -,, HN##K'8888`r``[``PXP^`` 988 J$%%+r)::::dddddR\Rbdd ;::
 $...\v\\^EZ\\ /.. &(((*nv*n*n]e]l*n*n((( F***,tSY,t,takar,t,t***, $r   c                 2   | j         || _         |j        d         }|j        d         }|'t          j        |||j        t          j                  }|'t          j        |||j        t          j                  }|| _        || _	        dS | j         j        d         }|j        d         |k    sJ d| d|j        d                      |2t          j        |j        d         ||j        t          j                  }|2t          j        ||j        d         t          j        |j                  }t          |j        dd                   t          |j        dd                   k    sJ d	|j         d
|j                     t          | j                            d          | j	        |                    d          |          \  | _        }| j                            d          | _        t          | j         | j	        ||          \  | _         | _	        dS )a  Append box prompts to existing prompts.

        Args:
            boxes (torch.Tensor): Tensor of shape (N_new_boxes, B, 4) with normalized box coordinates.
            labels (torch.Tensor | None): Optional tensor of shape (N_new_boxes, B) with positive/negative labels.
            mask (torch.Tensor | None): Optional tensor of shape (B, N_new_boxes) for attention mask.
        Nr   r   r   zBatch size mismatch: expected r5   r   r   r   zShape mismatch between boxes z and labels r   )r6   r   r   r9   r   r
   r   r:   r7   r8   r;   r0   	unsqueezesqueeze)r<   boxeslabelsr   r>   r=   _s          r   append_boxeszPrompt.append_boxes   s    &"'DQB+a.K~KELPUPZ[[[|{2{5<uzZZZ$DO DMF  &q){1~###%`b%`%`PUP[\]P^%`%`###>ZA5<uzZZZF<;r5;q>ELYYYDEKO$$V\"1"-=(>(>>>>SEKSSV\SS ?>>
 5O%%b))4=&:J:J2:N:NPT
 
 /11"55-DTEXZ^Zginpt-u-u*T]]]r   )NNN)NN)__name__
__module____qualname____doc__r?   rG    r   r   r2   r2   K   sM         ,% ,% ,% ,%\)v )v )v )v )v )vr   r2   c                        e Zd ZdZ	 	 	 	 ddedededed	ed
edej        dedededef fdZd Z	de
j        fdZddefdZ xZS )SequenceGeometryEncodera]  Encoder for geometric box prompts. Assumes boxes are passed in the "normalized CxCyWH" format.

    Boxes can be encoded with any of the three possibilities:
    - direct projection: linear projection from coordinate space to d_model
    - pooling: RoI align features from the backbone
    - pos encoder: position encoding of the box center

    These three options are mutually compatible and will be summed if multiple are selected.

    As an alternative, boxes can be encoded as two corner points (top-left and bottom-right).

    The encoded sequence can be further processed with a transformer.
       TFencode_boxes_as_pointsboxes_direct_project
boxes_poolboxes_pos_encd_model
num_layerslayerroi_sizeadd_clsadd_post_encode_projuse_act_ckptc                    t                                                       || _        || _        || _        |	| _        | j        rdnd}t          j                            || j                  | _	        d| _
        |
r*t          j                            d| j                  | _
        |r.t          j        d| j                  | _        d| _        d| _        n|s|s|s
J d            d| _        d| _        d| _        d| _        d| _        d| _        |rt          j        d| j                  | _        |r*t          j        | j        | j        | j                  | _        |r't          j        | j        dz   | j                  | _        d| _        |rBt          j        | j        | j                  | _        t          j        | j                  | _        t          j                    | _        | j        | j        t          j        | j                  | _        d| _        |dk    r?|
s
J d            t3          ||          | _        t          j        | j                  | _        || _        dS )	z'Initialize the SequenceGeometryEncoder.   r   Nr   z,Error: need at least one way to encode boxesr4   r   zGIt's currently highly recommended to add a CLS when using a transformer)superr?   rT   pos_encrP   rW   r   nn	Embeddinglabel_embed	cls_embedLinearpoints_direct_projectpoints_pool_projectpoints_pos_enc_projectrQ   boxes_pool_projectboxes_pos_enc_projectConv2d
final_proj	LayerNormnormIdentityimg_pre_normencoder   encode_normrZ   )r<   rP   rQ   rR   rS   rT   r^   rU   rV   rW   rX   rY   rZ   
num_labels	__class__s                 r   r?   z SequenceGeometryEncoder.__init__   sH     	&<#  5<QQ1
 8--j$,GG  	A"X//4<@@DN " 	W)+1dl)C)CD&'+D$*.D'' (v=vJvvHvvvF)-D&'+D$*.D'(,D%&*D#)-D&# G,.Ia,F,F) _*,)DL$,PTP]*^*^' W-/Yt|a7G-V-V* 	3 idlCCDOT\22DIKMM#/43J3V "T\ : :D>>eeeee7%eZ88DK!|DL99D(r   c                     |                      |                    |j                            }|                     |                                          }||z   |fS )z?Encode points (used when boxes are converted to corner points).)rd   tor   ra   r
   )r<   pointspoints_maskpoints_labels	img_featspoints_embed
type_embeds          r   _encode_pointsz&SequenceGeometryEncoder._encode_points  sV     11&))IO2L2LMM %%m&8&8&:&:;;
L(+55r   rx   c                    d}|j         dd         \  }}| j        /|                     |                    |j                            }|}| j        \|j         dd         \  }	}
t          |                    |j                            }t          j        |
|	|
|	g|j                  }|                    |j        d          }|	                    ddd          }||z  }t          j                            ||                    d	d                              d	          | j                  }t!          |j                   ||z  | j        | j        | j        gk    sJ |                     |          }|	                    ||| j                                      d	d          }||}n||z   }| j        |                    d
          \  }}}}| j                            |                                |                                |                                |                                          }|	                    |j         d	         |j         d         |j         d
                   }|                     |                    |j                            }||}n||z   }|                     |                                          }||z   |fS )z/Encode boxes using configured encoding methods.Nr   )r   T)r   non_blockingr   r4   r   r   )r   rQ   rt   r   rg   r   r   tensorr   viewtorchvisionops	roi_align	transposeunbindrW   r;   rT   rh   r^   encode_boxesflattenra   r
   )r<   rD   
boxes_maskboxes_labelsrx   boxes_embedn_boxesr>   projHW
boxes_xyxyscalesampledcxcywhencrz   s                       r   _encode_boxesz%SequenceGeometryEncoder._encode_boxes  s   k"1"o$0,,UXXio-F-FGGDK".?233'DAq #588IO#<#<==JL!Q1Z5EFFFEHHJ$5DHIIEJJq!Q''E#e+J "o//	:;O;OPQST;U;U;\;\]^;_;_aeanooG&&W	+     **733D99R$,77AA!QGGD"")D0%1 <<++LBAq,++BJJLL"**,,		UVU^U^U`U`aaC((5;q>5;q>39R=IIC--cffY_.E.EFFD"")D0 %%l&7&7&9&9::
K'33r   N
geo_promptc                 |   |j         }|j        }|j        }|d         }||d         nt          j        |          }	| j        s| j        rt          |          t          |          k    sJ |d         }
|                     |
          }
|d         \  }}|
j	        d         ||z  k    sJ |
j	        dd         \  }}|

                    ddd          }
|
                    ||||          }
|
}| j        r||j	        d         dk    sJ t          |          }|                    dd          \  }}|dz   }|dz   }t          j        ||gd	          }t          j        ||gd	          }t          j        ||gd	          }|                     ||||
          \  }}n|                     ||||          \  }}|j	        d         }|j	        d         |k    sJ | j        r| j        j                            dd| j                                      d|d          }t          j        |d|j        |j                  }t3          ||||          \  }}| j        (|                     |                     |                    }| j        .| j        D ]} |||||	          }|                     |          }||fS )a  Encode geometric box prompts.

        Args:
            geo_prompt (Prompt): Prompt object containing box embeddings, masks, and labels.
            img_feats (list[torch.Tensor]): List of image features from backbone.
            img_sizes (list[tuple[int, int]]): List of (H, W) tuples for each feature level.
            img_pos_embeds (list[torch.Tensor] | None): Optional position embeddings for image features.

        Returns:
            Tuple of (encoded_embeddings, attention_mask)
        r   Nr   r}   r   r   r4   )
split_sizer	   r   )ru   rv   rw   rx   )rD   r   r   rx   rA   )tgtmemorytgt_key_padding_maskpos)r6   r8   r7   r   
zeros_likere   rg   lenrn   r   permuter   rP   r   splitcatr{   r   rb   weightrT   r   r   r   r   r0   rj   rl   ro   rp   )r<   r   rx   	img_sizesimg_pos_embedsrD   r   r   seq_first_img_featsseq_first_img_pos_embedscur_img_featr   r   NCr   top_leftbottom_right	labels_tl	labels_brru   rw   rv   final_embeds
final_maskr>   clscls_masklays                                r   forwardzSequenceGeometryEncoder.forwardF  s;    )(
!,'m"0"<N2%BRSfBgBg 	!
 # 
	%t'> 
	%y>>S^^3333$R=L,,\::LR=DAq%a(AE1111%bcc*DAq'//1a88L',,Q1a88L$I& 	$RA)=)=)=="5))J%/%5%5%5%K%K"Hl %q(I$q(I Y,7Q???F!Iy)&<!DDDM)Z$<!DDDK'+':':'+#	 (; ( ($L** (,'9'9%)#	 (: ( ($L* ""b(((( >%.',,Q4<@@GG2qQQC{2q
0@IZ[[[H'>|ZY\^f'g'g$L* ?&99T__\%B%BCCL ;"{  "s$.)30	       ++L99LZ''r   )rO   TTF)N)rH   rI   rJ   rK   r:   intr_   Moduler?   r{   r   Tensorr   r2   r   __classcell__)rr   s   @r   rN   rN      s2        0 %)"G) G) $G) #G) 	G)
 G) G) G) yG) G) G) #G) G) G) G) G) G) G)R6 6 604 04 04 04 04dY( Y(& Y( Y( Y( Y( Y( Y( Y( Y(r   rN   )F)r   torch.nnr_   r   ultralytics.nn.modules.utilsr   ultralytics.utils.opsr   r   r   r:   r0   r2   r   rN   rL   r   r   <module>r      s  
            4 4 4 4 4 4 + + + + + +E%, E E E E44 44D 44 44 44 44nev ev ev ev ev ev ev evPl( l( l( l( l(bi l( l( l( l( l(r   