
    jUY                        d Z ddlmZ ddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZ d	d
lmZ  G d dej                  Z G d dej                  ZdS )zR
Transformer decoder.
Inspired from Pytorch's version, adds the pre-norm variant.
    )annotationsN)nn)RoIAlign)MLP)_get_clonesinverse_sigmoid)	xywh2xyxy   )gen_sineembed_for_positionc                  j     e Zd ZdZ	 dd fdZed             Zd Z	 	 	 	 	 	 	 	 	 	 	 d d!dZd Z	 xZ
S )"TransformerDecoderLayerz[TransformerDecoderLayer is made up of self-attn, cross-attn, and feedforward network (FFN).Fd_modelintdim_feedforwarddropoutfloatcross_attention	nn.Modulen_headsuse_text_cross_attentionboolc                ,   t                                                       || _        |dk    rt          j        |          nt          j                    | _        t          j        |          | _        || _	        |rgt          j
        |||          | _        |dk    rt          j        |          nt          j                    | _        t          j        |          | _        t          j
        |||          | _        |dk    rt          j        |          nt          j                    | _        t          j        |          | _        t          j        ||          | _        t          j                    | _        |dk    rt          j        |          nt          j                    | _        t          j        ||          | _        |dk    rt          j        |          nt          j                    | _        t          j        |          | _        dS )z'Initialize the TransformerDecoderLayer.r   )r   N)super__init__
cross_attnr   DropoutIdentitydropout1	LayerNormnorm1r   MultiheadAttentionca_textcatext_dropoutcatext_norm	self_attndropout2norm2Linearlinear1ReLU
activationdropout3linear2dropout4norm3)selfr   r   r   r   r   r   	__class__s          h/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/sam3/decoder.pyr   z TransformerDecoderLayer.__init__   s    	)/6{{
7+++\'**
 )A%# 	50'7SSSDL9@1"*W"5"5"5"+--D!|G44D .wQQQ/6{{
7+++\'**
 y/::'))/6{{
7+++y'::/6{{
7+++\'**


    c                    || n| |z   S )z'Add positional embedding to the tensor. )tensorposs     r2   with_pos_embedz&TransformerDecoderLayer.with_pos_embed>   s     vv&3,6r3   c           	         |                      |                     |                     |                     |                                        }||                     |          z   }|                     |          }|S )z!Feedforward network forward pass.)r-   r,   r+   r)   r.   r/   )r0   tgttgt2s      r2   forward_ffnz#TransformerDecoderLayer.forward_ffnC   sa    ||DMM$//$,,s:K:K*L*LMMNNDMM$'''jjoo
r3   NTr:   torch.Tensortgt_query_posmemory_texttext_attention_maskmemorymemory_key_padding_mask
memory_posself_attn_maskcross_attn_maskc           
        |                      |||
|||          \  }}| j        r|                     |                     ||          |                    |j                  |                    |j                  |          d         }||                     |          z   }|                     |          }|;t          j	        |	ddddddf                   }t          j
        ||	gd          }	|                     |                     ||          |                     ||          ||	||                    dd          ndd          d         }||                     |          z   }|                     |          }|                     |                    |j                            }d}||dd         }|dd         }||fS )z,Forward pass of the TransformerDecoderLayer.)key_padding_maskr   Nr
   dimF)querykeyvalue	attn_maskrG   need_weights)_apply_self_attentionr   r"   r8   todtyper#   r$   torch
zeros_likecatr   	transposer   r    r<   )r0   r:   r>   r?   r@   rA   rB   rC   rD   rE   dacdac_use_selfatt_lnpresence_tokenkwargsr;   presence_token_maskpresence_token_outs                    r2   forwardzTransformerDecoderLayer.forwardJ   s   . "77%7
 
] ( 	(<<##C77sy))sy))!4	    
 D ++D111C""3''C%"'"2?111bqb!!!83L"M"M#i)<o(NTUVVVO %%c=99##FJ77%I`Il5??1EEErv  
 
  DMM$'''jjoo svvfl3344!%!$RaRabb'C&&&r3   c                \   | j         |S |rM|j        d         dz  dk    s
J d            |j        d         dz  }|d|         }|d|         }	||d         }
n|}|}	|t          j        ||gd          }t          j        t          j        |          |	gd                              |j                  }	t          j        t          j        |          |gd          }|                     ||	          x}}|                      ||||          d                             |j                  }||                     |          z   }|rG|s| 	                    |          }t          j        ||
fd          }|r| 	                    |          }n|}| 	                    |          }||fS )z1Apply self-attention with optional DAC splitting.Nr      z#DAC requires even number of queriesrH   )rM   )
r%   shaperR   rT   rS   rP   rQ   r8   r&   r'   )r0   r:   r>   rV   rW   rX   rD   num_o2o_queriestgt_o2otgt_query_pos_o2otgt_o2mqkr;   s                 r2   rO   z-TransformerDecoderLayer._apply_self_attention   s   >!J 		.9Q<!#q(((*O(((!ila/O*?*+G -.>.> ?/**+GGG - %i 9qAAAG %	5+;N+K+KM^*_ef g g g j j! ! "Iu'7'G'G&W]^___M ##G->???A~~aG~~FFqILLSYWWDMM$///  	"% .**W--)Wg.A666C! &jjooC**S//CM!!r3   )F)r   r   r   r   r   r   r   r   r   r   r   r   )NNNNNNNNFTN)r:   r=   r>   r=   r?   r=   r@   r=   rA   r=   rB   r=   rC   r=   rD   r=   rE   r=   )__name__
__module____qualname____doc__r   staticmethodr8   r<   r\   rO   __classcell__r1   s   @r2   r   r      s        ee */"+ "+ "+ "+ "+ "+ "+H 7 7 \7   '+$(,0#04#''+(,#>' >' >' >' >'@(" (" (" (" (" (" ("r3   r   c                       e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2d3 fd Zed!             Zd" Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d4d5d1Z xZ	S )6TransformerDecoderz2Transformer Decoder consisting of multiple layers.Fr   noner
   TN      $@r   r   frozenr   
num_layersnum_queriesreturn_intermediate
box_refinenum_o2m_queriesrV   boxRPBstrinstance_querynum_instancesrW   use_act_checkpointrX   clamp_presence_logitsclamp_presence_logit_max_valr   use_normed_output_consistentlyseparate_box_head_instanceseparate_norm_instancec                   t                                                       || _        t          ||          | _        |t          ||          ndg|z  | _        || _        || _        |
| _        |
r
|| _	        |}n|	| _	        ||	z   }t          j        |          | _        || _        t          ||dd          | _        t          j        ||          | _        d| _        d| _        || _        || _        || _        |rt          j        |          nd| _        d| _        |rt          ||dd          | _        |rt          j        ||          | _        || _        |rt          j                            | j        j        d         j        j        d           t          j                            | j        j        d         j        j        d           t          j        |d          | _        |rt          j        |d          | _         |dv sJ || _!        |dk    r	 | j        d         j"        j#        }n'# tH          $ r | j        d         j%        j#        }Y nw xY w|dk    rdnd	}t          |||d	          | _&        t          |||d	          | _'        d| _(        d| _)        i | _*        |tW          d
ddd          nd| _,        |r,| -                                D ]}|.                    d           d| _/        || _0        || _1        |rJt          j        d|          | _/        t          ||dd          | _2        t          j        |          | _3        t          d	| j        z  | j        | j        d	          | _4        || _5        || _6        t          j        7                    | j        j        j                   | j        .t          j        7                    | j        j        j                   | j,        J | j        s
J d            | j        s
J d            || _8        d| _9        tu          | j                  D ]\  }}||_;        dS )z"Initialize the TransformerDecoder.N      r   )ro   loglinearbothro   r   r^      r
   T)output_sizespatial_scalesampling_ratioalignedFz support return_intermediate onlyzsupport box refine only)<r   r   r   r   layersfine_layersrr   rs   rV   rv   r   r   normrt   r   
bbox_embed	Embeddingquery_embedinstance_query_embedinstance_query_reference_pointsuse_instance_queryrz   r~   instance_norminstance_bbox_embedru   init	constant_weightdatabiasreference_pointsinstance_reference_pointsrw   cross_attn_image	num_headsAttributeErrorr   boxRPB_embed_xboxRPB_embed_ycompilable_cord_cachecompilable_stored_sizecoord_cacher   
roi_pooler
parametersrequires_grad_rX   r|   r}   presence_token_headpresence_token_out_normref_point_headrW   r{   normal_compile_modecompiled	enumerate	layer_idx)r0   r   rq   interaction_layerlayerrr   rs   rt   ru   rv   rV   rw   ry   rz   rW   r{   r   rX   r|   r}   r~   r   r   tot_num_queriesnheadsn_inputpr   r1   s                               r2   r   zTransformerDecoder.__init__   su   : 	!%44:K:WK):666^b]cfp]p 	 %& 	<#.D )OO#2D )O;OL))	#6 gw155<AA$(!/3,"0*.L+6LVR\'222RV#' % 	C'*7GQ'B'BD$ 	M(*]G(L(LD%$ 	PGdo4R8?DaHHHGdo4R8=BAFFF$&La$@$@D! P13mQ1O1O.:::::V=Q8B! = = =Q2<= "V++aaG"%gw"B"BD"%gw"B"BD)-D&*.D'!D !, !BPTUUUU 	
  	(__&& ( (  ''''"%:",H) 	A"$,q'":":D'*7GQ'B'BD$+-<+@+@D(!!dl"2DL$,PQRR"4"4
(/4555$0GOOD5<ABBB&&&'KK)KKK'99 999( !*$+ 6 6 	( 	(Iu'EOO	( 	(s   H4 4!IIc                v    t          j        d| ||          | z  }t          j        d|||          |z  }||fS )z0Get normalized coordinates for height and width.r   )rQ   device)rR   arange)HWr   rQ   coords_hcoords_ws         r2   _get_coordszTransformerDecoder._get_coords3  sI     <1E&AAAAE<1E&AAAAE!!r3   c                "   |\  }}t          |                              dd          }|j        \  }}}| j        0|                     |||j        |j                  | _        ||f| _        t          j	        
                                s| j        ||fk    r| j        \  }	}
nY|| j        vr$|                     |||j                  | j        |<   | j        |         \  }	}
|	j        |fk    sJ |
j        |fk    sJ |	                    ddd          |                    ddd          dddddddf         z
  }|                    ||dd          }|
                    ddd          |                    ddd          dddddddf         z
  }|                    ||dd          }| j        dv r|d	z  }t          j        |          t          j        t          j        |          d
z             z  t%          j        d	          z  }|d	z  }t          j        |          t          j        t          j        |          d
z             z  t%          j        d	          z  }| j        dk    r|}|}n0t          j        ||gd          }t          j        ||gd          }| j        r| j        s
J d            |                     |          }|                     |          }t          j	        
                                s0|j        dd         |||fk    sJ |j        dd         |||fk    sJ |                    d          |                    d          z   }t          j	        
                                s|j        dd         ||||fk    sJ |                    dd          }|                    dddd          }|                                }t          j	        
                                s|j        dd         |||z  fk    sJ |S )zKGet the relative position bias (RPB) matrix for box-relative position bias.r   r
   Nr   r   r^   r   )r   r      g      ?r   rH   z&activation ckpt not enabled in decoder)x)r	   rU   r_   r   r   r   rQ   r   rR   compileris_dynamo_compilingr   viewreshaperw   signlog2absnprT   trainingr{   r   r   	unsqueezeflattenpermute
contiguous)r0   reference_boxes	feat_sizer   r   
boxes_xyxybsrs   _r   r   deltas_ydeltas_xdeltas_x_logdeltas_y_logBs                   r2   _get_rpb_matrixz"TransformerDecoder._get_rpb_matrix:  sH   1//99!Q??
'-K%-)-)9)9!Q@VXgXm)n)nD&+,a&D'>--// 	*43NS
 4
 4

 "&!;Hhh  000.2.>.>q!_E[.\.\ +!%!1)!<Hh>aT))))>aT))))==B**Z-?-?Aq-I-I!!!QQQPQRSTUPU+-VV==["a88==B**Z-?-?Aq-I-I!!!QQQPQRSTUPU+-VV==["a88;/))#a<L :l33ej<AXAX[^A^6_6__bdbijkblbllL#a<L :l33ej<AXAX[^A^6_6__bdbijkblbllL{e##'' 9h%=2FFF 9h%=2FFF= 	U*TT,TTT*&&&22&&&22~1133 	>>"1"%"k1)=====>"1"%"k1)=====q!!H$6$6q$9$99~1133 	:72A2;2{Aq"99999IIaOOIIaAq!!LLNN~1133 	77122;;A"66666r3   tgt_maskr=   memory_maskrB   r7   r   spatial_shapesvalid_ratiosr?   r@   	apply_dacbool | Nonedecoder_extra_kwargsdict | Nonec                <
   || j         dk    s
J d            ||n| j        }|r|j        d         | j        k    s$| j        r|j        d         | j        j        k    sJ |                    ddd          }|Q|j        d         | j        k    s$| j        r|j        d         | j        j        k    sJ |                    ddd          }|j        d         }g }g }d}| j        ri|c| j	        j
                            d          }|r|                    d|d          n|                    d|d          }|                                }|g}nd}d}|}d}| j        +|du r'| j        j
        d                             d|d          }| j        }|r| j        | j        }| j        }|r| j        | j        }t'          | j                  D ]\  }}|dddddf         t+          j        |	|	gd          dddf         z  }t/          |dddddddf         | j                  }|                     |          }| j         dk    rW|U|j        d         dk    s
J d	            |                     ||d
         |d         f          }|                    dd          }| j        r| j        s
J d             |d|||
|||||||| j        |d|pi ||d\  }}| j        rt?          |          } |(| j         s ||          }!n | ||                    }!nk|d         }"|!                    d          |"k    sJ |                     |d|"                   }# |||"d                   }$t+          j        |#|$gd          }!|!| z   }%|%                                }&|&"                                }|| j#        dz
  k    r|$                    |&           ntK          d          |$                     ||                     | j        |du r| &                    | '                    |                    (                    d          }'| j)        r"|'*                    | j+         | j+                   |$                    |'           |,                                }| j-        s4| j.        -t+          j/        | j0        | j.        d          | _0        d| _-        t+          j1        |          t+          j1        |          | j        |du rt+          j1        |          nd|fS )z'Forward pass of the TransformerDecoder.Nro   zOinputting a memory_mask in the presence of boxRPB is unexpected/not implementedr   r^   r
   Fr   z%only single scale support implemented)r   r   )r   r
   z3Activation checkpointing not enabled in the decoder)r:   r>   r?   r@   rA   rB   rC   rD   rE   rV   rW   rX   )obj_roi_memory_featobj_roi_memory_maskQ_detrH   znot implemented yet)minmaxT)mode	fullgraphr5   )2rw   rV   r_   rs   r   r   num_embeddingsrepeatru   r   r   r   sigmoidrX   expandr   r   r   r   r   r   rR   rT   r   r   r   r   r   r   r{   rW   r   r~   sizedetachrr   appendNotImplementedErrorr   r   squeezer|   clampr}   cloner   r   compiler\   stack)(r0   r:   rA   r   r   rB   r7   r   r   r   r?   r@   r   is_instance_promptr   r   r   box_head_trkr   intermediateintermediate_presence_logitspresence_featsintermediate_ref_boxesoutputpresence_outbox_headout_normr   r   reference_points_inputquery_sine_embed	query_posreference_before_sigmoiddelta_unsigr   delta_unsig_detdelta_unsig_trkoutputs_unsignew_reference_points"intermediate_layer_presence_logitss(                                           r2   r\   zTransformerDecoder.forwardx  sR   2 ";&(((a )(( "+!6IIDH	 	BIaLD$444' 5-0Yq\T=V=e-e-e-e **Q1%%C *'-a0D4DDD+ E1@1Fq1ITMfMu1u1u1u #2"8"8Aq"A"AYq\')$? 		*&"&"7">"H"H"K"KFO"u/"8"8B"B"B"BUdUkUklmoqstUuUu"1"9"9";";&5%6"""O%)"*/AU/J/J.5d;BB1b"MML? 	0$":"F/H9 	*$"4"@)H )$+ 6 6 P	6 P	6Iu111d
+ei|8TVX.Y.YZ^`a`a`aZa.bb #  :&qqq!!!Qz2DL   
 ++,<==I{f$$)D%+A.!3335\333"22##D)>$+?@  *11!Q77} f.ee0eee.#(5 $''$7(?' +#'#:+$ $ (-2$ %8$7!$ $ $ $ FL(  A+:?+K+K('> A&.hv&6&6&.hxx/?/?&@&@ 19E!;;q>>U2222&*oofVeVn&E&EO&2l6%&&>&B&BO"')_o,NTU"V"V"VK +.F F'4'<'<'>'>$"6"="="?"?! 333*112FGGG)*?@@@ 0 0111".3E3N3N595M5M00>>6 6'"++ 3
 - 6<<!>> = =   
 -334VWWW!-!3!3!5!5} 	!!2!> =D<MY]^^^DL DM K%%K.// &27IU7R7R 8999	
 		
r3   )Fr   Fro   Fr
   TFNFTrp   TFF)&r   r   rq   r   rr   r   rs   r   rt   r   ru   r   rv   r   rV   r   rw   rx   ry   r   rz   r   rW   r   r{   r   rX   r   r|   r   r}   r   r~   r   r   r   r   r   )NNNNNNNNNNFNNNN)r   r=   r   r=   rB   r=   r7   r=   r   r=   r   r=   r   r=   r?   r=   r@   r=   r   r   r   r   )
rf   rg   rh   ri   r   rj   r   r   r\   rk   rl   s   @r2   rn   rn      s        << ! $ #'#($&*.2/3+0',5y( y( y( y( y( y( y(v " " \"< < <D "&$(04 (,'+%)$(,0!% ,0  -j
 j
 j
 j
 j
 j
 j
 j
 j
r3   rn   )ri   
__future__r   numpyr   rR   r   torchvision.ops.roi_alignr   "ultralytics.nn.modules.transformerr   ultralytics.nn.modules.utilsr   r   ultralytics.utils.opsr	   
model_miscr   Moduler   rn   r5   r3   r2   <module>r     s   
 # " " " " "            . . . . . . 2 2 2 2 2 2 E E E E E E E E + + + + + + 2 2 2 2 2 2[" [" [" [" ["bi [" [" ["|m
 m
 m
 m
 m
 m
 m
 m
 m
 m
r3   