
    j{S                        d dl mZ d dlZd dlmZ d dlZd dlmc mZ	 dAd	Z
dBdCdZdDdEdZdFdGdZdHdZ	 dIdJd#ZdKd%ZdLd*ZdMd/ZdNd4Z	 	 dOdPd9Z	 	 dQdRd@ZdS )S    )annotationsN)Any	frame_idxintcond_frame_outputsdict[int, Any]max_cond_frame_numc                    |dk    st                    |k    ri }n|dk    s
J d            i t           fdD             d          }||         |<   t           fdD             d          }||         |<   |t                    z
  }t          fdD              fd	
          d|         }                    fd|D                        fd                                D             }|fS )ab  Select the closest conditioning frames to a given frame index.

    Args:
        frame_idx (int): Current frame index.
        cond_frame_outputs (dict[int, Any]): Dictionary of conditioning frame outputs keyed by frame indices.
        max_cond_frame_num (int): Maximum number of conditioning frames to select.

    Returns:
        selected_outputs (dict[int, Any]): Selected items from cond_frame_outputs.
        unselected_outputs (dict[int, Any]): Items not selected from cond_frame_outputs.

    Examples:
        >>> frame_idx = 5
        >>> cond_frame_outputs = {1: "a", 3: "b", 7: "c", 9: "d"}
        >>> max_cond_frame_num = 2
        >>> selected, unselected = select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num)
        >>> print(selected)
        {3: 'b', 7: 'c'}
        >>> print(unselected)
        {1: 'a', 9: 'd'}
       z,we should allow using 2+ conditioning framesc              3  (   K   | ]}|k     |V  d S N .0tr   s     i/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/models/sam/modules/utils.py	<genexpr>z-select_closest_cond_frames.<locals>.<genexpr>*   s'      II1y==!====II    N)defaultc              3  (   K   | ]}|k    |V  d S r   r   r   s     r   r   z-select_closest_cond_frames.<locals>.<genexpr>/   s'      IIq!y......IIr   c              3  $   K   | ]
}|v|V  d S r   r   )r   r   selected_outputss     r   r   z-select_closest_cond_frames.<locals>.<genexpr>7   s.      HH1a7G.G.GQ.G.G.G.GHHr   c                (    t          | z
            S r   )abs)xr   s    r   <lambda>z,select_closest_cond_frames.<locals>.<lambda>8   s    #a)m,, r   )keyc              3  ,   K   | ]}||         fV  d S r   r   )r   r   r   s     r   r   z-select_closest_cond_frames.<locals>.<genexpr>:   s.      PPq$6q$9 :PPPPPPr   c                $    i | ]\  }}|v	||S r   r   )r   r   vr   s      r   
<dictcomp>z.select_closest_cond_frames.<locals>.<dictcomp>;   s*    gggtq!QVfMfMfaMfMfMfr   )lenmaxminsortedupdateitems)	r   r   r	   unselected_outputs
idx_before	idx_after
num_remaininds_remainr   s	   ``      @r   select_closest_cond_framesr.      s   , R3'9#:#:>P#P#P-!Q&&&(V&&& IIII%7IIISWXXX
!+=j+IZ( IIII$6IIISWXXX	 *<Y*GY' (#.>*?*??
HHHH*HHH,,,,
 
 
 :+ 	PPPPKPPPPPPgggg/A/G/G/I/Iggg///r   '  pos_indstorch.Tensordimtemperaturefloatc                   |dz  }t          j        || j        | j                  }|d|dz  z  |z  z  }|                     d          |z  }t          j        |                                |                                gd          }|S )a  Generate 1D sinusoidal positional embeddings for given positions and dimensions.

    Args:
        pos_inds (torch.Tensor): Position indices for which to generate embeddings.
        dim (int): Dimension of the positional embeddings. Should be an even number.
        temperature (float, optional): Scaling factor for the frequency of the sinusoidal functions.

    Returns:
        (torch.Tensor): Sinusoidal positional embeddings with shape (pos_inds.shape, dim).

    Examples:
        >>> pos = torch.tensor([0, 1, 2, 3])
        >>> embeddings = get_1d_sine_pe(pos, 128)
        >>> embeddings.shape
        torch.Size([4, 128])
    r   dtypedevicer   r2   )torcharanger7   r8   	unsqueezecatsincos)r0   r2   r3   pe_dimdim_t	pos_embeds         r   get_1d_sine_perC   @   s    " AXFLx~hoNNNEA!,v56E""2&&.I	9==??IMMOO<"EEEIr         ?end_xend_yscaleoffsetc                    t          j        | |z  t           j                  }|| z                                  }t          j        || d                                          }||z  |z   ||z  |z   fS )up  Initialize 1D and 2D coordinate tensors for a grid of specified dimensions.

    This function creates coordinate tensors for a grid with dimensions end_x × end_y. It generates a linear index
    tensor and corresponding x and y coordinate tensors.

    Args:
        end_x (int): Width of the grid (number of columns).
        end_y (int): Height of the grid (number of rows).
        scale (float): Scaling factor to apply to the coordinates.
        offset (int): Offset to add to the coordinates.

    Returns:
        t_x (torch.Tensor): X-coordinates for each position, with shape (end_x * end_y).
        t_y (torch.Tensor): Y-coordinates for each position, with shape (end_x * end_y).

    Examples:
        >>> t_x, t_y = init_t_xy(3, 2)
        >>> print(t_x)
        tensor([0., 1., 2., 0., 1., 2.])
        >>> print(t_y)
        tensor([0., 0., 0., 1., 1., 1.])
    )r7   floor)rounding_mode)r:   r;   float32r4   div)rE   rF   rG   rH   r   t_xt_ys          r   	init_t_xyrP   Z   sr    . 	UU]%-888Au9



C
)AuG
4
4
4
:
:
<
<C;uv!555r        @theta	scale_posc                <   d|t          j        d| d          d| dz                                           | z  z  z  }d|t          j        d| d          d| dz                                           | z  z  z  }t          |||          \  }}t          j        ||          }t          j        ||          }t          j        t          j        |          |          }	t          j        t          j        |          |          }
t          j        |	|
gd          S )aa  Compute axial complex exponential positional encodings for 2D spatial positions in a grid.

    This function generates complex exponential positional encodings for a 2D grid of spatial positions, using separate
    frequency components for the x and y dimensions.

    Args:
        dim (int): Dimension of the positional encoding.
        end_x (int): Width of the 2D grid.
        end_y (int): Height of the 2D grid.
        theta (float, optional): Scaling factor for frequency computation.
        scale_pos (float, optional): Scaling factor for position coordinates.

    Returns:
        (torch.Tensor): Complex exponential positional encodings with shape (end_x*end_y, dim//2).

    Examples:
        >>> dim, end_x, end_y = 128, 8, 8
        >>> freqs_cis = compute_axial_cis(dim, end_x, end_y)
        >>> freqs_cis.shape
        torch.Size([64, 64])
    rD   r      N)rG   r   r9   )r:   r;   r4   rP   outerpolar	ones_liker=   )r2   rE   rF   rR   rS   freqs_xfreqs_yrN   rO   freqs_cis_xfreqs_cis_ys              r   compute_axial_cisr]   w   s   , Uu|AsA66|#(|DJJLLsRSTGUu|AsA66|#(|DJJLLsRSTGY777HCk#w''Gk#w''G+eog66@@K+eog66@@K9k;/R8888r   	freqs_cisr   c                    |j         dk    sJ | j        |j        d         |j        d         fk    sJ fdt          |j                  D             } | j        | S )aw  Reshape frequency tensor for broadcasting with input tensor.

    Reshapes a frequency tensor to ensure dimensional compatibility for broadcasting with an input tensor. This function
    is typically used in positional encoding operations.

    Args:
        freqs_cis (torch.Tensor): Frequency tensor with shape matching the last two dimensions of x.
        x (torch.Tensor): Input tensor to broadcast with.

    Returns:
        (torch.Tensor): Reshaped frequency tensor ready for broadcasting with the input tensor.

    Raises:
        AssertionError: If the shape of freqs_cis doesn't match the last two dimensions of x.
    r   r   c                0    g | ]\  }}|d z
  k    r|ndS )r      r   )r   idndims      r   
<listcomp>z)reshape_for_broadcast.<locals>.<listcomp>   s-    FFF41a!tax--QQQFFFr   )re   shape	enumerateview)r^   r   rg   re   s      @r   reshape_for_broadcastrj      so      6D19999?qwr{AGBK88888FFFF9QW3E3EFFFE9>5!!r   Fxqxkrepeat_freqs_kboolc                b   t          j         |                                 j        g | j        dd         ddR            }|j        d         dk    rBt          j         |                                j        g |j        dd         ddR            nd}t          ||          }t          j        ||z                                d          }|/|                    |           	                    | j
                  |fS |r|j        d         |j        d         z  x}dk    r|j
        j        dk    rZt          j        |          } |j        g dg|j        dz
  z  |ddR  }t          j        |                                          }n |j        g dg|j        dz
  z  |dR  }t          j        ||z                                d          }|                    |           	                    | j
                  |                    |          	                    |j
                  fS )	a  Apply rotary positional encoding to query and key tensors.

    This function applies rotary positional encoding (RoPE) to query and key tensors using complex-valued frequency
    components. RoPE is a technique that injects relative position information into self-attention mechanisms.

    Args:
        xq (torch.Tensor): Query tensor to encode with positional information.
        xk (torch.Tensor): Key tensor to encode with positional information.
        freqs_cis (torch.Tensor): Complex-valued frequency components for rotary encoding with shape matching the last
            two dimensions of xq.
        repeat_freqs_k (bool, optional): Whether to repeat frequency components along sequence length dimension to match
            key sequence length.

    Returns:
        xq_out (torch.Tensor): Query tensor with rotary positional encoding applied.
        xk_out (torch.Tensor): Key tensor with rotary positional encoding applied, or original xk if xk is empty.

    Examples:
        >>> import torch
        >>> xq = torch.randn(2, 8, 16, 64)  # [batch, heads, seq_len, dim]
        >>> xk = torch.randn(2, 8, 16, 64)
        >>> freqs_cis = compute_axial_cis(64, 4, 4)  # For a 4x4 spatial grid with dim=64
        >>> q_encoded, k_encoded = apply_rotary_enc(xq, xk, freqs_cis)
    Nr   r   r`   r      rb   mps)r:   view_as_complexr4   reshaperg   rj   view_as_realflattentype_astor8   typerepeatre   
contiguous)	rk   rl   r^   rm   xq_xk_xq_outrxk_outs	            r   apply_rotary_encr      s+   < 
 2

 2 IBHSbSM I2 Iq I I I
J
JCNPhWYl^_N_N_%
 2

 2 IBHSbSM I2 Iq I I I
J
J
JeiC%i55Ii0088;;F
{~~b!!$$RY//33 N	"2 >>1!CC E))*955I(	(PA3).12D+EPP1PaPPPI-i.B.B.D.DEEII(	(MA3).12D+EMM1MMMIi0088;;F>>"  ++V^^B-?-?-B-B29-M-MMMr   window_sizec           	     x   | j         \  }}}}|||z  z
  |z  }|||z  z
  |z  }|dk    s|dk    rt          j        | ddd|d|f          } ||z   ||z   }	}|                     |||z  ||	|z  ||          } |                     dddddd                                                              d|||          }
|
||	ffS )ae  Partition input tensor into non-overlapping windows with padding if needed.

    Args:
        x (torch.Tensor): Input tensor with shape (B, H, W, C).
        window_size (int): Size of each window.

    Returns:
        windows (torch.Tensor): Partitioned windows with shape (B * num_windows, window_size, window_size, C).
        padded_h_w (tuple[int, int]): Padded height and width before partition.

    Examples:
        >>> x = torch.randn(1, 16, 16, 3)
        >>> windows, (Hp, Wp) = window_partition(x, window_size=4)
        >>> print(windows.shape, Hp, Wp)
        torch.Size([16, 4, 4, 3]) 16 16
    r   rb   rp   r   rU      r   )rg   Fpadri   permuterz   )r   r   BHWCpad_hpad_wHpWpwindowss              r   window_partitionr      s    " JAq!Q1{?*k9E1{?*k9EqyyEAIIE!aAua/00YE	B	q"#["2C[RSTTAii1aAq))4466;;B[Z[\\GRHr   r   pad_hwtuple[int, int]hwc                t   |\  }}|\  }}| j         d         ||z  |z  |z  z  }|                     |||z  ||z  ||d          }	|	                    dddddd                                                              |||d          }	||k    s||k    r&|	ddd|d|ddf                                         }	|	S )	a_  Unpartition windowed sequences into original sequences and remove padding.

    This function reverses the windowing process, reconstructing the original input from windowed segments and removing
    any padding that was added during the windowing process.

    Args:
        windows (torch.Tensor): Input tensor of windowed sequences with shape (B * num_windows, window_size,
            window_size, C), where B is the batch size, num_windows is the number of windows, window_size is the size of
            each window, and C is the number of channels.
        window_size (int): Size of each window.
        pad_hw (tuple[int, int]): Padded height and width (Hp, Wp) of the input before windowing.
        hw (tuple[int, int]): Original height and width (H, W) of the input before padding and windowing.

    Returns:
        (torch.Tensor): Unpartitioned sequences with shape (B, H, W, C), where B is the batch size, H and W are the
            original height and width, and C is the number of channels.

    Examples:
        >>> windows = torch.rand(32, 8, 8, 64)  # 32 windows of size 8x8 with 64 channels
        >>> pad_hw = (16, 16)  # Padded height and width
        >>> hw = (15, 14)  # Original height and width
        >>> x = window_unpartition(windows, window_size=8, pad_hw=pad_hw, hw=hw)
        >>> print(x.shape)
        torch.Size([1, 15, 14, 64])
    r   r   rb   rp   r   rU   r   N)rg   ri   r   rz   )
r   r   r   r   r   r   r   r   r   r   s
             r   window_unpartitionr      s    4 FBDAqaR"W3{BCAQk)2+<k;XZ[[A			!Q1a##..0055aRDDA	Avvaaaa!RaRlO&&((Hr   q_sizek_sizerel_posreturnc                   t          dt          | |          z  dz
            }|j        d         |k    rwt          j        |                    d|j        d         d                              ddd          |d          }|                    d|                              dd          }n|}t          j        |           dddf         t          || z  d          z  }t          j        |          dddf         t          | |z  d          z  }||z
  |dz
  t          | |z  d          z  z   }||	                                         S )	a  Extract relative positional embeddings based on query and key sizes.

    Args:
        q_size (int): Size of the query.
        k_size (int): Size of the key.
        rel_pos (torch.Tensor): Relative position embeddings with shape (L, C), where L is the maximum relative distance
            and C is the embedding dimension.

    Returns:
        (torch.Tensor): Extracted positional embeddings according to relative positions, with shape (q_size, k_size, C).

    Examples:
        >>> q_size, k_size = 8, 16
        >>> rel_pos = torch.randn(31, 64)  # 31 = 2 * max(8, 16) - 1
        >>> extracted_pos = get_rel_pos(q_size, k_size, rel_pos)
        >>> print(extracted_pos.shape)
        torch.Size([8, 16, 64])
    r   rb   r   r   linear)sizemodeNrD   )
r   r$   rg   r   interpolaters   r   r:   r;   long)r   r   r   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coordss           r   get_rel_posr   $  sK   & q3vv...233L}Q<''-OOAw}Q/44<<Q1EE
 
 

 *11"lCCKKAqQQ! |F##AAAtG,s6F?C/H/HHH|F##D!!!G,s6F?C/H/HHH(*vzS&RU=V=V.VVO?//1122r   attnq	rel_pos_h	rel_pos_wc                   |\  }}|\  }}	t          |||          }
t          ||	|          }|j        \  }}}|                    ||||          }t          j        d||
          }t          j        d||          }|                     |||||	          |dddddddddf         z   |dddddddddf         z                       |||z  ||	z            } | S )aJ  Add decomposed Relative Positional Embeddings to the attention map.

    This function calculates and applies decomposed Relative Positional Embeddings as described in the MVITv2
    paper. It enhances the attention mechanism by incorporating spatial relationships between query and key
    positions.

    Args:
        attn (torch.Tensor): Attention map with shape (B, q_h * q_w, k_h * k_w).
        q (torch.Tensor): Query tensor in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (torch.Tensor): Relative position embeddings for height axis with shape (Lh, C).
        rel_pos_w (torch.Tensor): Relative position embeddings for width axis with shape (Lw, C).
        q_size (tuple[int, int]): Spatial sequence size of query q as (q_h, q_w).
        k_size (tuple[int, int]): Spatial sequence size of key k as (k_h, k_w).

    Returns:
        (torch.Tensor): Updated attention map with added relative positional embeddings, shape (B, q_h * q_w, k_h *
            k_w).

    Examples:
        >>> B, C, q_h, q_w, k_h, k_w = 1, 64, 8, 8, 8, 8
        >>> attn = torch.rand(B, q_h * q_w, k_h * k_w)
        >>> q = torch.rand(B, q_h * q_w, C)
        >>> rel_pos_h = torch.rand(2 * max(q_h, k_h) - 1, C)
        >>> rel_pos_w = torch.rand(2 * max(q_w, k_w) - 1, C)
        >>> q_size, k_size = (q_h, q_w), (k_h, k_w)
        >>> updated_attn = add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size)
        >>> print(updated_attn.shape)
        torch.Size([1, 64, 64])

    References:
        https://github.com/facebookresearch/mvit/blob/main/mvit/models/attention.py
    bhwc,hkc->bhwkbhwc,wkc->bhwkN)r   rg   rs   r:   einsumri   )r   r   r   r   r   r   q_hq_wk_hk_wRhRwr   _r2   r_qrel_hrel_ws                     r   add_decomposed_rel_posr   L  s   P HCHC	S#y	)	)B	S#y	)	)BIAq#
))AsC
%
%CL)333EL)333EIIac3,,uQQQ111aaa5E/FFqqqRSRSRSUVUVUVX\^_^_^_O_I``ff	39cCi D Kr   abs_poshas_cls_tokenretain_cls_tokentilingc           
     P   |r|sJ |\  }}|r| ddddf         }| ddddf         } | j         d         }t          t          j        |                    }	|	|	z  |k    sJ |	|k    s|	|k    r|                     d|	|	d                              dddd          }
|rT|
                    ddgd t          ||f|
j         dd                   D             z             ddddd|d|f         }
nt          j	        |
||fdd	
          }
|s|
                    dddd          S |sJ t          j        ||
                    dddd                              d||z  d          gd          S |s|                     d||d          S |sJ t          j        || gd          S )a  Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
    original embeddings.

    Args:
        abs_pos (torch.Tensor): Absolute positional embeddings with shape (1, num_position, C).
        has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
        hw (tuple[int, int]): Size of input image tokens.
        retain_cls_token (bool): Whether to retain the cls_token.
        tiling (bool): Whether to tile the embeddings, *instead* of interpolation (a la abs_win).

    Returns:
        (torch.Tensor): Absolute positional embeddings after processing with shape (1, H, W, C) if retain_cls_token is
            False, otherwise (1, 1+H*W, C).
    Nrb   r   r   rp   r   c                $    g | ]\  }}||z  d z   S )rb   r   )r   r   ys      r   rf   zget_abs_pos.<locals>.<listcomp>  s$    4o4o4oDAqQ!VaZ4o4o4or   bicubicF)r   r   align_cornersr9   )rg   r   mathsqrtrs   r   tilezipr   r   r:   r=   )r   r   r   r   r   hwcls_posxy_numr   new_abs_poss              r   get_abs_posr     s*   *  }DAq !!!!RaR%.!!!QRR%.]1Fty  !!D$;&    qyyDAIIooatR88@@Aq!LL 
	%**Aq64o4oCQRTUPVXcXijkjljlXmLnLn4o4o4o+opp111bqb"1"KK -V#	  K   	&&q!Q222 !  =9+--aAq99AA!QUBOOP      	8??1aB///   =9gw/Q7777r   kq_hwk_hwrescaler   !tuple[torch.Tensor, torch.Tensor]c                   |\  }}	|\  }
}||	k    r|
|k    s
J d            |||         }||         }n"t          ||
|          }t          |	||          }| j        \  }}}|                     |||	|          }|dz  }|r||
z   |z   dz  n|}||z  }t          j        d||          |z  }t          j        d||          |z  }t          j        |
| j        | j                  }t          j        || j        | j                  }|                    d|
d|
          	                    ||
||
g          }|                    dd||          	                    ||
||g          }t          j
        ||z  ||gd	                              |||	z  d          } t          j
        |                    ||
|d          ||gd	                              ||
|z  d          }| |fS )
a  Concatenate rel pos coeffs to the q & k tensors, so that qk^T is now effectively including rel pos biases.

    Args:
        q (torch.Tensor): Query tensor with shape (B, L_q, C).
        k (torch.Tensor): Key tensor with shape (B, L_k, C).
        q_hw (tuple[int, int]): Spatial size of query tensors as (height, width).
        k_hw (tuple[int, int]): Spatial size of key tensors as (height, width).
        rel_pos_h (torch.Tensor): Relative positional embeddings for the height axis.
        rel_pos_w (torch.Tensor): Relative positional embeddings for the width axis.
        rescale (bool): Whether to rescale for use with SDPA, which would scale by the wrong factor due to the concat.
        relative_coords (torch.Tensor | None): Precomputed relative coords index tensor.

    Returns:
        q (torch.Tensor): Query tensor padded so that qk^T accounts for relative position biases.
        k (torch.Tensor): Key tensor padded so that qk^T accounts for relative position biases.
    zonly square inputs supportedNg      ?r   r   r6   rb   r   r9   )r   rg   rs   r:   r   eyer7   r8   ri   expandr=   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r2   r   	old_scale	new_scalescale_ratior   r   eye_heye_ws                            r   concat_rel_posr     s   4 HCHC3JJSCZZZ*HZZ("''c9--c9--IAq#
))AsC
%
%CSI,3BsSS((Ii'KL)333i?EL)333i?EIc:::EIc:::EJJq#q#&&--q#sC.@AAEJJq!S#&&--q#sC.@AAE	3$eU3<<<AA!S3YPRSSA	166!S#r**E59rBBBGG3QT9VXYYAa4Kr   )r   r   r   r   r	   r   )r/   )r0   r1   r2   r   r3   r4   )rD   r   )rE   r   rF   r   rG   r4   rH   r   )rQ   rD   )
r2   r   rE   r   rF   r   rR   r4   rS   r4   )r^   r1   r   r1   )F)rk   r1   rl   r1   r^   r1   rm   rn   )r   r1   r   r   )r   r1   r   r   r   r   r   r   )r   r   r   r   r   r1   r   r1   )r   r1   r   r1   r   r1   r   r1   r   r   r   r   r   r1   )FF)r   r1   r   rn   r   r   r   rn   r   rn   r   r1   )FN)r   r1   r   r1   r   r   r   r   r   r1   r   r1   r   rn   r   r1   r   r   )
__future__r   r   typingr   r:   torch.nn.functionalnn
functionalr   r.   rC   rP   r]   rj   r   r   r   r   r   r   r   r   r   r   <module>r      s   # " " " " "                 10 10 10 10h    46 6 6 6 6:9 9 9 9 9B" " " "6 !	/N /N /N /N /Nd   <" " " "J%3 %3 %3 %3P6 6 6 6z #>8 >8 >8 >8 >8P $(: : : : : : :r   