
    /j#                      d Z ddlmZ ddlZddlmZ ddlmc mZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dZ G d	 d
ej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d dej                  Z G d de          Z  G d d ej                  Z! G d! d"e          Z" G d# d$e          Z# G d% d&ej                  Z$ G d' d(ej                  Z% G d) d*ej                  Z& G d+ d,ej                  Z' G d- d.ej                  Z( G d/ d0ej                  Z) G d1 d2ej                  Z* G d3 d4ej                  Z+ G d5 d6ej                  Z, G d7 d8ej                  Z- G d9 d:e%          Z. G d; d<e          Z/ G d= d>ej                  Z0 G d? d@e0          Z1 G dA dBej                  Z2 G dC dDej                  Z3 G dE dFej                  Z4 G dG dHej                  Z5 G dI dJej                  Z6 G dK dLej                  Z7 G dM dNe          Z8 G dO dPe          Z9 G dQ dRej        j                  Z: G dS dTej                  Z; G dU dVe          Z< G dW dXej                  Z= G dY dZej                  Z> G d[ d\ej                  Z? G d] d^ej                  Z@ G d_ d`e          ZA G da dbej                  ZB G dc ddej                  ZC G de dfej                  ZD G dg dhej                  ZE G di djej                  ZF G dk dlej                  ZG G dm dnej                  ZH G do dpej                  ZI G dq dre          ZJ G ds dtej                  ZKdS )uzBlock modules.    )annotationsN)fuse_conv_and_bn   )ConvDWConv	GhostConv	LightConvRepConvautopad)TransformerBlock)'C1C2C2PSAC3C3TRCIBDFLELAN1PSASPPSPPELANSPPFAConvADown	AttentionBNContrastiveHead
BottleneckBottleneckCSPC2fC2fAttnC2fCIBC2fPSAC3GhostC3k2C3xCBFuseCBLinearContrastiveHeadGhostBottleneckHGBlockHGStemImagePoolingAttnProtoRepC3RepNCSPELAN4RepVGGDWResNetLayerSCDownTorchVisionc                  .     e Zd ZdZd
d fdZdd	Z xZS )r   zIntegral module of Distribution Focal Loss (DFL).

    Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391
       c1intc                r   t                                                       t          j        |ddd                              d          | _        t          j        |t          j                  }t          j	        |
                    d|dd                    | j        j        j        dd<   || _        dS )zInitialize a convolutional layer with a given number of input channels.

        Args:
            c1 (int): Number of input channels.
        r   FbiasdtypeN)super__init__nnConv2drequires_grad_convtorcharangefloat	Parameterviewweightdatar6   )selfr6   x	__class__s      a/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/nn/modules/block.pyr>   zDFL.__init__@   s     	Ib!QU333BB5II	L5;///#%<q"a0C0C#D#D	aaa     rK   torch.Tensorreturnc                    |j         \  }}}|                     |                    |d| j        |                              dd                              d                                        |d|          S )zCApply the DFL module to input tensor and return transformed output.      r   )shaperB   rG   r6   	transposesoftmax)rJ   rK   b_as        rM   forwardzDFL.forwardL   sh    '1ayy1dgq11;;AqAAII!LLMMRRSTVWYZ[[[rN   )r5   )r6   r7   rK   rO   rP   rO   __name__
__module____qualname____doc__r>   rZ   __classcell__rL   s   @rM   r   r   :   sh         

 
 
 
 
 
 
\ \ \ \ \ \ \ \rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r-   zBUltralytics YOLO models mask Proto module for segmentation models.       r6   r7   c_c2c                   t                                                       t          ||d          | _        t	          j        ||dddd          | _        t          ||d          | _        t          ||          | _        dS )a  Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.

        Args:
            c1 (int): Input channels.
            c_ (int): Intermediate channels.
            c2 (int): Output channels (number of protos).
           krS   r   Tr9   N)	r=   r>   r   cv1r?   ConvTranspose2dupsamplecv2cv3)rJ   r6   rf   rg   rL   s       rM   r>   zProto.__init__V   sy     	B!$$$*2r1aFFFB!$$$B<<rN   rK   rO   rP   c           	         |                      |                     |                     |                     |                                        S )zEPerform a forward pass through layers using an upsampled input image.)rp   ro   rn   rl   rJ   rK   s     rM   rZ   zProto.forwardd   s6    xxtxx{{!;!;<<===rN   )rd   re   )r6   r7   rf   r7   rg   r7   r[   r\   rb   s   @rM   r-   r-   S   s\        LL             > > > > > > > >rN   r-   c                  ,     e Zd ZdZd fdZdd
Z xZS )r+   zStemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    r6   r7   cmrg   c           	     <   t                                                       t          ||ddt          j                              | _        t          ||dz  dddt          j                              | _        t          |dz  |dddt          j                              | _        t          |dz  |ddt          j                              | _        t          ||ddt          j                              | _	        t          j
        dddd          | _        dS )	zInitialize the StemBlock of PPHGNetV2.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
        ri   rS   actr   r   T)kernel_sizestridepadding	ceil_modeN)r=   r>   r   r?   ReLUstem1stem2astem2bstem3stem4	MaxPool2dpool)rJ   r6   rt   rg   rL   s       rM   r>   zHGStem.__init__o   s     	"b!QBGII666
2rQw1aRWYY???27B1aRWYY???"q&"a		:::
"b!QBGII666
LQq!tTTT			rN   rK   rO   rP   c                   |                      |          }t          j        |g d          }|                     |          }t          j        |g d          }|                     |          }|                     |          }t          j        ||gd          }|                     |          }| 	                    |          }|S )+Forward pass of a PPHGNetV2 backbone layer.)r   r   r   r   r   dim)
r}   Fpadr~   r   r   rC   catr   r   )rJ   rK   x2x1s       rM   rZ   zHGStem.forward   s    JJqMME!\\\""[[^^U2|||$$[[__YYq\\Ir2hA&&&JJqMMJJqMMrN   )r6   r7   rt   r7   rg   r7   r[   r\   rb   s   @rM   r+   r+   i   sa         
U U U U U U        rN   r+   c                  R     e Zd ZdZdddd ej                    fd fdZddZ xZS )r*   zHG_Block of PPHGNetV2 with 2 convolutions and LightConv.

    https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py
    ri      Fr6   r7   rt   rg   rk   n	lightconvboolshortcutrw   	nn.Modulec	                t  	 t                                                       |rt          nt          	t	          j        	fdt          |          D                       | _        t          |z  z   |dz  dd          | _        t          |dz  |dd          | _	        |o|k    | _
        dS )a  Initialize HGBlock with specified parameters.

        Args:
            c1 (int): Input channels.
            cm (int): Middle channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            n (int): Number of LightConv or Conv blocks.
            lightconv (bool): Whether to use LightConv.
            shortcut (bool): Whether to use shortcut connection.
            act (nn.Module): Activation function.
        c              3  F   K   | ]} |d k    rn          V  dS )r   rk   rw   N ).0irw   blockr6   rt   rk   s     rM   	<genexpr>z#HGBlock.__init__.<locals>.<genexpr>   sC      __QRuu166RRr2LLL______rN   rS   r   rv   N)r=   r>   r	   r   r?   
ModuleListrangemscecadd)rJ   r6   rt   rg   rk   r   r   r   rw   r   rL   s    `` `   `@rM   r>   zHGBlock.__init__   s    . 	&0		D________V[\]V^V^_____rAF{B!GQs;;;rQwAqc222(brN   rK   rO   rP   c                    |g                     fd| j        D                        |                     |                     t	          j        d                              | j        r|z   nS )r   c              3  :   K   | ]} |d                    V  dS Nr   r   r   ys     rM   r   z"HGBlock.forward.<locals>.<genexpr>   /      **a1R5******rN   r   )extendr   r   r   rC   r   r   rJ   rK   r   s     @rM   rZ   zHGBlock.forward   so    C	****46******GGDGGEIaOO,,--'q1uua'rN   )r6   r7   rt   r7   rg   r7   rk   r7   r   r7   r   r   r   r   rw   r   r[   )	r]   r^   r_   r`   r?   r|   r>   rZ   ra   rb   s   @rM   r*   r*      sy           ) ) ) ) ) ) )<( ( ( ( ( ( ( (rN   r*   c                  .     e Zd ZdZdd fdZddZ xZS )r   zDSpatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.   	      r6   r7   rg   rk   tuple[int, ...]c                   t                                                       |dz  }t          ||dd          | _        t          |t	          |          dz   z  |dd          | _        t          j        d |D                       | _        dS )zInitialize the SPP layer with input/output channels and pooling kernel sizes.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (tuple): Kernel sizes for max pooling.
        rS   r   c                B    g | ]}t          j        |d |dz            S )r   rS   rx   ry   rz   )r?   r   )r   rK   s     rM   
<listcomp>z SPP.__init__.<locals>.<listcomp>   s/    aaaZ[1aSTf U U UaaarN   N)	r=   r>   r   rl   lenro   r?   r   r   )rJ   r6   rg   rk   rf   rL   s        rM   r>   zSPP.__init__   s     	1WB1%%c!ffqj)2q!44aa_`aaabbrN   rK   rO   rP   c                    |                                |                     t          j        gfd| j        D             z   d                    S )zBForward pass of the SPP layer, performing spatial pyramid pooling.c                &    g | ]} |          S r   r   )r   r   rK   s     rM   r   zSPP.forward.<locals>.<listcomp>   s!    (>(>(>!1(>(>(>rN   r   )rl   ro   rC   r   r   rr   s    `rM   rZ   zSPP.forward   sN    HHQKKxx	1#(>(>(>(>tv(>(>(>">BBCCCrN   )r   )r6   r7   rg   r7   rk   r   r[   r\   rb   s   @rM   r   r      sk        NNc c c c c c cD D D D D D D DrN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   zGSpatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.r   ri   Fr6   r7   rg   rk   r   r   r   c                $   t                                                       |dz  }t          ||ddd          | _        t          ||dz   z  |dd          | _        t          j        |d|dz            | _        || _        |o||k    | _	        dS )a  Initialize the SPPF layer with given input/output channels and kernel size.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            n (int): Number of pooling iterations.
            shortcut (bool): Whether to use shortcut connection.

        Notes:
            This module is equivalent to SPP(k=(5, 9, 13)).
        rS   r   Frv   r   N)
r=   r>   r   rl   ro   r?   r   r   r   r   )rJ   r6   rg   rk   r   r   rf   rL   s          rM   r>   zSPPF.__init__   s     	1WB1%000a!eb!Q//!AqAvFFF(brN   rK   rO   rP   c           
     .                          |          g                     fdt          t           dd                    D                                             t          j        d                    t           dd          r|z   nS )zRApply sequential pooling operations to input and return concatenated feature maps.c              3  N   K   | ]}                     d                    V   dS r   r   )r   rX   rJ   r   s     rM   r   zSPPF.forward.<locals>.<genexpr>   s1      EE1"EEEEEErN   r   ri   r   r   F)rl   r   r   getattrro   rC   r   r   s   ` @rM   rZ   zSPPF.forward   s    XXa[[M	EEEEEgdC.C.C(D(DEEEEEEHHUYq!__%%eU33:q1uu:rN   )r   ri   F)
r6   r7   rg   r7   rk   r7   r   r7   r   r   r[   r\   rb   s   @rM   r   r      s\        QQ) ) ) ) ) ) )*; ; ; ; ; ; ; ;rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z"CSP Bottleneck with 1 convolution.r   r6   r7   rg   r   c                    t                                                       t          |dd          | _        t	          j        fdt          |          D              | _        dS )zInitialize the CSP Bottleneck with 1 convolution.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of convolutions.
        r   c              3  :   K   | ]}t          d           V  dS )ri   Nr   )r   rX   rg   s     rM   r   zC1.__init__.<locals>.<genexpr>   s-       C CQb"a C C C C C CrN   N)r=   r>   r   rl   r?   
Sequentialr   r   )rJ   r6   rg   r   rL   s     ` rM   r>   zC1.__init__   s^     	B1%% C C C C%(( C C CDrN   rK   rO   rP   c                \    |                      |          }|                     |          |z   S )z:Apply convolution and residual connection to input tensor.)rl   r   r   s      rM   rZ   z
C1.forward   s%    HHQKKvvayy1}rN   r   )r6   r7   rg   r7   r   r7   r[   r\   rb   s   @rM   r   r      sc        ,,
E 
E 
E 
E 
E 
E 
E       rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z#CSP Bottleneck with 2 convolutions.r   T      ?r6   r7   rg   r   r   r   gerE   c                R    t                                                       t          ||z             _        t	          |d j        z  dd           _        t	          d j        z  |d           _        t          j         fdt          |          D               _
        dS )a_  Initialize a CSP Bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rS   r   c           	   3  V   K   | ]#}t          j        j        d d          V  $dS )ri   ri   r         ?rk   r   Nr   cr   rX   r   rJ   r   s     rM   r   zC2.__init__.<locals>.<genexpr>  s?       v vhiDFDFHaK[_b!c!c!c v v v v v vrN   Nr=   r>   r7   r   r   rl   ro   r?   r   r   r   rJ   r6   rg   r   r   r   r   rL   s   `   `` rM   r>   zC2.__init__  s     	R!VAJ1--DF
B** v v v v v vmrstmumu v v vwrN   rK   rO   rP   c                    |                      |                              dd          \  }}|                     t          j        |                     |          |fd                    S )z<Forward pass through the CSP bottleneck with 2 convolutions.rS   r   )rl   chunkro   rC   r   r   rJ   rK   rY   rW   s       rM   rZ   z
C2.forward  sQ    xx{{  A&&1xx	466!99a.!44555rN   r   Tr   r   r6   r7   rg   r7   r   r7   r   r   r   r7   r   rE   r[   r\   rb   s   @rM   r   r     sc        --x x x x x x x$6 6 6 6 6 6 6 6rN   r   c                  6     e Zd ZdZdd fdZddZddZ xZS )r   <Faster Implementation of CSP Bottleneck with 2 convolutions.r   Fr   r6   r7   rg   r   r   r   r   r   rE   c                d    t                                                       t          ||z             _        t	          |d j        z  dd           _        t	          d|z    j        z  |d           _        t          j         fdt          |          D                        _
        dS )a_  Initialize a CSP bottleneck with 2 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rS   r   c           	   3  V   K   | ]#}t          j        j        d d          V  $dS r   r   r   s     rM   r   zC2f.__init__.<locals>.<genexpr>2  ?      ttfgz$&$&(AIY]`aaattttttrN   N)r=   r>   r7   r   r   rl   ro   r?   r   r   r   r   s   `   `` rM   r>   zC2f.__init__#  s     	R!VAJ1--Q$&("a00ttttttkpqrkskstttttrN   rK   rO   rP   c                   t          |                     |                              dd                                        fd| j        D                        |                     t          j        d                    S )zForward pass through C2f layer.rS   r   c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   zC2f.forward.<locals>.<genexpr>7  r   rN   )listrl   r   r   r   ro   rC   r   r   s     @rM   rZ   zC2f.forward4  so    !""1a(())	****46******xx	!Q(((rN   c                *   |                      |                              | j        | j        fd          d         d         g                    fd| j        D                        |                     t          j        d                    S ).Forward pass using split() instead of chunk().r   r   c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z$C2f.forward_split.<locals>.<genexpr>>  r   rN   )rl   splitr   r   r   ro   rC   r   r   s     @rM   forward_splitzC2f.forward_split:  s    HHQKKtvtv.22qT1Q4L	****46******xx	!Q(((rN   r   Fr   r   r   r[   r]   r^   r_   r`   r>   rZ   r   ra   rb   s   @rM   r   r      sw        FFu u u u u u u") ) ) )) ) ) ) ) ) ) )rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z#CSP Bottleneck with 3 convolutions.r   Tr   r6   r7   rg   r   r   r   r   r   rE   c                \   t                                                       t          ||z            t          |dd          | _        t          |dd          | _        t          dz  |d          | _        t          j        fdt          |          D              | _
        dS )aa  Initialize the CSP Bottleneck with 3 convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   rS   c           	   3  B   K   | ]}t          d d          V  dS )))r   r   r   r   r   Nr   r   rX   rf   r   r   s     rM   r   zC3.__init__.<locals>.<genexpr>U  s;       n n`aBHaCSWZ![![![ n n n n n nrN   N)r=   r>   r7   r   rl   ro   rp   r?   r   r   r   	rJ   r6   rg   r   r   r   r   rf   rL   s	       `` @rM   r>   zC3.__init__E  s     	a[[B1%%B1%%BA&& n n n n n nejklemem n n norN   rK   rO   rP   c           	         |                      t          j        |                     |                     |                    |                     |          fd                    S )z<Forward pass through the CSP bottleneck with 3 convolutions.r   )rp   rC   r   r   rl   ro   rr   s     rM   rZ   z
C3.forwardW  sE    xx	466$((1++#6#6"DaHHIIIrN   r   r   r[   r\   rb   s   @rM   r   r   B  sk        --p p p p p p p$J J J J J J J JrN   r   c                  &     e Zd ZdZdd fdZ xZS )r%   z"C3 module with cross-convolutions.r   Tr   r6   r7   rg   r   r   r   r   r   rE   c                     t                                          ||||           t          ||z             _        t	          j         fdt          |          D               _        dS )a\  Initialize C3 module with cross-convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c           	   3  V   K   | ]#}t          j        j        d d          V  $dS )))r   ri   ri   r   r   r   N)r   rf   r   s     rM   r   zC3x.__init__.<locals>.<genexpr>l  s?       v vhiDGTWhM]ab!c!c!c v v v v v vrN   N)r=   r>   r7   rf   r?   r   r   r   r   s   `   `` rM   r>   zC3x.__init___  sr     	RHa333b1f++ v v v v v vmrstmumu v v vwrN   r   r   r]   r^   r_   r`   r>   ra   rb   s   @rM   r%   r%   \  sS        ,,x x x x x x x x x x xrN   r%   c                  .     e Zd ZdZdd fd
ZddZ xZS )r.   zRep C3.ri   r   r6   r7   rg   r   r   rE   c                   t                                                       t          ||z            t          |dd          | _        t          |dd          | _        t          j        fdt          |          D              | _	        |k    rt          |dd          nt          j
                    | _        dS )zInitialize RepC3 module with RepConv blocks.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepConv blocks.
            e (float): Expansion ratio.
        r   c                0    g | ]}t                    S r   )r
   r   rX   rf   s     rM   r   z"RepC3.__init__.<locals>.<listcomp>  s!     C C CQR C C CrN   N)r=   r>   r7   r   rl   ro   r?   r   r   r   Identityrp   )rJ   r6   rg   r   r   rf   rL   s        @rM   r>   zRepC3.__init__r  s     	a[[B1%%B1%% C C C C%(( C C CD)+r4B1%%%r{}}rN   rK   rO   rP   c                    |                      |                     |                     |                    |                     |          z             S )zForward pass of RepC3 module.)rp   r   rl   ro   rr   s     rM   rZ   zRepC3.forward  s9    xxtxx{{++dhhqkk9:::rN   )ri   r   r6   r7   rg   r7   r   r7   r   rE   r[   r\   rb   s   @rM   r.   r.   o  sb        ME E E E E E E ; ; ; ; ; ; ; ;rN   r.   c                  &     e Zd ZdZdd fdZ xZS )r   z"C3 module with TransformerBlock().r   Tr   r6   r7   rg   r   r   r   r   r   rE   c                    t                                          ||||||           t          ||z            }t          ||d|          | _        dS )a[  Initialize C3 module with TransformerBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Transformer blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rR   N)r=   r>   r7   r   r   r   s	           rM   r>   zC3TR.__init__  sO     	RHa333a[[!"b!Q//rN   r   r   r   rb   s   @rM   r   r     sH        ,,0 0 0 0 0 0 0 0 0 0 0rN   r   c                  &     e Zd ZdZdd fdZ xZS )r#   z!C3 module with GhostBottleneck().r   Tr   r6   r7   rg   r   r   r   r   r   rE   c                    t                                          ||||||           t          ||z            t          j        fdt          |          D              | _        dS )a_  Initialize C3 module with GhostBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Ghost bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3  8   K   | ]}t                    V  d S )N)r)   r   s     rM   r   z#C3Ghost.__init__.<locals>.<genexpr>  s-       K KQR!8!8 K K K K K KrN   Nr=   r>   r7   r?   r   r   r   r   s	          @rM   r>   zC3Ghost.__init__  sd     	RHa333a[[ K K K K%(( K K KLrN   r   r   r   rb   s   @rM   r#   r#     sS        ++M M M M M M M M M M MrN   r#   c                  .     e Zd ZdZdd fd	ZddZ xZS )r)   zGGhost Bottleneck https://github.com/huawei-noah/Efficient-AI-Backbones.ri   r   r6   r7   rg   rk   sc                   t                                                       |dz  }t          j        t	          ||dd          |dk    rt          ||||d          nt          j                    t	          ||ddd                    | _        |dk    r9t          j        t          ||||d          t          ||ddd                    nt          j                    | _	        dS )zInitialize Ghost Bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        rS   r   Frv   N)
r=   r>   r?   r   r   r   r   rB   r   r   )rJ   r6   rg   rk   r  rf   rL   s         rM   r>   zGhostBottleneck.__init__  s     	1WMb"a##/0AvvF2r1aU++++2;==b"a...
 
	 ^_bc]c]cBM&RA59994B1RW;X;X;XYYYikitiviv 	rN   rK   rO   rP   c                X    |                      |          |                     |          z   S )z3Apply skip connection and addition to input tensor.)rB   r   rr   s     rM   rZ   zGhostBottleneck.forward  s#    yy||dmmA....rN   r   r6   r7   rg   r7   rk   r7   r  r7   r[   r\   rb   s   @rM   r)   r)     s\        QQ
 
 
 
 
 
 
(/ / / / / / / /rN   r)   c                  0     e Zd ZdZ	 dd fdZddZ xZS )r   zStandard bottleneck.Tr   r   r   r6   r7   rg   r   r   r   rk   tuple[int, int]r   rE   c                    t                                                       t          ||z            }t          |||d         d          | _        t          |||d         d|          | _        |o||k    | _        dS )aZ  Initialize a standard bottleneck module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (tuple): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        r   r   r   N)r=   r>   r7   r   rl   ro   r   	rJ   r6   rg   r   r   rk   r   rf   rL   s	           rM   r>   zBottleneck.__init__  su     	a[[B!a((B!a1---(brN   rK   rO   rP   c                    | j         r+||                     |                     |                    z   n'|                     |                     |                    S )z3Apply bottleneck with optional shortcut connection.)r   ro   rl   rr   s     rM   rZ   zBottleneck.forward  sE    ,0HOq488DHHQKK(((($((488A;;:O:OOrN   Tr   r   r   r6   r7   rg   r7   r   r   r   r7   rk   r  r   rE   r[   r\   rb   s   @rM   r   r     sk         lo) ) ) ) ) ) )&P P P P P P P PrN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   zGCSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.r   Tr   r6   r7   rg   r   r   r   r   r   rE   c                   t                                                       t          ||z            t          |dd          | _        t          j        |ddd          | _        t          j        ddd          | _        t          dz  |dd          | _	        t          j
        dz            | _        t          j                    | _        t          j        fdt          |          D              | _        dS )aI  Initialize CSP Bottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   Fr9   rS   c              3  @   K   | ]}t          d           V  dS r   r   Nr   r   s     rM   r   z)BottleneckCSP.__init__.<locals>.<genexpr>  s6       Z ZABHa3!G!G!G Z Z Z Z Z ZrN   N)r=   r>   r7   r   rl   r?   r@   ro   rp   cv4BatchNorm2dbnSiLUrw   r   r   r   r   s	       `` @rM   r>   zBottleneckCSP.__init__  s     	a[[B1%%9RQ6669RQ666BAq)).R((799 Z Z Z Z Z ZQVWXQYQY Z Z Z[rN   rK   rO   rP   c           
     B   |                      |                     |                     |                              }|                     |          }|                     |                     |                     t          j        ||fd                                        S )z)Apply CSP bottleneck with 4 convolutions.r   )	rp   r   rl   ro   r  rw   r  rC   r   )rJ   rK   y1y2s       rM   rZ   zBottleneckCSP.forward  sr    XXdffTXXa[[))**XXa[[xxB8Q)?)?!@!@AABBBrN   r   r   r[   r\   rb   s   @rM   r   r     sk        QQ\ \ \ \ \ \ \*C C C C C C C CrN   r   c                  .     e Zd ZdZdd fd	ZddZ xZS )ResNetBlockz.ResNet block with standard convolution layers.r   rR   r6   r7   rg   r  r   c           	     |   t                                                       ||z  }t          ||ddd          | _        t          ||d|dd          | _        t          ||dd          | _        |dk    s||k    r&t          j        t          ||d|d                    nt          j                    | _	        dS )	zInitialize ResNet block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            e (int): Expansion ratio.
        r   Trk   r  rw   ri   rk   r  prw   Fr   N)
r=   r>   r   rl   ro   rp   r?   r   r   r   )rJ   r6   rg   r  r   c3rL   s         rM   r>   zResNetBlock.__init__  s     	VB!qd333B!qA4888B!///LMQRFFVX\^V^V^d2rQ!&G&G&GHHHdfdodqdqrN   rK   rO   rP   c           	         t          j        |                     |                     |                     |                              |                     |          z             S )z&Forward pass through the ResNet block.)r   relurp   ro   rl   r   rr   s     rM   rZ   zResNetBlock.forward  sE    vdhhtxx4455a8H8HHIIIrN   )r   rR   )r6   r7   rg   r7   r  r7   r   r7   r[   r\   rb   s   @rM   r  r    sk        88r r r r r r r J J J J J J J JrN   r  c                  .     e Zd ZdZdd fdZddZ xZS )r1   z)ResNet layer with multiple ResNet blocks.r   FrR   r6   r7   rg   r  is_firstr   r   r   c           
        t                                                       || _        | j        rDt          j        t          |dddd          t          j        ddd                    | _        d
S t          ||          g}|	                    fd	t          |dz
            D                        t          j        | | _        d
S )a,  Initialize ResNet layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            s (int): Stride.
            is_first (bool): Whether this is the first layer.
            n (int): Number of ResNet blocks.
            e (int): Expansion ratio.
           rS   ri   Tr  r   r   r  c                <    g | ]}t          z  d           S )r   r  )r  )r   rX   rg   r   s     rM   r   z(ResNetLayer.__init__.<locals>.<listcomp>2  s.    QQQq;q2vr1:::QQQrN   N)r=   r>   r$  r?   r   r   r   layerr  r   r   )	rJ   r6   rg   r  r$  r   r   blocksrL   s	     `   ` rM   r>   zResNetLayer.__init__  s     	 = 	0RqA555r|PQZ[ef7g7g7g DJJJ ""b!q1112FMMQQQQQE!a%LLQQQRRR/DJJJrN   rK   rO   rP   c                ,    |                      |          S )z&Forward pass through the ResNet layer.)r(  rr   s     rM   rZ   zResNetLayer.forward5  s    zz!}}rN   )r   Fr   rR   )r6   r7   rg   r7   r  r7   r$  r   r   r7   r   r7   r[   r\   rb   s   @rM   r1   r1     s\        330 0 0 0 0 0 0.       rN   r1   c                  .     e Zd ZdZdd fdZddZ xZS )MaxSigmoidAttnBlockzMax Sigmoid attention block.r         Fr6   r7   rg   nhr   gcscaler   c                   t                                                       || _        ||z  | _        ||k    rt	          ||dd          nd| _        t          j        ||          | _        t          j	        t          j        |                    | _        t	          ||ddd          | _        |r)t          j	        t          j        d|dd                    nd| _        dS )a?  Initialize MaxSigmoidAttnBlock.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            nh (int): Number of heads.
            ec (int): Embedding channels.
            gc (int): Guide channels.
            scale (bool): Whether to use learnable scale parameter.
        r   Fr   Nri   r  r   )r=   r>   r/  hcr   r   r?   LinearglrF   rC   zerosr:   	proj_convonesr1  )rJ   r6   rg   r/  r   r0  r1  rL   s          rM   r>   zMaxSigmoidAttnBlock.__init__=  s     	(24(($r2....)B##LR11	b"QE:::>CLR\%*QAq"9"9:::


rN   rK   rO   guiderP   c                   |j         \  }}}}|                     |          }|                    ||j         d         | j        | j                  }| j        |                     |          n|}|                    || j        | j        ||          }t          j        d||          }|                    d          d         }|| j        dz  z  }|| j	        dddddf         z   }|
                                | j        z  }|                     |          }|                    || j        d||          }||                    d          z  }|                    |d||          S )	zForward pass of MaxSigmoidAttnBlock.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor.

        Returns:
            (torch.Tensor): Output tensor after attention.
        r   Nzbmchw,bnmc->bmhwnr   r   r   r   rS   )rT   r5  rG   r/  r3  r   rC   einsummaxr:   sigmoidr1  r7  	unsqueeze)	rJ   rK   r9  bsrX   hwembedaws	            rM   rZ   zMaxSigmoidAttnBlock.forwardQ  s>    gAq!

2u{1~tw@@"g1


q

2twA66\-ue<<VVV^^A47C< $)D!!!T4/00ZZ\\DJ&NN1FF2twAq))Qvvb"a###rN   )r   r-  r.  F)r6   r7   rg   r7   r/  r7   r   r7   r0  r7   r1  r   rK   rO   r9  rO   rP   rO   r\   rb   s   @rM   r,  r,  :  sc        &&M M M M M M M($ $ $ $ $ $ $ $rN   r,  c                  D     e Zd ZdZ	 	 	 	 	 	 	 dd fdZddZddZ xZS )r    z*C2f module with an additional attn module.r   r-  r.  Fr   r6   r7   rg   r   r   r/  r0  r   r   r   r   rE   c
                    t                                                       t          ||	z             _        t	          |d j        z  dd           _        t	          d|z    j        z  |d           _        t          j         fdt          |          D                        _
        t           j         j        |||           _        dS )a  Initialize C2f module with attention mechanism.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            ec (int): Embedding channels for attention.
            nh (int): Number of heads for attention.
            gc (int): Guide channels for attention.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        rS   r   ri   c           	   3  V   K   | ]#}t          j        j        d d          V  $dS r   r   r   s     rM   r   z#C2fAttn.__init__.<locals>.<genexpr>  r   rN   )r0  r   r/  N)r=   r>   r7   r   r   rl   ro   r?   r   r   r   r,  attn)rJ   r6   rg   r   r   r/  r0  r   r   r   rL   s   `      `` rM   r>   zC2fAttn.__init__q  s    2 	R!VAJ1--Q$&("a00ttttttkpqrksksttttt'2"LLL			rN   rK   rO   r9  rP   c                j   t          |                     |                              dd                                        fd| j        D                                            |                     d         |                     |                     t          j	        d                    S )zForward pass through C2f layer with attention.

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        rS   r   c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z"C2fAttn.forward.<locals>.<genexpr>  r   rN   r   )
r   rl   r   r   r   appendrH  ro   rC   r   rJ   rK   r9  r   s      @rM   rZ   zC2fAttn.forward  s     !""1a(())	****46******	1R5%(()))xx	!Q(((rN   c                   t          |                     |                              | j        | j        fd                                        fd| j        D                                            |                     d         |                     |                     t          j
        d                    S )zForward pass using split() instead of chunk().

        Args:
            x (torch.Tensor): Input tensor.
            guide (torch.Tensor): Guide tensor for attention.

        Returns:
            (torch.Tensor): Output tensor after processing.
        r   c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z(C2fAttn.forward_split.<locals>.<genexpr>  r   rN   r   )r   rl   r   r   r   r   rK  rH  ro   rC   r   rL  s      @rM   r   zC2fAttn.forward_split  s     !""DFDF#3Q7788	****46******	1R5%(()))xx	!Q(((rN   )r   r-  r   r.  Fr   r   )r6   r7   rg   r7   r   r7   r   r7   r/  r7   r0  r7   r   r   r   r7   r   rE   rD  r   rb   s   @rM   r    r    n  s        44 M M M M M M M@) ) ) )) ) ) ) ) ) ) )rN   r    c                  0     e Zd ZdZ	 dd fdZddZ xZS )r,   zKImagePoolingAttn: Enhance the text embeddings with image-aware information.rd   r   r.     ri   Fr   r7   chr   ctr/  rk   r1  r   c                v   t                                                       t          |          }t          j        t          j        |          t          j        |                    | _        t          j        t          j                  t          j                            | _        t          j        t          j                  t          j                            | _	        t          j        |          | _
        |r)t          j        t          j        dg          d          nd| _        t          j        fd|D                       | _        t          j        fdt#          |          D                       | _        | _        || _        || _        |z  | _        | _        dS )a  Initialize ImagePoolingAttn module.

        Args:
            ec (int): Embedding channels.
            ch (tuple): Channel dimensions for feature maps.
            ct (int): Channel dimension for text embeddings.
            nh (int): Number of attention heads.
            k (int): Kernel size for pooling.
            scale (bool): Whether to use learnable scale parameter.
        g        Trequires_gradr   c                >    g | ]}t          j        |d           S )r   )rx   )r?   r@   )r   in_channelsr   s     rM   r   z-ImagePoolingAttn.__init__.<locals>.<listcomp>  s,    )j)j)jXc")KQR*S*S*S)j)j)jrN   c                <    g | ]}t          j        f          S r   )r?   AdaptiveMaxPool2d)r   rX   rk   s     rM   r   z-ImagePoolingAttn.__init__.<locals>.<listcomp>  s(    &W&W&Wr';QF'C'C&W&W&WrN   N)r=   r>   r   r?   r   	LayerNormr4  querykeyvalueprojrF   rC   tensorr1  r   projectionsr   im_poolsr   r/  nfr3  rk   )	rJ   r   rQ  rR  r/  rk   r1  rb  rL   s	    `   `  rM   r>   zImagePoolingAttn.__init__  sc    	WW]2<#3#3RYr25F5FGG
=b!1!129R3D3DEE]2<#3#3RYr25F5FGG
Ib"%%	NS\R\%,u"5"5TJJJJY\
=)j)j)j)jgi)j)j)jkk&W&W&W&WUSUYY&W&W&WXX(rN   rK   list[torch.Tensor]textrO   rP   c                x   |d         j         d         t          |          | j        k    sJ | j        dz  fdt	          || j        | j                  D             }t          j        |d          	                    dd          }| 
                    |          }|                     |          }|                     |          }|                    d| j        | j                  }|                    d| j        | j                  }|                    d| j        | j                  }t          j        d||          }|| j        dz  z  }t#          j        |d          }t          j        d	||          }|                     |                    d| j                            }|| j        z  |z   S )
zForward pass of ImagePoolingAttn.

        Args:
            x (list[torch.Tensor]): List of input feature maps.
            text (torch.Tensor): Text embeddings.

        Returns:
            (torch.Tensor): Enhanced text embeddings.
        r   rS   c                j    g | ]/\  }}} | ||                                         d           0S )r   )rG   )r   rK   r^  r   r?  num_patchess       rM   r   z,ImagePoolingAttn.forward.<locals>.<listcomp>  sA    ttt!T4TT$$q'']]B44tttrN   r   r   r   zbnmc,bkmc->bmnkr   zbmnk,bkmc->bnmc)rT   r   rb  rk   zipr`  ra  rC   r   rU   r[  r\  r]  reshaper/  r3  r;  r   rV   r^  r   r1  )	rJ   rK   rd  qrk   vrC  r?  rg  s	          @@rM   rZ   zImagePoolingAttn.forward  s    qTZ]1vv    faitttttCPQSWSceierLsLstttIaR   **1a00JJtHHQKKJJqMM IIb"dgtw//IIb"dgtw//IIb"dgtw//\+Q2247C< Yrr"""L*B22IIaiiB00114:~$$rN   )rd   r   r.  rP  ri   F)r   r7   rQ  r   rR  r7   r/  r7   rk   r7   r1  r   )rK   rc  rd  rO   rP   rO   r\   rb   s   @rM   r,   r,     sc        UU ns      :% % % % % % % %rN   r,   c                  *     e Zd ZdZ fdZddZ xZS )	r(   zZImplements contrastive learning head for region-text similarity in vision-language models.c                D   t                                                       t          j        t	          j        dg                    | _        t          j        t	          j        g           t	          j        d                                          z            | _	        dS )zBInitialize ContrastiveHead with region-text similarity parameters.      $g$I$I,@N)
r=   r>   r?   rF   rC   r_  r:   r8  loglogit_scalerJ   rL   s    rM   r>   zContrastiveHead.__init__  sq    Lug!6!677	<
2h9O9O9S9S9U9U(UVVrN   rK   rO   rA  rP   c                    t          j        |dd          }t          j        |dd          }t          j        d||          }|| j                                        z  | j        z   S )zForward function of contrastive learning.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        r   rS   r   r  r   bchw,bkc->bkhw)r   	normalizerC   r;  rp  expr:   rJ   rK   rA  s      rM   rZ   zContrastiveHead.forward  se     KqA&&&KrQ'''L)1a004#'')))DI55rN   rK   rO   rA  rO   rP   rO   r\   rb   s   @rM   r(   r(     sW        ddW W W W W6 6 6 6 6 6 6 6rN   r(   c                  J     e Zd ZdZd fdZd Zedd
            ZddZ xZ	S )r   zBatch Norm Contrastive Head using batch norm instead of l2-normalization.

    Args:
        embed_dims (int): Embed dimensions of text and image features.
    
embed_dimsr7   c                .   t                                                       t          j        |          | _        t          j        t          j        dg                    | _        t          j        dt          j	        g           z            | _
        dS )zvInitialize BNContrastiveHead.

        Args:
            embed_dims (int): Embedding dimensions for features.
        rn  g      N)r=   r>   r?   r  normrF   rC   r_  r:   r8  rp  )rJ   rz  rL   s     rM   r>   zBNContrastiveHead.__init__  sn     	N:..	Lug!6!677	<uz"~~(=>>rN   c                *    | ` | `| `| j        | _        dS )zCFuse the batch normalization layer in the BNContrastiveHead module.N)r|  r:   rp  forward_fuserZ   rJ   s    rM   fusezBNContrastiveHead.fuse  s     II(rN   rK   rO   rA  rP   c                    | S )z5Passes image features through unchanged after fusing.r   )rK   rA  s     rM   r~  zBNContrastiveHead.forward_fuse&  s	     rN   c                    |                      |          }t          j        |dd          }t          j        d||          }|| j                                        z  | j        z   S )zForward function of contrastive learning with batch normalization.

        Args:
            x (torch.Tensor): Image features.
            w (torch.Tensor): Text features.

        Returns:
            (torch.Tensor): Similarity scores.
        r   rS   rs  rt  )r|  r   ru  rC   r;  rp  rv  r:   rw  s      rM   rZ   zBNContrastiveHead.forward+  s^     IIaLLKrQ'''L)1a004#'')))DI55rN   )rz  r7   rx  )
r]   r^   r_   r`   r>   r  staticmethodr~  rZ   ra   rb   s   @rM   r   r     s         ? ? ? ? ? ?) ) )    \6 6 6 6 6 6 6 6rN   r   c                  (     e Zd ZdZ	 dd fdZ xZS )RepBottleneckzRep bottleneck.Tr   r   r   r6   r7   rg   r   r   r   rk   r  r   rE   c                    t                                          ||||||           t          ||z            }t          |||d         d          | _        dS )aK  Initialize RepBottleneck.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            g (int): Groups for convolutions.
            k (tuple): Kernel sizes for convolutions.
            e (float): Expansion ratio.
        r   r   N)r=   r>   r7   r
   rl   r
  s	           rM   r>   zRepBottleneck.__init__?  sS     	R1a333a[[2r1Q4++rN   r  r  r   rb   s   @rM   r  r  <  sO         lo, , , , , , , , , , ,rN   r  c                  &     e Zd ZdZdd fdZ xZS )RepCSPzXRepeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction.r   Tr   r6   r7   rg   r   r   r   r   r   rE   c                    t                                          ||||           t          ||z            t          j        fdt          |          D              | _        dS )aJ  Initialize RepCSP layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of RepBottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3  @   K   | ]}t          d           V  dS r  )r  r   s     rM   r   z"RepCSP.__init__.<locals>.<genexpr>a  s6       ] ]qr2xc!J!J!J ] ] ] ] ] ]rN   Nr   r   s	       `` @rM   r>   zRepCSP.__init__T  sp     	RHa333a[[ ] ] ] ] ] ]TYZ[T\T\ ] ] ]^rN   r   r   r   rb   s   @rM   r  r  Q  sS        bb_ _ _ _ _ _ _ _ _ _ _rN   r  c                  6     e Zd ZdZdd fd	ZddZddZ xZS )r/   z	CSP-ELAN.r   r6   r7   rg   r   c4r   c           	        t                                                       |dz  | _        t          ||dd          | _        t          j        t          |dz  ||          t          ||dd                    | _        t          j        t          |||          t          ||dd                    | _	        t          |d|z  z   |dd          | _
        dS )a  Initialize CSP-ELAN layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for RepCSP.
            n (int): Number of RepCSP blocks.
        rS   r   ri   N)r=   r>   r   r   rl   r?   r   r  ro   rp   r  )rJ   r6   rg   r   r  r   rL   s         rM   r>   zRepNCSPELAN4.__init__g  s     	qB1%%=aQ!7!7b"a9K9KLL=B!2!2DRA4F4FGGa"fr1a00rN   rK   rO   rP   c                   t          |                     |                              dd                                        fd| j        | j        fD                        |                     t          j        d                    S )z(Forward pass through RepNCSPELAN4 layer.rS   r   c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z'RepNCSPELAN4.forward.<locals>.<genexpr>{  s/      ::!!AbE((::::::rN   )	r   rl   r   r   ro   rp   r  rC   r   r   s     @rM   rZ   zRepNCSPELAN4.forwardx  sv    !""1a(())	::::dh%9::::::xx	!Q(((rN   c                2   t          |                     |                              | j        | j        fd                                        fd| j        | j        fD                        |                     t          j	        d                    S )r   r   c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z-RepNCSPELAN4.forward_split.<locals>.<genexpr>  s/      88a1R5888888rN   )
r   rl   r   r   r   ro   rp   r  rC   r   r   s     @rM   r   zRepNCSPELAN4.forward_split~  s    !""DFDF#3Q7788	8888DHdh#7888888xx	!Q(((rN   r   )
r6   r7   rg   r7   r   r7   r  r7   r   r7   r[   r   rb   s   @rM   r/   r/   d  so        O1 1 1 1 1 1 1") ) ) )) ) ) ) ) ) ) )rN   r/   c                  $     e Zd ZdZd fdZ xZS )	r   z!ELAN1 module with 4 convolutions.r6   r7   rg   r   r  c                .   t                                          ||||           |dz  | _        t          ||dd          | _        t          |dz  |dd          | _        t          ||dd          | _        t          |d|z  z   |dd          | _        dS )zInitialize ELAN1 layer.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            c4 (int): Intermediate channels for convolutions.
        rS   r   ri   N)r=   r>   r   r   rl   ro   rp   r  )rJ   r6   rg   r   r  rL   s        rM   r>   zELAN1.__init__  s     	RR(((qB1%%aQ**B1%%a"fr1a00rN   )r6   r7   rg   r7   r   r7   r  r7   r   rb   s   @rM   r   r     sC        ++1 1 1 1 1 1 1 1 1 1rN   r   c                  ,     e Zd ZdZd
 fdZdd	Z xZS )r   zAConv.r6   r7   rg   c                x    t                                                       t          ||ddd          | _        dS )z}Initialize AConv module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        ri   rS   r   N)r=   r>   r   rl   rJ   r6   rg   rL   s      rM   r>   zAConv.__init__  s6     	B1a((rN   rK   rO   rP   c                ~    t           j        j                            |ddddd          }|                     |          S )z!Forward pass through AConv layer.rS   r   r   FT)rC   r?   
functional
avg_pool2drl   rr   s     rM   rZ   zAConv.forward  s4    H**1aAudCCxx{{rN   r6   r7   rg   r7   r[   r\   rb   s   @rM   r   r     sV        L) ) ) ) ) )       rN   r   c                  ,     e Zd ZdZd
 fdZdd	Z xZS )r   zADown.r6   r7   rg   c                    t                                                       |dz  | _        t          |dz  | j        ddd          | _        t          |dz  | j        ddd          | _        dS )z}Initialize ADown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
        rS   ri   r   r   N)r=   r>   r   r   rl   ro   r  s      rM   r>   zADown.__init__  sd     	qaAq11aAq11rN   rK   rO   rP   c                V   t           j        j                            |ddddd          }|                    dd          \  }}|                     |          }t           j        j                            |ddd          }|                     |          }t          j        ||fd          S )z!Forward pass through ADown layer.rS   r   r   FTri   )	rC   r?   r  r  r   rl   
max_pool2dro   r   )rJ   rK   r   r   s       rM   rZ   zADown.forward  s    H**1aAudCCABXXb\\X ++B1a88XXb\\y"b1%%%rN   r  r[   r\   rb   s   @rM   r   r     sV        L
2 
2 
2 
2 
2 
2& & & & & & & &rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r   z	SPP-ELAN.r   r6   r7   rg   r   rk   c                r   t                                                       || _        t          ||dd          | _        t          j        |d|dz            | _        t          j        |d|dz            | _        t          j        |d|dz            | _	        t          d|z  |dd          | _
        dS )zInitialize SPP-ELAN block.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            c3 (int): Intermediate channels.
            k (int): Kernel size for max pooling.
        r   rS   r   rR   N)r=   r>   r   r   rl   r?   r   ro   rp   r  cv5)rJ   r6   rg   r   rk   rL   s        rM   r>   zSPPELAN.__init__  s     	B1%%<AaaHHH<AaaHHH<AaaHHHBAq))rN   rK   rO   rP   c                    |                      |          g                    fd| j        | j        | j        fD                        |                     t          j        d                    S )z#Forward pass through SPPELAN layer.c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z"SPPELAN.forward.<locals>.<genexpr>  s/      BBa1R5BBBBBBrN   r   )rl   r   ro   rp   r  r  rC   r   r   s     @rM   rZ   zSPPELAN.forward  sd    XXa[[M	BBBBDHdh#ABBBBBBxx	!Q(((rN   )r   )r6   r7   rg   r7   r   r7   rk   r7   r[   r\   rb   s   @rM   r   r     s[        O* * * * * * *") ) ) ) ) ) ) )rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r'   z	CBLinear.r   Nr6   r7   c2s	list[int]rk   r  r  
int | Noner   c           
         t                                                       || _        t          j        |t          |          ||t          ||          |d          | _        dS )a  Initialize CBLinear module.

        Args:
            c1 (int): Input channels.
            c2s (list[int]): List of output channel sizes.
            k (int): Kernel size.
            s (int): Stride.
            p (int | None): Padding.
            g (int): Groups.
        T)groupsr:   N)r=   r>   r  r?   r@   sumr   rB   )rJ   r6   r  rk   r  r  r   rL   s          rM   r>   zCBLinear.__init__  sU     	Ib#c((Aq'!Q--PTUUU			rN   rK   rO   rP   rc  c                `    |                      |                              | j        d          S )z$Forward pass through CBLinear layer.r   r   )rB   r   r  rr   s     rM   rZ   zCBLinear.forward  s'    yy||!!$(!222rN   )r   r   Nr   )r6   r7   r  r  rk   r7   r  r7   r  r  r   r7   )rK   rO   rP   rc  r\   rb   s   @rM   r'   r'     sb        OV V V V V V V3 3 3 3 3 3 3 3rN   r'   c                  ,     e Zd ZdZd
 fdZdd	Z xZS )r&   zCBFuse.idxr  c                V    t                                                       || _        dS )zmInitialize CBFuse module.

        Args:
            idx (list[int]): Indices for feature selection.
        N)r=   r>   r  )rJ   r  rL   s     rM   r>   zCBFuse.__init__  s&     	rN   xsrc  rP   rO   c                     |d         j         dd          fdt          |dd                   D             }t          j        t          j        ||dd         z             d          S )zForward pass through CBFuse layer.

        Args:
            xs (list[torch.Tensor]): List of input tensors.

        Returns:
            (torch.Tensor): Fused output tensor.
        r   rS   Nc                f    g | ]-\  }}t          j        |j        |                  d           .S )nearestsizemode)r   interpolater  )r   r   rK   rJ   target_sizes      rM   r   z"CBFuse.forward.<locals>.<listcomp>  s;    nnnSWSTVWq}Qtx{^+INNNnnnrN   r   r   )rT   	enumeraterC   r  stack)rJ   r  resr  s   `  @rM   rZ   zCBFuse.forward  sw     fl122&nnnnn[deghkikhkel[m[mnnnyS2bcc7]33;;;;rN   )r  r  )r  rc  rP   rO   r\   rb   s   @rM   r&   r&     sV        M     < < < < < < < <rN   r&   c                  .     e Zd ZdZdd fdZddZ xZS )C3fz<Faster Implementation of CSP Bottleneck with 3 convolutions.r   Fr   r6   r7   rg   r   r   r   r   r   rE   c                n   t                                                       t          ||z            t          |dd          | _        t          |dd          | _        t          d|z   z  |d          | _        t          j        fdt          |          D                       | _
        dS )ag  Initialize CSP bottleneck layer with three convolutions.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        r   rS   c           	   3  B   K   | ]}t          d d          V  dS r   r   r   s     rM   r   zC3f.__init__.<locals>.<genexpr>$  s;      ll^_z"b(AAQUXYYYllllllrN   N)r=   r>   r7   r   rl   ro   rp   r?   r   r   r   r   s	       `` @rM   r>   zC3f.__init__  s     	a[[B1%%B1%%Q"b!,,llllllchijckcklllllrN   rK   rO   rP   c                    |                      |          |                     |          g                    fd| j        D                        |                     t          j        d                    S )zForward pass through C3f layer.c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   zC3f.forward.<locals>.<genexpr>)  r   rN   r   )ro   rl   r   r   rp   rC   r   r   s     @rM   rZ   zC3f.forward&  sf    XXa[[$((1++&	****46******xx	!Q(((rN   r   r   r[   r\   rb   s   @rM   r  r    sc        FFm m m m m m m$) ) ) ) ) ) ) )rN   r  c                  2     e Zd ZdZ	 	 	 	 	 	 dd fdZ xZS )r$   r   r   Fr   Tr6   r7   rg   r   c3kr   r   rE   rH  r   r   c	                     t                                          ||||           t          j         fdt	          |          D                        _        dS )a  Initialize C3k2 module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of blocks.
            c3k (bool): Whether to use C3k blocks.
            e (float): Expansion ratio.
            attn (bool): Whether to use attention blocks.
            g (int): Groups for convolutions.
            shortcut (bool): Whether to use shortcut connections.
        c              3  F  K   | ]}rZt          j        t          j        j                  t	          j        d t          j        dz  d                              n:rt          j        j        d          nt          j        j                  V  dS )r   @   r   
attn_ratio	num_headsrS   N)r?   r   r   r   PSABlockr<  C3k)r   rX   rH  r  r   rJ   r   s     rM   r   z C3k2.__init__.<locals>.<genexpr>H  s       

 

 	 	9BM46468Q77C3tv|Q;O;OPPP   9TVTVQ!444DFDFHa88

 

 

 

 

 

rN   Nr=   r>   r?   r   r   r   )
rJ   r6   rg   r   r  r   rH  r   r   rL   s
   `   ` ```rM   r>   zC3k2.__init__0  s    . 	RHa333 

 

 

 

 

 

 

 

 1XX

 

 

 

 

rN   )r   Fr   Fr   T)r6   r7   rg   r7   r   r7   r  r   r   rE   rH  r   r   r7   r   r   r   rb   s   @rM   r$   r$   -  s\        FF "
 "
 "
 "
 "
 "
 "
 "
 "
 "
 "
rN   r$   c                  &     e Zd ZdZdd fdZ xZS )r  zhC3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks.r   Tr   ri   r6   r7   rg   r   r   r   r   r   rE   rk   c                    t                                          ||||           t          ||z            t          j        fdt          |          D              | _        dS )ag  Initialize C3k module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of Bottleneck blocks.
            shortcut (bool): Whether to use shortcut connections.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
            k (int): Kernel size.
        c           	   3  F   K   | ]}t          fd           V  dS )r   r   Nr   )r   rX   rf   r   rk   r   s     rM   r   zC3k.__init__.<locals>.<genexpr>g  s=       d dVWBHaAq6S!Q!Q!Q d d d d d drN   Nr   )
rJ   r6   rg   r   r   r   r   rk   rf   rL   s
       `` `@rM   r>   zC3k.__init__X  st     	RHa333a[[ d d d d d d d[`ab[c[c d d derN   )r   Tr   r   ri   )r6   r7   rg   r7   r   r7   r   r   r   r7   r   rE   rk   r7   r   rb   s   @rM   r  r  U  sS        rrf f f f f f f f f f frN   r  c                  d     e Zd ZdZd fdZdd	Zdd
Z ej                    d             Z	 xZ
S )r0   z\RepVGGDW is a class that represents a depth-wise convolutional block in RepVGG architecture.edr7   rP   Nonec           	         t                                                       t          ||ddd|d          | _        t          ||ddd|d          | _        || _        t          j                    | _        dS )zdInitialize RepVGGDW module.

        Args:
            ed (int): Input and output channels.
        r&  r   ri   Fr   rw   N)	r=   r>   r   rB   conv1r   r?   r  rw   )rJ   r  rL   s     rM   r>   zRepVGGDW.__init__m  so     	RAqBE:::	"b!QRU;;;
799rN   rK   rO   c                ~    |                      |                     |          |                     |          z             S )zPerform a forward pass of the RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth-wise convolution.
        )rw   rB   r  rr   s     rM   rZ   zRepVGGDW.forwardy  s/     xx		!tzz!}}4555rN   c                R    |                      |                     |                    S )zPerform a forward pass of the fused RepVGGDW block.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after applying the depth-wise convolution.
        )rw   rB   rr   s     rM   r~  zRepVGGDW.forward_fuse  s      xx		!%%%rN   c                   t          | d          sdS t          | j        j        | j        j                  }t          | j        j        | j        j                  }|j        }|j        }|j        }|j        }t          j        j	        
                    |g d          }||z   }||z   }|j        j                            |           |j        j                            |           || _        | `dS )zFuse the convolutional layers in the RepVGGDW block.

        This method fuses the convolutional layers and updates the weights and biases accordingly.
        r  N)rS   rS   rS   rS   )hasattrr   rB   r  r  rH   r:   rC   r?   r  r   rI   copy_)	rJ   rB   r  conv_wconv_bconv1_wconv1_bfinal_conv_wfinal_conv_bs	            rM   r  zRepVGGDW.fuse  s     tW%% 	F		== $*-@@,*(%))'<<<@@''|,,,	\***	JJJrN   )r  r7   rP   r  r[   )r]   r^   r_   r`   r>   rZ   r~  rC   no_gradr  ra   rb   s   @rM   r0   r0   j  s        ff
 
 
 
 
 
	6 	6 	6 	6	& 	& 	& 	& U]__  _    rN   r0   c                  .     e Zd ZdZdd fdZddZ xZS )r   a  Compact Inverted Block (CIB) module.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        shortcut (bool, optional): Whether to add a shortcut connection. Defaults to True.
        e (float, optional): Scaling factor for the hidden channels. Defaults to 0.5.
        lk (bool, optional): Whether to use RepVGGDW for the third convolutional layer. Defaults to False.
    Tr   Fr6   r7   rg   r   r   r   rE   lkc                   t                                                       t          ||z            }t          j        t          ||d|          t          |d|z  d          |rt          d|z            nt          d|z  d|z  dd|z            t          d|z  |d          t          ||d|                    | _        |o||k    | _        dS )a  Initialize the CIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            shortcut (bool): Whether to use shortcut connection.
            e (float): Expansion ratio.
            lk (bool): Whether to use RepVGGDW.
        ri   r	  rS   r   N)	r=   r>   r7   r?   r   r   r0   rl   r   )rJ   r6   rg   r   r   r  rf   rL   s          rM   r>   zCIB.__init__  s     	a[[=Rb!!!QVQ "IHQVQVQVQ!b&(I(I(IRQRb!!!
 
 (brN   rK   rO   rP   c                j    | j         r||                     |          z   n|                     |          S )zForward pass of the CIB module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor.
        )r   rl   rr   s     rM   rZ   zCIB.forward  s-     #'(;q488A;;;rN   )Tr   F)
r6   r7   rg   r7   r   r   r   rE   r  r   r[   r\   rb   s   @rM   r   r     s`         ) ) ) ) ) ) ),	< 	< 	< 	< 	< 	< 	< 	<rN   r   c                  (     e Zd ZdZ	 dd fdZ xZS )r!   aD  C2fCIB class represents a convolutional block with C2f and CIB modules.

    Args:
        c1 (int): Number of input channels.
        c2 (int): Number of output channels.
        n (int, optional): Number of CIB modules to stack. Defaults to 1.
        shortcut (bool, optional): Whether to use shortcut connection. Defaults to False.
        lk (bool, optional): Whether to use large kernel. Defaults to False.
        g (int, optional): Number of groups for grouped convolution. Defaults to 1.
        e (float, optional): Expansion ratio for CIB modules. Defaults to 0.5.
    r   Fr   r6   r7   rg   r   r   r   r  r   r   rE   c                     t                                          |||||           t          j         fdt	          |          D                        _        dS )au  Initialize C2fCIB module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of CIB modules.
            shortcut (bool): Whether to use shortcut connection.
            lk (bool): Whether to use large kernel.
            g (int): Groups for convolutions.
            e (float): Expansion ratio.
        c              3  T   K   | ]"}t          j        j        d           V  #dS )r   )r   r  N)r   r   )r   rX   r  rJ   r   s     rM   r   z"C2fCIB.__init__.<locals>.<genexpr>  s:      ]]qs46468srJJJ]]]]]]rN   Nr  )	rJ   r6   rg   r   r   r  r   r   rL   s	   `   ``  rM   r>   zC2fCIB.__init__  sd     	RHa333]]]]]]TYZ[T\T\]]]]]rN   )r   FFr   r   )r6   r7   rg   r7   r   r7   r   r   r  r   r   r7   r   rE   r   rb   s   @rM   r!   r!     s^        
 
 nq^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^rN   r!   c                  .     e Zd ZdZdd fd	ZddZ xZS )r   a  Attention module that performs self-attention on the input tensor.

    Args:
        dim (int): The input tensor dimension.
        num_heads (int): The number of attention heads.
        attn_ratio (float): The ratio of the attention key dimension to the head dimension.

    Attributes:
        num_heads (int): The number of attention heads.
        head_dim (int): The dimension of each attention head.
        key_dim (int): The dimension of the attention key.
        scale (float): The scaling factor for the attention scores.
        qkv (Conv): Convolutional layer for computing the query, key, and value.
        proj (Conv): Convolutional layer for projecting the attended values.
        pe (Conv): Convolutional layer for positional encoding.
    rP  r   r   r7   r  r  rE   c                x   t                                                       || _        ||z  | _        t	          | j        |z            | _        | j        dz  | _        | j        |z  }||dz  z   }t          ||dd          | _        t          ||dd          | _	        t          ||dd|d          | _
        dS )	zInitialize multi-head attention module.

        Args:
            dim (int): Input dimension.
            num_heads (int): Number of attention heads.
            attn_ratio (float): Attention ratio for key dimension.
              rS   r   Frv   ri   r  N)r=   r>   r  head_dimr7   key_dimr1  r   qkvr^  pe)rJ   r   r  r  nh_kdr@  rL   s         rM   r>   zAttention.__init__	  s     	"y(4=:566\4'
y(%!)OQu---c1%000	sCA%888rN   rK   rO   rP   c           	     P   |j         \  }}}}||z  }|                     |          }|                    || j        | j        dz  | j        z   |                              | j        | j        | j        gd          \  }}	}
|                    dd          |	z  | j        z  }|	                    d          }|
|                    dd          z                      ||||          | 
                    |
                    ||||                    z   }|                     |          }|S )zForward pass of the Attention module.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            (torch.Tensor): The output tensor after self-attention.
        rS   r   r   )rT   r  rG   r  r  r  r   rU   r1  rV   r  ri  r^  )rJ   rK   BCHWNr  rj  rk   rk  rH  s               rM   rZ   zAttention.forward  s    W
1aEhhqkk((1dndlQ.>.NPQRRXX\4<7Q Y 
 
1a B##a'4:5|||##B'''--aAq99DGGAIIaQRTUWXDYDY<Z<ZZIIaLLrN   )rP  r   )r   r7   r  r7   r  rE   r[   r\   rb   s   @rM   r   r     s`         "9 9 9 9 9 9 9&       rN   r   c                  .     e Zd ZdZdd fdZddZ xZS )r  aF  PSABlock class implementing a Position-Sensitive Attention block for neural networks.

    This class encapsulates the functionality for applying multi-head attention and feed-forward neural network layers
    with optional shortcut connections.

    Attributes:
        attn (Attention): Multi-head attention module.
        ffn (nn.Sequential): Feed-forward neural network module.
        add (bool): Flag indicating whether to add shortcut connections.

    Methods:
        forward: Performs a forward pass through the PSABlock, applying attention and feed-forward layers.

    Examples:
        Create a PSABlock and perform a forward pass
        >>> psablock = PSABlock(c=128, attn_ratio=0.5, num_heads=4, shortcut=True)
        >>> input_tensor = torch.randn(1, 128, 32, 32)
        >>> output_tensor = psablock(input_tensor)
    r   rR   Tr   r7   r  rE   r  r   r   rP   r  c           	        t                                                       t          |||          | _        t	          j        t          ||dz  d          t          |dz  |dd                    | _        || _        dS )a  Initialize the PSABlock.

        Args:
            c (int): Input and output channels.
            attn_ratio (float): Attention ratio for key dimension.
            num_heads (int): Number of attention heads.
            shortcut (bool): Whether to use shortcut connections.
        r  rS   r   Frv   N)	r=   r>   r   rH  r?   r   r   ffnr   )rJ   r   r  r  r   rL   s        rM   r>   zPSABlock.__init__H  st     	aJ)LLL	=aQ!2!2DQ1%4P4P4PQQrN   rK   rO   c                    | j         r||                     |          z   n|                     |          }| j         r||                     |          z   n|                     |          }|S )zExecute a forward pass through PSABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        )r   rH  r  rr   s     rM   rZ   zPSABlock.forwardW  s[     !%:A		!diill#x8AOOTXXa[[rN   )r   rR   T)
r   r7   r  rE   r  r7   r   r   rP   r  r[   r\   rb   s   @rM   r  r  3  s`         (             rN   r  c                  .     e Zd ZdZdd fdZddZ xZS )r   a  PSA class for implementing Position-Sensitive Attention in neural networks.

    This class encapsulates the functionality for applying position-sensitive attention and feed-forward networks to
    input tensors, enhancing feature extraction and processing capabilities.

    Attributes:
        c (int): Number of hidden channels after applying the initial convolution.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c1.
        attn (Attention): Attention module for position-sensitive attention.
        ffn (nn.Sequential): Feed-forward network for further processing.

    Methods:
        forward: Applies position-sensitive attention and feed-forward network to the input tensor.

    Examples:
        Create a PSA module and apply it to an input tensor
        >>> psa = PSA(c1=128, c2=128, e=0.5)
        >>> input_tensor = torch.randn(1, 128, 64, 64)
        >>> output_tensor = psa.forward(input_tensor)
    r   r6   r7   rg   r   rE   c           	        t                                                       ||k    sJ t          ||z            | _        t	          |d| j        z  dd          | _        t	          d| j        z  |d          | _        t          | j        dt          | j        dz  d                    | _	        t          j        t	          | j        | j        dz  d          t	          | j        dz  | j        dd                    | _        dS )	zInitialize PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            e (float): Expansion ratio.
        rS   r   r   r  r  Frv   N)r=   r>   r7   r   r   rl   ro   r   r<  rH  r?   r   r  )rJ   r6   rg   r   rL   s       rM   r>   zPSA.__init__|  s     	RxxxxR!VAJ1--DF
B**dfDFbLRS@T@TUUU	=dfdfqj!!<!<d46A:tvWX^c>d>d>deerN   rK   rO   rP   c                (   |                      |                              | j        | j        fd          \  }}||                     |          z   }||                     |          z   }|                     t          j        ||fd                    S )zExecute forward pass in PSA module.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after attention and feed-forward processing.
        r   r   )rl   r   r   rH  r  ro   rC   r   r   s       rM   rZ   zPSA.forward  s|     xx{{  $&$&!1q 991		!Oxx	1a&!,,---rN   )r   )r6   r7   rg   r7   r   rE   r[   r\   rb   s   @rM   r   r   e  sg         ,f f f f f f f". . . . . . . .rN   r   c                  .     e Zd ZdZdd fd
ZddZ xZS )r   aH  C2PSA module with attention mechanism for enhanced feature extraction and processing.

    This module implements a convolutional block with attention mechanisms to enhance feature extraction and processing
    capabilities. It includes a series of PSABlock modules for self-attention and feed-forward operations.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c1.
        m (nn.Sequential): Sequential container of PSABlock modules for attention and feed-forward operations.

    Methods:
        forward: Performs a forward pass through the C2PSA module, applying attention and feed-forward operations.

    Examples:
        >>> c2psa = C2PSA(c1=256, c2=256, n=3, e=0.5)
        >>> input_tensor = torch.randn(1, 256, 64, 64)
        >>> output_tensor = c2psa(input_tensor)

    Notes:
        This module essentially is the same as PSA module, but refactored to allow stacking more PSABlock modules.
    r   r   r6   r7   rg   r   r   rE   c                Z    t                                                       ||k    sJ t          ||z             _        t	          |d j        z  dd           _        t	          d j        z  |d           _        t          j         fdt          |          D               _
        dS )zInitialize C2PSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        rS   r   c              3  V   K   | ]#}t          j        d j        dz            V  $dS )r   r  r  N)r  r   r   rX   rJ   s     rM   r   z!C2PSA.__init__.<locals>.<genexpr>  s<       l l^_$&SDFVXL!Y!Y!Y l l l l l lrN   Nr   rJ   r6   rg   r   r   rL   s   `    rM   r>   zC2PSA.__init__  s     	RxxxxR!VAJ1--DF
B** l l l lchijckck l l lmrN   rK   rO   rP   c                    |                      |                              | j        | j        fd          \  }}|                     |          }|                     t          j        ||fd                    S )zProcess the input tensor through a series of PSA blocks.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        r   r   )rl   r   r   r   ro   rC   r   r   s       rM   rZ   zC2PSA.forward  sd     xx{{  $&$&!1q 991FF1IIxx	1a&!,,---rN   r   r   r   r[   r\   rb   s   @rM   r   r     sg         .n n n n n n n". . . . . . . .rN   r   c                  &     e Zd ZdZdd fd
Z xZS )r"   a  C2fPSA module with enhanced feature extraction using PSA blocks.

    This class extends the C2f module by incorporating PSA blocks for improved attention mechanisms and feature
    extraction.

    Attributes:
        c (int): Number of hidden channels.
        cv1 (Conv): 1x1 convolution layer to reduce the number of input channels to 2*c.
        cv2 (Conv): 1x1 convolution layer to reduce the number of output channels to c2.
        m (nn.ModuleList): List of PSABlock modules for feature extraction.

    Methods:
        forward: Performs a forward pass through the C2fPSA module.
        forward_split: Performs a forward pass using split() instead of chunk().

    Examples:
        >>> import torch
        >>> from ultralytics.nn.modules.block import C2fPSA
        >>> model = C2fPSA(c1=64, c2=64, n=3, e=0.5)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> output = model(x)
        >>> print(output.shape)
    r   r   r6   r7   rg   r   r   rE   c                     ||k    sJ t                                          ||||           t          j         fdt	          |          D                        _        dS )zInitialize C2fPSA module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            n (int): Number of PSABlock modules.
            e (float): Expansion ratio.
        )r   r   c           	   3  r   K   | ]1}t          j        d t          j        dz  d                    V  2dS )r   r  r   r  N)r  r   r<  r  s     rM   r   z"C2fPSA.__init__.<locals>.<genexpr>  sG      rrdex3#dfXZl\]J^J^___rrrrrrrN   Nr  r  s   `    rM   r>   zC2fPSA.__init__  sh     RxxxxR1***rrrrinopiqiqrrrrrrN   r  r   r   rb   s   @rM   r"   r"     sW         0s s s s s s s s s s srN   r"   c                  ,     e Zd ZdZd fdZddZ xZS )r2   aH  SCDown module for downsampling with separable convolutions.

    This module performs downsampling using a combination of pointwise and depthwise convolutions, which helps in
    efficiently reducing the spatial dimensions of the input tensor while maintaining the channel information.

    Attributes:
        cv1 (Conv): Pointwise convolution layer that reduces the number of channels.
        cv2 (Conv): Depthwise convolution layer that performs spatial downsampling.

    Methods:
        forward: Applies the SCDown module to the input tensor.

    Examples:
        >>> import torch
        >>> from ultralytics.nn.modules.block import SCDown
        >>> model = SCDown(c1=64, c2=128, k=3, s=2)
        >>> x = torch.randn(1, 64, 128, 128)
        >>> y = model(x)
        >>> print(y.shape)
        torch.Size([1, 128, 64, 64])
    r6   r7   rg   rk   r  c                    t                                                       t          ||dd          | _        t          |||||d          | _        dS )zInitialize SCDown module.

        Args:
            c1 (int): Input channels.
            c2 (int): Output channels.
            k (int): Kernel size.
            s (int): Stride.
        r   F)rk   r  r   rw   N)r=   r>   r   rl   ro   )rJ   r6   rg   rk   r  rL   s        rM   r>   zSCDown.__init__  sP     	B1%%B!qBE:::rN   rK   rO   rP   c                R    |                      |                     |                    S )zApply convolution and downsampling to the input tensor.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Downsampled output tensor.
        )ro   rl   rr   s     rM   rZ   zSCDown.forward  s      xx$$$rN   r  r[   r\   rb   s   @rM   r2   r2     s[         ,; ; ; ; ; ;	% 	% 	% 	% 	% 	% 	% 	%rN   r2   c                  0     e Zd ZdZ	 dd fdZddZ xZS )r3   a?  TorchVision module to allow loading any torchvision model.

    This class provides a way to load a model from the torchvision library, optionally load pre-trained weights, and
    customize the model by truncating or unwrapping layers.

    Args:
        model (str): Name of the torchvision model to load.
        weights (str, optional): Pre-trained weights to load. Default is "DEFAULT".
        unwrap (bool, optional): Unwraps the model to a sequential containing all but the last `truncate` layers.
        truncate (int, optional): Number of layers to truncate from the end if `unwrap` is True. Default is 2.
        split (bool, optional): Returns output from intermediate child modules as list. Default is False.

    Attributes:
        m (nn.Module): The loaded torchvision model, possibly truncated and unwrapped.
    DEFAULTTrS   Fmodelstrweightsunwrapr   truncater7   r   c                   ddl }t                                                       t          |j        d          r"|j                            ||          | _        n. |j        j        |         t          |                    | _        |rt          | j        
                                          }t          |d         t          j                  r3g t          |d         
                                          |dd         }t          j        |r|d|          n| | _        || _        dS d| _        t          j                    x| j        _        | j        _        dS )ae  Load the model and weights from torchvision.

        Args:
            model (str): Name of the torchvision model to load.
            weights (str): Pre-trained weights to load.
            unwrap (bool): Whether to unwrap the model.
            truncate (int): Number of layers to truncate.
            split (bool): Whether to split the output.
        r   N	get_model)r  )
pretrainedr   F)torchvisionr=   r>   r  modelsr  r   __dict__r   r   children
isinstancer?   r   r   r   headheads)	rJ   r  r  r  r  r   r  layersrL   s	           rM   r>   zTorchVision.__init__;  s;    	;%{33 	R '11%1IIDFF7['074==QQQDF 	7$&//++,,F&)R]33 DC4q	 2 2 4 455Cqrr
C]8%OVJhYJ%7%7QDFDJJJDJ)+6DFK$&,,,rN   rK   rO   rP   c                    | j         r*|g                    fd| j        D                        n|                     |          S )zForward pass through the model.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor | list[torch.Tensor]): Output tensor or list of tensors.
        c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z&TorchVision.forward.<locals>.<genexpr>c  s/      ..!QQquXX......rN   )r   r   r   r   s     @rM   rZ   zTorchVision.forwardX  sV     : 	AHH....tv.......q		ArN   )r  TrS   F)
r  r  r  r  r  r   r  r7   r   r   r[   r\   rb   s   @rM   r3   r3   *  sg         " kp7 7 7 7 7 7 7:       rN   r3   c                  .     e Zd ZdZdd fdZddZ xZS )AAttna  Area-attention module for YOLO models, providing efficient attention mechanisms.

    This module implements an area-based attention mechanism that processes input features in a spatially-aware manner,
    making it particularly effective for object detection tasks.

    Attributes:
        area (int): Number of areas the feature map is divided into.
        num_heads (int): Number of heads into which the attention mechanism is divided.
        head_dim (int): Dimension of each attention head.
        qkv (Conv): Convolution layer for computing query, key and value tensors.
        proj (Conv): Projection convolution layer.
        pe (Conv): Position encoding convolution layer.

    Methods:
        forward: Applies area-attention to input tensor.

    Examples:
        >>> attn = AAttn(dim=256, num_heads=8, area=4)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = attn(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    r   r   r7   r  areac           	     ,   t                                                       || _        || _        ||z  x| _        }|| j        z  }t          ||dz  dd          | _        t          ||dd          | _        t          ||ddd|d          | _        dS )a#  Initialize an Area-attention module for YOLO models.

        Args:
            dim (int): Number of hidden channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            area (int): Number of areas the feature map is divided into.
        ri   r   Frv   r&  r  N)	r=   r>   r!  r  r  r   r  r^  r  )rJ   r   r  r!  r  all_head_dimrL   s         rM   r>   zAAttn.__init__  s     		"#&)#33$.0\A-qe<<<sA5999	|S!QSeDDDrN   rK   rO   rP   c                   |j         \  }}}}||z  }|                     |                              d                              dd          }| j        dk    r5|                    || j        z  || j        z  |dz            }|j         \  }}}|                    ||| j        | j        dz            	                    dddd          
                    | j        | j        | j        gd          \  }	}
}|	                    dd          |
z  | j        dz  z  }|                    d          }||                    dd          z  }|	                    dddd          }|	                    dddd          }| j        dk    rY|                    || j        z  || j        z  |          }|                    || j        z  || j        z  |          }|j         \  }}}|                    ||||          	                    dddd                                          }|                    ||||          	                    dddd                                          }||                     |          z   }|                     |          S )	zProcess the input tensor through the area-attention.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention.
        rS   r   ri   r   r   r  r   r  )rT   r  flattenrU   r!  ri  rG   r  r  permuter   rV   
contiguousr  r^  )rJ   rK   r  r  r  r  r  r  rX   rj  rk   rk  rH  s                rM   rZ   zAAttn.forward  sJ    W
1aEhhqkk!!!$$..q!449q==++a$)mQ$)^QUCCCiGAq!HHQ4>4=1+<==WQ1a  UDM4=$-@aUHH 	1a
 B##a'DM4,?@|||##r2&&&IIaAq!!IIaAq!!9q==		!ty.!di-;;A		!ty.!di-;;AgGAq!IIaAq!!))!Q155@@BBIIaAq!!))!Q155@@BB

Nyy||rN   r   )r   r7   r  r7   r!  r7   r[   r\   rb   s   @rM   r   r   i  sg         0E E E E E E E&$ $ $ $ $ $ $ $rN   r   c                  F     e Zd ZdZdd fd
Zedd            ZddZ xZS )ABlocka  Area-attention block module for efficient feature extraction in YOLO models.

    This module implements an area-attention mechanism combined with a feed-forward network for processing feature maps.
    It uses a novel area-based attention approach that is more efficient than traditional self-attention while
    maintaining effectiveness.

    Attributes:
        attn (AAttn): Area-attention module for processing spatial features.
        mlp (nn.Sequential): Multi-layer perceptron for feature transformation.

    Methods:
        _init_weights: Initializes module weights using truncated normal distribution.
        forward: Applies area-attention and feed-forward processing to input tensor.

    Examples:
        >>> block = ABlock(dim=256, num_heads=8, mlp_ratio=1.2, area=1)
        >>> x = torch.randn(1, 256, 32, 32)
        >>> output = block(x)
        >>> print(output.shape)
        torch.Size([1, 256, 32, 32])
    333333?r   r   r7   r  	mlp_ratiorE   r!  c           	     B   t                                                       t          |||          | _        t	          ||z            }t          j        t          ||d          t          ||dd                    | _        | 	                    | j
                   dS )aa  Initialize an Area-attention block module.

        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of heads into which the attention mechanism is divided.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            area (int): Number of areas the feature map is divided into.
        )r  r!  r   Frv   N)r=   r>   r   rH  r7   r?   r   r   mlpapply_init_weights)rJ   r   r  r+  r!  mlp_hidden_dimrL   s         rM   r>   zABlock.__init__  s     	#>>>	S9_--=c>1!=!=tNTWYZ`e?f?f?fgg

4%&&&&&rN   r   r   c                    t          | t          j                  rTt          j                            | j        d           | j        )t          j                            | j        d           dS dS dS )zInitialize weights using a truncated normal distribution.

        Args:
            m (nn.Module): Module to initialize.
        g{Gz?)stdNr   )r  r?   r@   inittrunc_normal_rH   r:   	constant_r   s    rM   r/  zABlock._init_weights  sm     a## 	-G!!!(!555v!!!!&!,,,,,	- 	-!!rN   rK   rO   rP   c                b    ||                      |          z   }||                     |          z   S )zForward pass through ABlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after area-attention and feed-forward processing.
        )rH  r-  rr   s     rM   rZ   zABlock.forward  s,     		!488A;;rN   )r*  r   )r   r7   r  r7   r+  rE   r!  r7   )r   r   r[   )	r]   r^   r_   r`   r>   r  r/  rZ   ra   rb   s   @rM   r)  r)    s         ,' ' ' ' ' ' '" 	- 	- 	- \	-
 
 
 
 
 
 
 
rN   r)  c                  >     e Zd ZdZ	 	 	 	 	 	 	 	 dd fdZddZ xZS )A2C2fa  Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.

    This module extends the C2f architecture by incorporating area-attention and ABlock layers for improved feature
    processing. It supports both area-attention and standard convolution modes.

    Attributes:
        cv1 (Conv): Initial 1x1 convolution layer that reduces input channels to hidden channels.
        cv2 (Conv): Final 1x1 convolution layer that processes concatenated features.
        gamma (nn.Parameter | None): Learnable parameter for residual scaling when using area attention.
        m (nn.ModuleList): List of either ABlock or C3k modules for feature processing.

    Methods:
        forward: Processes input through area-attention or standard convolution pathway.

    Examples:
        >>> m = A2C2f(512, 512, n=1, a2=True, area=1)
        >>> x = torch.randn(1, 512, 32, 32)
        >>> output = m(x)
        >>> print(output.shape)
        torch.Size([1, 512, 32, 32])
    r   TF       @r   r6   r7   rg   r   a2r   r!  residualr+  rE   r   r   r   c                  	
 t                                                       t          ||z            dz  dk    s
J d            t          |dd          | _        t          d|z   z  |d          | _        r-|r+t          j        dt          j	        |          z  d          nd| _
        t          j        	
fd	t          |          D                       | _        dS )
a  Initialize Area-Attention C2f module.

        Args:
            c1 (int): Number of input channels.
            c2 (int): Number of output channels.
            n (int): Number of ABlock or C3k modules to stack.
            a2 (bool): Whether to use area attention blocks. If False, uses C3k blocks instead.
            area (int): Number of areas the feature map is divided into.
            residual (bool): Whether to use residual connections with learnable gamma parameter.
            mlp_ratio (float): Expansion ratio for MLP hidden dimension.
            e (float): Channel expansion ratio for hidden channels.
            g (int): Number of groups for grouped convolutions.
            shortcut (bool): Whether to use shortcut connections in C3k blocks.
        re   r   z-Dimension of ABlock must be a multiple of 32.r   {Gz?TrT  Nc              3     K   | ]B}r)t          j        fd t          d          D              nt          d          V  CdS )c              3  B   K   | ]}t          d z            V  dS )re   N)r)  )r   rX   r!  rf   r+  s     rM   r   z+A2C2f.__init__.<locals>.<genexpr>.<genexpr>8  s5      TTaF2rRxDAATTTTTTrN   rS   N)r?   r   r   r  )r   rX   r:  r!  rf   r   r+  r   s     rM   r   z!A2C2f.__init__.<locals>.<genexpr>7  s       
 
  -BMTTTTTT5QR88TTTUURQ!,,
 
 
 
 
 
rN   )r=   r>   r7   r   rl   ro   r?   rF   rC   r8  gammar   r   r   )rJ   r6   rg   r   r:  r!  r;  r+  r   r   r   rf   rL   s       `` ` ``@rM   r>   zA2C2f.__init__  s
   6 	a[[Bw!|||L|||B1%%Q"b!,,PRiW_iR\$B"7tLLLLei
 
 
 
 
 
 
 
 
 
 1XX	
 
 
 
 
rN   rK   rO   rP   c                D   |                      |          g                    fd| j        D                        |                     t	          j        d                    | j        3|| j                            d| j        j        d         dd          z  z   S S )zForward pass through A2C2f layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            (torch.Tensor): Output tensor after processing.
        c              3  :   K   | ]} |d                    V  dS r   r   r   s     rM   r   z A2C2f.forward.<locals>.<genexpr>H  r   rN   r   Nr   r   )	rl   r   r   ro   rC   r   r@  rG   rT   r   s     @rM   rZ   zA2C2f.forward>  s     XXa[[M	****46******HHUYq!__%%:!tzr4:+;A+>1EEIIIrN   )r   Tr   Fr9  r   r   T)r6   r7   rg   r7   r   r7   r:  r   r!  r7   r;  r   r+  rE   r   rE   r   r7   r   r   r[   r\   rb   s   @rM   r8  r8    s{         4 (
 (
 (
 (
 (
 (
 (
T       rN   r8  c                  .     e Zd ZdZdd fd	ZddZ xZS )	SwiGLUFFNz@SwiGLU Feed-Forward Network for transformer-based architectures.rR   r0  r7   r   r   rP   r  c                    t                                                       t          j        |||z            | _        t          j        ||z  dz  |          | _        dS )zInitialize SwiGLU FFN with input dimension, output dimension, and expansion factor.

        Args:
            gc (int): Guide channels.
            ec (int): Embedding channels.
            e (int): Expansion factor.
        rS   N)r=   r>   r?   r4  w12w3)rJ   r0  r   r   rL   s       rM   r>   zSwiGLUFFN.__init__R  sQ     	9RR(()AFaK,,rN   rK   rO   c                    |                      |          }|                    dd          \  }}t          j        |          |z  }|                     |          S )z.Apply SwiGLU transformation to input features.rS   r   r   )rF  r   r   silurG  )rJ   rK   x12r   r   hiddens         rM   rZ   zSwiGLUFFN.forward^  sK    hhqkk1"%%BbwwvrN   )rR   )r0  r7   r   r7   r   r7   rP   r  r[   r\   rb   s   @rM   rD  rD  O  s\        JJ
- 
- 
- 
- 
- 
- 
-       rN   rD  c                  ,     e Zd ZdZd
 fdZdd	Z xZS )Residualz7Residual connection wrapper for neural network modules.r   r   rP   r  c                   t                                                       || _        t          j                            | j        j        j                   t          j                            | j        j        j                   dS )zInitialize residual module with the wrapped module.

        Args:
            m (nn.Module): Module to wrap with residual connection.
        N)	r=   r>   r   r?   r3  zeros_rG  r:   rH   )rJ   r   rL   s     rM   r>   zResidual.__init__i  s_     	
tvy~&&& 	tvy'(((((rN   rK   rO   c                2    ||                      |          z   S )z,Apply residual connection to input features.r   rr   s     rM   rZ   zResidual.forwardv  s    466!99}rN   )r   r   rP   r  r[   r\   rb   s   @rM   rM  rM  f  sW        AA) ) ) ) ) )       rN   rM  c                  ,     e Zd ZdZd fdZddZ xZS )SAVPEzESpatial-Aware Visual Prompt Embedding module for feature enhancement.rQ  r  r   r7   rB  c           	        t                                                       t          j        fdt	          |          D                       | _        t          j        fdt	          |          D                       | _        d| _        t          j        dz  |d          | _	        t          j        dz  | j        dd          | _
        t          j        d| j        dd          | _        t          j        t          d| j        z  | j        d          t          j        | j        | j        dd                    | _        dS )	a  Initialize SAVPE module with channels, intermediate channels, and embedding dimension.

        Args:
            ch (list[int]): List of input channel dimensions.
            c3 (int): Intermediate channels.
            embed (int): Embedding dimension.
        c           	   3     K   | ]h\  }}t          j        t          |d           t          d           |dv rt          j        |dz            nt          j                              V  idS )ri      r   rS   rS   scale_factorNr?   r   r   Upsampler   r   r   rK   r   s      rM   r   z!SAVPE.__init__.<locals>.<genexpr>  s       !
 !
 1 MQARQTUY_T_T_!a%1P1P1P1Pegeperer !
 !
 !
 !
 !
 !
rN   c              3     K   | ]X\  }}t          j        t          |d           |dv rt          j        |dz            nt          j                              V  YdS )r   rU  rS   rV  NrX  rZ  s      rM   r   z!SAVPE.__init__.<locals>.<genexpr>  sz       !
 !
1 M$q"a..QRX[["+1q5*I*I*I*I^`^i^k^kll!
 !
 !
 !
 !
 !
rN   r5   ri   r   )rz   rS   N)r=   r>   r?   r   r  rl   ro   r   r@   rp   r  r  r   r   cv6)rJ   rQ  r   rB  rL   s     ` rM   r>   zSAVPE.__init__~  sB    	= !
 !
 !
 !
 ""	!
 !
 !
 
 
 = !
 !
 !
 !
!"!
 !
 !
 
 

 9QVUA..9QVTVQ:::9Q1555=a$&j$&!!<!<biPTPVXYcd>e>e>effrN   rK   rc  vprO   rP   c                R     fdt          |          D             }                     t          j        |d                    } fdt          |          D             }                     t          j        |d                    }|j        \  }}}}|j        d         }|                    ||d          }|                    |d j        ||          	                    d|ddd                              ||z   j        ||          }|                    ||d||                              ||z  d||          } 
                    t          j        |                     |          fd                    }|                    || j        d          }|                    ||dd          }||z  t          j        |          t          j        |j                  j        z  z   }	t!          j        |	d                              |j                  }	|	                    dd          |                    | j        | j        z  d                              dd          z  }
t!          j        |
                    dd                              ||d          dd	          S )
zJProcess input features and visual prompts to generate enhanced embeddings.c                B    g | ]\  }} j         |         |          S r   )ro   r   r   xirJ   s      rM   r   z!SAVPE.forward.<locals>.<listcomp>  +    777B[TXa[__777rN   r   r   c                B    g | ]\  }} j         |         |          S r   )rl   r`  s      rM   r   z!SAVPE.forward.<locals>.<listcomp>  rb  rN   r   r  rS   rs  )r  r  rC   r   rp   rT   rG   ri  r   expandr\  r  logical_notfinfor<   minr   rV   torU   ru  )rJ   rK   r]  r   r  r  r  r  Qscore
aggregateds   `          rM   rZ   zSAVPE.forward  s`   7777)A,,777HHUYqa((())7777)A,,777HHUYqa((())W
1aHQKFF1aIIaDFAq))00QBCCKKAPQESWSY[\^_``ZZ1aA&&..q1uaA>>HHUY488B<<0a88899IIaDFB''ZZ1a$$B*2..QW1E1E1III	%R(((++AG44__R,,qyyDFAKQS/T/T/^/^_ace/f/ff
{://B77??1bIIrUVWWWWrN   )rQ  r  r   r7   rB  r7   )rK   rc  r]  rO   rP   rO   r\   rb   s   @rM   rR  rR  {  se        OOg g g g g g6X X X X X X X XrN   rR  c                  :     e Zd ZdZdd fdZdd fdZd Z xZS )Proto26zDUltralytics YOLO26 models mask Proto module for segmentation models.r   rd   re   P   rQ  tuplerf   r7   rg   ncc           	        t                                          |||           t          j        fddd         D                       | _        t          d         |d          | _        t          j        t          d         |d          t          ||d          t          j        ||d                    | _	        dS )ap  Initialize the Ultralytics YOLO models mask Proto module with specified number of protos and masks.

        Args:
            ch (tuple): Tuple of channel sizes from backbone feature maps.
            c_ (int): Intermediate channels.
            c2 (int): Output channels (number of protos).
            nc (int): Number of classes for semantic segmentation.
        c              3  H   K   | ]}t          |d          d          V  dS )r   r   rj   Nr   )r   rK   rQ  s     rM   r   z#Proto26.__init__.<locals>.<genexpr>  s6      (M(MaA!)<)<)<(M(M(M(M(M(MrN   r   Nr   ri   rj   )
r=   r>   r?   r   feat_refiner   	feat_fuser   r@   semseg)rJ   rQ  rf   rg   rq  rL   s    `   rM   r>   zProto26.__init__  s     	R$$$=(M(M(M(Mbf(M(M(MMMbeR1---mDAa$8$8$8$r2:K:K:KRYWY[]_`MaMabbrN   TrK   rO   return_semsegr   rP   c                r   |d         }t          | j                  D ]B\  }} |||dz                      }t          j        ||j        dd         d          }||z   }Ct                                          |                     |                    }| j        r|r| 	                    |          }||fS |S )zUPerform a forward pass by fusing multi-scale feature maps and generating proto masks.r   r   rS   Nr  r  )
r  rt  r   r  rT   r=   rZ   ru  trainingrv  )
rJ   rK   rw  featr   fup_featr  rv  rL   s
            rM   rZ   zProto26.forward  s    td.// 	" 	"DAqa!a%kkGmG$*QRR.yQQQG'>DDGGOODNN40011= 	] 	[[&&Fv;rN   c                    d| _         dS )zHFuse the model for inference by removing the semantic segmentation head.N)rv  r  s    rM   r  zProto26.fuse  s    rN   )r   rd   re   ro  )rQ  rp  rf   r7   rg   r7   rq  r7   )T)rK   rO   rw  r   rP   rO   )r]   r^   r_   r`   r>   rZ   r  ra   rb   s   @rM   rn  rn    s        NNc c c c c c c            rN   rn  c                  v     e Zd ZdZed             Zed             Zed             Z fdZ	d Z
d Zd Z xZS )	RealNVPzRealNVP: a flow-based generative model.

    References:
        https://arxiv.org/abs/1605.08803
        https://github.com/open-mmlab/mmpose/blob/main/mmpose/models/utils/realnvp.py
    c            
        t          j        t          j        dd          t          j                    t          j        dd          t          j                    t          j        dd          t          j                              S )z3Get the scale model in a single invertible mapping.rS   r  )r?   r   r4  r  Tanhr   rN   rM   netszRealNVP.nets  s\     }RYq"--rwyy")B:K:KRWYYXZXabdfgXhXhjljqjsjstttrN   c            
         t          j        t          j        dd          t          j                    t          j        dd          t          j                    t          j        dd                    S )z9Get the translation model in a single invertible mapping.rS   r  )r?   r   r4  r  r   rN   rM   nettzRealNVP.nett  sP     }RYq"--rwyy")B:K:KRWYYXZXabdfgXhXhiiirN   c                V    t           j                            | j        | j                  S )zThe prior distribution.)rC   distributionsMultivariateNormalloccovr  s    rM   priorzRealNVP.prior  s!     "55dhIIIrN   c                    t                                                                            dt          j        d                                          dt          j        d                                          dt          j        ddgddggdz  t          j                             t          j        	                     fd	t          t           j                            D                        _        t          j        	                     fd
t          t           j                            D                        _                                          d S )Nr  rS   r  maskr   r   ri   r;   c                8    g | ]}                                 S r   )r  r  s     rM   r   z$RealNVP.__init__.<locals>.<listcomp>  !    %Q%Q%Qadiikk%Q%Q%QrN   c                8    g | ]}                                 S r   )r  r  s     rM   r   z$RealNVP.__init__.<locals>.<listcomp>  r  rN   )r=   r>   register_bufferrC   r6  eyer_  float32r?   r   r   r   r  r  tinit_weightsrq  s   `rM   r>   zRealNVP.__init__  s   UEKNN333UEIaLL111VU\Aq6Aq62BQ2Fem%\%\%\]]]$$%Q%Q%Q%Q5TY;P;P%Q%Q%QRR$$%Q%Q%Q%Q5TY;P;P%Q%Q%QRRrN   c                    |                                  D ]B}t          |t          j                  r&t          j                            |j        d           CdS )zInitialize model weights.r=  )gainN)modulesr  r?   r4  r3  xavier_uniform_rH   )rJ   r   s     rM   r  zRealNVP.init_weights  sU     	= 	=A!RY'' =''t'<<<	= 	=rN   c                   |                     |j        d                   |}}t          t          t	          | j                                      D ]}| j        |         |z  } | j        |         |          d| j        |         z
  z  } | j        |         |          d| j        |         z
  z  }d| j        |         z
  ||z
  z  t          j	        |           z  |z   }||
                    d          z  }||fS )zApply mapping from the data space to the latent space and calculate the log determinant of the Jacobian
        matrix.
        r   r   r   )	new_zerosrT   reversedr   r   r  r  r  rC   rv  r  )rJ   rK   log_det_jacobzr   z_r  r  s           rM   
backward_pzRealNVP.backward_p  s     ;;qwqz22Aq%DF,,-- 	* 	*A1!Bq	"TYq\!12Aq	"TYq\!12ATYq\!a!e,uy!}}<rAAQUUqU\\)MM-rN   c                   |j         t          j        k    r?| j        d         d         j        j         t          j        k    r|                                  |                     |          \  }}| j                            |          |z   S )z<Calculate the log probability of given sample in data space.r   )	r<   rC   r  r  rH   rE   r  r  log_prob)rJ   rK   r  log_dets       rM   r  zRealNVP.log_prob  sj    7em##q	!(;(AU](R(RJJLLL__Q''
7z""1%%//rN   )r]   r^   r_   r`   r  r  r  propertyr  r>   r  r  r  ra   rb   s   @rM   r  r    s          u u \u j j \j J J XJ	 	 	 	 	= = =     0 0 0 0 0 0 0rN   r  )Lr`   
__future__r   rC   torch.nnr?   torch.nn.functionalr  r   ultralytics.utils.torch_utilsr   rB   r   r   r   r	   r
   r   transformerr   __all__Moduler   r-   r+   r*   r   r   r   r   r   r   r%   r.   r   r#   r)   r   r   r  r1   r,  r    r,   r(   r   r  r  r/   r   r   r   r   r'   r&   r  r$   r  r0   r   r!   r   r  r   r   r"   r2   r3   r   r)  r8  rD  rM  rR  rn  r  r   rN   rM   <module>r     s}
     " " " " " "                 : : : : : : F F F F F F F F F F F F F F F F ) ) ) ) ) )(V\ \ \ \ \") \ \ \2> > > > >BI > > >,! ! ! ! !RY ! ! !H)( )( )( )( )(bi )( )( )(XD D D D D") D D D.; ; ; ; ;29 ; ; ;@       *6 6 6 6 6 6 6 66) ) ) ) )") ) ) )DJ J J J J J J J4x x x x x" x x x&; ; ; ; ;BI ; ; ;00 0 0 0 02 0 0 0&M M M M Mb M M M&/ / / / /bi / / /8P P P P P P P P6C C C C CBI C C C>J J J J J") J J J0    ")   >1$ 1$ 1$ 1$ 1$") 1$ 1$ 1$h?) ?) ?) ?) ?)bi ?) ?) ?)D>% >% >% >% >%ry >% >% >%B6 6 6 6 6bi 6 6 64.6 .6 .6 .6 .6	 .6 .6 .6b, , , , ,J , , ,*_ _ _ _ _R _ _ _&) ) ) ) )29 ) ) )B1 1 1 1 1L 1 1 1(    BI   && & & & &BI & & &2) ) ) ) )bi ) ) )63 3 3 3 3ry 3 3 3.< < < < <RY < < <4) ) ) ) )") ) ) )8%
 %
 %
 %
 %
3 %
 %
 %
Pf f f f f" f f f*> > > > >ux > > >B*< *< *< *< *<") *< *< *<Z^ ^ ^ ^ ^S ^ ^ ^>9 9 9 9 9	 9 9 9x/ / / / /ry / / /d4. 4. 4. 4. 4.") 4. 4. 4.n4. 4. 4. 4. 4.BI 4. 4. 4.n$s $s $s $s $sS $s $s $sN-% -% -% -% -%RY -% -% -%`< < < < <") < < <~P P P P PBI P P Pf> > > > >RY > > >BO O O O OBI O O Od    	   .    ry   *8X 8X 8X 8X 8XBI 8X 8X 8Xv         e      F:0 :0 :0 :0 :0bi :0 :0 :0 :0 :0rN   