
    /j7                        d dl mZ d dlZd dlmZ ddd
ZdddZ G d dej                  Z G d dej                  ZdS )    )annotationsN)optimHz>Gtorch.Tensorepsfloatreturnc                   t          | j                  dk    sJ |                                 }||                                |z   z  }|                     d          |                     d          k    r|j        }dD ])\  }}}||j        z  }||z  ||z  |z  z   }||z  ||z  z   }*|                     d          |                     d          k    r|j        }|S )a"  Compute the zeroth power / orthogonalization of matrix G using Newton-Schulz iteration.

    This function implements a quintic Newton-Schulz iteration to compute an approximate orthogonalization of the input
    matrix G. The iteration coefficients are optimized to maximize convergence slope at zero, producing a result similar
    to UV^T from SVD, where USV^T = G, but with relaxed convergence guarantees that empirically work well for
    optimization purposes.

    Args:
        G (torch.Tensor): Input 2D tensor/matrix to orthogonalize.
        eps (float, optional): Small epsilon value added to norm for numerical stability. Default: 1e-7.

    Returns:
        (torch.Tensor): Orthogonalized matrix with same shape as input G.

    Examples:
        >>> G = torch.randn(128, 64)
        >>> G_ortho = zeropower_via_newtonschulz5(G)
        >>> print(G_ortho.shape)
        torch.Size([128, 64])

    Notes:
        - Uses bfloat16 precision for computation.
        - Performs exactly 5 Newton-Schulz iteration steps with fixed coefficients.
        - Automatically transposes for efficiency when rows > columns.
        - Output approximates US'V^T where S' has diagonal entries ~ Uniform(0.5, 1.5).
        - Does not produce exact UV^T but works well empirically for neural network optimization.
       r      )guV@ggn@ @r   r   r   r   )lenshapebfloat16normsizeT)r   r   XabcABs           [/home/longshao/multi-rider-rag/.venv/lib/python3.11/site-packages/ultralytics/optim/muon.pyzeropower_via_newtonschulz5r   	   s    8 qw<<1	

ACAvvayy166!99C  1a GEAEAIEAEMvvayy166!99CH    ffffff?Tgradmomentumbetanesterovboolc                d   |                     | d|z
             |r|                     ||          n|}|j        dk    r#|                    t	          |          d          }t          |          }|t          d|                     d          |                     d          z            dz  z  }|S )aX  Compute Muon optimizer update with momentum and orthogonalization.

    This function applies momentum to the gradient, optionally uses Nesterov acceleration, and then orthogonalizes the
    update using Newton-Schulz iterations. For convolutional filters (4D tensors), it reshapes before orthogonalization
    and scales the final update based on parameter dimensions.

    Args:
        grad (torch.Tensor): Gradient tensor to update. Can be 2D or 4D (for conv filters).
        momentum (torch.Tensor): Momentum buffer tensor, modified in-place via lerp.
        beta (float, optional): Momentum coefficient for exponential moving average. Default: 0.95.
        nesterov (bool, optional): Whether to use Nesterov momentum acceleration. Default: True.

    Returns:
        (torch.Tensor): Orthogonalized update tensor with same shape as input grad. For 4D inputs, returns reshaped
            result matching original dimensions.

    Examples:
        >>> grad = torch.randn(64, 128)
        >>> momentum = torch.zeros_like(grad)
        >>> update = muon_update(grad, momentum, beta=0.95, nesterov=True)
        >>> print(update.shape)
        torch.Size([64, 128])

    Notes:
        - Momentum buffer is updated in-place: momentum = beta * momentum + (1-beta) * grad.
        - With Nesterov: update = beta * momentum + (1-beta) * grad.
        - Without Nesterov: update = momentum.
        - 4D tensors (conv filters) are reshaped to 2D as (out_channels, in_channels*height*width) for orthogonalization.
        - Final update is scaled by sqrt(max(1, dim[-2] / dim[-1])) to account for parameter dimensions.
    r            ?)lerp_lerpndimviewr   r   maxr   )r   r    r!   r"   updates        r   muon_updater/   ;   s    > NN4T"""*2@TYYx&&&F{aS[["--(00F
c!TYYr]]TYYr]]233s::FMr   c                  f     e Zd ZdZ	 	 	 	 	 	 	 dd fdZ ej                    dd            Z xZS )MuSGDa"  Hybrid optimizer combining Muon and SGD updates for neural network training.

    This optimizer implements a combination of Muon (a momentum-based optimizer with orthogonalization via Newton-Schulz
    iterations) and standard SGD with momentum. It allows different parameter groups to use either the hybrid Muon+SGD
    approach or pure SGD.

    Args:
        params (Iterable): Parameters to optimize or dicts defining parameter groups.
        muon (float, optional): Weight factor for Muon updates in hybrid mode. Default: 0.5.
        sgd (float, optional): Weight factor for SGD updates in hybrid mode. Default: 0.5.

    Attributes:
        muon (float): Scaling factor applied to Muon learning rate.
        sgd (float): Scaling factor applied to SGD learning rate in hybrid mode.

    Examples:
        >>> param_groups = [
        ...     {
        ...         "params": model.conv_params,
        ...         "lr": 0.02,
        ...         "use_muon": True,
        ...         "momentum": 0.95,
        ...         "nesterov": True,
        ...         "weight_decay": 0.01,
        ...     },
        ...     {
        ...         "params": model.other_params,
        ...         "lr": 0.01,
        ...         "use_muon": False,
        ...         "momentum": 0.9,
        ...         "nesterov": False,
        ...         "weight_decay": 0,
        ...     },
        ... ]
        >>> optimizer = MuSGD(param_groups, muon=0.5, sgd=0.5)
        >>> loss = model(data)
        >>> loss.backward()
        >>> optimizer.step()

    Notes:
        - Parameter groups with 'use_muon': True will receive both Muon and SGD updates.
        - Parameter groups with 'use_muon': False will receive only SGD updates.
        - The Muon update uses orthogonalization which works best for 2D+ parameter tensors.
    MbP?        Fr(   lrr	   r    weight_decayr"   r#   use_muonmuonsgdc	                    t          |||||          }	t                                          ||	           || _        || _        dS )aN  Initialize MuSGD optimizer with hybrid Muon and SGD capabilities.

        Args:
            params (Iterable): Iterable of parameters to optimize or dicts defining parameter groups.
            lr (float): Learning rate.
            momentum (float): Momentum factor for SGD.
            weight_decay (float): Weight decay (L2 penalty).
            nesterov (bool): Whether to use Nesterov momentum.
            use_muon (bool): Whether to enable Muon updates.
            muon (float): Scaling factor for Muon component.
            sgd (float): Scaling factor for SGD component.
        )r4   r    r5   r"   r6   N)dictsuper__init__r7   r8   )selfparamsr4   r    r5   r"   r6   r7   r8   defaults	__class__s             r   r<   zMuSGD.__init__   sV    . %
 
 
 	***	r   Nc           	        d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        D ]}|d         r|d         D ]u}|d         }|j        |j        }| j        |         }t          |          dk    r.t          j        |          |d<   t          j        |          |d<   t          ||d         |d         |d	         
          }|                    |	                    |j
                  || j        z              |d         dk    r|                    ||d                   }|d                             |d                                       |           |d	         r#|                    |d         |d                   n|d         }	|                    |	|| j        z              w|d         D ]}|d         }|j        |j        }|d         dk    r|                    ||d                   }| j        |         }t          |          dk    rt          j        |          |d<   |d                             |d                                       |           |d	         r#|                    |d         |d                   n|d         }|                    ||            |S )a#  Perform a single optimization step.

        Applies either hybrid Muon+SGD updates or pure SGD updates depending on the
        'use_muon' flag in each parameter group. For Muon-enabled groups, parameters
        receive both an orthogonalized Muon update and a standard SGD momentum update.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss. Default: None.

        Returns:
            (torch.Tensor | None): The loss value if closure is provided, otherwise None.

        Notes:
            - Parameters with None gradients are skipped.
            - Muon updates use Newton-Schulz orthogonalization and work best on 2D+ tensors.
            - Weight decay is applied only to the SGD component in hybrid mode.
        Nr6   r>   r4   r   momentum_buffermomentum_buffer_SGDr    r"   )r!   r"   alphar5   )torchenable_gradparam_groupsr   stater   
zeros_liker/   add_reshaper   r7   addmul_r8   )
r=   closurelossgrouppr4   r   rI   r.   
sgd_updates
             r   stepz
MuSGD.step   sK   ( "$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! & .	. .	.EZ  ,.x ? ?AtBv~ 6D JqME5zzQ383CA3F3F/07<7G7J7J34(e$56U:=NY^_iYj  F FF6>>!'22BN:KFLLL ^,11#xx~1FxGG/055eJ6GHHMMdSSS !,:'<!=U:EVWWW"#89 
 FF:rDH}-=F>>>>1?4 x . .AtBv~ 6D^,11#xx~1FxGG JqME5zzQ383CA3F3F/0+,11%
2CDDII$OOO !,6'8!9zARSSS"#45 
 FF6"F----!."    /33)r2   r3   r3   FFr(   r(   )r4   r	   r    r	   r5   r	   r"   r#   r6   r#   r7   r	   r8   r	   N	__name__
__module____qualname____doc__r<   rF   no_gradrT   __classcell__r@   s   @r   r1   r1   c   s        + +` !             D U]__G G G _G G G G Gr   r1   c                  X     e Zd ZdZdd fd	Z ej                    dd            Z xZS )MuonaS  Muon optimizer for usage in non-distributed settings.

    This optimizer implements the Muon algorithm, which combines momentum-based updates with orthogonalization via
    Newton-Schulz iterations. It applies weight decay and learning rate scaling to parameter updates.

    Args:
        params (iterable): Iterable of parameters to optimize or dicts defining parameter groups.
        lr (float, optional): Learning rate. Default: 0.02.
        weight_decay (float, optional): Weight decay (L2 penalty) coefficient. Default: 0.
        momentum (float, optional): Momentum coefficient for exponential moving average. Default: 0.95.

    Attributes:
        param_groups (list): List of parameter groups with their optimization settings.
        state (dict): Dictionary containing optimizer state for each parameter.

    Examples:
        >>> model = YourModel()
        >>> optimizer = Muon(model.parameters(), lr=0.02, weight_decay=0.01, momentum=0.95)
        >>> loss = model(data)
        >>> loss.backward()
        >>> optimizer.step()

    Notes:
        - Designed for non-distributed training environments.
        - Uses Muon updates with orthogonalization for all parameters.
        - Weight decay is applied multiplicatively before parameter update.
        - Parameters with None gradients are assigned zero gradients for synchronization.
    {Gz?r   r   r4   r	   r5   r    c                p    t          |||          }t                                          ||           dS )a}  Initialize Muon optimizer with orthogonalization-based updates.

        Args:
            params (Iterable): Iterable of parameters to optimize or dicts defining parameter groups.
            lr (float): Learning rate.
            weight_decay (float): Weight decay factor applied multiplicatively.
            momentum (float): Momentum factor for gradient accumulation.
        )r4   r5   r    N)r:   r;   r<   )r=   r>   r4   r5   r    r?   r@   s         r   r<   zMuon.__init__  s9     2L8LLL*****r   Nc                P   d}|5t          j                    5   |            }ddd           n# 1 swxY w Y   | j        D ]}|d         D ]}|j        t          j        |          |_        | j        |         }t          |          dk    rt          j        |          |d<   t          |j        |d         |d                   }|                    d|d         |d	         z  z
             |	                    |
                    |j                  |d          
           ڌ|S )a  Perform a single optimization step.

        Applies Muon updates to all parameters, incorporating momentum and orthogonalization.
        Weight decay is applied multiplicatively before the parameter update.

        Args:
            closure (Callable[[], torch.Tensor] | None, optional): A closure that reevaluates the model
                and returns the loss. Default: None.

        Returns:
            (torch.Tensor | None): The loss value if closure is provided, otherwise None.

        Examples:
            >>> optimizer = Muon(model.parameters())
            >>> loss = model(inputs)
            >>> loss.backward()
            >>> optimizer.step()

        Notes:
            - Parameters with None gradients are assigned zero gradients for synchronization.
            - Weight decay is applied as: p *= (1 - lr * weight_decay).
            - Muon update uses Newton-Schulz orthogonalization and works best on 2D+ tensors.
        Nr>   r   rB   r    )r!   r   r4   r5   rD   )rF   rG   rH   r   rJ   rI   r   r/   rN   rK   rL   r   )r=   rO   rP   rQ   rR   rI   r.   s          r   rT   z	Muon.step(  ss   2 "$$ ! !wyy! ! ! ! ! ! ! ! ! ! ! ! ! ! ! & 
	D 
	DE8_ 	D 	D6>"-a00AF
1u::??/4/?/B/BE+,$QVU3D-EER\L]^^^q5;~)>>>???v~~ag..uT{lCCCC	D rU   )ra   r   r   )r4   r	   r5   r	   r    r	   rV   rW   r^   s   @r   r`   r`      st         :
+ 
+ 
+ 
+ 
+ 
+ 
+ U]__) ) ) _) ) ) ) )r   r`   )r   )r   r   r   r	   r
   r   )r   T)
r   r   r    r   r!   r	   r"   r#   r
   r   )	
__future__r   rF   r   r   r/   	Optimizerr1   r`    r   r   <module>rg      s    # " " " " "       / / / / /d% % % % %PX X X X XEO X X XvT T T T T5? T T T T Tr   