Struct Muon

pub struct Muon<B>
where
    B: Backend,
{ /* private fields */ }

Expand description

Muon optimizer.

Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-processing step, in which each 2D parameter’s update is replaced with the nearest orthogonal matrix. For efficient orthogonalization we use a Newton-Schulz iteration, which has the advantage that it can be stably run in bfloat16 on the GPU.

§Important Notes

- Only for 2D+ parameters: Muon is designed for weight matrices. Use AdamW or SGD for biases, embeddings, and layer norms.
- Learning rate adjustment: Muon automatically adjusts the learning rate based on parameter shape. See `AdjustLrFn` for details.
- Weight decay timing: Unlike typical optimizers, Muon applies weight decay AFTER orthogonalization but uses the original (unadjusted) learning rate for it.

Trait Implementations§

§

impl<B> Clone for Muon<B>
where B: Clone + Backend,

§

fn clone(&self) -> Muon<B>

Returns a duplicate of the value. Read more

1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more

§

impl<B> SimpleOptimizer<B> for Muon<B>
where B: Backend,

§

fn step<const D: usize>( &self, lr: f64, tensor: Tensor<B, D>, grad: Tensor<B, D>, state: Option<<Muon<B> as SimpleOptimizer<B>>::State<D>>, ) -> (Tensor<B, D>, Option<<Muon<B> as SimpleOptimizer<B>>::State<D>>)

Perform a single Muon optimization step.

§Algorithm

1. Apply momentum to gradient
2. Orthogonalize update via Newton-Schulz
3. Adjust learning rate based on parameter shape
4. Apply weight decay (using original lr)
5. Update parameter (using adjusted lr)

§Notes

Unlike typical optimizers, the weight decay and parameter update use different learning rates:

- Weight decay uses the original lr
- Parameter update uses the shape-adjusted lr

§Panics

This function will panic if the input tensors are not 2D.

§

type State<const D: usize> = MuonState<B, D>

The state of the optimizer. It also implements record, so that it can be saved.

§

fn to_device<const D: usize>( state: <Muon<B> as SimpleOptimizer<B>>::State<D>, device: &<B as BackendTypes>::Device, ) -> <Muon<B> as SimpleOptimizer<B>>::State<D>

Change the device of the state. Read more

Auto Trait Implementations§

§

impl<B> UnwindSafe for Muon<B>
where <B as BackendTypes>::FloatElem: UnwindSafe,

Blanket Implementations§

§

impl<T> Adaptor<()> for T

§

fn adapt(&self)

Adapt the type to be passed to a metric.

Muon

Struct Muon Copy item path

§Important Notes

Trait Implementations§

impl<B> Clone for Muon<B> where B: Clone + Backend,

fn clone(&self) -> Muon<B>

fn clone_from(&mut self, source: &Self)

impl<B> SimpleOptimizer<B> for Muon<B> where B: Backend,

fn step<const D: usize>( &self, lr: f64, tensor: Tensor<B, D>, grad: Tensor<B, D>, state: Option<<Muon<B> as SimpleOptimizer<B>>::State<D>>, ) -> (Tensor<B, D>, Option<<Muon<B> as SimpleOptimizer<B>>::State<D>>)

§Algorithm

§Notes

§Panics

type State<const D: usize> = MuonState<B, D>

fn to_device<const D: usize>( state: <Muon<B> as SimpleOptimizer<B>>::State<D>, device: &<B as BackendTypes>::Device, ) -> <Muon<B> as SimpleOptimizer<B>>::State<D>

Auto Trait Implementations§

impl<B> Freeze for Muon<B> where <B as BackendTypes>::FloatElem: Freeze,

impl<B> RefUnwindSafe for Muon<B> where <B as BackendTypes>::FloatElem: RefUnwindSafe,

impl<B> Send for Muon<B>

impl<B> Sync for Muon<B>

impl<B> Unpin for Muon<B> where <B as BackendTypes>::FloatElem: Unpin,

impl<B> UnwindSafe for Muon<B> where <B as BackendTypes>::FloatElem: UnwindSafe,

Blanket Implementations§

impl<T> Adaptor<()> for T

fn adapt(&self)

impl<T> Any for T where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

impl<T> Borrow<T> for T where T: ?Sized,

fn borrow(&self) -> &T

impl<T> BorrowMut<T> for T where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

impl<C> CloneExpand for C where C: Clone,

fn __expand_clone_method(&self, _scope: &mut Scope) -> C

impl<T> CloneToUninit for T where T: Clone,

unsafe fn clone_to_uninit(&self, dest: *mut u8)

impl<T> Downcast<T> for T

fn downcast(&self) -> &T

impl<T> From<T> for T

fn from(t: T) -> T

impl<T> Instrument for T

fn instrument(self, span: Span) -> Instrumented<Self>

fn in_current_span(self) -> Instrumented<Self>

impl<T, U> Into<U> for T where U: From<T>,

fn into(self) -> U

impl<T> IntoComptime for T

fn comptime(self) -> Self

impl<T> IntoEither for T

fn into_either(self, into_left: bool) -> Either<Self, Self>

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self> where F: FnOnce(&Self) -> bool,

impl<T> Pointable for T

const ALIGN: usize

type Init = T

unsafe fn init(init: <T as Pointable>::Init) -> usize

unsafe fn deref<'a>(ptr: usize) -> &'a T

unsafe fn deref_mut<'a>(ptr: usize) -> &'a mut T

unsafe fn drop(ptr: usize)

impl<T> ToOwned for T where T: Clone,

type Owned = T

fn to_owned(&self) -> T

fn clone_into(&self, target: &mut T)

impl<T, U> TryFrom<U> for T where U: Into<T>,

type Error = Infallible

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

impl<T, U> TryInto<U> for T where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

impl<T> TuneInputs for T where T: Clone + Send + Sync + 'static,

type At<'a> = T

impl<T> Upcast<T> for T

fn upcast(&self) -> Option<&T>

impl<V, T> VZip<V> for T where V: MultiLane<T>,

fn vzip(self) -> V

impl<T> WithSubscriber for T

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self> where S: Into<Dispatch>,

fn with_current_subscriber(self) -> WithDispatch<Self>

impl<T> WasmNotSend for T where T: Send,

impl<T> WasmNotSendSync for T where T: WasmNotSend + WasmNotSync,

impl<T> WasmNotSync for T where T: Sync,

Struct Muon

impl<B> Clone for Muon<B>
where B: Clone + Backend,

impl<B> SimpleOptimizer<B> for Muon<B>
where B: Backend,

impl<B> Freeze for Muon<B>
where <B as BackendTypes>::FloatElem: Freeze,

impl<B> RefUnwindSafe for Muon<B>
where <B as BackendTypes>::FloatElem: RefUnwindSafe,

impl<B> Unpin for Muon<B>
where <B as BackendTypes>::FloatElem: Unpin,

impl<B> UnwindSafe for Muon<B>
where <B as BackendTypes>::FloatElem: UnwindSafe,

impl<T> Any for T
where T: 'static + ?Sized,

impl<T> Borrow<T> for T
where T: ?Sized,

impl<T> BorrowMut<T> for T
where T: ?Sized,

impl<C> CloneExpand for C
where C: Clone,

impl<T> CloneToUninit for T
where T: Clone,

impl<T, U> Into<U> for T
where U: From<T>,

fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where F: FnOnce(&Self) -> bool,

impl<T> ToOwned for T
where T: Clone,

impl<T, U> TryFrom<U> for T
where U: Into<T>,

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

impl<T> TuneInputs for T
where T: Clone + Send + Sync + 'static,

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

impl<T> WasmNotSend for T
where T: Send,

impl<T> WasmNotSendSync for T
where T: WasmNotSend + WasmNotSync,

impl<T> WasmNotSync for T
where T: Sync,