# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import math
from dataclasses import dataclass, field
from typing import Literal, Optional

import torch
from megatron.core import parallel_state

from nemo.utils.import_utils import safe_import

if torch.cuda.is_available():
    bitsandbytes, HAVE_BNB = safe_import("bitsandbytes")
else:
    bitsandbytes = None
    HAVE_BNB = False

import torch.nn.functional as F
from torch import nn

from nemo.utils.import_utils import safe_import_from

te, HAVE_TE = safe_import_from("transformer_engine", "pytorch")

from nemo.collections.llm.peft.module_matcher import ModuleMatcher
from nemo.collections.llm.peft.utils import get_adapter_attributes_from_linear, is_expert_linear
from nemo.lightning.pytorch.callbacks.peft import PEFT, AdapterWrapper
from nemo.utils import logging
from nemo.utils.te_utils import te_version

class LoRALinear(AdapterWrapper):
    """An adapter wrapper that adds the output of the adapter to the output of the wrapped module.

    This class is designed to be used with LoRA (Low-Rank Adaptation) and similar techniques
    where the adapter's output is added to the main module's output. It extends the AdapterWrapper
    class to provide a specific implementation of the forward method.
    """

    def forward(
        self,
        x: torch.Tensor,
        *args,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        # pylint: disable=C0115,C0116
        linear_output, bias, layernorm_output = self.base_linear_forward(x, *args, **kwargs)
        adapter_output = self.adapter(layernorm_output.contiguous())
        adapter_output = adapter_output.reshape(linear_output.shape)
        return linear_output + adapter_output, bias
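

# Illustrative sketch (not part of the NeMo source): the residual-add pattern that
# LoRALinear.forward implements, written with plain tensors. The function name, weight
# shapes, rank, and scale below are hypothetical; LoRALinear itself delegates the base
# projection to `base_linear_forward` and the low-rank path to `self.adapter`.
def _lora_residual_add_sketch():
    batch, d_in, d_out, rank, scale = 2, 16, 32, 4, 1.0  # hypothetical sizes
    x = torch.randn(batch, d_in)
    w = torch.randn(d_out, d_in)      # frozen base weight
    lora_a = torch.randn(rank, d_in)  # trainable low-rank factor A
    lora_b = torch.zeros(d_out, rank)  # trainable low-rank factor B (zero at init)
    base_out = x @ w.t()              # base linear output
    adapter_out = scale * (x @ lora_a.t() @ lora_b.t())  # low-rank update
    # With B initialized to zero the adapter contributes nothing, so the wrapped
    # module initially reproduces the base module's output exactly.
    assert torch.allclose(base_out + adapter_out, base_out)
    return base_out + adapter_out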


# Fused LoRA requires Transformer Engine 2.7+
HAVE_TE_FUSED_LORA: bool = HAVE_TE and te_version() >= (2, 7)

if HAVE_TE_FUSED_LORA:

    class TEFusedLoRALinear(LoRALinear):
        """LoRA adapter wrapper using Transformer Engine operation fuser"""

        def __init__(self, to_wrap: nn.Module, adapter: nn.Module):
            super().__init__(to_wrap, adapter)
            self._fused_branches: Optional[tuple[te.ops.Sequential, te.ops.Sequential]] = None

        def _make_fused_branches(self) -> tuple[te.ops.Sequential, te.ops.Sequential]:
            """Construct fused modules for main and LoRA branches"""

            # Extract layer size and tensor parallel config
            kwargs = {
                "in_features": self.to_wrap.weight.size(1),
                "out_features": self.to_wrap.weight.size(0),
                "tensor_parallel_mode": None,
                "tensor_parallel_group": None,
                "sequence_parallel": False,
            }
            tensor_parallel_size = parallel_state.get_tensor_model_parallel_world_size()
            if tensor_parallel_size > 1:
                kwargs["tensor_parallel_group"] = parallel_state.get_tensor_model_parallel_group()
                if isinstance(self.to_wrap, (te.Linear, te.LayerNormLinear)):
                    kwargs["tensor_parallel_mode"] = self.to_wrap.parallel_mode
                    kwargs["sequence_parallel"] = self.to_wrap.sequence_parallel
                if kwargs["tensor_parallel_mode"] == "row":
                    kwargs["in_features"] *= tensor_parallel_size
                elif kwargs["tensor_parallel_mode"] == "column":
                    kwargs["out_features"] *= tensor_parallel_size

            # wgrad accumulation fusion
            accumulate_into_main_grad = False
            if isinstance(self.to_wrap, (te.Linear, te.LayerNormLinear)):
                accumulate_into_main_grad = self.to_wrap.fuse_wgrad_accumulation
            kwargs["accumulate_into_main_grad"] = accumulate_into_main_grad

            # Construct fused branches
            main_branch = self._make_main_branch(**kwargs)
            lora_branch = self._make_lora_branch(**kwargs)

            # Get submodule forward hooks
            forward_pre_hooks = []
            forward_post_hooks = []
            for submodule in self.modules():
                for hook in submodule._forward_pre_hooks.values():
                    forward_pre_hooks.append((submodule, hook))
                for hook in submodule._forward_hooks.values():
                    forward_post_hooks.append((submodule, hook))

            # Attempt to emulate submodule forward hooks if needed
            # Note: Assume hooks do not interact with submodule inputs
            # or outputs since they are internal to the op fuser.
            if forward_pre_hooks:

                def forward_pre_hook(module, *_) -> None:
                    for submodule, hook in forward_pre_hooks:
                        # Assume that hook does not interact with
                        # input
                        hook(submodule, None)

                main_branch.register_forward_pre_hook(forward_pre_hook)
            if forward_post_hooks:

                def forward_post_hook(module, *_) -> None:
                    for submodule, hook in forward_post_hooks:
                        # Assume that hook does not interact with
                        # input or output
                        hook(submodule, None, None)

                lora_branch.register_forward_hook(forward_post_hook)

            return main_branch, lora_branch

        def _make_main_branch(
            self,
            *,
            in_features: int,
            out_features: int,
            tensor_parallel_mode: Optional[str],
            tensor_parallel_group: Optional[torch.distributed.ProcessGroup],
            sequence_parallel: bool,
            accumulate_into_main_grad: bool,
        ) -> te.ops.Sequential:
            """Construct fused module for main branch (norm + fork + linear)"""

            # Check wrapped linear class
            if not isinstance(self.to_wrap, (te.Linear, te.LayerNormLinear, torch.nn.Linear)):
                raise ValueError(f"Unsupported class for wrapped linear ({self.to_wrap.__class__.__name__})")

            # Ops in main branch
            main_branch = te.ops.Sequential()

            # Norm op
            if isinstance(self.to_wrap, te.LayerNormLinear):
                norm_type = self.to_wrap.normalization
                kwargs = {
                    "eps": self.to_wrap.eps,
                    "device": "meta",
                    "dtype": self.to_wrap.layer_norm_weight.dtype,
                    "zero_centered_gamma": self.to_wrap.zero_centered_gamma,
                }
                op = None
                if norm_type == "LayerNorm":
                    op = te.ops.LayerNorm(in_features, **kwargs)
                    op.weight = self.to_wrap.layer_norm_weight
                    op.bias = self.to_wrap.layer_norm_bias
                elif norm_type == "RMSNorm":
                    op = te.ops.RMSNorm(in_features, **kwargs)
                    op.weight = self.to_wrap.layer_norm_weight
                else:
                    raise ValueError(f"Unsupported normalization ({norm_type})")
                main_branch.append(op)
                main_branch.append(te.ops.Quantize(forward=True, backward=False))

            # Fork to LoRA branch
            # Note: GEMM with beta=1 in backward pass
            main_branch.append(te.ops.MakeExtraOutput(in_place=True))

            # Linear op
            weight = self.to_wrap.weight
            bias = self.to_wrap.bias
            if isinstance(bias, torch.Tensor) and bias.numel() == 0:
                bias = None
            op = te.ops.Linear(
                in_features,
                out_features,
                bias=bias is not None,
                device="meta",
                dtype=weight.dtype,
                tensor_parallel_mode=tensor_parallel_mode,
                tensor_parallel_group=tensor_parallel_group,
                sequence_parallel=sequence_parallel,
                accumulate_into_main_grad=accumulate_into_main_grad,
            )
            op.weight = weight
            op.bias = bias
            main_branch.append(op)

            return main_branch

        def _make_lora_branch(
            self,
            *,
            in_features: int,
            out_features: int,
            tensor_parallel_mode: Optional[str],
            tensor_parallel_group: Optional[torch.distributed.ProcessGroup],
            sequence_parallel: bool,
            accumulate_into_main_grad: bool,
        ) -> te.ops.Sequential:
            """Construct fused module for LoRA branch (lora_a + lora_b + add)"""
            from nemo.collections.llm.peft.utils import ParallelLinearAdapter

            # Extract params from LoRA adapter
            lora_a_weight = None
            lora_b_weight = None
            lora_dim = None
            dropout = 0
            dropout_position = None
            scale = None
            if isinstance(self.adapter, (LinearAdapter, TELinearAdapter)):
                lora_a_weight = self.adapter.lora_a.weight
                lora_b_weight = self.adapter.lora_b.weight
                lora_dim = lora_b_weight.size(1)
                dropout = self.adapter.dropout.p
                dropout_position = self.adapter.dropout_position
                scale = self.adapter.scale
            elif isinstance(self.adapter, ParallelLinearAdapter):
                lora_a_weight = self.adapter.linear_in.weight
                lora_b_weight = self.adapter.linear_out.weight
                lora_dim = lora_b_weight.size(1)
                if self.adapter.dropout is not None:
                    dropout = self.adapter.dropout.p
                dropout_position = self.adapter.dropout_position
                scale = self.adapter.alpha / self.adapter.dim
            else:
                raise ValueError(f"Unsupported class for LoRA adapter ({self.adapter.__class__.__name__})")

            # Ops in LoRA branch
            lora_branch = te.ops.Sequential()

            # LoRA pre-processing
            if dropout > 0 and dropout_position == "pre":
                lora_branch.append(te.ops.Dropout(dropout))

            # LoRA A linear op
            op = te.ops.Linear(
                in_features,
                lora_dim,
                bias=False,
                device="meta",
                dtype=lora_a_weight.dtype,
                tensor_parallel_mode=tensor_parallel_mode,
                tensor_parallel_group=tensor_parallel_group,
                sequence_parallel=sequence_parallel,
                accumulate_into_main_grad=accumulate_into_main_grad,
            )
            op.weight = lora_a_weight
            lora_branch.append(op)

            # LoRA B linear op
            if tensor_parallel_mode == "column":
                # All-gather along dim -1
                raise NotImplementedError("Column tensor parallelism is not yet supported")
            op = te.ops.Linear(
                lora_dim,
                out_features,
                bias=False,
                device="meta",
                dtype=lora_b_weight.dtype,
                tensor_parallel_mode=None if tensor_parallel_mode is None else "column",
                tensor_parallel_group=tensor_parallel_group,
                sequence_parallel=False,
                accumulate_into_main_grad=accumulate_into_main_grad,
            )
            op.weight = lora_b_weight
            lora_branch.append(op)

            # LoRA post-processing
            if scale != 1:
                lora_branch.append(te.ops.ConstantScale(scale))
            if dropout > 0 and dropout_position == "post":
                lora_branch.append(te.ops.Dropout(dropout))
            if tensor_parallel_mode == "row":
                # All-gather along dim -1
                raise NotImplementedError("Row tensor parallelism is not yet supported")

            # Add with main branch
            # Note: GEMM with beta=1 in forward pass
            lora_branch.append(te.ops.AddExtraInput(in_place=True))

            return lora_branch

        def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, None]:
            # pylint: disable=C0115,C0116

            # Construct fused impl if needed
            # Note: We initialize during the first forward pass in
            # case the params are modified after the constructor.
            # Note: The fused impl is stored in a tuple to avoid
            # registering submodules.
            if self._fused_branches is None:
                self._fused_branches = self._make_fused_branches()

            # Apply fused impl
            main_branch, lora_branch = self._fused_branches
            linear_output, linear_input = main_branch(x)
            with te.fp8_autocast(enabled=False):
                out = lora_branch(linear_input, linear_output)
            return out, None
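

# Illustrative sketch (not part of the NeMo source): the dataflow that the two fused
# branches above implement, restated with plain PyTorch tensors. The main branch returns
# both the linear output and the input it saw (the MakeExtraOutput fork); the LoRA branch
# consumes that input and accumulates its result onto the main output (the AddExtraInput).
# The function name and tensor sizes are hypothetical.
def _fused_branch_dataflow_sketch():
    d_in, d_out, rank, scale = 16, 32, 4, 1.0  # hypothetical sizes
    x = torch.randn(2, d_in)
    w = torch.randn(d_out, d_in)
    lora_a, lora_b = torch.randn(rank, d_in), torch.randn(d_out, rank)
    # "Main branch": base projection, plus the branch input exposed as an extra output
    linear_output, linear_input = x @ w.t(), x
    # "LoRA branch": low-rank update accumulated in place onto the main output
    linear_output += scale * (linear_input @ lora_a.t() @ lora_b.t())
    return linear_output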


if HAVE_TE:

    class TELinearAdapter(te.Linear):
        """
        TELinear + LoRA, maintains ckpt structure (i.e. Linear's weight/bias remain at the same FQN)

        The _init_adapter and forward methods provide the LoRA functionality. We want to be able to
        use them both inside TELinearAdapter and when monkey-patching existing modules, without
        repeating the same code, so _init_adapter takes the module to adapt as its first argument
        rather than relying on self.

        Args:
            orig_linear (nn.Module): the linear module to augment.
            dim (int): lora's dim in_features -> dim -> out_features.
            alpha (int): lora's scaling alpha.
            dropout (float): dropout prob (default: 0.0).
            dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
            lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
            lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
                are quantized weights (e.g. 4bit) needs to be specified explicitly.
        """

        def __init__(
            self,
            orig_linear,
            dim=8,
            alpha=32,
            dropout=0.0,
            dropout_position='post',
            lora_A_init_method='xavier',
            lora_dtype=None,
        ):
            assert orig_linear.__class__ == te.Linear
            # TELinear has bias set to empty tensor
            has_bias = orig_linear.bias is not None and orig_linear.bias.shape[0] != 0
            super(TELinearAdapter, self).__init__(
                in_features=orig_linear.in_features,
                out_features=orig_linear.out_features,
                bias=has_bias,
                device=orig_linear.weight.device,
                params_dtype=orig_linear.weight.dtype,
            )
            # copy weights
            self.weight.data.copy_(orig_linear.weight.data)
            if has_bias:
                self.bias.data.copy_(orig_linear.bias.data)
            # initialize the adapter
            TELinearAdapter._init_adapter(
                self,
                dim=dim,
                alpha=alpha,
                dropout=dropout,
                dropout_position=dropout_position,
                lora_A_init_method=lora_A_init_method,
                lora_dtype=lora_dtype,
            )

        def _init_adapter(
            obj,
            dim=8,
            alpha=32,
            dropout=0.0,
            dropout_position='post',
            lora_A_init_method='xavier',
            lora_dtype=None,
        ):
            """Adds LoRA weights to obj. The obj is either a TELinearAdapter or an nn.Module (when
            monkey-patching).

            Args:
                obj (TELinearAdapter | nn.Module): input module to adapt.
                dim (int): lora's dim in_features -> dim -> out_features.
                alpha (int): lora's scaling alpha.
                dropout (float): dropout prob (default: 0.0).
                dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
                lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
                lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
                    are quantized weights (e.g. 4bit) needs to be specified explicitly.
            """
            obj.dim = dim
            obj.scale = alpha / dim

            # Freezer
            device = obj.weight.device
            obj.weight.requires_grad = False
            if obj.bias is not None:
                obj.bias.requires_grad = False

            in_features = obj.in_features
            out_features = obj.out_features
            dtype = lora_dtype or obj.weight.dtype

            obj.lora_a = nn.Linear(in_features, dim, bias=False, dtype=dtype, device=device)
            obj.lora_b = nn.Linear(dim, out_features, bias=False, dtype=dtype, device=device)
            if lora_A_init_method == 'xavier':
                torch.nn.init.uniform_(obj.lora_a.weight.data)
            else:
                nn.init.kaiming_uniform_(obj.lora_a.weight.data, a=math.sqrt(5))
            obj.lora_b.weight.data.fill_(0)

            obj.dropout = nn.Dropout(p=dropout)
            assert dropout_position in ['pre', 'post'], dropout_position
            obj.dropout_position = dropout_position

        def forward(self, x):
            # pylint: disable=C0115,C0116
            res = super(TELinearAdapter, self).forward(x)
            if self.dropout_position == 'pre':
                x = self.dropout(x)
            # LoRA fwd is performed in original precision regardless of FP8 enabled
            lora_res = self.lora_b(self.lora_a(x))
            lora_res = lora_res * self.scale
            if self.dropout_position == 'post':
                lora_res = self.dropout(lora_res)
            return res + lora_res


class LinearAdapter(nn.Linear):
    """
    Linear + LoRA, maintains ckpt structure (i.e. Linear's weight/bias remain at the same FQN)

    The _init_adapter and forward methods provide the LoRA functionality. We want to be able to
    use them both inside LinearAdapter and when monkey-patching existing modules, without
    repeating the same code, so _init_adapter takes the module to adapt as its first argument
    rather than relying on self.

    Args:
        orig_linear (nn.Module): the linear module to augment.
        dim (int): lora's dim in_features -> dim -> out_features.
        alpha (int): lora's scaling alpha.
        dropout (float): dropout prob (default: 0.0).
        dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
        lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
        lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
            are quantized weights (e.g. 4bit) needs to be specified explicitly.
    """

    def __init__(
        self,
        orig_linear,
        dim=8,
        alpha=32,
        dropout=0.0,
        dropout_position='post',
        lora_A_init_method='xavier',
        lora_dtype=None,
    ):
        assert isinstance(orig_linear, nn.Linear)
        super(LinearAdapter, self).__init__(
            in_features=orig_linear.in_features,
            out_features=orig_linear.out_features,
            bias=orig_linear.bias is not None,
            device=orig_linear.weight.device,
            dtype=orig_linear.weight.dtype,
        )
        # copy weights
        self.weight.data.copy_(orig_linear.weight.data)
        if orig_linear.bias is not None:
            self.bias.data.copy_(orig_linear.bias.data)
        # initialize the adapter
        LinearAdapter._init_adapter(
            self,
            dim=dim,
            alpha=alpha,
            dropout=dropout,
            dropout_position=dropout_position,
            lora_A_init_method=lora_A_init_method,
            lora_dtype=lora_dtype,
        )

    def _init_adapter(
        obj,
        dim=8,
        alpha=32,
        dropout=0.0,
        dropout_position='post',
        lora_A_init_method='xavier',
        lora_dtype=None,
    ):
        """Adds LoRA weights to obj. The obj is either a LinearAdapter or an nn.Module (when
        monkey-patching).

        Args:
            obj (LinearAdapter | nn.Module): input module to adapt.
            dim (int): lora's dim in_features -> dim -> out_features.
            alpha (int): lora's scaling alpha.
            dropout (float): dropout prob (default: 0.0).
            dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
            lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
            lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
                are quantized weights (e.g. 4bit) needs to be specified explicitly.
        """
        obj.dim = dim
        obj.scale = alpha / dim

        # Freezer
        device = obj.weight.device
        obj.weight.requires_grad = False
        if obj.bias is not None:
            obj.bias.requires_grad = False

        in_features = obj.in_features
        out_features = obj.out_features
        dtype = lora_dtype or obj.weight.dtype

        obj.lora_a = nn.Linear(in_features, dim, bias=False, dtype=dtype, device=device)
        obj.lora_b = nn.Linear(dim, out_features, bias=False, dtype=dtype, device=device)
        if lora_A_init_method == 'xavier':
            torch.nn.init.uniform_(obj.lora_a.weight.data)
        else:
            nn.init.kaiming_uniform_(obj.lora_a.weight.data, a=math.sqrt(5))
        obj.lora_b.weight.data.fill_(0)

        obj.dropout = nn.Dropout(p=dropout)
        assert dropout_position in ['pre', 'post'], dropout_position
        obj.dropout_position = dropout_position

    def forward(self, x):
        # pylint: disable=C0115,C0116
        # If LinearAdapter is used to monkey-patch an nn.Linear module, we want to use nn.Linear's
        # forward in the case where it uses quantized weights. We store a reference to nn.Linear's
        # forward in the `super_fwd` attribute. If the attribute does not exist we do the usual linear.
        if (fwd := getattr(self, 'super_fwd', None)) is not None:
            assert fwd != self.forward
            res = fwd(x)
        else:
            res = F.linear(x, self.weight, self.bias)

        if self.dropout_position == 'pre':
            x = self.dropout(x)
        lora_res = self.lora_b(self.lora_a(x))
        lora_res = lora_res * self.scale
        if self.dropout_position == 'post':
            lora_res = self.dropout(lora_res)
        return res + lora_res
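

# Illustrative usage sketch (not part of the NeMo source): wrapping a plain nn.Linear in a
# LinearAdapter. The function name, layer sizes, and LoRA hyperparameters are hypothetical.
# Because lora_b is zero-initialized, the adapted layer reproduces the original layer's
# output before any fine-tuning, and only lora_a/lora_b require gradients.
def _linear_adapter_usage_sketch():
    base = nn.Linear(16, 32)
    adapted = LinearAdapter(base, dim=8, alpha=32, dropout=0.0)
    x = torch.randn(4, 16)
    assert torch.allclose(adapted(x), base(x), atol=1e-6)
    trainable = [n for n, p in adapted.named_parameters() if p.requires_grad]
    assert sorted(trainable) == ['lora_a.weight', 'lora_b.weight']
    return adapted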


def patch_linear_module(
    orig_linear,
    dim=8,
    alpha=32,
    dropout=0.0,
    dropout_position='post',
    lora_A_init_method='xavier',
    lora_dtype=None,
):
    """Monkey-patches an nn.Linear (orig_linear param) to be a LinearAdapter; for all purposes
    think of this function as replacing an nn.Linear with a LinearAdapter defined above.

    The orig_linear might not contain valid weights, for example, when the given orig_linear was
    initialized within a context manager that uses a "meta" device. In that case we cannot copy
    the weight/bias from the orig_linear to the LinearAdapter, since those have not been allocated.
    To circumvent this scenario, LinearAdapter's additional functionality (_init_adapter, forward)
    is written so it can be applied to an existing module, which lets us use it for patching or
    when allocating a new LinearAdapter object.

    Args:
        orig_linear (nn.Linear): the module we add adapter to.
        dim (int, optional): LoRA dim. Defaults to 8.
        alpha (int, optional): LoRA alpha scale. Defaults to 32.
        dropout (float, optional): dropout prob. Defaults to 0.0.
        dropout_position (str, optional): location to apply dropout wrt lora.
            Defaults to 'post' (choices: 'pre', 'post').
        lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'.
        lora_dtype (torch.dtype, optional): LoRA weights' dtype. By default will use orig_linear's dtype
            but orig_linear might use a non-trainable dtype (e.g., 4bit), in which case the user must
            specify the dtype manually. Defaults to None.

    Returns:
        (nn.Module): the monkey-patched (nn.Linear + LoRA) nn.Module
    """
    assert isinstance(orig_linear, nn.Linear) or orig_linear.__class__ == te.Linear
    assert not hasattr(orig_linear, 'super_fwd'), orig_linear.super_fwd

    if isinstance(orig_linear, nn.Linear):
        LinearAdapter._init_adapter(orig_linear, dim, alpha, dropout, dropout_position, lora_A_init_method, lora_dtype)
        cls = orig_linear.__class__
        new_cls = type('PatchedLinearAdapter', (LinearAdapter, cls), {})
    elif orig_linear.__class__ == te.Linear:
        TELinearAdapter._init_adapter(
            orig_linear, dim, alpha, dropout, dropout_position, lora_A_init_method, lora_dtype
        )
        cls = orig_linear.__class__
        new_cls = type('PatchedTELinearAdapter', (TELinearAdapter, cls), {})
    else:
        raise NotImplementedError("Expected isinstance(orig_linear, (nn.Linear, te.Linear))")

    # If the model uses quantized weights, we want to use orig_linear's forward
    if (
        getattr(orig_linear, 'quant_state', None) is not None
        and orig_linear.quant_state.__class__ == bitsandbytes.functional.QuantState
    ):
        orig_linear.super_fwd = orig_linear.forward

    orig_linear.__class__ = new_cls
    return orig_linear
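

# Illustrative usage sketch (not part of the NeMo source): monkey-patching a module in place
# with patch_linear_module. Unlike constructing a new LinearAdapter, the original object (and
# therefore its weights, device placement, and FQNs in any state_dict) is preserved; only its
# class and LoRA attributes change. The function name and layer sizes are hypothetical.
def _patch_linear_module_usage_sketch():
    layer = nn.Linear(16, 32)
    patched = patch_linear_module(layer, dim=8, alpha=32)
    assert patched is layer                    # patched in place, same object
    assert isinstance(patched, LinearAdapter)  # now follows the LoRA forward
    assert 'weight' in patched.state_dict()    # base weight keeps its FQN
    return patched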


@dataclass
class LoRA(PEFT, ModuleMatcher):
    """
    Implements the LoRA (Low-Rank Adaptation) module for parameter-efficient fine-tuning.

    LoRA uses a low-rank projection to adapt the weights of a pre-trained model to a new downstream task.
    This class facilitates the application of LoRA to specific modules within the model architecture.

    Args:
        target_modules (list[str], optional): A list of module names to apply LoRA to.
            Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
            - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections
              in self-attention.
            - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention.
            - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP.
            - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP.
            Target modules can also contain wildcards. For example, you can specify
            target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv
            on the first two layers.
        exclude_modules (list[str], optional): A list of module names not to apply LoRA to. It will
            match all nn.Linear & nn.Linear-adjacent modules whose name does not match any string in
            exclude_modules. If used, it requires target_modules to be an empty list or None.
        dim (int): Dimension of the low-rank projection space. Defaults to 32.
        alpha (int): Weighting factor for the low-rank projection. Defaults to 32.
        dropout (float): Dropout rate for the low-rank projection. Defaults to 0.0.
        dropout_position (Literal['pre', 'post'], optional): Position for applying dropout.
            Can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'.
        a2a_experimental (bool): Enables the experimental All-to-All (A2A) communication strategy. Defaults to False.
        dropout_recompute (bool): Enables dropout recompute using Thunder JIT compilation. When True,
            applies thunder.jit() to the dropout layer for memory-efficient training by recomputing
            dropout activations during the backward pass instead of storing them.
        lora_dtype (torch.dtype): Parameter data type for LoRA weights. Default None (will use model's dtype).

    Example:
    --------
        >>> from nemo.collections import llm
        >>> lora = llm.peft.LoRA(target_modules=['linear_qkv', 'linear_proj'], dim=32)
        >>> model = llm.Mistral7BModel(model_transform=lora)
        >>> # (set up trainer and data)
        >>> trainer.fit(model, data)

    References:
    -----------
        Hu, E. J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2021).
        LoRA: Low-Rank Adaptation of Large Language Models. arXiv preprint arXiv:2106.09685.
        https://arxiv.org/abs/2106.09685
    """

    target_modules: list[str] = field(
        default_factory=lambda: ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']
    )
    dim: int = 32
    alpha: int = 32
    dropout: float = 0.0
    dropout_position: Literal['pre', 'post'] = 'pre'
    lora_A_init_method: str = "xavier"
    lora_B_init_method: str = "zero"
    a2a_experimental: bool = False
    lora_dtype: torch.dtype = None
    dropout_recompute: bool = False

    def transform(self, m: nn.Module, name=None, prefix=None):
        """
        Applies LoRA to a specific module within the model architecture.

        Args:
            m (nn.Module): The module to apply LoRA to.
            name (str, optional): Name of the module (if applicable). Defaults to None.
            prefix (str, optional): Prefix for the module name (if applicable). Defaults to None.

        Returns:
            nn.Module: The modified module with LoRA applied, or the original module if not a target.
        """
        from nemo.collections.llm.peft.utils import ParallelLinearAdapter

        if (ans := self.match(m, name, prefix)) is not None:
            (match, full_name) = ans
            if isinstance(m, nn.Linear) or m.__class__ == te.Linear:
                # Will use the `patch_linear_module` function if:
                # - is FSDP v1
                # - is DTensor (has _local_tensor attribute)
                # - has quant_state attribute
                if (
                    self._add_via_setattr
                    or hasattr(m.weight.data, '_local_tensor')
                    or (
                        getattr(m, 'quant_state', None) is not None
                        and m.quant_state.__class__ == bitsandbytes.functional.QuantState
                    )
                ):
                    lora_cls = patch_linear_module
                elif HAVE_TE and m.__class__ == te.Linear:
                    lora_cls = TELinearAdapter
                else:
                    lora_cls = LinearAdapter

                # Construct LoRA module
                return lora_cls(
                    m,
                    dim=self.dim,
                    alpha=self.alpha,
                    dropout=self.dropout,
                    lora_A_init_method=self.lora_A_init_method,
                    lora_dtype=self.lora_dtype,
                )

            input_is_parallel, in_features, out_features, disable_sp_comm, base_linear_is_parallel = (
                get_adapter_attributes_from_linear(m)
            )
            enable_op_fuser = (
                HAVE_TE_FUSED_LORA
                and hasattr(m, "config")
                and getattr(m.config, "use_transformer_engine_op_fuser", False)
                # TP not yet supported
                and parallel_state.get_tensor_model_parallel_world_size() == 1
            )
            logging.info(f"Adding lora to: {full_name}")
            adapter = ParallelLinearAdapter(
                in_features,
                out_features,
                self.dim,
                base_linear_name=full_name,
                activation='identity',
                norm_type=None,
                column_init_method=self.lora_A_init_method,
                row_init_method=self.lora_B_init_method,
                gather_output=False,
                input_is_parallel=input_is_parallel,
                dropout=self.dropout,
                dropout_position=self.dropout_position,
                model_parallel_config=getattr(m, "config", None),
                alpha=self.alpha,
                is_expert=is_expert_linear(full_name),
                a2a_experimental=self.a2a_experimental,
                disable_sequence_parallel_comm=disable_sp_comm,
                dropout_recompute=self.dropout_recompute,
                base_linear_is_parallel=base_linear_is_parallel,
            )
            if enable_op_fuser:
                return TEFusedLoRALinear(m, adapter)
            else:
                return LoRALinear(m, adapter)
        return m


class LoRAMerge(PEFT):
    """
    Implements the LoRA weight merge for parameter-efficient fine-tuning.

    Example:
    --------
        >>> from nemo.collections.llm.peft.lora import LoRAMerge
        >>> lora_merge = LoRAMerge()
        >>> merged_model = lora_merge(trainer.strategy.megatron_parallel)
    """

    def transform(self, m: nn.Module, name=None, prefix=None):
        """
        Merges the LoRA adapter with the base model weights.

        Args:
            m (nn.Module): The module to apply LoRA merge to.
            name (str, optional): Name of the module to merge. Defaults to None.
            prefix (str, optional): Prefix for the module name. Defaults to None.

        Returns:
            nn.Module: The modified module with the LoRA adapter merged into the base model weights.
        """
        if not isinstance(m, LoRALinear):
            return m
        logging.info(f'merging {(prefix if prefix else "") + "." + (name if name else "")}')
        lora_weight = m.adapter.alpha / m.adapter.dim * m.adapter.linear_out.weight @ m.adapter.linear_in.weight
        if hasattr(m.to_wrap, "weight"):
            base_weight = m.to_wrap.weight
            merged_weight = base_weight + lora_weight.to(base_weight.device)
            m.to_wrap.weight.data = merged_weight
        else:  # TE Grouped Linear
            for i in range(m.to_wrap.num_gemms):
                base_weight = getattr(m.to_wrap, f"weight{i}")
                merged_weight = base_weight + lora_weight.to(base_weight.device)
                getattr(m.to_wrap, f"weight{i}").data = merged_weight
        return m
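

# Illustrative sketch (not part of the NeMo source): the weight-merge identity that
# LoRAMerge.transform applies. Folding (alpha / dim) * B @ A into the base weight yields a
# single linear layer whose output matches the base output plus the scaled low-rank update.
# The function name and matrix sizes are hypothetical.
def _lora_merge_identity_sketch():
    d_in, d_out, dim, alpha = 16, 32, 8, 32
    x = torch.randn(4, d_in)
    w = torch.randn(d_out, d_in)  # base weight
    a = torch.randn(dim, d_in)    # linear_in / lora_a weight
    b = torch.randn(d_out, dim)   # linear_out / lora_b weight
    merged_w = w + (alpha / dim) * b @ a
    unmerged_out = x @ w.t() + (alpha / dim) * (x @ a.t() @ b.t())
    merged_out = x @ merged_w.t()
    assert torch.allclose(unmerged_out, merged_out, atol=1e-4)
    return merged_w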