PLDR-LLM-v52-110M-1 / modeling_pldrllm.py

Added model files.

ff9ee90 4 months ago

78.9 kB

	# coding=utf-8
	# Copyright 2025 Fromthesky Research Labs, LLC. All rights reserved.
	# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
	#
	# This code uses the Llama model implementation by Eleuther AI
	# and Huggingface teams in this library as a starting point and implements
	# the PLDR-LLM (Large Language Model from Power Law Decoder Representations)
	# architecture based on its implementation by the Fromthesky Research Labs team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from typing import Callable, Optional, Union

	import torch
	from torch import nn
	import torch.nn.functional as F

	from transformers.activations import ACT2FN
	from transformers.cache_utils import Cache, DynamicCache, StaticCache
	from transformers.generation import GenerationMixin
	from transformers.masking_utils import create_causal_mask
	from transformers.modeling_layers import GradientCheckpointingLayer

	from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
	from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
	from transformers.processing_utils import Unpack
	from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
	from .configuration_pldrllm import PldrllmConfig

	from dataclasses import dataclass
	from transformers.utils import ModelOutput

	logger = logging.get_logger(__name__)

	################## PLDRLLM POWER LAW GRAPH ATTENTION IMPLEMENTATION ########################################

	'''
	Power law attention implementation for PLDR-LLM with KV-cache and G-cache.
	'''

	class PlgaLayer(nn.Module):
	    '''
	    Power law graph attention (PLGA) layer.

	    Turns a metric tensor A into the energy-curvature tensor G_LM, projects the
	    query through G_LM and then runs scaled dot-product attention against the
	    keys and values. When caching is enabled, the per-layer tensors (A, A_LM, G_LM)
	    are written once into the G-cache and read back on later calls.
	    '''

	    def __init__(self, config:PldrllmConfig,
	                 F_hidden:int,
	                 F_heads:int,
	                 layer_idx:int,
	                 device=None,
	                 **kwargs)->None:
	        '''
	        Args:
	            config: model configuration; `custom_G_type` and `attention_dropout` are read here.
	            F_hidden: side length of the per-head square weights. For multi-head plga this is head_dim.
	            F_heads: number of attention heads.
	            layer_idx: index of the decoder layer, used to address the G-cache.
	            device: device (cpu or gpu) on which the weights are allocated.
	        '''
	        super().__init__(**kwargs)
	        self.F_hidden = F_hidden
	        self.F_heads = F_heads
	        self.layer_idx = layer_idx
	        self.device = device
	        self.config = config
	        self.is_causal = True
	        self.custom_G_type = config.custom_G_type
	        self.attention_dropout = config.attention_dropout

	        # None means the weights are created with the default dtype (config.torch_dtype).
	        self.wdtype = None

	        if self.custom_G_type is not None:
	            # With a custom G the tensors are read from past_G_values in cg_align_one,
	            # so no learnable weights are created.
	            for weight_name in ("Wlst", "blst", "pwlst", "alst", "balst"):
	                setattr(self, weight_name, None)
	        else:
	            self.build_weights()

	    def cg_align_one(self, Hin:torch.Tensor,
	                     Hk:torch.Tensor,
	                     Hv:torch.Tensor,
	                     A:torch.Tensor,
	                     a_vec:Optional[torch.Tensor],
	                     ba:Optional[torch.Tensor],
	                     W:Optional[torch.Tensor],
	                     b:Optional[torch.Tensor],
	                     pw:Optional[torch.Tensor],
	                     past_G_values: Optional[torch.Tensor],
	                     past_G_values_status: Optional[torch.BoolTensor]=None,
	                     mask:Optional[torch.Tensor]=None,
	                     use_cache: Optional[bool]=None,
	                     **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
	        '''
	        Alignment model that computes the attention output and attention weights.

	        Args:
	            Hin: query.
	            Hk: key.
	            Hv: value.
	            A: metric tensor instance.
	            a_vec: learned coupling coefficients.
	            ba: bias for the coupling coefficients.
	            W: weights applied on the metric tensor before the iSwiGLU activation.
	            b: bias applied on the metric tensor before the iSwiGLU activation.
	            pw: learned power exponents applied on the metric tensor.
	            past_G_values: G-cache, indexed as [layer_idx, slot] with slot 0 = A, 1 = A_LM, 2 = G_LM.
	            past_G_values_status: per-layer flags telling whether the G-cache entry is already filled.
	            mask: padding or lookahead mask.
	            use_cache: whether the G-cache is read and written.
	        Returns:
	            Hout: attention output.
	            A tuple of:
	                A: metric tensor as output of the residual metric learner layer
	                AW: metric tensor after the activation is applied, A_LM
	                pw: learned power exponents
	                a_vec: learned coupling coefficients for the energy-curvature tensor
	                ba: bias for the energy-curvature tensor
	                avAp: energy-curvature tensor, G_LM
	                E: attention weights as returned by the attention interface
	        '''
	        # The custom-G test comes first so that the status flags are only looked up
	        # when no custom G is configured and caching is on.
	        read_from_cache = self.custom_G_type is not None or (
	            use_cache and past_G_values_status[self.layer_idx]
	        )

	        if read_from_cache:
	            AW = past_G_values[self.layer_idx, 1]
	            avAp = past_G_values[self.layer_idx, 2]
	        else:
	            # The small offset keeps the activated metric tensor strictly positive.
	            AW = iSwiGLU(torch.matmul(W, A) + b) + 1e-9

	            # Energy-curvature tensor, [batch_size, num_head, depth, depth].
	            avAp = torch.matmul(a_vec, torch.pow(AW, pw)) + ba

	            if use_cache:
	                # Filled only once; the status flag prevents recomputation afterwards.
	                cached_batch = past_G_values.size()[2]
	                past_G_values[self.layer_idx] = torch.stack(
	                    [item[:cached_batch, :, :, :] for item in (A, AW, avAp)],
	                    dim=0,
	                )  # [3, batch_size, num_head, depth, depth]
	                past_G_values_status[self.layer_idx] = True

	        # Query projected through G_LM, [batch_size, num_head, seq_lenq, depth].
	        projected_query = torch.matmul(Hin, avAp)

	        # Attention is scaled by the inverse square root of depth.
	        scaling = 1 / torch.sqrt(torch.tensor(self.F_hidden).to(Hin.dtype))

	        if self.config._attn_implementation == "eager":
	            attention_interface: Callable = eager_attention_forward
	        else:
	            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

	        Hout, E = attention_interface(
	            self,
	            query=projected_query.to(dtype=Hk.dtype),
	            key=Hk,
	            value=Hv,
	            attention_mask=mask,
	            dropout=self.attention_dropout if self.training else 0.0,
	            scaling=scaling,
	            **kwargs
	        )

	        return Hout, (A, AW, pw, a_vec, ba, avAp, E)

	    def cg_align_head(self, Hin:torch.Tensor,
	                      Hk:torch.Tensor,
	                      Hv:torch.Tensor,
	                      A:torch.Tensor,
	                      mask:Optional[torch.Tensor]=None,
	                      past_G_values: Optional[torch.Tensor]=None,
	                      past_G_values_status: Optional[torch.BoolTensor]=None,
	                      use_cache: Optional[bool]=None,
	                      **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
	        '''
	        Runs cg_align_one with this layer's own weights (Wlst, blst, pwlst, alst, balst).
	        '''
	        return self.cg_align_one(Hin=Hin, Hk=Hk, Hv=Hv, A=A,
	                                 a_vec=self.alst,
	                                 ba=self.balst,
	                                 W=self.Wlst,
	                                 b=self.blst,
	                                 pw=self.pwlst,
	                                 mask=mask,
	                                 past_G_values=past_G_values,
	                                 past_G_values_status=past_G_values_status,
	                                 use_cache=use_cache,
	                                 **kwargs)

	    def build_weights(self)->None:
	        '''
	        Allocates the learnable parameters of the layer, each of shape [num_heads, depth, depth]:
	            Wlst: weights to apply on the metric tensor.
	            blst: bias to apply on the metric tensor.
	            pwlst: power exponent weights for the potential tensor.
	            alst: coupling coefficients for the energy-curvature (G) tensor.
	            balst: bias for the energy-curvature tensor.
	        The storage is uninitialized (torch.empty); no values are assigned here.
	        '''
	        shape = [self.F_heads, self.F_hidden, self.F_hidden]

	        # Registration order matters for the parameter listing, keep it fixed.
	        for weight_name in ("Wlst", "blst", "pwlst", "alst", "balst"):
	            storage = torch.empty(shape, dtype=self.wdtype, device=self.device)
	            setattr(self, weight_name, nn.Parameter(storage, requires_grad=True))

	    def forward(self, inputs:tuple[torch.Tensor,...],
	                past_G_values: Optional[torch.Tensor]=None,
	                past_G_values_status: Optional[torch.BoolTensor]=None,
	                use_cache:Optional[bool]=False,
	                **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
	        '''
	        Executes the forward propagation.

	        `inputs` is unpacked in this order:
	            inputs[0] = query = Hin
	            inputs[1] = key = Hk
	            inputs[2] = value = Hv
	            inputs[3] = metric tensor = A
	            inputs[4] = mask
	        '''
	        Hin, Hk, Hv, A, mask = inputs
	        return self.cg_align_head(Hin=Hin, Hk=Hk, Hv=Hv, A=A, mask=mask,
	                                  past_G_values=past_G_values,
	                                  past_G_values_status=past_G_values_status,
	                                  use_cache=use_cache, **kwargs)

	def eager_attention_forward(
	    module: nn.Module,
	    query: torch.Tensor,
	    key: torch.Tensor,
	    value: torch.Tensor,
	    attention_mask: Optional[torch.Tensor],
	    scaling: float,
	    dropout: float = 0.0,
	    **kwargs:Unpack[TransformersKwargs],
	)->tuple[torch.Tensor, torch.Tensor]:
	    '''
	    Plain (non-fused) scaled dot-product attention.

	    Args:
	        module: owning module; only its `training` flag is read (for dropout).
	        query: [batch_size, num_head, seq_lenq, depth]
	        key: [batch_size, num_head, seq_lenk, depth]
	        value: multiplied by the attention weights on its last two dimensions.
	        attention_mask: additive mask, cropped on its last axis to the key length; may be None.
	        scaling: factor applied to the raw scores.
	        dropout: dropout probability for the attention weights.
	    Returns:
	        The attention output with its head and sequence axes swapped (made contiguous),
	        and the attention weights of shape [batch_size, num_head, seq_lenq, seq_lenk].
	    '''
	    # [batch_size, num_head, seq_lenq, seq_lenk]
	    scores = torch.matmul(query, key.permute(0, 1, 3, 2)) * scaling

	    if attention_mask is not None:
	        scores = scores + attention_mask[:, :, :, : key.shape[-2]]

	    # Softmax runs in float32 and is cast back to the query dtype.
	    probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
	    probs = F.dropout(probs, p=dropout, training=module.training)

	    context = torch.matmul(probs, value).permute(0, 2, 1, 3).contiguous()
	    return context, probs

	def iSwiGLU(x):
	    '''SwiGLU with identity weights and no bias: the input multiplied elementwise by silu(input).'''
	    return torch.mul(x, F.silu(x))

	################################### END OF PLDRLLM POWER LAW GRAPH ATTENTION IMPLEMENTATION ############################################

	#################################### PLDR-LLM MODEL IMPLEMENTATION ################################################################

	'''
	Model Implementation for Large Language Model from Power Law Decoder Representations with KV-cache and G-cache.
	'''

	class PldrllmAttention(nn.Module):
	    '''
	    Power law multi-head attention for PLDR-LLM.

	    Projects the inputs to queries, keys and values, applies rotary embeddings,
	    builds (or reads from the G-cache) the metric tensor A and delegates the
	    attention itself to a PlgaLayer.
	    '''

	    def __init__(self,config: PldrllmConfig,
	                 layer_idx:int,
	                 device=None,
	                 **kwargs)->None:
	        '''
	        Args:
	            config: model configuration.
	            layer_idx: index of the decoder layer, used to address the KV-cache and G-cache.
	            device: device on which the submodules are created.
	        '''
	        super().__init__(**kwargs)
	        self.num_heads = config.num_attention_heads
	        self.d_model = config.hidden_size
	        self.A_dff = config.A_dff
	        self.num_denseA = config.num_denseA
	        self.num_reslayerA = config.num_reslayerA
	        self.activation = ACT2FN[config.hidden_act]
	        self.max_seq_len = config.max_position_embeddings
	        self.layer_idx = layer_idx
	        self.device = device
	        self.attention_bias = config.attention_bias
	        self.custom_G_type = config.custom_G_type
	        self.layer_norm_eps = config.layer_norm_eps
	        self.glu_bias = config.glu_bias
	        self.reference_rope = config.reference_rope
	        self.wdtype = None

	        assert self.d_model % self.num_heads == 0
	        self.depth = config.head_dim

	        def square_projection() -> nn.Linear:
	            # All four projections share the same d_model -> d_model layout.
	            return nn.Linear(self.d_model, self.d_model,
	                             bias=self.attention_bias,
	                             device=self.device,
	                             dtype=self.wdtype)

	        # Submodules are created in a fixed order: wq, wk, wv, plgatt_layer, dense.
	        self.wq = square_projection()
	        self.wk = square_projection()
	        self.wv = square_projection()

	        self.plgatt_layer = PlgaLayer(config=config,
	                                      F_hidden=self.depth,
	                                      F_heads=self.num_heads,
	                                      layer_idx=self.layer_idx,
	                                      device=self.device)

	        self.dense = square_projection()

	        if self.custom_G_type is None:
	            # Residual layers that learn the metric tensor.
	            self.reslayerAs = nn.ModuleList([
	                ResLayerA(depth=self.depth,
	                          A_dff=self.A_dff,
	                          num_denseA=self.num_denseA,
	                          layer_norm_eps=self.layer_norm_eps,
	                          glu_bias=self.glu_bias,
	                          activation=self.activation,
	                          device=self.device,
	                          dtype=self.wdtype)
	                for _ in range(self.num_reslayerA)
	            ])

	            self.layernorm1 = nn.LayerNorm(self.depth, eps=self.layer_norm_eps,
	                                           device=self.device, dtype=self.wdtype)

	        if self.reference_rope:
	            # The reference rope keeps its initialization and forward in this module.
	            reference_rope = RotaryPositionalEmbeddings(dim=self.depth,
	                                                        max_seq_len=self.max_seq_len,
	                                                        base=config.rope_theta
	                                                        )
	            self.rotary_embedding = reference_rope.to(device=self.device, dtype=self.wdtype)

	    def split_heads(self, x, batch_size):
	        '''
	        Splits the last dimension into (num_heads, depth).

	        Returns a view of shape [batch_size, seq_len, num_heads, depth].
	        '''
	        return x.view(batch_size, -1, self.num_heads, self.depth)

	    def forward(self, inputs:tuple[torch.Tensor, ...],
	                position_embeddings:torch.Tensor,
	                position_ids: Optional[torch.LongTensor]=None,
	                cache_position:Optional[torch.LongTensor]=None,
	                past_G_values: Optional[torch.Tensor]=None,
	                past_G_values_status: Optional[torch.BoolTensor]=None,
	                past_key_values: Optional[Cache]=None,
	                use_cache:Optional[bool]=None,
	                **kwargs: Unpack[TransformersKwargs]
	                )->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
	        '''
	        Args:
	            inputs: (q, k, v, mask); q, k and v are [batch_size, seq_len, d_model].
	            position_embeddings: (cos, sin) pair; when None the reference rope module is used instead.
	            position_ids: positions handed to the reference rope module.
	            cache_position: forwarded to the KV-cache update (used by static caches).
	            past_G_values: G-cache; slot 0 of each layer entry holds the metric tensor A.
	            past_G_values_status: per-layer flags telling whether the G-cache entry is filled.
	            past_key_values: KV-cache, updated when use_cache is set.
	            use_cache: whether the KV-cache and G-cache are used.
	        Returns:
	            The attention output of shape [batch_size, seq_len, d_model] and the tuple
	            returned by the PlgaLayer.
	        '''
	        q, k, v, mask = inputs
	        batch_size = q.size()[0]

	        # Project, then split into heads: [batch_size, seq_len, num_heads, depth].
	        q = self.split_heads(self.wq(q), batch_size)
	        k = self.split_heads(self.wk(k), batch_size)
	        v = self.split_heads(self.wv(v), batch_size)

	        if position_embeddings is None:
	            q = self.rotary_embedding(q, input_pos=position_ids)
	            k = self.rotary_embedding(k, input_pos=position_ids)
	        else:
	            cos, sin = position_embeddings
	            q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, unsqueeze_dim=2)

	        # [batch_size, num_heads, seq_len, depth]
	        q, k, v = (item.permute(0, 2, 1, 3) for item in (q, k, v))

	        # The custom-G test comes first so that the status flags are only looked up
	        # when no custom G is configured and caching is on.
	        read_from_cache = self.custom_G_type is not None or (
	            use_cache and past_G_values_status[self.layer_idx]
	        )

	        if read_from_cache:
	            A = past_G_values[self.layer_idx, 0]
	        else:
	            # Density matrix from linear self attention, [batch_size, num_head, depth, depth].
	            A = self.layernorm1(torch.matmul(q.permute(0, 1, 3, 2), q))

	            # Deep residual network that learns the metric tensor.
	            for res_layer in self.reslayerAs:
	                A = res_layer([A])

	        if use_cache:
	            # The cache position is needed by static caches.
	            k, v = past_key_values.update(key_states=k,
	                                          value_states=v,
	                                          layer_idx=self.layer_idx,
	                                          cache_kwargs={"cache_position": cache_position})

	        # Multi-head power law attention.
	        Hnext, att_weights = self.plgatt_layer((q, k, v, A, mask),
	                                               past_G_values,
	                                               past_G_values_status,
	                                               use_cache, **kwargs)

	        # Merge the heads back: [batch_size, seq_len, d_model].
	        merged = Hnext.reshape(batch_size, -1, self.d_model)

	        return self.dense(merged), att_weights


	class PLDR_DecoderLayer(GradientCheckpointingLayer):
	    '''
	    One PLDR-LLM decoder layer: a single masked multi-head power law attention
	    followed by a GLU feed-forward network, each wrapped in a residual
	    connection and a layer normalization.
	    '''

	    def __init__(self, config: PldrllmConfig,
	                 layer_idx:int,
	                 device=None,
	                 **kwargs)->None:
	        '''
	        Args:
	            config: model configuration.
	            layer_idx: index of this decoder layer.
	            device: device on which the submodules are created.
	        '''
	        super().__init__(**kwargs)

	        self.d_model = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.dff = config.intermediate_size
	        self.A_dff = config.A_dff
	        self.num_denseA = config.num_denseA
	        self.num_reslayerA = config.num_reslayerA
	        self.activation = ACT2FN[config.hidden_act]
	        self.max_seq_len = config.max_position_embeddings
	        self.layer_idx = layer_idx
	        self.device = device
	        self.layer_norm_eps = config.layer_norm_eps
	        self.glu_bias = config.glu_bias
	        self.wdtype = None

	        # Submodules are created in a fixed order: mha1, ffn, layernorm1, layernorm2.
	        self.mha1 = PldrllmAttention(config=config, layer_idx=layer_idx, device=self.device)
	        self.ffn = self.dec_point_wise_feed_forward_network()

	        norm_options = dict(eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype)
	        self.layernorm1 = nn.LayerNorm(self.d_model, **norm_options)
	        self.layernorm2 = nn.LayerNorm(self.d_model, **norm_options)

	    def forward(self,
	                hidden_states:torch.Tensor,
	                look_ahead_mask:torch.Tensor,
	                position_embeddings:torch.Tensor,
	                position_ids:Optional[torch.LongTensor]=None,
	                cache_position:Optional[torch.LongTensor]=None,
	                use_cache:Optional[bool]=None,
	                past_key_values:Optional[Cache]=None,
	                past_G_values:Optional[torch.Tensor]=None,
	                past_G_values_status:Optional[list[bool]]=None,
	                **kwargs:Unpack[TransformersKwargs]
	                )->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
	        '''
	        Runs self-attention and the feed-forward network on `hidden_states`.

	        The hidden states serve as query, key and value at once. All cache and
	        position arguments are handed to the attention module unchanged.

	        Returns:
	            The layer output of shape [batch_size, target_seq_len, d_model] and the
	            tuple returned by the attention module.
	        '''
	        attention_inputs = [hidden_states] * 3 + [look_ahead_mask]
	        attn_out, att_weights = self.mha1(inputs=attention_inputs,
	                                          position_embeddings=position_embeddings,
	                                          position_ids=position_ids,
	                                          cache_position=cache_position,
	                                          past_key_values=past_key_values,
	                                          past_G_values=past_G_values,
	                                          past_G_values_status=past_G_values_status,
	                                          use_cache=use_cache,
	                                          **kwargs
	                                          )

	        # Add & norm after attention, then again after the feed-forward network.
	        attn_normed = self.layernorm1(attn_out + hidden_states)
	        layer_out = self.layernorm2(self.ffn(attn_normed) + attn_normed)

	        return layer_out, att_weights

	    # GLUVariant implementation for feedforward network, scale dff accordingly (i.e., 2/3 of original).
	    def dec_point_wise_feed_forward_network(self):
	        '''Builds the d_model -> dff -> d_model GLU feed-forward block of this layer.'''
	        ffn_block = GLUVariant(self.d_model, self.dff, self.d_model,
	                               glu_bias=self.glu_bias,
	                               activation=self.activation,
	                               device=self.device,
	                               dtype=self.wdtype)
	        return ffn_block


	class ResLayerA(nn.Module):
	    '''
	    Residual layer of the PLDR-LLM metric learner: a stack of GLU blocks
	    followed by a residual connection and a layer normalization.
	    '''

	    def __init__(self, depth:int,
	                 A_dff:int,
	                 num_denseA:int,
	                 layer_norm_eps:float,
	                 glu_bias:bool,
	                 activation:Callable=F.silu,
	                 device=None,
	                 dtype=None,
	                 **kwargs)->None:
	        '''
	        Args:
	            depth: input and output size of every GLU block and of the layer norm.
	            A_dff: hidden size of every GLU block.
	            num_denseA: number of GLU blocks in the stack.
	            layer_norm_eps: epsilon of the layer normalization.
	            glu_bias: whether the GLU linear layers carry a bias.
	            activation: activation used inside the GLU blocks.
	            device: device on which the submodules are created.
	            dtype: dtype of the submodule weights.
	        '''
	        super().__init__(**kwargs)
	        self.depth = depth
	        self.A_dff = A_dff
	        self.num_denseA = num_denseA
	        self.activation = activation
	        self.device = device
	        self.layer_norm_eps = layer_norm_eps
	        self.glu_bias = glu_bias

	        glu_stack = [
	            GLUVariant(self.depth, self.A_dff, self.depth,
	                       glu_bias=self.glu_bias,
	                       activation=self.activation,
	                       device=self.device,
	                       dtype=dtype)
	            for _ in range(self.num_denseA)
	        ]
	        self.denseAs = nn.ModuleList(glu_stack)

	        self.layernormA = nn.LayerNorm(self.depth, eps=self.layer_norm_eps,
	                                       device=self.device, dtype=dtype)
	        self.identity = nn.Identity()

	    def ResUnit(self, A:torch.Tensor)->torch.Tensor:
	        '''Passes A through every GLU block, adds the input back and normalizes the sum.'''
	        skip = self.identity(A)
	        for glu_block in self.denseAs:
	            A = glu_block(A)
	        return self.layernormA(A + skip)

	    def forward(self, inputs:list[torch.Tensor], **kwargs)->torch.Tensor:
	        '''Applies the residual unit to `inputs[0]`; further list entries are ignored.'''
	        return self.ResUnit(inputs[0])


	class GLUVariant(nn.Module):
	    '''
	    Gated linear unit block; the default activation gives the SwiGLU configuration.
	    Computes gluw3(activation(gluw1(x)) * gluw2(x)).
	    To match the size of a non-SwiGLU FFN, scaling the hidden size dff by 2/3 may be useful.
	    '''

	    def __init__(self, d_model:int,
	                 dff:int,
	                 depth:int,
	                 glu_bias:bool,
	                 activation:Callable=F.silu,
	                 device=None,
	                 dtype=None,
	                 **kwargs)->None:
	        '''
	        Args:
	            d_model: input feature size.
	            dff: hidden feature size of the gate and value branches.
	            depth: output feature size.
	            glu_bias: whether the three linear layers carry a bias.
	            activation: activation applied on the gate branch.
	            device: device on which the linear layers are created.
	            dtype: dtype of the linear layer weights.
	        '''
	        super().__init__(**kwargs)
	        self.dff = dff
	        self.depth = depth
	        self.d_model = d_model
	        self.activation = activation
	        self.device = device
	        self.glu_bias = glu_bias

	        linear_options = dict(bias=self.glu_bias, device=self.device, dtype=dtype)
	        # Gate branch, value branch, then the output projection.
	        self.gluw1 = nn.Linear(self.d_model, self.dff, **linear_options)
	        self.gluw2 = nn.Linear(self.d_model, self.dff, **linear_options)
	        self.gluw3 = nn.Linear(self.dff, self.depth, **linear_options)

	    def forward(self, input:torch.Tensor, **kwargs)->torch.Tensor:
	        '''Returns the gated projection of `input`; the last dimension becomes `depth`.'''
	        gate = self.activation(self.gluw1(input))
	        value = self.gluw2(input)
	        return self.gluw3(torch.mul(gate, value))


	###################################### END OF PLDRLLM MODEL IMPLEMENTATION #####################################################


	# RotaryPositionalEmbeddings is from https://github.com/pytorch/torchtune/blob/main/torchtune/modules/position_embeddings.py
	# This implementation was used in the original pytorch based implementation of PLDR-LLM.
	class RotaryPositionalEmbeddings(nn.Module):
	    """
	    Rotary Positional Embeddings (RoPE), as proposed in
	    https://arxiv.org/abs/2104.09864.

	    Reference implementation (used for correctness verification):
	    https://github.com/meta-llama/llama/blob/main/llama/model.py#L80

	    The cos/sin values for every position below ``max_seq_len`` are computed
	    once during init and stored in the non-persistent buffer ``cache``. The
	    forward pass only indexes that buffer; it is not rebuilt for positions at
	    or beyond ``max_seq_len``.

	    Args:
	        dim (int): Embedding dimension. This is usually set to the dim of each
	            head in the attention module computed as ``embed_dim // num_heads``
	        max_seq_len (int): Number of positions for which the cache is built
	        base (int): The base for the geometric progression used to compute
	            the rotation angles
	    """

	    def __init__(
	        self,
	        dim: int,
	        max_seq_len: int = 4096,
	        base: int = 10_000,
	    ) -> None:
	        super().__init__()
	        self.dim = dim
	        self.base = base
	        self.max_seq_len = max_seq_len
	        self.rope_init()

	    def rope_init(self):
	        """Registers the per-pair rotation frequencies ``theta`` and builds the cache."""
	        exponents = torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim
	        theta = 1.0 / (self.base ** exponents)
	        self.register_buffer("theta", theta, persistent=False)
	        self.build_rope_cache(self.max_seq_len)

	    def build_rope_cache(self, max_seq_len: int = 4096) -> None:
	        """Stores cos/sin of ``position * theta`` as a ``[max_seq_len, dim // 2, 2]`` buffer."""
	        # Position indexes `[0, 1, ..., max_seq_len - 1]`
	        positions = torch.arange(
	            max_seq_len, dtype=self.theta.dtype, device=self.theta.device
	        )

	        # Outer product of positions and theta, shape [max_seq_len, dim // 2]
	        angles = torch.einsum("i, j -> ij", positions, self.theta).float()

	        # Last axis holds (cos, sin)
	        cache = torch.stack([angles.cos(), angles.sin()], dim=-1)
	        self.register_buffer("cache", cache, persistent=False)

	    def forward(
	        self, x: torch.Tensor, *, input_pos: Optional[torch.Tensor] = None
	    ) -> torch.Tensor:
	        """
	        Rotates consecutive feature pairs of ``x`` by their position-dependent angle.

	        Args:
	            x (torch.Tensor): input tensor with shape
	                ``[b, s, n_h, h_d]``
	            input_pos (Optional[torch.Tensor]): Optional tensor with the position ids
	                of each token, used to index the cache. With packed samples it has
	                shape [b, s]. If None, the first ``s`` cache rows are used, i.e. the
	                index of a token is its position id. Default is None.

	        Returns:
	            torch.Tensor: output tensor with shape ``[b, s, n_h, h_d]`` and the dtype of ``x``

	        Notation used for tensor shapes:
	            - b: batch size
	            - s: sequence length
	            - n_h: num heads
	            - h_d: head dim
	        """
	        if input_pos is None:
	            rope_cache = self.cache[: x.size(1)]
	        else:
	            rope_cache = self.cache[input_pos]

	        # Work in float32 on feature pairs: [b, s, n_h, h_d // 2, 2]
	        pairs = x.float().reshape(*x.shape[:-1], -1, 2)

	        # Broadcastable over heads: [b, s, 1, h_d // 2, 2] if packed samples,
	        # otherwise [1, s, 1, h_d // 2, 2]
	        rope_cache = rope_cache.view(-1, pairs.size(1), 1, pairs.size(3), 2)

	        first, second = pairs[..., 0], pairs[..., 1]
	        cos, sin = rope_cache[..., 0], rope_cache[..., 1]

	        # 2-D rotation of each pair, [b, s, n_h, h_d // 2, 2]
	        rotated = torch.stack(
	            [
	                first * cos - second * sin,
	                second * cos + first * sin,
	            ],
	            -1,
	        )

	        # Merge the pairs back: [b, s, n_h, h_d]
	        return rotated.flatten(3).type_as(x)



	class PldrllmRotaryEmbedding(nn.Module):
	    """
	    Produces the (cos, sin) rotary position tables consumed by `apply_rotary_pos_emb`.

	    The inverse frequencies come from the init function registered in
	    `ROPE_INIT_FUNCTIONS` under the configured rope type and are kept in the
	    non-persistent buffer `inv_freq`.
	    """
	    def __init__(self, config: PldrllmConfig, device=None):
	        """
	        Args:
	            config: model configuration; `rope_scaling` (if present) and
	                `max_position_embeddings` are read here.
	            device: device handed to the rope init function.
	        """
	        super().__init__()
	        # BC: "rope_type" was originally "type"
	        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
	            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
	        else:
	            self.rope_type = "default"
	        self.max_seq_len_cached = config.max_position_embeddings
	        self.original_max_seq_len = config.max_position_embeddings

	        self.config = config
	        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

	        # The init function returns the inverse frequencies and a scaling factor for cos/sin.
	        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
	        self.register_buffer("inv_freq", inv_freq, persistent=False)
	        # NOTE(review): presumably kept so dynamic_rope_update can restore the initial
	        # frequencies - confirm against the decorator's implementation.
	        self.original_inv_freq = self.inv_freq

	    @torch.no_grad()
	    @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
	    def forward(self, x, position_ids):
	        """
	        Computes cos and sin of `position_ids * inv_freq`.

	        Args:
	            x: tensor whose device and dtype determine those of the outputs.
	            position_ids: position indexes; indexed as [batch, seq_len] below.
	        Returns:
	            Tuple (cos, sin), both cast to `x.dtype` and scaled by `attention_scaling`.
	        """
	        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
	        position_ids_expanded = position_ids[:, None, :].float()

	        # autocast is entered with "cpu" when the device type is "mps"
	        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
	        with torch.autocast(device_type=device_type, enabled=False): # Force float32
	            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
	            # The frequencies are repeated so the last dimension is twice that of inv_freq.
	            emb = torch.cat((freqs, freqs), dim=-1)
	            cos = emb.cos() * self.attention_scaling
	            sin = emb.sin() * self.attention_scaling

	        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


	def rotate_half(x):
	    """Moves the second half of the last dimension to the front, negated, followed by the first half."""
	    split_at = x.shape[-1] // 2
	    front = x[..., :split_at]
	    back = x[..., split_at:]
	    return torch.cat((-back, front), dim=-1)


	def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
	    """Applies Rotary Position Embedding to the query and key tensors.

	    Each tensor `t` is mapped to `t * cos + rotate_half(t) * sin`.

	    Args:
	        q (`torch.Tensor`): The query tensor.
	        k (`torch.Tensor`): The key tensor.
	        cos (`torch.Tensor`): The cosine part of the rotary embedding.
	        sin (`torch.Tensor`): The sine part of the rotary embedding.
	        position_ids (`torch.Tensor`, optional):
	            Deprecated and unused.
	        unsqueeze_dim (`int`, optional, defaults to 1):
	            Dimension along which `cos` and `sin` are unsqueezed so that they broadcast
	            against `q` and `k`. With `cos` and `sin` of shape [batch_size, seq_len, head_dim],
	            use `unsqueeze_dim=1` when `q` and `k` are [batch_size, heads, seq_len, head_dim]
	            and `unsqueeze_dim=2` when they are [batch_size, seq_len, heads, head_dim].
	    Returns:
	        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
	    """
	    cos, sin = cos.unsqueeze(unsqueeze_dim), sin.unsqueeze(unsqueeze_dim)

	    def _rotate(states):
	        # Same formula for queries and keys.
	        return (states * cos) + (rotate_half(states) * sin)

	    return _rotate(q), _rotate(k)

	############# END OF ROTARY EMBEDDING IMPLEMENTATION #################################################

	@dataclass
	class BasePLDRModelOutputWithPast(ModelOutput):
	    """
	    Base class for [`PldrllmModel`] outputs that may also contain past key/values (to speed up sequential decoding).

	    Args:
	        last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
	            Sequence of hidden-states at the output of the last layer of the model.

	            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
	            hidden_size)` is output.
	        past_key_values (`Cache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
	            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
	            input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.Tensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        pldr_attentions (`tuple(tuple(torch.Tensor))`, optional, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
	            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.

	            The tuple for each layer contains, in this order:
	                output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim, head_dim)`,
	                output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	                learned exponents of potential tensor of shape `(num_heads, head_dim, head_dim)`,
	                learned weights for energy-curvature tensor of shape `(num_heads, head_dim, head_dim)`,
	                learned bias for energy-curvature tensor of shape `(num_heads, head_dim, head_dim)`,
	                energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	                attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.

	            The three learned entries are the `PlgaLayer` parameters, which `PlgaLayer.build_weights` allocates as
	            `[num_heads, head_dim, head_dim]` without a batch dimension; they are `None` when `config.custom_G_type`
	            is set. NOTE(review): this assumes the model code passes the `PlgaLayer` tuple through unchanged - confirm.
	    """
	    last_hidden_state: Optional[torch.Tensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
	    attentions: Optional[tuple[torch.Tensor, ...]] = None
	    pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None

	@dataclass
	class CausalPLDRLLMOutputWithPast(ModelOutput):
	    """
	    Base class for [`PldrllmForCausalLM`] causal language model (or autoregressive) outputs.

	    Args:
	        loss (`torch.Tensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Language modeling loss (for next-token prediction).
	        logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
	            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
	        past_key_values (`Cache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
	            `past_key_values` input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.Tensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
	            heads.
	        pldr_attentions (`tuple(tuple(torch.Tensor))`, optional, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
	            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.

	            The tuple for each layer contains, in this order:
	                output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim, head_dim)`,
	                output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	                learned exponents of potential tensor of shape `(num_heads, head_dim, head_dim)`,
	                learned weights for energy-curvature tensor of shape `(num_heads, head_dim, head_dim)`,
	                learned bias for energy-curvature tensor of shape `(num_heads, head_dim, head_dim)`,
	                energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	                attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.

	            The three learned entries are the `PlgaLayer` parameters, which `PlgaLayer.build_weights` allocates as
	            `[num_heads, head_dim, head_dim]` without a batch dimension; they are `None` when `config.custom_G_type`
	            is set. NOTE(review): this assumes the model code passes the `PlgaLayer` tuple through unchanged - confirm.
	    """
	    loss: Optional[torch.Tensor] = None
	    logits: Optional[torch.Tensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
	    attentions: Optional[tuple[torch.Tensor, ...]] = None
	    pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None

	@dataclass
	class TokenClassifierPLDRLLMOutput(ModelOutput):
	    """
	    Output container for the [`PldrllmForTokenClassification`] token classification model.

	    Args:
	        loss (`torch.Tensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification loss.
	        logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
	            Classification scores (before SoftMax).
	        hidden_states (`tuple(torch.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.Tensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        pldr_attentions (`tuple(tuple(torch.Tensor))`, optional, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
	            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters
	            of the power law graph attention module.

	            The tuple for each layer contains, in order:
	            - output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - output after application of iSwiGLU on the metric tensor, A_LM, of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned exponents of the potential tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned weights for the energy-curvature tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned bias for the energy-curvature tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
	    """

	    loss: Optional[torch.Tensor] = None
	    logits: Optional[torch.Tensor] = None
	    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
	    attentions: Optional[tuple[torch.Tensor, ...]] = None
	    # One inner tuple per decoder layer, hence the variable-length outer tuple.
	    pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None

	@dataclass
	class QuestionAnsweringPLDRModelOutput(ModelOutput):
	    """
	    Output container for the [`PldrllmForQuestionAnswering`] question answering model.

	    Args:
	        loss (`torch.Tensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
	        start_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
	            Span-start scores (before SoftMax).
	        end_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
	            Span-end scores (before SoftMax).
	        hidden_states (`tuple(torch.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.Tensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        pldr_attentions (`tuple(tuple(torch.Tensor))`, optional, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
	            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters
	            of the power law graph attention module.

	            The tuple for each layer contains, in order:
	            - output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - output after application of iSwiGLU on the metric tensor, A_LM, of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned exponents of the potential tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned weights for the energy-curvature tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned bias for the energy-curvature tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
	    """

	    loss: Optional[torch.Tensor] = None
	    start_logits: Optional[torch.Tensor] = None
	    end_logits: Optional[torch.Tensor] = None
	    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
	    attentions: Optional[tuple[torch.Tensor, ...]] = None
	    # One inner tuple per decoder layer, hence the variable-length outer tuple.
	    pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None

	@dataclass
	class SequenceClassifierPLDRLLMOutputWithPast(ModelOutput):
	    """
	    Output container for the [`PldrllmForSequenceClassification`] sentence classification model.

	    Args:
	        loss (`torch.Tensor` of shape `(1,)`, optional, returned when `labels` is provided):
	            Classification (or regression if config.num_labels==1) loss.
	        logits (`torch.Tensor` of shape `(batch_size, config.num_labels)`):
	            Classification (or regression if config.num_labels==1) scores (before SoftMax).
	        past_key_values (`Cache`, optional, returned when `use_cache=True` is passed or when `config.use_cache=True`):
	            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

	            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
	            `past_key_values` input) to speed up sequential decoding.
	        hidden_states (`tuple(torch.Tensor)`, optional, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
	            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
	            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

	            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
	        attentions (`tuple(torch.Tensor)`, optional, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
	            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
	            sequence_length)`.

	            Attention weights after the attention softmax, used to compute the weighted average in the
	            self-attention heads.
	        pldr_attentions (`tuple(tuple(torch.Tensor))`, optional, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
	            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters
	            of the power law graph attention module.

	            The tuple for each layer contains, in order:
	            - output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - output after application of iSwiGLU on the metric tensor, A_LM, of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned exponents of the potential tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned weights for the energy-curvature tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - learned bias for the energy-curvature tensor of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim, head_dim)`,
	            - attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
	    """

	    loss: Optional[torch.Tensor] = None
	    logits: Optional[torch.Tensor] = None
	    past_key_values: Optional[Cache] = None
	    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
	    attentions: Optional[tuple[torch.Tensor, ...]] = None
	    # One inner tuple per decoder layer, hence the variable-length outer tuple.
	    pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...], ...]] = None


	@auto_docstring
	class PldrllmPreTrainedModel(PreTrainedModel):
	    # Common base of the PLDR-LLM model classes: class-level configuration
	    # flags plus the weight initialisation routine.
	    config_class = PldrllmConfig
	    base_model_prefix = "decoder"
	    supports_gradient_checkpointing = True
	    _no_split_modules = ["PLDR_DecoderLayer"]
	    _skip_keys_device_placement = ["past_key_values"]
	    _supports_flash_attn = True
	    _supports_sdpa = True
	    _supports_flex_attn = False
	    _supports_attention_backend = True
	    _can_compile_fullgraph = False

	    def __init__(self, config: PldrllmConfig) -> None:
	        super().__init__(config)
	        self.custom_G_type = config.custom_G_type
	        # The class-level default is False; the instance flag is switched on
	        # only when a predefined G type is configured.
	        if self.custom_G_type is not None:
	            self._can_compile_fullgraph = True

	    def _init_weights(self, module):
	        """
	        Initialise the parameters of a single submodule.

	        Linear weights and the `Wlst`/`pwlst`/`alst` parameters of `PlgaLayer` use Xavier-uniform,
	        embeddings use a standard normal (padding row zeroed), LayerNorm weights are set to one,
	        and every bias-like parameter is zeroed. Other module types are left untouched.
	        """
	        if isinstance(module, nn.Linear):
	            nn.init.xavier_uniform_(module.weight.data)
	            if module.bias is not None:
	                module.bias.data.zero_()
	            return

	        if isinstance(module, nn.Embedding):
	            module.weight.data.normal_(mean=0.0, std=1.0)
	            if module.padding_idx is not None:
	                module.weight.data[module.padding_idx].zero_()
	            return

	        if isinstance(module, nn.LayerNorm):
	            module.weight.data.fill_(1.0)
	            if module.bias is not None:
	                module.bias.data.zero_()
	            return

	        if isinstance(module, PlgaLayer):
	            # Same order as the parameters are declared here, so the random
	            # draws happen in a fixed sequence.
	            for attr_name in ("Wlst", "pwlst", "alst"):
	                weight_like = getattr(module, attr_name)
	                if weight_like is not None:
	                    nn.init.xavier_uniform_(weight_like.data)
	            for attr_name in ("blst", "balst"):
	                bias_like = getattr(module, attr_name)
	                if bias_like is not None:
	                    bias_like.data.zero_()

	# Argument documentation for `output_pldr_attentions` and `cache_first_G`, shared by the
	# `forward` methods in this file: it is handed to `@auto_docstring` through `custom_args`.
	# The literal is consumed at runtime, so its wording and layout are left untouched.
	MODEL_COMMON_CUSTOM_ARGS=r"""
	output_pldr_attentions (`bool`, optional, defaults to `False`):
	Whether to return the deductive outputs and learnable parameters of power law graph attention module as tuple containing:
	the output of the residual metric learner (metric tensor, A), output (A_LM) after application of iSwiGLU on metric tensor, learned
	exponents of potential tensor, learned weights for energy-curvature tensor, learned bias for
	energy-curvature tensor, energy-curvature tensor (G_LM), and attention weights.
	cache_first_G (`bool`, optional, defaults to `False`):
	Whether or not the model should return the G values from first sample in a batch or G values from all samples for past_G_values initialization.
	When `cache_first_G=true`, the batch_size of past_G_values is 1. This argument should be set to True for contrastive text generation
	with learned G values.
	"""


	@auto_docstring(custom_intro="""
	Large Language Model From Power Law Decoder Representations (PLDR-LLM) with decoder hidden state as output.
	PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers.
	For details of model architecture, check out these papers:
	[Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
	"""
	)
	class PldrllmModel(PldrllmPreTrainedModel):
	    def __init__(self, config: PldrllmConfig) -> None:
	        super().__init__(config)

	        self.num_layers = config.num_hidden_layers
	        self.d_model = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.target_vocab_size = config.vocab_size
	        self.max_seq_len = config.max_position_embeddings
	        self.reference_rope = config.reference_rope
	        # `pldr_device` / `wdtype` stay None and are forwarded as `device=` /
	        # `dtype=` to the submodules created below.
	        self.pldr_device = None
	        self.gradient_checkpointing = False
	        self.layer_norm_eps = config.layer_norm_eps
	        self.wdtype = None

	        assert self.d_model % self.num_heads == 0
	        self.depth = config.head_dim

	        self.custom_G_type = config.custom_G_type

	        if self.custom_G_type is not None:
	            # predefined past_G_values are initialized for both training and inference
	            past_G_values, past_G_values_status = self.G_values_init(device=self.pldr_device, dtype=self.wdtype)
	            self.register_buffer("past_G_values_status", past_G_values_status, persistent=True)
	            self.register_buffer("past_G_values", past_G_values, persistent=True)

	            logger.warning("\nIMPORTANT: decoder.past_G_values are set to predefined values and deep PLGA layers will be skipped. "
	                           "Set config.custom_G_type=None to enable deep PLGA layers.")
	            if self.custom_G_type == "external":
	                logger.warning("\nIMPORTANT: config.custom_G_type is selected as 'external' and an external value of decoder.past_G_values[:,2,...] is expected. "
	                               "decoder.past_G_values[:,2,...] are initialized to identity tensor by default. This is equivalent to an LLM with SDPA. To provide external values "
	                               "to the decoder.past_G_values, either load these values along with the pretrained model or set decoder.past_G_values to a torch.float tensor of "
	                               "size (num_layers, 3, 1, num_heads, head_dim, head_dim) after model is initialized.\n")
	        else:
	            # learned past_G_values is initialized at inference.
	            self.register_buffer("past_G_values_status", None, persistent=False)
	            self.register_buffer("past_G_values", None, persistent=False)
	            self.is_past_G_values_initialized = False

	        self.embedding = nn.Embedding(self.target_vocab_size, self.d_model, device=self.pldr_device, dtype=self.wdtype)

	        self.dec_layers = nn.ModuleList([PLDR_DecoderLayer(config,
	                                                           layer_idx=i,
	                                                           device=self.pldr_device) for i in range(self.num_layers)])

	        self.layernorm1 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.pldr_device, dtype=self.wdtype)

	        # With reference_rope the rotary embeddings are not built here; the
	        # forward pass then hands `position_embeddings=None` to the layers.
	        if not self.reference_rope:
	            self.rotary_embedding = PldrllmRotaryEmbedding(config=config)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def G_values_init(self, batch_size=1, device=None, dtype=None):
	        """
	        Build the initial `past_G_values` tensor and the per-layer status flags.

	        Args:
	            batch_size: Batch dimension of the zero-filled tensor created when `custom_G_type` is None.
	                Not used for the predefined types, whose batch dimension is always 1.
	            device: Device for the created tensors.
	            dtype: Dtype for the G value tensors (the status tensor is always `torch.bool`).

	        Returns:
	            `(past_G_values, past_G_values_status)` where `past_G_values` has shape
	            `[num_layers, 3, batch, num_heads, depth, depth]` and `past_G_values_status` is a bool tensor
	            of length `num_layers`: all False when `custom_G_type` is None, all True for predefined types.

	        Raises:
	            ValueError: If `custom_G_type` is not None, 'identity', 'random' or 'external'.

	        Also sets `self.is_past_G_values_initialized` to True.
	        """
	        G_values_dim = (self.num_layers, 1, self.num_heads, self.depth, self.depth)  # [num_layers, 1, num_heads, depth, depth]
	        zeros_tensor = torch.zeros(G_values_dim, device=device, dtype=dtype)
	        identity_tensor = torch.eye(self.depth).repeat(self.num_layers, 1, self.num_heads, 1, 1).to(device=device, dtype=dtype)
	        # NOTE(review): the random tensor is drawn on every call, even when it is not selected,
	        # which advances the global RNG state. Kept as is so seeded runs stay reproducible.
	        random_tensor = torch.randn(G_values_dim, device=device, dtype=dtype)
	        # Stacking along dim=1 yields [num_layers, 3, 1, num_heads, depth, depth]; only slot 2 is non-zero.
	        CUSTOM_G_VALUES = {
	            'identity': torch.stack([zeros_tensor, zeros_tensor, identity_tensor], dim=1),
	            'random': torch.stack([zeros_tensor, zeros_tensor, random_tensor], dim=1),
	            'external': torch.stack([zeros_tensor, zeros_tensor, identity_tensor], dim=1)
	        }

	        if self.custom_G_type is None:
	            # 3 tensors for A, AW and avAp per layer
	            past_G_values = torch.zeros((self.num_layers, 3, batch_size, self.num_heads, self.depth, self.depth), device=device, dtype=dtype)
	            past_G_values_status = torch.zeros(self.num_layers, dtype=torch.bool, device=device)
	        elif self.custom_G_type in CUSTOM_G_VALUES:
	            past_G_values = CUSTOM_G_VALUES[self.custom_G_type]
	            past_G_values_status = torch.ones(self.num_layers, dtype=torch.bool, device=device)
	        else:
	            raise ValueError("Invalid custom_G_type value. Available values are "
	                             "None, 'identity', 'random', and 'external'.")

	        self.is_past_G_values_initialized = True
	        return past_G_values, past_G_values_status

	    @can_return_tuple
	    @auto_docstring(
	        custom_args=MODEL_COMMON_CUSTOM_ARGS
	    )
	    def forward(self,
	                input_ids: Optional[torch.LongTensor] = None,
	                attention_mask: Optional[torch.Tensor] = None,
	                position_ids: Optional[torch.LongTensor] = None,
	                past_key_values: Optional[Cache] = None,
	                inputs_embeds: Optional[torch.FloatTensor] = None,
	                use_cache: Optional[bool] = None,
	                output_attentions: Optional[bool] = None,
	                output_pldr_attentions: Optional[bool] = None,
	                output_hidden_states: Optional[bool] = None,
	                cache_position: Optional[torch.LongTensor] = None,
	                cache_first_G: Optional[bool] = None,
	                **kwargs: Unpack[TransformersKwargs]
	                ):
	        # Every switch left as None falls back to the value stored in the config.
	        use_cache = use_cache if use_cache is not None else self.config.use_cache
	        cache_first_G = cache_first_G if cache_first_G is not None else self.config.cache_first_G
	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_pldr_attentions = output_pldr_attentions if output_pldr_attentions is not None else self.config.output_pldr_attentions
	        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

	        if (self.gradient_checkpointing or self.training) and use_cache:
	            logger.warning_once(
	                "During training, setting `use_cache=False`. Additionally, `use_cache=True` is incompatible with gradient checkpointing."
	            )
	            use_cache = False

	        if (input_ids is None) ^ (inputs_embeds is not None):
	            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

	        if inputs_embeds is None:
	            inputs_embeds = self.embedding(input_ids)  # [batch_size, target_seq_len, d_model]

	        dec_att_weights = () if output_pldr_attentions else None
	        dec_attentions = () if output_attentions else None

	        if not isinstance(past_key_values, (type(None), Cache)):
	            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

	        if use_cache and past_key_values is None:
	            past_key_values = DynamicCache()

	        # reset past_G_Values_status if they are not custom and predefined.
	        if use_cache and self.custom_G_type is None and not isinstance(past_key_values, StaticCache) and past_key_values.get_seq_length() == 0:
	            self.past_G_values_status = torch.zeros(self.num_layers, dtype=torch.bool, device=inputs_embeds.device)
	            self.is_past_G_values_initialized = False

	        if use_cache and isinstance(past_key_values, StaticCache) and ((self.custom_G_type is None) or
	                                                                       "flash_attention" in self.config._attn_implementation):
	            raise ValueError("Static Cache is only supported with predefined past_G_values. "
	                             "Flash attention is not supported. "
	                             "Supported models are with config.custom_G_type set to 'random', 'identity' or 'external'.")

	        # NOTE(review): when use_cache is False and the G state was initialised by an earlier cached
	        # call, neither branch here runs and that earlier state is passed to the layers unchanged.
	        # Confirm the decoder layers ignore it in that case.
	        if not self.is_past_G_values_initialized and self.custom_G_type is None:
	            if use_cache:
	                batch_size = 1 if cache_first_G else inputs_embeds.size()[0]
	                self.past_G_values, self.past_G_values_status = self.G_values_init(batch_size=batch_size,
	                                                                                   device=inputs_embeds.device,
	                                                                                   dtype=inputs_embeds.dtype)
	            else:
	                self.past_G_values_status = torch.zeros(self.num_layers, dtype=torch.bool, device=inputs_embeds.device)
	                self.past_G_values = None
	                self.is_past_G_values_initialized = True

	        if cache_position is None:
	            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
	            cache_position = torch.arange(
	                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
	            )

	        if position_ids is None:
	            position_ids = cache_position.unsqueeze(0)

	        causal_mask = create_causal_mask(
	            config=self.config,
	            input_embeds=inputs_embeds,
	            attention_mask=attention_mask,
	            cache_position=cache_position,
	            past_key_values=past_key_values,
	            position_ids=position_ids
	        )

	        # create position embeddings to be shared across the decoder layers
	        if not self.reference_rope:
	            position_embeddings = self.rotary_embedding(inputs_embeds, position_ids)
	        else:
	            # defer reference rope initialization in the PldrllmAttention module.
	            position_embeddings = None

	        # Scale by sqrt(d_model) OUT OF PLACE. An in-place `*=` would overwrite a
	        # caller-supplied `inputs_embeds` (re-using the same tensor would scale it
	        # twice) and fails on a leaf tensor that requires grad.
	        embed_scale = torch.sqrt(torch.tensor(self.d_model).to(dtype=inputs_embeds.dtype))
	        hidden_states = inputs_embeds * embed_scale

	        # The first recorded hidden state is the scaled embedding (before
	        # layernorm1), i.e. the same values that were returned previously.
	        dec_outputs = (hidden_states,) if output_hidden_states else None

	        hidden_states = self.layernorm1(hidden_states)

	        for decoder_layer in self.dec_layers:
	            hidden_states, dec_att_w = decoder_layer(hidden_states,
	                                                     causal_mask,
	                                                     position_embeddings=position_embeddings,
	                                                     position_ids=position_ids,
	                                                     cache_position=cache_position,
	                                                     use_cache=use_cache,
	                                                     past_key_values=past_key_values,
	                                                     past_G_values=self.past_G_values,
	                                                     past_G_values_status=self.past_G_values_status,
	                                                     **kwargs
	                                                     )

	            if output_pldr_attentions:
	                dec_att_weights += (dec_att_w,)

	            if output_attentions:
	                # the attention weights are the last entry of the per-layer tuple
	                dec_attentions += (dec_att_w[-1],)

	            if output_hidden_states:
	                dec_outputs += (hidden_states,)

	        return BasePLDRModelOutputWithPast(
	            last_hidden_state=hidden_states,
	            past_key_values=past_key_values if use_cache else None,
	            hidden_states=dec_outputs,
	            attentions=dec_attentions,
	            pldr_attentions=dec_att_weights
	        )

	    def get_input_embeddings(self):
	        """Return the token embedding module."""
	        return self.embedding

	    def set_input_embeddings(self, value):
	        """Replace the token embedding module with `value`."""
	        self.embedding = value

	@auto_docstring(custom_intro="""
	Large Language Model From Power Law Decoder Representations (PLDR-LLM) with LM Head as final layer.
	PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers.
	For details of model architecture, check out these papers:
	[Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
	"""
	)
	class PldrllmForCausalLM(PldrllmPreTrainedModel, GenerationMixin):
	    def __init__(self, config: PldrllmConfig) -> None:
	        super().__init__(config)

	        self.d_model = config.hidden_size
	        self.input_vocab_size = config.vocab_size
	        self.final_bias = config.final_bias
	        self.pldr_device = None
	        self.decoder = PldrllmModel(config=config)
	        self.wdtype = None

	        # LM head: projects decoder hidden states onto the vocabulary.
	        self.final_layer = nn.Linear(
	            self.d_model,
	            self.input_vocab_size,
	            bias=self.final_bias,
	            device=self.pldr_device,
	            dtype=self.wdtype,
	        )

	        self.post_init()

	    def get_input_embeddings(self):
	        """Return the decoder's token embedding module."""
	        return self.decoder.embedding

	    def set_input_embeddings(self, value):
	        """Replace the decoder's token embedding module with `value`."""
	        self.decoder.embedding = value

	    def get_output_embeddings(self):
	        """Return the LM head."""
	        return self.final_layer

	    def set_output_embeddings(self, new_embeddings):
	        """Replace the LM head with `new_embeddings`."""
	        self.final_layer = new_embeddings

	    def set_decoder(self, decoder):
	        """Replace the wrapped decoder model."""
	        self.decoder = decoder

	    def get_decoder(self):
	        """Return the wrapped decoder model."""
	        return self.decoder

	    @can_return_tuple
	    @auto_docstring(
	        custom_args=MODEL_COMMON_CUSTOM_ARGS
	    )
	    def forward(self,
	                input_ids: Optional[torch.LongTensor] = None,
	                attention_mask: Optional[torch.Tensor] = None,
	                position_ids: Optional[torch.LongTensor] = None,
	                past_key_values: Optional[Cache] = None,
	                use_cache: Optional[bool] = None,
	                inputs_embeds: Optional[torch.FloatTensor] = None,
	                labels: Optional[torch.LongTensor] = None,
	                output_attentions: Optional[bool] = None,
	                output_pldr_attentions: Optional[bool] = None,
	                output_hidden_states: Optional[bool] = None,
	                cache_position: Optional[torch.LongTensor] = None,
	                cache_first_G: Optional[bool] = None,
	                logits_to_keep: Union[int, torch.Tensor] = 0,
	                **kwargs: Unpack[TransformersKwargs],
	                ) -> CausalPLDRLLMOutputWithPast:
	        decoder_out: BasePLDRModelOutputWithPast = self.decoder(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            use_cache=use_cache,
	            inputs_embeds=inputs_embeds,
	            output_attentions=output_attentions,
	            output_pldr_attentions=output_pldr_attentions,
	            output_hidden_states=output_hidden_states,
	            cache_position=cache_position,
	            cache_first_G=cache_first_G,
	            **kwargs
	        )

	        # Project only the requested positions: an int keeps the trailing
	        # `logits_to_keep` positions (0 keeps all), anything else is used
	        # directly as the index along the sequence axis.
	        if isinstance(logits_to_keep, int):
	            kept_positions = slice(-logits_to_keep, None)
	        else:
	            kept_positions = logits_to_keep
	        logits = self.final_layer(decoder_out.last_hidden_state[:, kept_positions, :])

	        if labels is None:
	            loss = None
	        else:
	            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

	        return CausalPLDRLLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=decoder_out.past_key_values,
	            hidden_states=decoder_out.hidden_states,
	            attentions=decoder_out.attentions,  # list of E
	            pldr_attentions=decoder_out.pldr_attentions
	        )

	@auto_docstring
	class PldrllmForTokenClassification(PldrllmPreTrainedModel):
	    def __init__(self, config: PldrllmConfig) -> None:
	        super().__init__(config)
	        self.num_labels = config.num_labels
	        self.decoder = PldrllmModel(config)
	        self.wdtype = None
	        # Dropout rate: `classifier_dropout` if set, else `hidden_dropout`, else 0.1.
	        if getattr(config, "classifier_dropout", None) is not None:
	            classifier_dropout = config.classifier_dropout
	        elif getattr(config, "hidden_dropout", None) is not None:
	            classifier_dropout = config.hidden_dropout
	        else:
	            classifier_dropout = 0.1
	        self.dropout = nn.Dropout(classifier_dropout)
	        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=self.wdtype)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        """Return the decoder's token embedding module."""
	        return self.decoder.embedding

	    def set_input_embeddings(self, value):
	        """Replace the decoder's token embedding module with `value`."""
	        self.decoder.embedding = value

	    @can_return_tuple
	    @auto_docstring(
	        custom_args=MODEL_COMMON_CUSTOM_ARGS
	    )
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_pldr_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        cache_first_G: Optional[bool] = None,
	    ) -> TokenClassifierPLDRLLMOutput:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, optional):
	            Labels for computing the token classification loss. Indices should be in `[0, ...,
	            config.num_labels - 1]`.
	        """

	        outputs: BasePLDRModelOutputWithPast = self.decoder(
	            input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            output_pldr_attentions=output_pldr_attentions,
	            cache_first_G=cache_first_G
	        )
	        # One score vector per token: dropout, then the linear classifier.
	        sequence_output = self.dropout(outputs.last_hidden_state)
	        logits = self.score(sequence_output)

	        loss = None
	        if labels is not None:
	            loss = self.loss_function(logits, labels, self.config)

	        return TokenClassifierPLDRLLMOutput(
	            loss=loss,
	            logits=logits,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	            pldr_attentions=outputs.pldr_attentions
	        )


	@auto_docstring
	class PldrllmForQuestionAnswering(PldrllmPreTrainedModel):

	    # Copied from transformers.models.bloom.modeling_bloom.BloomForQuestionAnswering.__init__ with Bloom->Llama->Pldrllm
	    def __init__(self, config: PldrllmConfig):
	        super().__init__(config)
	        self.decoder = PldrllmModel(config)
	        self.wdtype = None
	        # Two outputs per token: span-start score and span-end score.
	        self.qa_outputs = nn.Linear(config.hidden_size, 2, bias=True, dtype=self.wdtype)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        """Return the decoder's token embedding module."""
	        return self.decoder.embedding

	    def set_input_embeddings(self, value):
	        """Replace the decoder's token embedding module with `value`."""
	        self.decoder.embedding = value

	    @can_return_tuple
	    @auto_docstring(
	        custom_args=MODEL_COMMON_CUSTOM_ARGS
	    )
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        start_positions: Optional[torch.LongTensor] = None,
	        end_positions: Optional[torch.LongTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_pldr_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        cache_first_G: Optional[bool] = None,
	        **kwargs,
	    ) -> QuestionAnsweringPLDRModelOutput:
	        decoder_out: BasePLDRModelOutputWithPast = self.decoder(
	            input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            output_pldr_attentions=output_pldr_attentions,
	            cache_first_G=cache_first_G
	        )

	        # Split the two-channel projection into start and end scores and
	        # drop the now-singleton last axis of each.
	        span_logits = self.qa_outputs(decoder_out.last_hidden_state)
	        start_part, end_part = span_logits.split(1, dim=-1)
	        start_logits = start_part.squeeze(-1).contiguous()
	        end_logits = end_part.squeeze(-1).contiguous()

	        # The loss needs both span boundaries.
	        if start_positions is not None and end_positions is not None:
	            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)
	        else:
	            loss = None

	        return QuestionAnsweringPLDRModelOutput(
	            loss=loss,
	            start_logits=start_logits,
	            end_logits=end_logits,
	            hidden_states=decoder_out.hidden_states,
	            attentions=decoder_out.attentions,
	            pldr_attentions=decoder_out.pldr_attentions
	        )

	@auto_docstring(
	    custom_intro="""
	The PLDR-LLM with a sequence classification head on top (linear layer).

	[`PldrllmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
	(e.g. GPT-2) do.

	Since it does classification on the last token, it requires to know the position of the last token. If a
	`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
	no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
	padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
	each row of the batch).
	"""
	)
	class PldrllmForSequenceClassification(PldrllmPreTrainedModel):
	    def __init__(self, config: PldrllmConfig) -> None:
	        super().__init__(config)
	        self.num_labels = config.num_labels
	        self.decoder = PldrllmModel(config)
	        self.wdtype = None
	        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False, dtype=self.wdtype)

	        # Initialize weights and apply final processing
	        self.post_init()

	    def get_input_embeddings(self):
	        """Return the decoder's token embedding module."""
	        return self.decoder.embedding

	    def set_input_embeddings(self, value):
	        """Replace the decoder's token embedding module with `value`."""
	        self.decoder.embedding = value

	    @can_return_tuple
	    @auto_docstring(
	        custom_args=MODEL_COMMON_CUSTOM_ARGS
	    )
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_pldr_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        cache_first_G: Optional[bool] = None
	    ) -> SequenceClassifierPLDRLLMOutputWithPast:
	        r"""
	        labels (`torch.LongTensor` of shape `(batch_size,)`, optional):
	            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
	            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
	            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
	        """

	        decoder_out: BasePLDRModelOutputWithPast = self.decoder(
	            input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_pldr_attentions=output_pldr_attentions,
	            output_hidden_states=output_hidden_states,
	            cache_first_G=cache_first_G
	        )
	        # Scores for every position; one position per row is selected below.
	        logits = self.score(decoder_out.last_hidden_state)

	        batch_size = (input_ids if input_ids is not None else inputs_embeds).shape[0]

	        pad_token_id = self.config.pad_token_id
	        if pad_token_id is None:
	            # Without a pad token only a single row can be pooled safely.
	            if batch_size != 1:
	                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
	            last_non_pad_token = -1
	        elif input_ids is None:
	            # Padding cannot be located from embeddings: fall back to the final position.
	            last_non_pad_token = -1
	            logger.warning_once(
	                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
	                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
	            )
	        else:
	            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
	            non_pad_mask = (input_ids != pad_token_id).to(logits.device, torch.int32)
	            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
	            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)

	        row_index = torch.arange(batch_size, device=logits.device)
	        pooled_logits = logits[row_index, last_non_pad_token]

	        if labels is None:
	            loss = None
	        else:
	            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

	        return SequenceClassifierPLDRLLMOutputWithPast(
	            loss=loss,
	            logits=pooled_logits,
	            past_key_values=decoder_out.past_key_values,
	            hidden_states=decoder_out.hidden_states,
	            attentions=decoder_out.attentions,
	            pldr_attentions=decoder_out.pldr_attentions
	        )


	# Names exported by `from ... import *`.
	__all__ = [
	    "PldrllmForCausalLM",
	    "PldrllmModel",
	    "PldrllmPreTrainedModel",
	    "PldrllmForTokenClassification",
	    "PldrllmForQuestionAnswering",
	    "PldrllmForSequenceClassification",
	]