|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Callable, Optional, Union |
|
|
|
|
|
import torch |
|
|
from torch import nn |
|
|
import torch.nn.functional as F |
|
|
|
|
|
from transformers.activations import ACT2FN |
|
|
from transformers.cache_utils import Cache, DynamicCache, StaticCache |
|
|
from transformers.generation import GenerationMixin |
|
|
from transformers.masking_utils import create_causal_mask |
|
|
from transformers.modeling_layers import GradientCheckpointingLayer |
|
|
|
|
|
from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update |
|
|
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel |
|
|
from transformers.processing_utils import Unpack |
|
|
from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging |
|
|
from .configuration_pldrllm import PldrllmConfig |
|
|
|
|
|
from dataclasses import dataclass |
|
|
from transformers.utils import ModelOutput |
|
|
|
|
|
# Module-level logger, created through the logging helper imported from transformers.utils.
logger = logging.get_logger(__name__)
|
|
|
|
|
|
|
|
|
|
|
'''
Power law attention implementation for PLDR-LLM with KV-cache and G-cache.
'''
|
|
|
|
|
class PlgaLayer(nn.Module):
    '''
    Power law graph attention (PLGA) layer.

    Turns a metric tensor A into the energy-curvature tensor G_LM (or reads
    G_LM back from the G-cache), multiplies the query by G_LM and hands the
    result to the configured attention backend.
    '''
    def __init__(self, config:PldrllmConfig,
                 F_hidden:int,
                 F_heads:int,
                 layer_idx:int,
                 device=None,
                 **kwargs)->None:
        '''
        Args:
            config: model configuration. custom_G_type and attention_dropout are read here,
                    _attn_implementation is read on every attention call.
            F_hidden: hidden layer shape used in layer weight creation. For multi-head plga this is head_dim.
            F_heads: Number of attention heads.
            layer_idx: index for the decoder layer; also used to index the G-cache.
            device: device(cpu or gpu) to load tensors.
            **kwargs: forwarded to nn.Module.__init__.
        '''

        super().__init__(**kwargs)
        self.F_hidden=F_hidden
        self.F_heads=F_heads
        self.layer_idx=layer_idx
        self.device=device
        self.config=config
        self.is_causal = True
        self.custom_G_type=config.custom_G_type
        self.attention_dropout=config.attention_dropout

        # dtype handed to torch.empty in build_weights; None leaves the choice to torch.
        self.wdtype=None

        if self.custom_G_type is None:
            # G is learned: create W, b, pw, a and ba.
            self.build_weights()
        else:
            # With a custom G the tensors are read from past_G_values in
            # cg_align_one, so no learnable parameters are created.
            self.Wlst = None
            self.blst = None
            self.pwlst = None
            self.alst = None
            self.balst = None

    def cg_align_one(self, Hin:torch.Tensor,
                     Hk:torch.Tensor,
                     Hv:torch.Tensor,
                     A:torch.Tensor,
                     a_vec:Optional[torch.Tensor],
                     ba:Optional[torch.Tensor],
                     W:Optional[torch.Tensor],
                     b:Optional[torch.Tensor],
                     pw:Optional[torch.Tensor],
                     past_G_values: Optional[torch.Tensor],
                     past_G_values_status: Optional[torch.BoolTensor]=None,
                     mask:Optional[torch.Tensor]=None,
                     use_cache: Optional[bool]=None,
                     **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
        '''
        Alignment model for calculating attention weights
        Args:
            Hin: query
            Hk: key
            Hv: value
            A: metric tensor instance
            a_vec: learned coupling coefficients.
            ba: bias for coupling coefficients
            W: weights applied on metric tensor before AdjActivation
            b: bias applied on metric tensor before AdjActivation
            pw: learned power exponents applied on metric tensor
            past_G_values: G-cache. It is indexed as [layer_idx, slot, batch, ...] where
                           slot 0/1/2 holds A, A_LM and G_LM. It is written in place when
                           G is computed with use_cache set, and read otherwise.
            past_G_values_status: per-layer flags telling whether the G-cache entry of a
                                  layer is already filled; updated in place.
            mask: padding or lookahead mask
            use_cache: whether the G-cache is used.
            **kwargs: forwarded to the attention backend.
        Returns:
            Hout: Attention output.
            A tuple of:
                A: metric tensor as output of residual metric learner layer, A
                AW: metric tensor after AdjActivation is applied, A_LM
                pw: learned power exponents
                a_vec: learned coupling coefficients for energy-curvature tensor
                ba: bias for energy-curvature tensor
                avAp: Energy curvature tensor, G_LM
                E: attention weights
        '''

        # G is computed only when it is learned (no custom G) and this layer's
        # cache entry has not been filled yet; otherwise it is read from the cache.
        if self.custom_G_type is None and not (use_cache and past_G_values_status[self.layer_idx]):

            # iSwiGLU is used as the activation on the metric tensor. The small
            # offset is presumably there to keep the base of torch.pow away from zero.
            epsilonAdj=1e-9
            AW=iSwiGLU(torch.matmul(W,A)+b)+epsilonAdj

            # Element-wise power law followed by the coupling coefficients gives G_LM.
            Ap=torch.pow(AW, pw)
            avAp=torch.matmul(a_vec, Ap)+ba

            if use_cache:
                # Keep only as many batch entries as the cache was allocated for.
                G_batch_size=past_G_values.size()[2]
                past_G_values[self.layer_idx]=torch.stack([A[:G_batch_size,:,:,:],
                                                           AW[:G_batch_size,:,:,:],
                                                           avAp[:G_batch_size,:,:,:]], dim=0)
                past_G_values_status[self.layer_idx]=True
        else:
            AW=past_G_values[self.layer_idx, 1]
            avAp=past_G_values[self.layer_idx, 2]

        # Query transformed by the energy-curvature tensor.
        WHiWHj = torch.matmul(Hin, avAp)

        # 1/sqrt(head_dim) as a plain Python float: eager_attention_forward declares
        # `scaling: float`, and the other attention backends expect a float scale
        # rather than a 0-dim tensor.
        scaling=self.F_hidden ** -0.5

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        # The transformed query is cast to the key dtype before attention.
        query, key, value = WHiWHj.to(dtype=Hk.dtype), Hk, Hv

        Hout, E = attention_interface(
            self,
            query=query,
            key=key,
            value=value,
            attention_mask=mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=scaling,
            **kwargs
        )

        return Hout, (A, AW, pw, a_vec, ba, avAp, E)

    def cg_align_head(self, Hin:torch.Tensor,
                      Hk:torch.Tensor,
                      Hv:torch.Tensor,
                      A:torch.Tensor,
                      mask:Optional[torch.Tensor]=None,
                      past_G_values: Optional[torch.Tensor]=None,
                      past_G_values_status: Optional[torch.BoolTensor]=None,
                      use_cache: Optional[bool]=None,
                      **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
        '''
        Method for linear propagation of attention weights over values.

        Calls cg_align_one with this layer's own parameters (each of them is
        None when a custom G is configured) and returns its result unchanged.
        '''

        Hout, att_weights=self.cg_align_one(Hin=Hin, Hk=Hk, Hv=Hv, A=A,
                                            a_vec=self.alst,
                                            ba=self.balst,
                                            W=self.Wlst,
                                            b=self.blst,
                                            pw=self.pwlst,
                                            mask=mask,
                                            past_G_values=past_G_values,
                                            past_G_values_status=past_G_values_status,
                                            use_cache=use_cache,
                                            **kwargs)

        return Hout, att_weights

    def build_weights(self)->None:
        '''
        Used to initialize learnable parameters for the layer:
            W: weights to apply on metric tensor.
            b: bias to apply on metric tensor.
            a: coupling coefficients for energy-curvature (G) tensor.
            ba: bias for energy-curvature tensor.
            pw: power exponent weights for potential tensor.

        Every parameter has shape [F_heads, F_hidden, F_hidden] and is created
        with torch.empty, i.e. its values are not initialized here.
        '''

        weight_shape=[self.F_heads, self.F_hidden, self.F_hidden]

        add_weight_Wpart= torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
        add_weight_bpart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
        add_weight_pwpart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
        add_weight_apart = torch.empty(weight_shape, dtype=self.wdtype, device=self.device)
        add_weight_bapart=torch.empty(weight_shape, dtype=self.wdtype, device=self.device)

        self.Wlst = nn.Parameter(add_weight_Wpart, requires_grad=True)
        self.blst = nn.Parameter(add_weight_bpart, requires_grad=True)
        self.pwlst = nn.Parameter(add_weight_pwpart, requires_grad=True)
        self.alst = nn.Parameter(add_weight_apart, requires_grad=True)
        self.balst = nn.Parameter(add_weight_bapart, requires_grad=True)

    def forward(self, inputs:tuple[torch.Tensor,...],
                past_G_values: Optional[torch.Tensor]=None,
                past_G_values_status: Optional[torch.BoolTensor]=None,
                use_cache:Optional[bool]=False,
                **kwargs)->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
        '''
        execute the forward propagation
        inputs[0] = query = Hin
        inputs[1] = key = Hk
        inputs[2] = value = Hv
        inputs[3] = metric tensor = A
        inputs[4] = mask

        Returns the attention output and the tuple of PLGA tensors produced
        by cg_align_one.
        '''

        Hin, Hk, Hv, A, mask=inputs
        H_next, att_weights = self.cg_align_head(Hin=Hin, Hk=Hk, Hv=Hv, A=A, mask=mask,
                                                 past_G_values=past_G_values,
                                                 past_G_values_status=past_G_values_status,
                                                 use_cache=use_cache, **kwargs)
        return H_next, att_weights
|
|
|
|
|
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs:Unpack[TransformersKwargs],
)->tuple[torch.Tensor, torch.Tensor]:
    '''
    Plain (eager) scaled dot-product attention.

    Args:
        module: calling module; only its `training` flag is read, to switch dropout.
        query: query tensor; indexed as 4-D [d0, d1, q_len, dim].
        key: key tensor; indexed as 4-D [d0, d1, kv_len, dim].
        value: value tensor, multiplied by the attention weights.
        attention_mask: optional additive mask. Its last axis is cut to kv_len
                        before it is added to the scores.
        scaling: factor applied to the raw scores.
        dropout: dropout probability on the attention weights.
        **kwargs: accepted for interface compatibility, not used.
    Returns:
        The attention output with axes 1 and 2 swapped (made contiguous) and
        the attention weights after softmax and dropout.
    '''

    scores = torch.matmul(query, key.permute(0, 1, 3, 2)) * scaling

    if attention_mask is not None:
        kv_len = key.shape[-2]
        scores = scores + attention_mask[:, :, :, :kv_len]

    # Softmax runs in float32 and is cast back to the query dtype afterwards.
    probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = F.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, value).permute(0, 2, 1, 3).contiguous()

    return context, probs
|
|
|
|
|
def iSwiGLU(x):
    '''SwiGLU with identity weights and no bias: the input multiplied by its own SiLU activation.'''
    return x * F.silu(x)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
Model Implementation for Large Language Model from Power Law Decoder Representations with KV-cache and G-cache. |
|
|
''' |
|
|
|
|
|
class PldrllmAttention(nn.Module):
    '''
    Power Law Multihead Attention Implementation for PLDR-LLM.

    Projects the inputs to query/key/value, applies rotary position embeddings,
    obtains the metric tensor A (from the residual metric learner or from the
    G-cache) and runs the power law graph attention layer.
    '''
    def __init__(self,config: PldrllmConfig,
                 layer_idx:int,
                 device=None,
                 **kwargs)->None:
        '''
        Args:
            config: model configuration.
            layer_idx: index for the decoder layer; used to index the KV-cache and G-cache.
            device: device(cpu or gpu) to load tensors.
            **kwargs: forwarded to nn.Module.__init__.
        Raises:
            ValueError: if hidden_size is not divisible by num_attention_heads, or
                        if head_dim * num_attention_heads differs from hidden_size.
        '''

        super().__init__(**kwargs)
        self.num_heads = config.num_attention_heads
        self.d_model = config.hidden_size
        self.A_dff = config.A_dff
        self.num_denseA = config.num_denseA
        self.num_reslayerA = config.num_reslayerA
        self.activation=ACT2FN[config.hidden_act]
        self.max_seq_len=config.max_position_embeddings
        self.layer_idx=layer_idx
        self.device=device
        self.attention_bias=config.attention_bias
        self.custom_G_type=config.custom_G_type
        self.layer_norm_eps=config.layer_norm_eps
        self.glu_bias=config.glu_bias
        self.reference_rope=config.reference_rope
        # dtype handed to the sub-modules; None leaves the choice to torch.
        self.wdtype=None

        # Explicit errors instead of assert, which is removed when Python runs with -O.
        if self.d_model % self.num_heads != 0:
            raise ValueError(
                f"hidden_size ({self.d_model}) must be divisible by "
                f"num_attention_heads ({self.num_heads})."
            )
        self.depth = config.head_dim
        # split_heads and the final reshape both rely on this identity.
        if self.depth * self.num_heads != self.d_model:
            raise ValueError(
                f"head_dim ({self.depth}) * num_attention_heads ({self.num_heads}) "
                f"must equal hidden_size ({self.d_model})."
            )

        self.wq = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)
        self.wk = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)
        self.wv = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)

        self.plgatt_layer= PlgaLayer(config=config,
                                     F_hidden=self.depth,
                                     F_heads= self.num_heads,
                                     layer_idx=self.layer_idx,
                                     device=self.device)

        self.dense = nn.Linear(self.d_model, self.d_model, bias=self.attention_bias, device=self.device, dtype=self.wdtype)

        if self.custom_G_type is None:
            # Residual layers and input normalization of the metric learner;
            # both are only used when the metric tensor is learned.
            self.reslayerAs=nn.ModuleList([ResLayerA(depth=self.depth,
                                                     A_dff=self.A_dff,
                                                     num_denseA=self.num_denseA,
                                                     layer_norm_eps=self.layer_norm_eps,
                                                     glu_bias=self.glu_bias,
                                                     activation=self.activation,
                                                     device=self.device,
                                                     dtype=self.wdtype) for _ in range(self.num_reslayerA)])

            self.layernorm1 = nn.LayerNorm(self.depth, eps=self.layer_norm_eps, device=self.device, dtype=self.wdtype)

        if self.reference_rope:
            # Rotary embedding with a precomputed cache, used by forward when
            # no position_embeddings are passed in.
            self.rotary_embedding=RotaryPositionalEmbeddings(dim=self.depth,
                                                             max_seq_len=self.max_seq_len,
                                                             base=config.rope_theta
                                                             ).to(device=self.device, dtype=self.wdtype)

    def split_heads(self, x, batch_size):
        '''
        Split the last dimension into (num_heads, depth).
        '''
        x = x.view(batch_size, -1, self.num_heads, self.depth)
        return x

    def forward(self, inputs:tuple[torch.Tensor, ...],
                position_embeddings:torch.Tensor,
                position_ids: Optional[torch.LongTensor]=None,
                cache_position:Optional[torch.LongTensor]=None,
                past_G_values: Optional[torch.Tensor]=None,
                past_G_values_status: Optional[torch.BoolTensor]=None,
                past_key_values: Optional[Cache]=None,
                use_cache:Optional[bool]=None,
                **kwargs: Unpack[TransformersKwargs]
                )->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
        '''
        Args:
            inputs: (q, k, v, mask); q, k and v are the tensors fed to the
                    query/key/value projections.
            position_embeddings: (cos, sin) pair for rotary embeddings. When None,
                                 self.rotary_embedding is applied with position_ids instead
                                 (that module only exists if config.reference_rope is set).
            position_ids: position ids handed to self.rotary_embedding.
            cache_position: passed to the KV-cache update.
            past_G_values: G-cache, see PlgaLayer.cg_align_one.
            past_G_values_status: per-layer flags for filled G-cache entries.
            past_key_values: KV-cache; updated when use_cache is set.
            use_cache: whether the KV-cache and G-cache are used.
            **kwargs: forwarded to the power law graph attention layer.
        Returns:
            The attention output after the final dense projection and the tuple
            of PLGA tensors returned by the power law graph attention layer.
        '''

        q, k, v, mask = inputs
        batch_size = q.size()[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        # [batch, seq, d_model] -> [batch, seq, num_heads, depth]
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # Rotary embeddings are applied while the head axis is still at position 2.
        if position_embeddings is not None:
            cos, sin = position_embeddings
            q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, unsqueeze_dim=2)
        else:
            q=self.rotary_embedding(q, input_pos=position_ids)
            k=self.rotary_embedding(k, input_pos=position_ids)

        # [batch, seq, num_heads, depth] -> [batch, num_heads, seq, depth]
        q = torch.permute(q, [0, 2, 1, 3])
        k = torch.permute(k, [0, 2, 1, 3])
        v = torch.permute(v, [0, 2, 1, 3])

        # Same condition as in PlgaLayer.cg_align_one: learn A unless a custom G
        # is configured or this layer's G-cache entry is already filled.
        if self.custom_G_type is None and not (use_cache and past_G_values_status[self.layer_idx]):

            # Metric tensor from the query: [batch, num_heads, depth, depth].
            qt = torch.permute(q, [0, 1, 3, 2])
            A = torch.matmul(qt, q)
            A=self.layernorm1(A)

            for reslayer in self.reslayerAs:
                A=reslayer([A])
        else:
            A=past_G_values[self.layer_idx,0]

        if use_cache:
            cache_kwargs = {"cache_position": cache_position}
            k, v = past_key_values.update(key_states=k, value_states=v, layer_idx=self.layer_idx, cache_kwargs=cache_kwargs)

        Hnext, att_weights = self.plgatt_layer((q, k, v, A, mask),
                                               past_G_values,
                                               past_G_values_status,
                                               use_cache, **kwargs)

        # Merge the heads back into the model dimension.
        Hnext= Hnext.reshape(batch_size, -1, self.d_model)

        output = self.dense(Hnext)

        return output, att_weights
|
|
|
|
|
|
|
|
class PLDR_DecoderLayer(GradientCheckpointingLayer):
    '''
    One PLDR-LLM decoder block: masked power law multi-head self-attention
    followed by a gated feed-forward network. Each sub-layer output is added
    to its input and then layer-normalized.
    '''
    def __init__(self, config: PldrllmConfig,
                 layer_idx:int,
                 device=None,
                 **kwargs)->None:
        '''
        Args:
            config: model configuration.
            layer_idx: index of this decoder layer.
            device: device(cpu or gpu) to load tensors.
            **kwargs: forwarded to the parent constructor.
        '''
        super().__init__(**kwargs)

        # Values copied from the configuration.
        self.d_model = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.dff = config.intermediate_size
        self.A_dff = config.A_dff
        self.num_denseA = config.num_denseA
        self.num_reslayerA = config.num_reslayerA
        self.activation = ACT2FN[config.hidden_act]
        self.max_seq_len = config.max_position_embeddings
        self.layer_norm_eps = config.layer_norm_eps
        self.glu_bias = config.glu_bias

        # Values given by the caller.
        self.layer_idx = layer_idx
        self.device = device
        self.wdtype = None

        # Sub-modules: attention, feed-forward network, then the two layer norms.
        self.mha1 = PldrllmAttention(config=config, layer_idx=layer_idx, device=self.device)
        self.ffn = self.dec_point_wise_feed_forward_network()

        norm_kwargs = {"eps": self.layer_norm_eps, "device": self.device, "dtype": self.wdtype}
        self.layernorm1 = nn.LayerNorm(self.d_model, **norm_kwargs)
        self.layernorm2 = nn.LayerNorm(self.d_model, **norm_kwargs)

    def forward(self,
                hidden_states:torch.Tensor,
                look_ahead_mask:torch.Tensor,
                position_embeddings:torch.Tensor,
                position_ids:Optional[torch.LongTensor]=None,
                cache_position:Optional[torch.LongTensor]=None,
                use_cache:Optional[bool]=None,
                past_key_values:Optional[Cache]=None,
                past_G_values:Optional[torch.Tensor]=None,
                past_G_values_status:Optional[list[bool]]=None,
                **kwargs:Unpack[TransformersKwargs]
                )->tuple[torch.Tensor, tuple[torch.Tensor,...]]:
        '''
        Run self-attention and the feed-forward network on hidden_states.

        hidden_states is used as query, key and value input of the attention
        module; every other argument is passed through to it unchanged.

        Returns:
            The layer output and the tuple of attention tensors returned by
            the attention module.
        '''
        self_attn_inputs = (hidden_states, hidden_states, hidden_states, look_ahead_mask)
        attn_out, att_weights = self.mha1(
            inputs=self_attn_inputs,
            position_embeddings=position_embeddings,
            position_ids=position_ids,
            cache_position=cache_position,
            past_key_values=past_key_values,
            past_G_values=past_G_values,
            past_G_values_status=past_G_values_status,
            use_cache=use_cache,
            **kwargs
        )

        # Residual connection, then normalization, for both sub-layers.
        attn_block = self.layernorm1(attn_out + hidden_states)
        ffn_block = self.layernorm2(self.ffn(attn_block) + attn_block)

        return ffn_block, att_weights

    def dec_point_wise_feed_forward_network(self):
        '''Build the gated feed-forward network mapping d_model -> dff -> d_model.'''
        ffn = GLUVariant(self.d_model, self.dff, self.d_model,
                         glu_bias=self.glu_bias,
                         activation=self.activation,
                         device=self.device,
                         dtype=self.wdtype)
        return ffn
|
|
|
|
|
|
|
|
class ResLayerA(nn.Module):
    '''
    Residual block of the PLDR-LLM metric learner: a stack of gated linear
    units whose output is added to the block input and layer-normalized.
    '''
    def __init__(self, depth:int,
                 A_dff:int,
                 num_denseA:int,
                 layer_norm_eps:float,
                 glu_bias:bool,
                 activation:Callable=F.silu,
                 device=None,
                 dtype=None,
                 **kwargs)->None:
        '''
        Args:
            depth: size of the last axis of the input; also the output size of every gated unit.
            A_dff: hidden size of every gated unit.
            num_denseA: number of gated units applied in sequence.
            layer_norm_eps: epsilon of the layer normalization.
            glu_bias: whether the linear layers of the gated units have a bias.
            activation: activation used inside the gated units.
            device: device(cpu or gpu) to load tensors.
            dtype: dtype of the created weights.
            **kwargs: forwarded to nn.Module.__init__.
        '''
        super().__init__(**kwargs)
        self.depth = depth
        self.A_dff = A_dff
        self.num_denseA = num_denseA
        self.activation = activation
        self.device = device
        self.layer_norm_eps = layer_norm_eps
        self.glu_bias = glu_bias

        gated_units = [
            GLUVariant(self.depth, self.A_dff, self.depth,
                       glu_bias=self.glu_bias,
                       activation=self.activation,
                       device=self.device,
                       dtype=dtype)
            for _ in range(self.num_denseA)
        ]
        self.denseAs = nn.ModuleList(gated_units)

        self.layernormA = nn.LayerNorm(self.depth, eps=self.layer_norm_eps, device=self.device, dtype=dtype)
        self.identity = nn.Identity()

    def ResUnit(self, A:torch.Tensor)->torch.Tensor:
        '''Apply the gated units to A, add the original A and normalize the sum.'''
        shortcut = self.identity(A)
        transformed = A
        for gated_unit in self.denseAs:
            transformed = gated_unit(transformed)
        return self.layernormA(transformed + shortcut)

    def forward(self, inputs:list[torch.Tensor], **kwargs)->torch.Tensor:
        '''Run the residual unit on inputs[0]; extra keyword arguments are ignored.'''
        return self.ResUnit(inputs[0])
|
|
|
|
|
|
|
|
class GLUVariant(nn.Module):
    '''
    Gated linear unit block; with the default activation this is SwiGLU.
    To match the parameter count of a non-gated feed-forward network, scaling
    the hidden size dff by 2/3 may be useful.
    '''
    def __init__(self, d_model:int,
                 dff:int,
                 depth:int,
                 glu_bias:bool,
                 activation:Callable=F.silu,
                 device=None,
                 dtype=None,
                 **kwargs)->None:
        '''
        Args:
            d_model: input feature size.
            dff: hidden feature size of the two parallel projections.
            depth: output feature size.
            glu_bias: whether the three linear layers have a bias.
            activation: activation applied to the first projection.
            device: device(cpu or gpu) to load tensors.
            dtype: dtype of the created weights.
            **kwargs: forwarded to nn.Module.__init__.
        '''
        super().__init__(**kwargs)
        self.dff = dff
        self.depth = depth
        self.d_model = d_model
        self.activation = activation
        self.device = device
        self.glu_bias = glu_bias

        linear_kwargs = {"bias": self.glu_bias, "device": self.device, "dtype": dtype}
        # gluw1 feeds the activation, gluw2 is the linear branch, gluw3 projects the product.
        self.gluw1 = nn.Linear(self.d_model, self.dff, **linear_kwargs)
        self.gluw2 = nn.Linear(self.d_model, self.dff, **linear_kwargs)
        self.gluw3 = nn.Linear(self.dff, self.depth, **linear_kwargs)

    def forward(self, input:torch.Tensor, **kwargs)->torch.Tensor:
        '''Return gluw3(activation(gluw1(input)) * gluw2(input)); extra keyword arguments are ignored.'''
        activated = self.activation(self.gluw1(input))
        linear_branch = self.gluw2(input)
        return self.gluw3(activated * linear_branch)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class RotaryPositionalEmbeddings(nn.Module):
    """
    Rotary Positional Embeddings (RoPE) as proposed in
    https://arxiv.org/abs/2104.09864.

    A reference implementation (used for correctness verification) is
    available at
    https://github.com/meta-llama/llama/blob/main/llama/model.py#L80

    The cosine and sine values for every position up to ``max_seq_len`` are
    computed once during init and stored in a non-persistent buffer.

    Args:
        dim (int): Embedding dimension. This is usually set to the dim of each
            head in the attention module computed as ``embed_dim // num_heads``
        max_seq_len (int): Number of positions for which cosine and sine
            values are precomputed
        base (int): The base for the geometric progression used to compute
            the rotation angles
    """

    def __init__(
        self,
        dim: int,
        max_seq_len: int = 4096,
        base: int = 10_000,
    ) -> None:
        super().__init__()
        self.dim = dim
        self.base = base
        self.max_seq_len = max_seq_len
        self.rope_init()

    def rope_init(self):
        """Compute the rotation frequencies and build the position cache."""
        exponents = torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim
        theta = 1.0 / (self.base ** exponents)
        self.register_buffer("theta", theta, persistent=False)
        self.build_rope_cache(self.max_seq_len)

    def build_rope_cache(self, max_seq_len: int = 4096) -> None:
        """Store cos/sin of position * theta in the buffer ``cache``."""
        positions = torch.arange(
            max_seq_len, dtype=self.theta.dtype, device=self.theta.device
        )

        # Outer product of positions and frequencies: [max_seq_len, dim // 2]
        angles = torch.einsum("i, j -> ij", positions, self.theta).float()

        # Last axis holds (cos, sin): [max_seq_len, dim // 2, 2]
        cos_sin = torch.stack([torch.cos(angles), torch.sin(angles)], dim=-1)
        self.register_buffer("cache", cos_sin, persistent=False)

    def forward(
        self, x: torch.Tensor, *, input_pos: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            x (torch.Tensor): input tensor with shape
                ``[b, s, n_h, h_d]``
            input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids
                of each token, shape [b, s]. It is used to index the cache.
                If none, the index of a token in the sequence is used as its position id.
                Default is None.

        Returns:
            torch.Tensor: output tensor with shape ``[b, s, n_h, h_d]``

        Notation used for tensor shapes:
            - b: batch size
            - s: sequence length
            - n_h: num heads
            - h_d: head dim
        """
        seq_len = x.size(1)

        # [s, h_d // 2, 2] without position ids, [b, s, h_d // 2, 2] with them.
        if input_pos is None:
            rope_cache = self.cache[:seq_len]
        else:
            rope_cache = self.cache[input_pos]

        # Group the last axis into pairs, in float32: [b, s, n_h, h_d // 2, 2]
        pairs = x.float().reshape(*x.shape[:-1], -1, 2)

        # Insert a head axis of size 1 so the cache broadcasts over the heads.
        rope_cache = rope_cache.view(-1, pairs.size(1), 1, pairs.size(3), 2)

        first, second = pairs[..., 0], pairs[..., 1]
        cos, sin = rope_cache[..., 0], rope_cache[..., 1]

        # 2-D rotation of every pair by its position-dependent angle.
        rotated = torch.stack(
            [
                first * cos - second * sin,
                second * cos + first * sin,
            ],
            -1,
        )

        # Merge the pairs back: [b, s, n_h, h_d], in the dtype of the input.
        return rotated.flatten(3).type_as(x)
|
|
|
|
|
|
|
|
|
|
|
class PldrllmRotaryEmbedding(nn.Module):
    """Computes the cosine and sine tables of the rotary embedding for given position ids."""

    def __init__(self, config: PldrllmConfig, device=None):
        """
        Args:
            config: model configuration; ``rope_scaling`` (optional) and
                ``max_position_embeddings`` are read here.
            device: device passed to the RoPE init function.
        """
        super().__init__()

        # The RoPE variant comes from config.rope_scaling when that is set.
        rope_scaling = getattr(config, "rope_scaling", None)
        if rope_scaling is None:
            self.rope_type = "default"
        else:
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))

        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Second reference to the initial frequencies, next to the buffer.
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        """
        Args:
            x: tensor whose device and dtype determine those of the result.
            position_ids: position ids, indexed as a 2-D tensor ``[batch, seq]``.

        Returns:
            Tuple ``(cos, sin)``, both multiplied by ``attention_scaling`` and
            cast to the dtype of ``x``.
        """
        batch = position_ids.shape[0]
        # [batch, n_freq, 1] and [batch, 1, seq], both in float32.
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        # "mps" and non-string device types fall back to "cpu" for autocast.
        if isinstance(x.device.type, str) and x.device.type != "mps":
            device_type = x.device.type
        else:
            device_type = "cpu"

        # Autocast is disabled so the angles are computed in float32.
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        target_dtype = x.dtype
        return cos.to(dtype=target_dtype), sin.to(dtype=target_dtype)
|
|
|
|
|
|
|
|
def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
|
|
|
|
|
|
|
|
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            Axis along which `cos` and `sin` are unsqueezed so that they broadcast against `q` and `k`.
            For example, with `cos` and `sin` of shape [batch_size, seq_len, head_dim], use
            `unsqueeze_dim=1` when `q` and `k` have the shape [batch_size, heads, seq_len, head_dim]
            and `unsqueeze_dim=2` when they have the shape [batch_size, seq_len, heads, head_dim].
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same rotation formula for query and key.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class BasePLDRModelOutputWithPast(ModelOutput):
    """
    Base class for [`PldrllmModel`] outputs that may also contain past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.

            The tuple for each layer contains:
            output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
            output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
    """
    last_hidden_state: Optional[torch.Tensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
    attentions: Optional[tuple[torch.Tensor, ...]] = None
    # NOTE(review): PlgaLayer.cg_align_one returns the exponents, coupling weights and bias
    # as the layer parameters themselves, which PlgaLayer.build_weights creates with shape
    # (num_heads, head_dim, head_dim), i.e. without a batch axis. Confirm whether the model
    # adds a batch axis before filling this field, or whether the docstring shapes need fixing.
    pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None
|
|
|
|
|
@dataclass
class CausalPLDRLLMOutputWithPast(ModelOutput):
    """
    Base class for [`PldrllmForCausalLM`] causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters of power law graph attention module.

            The tuple for each layer contains:
            output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
            output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
    """
    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
    attentions: Optional[tuple[torch.Tensor, ...]] = None
    # NOTE(review): PlgaLayer.cg_align_one returns the exponents, coupling weights and bias
    # as the layer parameters themselves, which PlgaLayer.build_weights creates with shape
    # (num_heads, head_dim, head_dim), i.e. without a batch axis. Confirm whether the model
    # adds a batch axis before filling this field, or whether the docstring shapes need fixing.
    pldr_attentions:Optional[tuple[tuple[torch.Tensor, ...]]] = None
|
|
|
|
|
@dataclass
class TokenClassifierPLDRLLMOutput(ModelOutput):
    """
    Output container returned by the [`PldrllmForTokenClassification`] token classification model.

    Args:
        loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification loss.
        logits (`torch.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
            Classification scores (before SoftMax).
        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters
            of power law graph attention module.

            The tuple for each layer contains, in order:
            - output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
    """

    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
    attentions: Optional[tuple[torch.Tensor, ...]] = None
    pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...]]] = None
|
|
|
|
|
@dataclass
class QuestionAnsweringPLDRModelOutput(ModelOutput):
    """
    Output container returned by the [`PldrllmForQuestionAnswering`] question answering model.

    Args:
        loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
        start_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Span-start scores (before SoftMax).
        end_logits (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Span-end scores (before SoftMax).
        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters
            of power law graph attention module.

            The tuple for each layer contains, in order:
            - output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
    """

    loss: Optional[torch.Tensor] = None
    start_logits: Optional[torch.Tensor] = None
    end_logits: Optional[torch.Tensor] = None
    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
    attentions: Optional[tuple[torch.Tensor, ...]] = None
    pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...]]] = None
|
|
|
|
|
@dataclass
class SequenceClassifierPLDRLLMOutputWithPast(ModelOutput):
    """
    Output container returned by the [`PldrllmForSequenceClassification`] sentence classification model.

    Args:
        loss (`torch.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        pldr_attentions (`tuple(tuple(torch.Tensor))`, *optional*, returned when `output_pldr_attentions=True` is passed or when `config.output_pldr_attentions=True`):
            Tuple of `tuple(torch.Tensor)` (one for each layer) of the deductive outputs and learnable parameters
            of power law graph attention module.

            The tuple for each layer contains, in order:
            - output of the residual metric learner (metric tensor, A) of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - output after application of iSwiGLU on metric tensor, A_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned exponents of potential tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned weights for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - learned bias for energy-curvature tensor of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - energy-curvature tensor G_LM of shape `(batch_size, num_heads, head_dim,head_dim)`,
            - attention weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
    """

    loss: Optional[torch.Tensor] = None
    logits: Optional[torch.Tensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.Tensor, ...]] = None
    attentions: Optional[tuple[torch.Tensor, ...]] = None
    pldr_attentions: Optional[tuple[tuple[torch.Tensor, ...]]] = None
|
|
|
|
|
|
|
|
@auto_docstring
class PldrllmPreTrainedModel(PreTrainedModel):
    # Common base of the PLDR-LLM model classes: declares the capability flags
    # read by the Transformers loading machinery and the weight initialisation
    # scheme applied to each sub-module.
    config_class = PldrllmConfig
    base_model_prefix = "decoder"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PLDR_DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = False
    _supports_attention_backend = True
    _can_compile_fullgraph = False

    def __init__(self, config: PldrllmConfig) -> None:
        super().__init__(config)
        self.custom_G_type = config.custom_G_type
        # Full-graph compilation is only advertised when G values are
        # predefined (custom_G_type set); the class-level default stays False.
        if self.custom_G_type is not None:
            self._can_compile_fullgraph = True

    def _init_weights(self, module):
        """Initialise the parameters of a single sub-module in place.

        `nn.Linear` weights use Xavier-uniform with zeroed bias, `nn.Embedding`
        weights are drawn from N(0, 1) with the padding row zeroed,
        `nn.LayerNorm` is reset to weight 1 / bias 0, and `PlgaLayer`
        parameters `Wlst`, `pwlst`, `alst` use Xavier-uniform while `blst`
        and `balst` are zeroed. Parameters that are `None` are left alone.
        """
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight.data)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=1.0)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, PlgaLayer):
            # Order matters for the random draws: Wlst, then pwlst, then alst.
            for weight_name in ("Wlst", "pwlst", "alst"):
                weight = getattr(module, weight_name)
                if weight is not None:
                    nn.init.xavier_uniform_(weight.data)
            for bias_name in ("blst", "balst"):
                bias = getattr(module, bias_name)
                if bias is not None:
                    bias.data.zero_()
|
|
|
|
|
# Documentation for the PLDR-LLM specific `forward` arguments. It is handed to
# `auto_docstring(custom_args=...)` on the `forward` methods of the model classes.
MODEL_COMMON_CUSTOM_ARGS = r"""
    output_pldr_attentions (`bool`, *optional*, defaults to `False`):
        Whether to return the deductive outputs and learnable parameters of power law graph attention module as tuple containing:
        the output of the residual metric learner (metric tensor, A), output (A_LM) after application of iSwiGLU on metric tensor, learned
        exponents of potential tensor, learned weights for energy-curvature tensor, learned bias for
        energy-curvature tensor, energy-curvature tensor (G_LM), and attention weights.
    cache_first_G (`bool`, *optional*, defaults to `False`):
        Whether or not the model should return the G values from first sample in a batch or G values from all samples for past_G_values initialization.
        When `cache_first_G=true`, the batch_size of past_G_values is 1. This argument should be set to True for contrastive text generation
        with learned G values.
"""
|
|
|
|
|
|
|
|
@auto_docstring(custom_intro="""
    Large Language Model From Power Law Decoder Representations (PLDR-LLM) with decoder hidden state as output.
    PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers.
    For details of model architecture, check out these papers:
    [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
    """
)
class PldrllmModel(PldrllmPreTrainedModel):
    # Decoder stack: token embedding -> sqrt(d_model) scaling -> LayerNorm ->
    # `num_hidden_layers` PLDR decoder layers. Besides the KV cache it keeps a
    # "G cache" (`past_G_values` / `past_G_values_status`) as module buffers.

    def __init__(self, config: PldrllmConfig) -> None:
        super().__init__(config)

        self.num_layers = config.num_hidden_layers
        self.d_model = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.target_vocab_size = config.vocab_size
        self.max_seq_len = config.max_position_embeddings
        self.reference_rope = config.reference_rope
        self.pldr_device = None
        self.gradient_checkpointing = False
        self.layer_norm_eps = config.layer_norm_eps
        self.wdtype = None

        # Explicit exception instead of `assert`, which is stripped under `python -O`.
        if self.d_model % self.num_heads != 0:
            raise ValueError(
                f"config.hidden_size ({self.d_model}) must be divisible by "
                f"config.num_attention_heads ({self.num_heads})."
            )
        self.depth = config.head_dim

        self.custom_G_type = config.custom_G_type

        if self.custom_G_type is not None:
            # Predefined G values are part of the checkpoint (persistent buffers).
            past_G_values, past_G_values_status = self.G_values_init(device=self.pldr_device, dtype=self.wdtype)
            self.register_buffer("past_G_values_status", past_G_values_status, persistent=True)
            self.register_buffer("past_G_values", past_G_values, persistent=True)

            logger.warning("\nIMPORTANT: decoder.past_G_values are set to predefined values and deep PLGA layers will be skipped. "
                           "Set config.custom_G_type=None to enable deep PLGA layers.")
            if self.custom_G_type == "external":
                logger.warning("\nIMPORTANT: config.custom_G_type is selected as 'external' and an external value of decoder.past_G_values[:,2,...] is expected. "
                               "decoder.past_G_values[:,2,...] are initialized to identity tensor by default. This is equivalent to an LLM with SDPA. To provide external values "
                               "to the decoder.past_G_values, either load these values along with the pretrained model or set decoder.past_G_values to a torch.float tensor of "
                               "size (num_layers, 3, 1, num_heads, head_dim, head_dim) after model is initialized.\n")
        else:
            # Learned G values are produced at run time; the buffers are
            # placeholders that `forward` fills in and they are not saved.
            self.register_buffer("past_G_values_status", None, persistent=False)
            self.register_buffer("past_G_values", None, persistent=False)
            self.is_past_G_values_initialized = False

        self.embedding = nn.Embedding(self.target_vocab_size, self.d_model, device=self.pldr_device, dtype=self.wdtype)

        self.dec_layers = nn.ModuleList([PLDR_DecoderLayer(config,
                                                           layer_idx=i,
                                                           device=self.pldr_device) for i in range(self.num_layers)])

        self.layernorm1 = nn.LayerNorm(self.d_model, eps=self.layer_norm_eps, device=self.pldr_device, dtype=self.wdtype)

        if not self.reference_rope:
            self.rotary_embedding = PldrllmRotaryEmbedding(config=config)

        self.post_init()

    def G_values_init(self, batch_size=1, device=None, dtype=None):
        """Create the G-cache tensor and its per-layer status flags.

        Args:
            batch_size: batch dimension of the cache when `custom_G_type` is
                `None`; ignored for predefined G values, whose batch dimension is 1.
            device: device on which the tensors are created.
            dtype: dtype of the G-value tensor.

        Returns:
            A `(past_G_values, past_G_values_status)` pair. `past_G_values` has shape
            `(num_layers, 3, batch, num_heads, head_dim, head_dim)`; for predefined
            types slots 0 and 1 along dim 1 are zeros and slot 2 holds the identity
            ('identity', 'external') or standard-normal ('random') tensor.
            `past_G_values_status` is a bool tensor of shape `(num_layers,)`, all
            `False` when `custom_G_type` is `None` and all `True` otherwise.

        Raises:
            ValueError: if `custom_G_type` is not `None`, 'identity', 'random' or 'external'.

        Side effects:
            Sets `self.is_past_G_values_initialized` to `True`.
        """
        # Validate before allocating anything.
        if self.custom_G_type is not None and self.custom_G_type not in ('identity', 'random', 'external'):
            raise ValueError("Invalid custom_G_type value. Available values are "
                             "None, 'identity', 'random', and 'external'.")

        if self.custom_G_type is None:
            past_G_values = torch.zeros((self.num_layers, 3, batch_size, self.num_heads, self.depth, self.depth), device=device, dtype=dtype)
            past_G_values_status = torch.zeros(self.num_layers, dtype=torch.bool, device=device)
        else:
            # Only the requested tensor is built, so the global RNG is touched
            # solely for 'random' (previously every call drew random numbers).
            G_values_dim = (self.num_layers, 1, self.num_heads, self.depth, self.depth)
            zeros_tensor = torch.zeros(G_values_dim, device=device, dtype=dtype)
            if self.custom_G_type == 'random':
                g_tensor = torch.randn(G_values_dim, device=device, dtype=dtype)
            else:
                # 'identity' and 'external' both start from the identity tensor.
                g_tensor = torch.eye(self.depth).repeat(self.num_layers, 1, self.num_heads, 1, 1).to(device=device, dtype=dtype)
            past_G_values = torch.stack([zeros_tensor, zeros_tensor, g_tensor], dim=1)
            past_G_values_status = torch.ones(self.num_layers, dtype=torch.bool, device=device)

        self.is_past_G_values_initialized = True
        return past_G_values, past_G_values_status

    @can_return_tuple
    @auto_docstring(
        custom_args=MODEL_COMMON_CUSTOM_ARGS
    )
    def forward(self,
                input_ids: Optional[torch.LongTensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                past_key_values: Optional[Cache] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                use_cache: Optional[bool] = None,
                output_attentions: Optional[bool] = None,
                output_pldr_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                cache_position: Optional[torch.LongTensor] = None,
                cache_first_G: Optional[bool] = None,
                **kwargs: Unpack[TransformersKwargs]
                ):
        # Runs the decoder stack and returns a `BasePLDRModelOutputWithPast`.
        # Raises ValueError when not exactly one of `input_ids`/`inputs_embeds`
        # is given, when `past_key_values` is neither `None` nor a `Cache`, or
        # when a `StaticCache` is combined with learned G values / flash attention.

        # Per-call arguments fall back to the config defaults.
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        cache_first_G = cache_first_G if cache_first_G is not None else self.config.cache_first_G
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_pldr_attentions = output_pldr_attentions if output_pldr_attentions is not None else self.config.output_pldr_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

        if (self.gradient_checkpointing or self.training) and use_cache:
            logger.warning_once(
                "During training, setting `use_cache=False`. Additionally, `use_cache=True` is incompatible with gradient checkpointing."
            )
            use_cache = False

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        inputs_embeds = self.embedding(input_ids) if inputs_embeds is None else inputs_embeds

        dec_att_weights = () if output_pldr_attentions else None
        dec_attentions = () if output_attentions else None

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        # An empty (non-static) cache marks the start of a new sequence: the
        # learned G values from the previous one must be recomputed.
        if use_cache and self.custom_G_type is None and not isinstance(past_key_values, StaticCache) and past_key_values.get_seq_length() == 0:
            self.past_G_values_status = torch.zeros(self.num_layers, dtype=torch.bool, device=inputs_embeds.device)
            self.is_past_G_values_initialized = False

        if use_cache and isinstance(past_key_values, StaticCache) and ((self.custom_G_type is None) or
                                                                      "flash_attention" in self.config._attn_implementation):
            raise ValueError("Static Cache is only supported with predefined past_G_values. "
                             "Flash attention is not supported. "
                             "Supported models are with config.custom_G_type set to 'random', 'identity' or 'external'.")

        if not self.is_past_G_values_initialized and self.custom_G_type is None:
            if use_cache:
                batch_size = 1 if cache_first_G else inputs_embeds.size()[0]
                self.past_G_values, self.past_G_values_status = self.G_values_init(batch_size=batch_size,
                                                                                   device=inputs_embeds.device,
                                                                                   dtype=inputs_embeds.dtype)
            else:
                self.past_G_values_status = torch.zeros(self.num_layers, dtype=torch.bool, device=inputs_embeds.device)
                self.past_G_values = None
            self.is_past_G_values_initialized = True

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids
        )

        # With reference RoPE the rotary tables are not precomputed here.
        if not self.reference_rope:
            position_embeddings = self.rotary_embedding(inputs_embeds, position_ids)
        else:
            position_embeddings = None

        # Out-of-place scaling: the previous in-place `*=` rescaled a
        # caller-supplied `inputs_embeds` and fails on leaf tensors that
        # require grad. The scale factor is computed exactly as before.
        embed_scale = torch.sqrt(torch.tensor(self.d_model).to(dtype=inputs_embeds.dtype))
        hidden_states = inputs_embeds * embed_scale

        # First reported hidden state is the scaled embedding (same values as before).
        dec_outputs = (hidden_states,) if output_hidden_states else None

        hidden_states = self.layernorm1(hidden_states)

        for decoder_layer in self.dec_layers[: self.num_layers]:
            hidden_states, dec_att_w = decoder_layer(hidden_states,
                                                     causal_mask,
                                                     position_embeddings=position_embeddings,
                                                     position_ids=position_ids,
                                                     cache_position=cache_position,
                                                     use_cache=use_cache,
                                                     past_key_values=past_key_values,
                                                     past_G_values=self.past_G_values,
                                                     past_G_values_status=self.past_G_values_status,
                                                     **kwargs
                                                     )

            if output_pldr_attentions:
                dec_att_weights += (dec_att_w,)

            if output_attentions:
                # The attention weights are the last entry of the per-layer tuple.
                dec_attentions += (dec_att_w[-1],)

            if output_hidden_states:
                dec_outputs += (hidden_states,)

        return BasePLDRModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=dec_outputs,
            attentions=dec_attentions,
            pldr_attentions=dec_att_weights
        )

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding module with `value`."""
        self.embedding = value
|
|
|
|
|
@auto_docstring(custom_intro="""
    Large Language Model From Power Law Decoder Representations (PLDR-LLM) with LM Head as final layer.
    PLDR-LLM is a model architecture that utilizes Power Law Graph Attention (PLGA) in decoder layers.
    For details of model architecture, check out these papers:
    [Paper-1](https://huggingface.co/papers/2107.02039) [Paper-2](https://huggingface.co/papers/2410.16703) [Paper-3](https://huggingface.co/papers/2502.13502)
    """
)
class PldrllmForCausalLM(PldrllmPreTrainedModel, GenerationMixin):
    # `PldrllmModel` decoder followed by a linear projection onto the vocabulary.

    def __init__(self, config: PldrllmConfig) -> None:
        super().__init__(config)

        self.pldr_device = None
        self.wdtype = None
        self.d_model = config.hidden_size
        self.input_vocab_size = config.vocab_size
        self.final_bias = config.final_bias

        # Sub-modules are created decoder first, LM head second.
        self.decoder = PldrllmModel(config=config)
        self.final_layer = nn.Linear(
            self.d_model,
            self.input_vocab_size,
            bias=self.final_bias,
            device=self.pldr_device,
            dtype=self.wdtype,
        )

        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.decoder.embedding

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.decoder.embedding = value

    def get_output_embeddings(self):
        """Return the LM head (`final_layer`)."""
        return self.final_layer

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head (`final_layer`) with `new_embeddings`."""
        self.final_layer = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder model."""
        self.decoder = decoder

    def get_decoder(self):
        """Return the wrapped decoder model."""
        return self.decoder

    @can_return_tuple
    @auto_docstring(
        custom_args=MODEL_COMMON_CUSTOM_ARGS
    )
    def forward(self,
                input_ids: Optional[torch.LongTensor] = None,
                attention_mask: Optional[torch.Tensor] = None,
                position_ids: Optional[torch.LongTensor] = None,
                past_key_values: Optional[Cache] = None,
                use_cache: Optional[bool] = None,
                inputs_embeds: Optional[torch.FloatTensor] = None,
                labels: Optional[torch.LongTensor] = None,
                output_attentions: Optional[bool] = None,
                output_pldr_attentions: Optional[bool] = None,
                output_hidden_states: Optional[bool] = None,
                cache_position: Optional[torch.LongTensor] = None,
                cache_first_G: Optional[bool] = None,
                logits_to_keep: Union[int, torch.Tensor] = 0,
                **kwargs: Unpack[TransformersKwargs],
                ) -> CausalPLDRLLMOutputWithPast:
        # Runs the decoder, projects the kept positions onto the vocabulary and,
        # when `labels` is given, computes the loss via `self.loss_function`.
        decoder_outputs: BasePLDRModelOutputWithPast = self.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_pldr_attentions=output_pldr_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            cache_first_G=cache_first_G,
            **kwargs
        )

        # An int keeps the trailing `logits_to_keep` positions (0 keeps all,
        # since slice(-0, None) is the full range); a tensor is used as an index.
        if isinstance(logits_to_keep, int):
            kept_positions = slice(-logits_to_keep, None)
        else:
            kept_positions = logits_to_keep
        logits = self.final_layer(decoder_outputs.last_hidden_state[:, kept_positions, :])

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalPLDRLLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=decoder_outputs.past_key_values,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
            pldr_attentions=decoder_outputs.pldr_attentions
        )
|
|
|
|
|
@auto_docstring
class PldrllmForTokenClassification(PldrllmPreTrainedModel):
    # `PldrllmModel` decoder followed by dropout and a per-token linear classifier.

    def __init__(self, config: PldrllmConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels
        self.decoder = PldrllmModel(config)
        self.wdtype = None

        # Dropout rate: `classifier_dropout`, else `hidden_dropout`, else 0.1.
        classifier_dropout = getattr(config, "classifier_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = getattr(config, "hidden_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = 0.1
        self.dropout = nn.Dropout(classifier_dropout)
        self.score = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=self.wdtype)

        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.decoder.embedding

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.decoder.embedding = value

    @can_return_tuple
    @auto_docstring(
        custom_args=MODEL_COMMON_CUSTOM_ARGS
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_pldr_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_first_G: Optional[bool] = None,
    ) -> TokenClassifierPLDRLLMOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ...,
            config.num_labels - 1]`.
        """

        outputs: BasePLDRModelOutputWithPast = self.decoder(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_pldr_attentions=output_pldr_attentions,
            cache_first_G=cache_first_G
        )

        # One score vector per token: dropout, then the linear classifier.
        sequence_output = self.dropout(outputs.last_hidden_state)
        logits = self.score(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.config)

        return TokenClassifierPLDRLLMOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            pldr_attentions=outputs.pldr_attentions
        )
|
|
|
|
|
|
|
|
@auto_docstring
class PldrllmForQuestionAnswering(PldrllmPreTrainedModel):
    # `PldrllmModel` decoder followed by a linear layer that emits two scores
    # per token: span start and span end.

    def __init__(self, config: PldrllmConfig):
        super().__init__(config)
        self.wdtype = None
        self.decoder = PldrllmModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2, bias=True, dtype=self.wdtype)

        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.decoder.embedding

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.decoder.embedding = value

    @can_return_tuple
    @auto_docstring(
        custom_args=MODEL_COMMON_CUSTOM_ARGS
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_pldr_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_first_G: Optional[bool] = None,
        **kwargs,
    ) -> QuestionAnsweringPLDRModelOutput:
        decoder_outputs: BasePLDRModelOutputWithPast = self.decoder(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            output_pldr_attentions=output_pldr_attentions,
            cache_first_G=cache_first_G
        )

        # Split the two output channels into start/end scores and drop the
        # trailing singleton dimension of each.
        span_logits = self.qa_outputs(decoder_outputs.last_hidden_state)
        start_logits, end_logits = (
            channel.squeeze(-1).contiguous() for channel in span_logits.split(1, dim=-1)
        )

        # The loss needs both span boundaries.
        if start_positions is None or end_positions is None:
            loss = None
        else:
            loss = self.loss_function(start_logits, end_logits, start_positions, end_positions, **kwargs)

        return QuestionAnsweringPLDRModelOutput(
            loss=loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=decoder_outputs.hidden_states,
            attentions=decoder_outputs.attentions,
            pldr_attentions=decoder_outputs.pldr_attentions
        )
|
|
|
|
|
@auto_docstring(
    custom_intro="""
    The PLDR-LLM with a sequence classification head on top (linear layer).

    [`PldrllmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class PldrllmForSequenceClassification(PldrllmPreTrainedModel):
    # Composition: a `PldrllmModel` decoder followed by a bias-free linear
    # layer (`score`) projecting `config.hidden_size` features onto
    # `config.num_labels` outputs. `forward` scores every position and then
    # keeps a single position per batch row.

    def __init__(self, config: PldrllmConfig) -> None:
        super().__init__(config)
        self.num_labels = config.num_labels
        self.decoder = PldrllmModel(config)
        # `wdtype` is left as None, so the head is built with dtype=None.
        self.wdtype = None
        self.score = nn.Linear(
            in_features=config.hidden_size,
            out_features=self.num_labels,
            bias=False,
            dtype=self.wdtype,
        )

        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's `embedding` module."""
        return self.decoder.embedding

    def set_input_embeddings(self, value):
        """Replace the decoder's `embedding` module with `value`."""
        self.decoder.embedding = value

    @can_return_tuple
    @auto_docstring(custom_args=MODEL_COMMON_CUSTOM_ARGS)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_pldr_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_first_G: Optional[bool] = None,
    ) -> SequenceClassifierPLDRLLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        # `input_ids` is handed to the decoder positionally; everything else
        # is forwarded by keyword.
        decoder_out: BasePLDRModelOutputWithPast = self.decoder(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_pldr_attentions=output_pldr_attentions,
            output_hidden_states=output_hidden_states,
            cache_first_G=cache_first_G,
        )

        # Score every position; one position per row is selected below.
        token_scores = self.score(decoder_out.last_hidden_state)

        # Batch size comes from whichever input was supplied.
        num_rows = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]

        pad_id = self.config.pad_token_id
        if pad_id is None:
            # Without a pad token only a single-row batch is accepted, and
            # the final position is used.
            if num_rows != 1:
                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
            last_position = -1
        elif input_ids is None:
            # Padding cannot be located from embeddings alone: fall back to
            # the final position and warn once.
            last_position = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )
        else:
            # Multiplying each position index by a 0/1 "is not padding" mask
            # zeroes out padded positions, so argmax returns the right-most
            # non-padding index in every row.
            keep = (input_ids != pad_id).to(token_scores.device, torch.int32)
            positions = torch.arange(input_ids.shape[-1], device=token_scores.device, dtype=torch.int32)
            last_position = (positions * keep).argmax(-1)

        # Pick the selected position's scores for each batch row.
        row_ids = torch.arange(num_rows, device=token_scores.device)
        pooled_logits = token_scores[row_ids, last_position]

        if labels is None:
            loss = None
        else:
            loss = self.loss_function(
                logits=token_scores,
                labels=labels,
                pooled_logits=pooled_logits,
                config=self.config,
            )

        return SequenceClassifierPLDRLLMOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=decoder_out.past_key_values,
            hidden_states=decoder_out.hidden_states,
            attentions=decoder_out.attentions,
            pldr_attentions=decoder_out.pldr_attentions,
        )
|
|
|
|
|
|
|
|
# Names exported by `from <module> import *`; the order matches the original list.
__all__ = [
    "PldrllmForCausalLM",
    "PldrllmModel",
    "PldrllmPreTrainedModel",
    "PldrllmForTokenClassification",
    "PldrllmForQuestionAnswering",
    "PldrllmForSequenceClassification",
]
|
|
|