Upload folder using huggingface_hub
- configuration_vaetki.py +3 -5
- model.safetensors.index.json +0 -0
- modeling_vaetki.py +20 -20
configuration_vaetki.py
CHANGED

@@ -3,7 +3,6 @@ from transformers.modeling_rope_utils import rope_config_validation
 
 
 class VaetkiConfig(PretrainedConfig):
-
     model_type = "vaetki"
     keys_to_ignore_at_inference = ["past_key_values"]
     base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace
@@ -98,12 +97,10 @@ class VaetkiConfig(PretrainedConfig):
         self.rope_scaling = rope_scaling
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
-        # Validate the correctness of rotary position embeddings parameters
-        # BC: if there is a 'type' field, copy it it to 'rope_type'.
-        if self.rope_scaling is not None and "type" in self.rope_scaling:
-            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
 
         if self.rope_scaling is not None:
+            if self.rope_scaling["rope_type"] == "rope":
+                self.rope_scaling["rope_type"] = "default"
             for key in ["beta_fast", "beta_slow", "factor"]:
                 if key in self.rope_scaling:
                     self.rope_scaling[key] = float(self.rope_scaling[key])
@@ -112,6 +109,7 @@ class VaetkiConfig(PretrainedConfig):
 
         if self.layer_types is None:
             self.layer_types = [
+                # FIXME: change the pattern to match the Megatron transformer_config
                 "sliding_attention" if bool((i + 1) % 6) else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
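A minimal sketch of the two behavioural edits above, using plain dicts and a hypothetical 12-layer count in place of the real VaetkiConfig fields: the "rope" to "default" remapping with float coercion, and the default layer_types pattern in which every sixth layer uses full attention.

```python
# Minimal sketch, not the real VaetkiConfig: shows what the new rope_scaling
# normalization and the default layer_types pattern produce.
rope_scaling = {"rope_type": "rope", "factor": "8", "beta_fast": 32, "beta_slow": 1}

# BC shim added in this commit: a checkpoint "rope" type is treated as the
# stock "default" RoPE, and the numeric fields are coerced to float.
if rope_scaling is not None:
    if rope_scaling["rope_type"] == "rope":
        rope_scaling["rope_type"] = "default"
    for key in ["beta_fast", "beta_slow", "factor"]:
        if key in rope_scaling:
            rope_scaling[key] = float(rope_scaling[key])
print(rope_scaling)
# {'rope_type': 'default', 'factor': 8.0, 'beta_fast': 32.0, 'beta_slow': 1.0}

num_hidden_layers = 12  # hypothetical layer count
layer_types = [
    "sliding_attention" if bool((i + 1) % 6) else "full_attention"
    for i in range(num_hidden_layers)
]
print(layer_types.count("full_attention"))  # 2: full attention at indices 5 and 11
```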
model.safetensors.index.json
CHANGED

The diff for this file is too large to render. See raw diff.
modeling_vaetki.py
CHANGED

@@ -13,7 +13,7 @@ from transformers.masking_utils import create_causal_mask, create_sliding_window
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
-from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, can_return_tuple
@@ -38,25 +38,19 @@ class VaetkiRMSNorm(nn.Module):
 
 
 class VaetkiRotaryEmbedding(nn.Module):
-    def __init__(self, config: VaetkiConfig, device=None):
+    def __init__(self, config: VaetkiConfig, rope_type="default", original_max_position_embeddings=None, device=None):
         super().__init__()
-
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None and isinstance(config.rope_scaling, dict):
-            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        else:
-            self.rope_type = "default"
+        self.rope_type = rope_type
         self.max_seq_len_cached = config.max_position_embeddings
-        self.original_max_seq_len =
+        self.original_max_seq_len = original_max_position_embeddings
 
         self.config = config
         self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
 
     @torch.no_grad()
-    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
     def forward(self, x, position_ids):
         inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
         position_ids_expanded = position_ids[:, None, :].float()
@@ -258,10 +252,9 @@ class VaetkiAttention(nn.Module):
 
         self.scaling = self.qk_head_dim ** (-0.5)
         if self.config.rope_scaling is not None and not self.is_sliding:
-            # TODO: check yarn related logic
             mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
-            scaling_factor = self.config.rope_scaling["factor"]
             if mscale_all_dim:
+                scaling_factor = self.config.rope_scaling["factor"]
                 mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                 self.scaling = self.scaling * mscale * mscale
 
@@ -408,8 +401,7 @@ class VaetkiPreTrainedModel(PreTrainedModel):
    supports_gradient_checkpointing = True
    _no_split_modules = ["VaetkiDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
-
-    _supports_flash_attn_2 = True
+    _supports_flash_attn = True
    _supports_sdpa = False
    _supports_flex_attn = False
    _supports_cache_class = True
@@ -445,13 +437,21 @@ class VaetkiModel(VaetkiPreTrainedModel):
             [VaetkiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         self.norm = VaetkiRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.rotary_emb_local = VaetkiRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
+
+        self.rotary_emb_local = VaetkiRotaryEmbedding(config=config)
 
         config = copy.deepcopy(config)
         config.rope_theta = config.rope_theta_global
-        self.
-
+        if self.config.rope_scaling is None:
+            rope_type = "default"
+            original_max_position_embeddings = config.max_position_embeddings
+        else:
+            rope_type = config.rope_scaling["rope_type"]
+            original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"]
+        self.rotary_emb_global = VaetkiRotaryEmbedding(config=config, rope_type=rope_type, original_max_position_embeddings=original_max_position_embeddings)
+        if rope_type == "default":
+            self.rotary_emb_global.inv_freq /= 8.0
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -571,7 +571,7 @@ class VaetkiForCausalLM(VaetkiPreTrainedModel, GenerationMixin):
         super().__init__(config)
         self.model = VaetkiModel(config)
         self.vocab_size = config.vocab_size
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False, dtype=torch.float32)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -586,7 +586,7 @@ class VaetkiForCausalLM(VaetkiPreTrainedModel, GenerationMixin):
         return self.lm_head
 
     def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
+        self.lm_head = new_embeddings.to(torch.float32)
 
     def set_decoder(self, decoder):
         self.model = decoder
@@ -633,7 +633,7 @@ class VaetkiForCausalLM(VaetkiPreTrainedModel, GenerationMixin):
         hidden_states = outputs.last_hidden_state
         # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        logits = self.lm_head(hidden_states[:, slice_indices, :].to(self.lm_head.weight.dtype))
 
         loss = None
         if labels is not None:
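The most substantial change in modeling_vaetki.py is the split into a local and a global rotary embedding in VaetkiModel.__init__. The runnable toy sketch below shows that wiring. ToyConfig and ToyRotaryEmbedding are invented stand-ins, not the classes from this repository; only the deep-copied config, the explicit rope_type / original_max_position_embeddings arguments, and the inv_freq /= 8.0 step mirror the committed code.

```python
# Toy illustration of the local/global rotary split introduced above.
import copy

import torch
import torch.nn as nn


class ToyConfig:
    max_position_embeddings = 4096
    head_dim = 64
    rope_theta = 10_000.0            # base theta, used by the local (sliding-window) RoPE
    rope_theta_global = 1_000_000.0  # larger theta, used by the global (full-attention) RoPE
    rope_scaling = None              # or a dict with "rope_type" / "original_max_position_embeddings"


class ToyRotaryEmbedding(nn.Module):
    def __init__(self, config, rope_type="default", original_max_position_embeddings=None):
        super().__init__()
        self.rope_type = rope_type
        self.original_max_seq_len = original_max_position_embeddings
        # Plain "default" RoPE inverse frequencies for one head dimension.
        dim = config.head_dim
        inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)


config = ToyConfig()

# Local RoPE: built straight from the base config.
rotary_emb_local = ToyRotaryEmbedding(config)

# Global RoPE: deep-copied config with rope_theta swapped for rope_theta_global,
# plus an explicit rope_type and original context length, as in the commit.
global_config = copy.deepcopy(config)
global_config.rope_theta = global_config.rope_theta_global
if config.rope_scaling is None:
    rope_type = "default"
    original_max_position_embeddings = config.max_position_embeddings
else:
    rope_type = config.rope_scaling["rope_type"]
    original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"]
rotary_emb_global = ToyRotaryEmbedding(global_config, rope_type, original_max_position_embeddings)
if rope_type == "default":
    # Same step as the commit: shrink the default inverse frequencies 8x.
    rotary_emb_global.inv_freq /= 8.0

print(rotary_emb_local.inv_freq[0].item(), rotary_emb_global.inv_freq[0].item())  # 1.0 vs 0.125
```

Dividing inv_freq by 8 lengthens every rotary wavelength on the global-attention path by the same factor, so for the "default" rope type it behaves like linear position-interpolation scaling with a factor of 8.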