End of training

Browse files

Files changed (5) hide show

README.md +32 -32
benchmarks.shelve.bak +1 -0
benchmarks.shelve.dat +2 -2
benchmarks.shelve.dir +1 -0
tokenizer.json +2 -14

README.md CHANGED Viewed

@@ -41,38 +41,38 @@ More information needed
 # Benchmark Metrics Comparison
-| Metric | attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0 | teacher |
-| :--- | :--- | :--- |
-| ai2_arc (acc) | 0.305 | 0.354 |
-| ai2_arc (acc_norm) | 0.302 | 0.339 |
-| arc_challenge (acc) | 0.173 | 0.188 |
-| arc_challenge (acc_norm) | 0.223 | 0.222 |
-| arc_easy (acc) | 0.37 | 0.436 |
-| arc_easy (acc_norm) | 0.34 | 0.396 |
-| boolq (acc) | 0.387 | 0.51 |
-| cola (mcc) | 0.044 | 0.01 |
-| glue (acc) | 0.412 | 0.403 |
-| glue (f1) | 0.451 | 0.529 |
-| glue (mcc) | 0.044 | 0.01 |
-| hellaswag (acc) | 0.315 | 0.343 |
-| hellaswag (acc_norm) | 0.344 | 0.393 |
-| mnli (acc) | 0.338 | 0.338 |
-| mnli_mismatch (acc) | 0.351 | 0.346 |
-| mrpc (acc) | 0.353 | 0.515 |
-| mrpc (f1) | 0.143 | 0.631 |
-| qnli (acc) | 0.497 | 0.491 |
-| qqp (acc) | 0.406 | 0.367 |
-| qqp (f1) | 0.501 | 0.512 |
-| rte (acc) | 0.549 | 0.516 |
-| sst2 (acc) | 0.545 | 0.511 |
-| wikitext (bits_per_byte) | 1.127 | 0.98 |
-| wikitext (byte_perplexity) | 2.184 | 1.973 |
-| wikitext (word_perplexity) | 65.25 | 37.82 |
-| wnli (acc) | 0.451 | 0.451 |
 # Resource Usage Comparison
-- VRAM Use: 7.7830 GB
 # Distillation (Teacher -> Student) Architecture Difference:
@@ -102,7 +102,7 @@ Trained on 145,724,804 tokens from the [wikimedia/wikipedia](https://huggingface
 # Training Objective
 ```
-DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=layer-2))
 ```
 # Hyperparameters
@@ -119,9 +119,9 @@ The following hyperparameters were used during training:
 - lr_scheduler_type: `cosine_with_min_lr`
 - lr_scheduler_warmup_ratio: `0.5`
 - num_epochs: `1.0`
-- distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=25.0, loss_fn=raw_mse, layer_mapper=layer-2))`
 - train_embeddings: `True`
-- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f0d1223cb50>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `None`

 # Benchmark Metrics Comparison
+| Metric | attn_layer_mapper=all, attn_loss_fn=cos, attn_projector=orthogonal, attn_weight=5 | attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0 | teacher |
+| :--- | :--- | :--- | :--- |
+| ai2_arc (acc) | 0.313 | 0.305 | 0.354 |
+| ai2_arc (acc_norm) | 0.31 | 0.302 | 0.339 |
+| arc_challenge (acc) | 0.181 | 0.173 | 0.188 |
+| arc_challenge (acc_norm) | 0.224 | 0.223 | 0.222 |
+| arc_easy (acc) | 0.378 | 0.37 | 0.436 |
+| arc_easy (acc_norm) | 0.353 | 0.34 | 0.396 |
+| boolq (acc) | 0.49 | 0.387 | 0.51 |
+| cola (mcc) | -0.041 | 0.044 | 0.01 |
+| glue (acc) | 0.396 | 0.412 | 0.403 |
+| glue (f1) | 0.516 | 0.451 | 0.529 |
+| glue (mcc) | -0.041 | 0.044 | 0.01 |
+| hellaswag (acc) | 0.32 | 0.315 | 0.343 |
+| hellaswag (acc_norm) | 0.348 | 0.344 | 0.393 |
+| mnli (acc) | 0.336 | 0.338 | 0.338 |
+| mnli_mismatch (acc) | 0.343 | 0.351 | 0.346 |
+| mrpc (acc) | 0.444 | 0.353 | 0.515 |
+| mrpc (f1) | 0.478 | 0.143 | 0.631 |
+| qnli (acc) | 0.488 | 0.497 | 0.491 |
+| qqp (acc) | 0.356 | 0.406 | 0.367 |
+| qqp (f1) | 0.522 | 0.501 | 0.512 |
+| rte (acc) | 0.56 | 0.549 | 0.516 |
+| sst2 (acc) | 0.498 | 0.545 | 0.511 |
+| wikitext (bits_per_byte) | 1.118 | 1.127 | 0.98 |
+| wikitext (byte_perplexity) | 2.17 | 2.184 | 1.973 |
+| wikitext (word_perplexity) | 63.05 | 65.25 | 37.82 |
+| wnli (acc) | 0.408 | 0.451 | 0.451 |
 # Resource Usage Comparison
+- VRAM Use: 8.2855 GB
 # Distillation (Teacher -> Student) Architecture Difference:
 # Training Objective
 ```
+DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=5, loss_fn=cos, layer_mapper=all))
 ```
 # Hyperparameters
 - lr_scheduler_type: `cosine_with_min_lr`
 - lr_scheduler_warmup_ratio: `0.5`
 - num_epochs: `1.0`
+- distillation_objective: `DistillationObjective(logits_loss_component=LossComponent(label=logits, weight=1, loss_fn=kl), attn_loss_component=LossComponent(label=attn, weight=5, loss_fn=cos, layer_mapper=all))`
 - train_embeddings: `True`
+- lr_scheduler: `<torch.optim.lr_scheduler.LambdaLR object at 0x7f05c40e2050>`
 - student_model_name_or_path: `None`
 - student_config_name_or_path: `None`
 - student_model_config: `None`

benchmarks.shelve.bak CHANGED Viewed

@@ -1,2 +1,3 @@
 'teacher', (0, 26029753)
 'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)

 'teacher', (0, 26029753)
 'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)
+'attn_layer_mapper=all, attn_loss_fn=cos, attn_projector=orthogonal, attn_weight=5', (52060160, 26029753)

benchmarks.shelve.dat CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b08da2c1102a7b8635c1aac31997fbdc32e594beca1614e4a38096dec1f9bf07
-size 52059833

 version https://git-lfs.github.com/spec/v1
+oid sha256:777b1d28fe282a5405865474430509b697a416a88b6dec206322e7edbf2f1e2d
+size 78089913

benchmarks.shelve.dir CHANGED Viewed

@@ -1,2 +1,3 @@
 'teacher', (0, 26029753)
 'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)

 'teacher', (0, 26029753)
 'attn_layer_mapper=layer-2, attn_loss_fn=raw_mse, attn_projector=orthogonal, attn_weight=25.0', (26030080, 26029753)
+'attn_layer_mapper=all, attn_loss_fn=cos, attn_projector=orthogonal, attn_weight=5', (52060160, 26029753)

tokenizer.json CHANGED Viewed

@@ -1,19 +1,7 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 1023,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
-  "padding": {
-    "strategy": "BatchLongest",
-    "direction": "Right",
-    "pad_to_multiple_of": null,
-    "pad_id": 50256,
-    "pad_type_id": 0,
-    "pad_token": "<|endoftext|>"
-  },
   "added_tokens": [
     {
       "id": 50256,

 {
   "version": "1.0",
+  "truncation": null,
+  "padding": null,
   "added_tokens": [
     {
       "id": 50256,