omarkamali committed
Commit b086235 · verified · 1 Parent(s): 2bfe73f

Upload all models and assets for eml (latest)

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +329 -135
  3. models/embeddings/aligned/eml_128d.bin +3 -0
  4. models/embeddings/aligned/eml_128d.meta.json +1 -0
  5. models/embeddings/aligned/eml_128d.projection.npy +3 -0
  6. models/embeddings/aligned/eml_128d_metadata.json +8 -0
  7. models/embeddings/aligned/eml_32d.bin +3 -0
  8. models/embeddings/aligned/eml_32d.meta.json +1 -0
  9. models/embeddings/aligned/eml_32d.projection.npy +3 -0
  10. models/embeddings/aligned/eml_32d_metadata.json +8 -0
  11. models/embeddings/aligned/eml_64d.bin +3 -0
  12. models/embeddings/aligned/eml_64d.meta.json +1 -0
  13. models/embeddings/aligned/eml_64d.projection.npy +3 -0
  14. models/embeddings/aligned/eml_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/eml_128d.bin +2 -2
  16. models/embeddings/monolingual/eml_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/eml_32d.bin +2 -2
  18. models/embeddings/monolingual/eml_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/eml_64d.bin +2 -2
  20. models/embeddings/monolingual/eml_64d_metadata.json +5 -3
  21. models/subword_markov/eml_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/eml_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/eml_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/eml_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/eml_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/eml_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/eml_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/eml_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/eml_2gram_subword.parquet +2 -2
  30. models/subword_ngram/eml_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/eml_3gram_subword.parquet +2 -2
  32. models/subword_ngram/eml_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/eml_4gram_subword.parquet +2 -2
  34. models/subword_ngram/eml_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/eml_5gram_subword.parquet +3 -0
  36. models/subword_ngram/eml_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/eml_tokenizer_16k.model +2 -2
  38. models/tokenizer/eml_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/eml_tokenizer_32k.model +2 -2
  40. models/tokenizer/eml_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/eml_tokenizer_8k.model +2 -2
  42. models/tokenizer/eml_tokenizer_8k.vocab +0 -0
  43. models/vocabulary/eml_vocabulary.parquet +2 -2
  44. models/vocabulary/eml_vocabulary_metadata.json +10 -9
  45. models/word_markov/eml_markov_ctx1_word.parquet +2 -2
  46. models/word_markov/eml_markov_ctx1_word_metadata.json +2 -2
  47. models/word_markov/eml_markov_ctx2_word.parquet +2 -2
  48. models/word_markov/eml_markov_ctx2_word_metadata.json +2 -2
  49. models/word_markov/eml_markov_ctx3_word.parquet +2 -2
  50. models/word_markov/eml_markov_ctx3_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 language: eml
-language_name: EML
+language_name: Unknown language [eml]
 language_family: romance_galloitalic
 tags:
 - wikilangs
@@ -10,11 +10,21 @@ tags:
 - n-gram
 - markov
 - wikipedia
+- feature-extraction
+- sentence-similarity
+- tokenization
+- n-grams
+- markov-chain
+- text-mining
+- fasttext
+- babelvec
+- vocabulous
+- vocabulary
 - monolingual
 - family-romance_galloitalic
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
+pipeline_tag: text-generation
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
 - name: best_compression_ratio
   type: compression
-  value: 3.867
+  value: 3.369
 - name: best_isotropy
   type: isotropy
-  value: 0.4456
+  value: 0.3584
 - name: vocabulary_size
   type: vocab
-  value: 18015
-generated: 2025-12-30
+  value: 0
+generated: 2026-01-04
 ---
 
-# EML - Wikilangs Models
+# Unknown language [eml] - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
 
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **EML** Wikipedia data.
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Unknown language [eml]** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 
 ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
+
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 
 ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 
@@ -68,54 +80,53 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
+
 ### Results
 
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.069x | 3.04 | 0.1566% | 208,856 |
-| **16k** | 3.337x | 3.30 | 0.1703% | 192,055 |
-| **32k** | 3.580x | 3.54 | 0.1827% | 179,030 |
-| **64k** | 3.867x 🏆 | 3.83 | 0.1973% | 165,756 |
+| **8k** | 2.942x | 2.95 | 0.4433% | 289,426 |
+| **16k** | 3.144x | 3.15 | 0.4738% | 270,763 |
+| **32k** | 3.369x 🏆 | 3.37 | 0.5076% | 252,742 |
 
 ### Tokenization Examples
 
 Below are sample sentences tokenized with each vocabulary size:
 
-**Sample 1:** `Categoria:TELEVISIONE
-Categoria:CINEMA
-Categoria:Atōr tedésc`
+**Sample 1:** `'l è 'l nòm 'd un domìni genèric. Al funsiòuna da 'l zógn dal ed domìni tachê a ...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁categoria : televisionecategoria : cinemacategoria : atōrtedésc` | 10 |
-| 16k | `▁categoria : televisionecategoria : cinemacategoria : atōrtedésc` | 10 |
-| 32k | `▁categoria : televisionecategoria : cinemacategoria : atōrtedésc` | 10 |
-| 64k | `▁categoria : televisione ▁categoria : cinema ▁categoria : atōr ▁tedésc` | 10 |
+| 8k | `▁' l ▁è' l ▁nòm' d ▁undomìni ... (+17 more)` | 27 |
+| 16k | `▁' l ▁è' l ▁nòm' d ▁undomìni ... (+17 more)` | 27 |
+| 32k | `▁' l ▁è' l ▁nòm' d ▁undomìni ... (+17 more)` | 27 |
 
-**Sample 2:** `La Ròca o anch Ròca San Casiàn (Rocca San Casciano in italièn) l un cumòn ad 1...`
+**Sample 2:** `'l è 'l nòm 'd un domìni genèric. Al funsiòuna da 'l setèmber dal ed domìni tach...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁la ▁rò caoanch casanca si ... (+37 more)` | 47 |
-| 16k | `▁la ▁ròca ▁oanch ▁ròcasanca si àn( ... (+33 more)` | 43 |
-| 32k | `▁la ▁ròca ▁oanch ▁ròcasanca si àn( ... (+32 more)` | 42 |
-| 64k | `▁la ▁ròca ▁o ▁anch ▁ròca ▁san ▁casiàn ▁( rocca ▁san ... (+29 more)` | 39 |
+| 8k | `▁' l ▁è' l nòm' dundomìni ... (+17 more)` | 27 |
+| 16k | `▁' l ▁è' lnòm' d ▁undomìni ... (+17 more)` | 27 |
+| 32k | `▁' l ▁è' lnòm' d ▁undomìni ... (+17 more)` | 27 |
 
-**Sample 3:** `Categoria:GEOGRAFIA
-Categoria:CITTADITALIA`
+**Sample 3:** `Al 294 'l è 'n an edl III sécol dal Calendàri gregoriàn. Avenimèint Nê Mort III`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁categoria : geografiacategoria : cittaditalia` | 6 |
-| 16k | `▁categoria : geografiacategoria : cittaditalia` | 6 |
-| 32k | `▁categoria : geografiacategoria : cittaditalia` | 6 |
-| 64k | `▁categoria : geografia ▁categoria : cittaditalia` | 6 |
+| 8k | `▁al 2 9 4 ' l ▁è ▁' n ... (+12 more)` | 22 |
+| 16k | `▁al 2 9 4 ' l ▁è ▁' n ... (+12 more)` | 22 |
+| 32k | `▁al 2 9 4 ' l ▁è ▁' n ... (+12 more)` | 22 |
 
 
 ### Key Findings
 
-- **Best Compression:** 64k achieves 3.867x compression
-- **Lowest UNK Rate:** 8k with 0.1566% unknown tokens
+- **Best Compression:** 32k achieves 3.369x compression
+- **Lowest UNK Rate:** 8k with 0.4433% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -124,57 +135,111 @@ Categoria:CITTADITALIA`
 
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 
+![N-gram Unique](visualizations/ngram_unique.png)
+
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 
 ### Results
 
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 689 🏆 | 9.43 | 7,061 | 54.1% | 81.3% |
-| **2-gram** | 367 🏆 | 8.52 | 2,917 | 61.9% | 97.6% |
-| **3-gram** | 1,170 | 10.19 | 11,174 | 48.6% | 76.8% |
-| **3-gram** | 2,199 | 11.10 | 21,058 | 33.1% | 68.9% |
-| **4-gram** | 1,936 | 10.92 | 20,258 | 43.6% | 71.1% |
-| **4-gram** | 7,361 | 12.85 | 88,240 | 25.5% | 51.4% |
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 855 | 9.74 | 4,527 | 49.2% | 80.8% |
+| **2-gram** | Subword | 342 🏆 | 8.42 | 2,464 | 62.9% | 97.8% |
+| **3-gram** | Word | 936 | 9.87 | 6,071 | 49.5% | 79.8% |
+| **3-gram** | Subword | 2,480 | 11.28 | 17,300 | 27.4% | 69.1% |
+| **4-gram** | Word | 1,262 | 10.30 | 9,814 | 45.9% | 76.0% |
+| **4-gram** | Subword | 9,840 | 13.26 | 65,901 | 17.3% | 46.3% |
+| **5-gram** | Word | 1,050 | 10.04 | 7,194 | 45.5% | 79.7% |
+| **5-gram** | Subword | 19,916 | 14.28 | 117,450 | 14.0% | 39.1% |
 
 ### Top 5 N-grams by Size
 
-**2-grams:**
+**2-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `l è` | 4,349 |
+| 2 | `da l` | 2,854 |
+| 3 | `d un` | 2,584 |
+| 4 | `dal calendàri` | 1,948 |
+| 5 | `è n` | 1,667 |
+
+**3-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `l è n` | 1,665 |
+| 2 | `dal calendàri gregoriàn` | 1,584 |
+| 3 | `sécol dal calendàri` | 1,575 |
+| 4 | `è n an` | 1,575 |
+| 5 | `avenimèint nê mort` | 1,412 |
+
+**4-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `l è n an` | 1,575 |
+| 2 | `ed domìni tachê a` | 1,255 |
+| 3 | `a funsionèr da l` | 1,255 |
+| 4 | `domìni tachê a funsionèr` | 1,255 |
+| 5 | `tachê a funsionèr da` | 1,255 |
+
+**5-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `domìni tachê a funsionèr da` | 1,255 |
+| 2 | `ed domìni tachê a funsionèr` | 1,255 |
+| 3 | `tachê a funsionèr da l` | 1,255 |
+| 4 | `l è l nòm d` | 1,247 |
+| 5 | `l nòm d un domìni` | 1,247 |
+
+**2-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `a _` | 44,681 |
+| 2 | `l _` | 36,354 |
+| 3 | `_ d` | 31,152 |
+| 4 | `_ a` | 28,707 |
+| 5 | `n _` | 26,332 |
+
+**3-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `categoria :` | 28,651 |
-| 2 | `: nùmer` | 11,903 |
-| 3 | `' l` | 8,131 |
-| 4 | `l '` | 4,130 |
-| 5 | `' d` | 3,768 |
+| 1 | `a l _` | 19,233 |
+| 2 | `_ d a` | 13,700 |
+| 3 | `_ i n` | 10,014 |
+| 4 | `l a _` | 9,054 |
+| 5 | `d a l` | 8,840 |
 
-**3-grams:**
+**4-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `categoria : nùmer` | 11,902 |
-| 2 | `' l è` | 3,029 |
-| 3 | `l è '` | 2,876 |
-| 4 | `categoria : cinema` | 2,834 |
-| 5 | `da ' l` | 2,787 |
+| 1 | `_ d a l` | 8,766 |
+| 2 | `d a l _` | 8,710 |
+| 3 | `_ a l _` | 7,884 |
+| 4 | `_ e d _` | 6,634 |
+| 5 | `_ l a _` | 5,983 |
 
-**4-grams:**
+**5-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `' l è '` | 2,875 |
-| 2 | `: cinema categoria :` | 2,777 |
-| 3 | `categoria : cinema categoria` | 2,777 |
-| 4 | `: matematica categoria :` | 2,068 |
-| 5 | `categoria : matematica categoria` | 2,068 |
+| 1 | `_ d a l _` | 8,679 |
+| 2 | `_ d a _ '` | 2,988 |
+| 3 | `' l _ è _` | 2,975 |
+| 4 | `l _ è _ '` | 2,854 |
+| 5 | `d a _ ' l` | 2,762 |
 
 
 ### Key Findings
 
-- **Best Perplexity:** 2-gram with 367
+- **Best Perplexity:** 2-gram (subword) with 342
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~51% of corpus
+- **Coverage:** Top-1000 patterns cover ~39% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 
 ---
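Note that the Perplexity and Entropy columns are consistent with perplexity = 2^entropy: for the subword 2-gram, 2^8.42 ≈ 342. A sketch of recomputing both from the shipped counts, assuming each n-gram parquet carries one row per n-gram with a `count` column (an assumed schema; inspect `df.columns` first):

```python
# Recompute distribution entropy (bits) and perplexity from n-gram counts.
import numpy as np
import pandas as pd

df = pd.read_parquet("models/subword_ngram/eml_2gram_subword.parquet")
p = df["count"].to_numpy(dtype=float)
p /= p.sum()

entropy = -(p * np.log2(p)).sum()   # Shannon entropy over the n-gram distribution
perplexity = 2.0 ** entropy
print(f"{entropy:.2f} bits -> perplexity {perplexity:,.0f}")
```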
@@ -182,55 +247,86 @@ Categoria:CITTADITALIA`
 
 ![Markov Entropy](visualizations/markov_entropy.png)
 
+![Markov Contexts](visualizations/markov_contexts.png)
+
 ![Markov Branching](visualizations/markov_branching.png)
 
 ### Results
 
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.4790 | 1.394 | 2.99 | 54,943 | 52.1% |
-| **1** | 1.0426 | 2.060 | 10.88 | 518 | 0.0% |
-| **2** | 0.1826 | 1.135 | 1.42 | 163,004 | 81.7% |
-| **2** | 1.1789 | 2.264 | 6.89 | 5,633 | 0.0% |
-| **3** | 0.0769 | 1.055 | 1.15 | 229,724 | 92.3% |
-| **3** | 0.9021 | 1.869 | 3.96 | 38,777 | 9.8% |
-| **4** | 0.0371 🏆 | 1.026 | 1.07 | 262,110 | 96.3% |
-| **4** | 0.6406 🏆 | 1.559 | 2.42 | 153,229 | 35.9% |
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.6144 | 1.531 | 3.27 | 38,079 | 38.6% |
+| **1** | Subword | 1.2142 | 2.320 | 11.72 | 398 | 0.0% |
+| **2** | Word | 0.1859 | 1.138 | 1.39 | 123,729 | 81.4% |
+| **2** | Subword | 1.1401 | 2.204 | 6.72 | 4,661 | 0.0% |
+| **3** | Word | 0.0688 | 1.049 | 1.12 | 170,769 | 93.1% |
+| **3** | Subword | 0.8376 | 1.787 | 3.69 | 31,279 | 16.2% |
+| **4** | Word | 0.0286 🏆 | 1.020 | 1.05 | 189,112 | 97.1% |
+| **4** | Subword | 0.5759 | 1.491 | 2.30 | 115,443 | 42.4% |
 
-### Generated Text Samples
+### Generated Text Samples (Word-based)
+
+Below are text samples generated from each word-based Markov chain model:
+
+**Context Size 1:**
+
+1. `l é al progrâma pc 12 518 519 520 gonèl 22 ed domìni genèric al urèl`
+2. `al funsiòuna da per 4 quèśi prim sfènic difetìv 322 in sensu laudator temporis acti prudentes`
+3. `dal crìst 4 d oro una cumêdia d antonino inferito da l è l è l`
+
+**Context Size 2:**
+
+1. `l è n an dal vii sécol dal calendàri gregoriàn avenimèint nê guélf vi mort xii`
+2. `d un nùmer triangolèr moltìplica per 5 d un nùmer quèder moltìplica per 3 d un domìni`
+3. `dal calendàri gregoriàn avenimèint nê mort x`
+
+**Context Size 3:**
+
+1. `l è n an edl iii sécol dal calendàri gregoriàn avenimèint nê mort i`
+2. `dal calendàri gregoriàn avenimèint nê mort viii`
+3. `è n an edl viii sécol dal calendàri gregoriàn avenimèint nê mort xvi`
+
+**Context Size 4:**
+
+1. `l è n an edl ix sécol dal calendàri gregoriàn avenimèint nê mort v`
+2. `domìni tachê a funsionèr da l`
+3. `ed domìni tachê a funsionèr da l`
 
-Below are text samples generated from each Markov chain model:
+### Generated Text Samples (Subword-based)
+
+Below are text samples generated from each subword-based Markov chain model:
 
 **Context Size 1:**
 
-1. `: cinema categoria : matematica categoria : film , ch ' la fifa , sànt pîr`
-2. `categoria : matematica categoria : nùmer intēr categoria : nùmer prìm categoria : nùmer ed leyland`
-3. `' 700 méla cristiàn dla smana ch ' l avû ? se stès . sînch ân`
+1. `_pe_gotili_l'n_i`
+2. `andogrin_menèiṣa`
+3. `i_incōridl_stêst`
 
 **Context Size 2:**
 
-1. `categoria : nùmer naturêl categoria : film eròtic categoria : nùmer intēr categoria : nùmer difetìv ...`
-2. `: nùmer naturêl categoria : nùmer 19 - gonèl categoria : cantànt itagliàn categoria : nùmer naturêl`
-3. `' l ' intervîn gaudio – an vrîven mìa fêr i spetâcôl pr ’ un zôgh americân`
+1. `a_cuns_e_tòra_fiō`
+2. `l_séco,_ed_unèli_`
+3. `_drê_avōl_è_'l_59`
 
 **Context Size 3:**
 
-1. `categoria : nùmer moltìplica per 2 ' d un nùmer quèder categoria : nùmer 3 - quèśi prim`
-2. `' l è ' l nòm ' d un domìni genèric la registrasiòun dal in dal sît edl`
-3. `l è ' n elemèint ed ) 1894 ∃ quantificadōr eśistensiêl ( " a gh ' è la`
+1. `al_sît_la_cà_paolo`
+2. `_da_63_in_difestìl`
+3. `_in-dóvv_a_un_di_c`
 
 **Context Size 4:**
 
-1. `' l è ' l nòm ' d un domìni genèric . al funsiòuna da ' l setèmber dal`
-2. `: cinema categoria : atōr americàṅ categoria : atōr canadéś categoria : atōr americàṅ categoria : ar...`
-3. `categoria : cinema categoria : film categoria : film ad scarésa`
+1. `_dal_calendàri_greg`
+2. `dal_viii_sèc._préma`
+3. `_al_funsiòuna_da_'l`
 
 
 ### Key Findings
 
-- **Best Predictability:** Context-4 with 96.3% predictability
+- **Best Predictability:** Context-4 (word) with 97.1% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (153,229 contexts)
+- **Memory Trade-off:** Larger contexts require more storage (115,443 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 
 ---
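Samples like the above can be drawn with a weighted choice over transition counts. A sketch, assuming the Markov parquet files expose `context`, `next` and `count` columns and space-joined word contexts (all assumptions; the schema is not documented in this commit):

```python
# Sample a continuation from the context-2 word Markov chain.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/eml_markov_ctx2_word.parquet")

def sample_next(context: str) -> str:
    rows = df[df["context"] == context]
    # Transition counts act as unnormalized probabilities.
    # (No fallback for unseen contexts in this sketch.)
    return random.choices(rows["next"].tolist(), weights=rows["count"].tolist(), k=1)[0]

tokens = ["dal", "calendàri"]  # seed taken from a frequent 2-gram above
for _ in range(10):
    tokens.append(sample_next(" ".join(tokens[-2:])))
print(" ".join(tokens))
```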
@@ -246,36 +342,36 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 18,015 |
-| Total Tokens | 364,595 |
-| Mean Frequency | 20.24 |
+| Vocabulary Size | 14,744 |
+| Total Tokens | 272,012 |
+| Mean Frequency | 18.45 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 319.82 |
+| Frequency Std Dev | 223.57 |
 
 ### Most Common Words
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | categoria | 28,655 |
-| 2 | nùmer | 14,021 |
-| 3 | l | 13,281 |
-| 4 | al | 10,419 |
-| 5 | dal | 9,093 |
-| 6 | a | 7,637 |
-| 7 | ed | 7,001 |
-| 8 | la | 6,876 |
-| 9 | d | 5,723 |
-| 10 | in | 5,357 |
+| 1 | l | 12,992 |
+| 2 | al | 10,267 |
+| 3 | dal | 8,736 |
+| 4 | a | 7,317 |
+| 5 | ed | 6,740 |
+| 6 | la | 6,622 |
+| 7 | d | 5,491 |
+| 8 | in | 5,032 |
+| 9 | è | 4,792 |
+| 10 | da | 4,480 |
 
 ### Least Common Words (from vocabulary)
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | antiquariàt | 2 |
-| 2 | espositìv | 2 |
-| 3 | ecosistèma | 2 |
-| 4 | trasformasiòun | 2 |
-| 5 | galleria | 2 |
+| 1 | espositìv | 2 |
+| 2 | ecosistèma | 2 |
+| 3 | trasformasiòun | 2 |
+| 4 | galleria | 2 |
+| 5 | space | 2 |
 | 6 | velò | 2 |
 | 7 | arriv | 2 |
 | 8 | sèda | 2 |
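The frequency column behind these tables also determines the coverage analysis in the next hunk; a sketch, assuming the vocabulary parquet names that column `frequency` (an assumed name):

```python
# Top-N coverage: share of all token occurrences from the N most frequent words.
import pandas as pd

freqs = (
    pd.read_parquet("models/vocabulary/eml_vocabulary.parquet")["frequency"]
    .sort_values(ascending=False)
)
total = freqs.sum()
for n in (100, 1_000, 5_000, 10_000):
    print(f"Top {n:>6}: {freqs.head(n).sum() / total:.1%}")
```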
@@ -286,24 +382,24 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0040 |
-| R² (Goodness of Fit) | 0.993487 |
+| Zipf Coefficient | 1.0159 |
+| R² (Goodness of Fit) | 0.990784 |
 | Adherence Quality | **excellent** |
 
 ### Coverage Analysis
 
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 59.9% |
-| Top 1,000 | 77.9% |
-| Top 5,000 | 89.8% |
-| Top 10,000 | 95.2% |
+| Top 100 | 57.7% |
+| Top 1,000 | 77.8% |
+| Top 5,000 | 90.7% |
+| Top 10,000 | 96.5% |
 
 ### Key Findings
 
-- **Zipf Compliance:** R²=0.9935 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 59.9% of corpus
-- **Long Tail:** 8,015 words needed for remaining 4.8% coverage
+- **Zipf Compliance:** R²=0.9908 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 57.7% of corpus
+- **Long Tail:** 4,744 words needed for remaining 3.5% coverage
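The Zipf coefficient and R² above correspond to a least-squares fit of log frequency against log rank; a sketch under the same assumed `frequency` column:

```python
# Fit log(frequency) = -s * log(rank) + c; s is the Zipf coefficient.
import numpy as np
import pandas as pd

freqs = np.sort(
    pd.read_parquet("models/vocabulary/eml_vocabulary.parquet")["frequency"].to_numpy()
)[::-1]
ranks = np.arange(1, len(freqs) + 1)

slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), 1)
resid = np.log(freqs) - (slope * np.log(ranks) + intercept)
r2 = 1 - (resid**2).sum() / ((np.log(freqs) - np.log(freqs).mean()) ** 2).sum()
print(f"Zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.6f}")
```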
 
 ---
 ## 5. Word Embeddings Evaluation
@@ -316,24 +412,119 @@ Below are text samples generated from each Markov chain model:
 
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
 
-### Model Comparison
+### 5.1 Cross-Lingual Alignment
+
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+
+### 5.2 Model Comparison
 
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 5,895 | 32 | 4.010 | 0.920 | 0.4456 🏆 |
-| **mono_64d** | 5,895 | 64 | 4.045 | 0.873 | 0.1681 |
-| **mono_128d** | 5,895 | 128 | 4.056 | 0.886 | 0.0292 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.3584 | 0.4391 | N/A | N/A |
+| **mono_64d** | 64 | 0.1134 | 0.4504 | N/A | N/A |
+| **mono_128d** | 128 | 0.0166 | 0.4596 | N/A | N/A |
+| **aligned_32d** | 32 | 0.3584 🏆 | 0.4411 | 0.0140 | 0.1660 |
+| **aligned_64d** | 64 | 0.1134 | 0.4292 | 0.0460 | 0.2440 |
+| **aligned_128d** | 128 | 0.0166 | 0.4457 | 0.0400 | 0.2640 |
 
 ### Key Findings
 
-- **Best Isotropy:** mono_32d with 0.4456 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 5,895 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
+- **Best Isotropy:** aligned_32d with 0.3584 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.4442. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 4.6% R@1 in cross-lingual retrieval.
+- **Recommendation:** 128d aligned for best cross-lingual performance
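A sketch of using the aligned artifacts for cross-lingual lookup. Two assumptions, neither stated in the report: that the `.bin` files are fastText binaries (the repo's tags include `fasttext`), and that each `.projection.npy` holds a dim x dim map into the `hub_language` (en) space named in the metadata files later in this commit.

```python
# Project an eml word vector into the shared hub space before nearest-neighbour
# retrieval; the R@1/R@10 numbers above are computed over such projected vectors.
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/aligned/eml_64d.bin")
W = np.load("models/embeddings/aligned/eml_64d.projection.npy")  # assumed shape (64, 64)

v = model.get_word_vector("scōla") @ W
v /= np.linalg.norm(v)  # after normalization, cosine similarity is a dot product
```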
 
 ---
-## 6. Summary & Recommendations
+## 6. Morphological Analysis (Experimental)
+
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+
+### 6.1 Productivity & Complexity
+
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **1.037** | High formulaic/idiomatic content | - |
+
+### 6.2 Affix Inventory (Productive Units)
+
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `ca-` | cal, cavésin, caviân |
+
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-a` | scōla, algebra, câṣva |
+| `-um` | coelum, adsum, 217śum |
+| `-na` | vègna, teresina, ruvîna |
+
+### 6.3 Bound Stems (Lexical Roots)
+
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `asiò` | 1.80x | 17 contexts | asiòṅ, asiòun, frasiòn |
+| `siòu` | 1.79x | 17 contexts | asiòun, sesiòun, lesiòun |
+| `purt` | 1.55x | 23 contexts | purtâ, purtê, purtä |
+| `iòun` | 1.73x | 16 contexts | uniòun, asiòun, sesiòun |
+| `nter` | 1.50x | 24 contexts | inter, nterra, dänter |
+| `sèin` | 1.51x | 17 contexts | sèins, sèint, casèin |
+| `tèin` | 1.48x | 16 contexts | latèin, estèin, putèin |
+| `ital` | 1.53x | 14 contexts | italy, italo, vitali |
+| `tôri` | 1.78x | 9 contexts | stôri, stôric, stôria |
+| `rèin` | 1.46x | 14 contexts | rèina, trèin, terèin |
+| `inte` | 1.59x | 11 contexts | inter, intern, interès |
+| `mèin` | 1.79x | 8 contexts | mèint, camèin, mumèint |
+
+### 6.4 Affix Compatibility (Co-occurrence)
+
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `ca-` | `-a` | 53 words | cavacürta, canpâgna |
+| `ca-` | `-na` | 16 words | canpâgna, catalógna |
+
+### 6.5 Recursive Morpheme Segmentation
+
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| cascaggna | **`ca-scagg-na`** | 3.0 | `scagg` |
+| califòrgna | **`ca-lifòrg-na`** | 3.0 | `lifòrg` |
+| campàggna | **`ca-mpàgg-na`** | 3.0 | `mpàgg` |
+| castlaran | **`ca-stlaran`** | 1.5 | `stlaran` |
+| philosophum | **`philosoph-um`** | 1.5 | `philosoph` |
+| privilegium | **`privilegi-um`** | 1.5 | `privilegi` |
+| calandäri | **`ca-landäri`** | 1.5 | `landäri` |
+| referendum | **`referend-um`** | 1.5 | `referend` |
+| metropolitana | **`metropolita-na`** | 1.5 | `metropolita` |
+| carabinieri | **`ca-rabinieri`** | 1.5 | `rabinieri` |
+| parmigiana | **`parmigia-na`** | 1.5 | `parmigia` |
+| funsiòuna | **`funsiòu-na`** | 1.5 | `funsiòu` |
+| carpigiano | **`ca-rpigiano`** | 1.5 | `rpigiano` |
+| caraterésstic | **`ca-raterésstic`** | 1.5 | `raterésstic` |
+| indipendentîxum | **`indipendentîx-um`** | 1.5 | `indipendentîx` |
+
+### 6.6 Linguistic Interpretation
+
+> **Automated Insight:**
+The language Unknown language [eml] shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
+
+> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
+
+---
+## 7. Summary & Recommendations
 
 ![Performance Dashboard](visualizations/performance_dashboard.png)
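A toy version of the substitutability test behind sections 6.2-6.5 above. This illustrates the idea only; it is not the pipeline's actual algorithm, which this report does not publish. The mini-vocabulary is drawn from the tables above.

```python
# Find candidate stems: character prefixes that occur with several distinct
# endings across the vocabulary (cf. the bound stem `purt` in section 6.3).
from collections import defaultdict

vocab = ["purtâ", "purtê", "purtä", "scōla", "vègna", "ruvîna", "teresina"]
endings_by_stem = defaultdict(set)
for word in vocab:
    for k in (1, 2, 3):             # try stripping 1-3 trailing characters
        if len(word) > k + 2:       # keep stems at least 3 characters long
            endings_by_stem[word[:-k]].add(word[-k:])

for stem, endings in endings_by_stem.items():
    if len(endings) >= 3:           # a substitutable slot suggests productive suffixing
        print(stem, sorted(endings))  # e.g. purt ['ä', 'â', 'ê']
```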
 
@@ -341,11 +532,12 @@ Below are text samples generated from each Markov chain model:
 
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (3.87x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (367) |
-| Markov | **Context-4** | Highest predictability (96.3%) |
+| Tokenizer | **32k BPE** | Best compression (3.37x) |
+| N-gram | **2-gram** | Lowest perplexity (342) |
+| Markov | **Context-4** | Highest predictability (97.1%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
+
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
 
@@ -535,7 +727,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs}
   institution = {Omneity Labs}
 }
@@ -551,7 +744,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
 
-*Report Date: 2025-12-30 12:20:31*
+*Report Date: 2026-01-04 14:33:51*
models/embeddings/aligned/eml_128d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcb8fc5fbcdb5bbb3c3855e6260d0c0ed9be1bd2540732441851e144e55b1bb5
+size 1028819578

models/embeddings/aligned/eml_128d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "eml", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/eml_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92a565cca0b1fec7fcc3929c57b7a0f14938727f603974d98feafa917c5b6a07
+size 65664

models/embeddings/aligned/eml_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "eml",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 1782,
+  "vocab_size": 4635
+}
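The three-line blobs in these diffs are Git LFS pointer files, not the model binaries themselves; `git lfs pull` (or the Hugging Face Hub download tooling) fetches the actual payload. The pointer format is trivially parseable:

```python
# Parse a Git LFS pointer file of the kind shown in the diffs above.
def parse_lfs_pointer(text: str) -> dict:
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:fcb8fc5fbcdb5bbb3c3855e6260d0c0ed9be1bd2540732441851e144e55b1bb5
size 1028819578"""

info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]))  # checksum and payload size in bytes
```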
models/embeddings/aligned/eml_32d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9048aabb8f2135dcb1eb5657f02c39c14081bad67cf1f08cc9d4be1e05987fdd
+size 257259898

models/embeddings/aligned/eml_32d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "eml", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/eml_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b3e21ae26c45f38a7dd8a3536b26336fb4ff728db58b503dcf80f58a62ce218
+size 4224

models/embeddings/aligned/eml_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "eml",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 1782,
+  "vocab_size": 4635
+}

models/embeddings/aligned/eml_64d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54e13e135d67a9e81ac2a112d124d647a6d159a876cf31a47f640828b12b32af
+size 514446458

models/embeddings/aligned/eml_64d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "eml", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/eml_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8496df49686df9ad094fba37cb9c2c359c62122fe0b0c3a35107a9b92700715
+size 16512

models/embeddings/aligned/eml_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "eml",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 1782,
+  "vocab_size": 4635
+}
models/embeddings/monolingual/eml_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2dec3940dbc985fa841d1a2109e3331109da252bb16a290feddef6bff141233a
-size 1030131813
+oid sha256:fcb8fc5fbcdb5bbb3c3855e6260d0c0ed9be1bd2540732441851e144e55b1bb5
+size 1028819578

models/embeddings/monolingual/eml_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
-  "vocab_size": 5895
+  "vocab_size": 4635
 }

models/embeddings/monolingual/eml_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8961e078ac6044a853b46be56893ebce2c5eb901499627424d436e4b8225a68e
-size 257604453
+oid sha256:9048aabb8f2135dcb1eb5657f02c39c14081bad67cf1f08cc9d4be1e05987fdd
+size 257259898

models/embeddings/monolingual/eml_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
-  "vocab_size": 5895
+  "vocab_size": 4635
 }

models/embeddings/monolingual/eml_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ae7fd24e033b292e2212d1507f6faa04781e383418da4f7fbdf4b9fc502c9483
-size 515113573
+oid sha256:54e13e135d67a9e81ac2a112d124d647a6d159a876cf31a47f640828b12b32af
+size 514446458

models/embeddings/monolingual/eml_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
-  "vocab_size": 5895
+  "vocab_size": 4635
 }
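The metadata shown above can be read back to keep loading code in sync with the artifacts; a minimal sketch using only the standard library:

```python
# Read the embedding metadata and confirm it is self-consistent; the file
# contents are exactly those shown in the diff above.
import json

with open("models/embeddings/monolingual/eml_64d_metadata.json") as f:
    meta = json.load(f)

assert meta["training_params"]["dim"] == meta["dimension"] == 64
print(meta["training_params"]["algorithm"], meta["vocab_size"])  # skipgram 4635
```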
models/subword_markov/eml_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c097ab7bc9b897e8c09154ba19528d122008c0ea16dc6b51790599f0c3dd6dfe
-size 49006
+oid sha256:d6b926e1938473f2238d8b1a246fdca632c9df31a38f38cde47b1430b2326ca8
+size 41044

models/subword_markov/eml_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "eml",
-  "unique_contexts": 518,
-  "total_transitions": 2357750
+  "unique_contexts": 398,
+  "total_transitions": 1534890
 }

models/subword_markov/eml_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:be0033475c1652e77b5fdf968e2c48e4cc8243a92791a66f0075dd78ea7da229
-size 302403
+oid sha256:b6b72d5397e477fda64eb68479d038a681650923e44ef8e56c40f841f61196ad
+size 239585

models/subword_markov/eml_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "eml",
-  "unique_contexts": 5633,
-  "total_transitions": 2344566
+  "unique_contexts": 4661,
+  "total_transitions": 1529310
 }

models/subword_markov/eml_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:52b0a67bc82d05f5a35f649a72f33cdb45172a5d33904de6e0809a03164f8bae
-size 1051721
+oid sha256:d32d5c8474464f3542b5c640f67f02b7cdd18b7294731baa5ac803d49bb0eda9
+size 824128

models/subword_markov/eml_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "eml",
-  "unique_contexts": 38777,
-  "total_transitions": 2331382
+  "unique_contexts": 31279,
+  "total_transitions": 1523730
 }

models/subword_markov/eml_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0b3fa3bdc992bb33045daf260d321e98cd4d8f0bff7fe9925ceefb1b894f645c
-size 2772328
+oid sha256:86656b5d89b562fdcdf8d2b076d8bbbace53baaf9733ab41014512d25be68d3a
+size 2063582

models/subword_markov/eml_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "eml",
-  "unique_contexts": 153229,
-  "total_transitions": 2318198
+  "unique_contexts": 115443,
+  "total_transitions": 1518150
 }
models/subword_ngram/eml_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc925b29a9fc88ab4cf66b8408d9eed367b4928e944c1292dd5824336cea41ac
-size 38300
+oid sha256:abc65477b77ec14a68768bc6a4cbe985f8fb901ef27a3c085790b09ba780b5f7
+size 32669

models/subword_ngram/eml_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "eml",
-  "unique_ngrams": 2917,
-  "total_ngrams": 2357750
+  "unique_ngrams": 2464,
+  "total_ngrams": 1534890
 }

models/subword_ngram/eml_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77c3e7f77b71859d97a1553d613852050db6480386165b77e169f8f2e811014e
-size 253696
+oid sha256:98c47ca7a64b381223f763a388157b191e0de53ced724ae4b32a96deaf39a731
+size 205276

models/subword_ngram/eml_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "eml",
-  "unique_ngrams": 21058,
-  "total_ngrams": 2344566
+  "unique_ngrams": 17300,
+  "total_ngrams": 1529310
 }

models/subword_ngram/eml_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2f729f6f0180bbb326697b8f8f6ee4544692e6237369a32db85e4d4314868f64
-size 995050
+oid sha256:9c52554bef1faa4c04b02e07fd6ee1d6c2d1808c138f7468b367a43effed73bf
+size 776580

models/subword_ngram/eml_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "eml",
-  "unique_ngrams": 88240,
-  "total_ngrams": 2331382
+  "unique_ngrams": 65901,
+  "total_ngrams": 1523730
 }

models/subword_ngram/eml_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ceedf622b97b0e16718698141c4b219dee5ba830fac72b206d1dc6e3f594096b
+size 1371520

models/subword_ngram/eml_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+{
+  "n": 5,
+  "variant": "subword",
+  "language": "eml",
+  "unique_ngrams": 117450,
+  "total_ngrams": 1518150
+}
models/tokenizer/eml_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da605d84f28d5f46f26288c35f9d14a5b54103782a7f4fff43cb4ebbc45fcfb0
-size 501006
+oid sha256:e4ca140168e773921387292dd413a84d6448393458bdcff201a3aa5256417afb
+size 508897

models/tokenizer/eml_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render.

models/tokenizer/eml_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4dd80c30b968ba0c9cb19ceea25fde8d0ee363c9db15ed1778feabd4e8d0fe63
-size 783799
+oid sha256:3f05d7f5bc311f044df8793f8da075d15b79dd7ccce85b63f5afe91f40ea36b3
+size 797552

models/tokenizer/eml_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render.

models/tokenizer/eml_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1f40c61f5172b6055933e6e6cbad3e888013bf908612e87962354dcee190ed69
-size 370286
+oid sha256:0a22d81760f62d518080719bef868af8f2d18593e0e1ddcab0664e141bc3c718
+size 370847

models/tokenizer/eml_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render.
 
models/vocabulary/eml_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7e9c74a50c31370e5c3ea97bfdf5872799156358c39c661f61de5beebc61be3
-size 287372
+oid sha256:292e25805145d187f0bbc5f3f2fc487329b45ee890b9b03b744d59d04ce3e18d
+size 233473

models/vocabulary/eml_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
 {
   "language": "eml",
-  "vocabulary_size": 18015,
+  "vocabulary_size": 14744,
+  "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.1374079429750441,
+    "type_token_ratio": 0.12930704797360182,
     "coverage": {
-      "top_100": 0.5431980467359734,
-      "top_1000": 0.7066495100662288,
-      "top_5000": 0.814778403589944,
-      "top_10000": 0.8643118651829692
+      "top_100": 0.5308097131737034,
+      "top_1000": 0.7164396311024621,
+      "top_5000": 0.8352855571537355,
+      "top_10000": 0.8884812589897623
     },
-    "hapax_count": 37194,
-    "hapax_ratio": 0.6736945063304896,
-    "total_documents": 13184
+    "hapax_count": 23463,
+    "hapax_ratio": 0.6141021278823252,
+    "total_documents": 5580
   }
 }
models/word_markov/eml_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d821202557de5b54da2f843a288c7a778a7dccc6dbce3a3dcac43e102399b7d1
-size 1512273
+oid sha256:b58650add7a0b58bfff9460f57cc83996fd12568ed8a603676040d4122f7d3f7
+size 1170078

models/word_markov/eml_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "eml",
-  "unique_contexts": 54943,
-  "total_transitions": 500906
+  "unique_contexts": 38079,
+  "total_transitions": 289895
 }

models/word_markov/eml_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b6b0d6986dea78583fb0c1563283c6d6f8c70ea36c65e20d31a321417c8aff29
-size 2759692
+oid sha256:180ce7b12c5d6e9d779d053b5c92bd28686e4ae07387f9ce71787acbdc206aa8
+size 2189970

models/word_markov/eml_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "eml",
-  "unique_contexts": 163004,
-  "total_transitions": 487775
+  "unique_contexts": 123729,
+  "total_transitions": 284315
 }

models/word_markov/eml_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:45b80193c6d32ec40bbe313766649b2f767a796e6427017d9b93cd588fc105bb
-size 3690696
+oid sha256:8dcf8703e6eb0d8f9636fcdb573b7b4b59c35eca9c75294e4d6f51dbb89b5c46
+size 2864580

models/word_markov/eml_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "eml",
-  "unique_contexts": 229724,
-  "total_transitions": 474974
+  "unique_contexts": 170769,
+  "total_transitions": 278735
 }