omarkamali committed on
Commit c7a088b · verified · 1 Parent(s): 4d1df03

Upload all models and assets for dz (latest)

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +296 -142
  3. models/embeddings/aligned/dz_128d.bin +3 -0
  4. models/embeddings/aligned/dz_128d.meta.json +1 -0
  5. models/embeddings/aligned/dz_128d.projection.npy +3 -0
  6. models/embeddings/aligned/dz_128d_metadata.json +8 -0
  7. models/embeddings/aligned/dz_32d.bin +3 -0
  8. models/embeddings/aligned/dz_32d.meta.json +1 -0
  9. models/embeddings/aligned/dz_32d.projection.npy +3 -0
  10. models/embeddings/aligned/dz_32d_metadata.json +8 -0
  11. models/embeddings/aligned/dz_64d.bin +3 -0
  12. models/embeddings/aligned/dz_64d.meta.json +1 -0
  13. models/embeddings/aligned/dz_64d.projection.npy +3 -0
  14. models/embeddings/aligned/dz_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/dz_128d.bin +2 -2
  16. models/embeddings/monolingual/dz_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/dz_32d.bin +2 -2
  18. models/embeddings/monolingual/dz_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/dz_64d.bin +2 -2
  20. models/embeddings/monolingual/dz_64d_metadata.json +5 -3
  21. models/subword_markov/dz_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/dz_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/dz_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/dz_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/dz_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/dz_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/dz_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/dz_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/dz_2gram_subword.parquet +2 -2
  30. models/subword_ngram/dz_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/dz_3gram_subword.parquet +2 -2
  32. models/subword_ngram/dz_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/dz_4gram_subword.parquet +2 -2
  34. models/subword_ngram/dz_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/dz_5gram_subword.parquet +3 -0
  36. models/subword_ngram/dz_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/dz_tokenizer_16k.model +2 -2
  38. models/tokenizer/dz_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/dz_tokenizer_32k.model +2 -2
  40. models/tokenizer/dz_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/dz_tokenizer_64k.model +2 -2
  42. models/tokenizer/dz_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/dz_tokenizer_8k.model +2 -2
  44. models/tokenizer/dz_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/dz_vocabulary.parquet +2 -2
  46. models/vocabulary/dz_vocabulary_metadata.json +10 -8
  47. models/word_markov/dz_markov_ctx1_word.parquet +2 -2
  48. models/word_markov/dz_markov_ctx1_word_metadata.json +2 -2
  49. models/word_markov/dz_markov_ctx2_word.parquet +2 -2
  50. models/word_markov/dz_markov_ctx2_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: dz
3
- language_name: DZ
4
  language_family: tibetoburman_tibetic
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
13
  - monolingual
14
  - family-tibetoburman_tibetic
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 7.097
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.7372
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 4041
33
- generated: 2025-12-30
34
  ---
35
 
36
- # DZ - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **DZ** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,58 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
 
 
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 4.839x | 4.82 | 0.1103% | 866,377 |
76
- | **16k** | 5.558x | 5.53 | 0.1267% | 754,348 |
77
- | **32k** | 6.305x | 6.28 | 0.1438% | 664,948 |
78
- | **64k** | 7.097x 🏆 | 7.07 | 0.1618% | 590,703 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `ཡུ་ནའི་ཊེཊ་ཨ་རབ་ཨེ་མི་རེཊསི
85
-
86
- དུ་བ་ཡེ
87
-
88
- Category:རྒྱལ་ཁབ
89
- Category:ཨེ་ཤི་ཡ`
90
 
91
  | Vocab | Tokens | Count |
92
  |-------|--------|-------|
93
- | 8k | `▁ཡུ་ ནའི་ ཊེ ཊ་ ཨ་ར བ་ ཨེ་ མི་ རེ ... (+12 more)` | 22 |
94
- | 16k | `▁ཡུ་ནའི་ཊེཊ་ ཨ་རབ་ ཨེ་ མི་ རེ སི ▁ད ུ་ བ་ ... (+7 more)` | 17 |
95
- | 32k | `▁ཡུ་ནའི་ཊེཊ་ ཨ་རབ་ ཨེ་ མི་རེ སི ▁དུ་ བ་ ཡེ ▁category ... (+5 more)` | 15 |
96
- | 64k | `▁ཡུ་ནའི་ཊེཊ་ ཨ་རབ་ཨེ་མི་རེ སི ▁དུ་བ་ཡེ ▁category : རྒྱལ་ཁབ ▁category : ... (+1 more)` | 11 |
97
 
98
- **Sample 2:** `གཟའནེཔ་ཊུན ༺རྟགས༔ 20px|♆༻
99
- Category:གནམ་རིག`
100
 
101
  | Vocab | Tokens | Count |
102
  |-------|--------|-------|
103
- | 8k | `▁གཟ པ་ ུན རྟ ... (+13 more)` | 23 |
104
- | 16k | `▁གཟ ནེ པ་ ུན རྟ གས༔ ... (+9 more)` | 19 |
105
- | 32k | `▁གཟའ ནེ པ་ཊ ུན རྟགས༔ 2 0 ... (+6 more)` | 16 |
106
- | 64k | `▁གཟའ ནེ པ་ཊ ུན རྟགས༔ 2 0 ... (+6 more)` | 16 |
107
 
108
- **Sample 3:** `གཟའསྤེན་པ ༺རྟགས༔ 20px|♄༻
109
- Category:གནམ་རིག`
110
 
111
  | Vocab | Tokens | Count |
112
  |-------|--------|-------|
113
- | 8k | `▁གཟ སྤ ེན་ རྟ གས ... (+11 more)` | 21 |
114
- | 16k | `▁གཟ སྤ ེན་ རྟ གས༔ ... (+8 more)` | 18 |
115
- | 32k | `▁གཟའ སྤ ེན་ རྟགས༔ 2 0 ... (+6 more)` | 16 |
116
- | 64k | `▁གཟའསྤེན་ རྟགས༔ 2 0 px | ... (+4 more)` | 14 |
117
 
118
 
119
  ### Key Findings
120
 
121
- - **Best Compression:** 64k achieves 7.097x compression
122
- - **Lowest UNK Rate:** 8k with 0.1103% unknown tokens
123
  - **Trade-off:** Larger vocabularies improve compression but increase model size
124
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
125
 
@@ -128,57 +139,111 @@ Category:གནམ་རིག`
128
 
129
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
130
 
 
 
131
  ![N-gram Coverage](visualizations/ngram_coverage.png)
132
 
133
  ### Results
134
 
135
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
136
- |--------|------------|---------|----------------|------------------|-------------------|
137
- | **2-gram** | 405 🏆 | 8.66 | 5,365 | 59.5% | 96.2% |
138
- | **2-gram** | 305 🏆 | 8.25 | 3,288 | 65.5% | 97.8% |
139
- | **3-gram** | 2,415 | 11.24 | 27,806 | 28.8% | 71.5% |
140
- | **3-gram** | 1,793 | 10.81 | 19,053 | 28.1% | 78.4% |
141
- | **4-gram** | 11,256 | 13.46 | 95,263 | 14.3% | 42.2% |
142
- | **4-gram** | 7,832 | 12.94 | 68,338 | 14.3% | 45.4% |
 
 
143
 
144
  ### Top 5 N-grams by Size
145
 
146
- **2-grams:**
 
 
 
 
147
 
148
  | Rank | N-gram | Count |
149
  |------|--------|-------|
150
- | 1 | `ི ་` | 117,504 |
151
- | 2 | `་ ས` | 79,905 |
152
- | 3 | `ན ་` | 68,413 |
153
- | 4 | `་ ར` | 62,786 |
154
- | 5 | `་ ལ` | 54,596 |
155
 
156
- **3-grams:**
157
 
158
  | Rank | N-gram | Count |
159
  |------|--------|-------|
160
- | 1 | `ྱ ་` | 22,148 |
161
- | 2 | `་ དང ་` | 20,477 |
162
- | 3 | `་ ་` | 18,904 |
163
- | 4 | `་ ུ` | 18,737 |
164
- | 5 | `ི ན ་` | 18,711 |
165
 
166
- **4-grams:**
167
 
168
  | Rank | N-gram | Count |
169
  |------|--------|-------|
170
- | 1 | `་ ྱ` | 16,554 |
171
- | 2 | `་ པའ ་` | 15,863 |
172
- | 3 | `་ ་` | 15,197 |
173
- | 4 | `་ ་` | 11,786 |
174
- | 5 | `་ ་` | 11,404 |
175
 
176
 
177
  ### Key Findings
178
 
179
- - **Best Perplexity:** 2-gram with 305
180
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
181
- - **Coverage:** Top-1000 patterns cover ~45% of corpus
182
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
183
 
184
  ---
@@ -186,55 +251,86 @@ Category:གནམ་རིག`
186
 
187
  ![Markov Entropy](visualizations/markov_entropy.png)
188
 
 
 
189
  ![Markov Branching](visualizations/markov_branching.png)
190
 
191
  ### Results
192
 
193
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
194
- |---------|-------------|------------|------------------|-----------------|----------------|
195
- | **1** | 0.4630 | 1.378 | 3.64 | 8,836 | 53.7% |
196
- | **1** | 1.1778 | 2.262 | 9.63 | 727 | 0.0% |
197
- | **2** | 0.3272 | 1.255 | 2.83 | 32,129 | 67.3% |
198
- | **2** | 0.9588 | 1.944 | 5.50 | 7,001 | 4.1% |
199
- | **3** | 0.2937 🏆 | 1.226 | 2.25 | 90,896 | 70.6% |
200
- | **3** | 0.6936 🏆 | 1.617 | 3.16 | 38,506 | 30.6% |
201
- | **4** | 0.3266 | 1.254 | 2.05 | 204,174 | 67.3% |
202
- | **4** | 0.5052 | 1.419 | 2.36 | 121,621 | 49.5% |
 
 
 
 
203
 
204
- ### Generated Text Samples
 
 
 
 
 
 
205
 
206
- Below are text samples generated from each Markov chain model:
 
 
 
207
 
208
  **Context Size 1:**
209
 
210
- 1. `་ མ ་ གཡ ོ ད ་ པ ོ ག ་ དང ་ བའ ི ང`
211
- 2. `ི ་ ཚ ུ ག ི ན ་ འཕ ེ ༌ འད ྲ ྭ གས ་`
212
- 3. `ོ ། ད ་ པ ་ ད ་ མ ་ ག ་ ས ་ བཙག ་`
213
 
214
  **Context Size 2:**
215
 
216
- 1. `ི ་ གནས ་ ཀ ྱ ི ་ ལ ུ ་ ར ྒ ྱ ལ ་ མང`
217
- 2. `་ ས ྤ ྱ ོ བས ་ ལ ུ ་ འཐད ་ པ ་ དང ་ གཅ`
218
- 3. `ན ་ ཌ ེ མ ་ ག ི ་ ལ ེ འ ི ་ གསར ་ ར`
219
 
220
  **Context Size 3:**
221
 
222
- 1. `ྱ ི ་ ཡ ུ དཔ ་ ཐ ེ ངས ་ ཏ ེ ། ། ཡར ་ ཡར`
223
- 2. `་ དང ་ ། ། མས ༌ ཀ ྱ ི ས ་ ག ླ ི ང ་ ག`
224
- 3. `་ པ ་ ཟ ེ ར ་ ག ྱ ི ་ མཐ ུ ད ་ འད ི ་`
225
 
226
  **Context Size 4:**
227
 
228
- 1. `་ ར ྒ ྱ ལ ་ ཁབ ་ ནང ་ ལ ུ ་ ཐད ་ ཀར ་ ག ླ`
229
- 2. `་ པའ ི ་ གས ེ ར ་ ག ྱ ི ་ ས ྒ ྲ ུ བ ་ ཆ`
230
- 3. `་ ལ ུ ་ མཁས ་ པ ་ འགའ ་ ཡང ་ མ ི ་ ས ྡ ེ ་`
231
 
232
 
233
  ### Key Findings
234
 
235
- - **Best Predictability:** Context-3 with 70.6% predictability
236
  - **Branching Factor:** Decreases with context size (more deterministic)
237
- - **Memory Trade-off:** Larger contexts require more storage (121,621 contexts)
238
  - **Recommendation:** Context-3 or Context-4 for text generation
239
 
240
  ---
@@ -250,64 +346,64 @@ Below are text samples generated from each Markov chain model:
250
 
251
  | Metric | Value |
252
  |--------|-------|
253
- | Vocabulary Size | 4,041 |
254
- | Total Tokens | 1,481,034 |
255
- | Mean Frequency | 366.50 |
256
- | Median Frequency | 4 |
257
- | Frequency Std Dev | 4331.41 |
258
 
259
  ### Most Common Words
260
 
261
  | Rank | Word | Frequency |
262
  |------|------|-----------|
263
- | 1 | | 138,469 |
264
- | 2 | | 99,582 |
265
- | 3 | | 94,106 |
266
- | 4 | | 91,997 |
267
- | 5 | | 90,195 |
268
- | 6 | | 76,047 |
269
- | 7 | | 55,299 |
270
- | 8 | | 48,577 |
271
- | 9 | | 46,244 |
272
- | 10 | གས | 35,001 |
273
 
274
  ### Least Common Words (from vocabulary)
275
 
276
  | Rank | Word | Frequency |
277
  |------|------|-----------|
278
- | 1 | yongla | 2 |
279
- | 2 | pelbar | 2 |
280
- | 3 | dargeychhoeling | 2 |
281
- | 4 | fortress | 2 |
282
- | 5 | gods | 2 |
283
- | 6 | shaba | 2 |
284
  | 7 | assam | 2 |
285
  | 8 | pelgen | 2 |
286
- | 9 | bjoka | 2 |
287
- | 10 | ༡༨༨༩ | 2 |
288
 
289
  ### Zipf's Law Analysis
290
 
291
  | Metric | Value |
292
  |--------|-------|
293
- | Zipf Coefficient | 1.7500 |
294
- | R² (Goodness of Fit) | 0.979006 |
295
  | Adherence Quality | **excellent** |
296
 
297
  ### Coverage Analysis
298
 
299
  | Top N Words | Coverage |
300
  |-------------|----------|
301
- | Top 100 | 91.1% |
302
- | Top 1,000 | 99.3% |
303
- | Top 5,000 | 0.0% |
304
  | Top 10,000 | 0.0% |
305
 
306
  ### Key Findings
307
 
308
- - **Zipf Compliance:** R²=0.9790 indicates excellent adherence to Zipf's law
309
- - **High Frequency Dominance:** Top 100 words cover 91.1% of corpus
310
- - **Long Tail:** -5,959 words needed for remaining 100.0% coverage
311
 
312
  ---
313
  ## 5. Word Embeddings Evaluation
@@ -320,24 +416,79 @@ Below are text samples generated from each Markov chain model:
320
 
321
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
322
 
323
- ### Model Comparison
324
 
325
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
326
- |-------|------------|-----------|----------|----------|----------|
327
- | **mono_32d** | 1,914 | 32 | 4.382 | 1.097 | 0.7372 🏆 |
328
- | **mono_64d** | 1,914 | 64 | 4.602 | 1.013 | 0.4928 |
329
- | **mono_128d** | 1,914 | 128 | 4.665 | 1.008 | 0.1299 |
330
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
 
 
 
331
 
332
  ### Key Findings
333
 
334
- - **Best Isotropy:** mono_32d with 0.7372 (more uniform distribution)
335
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
336
- - **Vocabulary Coverage:** All models cover 1,914 words
337
- - **Recommendation:** 100d for balanced semantic capture and efficiency
 
 
 
 
338
 
339
  ---
340
- ## 6. Summary & Recommendations
341
 
342
  ![Performance Dashboard](visualizations/performance_dashboard.png)
343
 
@@ -345,11 +496,12 @@ Below are text samples generated from each Markov chain model:
345
 
346
  | Component | Recommended | Rationale |
347
  |-----------|-------------|-----------|
348
- | Tokenizer | **32k BPE** | Best compression (7.10x) with low UNK rate |
349
- | N-gram | **5-gram** | Lowest perplexity (305) |
350
- | Markov | **Context-4** | Highest predictability (70.6%) |
351
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
352
 
 
353
  ---
354
  ## Appendix: Metrics Glossary & Interpretation Guide
355
 
@@ -539,7 +691,8 @@ If you use these models in your research, please cite:
539
  author = {Kamali, Omar},
540
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
541
  year = {2025},
542
- publisher = {HuggingFace},
 
543
  url = {https://huggingface.co/wikilangs}
544
  institution = {Omneity Labs}
545
  }
@@ -555,7 +708,8 @@ MIT License - Free for academic and commercial use.
555
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
556
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
557
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
558
  ---
559
  *Generated by Wikilangs Models Pipeline*
560
 
561
- *Report Date: 2025-12-30 08:46:14*
 
1
  ---
2
  language: dz
3
+ language_name: Dzongkha
4
  language_family: tibetoburman_tibetic
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-tibetoburman_tibetic
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 5.510
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.6999
40
  - name: vocabulary_size
41
  type: vocab
42
+ value: 0
43
+ generated: 2026-01-04
44
  ---
45
 
46
+ # Dzongkha - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Dzongkha** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
54
  ### Models & Assets
55
 
56
  - Tokenizers (8k, 16k, 32k, 64k)
57
+ - N-gram models (2, 3, 4, 5-gram)
58
+ - Markov chains (context of 1, 2, 3, 4 and 5)
59
  - Subword N-gram and Markov chains
60
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
61
  - Language Vocabulary
62
  - Language Statistics
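
All of the assets listed above are ordinary files in this repository, so they can be fetched locally with `huggingface_hub`. A minimal sketch; the `repo_id` below is a placeholder assumption, not stated on this page:

```python
# Minimal sketch: pull the model assets locally with huggingface_hub.
# The repo_id is a placeholder; replace it with this repository's actual id.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="wikilangs/dz",                      # placeholder assumption
    allow_patterns=["models/*", "README.md"],    # skip large visualization assets
)
print("downloaded to", local_dir)
```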
63
+
64
  ![Performance Dashboard](visualizations/performance_dashboard.png)
65
 
66
  ### Analysis and Evaluation
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
74
+ - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
77
 
 
80
 
81
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
82
 
83
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
84
+
85
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
86
+
87
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
88
+
89
  ### Results
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 4.484x | 4.49 | 0.0965% | 813,691 |
94
+ | **16k** | 4.768x | 4.77 | 0.1026% | 765,197 |
95
+ | **32k** | 5.092x | 5.09 | 0.1096% | 716,539 |
96
+ | **64k** | 5.510x 🏆 | 5.51 | 0.1185% | 662,175 |
97
 
98
  ### Tokenization Examples
99
 
100
  Below are sample sentences tokenized with each vocabulary size:
101
 
102
+ **Sample 1:** `རྒྱལ་ཁབ ཇཱ་པཱན། 日本 ཇ་པན་གྱི་རྒྱལ་ཁབ་འདི་ཤར་ཨེ་ཤི་ཡ་ལུ་ཆགས་ཏི་ཡོད་མི་མཚོ་གླིང་གྱི...`
 
 
 
 
 
103
 
104
  | Vocab | Tokens | Count |
105
  |-------|--------|-------|
106
+ | 8k | `▁རྒྱལ་ཁབ ▁ཇ ཱ་ ཱན། 日本 ▁ཇ་པ ན་ གྱི་རྒྱལ་ཁབ་ ... (+31 more)` | 41 |
107
+ | 16k | `▁རྒྱལ་ཁབ ▁ཇཱ་པཱན། 日本 ▁ཇ་པན་ གྱི་རྒྱལ་ཁབ་ འདི་ ཤར་ཨེ་ཤི་ཡ་ ལུ་ཆགས་ ཏི་ ... (+23 more)` | 33 |
108
+ | 32k | `▁རྒྱལ་ཁབ ▁ཇཱ་པཱན། 日本 ▁ཇ་པན་ གྱི་རྒྱལ་ཁབ་ འདི་ཤར་ཨེ་ཤི་ཡ་ ལུ་ཆགས་ཏི་ ཡོད་མི་ མཚོ་གླིང་གྱི་ ... (+12 more)` | 22 |
109
+ | 64k | `▁རྒྱལ་ཁབ ▁ཇཱ་པཱན། 日本 ▁ཇ་པན་ གྱི་རྒྱལ་ཁབ་ འདི་ཤར་ཨེ་ཤི་ཡ་ ལུ་ཆགས་ཏི་ ཡོད་མི་ མཚོ་གླིང་གྱི་ ... (+12 more)` | 22 |
110
 
111
+ **Sample 2:** `སེམས་ཅན བྱི་ལི ཁྱི ཉ སྟག​ བྱམོ དོམ ལུག རྟ བྱི་ཙི པར་རིས་བར་འཁྱམས། ཁུངས་གཏུག། ཕྱི...`
 
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁སེམས་ ཅན ▁བྱི་ ལི ▁ཁྱ ▁ཉ ▁སྟ ▁བྱ ... (+15 more)` | 25 |
116
+ | 16k | `▁སེམས་ཅན ▁བྱི་ལི ▁ཁྱ ▁ཉ ▁སྟ ▁བྱ མོ ▁ད ... (+13 more)` | 23 |
117
+ | 32k | `▁སེམས་ཅན ▁བྱི་ལི ▁ཁྱི ▁ཉ ▁སྟག ▁བྱམོ ▁དོམ ▁ལུག ▁རྟ ▁བྱི་ཙི ... (+5 more)` | 15 |
118
+ | 64k | `▁སེམས་ཅན ▁བྱི་ལི ▁ཁྱི ▁ཉ ▁སྟག ▁བྱམོ ▁དོམ ▁ལུག ▁རྟ ▁བྱི་ཙི ... (+5 more)` | 15 |
119
 
120
+ **Sample 3:** `ཞི་ཆོག་གི་སྐབས་ལུ་འཕུ་ནི་གི་ཆོས་ཆས། རྒྱ་མཚོ་ནང་གི་སེམས་ཅན་ཅིག་གི་ཕྱི་ཤུབས། དུང་ད...`
 
121
 
122
  | Vocab | Tokens | Count |
123
  |-------|--------|-------|
124
+ | 8k | `▁ཞི་ ཆོག་ གི་ སྐབས་ལུ་ འཕ ུ་ ནི་གི་ ཆོས་ ཆས། ▁རྒྱ་མཚོ་ ... (+15 more)` | 25 |
125
+ | 16k | `▁ཞི་ ཆོག་ གི་སྐབས་ལུ་ འཕ ུ་ ནི་གི་ ཆོས་ ཆས། ▁རྒྱ་མཚོ་ ནང་གི་ ... (+12 more)` | 22 |
126
+ | 32k | `▁ཞི་ཆོག་ གི་སྐབས་ལུ་ འཕུ་ནི་གི་ ཆོས་ཆས། ▁རྒྱ་མཚོ་ ནང་གི་སེམས་ཅན་ ཅིག་གི་ཕྱི་ཤུབས། ▁དུང་དཀར་གྱི་ མིང་གཞན་ ▁སྐྱེ་བ་ལྔ་པ་ ... (+1 more)` | 11 |
127
+ | 64k | `▁ཞི་ཆོག་ གི་སྐབས་ལུ་ འཕུ་ནི་གི་ ཆོས་ཆས། ▁རྒྱ་མཚོ་ ནང་གི་སེམས་ཅན་ ཅིག་གི་ཕྱི་ཤུབས། ▁དུང་དཀར་གྱི་ མིང་གཞན་ ▁སྐྱེ་བ་ལྔ་པ་ ... (+1 more)` | 11 |
128
 
129
 
130
  ### Key Findings
131
 
132
+ - **Best Compression:** 64k achieves 5.510x compression
133
+ - **Lowest UNK Rate:** 8k with 0.0965% unknown tokens
134
  - **Trade-off:** Larger vocabularies improve compression but increase model size
135
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
136
 
 
139
 
140
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
141
 
142
+ ![N-gram Unique](visualizations/ngram_unique.png)
143
+
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
150
+ | **2-gram** | Word | 11,790 | 13.53 | 28,884 | 11.2% | 35.3% |
151
+ | **2-gram** | Subword | 488 🏆 | 8.93 | 5,527 | 57.6% | 90.8% |
152
+ | **3-gram** | Word | 34,131 | 15.06 | 59,067 | 5.7% | 18.6% |
153
+ | **3-gram** | Subword | 3,461 | 11.76 | 28,498 | 24.5% | 62.8% |
154
+ | **4-gram** | Word | 80,153 | 16.29 | 114,752 | 2.9% | 10.7% |
155
+ | **4-gram** | Subword | 15,479 | 13.92 | 106,273 | 12.4% | 37.5% |
156
+ | **5-gram** | Word | 77,316 | 16.24 | 96,422 | 2.3% | 8.9% |
157
+ | **5-gram** | Subword | 44,243 | 15.43 | 194,726 | 7.1% | 23.4% |
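
The Perplexity and Entropy columns are two views of the same measurement: perplexity is 2 raised to the entropy in bits (for example, 2^8.93 ≈ 488 for the subword 2-gram). A quick consistency check:

```python
# Perplexity = 2 ** entropy (entropy in bits); compare against the table above.
for entropy_bits, reported_ppl in [(8.93, 488), (11.76, 3_461), (13.53, 11_790)]:
    print(f"2**{entropy_bits} ≈ {2 ** entropy_bits:,.0f} (reported {reported_ppl:,})")
```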
158
 
159
  ### Top 5 N-grams by Size
160
 
161
+ **2-grams (Word):**
162
+
163
+ | Rank | N-gram | Count |
164
+ |------|--------|-------|
165
+ | 1 | `ཡོདཔ ཨིན` | 3,325 |
166
+ | 2 | `རྒྱལ ཁབ` | 2,719 |
167
+ | 3 | `སྤྱི ལོ` | 1,933 |
168
+ | 4 | `ཨིན མས` | 1,872 |
169
+ | 5 | `ནང ལུ` | 1,628 |
170
+
171
+ **3-grams (Word):**
172
+
173
+ | Rank | N-gram | Count |
174
+ |------|--------|-------|
175
+ | 1 | `རིན པོ ཆེ` | 778 |
176
+ | 2 | `ཡོདཔ ཨིན མས` | 778 |
177
+ | 3 | `རྒྱལ ཁབ ནང` | 732 |
178
+ | 4 | `སྤྱི ལོ ལུ` | 688 |
179
+ | 5 | `འབྲུག རྒྱལ ཁབ` | 623 |
180
+
181
+ **4-grams (Word):**
182
+
183
+ | Rank | N-gram | Count |
184
+ |------|--------|-------|
185
+ | 1 | `རྒྱལ ཁབ ནང ལུ` | 309 |
186
+ | 2 | `འབྲུག རྒྱལ ཁབ ནང` | 288 |
187
+ | 3 | `དཔལ ལྡན འབྲུག པའི` | 272 |
188
+ | 4 | `གུ རུ རིན པོ` | 250 |
189
+ | 5 | `སྡེ སྲིད ཁྲི རབས` | 223 |
190
+
191
+ **5-grams (Word):**
192
+
193
+ | Rank | N-gram | Count |
194
+ |------|--------|-------|
195
+ | 1 | `གུ རུ རིན པོ ཆེ` | 184 |
196
+ | 2 | `གནམ ལོ མེད སྤྱི ལོ` | 162 |
197
+ | 3 | `ཞབས དྲུང རིན པོ ཆེ` | 150 |
198
+ | 4 | `རྒྱལ ཡོངས དགའ སྐྱིད དཔལ` | 127 |
199
+ | 5 | `ཡོངས དགའ སྐྱིད དཔལ འཛོམས` | 125 |
200
+
201
+ **2-grams (Subword):**
202
+
203
+ | Rank | N-gram | Count |
204
+ |------|--------|-------|
205
+ | 1 | `ས ་` | 123,525 |
206
+ | 2 | `ང ་` | 91,851 |
207
+ | 3 | `ན ་` | 70,834 |
208
+ | 4 | `་ _` | 62,281 |
209
+ | 5 | `་ བ` | 59,589 |
210
+
211
+ **3-grams (Subword):**
212
 
213
  | Rank | N-gram | Count |
214
  |------|--------|-------|
215
+ | 1 | `ག ་` | 25,075 |
216
+ | 2 | `ད ་` | 18,381 |
217
+ | 3 | `་ ང` | 17,725 |
218
+ | 4 | `། _ །` | 15,647 |
219
+ | 5 | `་ ་` | 15,536 |
220
 
221
+ **4-grams (Subword):**
222
 
223
  | Rank | N-gram | Count |
224
  |------|--------|-------|
225
+ | 1 | `་ ་` | 17,384 |
226
+ | 2 | `་ འི ་` | 13,232 |
227
+ | 3 | `་ ་` | 12,579 |
228
+ | 4 | `་ དི ་` | 8,184 |
229
+ | 5 | `་་` | 6,539 |
230
 
231
+ **5-grams (Subword):**
232
 
233
  | Rank | N-gram | Count |
234
  |------|--------|-------|
235
+ | 1 | `་ ཡོ ་` | 5,559 |
236
+ | 2 | `་ _` | 4,930 |
237
+ | 3 | `་ _` | 4,145 |
238
+ | 4 | `་ ་` | 3,971 |
239
+ | 5 | `ས འི ་` | 3,925 |
240
 
241
 
242
  ### Key Findings
243
 
244
+ - **Best Perplexity:** 2-gram (subword) with 488
245
- **Entropy Trend:** Increases with larger n-grams (longer patterns are rarer, so the joint distribution is harder to predict)
246
+ - **Coverage:** Top-1000 patterns cover ~23% of corpus
247
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
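
Because the n-gram tables ship as parquet files, the coverage numbers above can be recomputed directly. A minimal sketch, assuming one row per n-gram with `ngram` and `count` columns (the actual schema is not documented here):

```python
# Minimal sketch: recompute top-k coverage from the subword 2-gram table.
# Column names ("ngram", "count") are assumptions about the parquet schema.
import pandas as pd

df = pd.read_parquet("models/subword_ngram/dz_2gram_subword.parquet")
df = df.sort_values("count", ascending=False)

total = df["count"].sum()
print("top-100 coverage:", df["count"].head(100).sum() / total)
print("top-1000 coverage:", df["count"].head(1000).sum() / total)

def ngram_prob(ngram: str) -> float:
    """Maximum-likelihood probability of a single n-gram under the counts."""
    row = df.loc[df["ngram"] == ngram, "count"]
    return float(row.iloc[0]) / total if len(row) else 0.0
```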
248
 
249
  ---
 
251
 
252
  ![Markov Entropy](visualizations/markov_entropy.png)
253
 
254
+ ![Markov Contexts](visualizations/markov_contexts.png)
255
+
256
  ![Markov Branching](visualizations/markov_branching.png)
257
 
258
  ### Results
259
 
260
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
261
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
262
+ | **1** | Word | 1.1820 | 2.269 | 14.12 | 12,061 | 0.0% |
263
+ | **1** | Subword | 0.8884 | 1.851 | 7.57 | 1,607 | 11.2% |
264
+ | **2** | Word | 0.5611 | 1.475 | 2.65 | 170,162 | 43.9% |
265
+ | **2** | Subword | 0.6433 | 1.562 | 5.02 | 12,152 | 35.7% |
266
+ | **3** | Word | 0.2267 | 1.170 | 1.41 | 449,950 | 77.3% |
267
+ | **3** | Subword | 0.5247 | 1.439 | 3.26 | 61,009 | 47.5% |
268
+ | **4** | Word | 0.0989 🏆 | 1.071 | 1.15 | 633,460 | 90.1% |
269
+ | **4** | Subword | 0.3500 | 1.275 | 2.11 | 199,035 | 65.0% |
270
+
271
+ ### Generated Text Samples (Word-based)
272
+
273
+ Below are text samples generated from each word-based Markov chain model:
274
+
275
+ **Context Size 1:**
276
+
277
+ 1. `དང སྲས ཚུ གིས ས ཐག མ སླེབས ཚེ འདི ལེགས སོ བཅོམ ཡིད གསུམ གྱི`
278
+ 2. `པ ཨིན པས དེ མ མི ཡུལ བྱིན ཅན ཁྱོད འདི གི ལཱ འབད ནི ཀ`
279
+ 3. `ལུ བདག སྐྱོང ལེགས སོ དཀརཔོ ཅིག གཅིག པུར ལུ ༡༡ ག གིས པདྨ རིགས མ`
280
+
281
+ **Context Size 2:**
282
+
283
+ 1. `ཡོདཔ ཨིན པས རྒྱབ རྟེན ༡ དྲག ཤོས ཀྱི གསོལ ར ༤ གློག འཕྲིན གྱི ཁྱབ བདག`
284
+ 2. `རྒྱལ ཁབ ཀྱི སྐུ རིམ དང པོ ནས བློ གྲོས བཟང མོ གིས ཨ ལུ འདི ཆ`
285
+ 3. `སྤྱི ལོ སྤྱི ཟླ ༤ པ ༡༡ པ ལས འཛིན ཟེར བཙུགས མི མཐོ ཚད ཀི ལོ`
286
 
287
+ **Context Size 3:**
288
+
289
+ 1. `རིན པོ ཆེ སངས རྒྱས ཀུན གྱི སྐུ འཆང བ སངས རྒྱས ཀུན གྱི གསུང ཡང ཡིན རྡོ`
290
+ 2. `ཡོདཔ ཨིན མས ཨོ རྒྱན ཆོས གླིང ལྷ ཁང འདི དུས རབས ༨ པའི ནང གུ རུ རིན`
291
+ 3. `རྒྱལ ཁབ ནང ལུ ཡང དམངས གཙོའི རིང ལུགས ཀྱི རྒྱལ པོའི བརྟན བཞུགས གི རྩ ཚིག གསར`
292
+
293
+ **Context Size 4:**
294
+
295
+ 1. `རྒྱལ ཁབ ནང ལུ དཔལ འབྱོར གྱི སྡེ ཚན ཅིག ཡང གཞི གཙུགས འབད དེ འདུག དེ ཡང སྔོན`
296
+ 2. `འབྲུག རྒྱལ ཁབ ནང ཡོད པའི རྒྱལ ཁབ ཅིག ཨིན དེ ཡང གྷི རེཊ བིརི ཊེན ཟེར མི འདི`
297
+ 3. `དཔལ ལྡན འབྲུག པའི གདུང བརྒྱུད ཅིག ཞུ ནིའི དོན ལུ ཚེས ཉེར དགུ ལུ བླ མ གུ རུ`
298
 
299
+
300
+ ### Generated Text Samples (Subword-based)
301
+
302
+ Below are text samples generated from each subword-based Markov chain model:
303
 
304
  **Context Size 1:**
305
 
306
+ 1. `་_ཚར་ཏེ་གིས་ཡོདཔོན་འ`
307
+ 2. `_རྫོང་སྟེངས་_ཞེང་བ་རུའི`
308
+ 3. `སལཔ་ལུ་ག་བཏུབཟོཔ་ཡིག`
309
 
310
  **Context Size 2:**
311
 
312
+ 1. `ས་ལུང་ཞིན་པ་འབྲུག་འབྲུག`
313
+ 2. `ང་ཁྲུང་ཁབ་སྦྲུལ་ཙ་ཝཊ་ཛ`
314
+ 3. `ན་ནང་བླ་མཆོད་ཆོས་དཔ་`
315
 
316
  **Context Size 3:**
317
 
318
+ 1. `གས་རིག་པའི་ནུས་པ་སྦེ་ཐོན`
319
+ 2. `དང་རའི་ཨཔ་ཟླཝ་ག་རང་འ`
320
+ 3. `་དང་ཕྱི་མས།_།ཉི་ཟླ་_༢༩`
321
 
322
  **Context Size 4:**
323
 
324
+ 1. `་དང་གཅིག་ནང་_ཡན་ལག་ཁ`
325
+ 2. `་པའི་བླ་མ་ཐུབ།_།དགེ་བ་སྟོ`
326
+ 3. `་ལས་_འབྱུང་ཁུངས།_།དགའ་`
327
 
328
 
329
  ### Key Findings
330
 
331
+ - **Best Predictability:** Context-4 (word) with 90.1% predictability
332
  - **Branching Factor:** Decreases with context size (more deterministic)
333
+ - **Memory Trade-off:** Larger contexts require more storage (199,035 contexts)
334
  - **Recommendation:** Context-3 or Context-4 for text generation
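
Samples like the ones above need nothing more than the transition table. A minimal generation sketch against the context-2 word model, assuming a `context` / `next` / `count` schema (not confirmed by the repository docs):

```python
# Minimal sketch: sample text from the context-2 word Markov chain.
# The column names ("context", "next", "count") are assumptions; adjust to
# the actual parquet schema.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/dz_markov_ctx2_word.parquet")

def step(context):
    """Sample the next word given a space-joined two-word context."""
    cand = df[df["context"] == context]
    if cand.empty:
        return None
    return random.choices(cand["next"].tolist(), weights=cand["count"].tolist())[0]

words = ["རྒྱལ", "ཁབ"]                 # seed taken from the report's top 2-grams
for _ in range(15):
    nxt = step(" ".join(words[-2:]))
    if nxt is None:
        break
    words.append(nxt)
print(" ".join(words))
```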
335
 
336
  ---
 
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
+ | Vocabulary Size | 6,761 |
350
+ | Total Tokens | 898,876 |
351
+ | Mean Frequency | 132.95 |
352
+ | Median Frequency | 6 |
353
+ | Frequency Std Dev | 709.47 |
354
 
355
  ### Most Common Words
356
 
357
  | Rank | Word | Frequency |
358
  |------|------|-----------|
359
+ | 1 | དང | 18,802 |
360
+ | 2 | | 17,903 |
361
+ | 3 | ལུ | 15,384 |
362
+ | 4 | པའི | 14,560 |
363
+ | 5 | ལས | 14,391 |
364
+ | 6 | མི | 11,348 |
365
+ | 7 | དེ | 11,091 |
366
+ | 8 | | 10,372 |
367
+ | 9 | གི | 10,307 |
368
+ | 10 | འདི | 9,382 |
369
 
370
  ### Least Common Words (from vocabulary)
371
 
372
  | Rank | Word | Frequency |
373
  |------|------|-----------|
374
+ | 1 | printer | 2 |
375
+ | 2 | fortress | 2 |
376
+ | 3 | gods | 2 |
377
+ | 4 | wordpress | 2 |
378
+ | 5 | phurdo | 2 |
379
+ | 6 | gonpa | 2 |
380
  | 7 | assam | 2 |
381
  | 8 | pelgen | 2 |
382
+ | 9 | anecdotes | 2 |
383
+ | 10 | kheng | 2 |
384
 
385
  ### Zipf's Law Analysis
386
 
387
  | Metric | Value |
388
  |--------|-------|
389
+ | Zipf Coefficient | 1.8277 |
390
+ | R² (Goodness of Fit) | 0.959592 |
391
  | Adherence Quality | **excellent** |
392
 
393
  ### Coverage Analysis
394
 
395
  | Top N Words | Coverage |
396
  |-------------|----------|
397
+ | Top 100 | 49.0% |
398
+ | Top 1,000 | 92.3% |
399
+ | Top 5,000 | 99.6% |
400
  | Top 10,000 | 0.0% |
401
 
402
  ### Key Findings
403
 
404
+ - **Zipf Compliance:** R²=0.9596 indicates excellent adherence to Zipf's law
405
+ - **High Frequency Dominance:** Top 100 words cover 49.0% of corpus
406
+ - **Long Tail:** the remaining ~1,761 vocabulary items beyond the top 5,000 account for only ~0.4% of corpus tokens
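
The Zipf fit can be reproduced from the vocabulary parquet with a log-log linear regression. A minimal sketch, assuming `word` and `frequency` columns (an assumption about the schema):

```python
# Minimal sketch: re-fit the Zipf coefficient and R² from the vocabulary table.
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/dz_vocabulary.parquet")
freqs = np.sort(vocab["frequency"].to_numpy())[::-1]   # descending frequencies
ranks = np.arange(1, len(freqs) + 1)

# Zipf's law: log(freq) ≈ c - s * log(rank); fit s by least squares.
slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), deg=1)
print(f"Zipf coefficient ≈ {-slope:.4f}")               # report above: 1.8277

pred = slope * np.log(ranks) + intercept
ss_res = np.sum((np.log(freqs) - pred) ** 2)
ss_tot = np.sum((np.log(freqs) - np.log(freqs).mean()) ** 2)
print(f"R² ≈ {1 - ss_res / ss_tot:.4f}")                # report above: 0.9596
```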
407
 
408
  ---
409
  ## 5. Word Embeddings Evaluation
 
416
 
417
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
418
 
 
419
 
420
+ ### 5.1 Cross-Lingual Alignment
421
+
422
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
423
+
424
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
425
+
426
+
427
+ ### 5.2 Model Comparison
428
+
429
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
430
+ |-------|-----------|----------|------------------|---------------|----------------|
431
+ | **mono_32d** | 32 | 0.6999 🏆 | 0.3567 | N/A | N/A |
432
+ | **mono_64d** | 64 | 0.4345 | 0.3403 | N/A | N/A |
433
+ | **mono_128d** | 128 | 0.1109 | 0.3305 | N/A | N/A |
434
+ | **aligned_32d** | 32 | 0.6999 | 0.3594 | 0.0547 | 0.2644 |
435
+ | **aligned_64d** | 64 | 0.4345 | 0.3388 | 0.1307 | 0.4103 |
436
+ | **aligned_128d** | 128 | 0.1109 | 0.3270 | 0.2340 | 0.4742 |
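
Isotropy here measures how evenly variance is spread across embedding directions. One common estimate is the ratio of the smallest to the largest principal-component variance; whether the pipeline uses exactly this definition is an assumption:

```python
# Sketch of one standard isotropy estimate: min/max principal-component variance.
import numpy as np

def isotropy(embeddings: np.ndarray) -> float:
    """embeddings: (vocab_size, dim) matrix of word vectors."""
    centered = embeddings - embeddings.mean(axis=0, keepdims=True)
    eigvals = np.linalg.eigvalsh(np.cov(centered, rowvar=False))
    return float(eigvals.min() / eigvals.max())   # 1.0 = perfectly isotropic
```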
437
 
438
  ### Key Findings
439
 
440
+ - **Best Isotropy:** mono_32d with 0.6999 (more uniform distribution)
441
+ - **Semantic Density:** Average pairwise similarity of 0.3421. Lower values indicate better semantic separation.
442
+ - **Alignment Quality:** Aligned models achieve up to 23.4% R@1 in cross-lingual retrieval.
443
+ - **Recommendation:** 128d aligned for best cross-lingual performance
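
The aligned variants ship an extra `*.projection.npy` per dimension. A plausible reading, consistent with the file sizes (65,664 bytes ≈ a 128×128 float32 array plus header), is that it holds a linear map from the monolingual space into the shared hub space. The fastText loading call and the multiplication order below are assumptions:

```python
# Minimal sketch: embed a Dzongkha word and project it into the shared space.
# Assumes the .bin files are fastText binaries (the repo is tagged "fasttext")
# and that the projection maps row vectors via `vec @ W`; both are unverified.
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/aligned/dz_128d.bin")
W = np.load("models/embeddings/aligned/dz_128d.projection.npy")   # (128, 128)

vec = model.get_word_vector("རྒྱལ་ཁབ")     # monolingual vector
aligned = vec @ W                           # mapped into the hub (English) space

def cosine(a, b):
    """Cosine similarity, as used for the R@1 / R@10 retrieval numbers."""
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
```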
444
+
445
+ ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **-0.621** | Low formulaic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
460
+
461
+ *No productive affixes detected.*
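
A rough sketch of the substitutability criterion described above; the function name, thresholds, and purely character-level treatment are illustrative choices, not the pipeline's actual implementation:

```python
# Illustrative sketch: a suffix is "productive" if stripping it often leaves a
# stem that is itself a vocabulary word. Thresholds are arbitrary examples.
from collections import Counter

def productive_suffixes(vocab, max_len=4, min_support=2):
    counts = Counter()
    for word in vocab:
        for k in range(1, max_len + 1):
            if len(word) > k and word[:-k] in vocab:   # remaining stem must exist
                counts[word[-k:]] += 1
    return [(sfx, n) for sfx, n in counts.most_common() if n >= min_support]

print(productive_suffixes({"walk", "walks", "walked", "talk", "talks"}))
```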
462
+
463
+
464
+ ### 6.3 Bound Stems (Lexical Roots)
465
+
466
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
467
+
468
+ *No significant bound stems detected.*
469
+
470
+
471
+ ### 6.4 Affix Compatibility (Co-occurrence)
472
+
473
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
474
+
475
+ *No significant affix co-occurrences detected.*
476
+
477
+
478
+ ### 6.5 Recursive Morpheme Segmentation
479
+
480
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
481
+
482
+ *Insufficient data for recursive segmentation.*
483
+
484
+
485
+ ### 6.6 Linguistic Interpretation
486
+
487
+ > **Automated Insight:**
488
+ The language Dzongkha shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
489
 
490
  ---
491
+ ## 7. Summary & Recommendations
492
 
493
  ![Performance Dashboard](visualizations/performance_dashboard.png)
494
 
 
496
 
497
  | Component | Recommended | Rationale |
498
  |-----------|-------------|-----------|
499
+ | Tokenizer | **64k BPE** | Best compression (5.51x) |
500
+ | N-gram | **2-gram** | Lowest perplexity (488) |
501
+ | Markov | **Context-4** | Highest predictability (90.1%) |
502
| Embeddings | **128d (aligned)** | Best cross-lingual performance; 32d has the best isotropy |
503
 
504
+
505
  ---
506
  ## Appendix: Metrics Glossary & Interpretation Guide
507
 
 
691
  author = {Kamali, Omar},
692
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
693
  year = {2025},
694
+ doi = {10.5281/zenodo.18073153},
695
+ publisher = {Zenodo},
696
  url = {https://huggingface.co/wikilangs}
697
  institution = {Omneity Labs}
698
  }
 
708
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
709
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
710
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
711
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
712
  ---
713
  *Generated by Wikilangs Models Pipeline*
714
 
715
+ *Report Date: 2026-01-04 03:00:40*
models/embeddings/aligned/dz_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dcfe07db51e338f29b11cc3948400bc263bdaa9ee20bff094b937df3a88eafe
3
+ size 1025712069
models/embeddings/aligned/dz_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "dz", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/dz_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae9d51cd6d1f20f33ad43e0ebc040e9c60e5efef1660c66c48e1753e7f29e7d1
3
+ size 65664
models/embeddings/aligned/dz_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
1
+ {
2
+ "language": "dz",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 329,
7
+ "vocab_size": 1602
8
+ }
models/embeddings/aligned/dz_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3933e9d9dffccc98f99b10e7f95f6656ef45b6fa711dc8726dc64e374a14aa6
3
+ size 256481733
models/embeddings/aligned/dz_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "dz", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/dz_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17308e828737ebd9ac6a3c706dae164e4bcc335a5173fecdc73408ccedf7da6c
3
+ size 4224
models/embeddings/aligned/dz_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
1
+ {
2
+ "language": "dz",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 329,
7
+ "vocab_size": 1602
8
+ }
models/embeddings/aligned/dz_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e82a789619b535b455c2b7f657a7f8bda7d16f597d1daedd1096e57ba64065
3
+ size 512891845
models/embeddings/aligned/dz_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "dz", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/dz_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55de110e901818649692cfc6a7e97343d7cd34e63a6fa97e21046355a71de3e0
3
+ size 16512
models/embeddings/aligned/dz_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
1
+ {
2
+ "language": "dz",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 329,
7
+ "vocab_size": 1602
8
+ }
models/embeddings/monolingual/dz_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d722d0680f671d8ae327caa3ee8603c53cb43740cfd380d68f04ef1fdc0564e9
3
- size 1026040985
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dcfe07db51e338f29b11cc3948400bc263bdaa9ee20bff094b937df3a88eafe
3
+ size 1025712069
models/embeddings/monolingual/dz_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1914
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 1602
15
  }
models/embeddings/monolingual/dz_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de6fde9cee2ca33805b66192484f75a384fc767a4e41c9e3e1349e5df4efbbe0
3
- size 256571033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3933e9d9dffccc98f99b10e7f95f6656ef45b6fa711dc8726dc64e374a14aa6
3
+ size 256481733
models/embeddings/monolingual/dz_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1914
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 1602
15
  }
models/embeddings/monolingual/dz_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3145c5cc9d22a1770f9210ca05d9737c7c503b50ffe23ea80a6eb9ceb9258651
3
- size 513061017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e82a789619b535b455c2b7f657a7f8bda7d16f597d1daedd1096e57ba64065
3
+ size 512891845
models/embeddings/monolingual/dz_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1914
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 1602
15
  }
models/subword_markov/dz_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4bb0e27f1c210e8aab111267e19d1d5e1d18721ec5e52e6b277feae0adde466
3
- size 60805
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3087bea20e8eb6b2ec3a5f0aa9584c7311e41fe5a4c01f823fa4834212620662
3
+ size 111860
models/subword_markov/dz_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_contexts": 727,
6
- "total_transitions": 4381675
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_contexts": 1607,
6
+ "total_transitions": 2899198
7
  }
models/subword_markov/dz_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3747acef076c0fe34becbcf80d7e30260918fc21406b3e37de57d81f61ce0562
3
- size 310779
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6292dd07a274165c86703180f2425201d1d9674e4ba2c33f61d9d5938c3467e1
3
+ size 489356
models/subword_markov/dz_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_contexts": 7001,
6
- "total_transitions": 4380562
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_contexts": 12152,
6
+ "total_transitions": 2898160
7
  }
models/subword_markov/dz_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95233f33b057a7c0b853eb01e55d5fea5a9a3405b48f24242e21c2ba56259a16
3
- size 1040905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:388847f6c74b750ae03e7f5efd4aaeacf73b7bd43240dd42b5f2aa721dbdc066
3
+ size 1647571
models/subword_markov/dz_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_contexts": 38506,
6
- "total_transitions": 4379449
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_contexts": 61009,
6
+ "total_transitions": 2897122
7
  }
models/subword_markov/dz_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2dca69cb48bb57a909fbca7cb5406a84c25f6c5f445ab67e8d62f592fc6ac5a2
3
- size 2665013
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed702bf6148ce9dc1a7ea84a35755fc292d0a1697d7d3752899d2b7ed2b2c5b
3
+ size 4311752
models/subword_markov/dz_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_contexts": 121621,
6
- "total_transitions": 4378336
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_contexts": 199035,
6
+ "total_transitions": 2896084
7
  }
models/subword_ngram/dz_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09d5275b8f276ba3da20aca7a059f240b755a5848b95578cd0e57576e8d2b173
3
- size 43022
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98106b25d70e32c0869643449157893becaaa5cd2d9c93a574d49972755b142f
3
+ size 77884
models/subword_ngram/dz_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_ngrams": 3288,
6
- "total_ngrams": 4381675
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_ngrams": 5527,
6
+ "total_ngrams": 2899198
7
  }
models/subword_ngram/dz_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67ecae4be4d51a96f235bd1b6cdecd6fa9034a1a947167dcebb6838e6424000f
3
- size 244220
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19e9ca58532c279df9875e0fc70fc0ca01a0dc4a6eeebb0d90cc9611de440cca
3
+ size 417712
models/subword_ngram/dz_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_ngrams": 19053,
6
- "total_ngrams": 4380562
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_ngrams": 28498,
6
+ "total_ngrams": 2898160
7
  }
models/subword_ngram/dz_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72af1322dc5108a5a783c65271a08d008eb8788ea196291554e7bb961046ce3d
3
- size 858312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc939eb8f6b49a4d85499e546bee0c3aa02484d5639731734c57e7e77ccc3406
3
+ size 1535380
models/subword_ngram/dz_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "dz",
5
- "unique_ngrams": 68338,
6
- "total_ngrams": 4379449
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "dz",
5
+ "unique_ngrams": 106273,
6
+ "total_ngrams": 2897122
7
  }
models/subword_ngram/dz_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89c0a9d344f772c56895fcfa4f96ca8389e2d1d65d242db9969fdf57da31004e
3
+ size 2925710
models/subword_ngram/dz_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "dz",
5
+ "unique_ngrams": 194726,
6
+ "total_ngrams": 2896084
7
+ }
models/tokenizer/dz_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e4b153b5c498e9676f1d0be1af974800ada3ef90b84b25fb029c769bb4f2c39
3
- size 723437
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f90c7e3cf5a68d178f10a0beb7ddb97feaa78af6989d232df8e51de111001a4
3
+ size 731782
models/tokenizer/dz_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/dz_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3535359e078e90bde0054e2b711f0dd81cef07816c72d21f67b1611058d511e4
3
- size 1262637
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea0364e6b2d160e778e60d5e1c73e3e9b08f8793e92690aafe427a87f8eb980a
3
+ size 1365323
models/tokenizer/dz_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/dz_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0639d3f1b896a46c0dddd000ad31618a562b86efd1e742d9ffaaee9eb96d8ca
3
- size 2493435
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:402c9983197ff52d444fc136259bb6b1f0cfb5b38449cd46aef1f96b7701f99e
3
+ size 2390465
models/tokenizer/dz_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/dz_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1b86d110a86dab6e1e1c6f0a86b9076034e30c13e2bccd44bf4df01c9c6ed44
3
- size 466064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1416416e068b7dd36e23224aed70c1131a475dc4b0464d18851552856625ace
3
+ size 450802
models/tokenizer/dz_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/dz_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62eb74b27face901c271eb032243b46b21201b90fd59225353cbe861179c11f6
3
- size 68375
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14e984f867b546d329e8a05a2bde8d13e79ac78e9afc51f2e97dc5f7abdcc723
3
+ size 113898
models/vocabulary/dz_vocabulary_metadata.json CHANGED
@@ -1,15 +1,17 @@
1
  {
2
  "language": "dz",
3
- "vocabulary_size": 4041,
 
4
  "statistics": {
5
- "type_token_ratio": 0.0058438083699328585,
6
  "coverage": {
7
- "top_100": 0.907915257374594,
8
- "top_1000": 0.9895912632305182,
9
- "top_5000": 0.9975216652363403
 
10
  },
11
- "hapax_count": 4641,
12
- "hapax_ratio": 0.5345542501727713,
13
- "total_documents": 1113
14
  }
15
  }
 
1
  {
2
  "language": "dz",
3
+ "vocabulary_size": 6761,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.013374135154743156,
7
  "coverage": {
8
+ "top_100": 0.4871301735883779,
9
+ "top_1000": 0.9174028542105356,
10
+ "top_5000": 0.9902080052377329,
11
+ "top_10000": 0.9976852671066834
12
  },
13
+ "hapax_count": 5332,
14
+ "hapax_ratio": 0.4409162325312164,
15
+ "total_documents": 1038
16
  }
17
  }
models/word_markov/dz_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61a9184a1b8ab3551fe0bc2b20b0017902678f748e4675c3e748aea5859a0d5c
3
- size 312387
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76e337706f6a7fb91b004453fee9ac14da9e139b1614cc7f2e3089bc6b4ea3fc
3
+ size 876930
models/word_markov/dz_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "dz",
5
- "unique_contexts": 8836,
6
- "total_transitions": 3510455
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "dz",
5
+ "unique_contexts": 12061,
6
+ "total_transitions": 903170
7
  }
models/word_markov/dz_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71526c2583406425808542d0de6b23cbb666ed4109f31c4941ffa339ebc9ddcd
3
- size 865992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62fa7abf255f3a8a538ee56225f2684bcea4c021a186d768c55d5ca85930ca27
3
+ size 4355103
models/word_markov/dz_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "dz",
5
- "unique_contexts": 32129,
6
- "total_transitions": 3509342
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "dz",
5
+ "unique_contexts": 170162,
6
+ "total_transitions": 902132
7
  }