omarkamali committed
Commit 5d95b76 · verified · Parent: d191af0

Upload all models and assets for dga (latest)

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.

Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +326 -132
  3. models/embeddings/aligned/dga_128d.bin +3 -0
  4. models/embeddings/aligned/dga_128d.meta.json +1 -0
  5. models/embeddings/aligned/dga_128d.projection.npy +3 -0
  6. models/embeddings/aligned/dga_128d_metadata.json +8 -0
  7. models/embeddings/aligned/dga_32d.bin +3 -0
  8. models/embeddings/aligned/dga_32d.meta.json +1 -0
  9. models/embeddings/aligned/dga_32d.projection.npy +3 -0
  10. models/embeddings/aligned/dga_32d_metadata.json +8 -0
  11. models/embeddings/aligned/dga_64d.bin +3 -0
  12. models/embeddings/aligned/dga_64d.meta.json +1 -0
  13. models/embeddings/aligned/dga_64d.projection.npy +3 -0
  14. models/embeddings/aligned/dga_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/dga_128d.bin +2 -2
  16. models/embeddings/monolingual/dga_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/dga_32d.bin +2 -2
  18. models/embeddings/monolingual/dga_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/dga_64d.bin +2 -2
  20. models/embeddings/monolingual/dga_64d_metadata.json +5 -3
  21. models/subword_markov/dga_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/dga_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/dga_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/dga_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/dga_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/dga_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/dga_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/dga_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/dga_2gram_subword.parquet +2 -2
  30. models/subword_ngram/dga_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/dga_3gram_subword.parquet +2 -2
  32. models/subword_ngram/dga_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/dga_4gram_subword.parquet +2 -2
  34. models/subword_ngram/dga_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/dga_5gram_subword.parquet +3 -0
  36. models/subword_ngram/dga_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/dga_tokenizer_16k.model +2 -2
  38. models/tokenizer/dga_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/dga_tokenizer_32k.model +2 -2
  40. models/tokenizer/dga_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/dga_tokenizer_64k.model +2 -2
  42. models/tokenizer/dga_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/dga_tokenizer_8k.model +2 -2
  44. models/tokenizer/dga_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/dga_vocabulary.parquet +2 -2
  46. models/vocabulary/dga_vocabulary_metadata.json +10 -9
  47. models/word_markov/dga_markov_ctx1_word.parquet +2 -2
  48. models/word_markov/dga_markov_ctx1_word_metadata.json +2 -2
  49. models/word_markov/dga_markov_ctx2_word.parquet +2 -2
  50. models/word_markov/dga_markov_ctx2_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 language: dga
-language_name: DGA
+language_name: Southern Dagaare
 language_family: atlantic_gur
 tags:
 - wikilangs
@@ -10,11 +10,21 @@ tags:
 - n-gram
 - markov
 - wikipedia
+- feature-extraction
+- sentence-similarity
+- tokenization
+- n-grams
+- markov-chain
+- text-mining
+- fasttext
+- babelvec
+- vocabulous
+- vocabulary
 - monolingual
 - family-atlantic_gur
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
+pipeline_tag: text-generation
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
 - name: best_compression_ratio
   type: compression
-  value: 3.554
+  value: 4.080
 - name: best_isotropy
   type: isotropy
-  value: 0.8544
+  value: 0.8588
 - name: vocabulary_size
   type: vocab
-  value: 40845
-generated: 2025-12-30
+  value: 0
+generated: 2026-01-04
 ---
 
-# DGA - Wikilangs Models
+# Southern Dagaare - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
 
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **DGA** Wikipedia data.
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Southern Dagaare** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 
 ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
+
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 
 ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 
@@ -68,53 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
+
 ### Results
 
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.196x | 3.16 | 0.0394% | 662,329 |
-| **16k** | 3.356x | 3.32 | 0.0414% | 630,808 |
-| **32k** | 3.472x | 3.43 | 0.0428% | 609,791 |
-| **64k** | 3.554x 🏆 | 3.51 | 0.0438% | 595,723 |
+| **8k** | 3.655x | 3.66 | 0.0592% | 426,016 |
+| **16k** | 3.850x | 3.85 | 0.0623% | 404,419 |
+| **32k** | 3.987x | 3.99 | 0.0645% | 390,549 |
+| **64k** | 4.080x 🏆 | 4.08 | 0.0660% | 381,660 |
 
 ### Tokenization Examples
 
 Below are sample sentences tokenized with each vocabulary size:
 
-**Sample 1:** `Tembilee kaŋa na be Africa, Ka oneŋ Ghana la laŋ dankyinne`
+**Sample 1:** `Lambussie e la tembile ane a Lambussie Karni desekyere teŋkpoŋ, desekyere naŋ be...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁tem bileekaŋanabeafrica ,kao neŋ ... (+6 more)` | 16 |
-| 16k | `▁tem bileekaŋanabeafrica ,kao neŋ ... (+6 more)` | 16 |
-| 32k | `▁tembileekaŋanabeafrica ,kao neŋghana ... (+4 more)` | 14 |
-| 64k | `▁tembileekaŋanabeafrica ,kaoneŋghanala ... (+2 more)` | 12 |
+| 8k | `▁lambussie ▁elatembileanea ▁lambussiekar ni desekyere ... (+19 more)` | 29 |
+| 16k | `▁lambussie ▁elatembileanea ▁lambussiekar ni desekyere ... (+19 more)` | 29 |
+| 32k | `▁lambussieelatembileane ▁alambussiekarni ▁desekyereteŋkpoŋ ... (+18 more)` | 28 |
+| 64k | `▁lambussieelatembileane ▁alambussiekarnidesekyereteŋkpoŋ ... (+18 more)` | 28 |
 
-**Sample 2:** `Zaguo e la tembiili kaŋa naŋ be Jirapa paaloŋ poɔ. Koɔbo ane done guoluu la ba y...`
+**Sample 2:** `Lugo e la dabaarãã ba naŋ maŋ ba wagre ŋa ba naŋ wa meɛrɛ dié, lugo maŋ taa la k...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁zag uo ▁e ▁la ▁tem bi i likaŋa ▁naŋ ... (+16 more)` | 26 |
-| 16k | `▁zag uo ▁e ▁la ▁tem bii likaŋa ▁naŋ ▁be ... (+14 more)` | 24 |
-| 32k | `▁zag uo ▁e ▁la ▁tem bii likaŋa ▁naŋ ▁be ... (+13 more)` | 23 |
-| 64k | `▁zaguo ▁e ▁la ▁tem bii li kaŋa ▁naŋ ▁bejirapa ... (+11 more)` | 21 |
-
-**Sample 3:** `Ullo e la yie bile kaŋ naŋ be Upper West Region.
-
-Ullo e la yiri naŋ taa noba k...`
+| 8k | `▁lu go ▁e ▁la ▁da baa r ããba ▁naŋ ... (+27 more)` | 37 |
+| 16k | `▁lu go ▁e ▁la ▁da baa rããba ▁naŋ ▁maŋ ... (+25 more)` | 35 |
+| 32k | `▁lugo ▁e ▁la ▁da baa rããba ▁naŋ ▁maŋ ▁ba ... (+21 more)` | 31 |
+| 64k | `▁lugo ▁e ▁la ▁dabaarããba ▁naŋ ▁maŋba ▁wagre ▁ŋa ... (+18 more)` | 28 |
+
+**Sample 3:** `Sheikh Osman Nuhu Sharubutu waa la a Ghana zaa Silaamabiiri wideɛrɛ. O dɔgebo be...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁ul loe ▁layiebile ▁kaŋ ▁naŋbeupper ... (+32 more)` | 42 |
-| 16k | `▁ul lo elayie ▁bilekaŋnaŋbeupper ... (+31 more)` | 41 |
-| 32k | `▁ulloelayie ▁bilekaŋnaŋbeupperwest ... (+27 more)` | 37 |
-| 64k | `▁ulloelayiebilekaŋnaŋbeupperwest ... (+26 more)` | 36 |
+| 8k | `▁she ikhos mannuhushar ubu tuwaala ... (+18 more)` | 28 |
+| 16k | `▁sheikhosmannuhushar ubu tu waalaaghana ... (+14 more)` | 24 |
+| 32k | `▁sheikhosmannuhushar ubutuwaalaaghanazaa ... (+13 more)` | 23 |
+| 64k | `▁sheikhosmannuhusharubutuwaalaaghanazaasilaamabiiri ... (+12 more)` | 22 |
 
 
 ### Key Findings
 
-- **Best Compression:** 64k achieves 3.554x compression
-- **Lowest UNK Rate:** 8k with 0.0394% unknown tokens
+- **Best Compression:** 64k achieves 4.080x compression
+- **Lowest UNK Rate:** 8k with 0.0592% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 
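The tokenization tables in the hunk above can be reproduced from the shipped `.model` files. A minimal sketch, assuming the files under `models/tokenizer/` are standard SentencePiece models (the `▁` word-boundary pieces in the samples suggest so):

```python
# Hedged sketch: load the 32k tokenizer and re-tokenize part of Sample 1.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/dga_tokenizer_32k.model")

text = "Lambussie e la tembile ane a Lambussie Karni desekyere teŋkpoŋ"
pieces = sp.encode(text, out_type=str)  # subword pieces, e.g. '▁lambussie', ...
ids = sp.encode(text, out_type=int)     # integer ids into the 32k vocabulary

print(pieces, len(ids))
# Compression here is characters per token, matching the "Compression" column.
print(f"{len(text) / len(ids):.3f}x")
```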
 
@@ -123,57 +139,111 @@ Ullo e la yiri naŋ taa noba k...`
 
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 
+![N-gram Unique](visualizations/ngram_unique.png)
+
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 
 ### Results
 
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 5,975 🏆 | 12.54 | 38,594 | 27.3% | 51.9% |
-| **2-gram** | 414 🏆 | 8.69 | 4,448 | 55.6% | 97.7% |
-| **3-gram** | 13,170 | 13.68 | 71,585 | 21.4% | 41.1% |
-| **3-gram** | 3,526 | 11.78 | 38,114 | 23.3% | 63.1% |
-| **4-gram** | 28,677 | 14.81 | 138,854 | 18.3% | 33.1% |
-| **4-gram** | 17,399 | 14.09 | 189,158 | 12.7% | 38.7% |
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 5,247 | 12.36 | 23,604 | 25.2% | 52.5% |
+| **2-gram** | Subword | 261 🏆 | 8.03 | 3,102 | 67.0% | 99.0% |
+| **3-gram** | Word | 15,091 | 13.88 | 40,759 | 12.7% | 34.8% |
+| **3-gram** | Subword | 2,130 | 11.06 | 23,753 | 29.7% | 72.3% |
+| **4-gram** | Word | 37,462 | 15.19 | 77,183 | 7.6% | 22.9% |
+| **4-gram** | Subword | 10,952 | 13.42 | 113,607 | 15.0% | 44.1% |
+| **5-gram** | Word | 33,178 | 15.02 | 59,664 | 7.6% | 22.1% |
+| **5-gram** | Subword | 34,072 | 15.06 | 261,669 | 9.2% | 29.6% |
 
 ### Top 5 N-grams by Size
 
-**2-grams:**
+**2-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `: /` | 18,314 |
-| 2 | `/ /` | 18,305 |
-| 3 | `https :` | 11,430 |
-| 4 | `gbuli :` | 11,117 |
-| 5 | `. com` | 9,398 |
+| 1 | `la a` | 8,318 |
+| 2 | `e la` | 8,255 |
+| 3 | `ka o` | 5,097 |
+| 4 | `naŋ be` | 4,526 |
+| 5 | `o da` | 4,441 |
 
-**3-grams:**
+**3-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `: / /` | 18,287 |
-| 2 | `https : /` | 11,430 |
-| 3 | `. com /` | 8,145 |
-| 4 | `/ www .` | 6,911 |
-| 5 | `/ / www` | 6,909 |
+| 1 | `naŋ be a` | 2,381 |
+| 2 | `e la a` | 1,581 |
+| 3 | `o e la` | 1,352 |
+| 4 | `da e la` | 1,226 |
+| 5 | `sommo yizie zaa` | 1,176 |
 
-**4-grams:**
+**4-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `https : / /` | 11,430 |
-| 2 | `: / / www` | 6,908 |
-| 3 | `/ / www .` | 6,907 |
-| 4 | `. https : /` | 6,433 |
-| 5 | `archive . org /` | 4,005 |
+| 1 | `sommo yizie zaa africa` | 1,004 |
+| 2 | `o da e la` | 534 |
+| 3 | `of the 4th republic` | 440 |
+| 4 | `4th republic of ghana` | 439 |
+| 5 | `parliament of the 4th` | 439 |
+
+**5-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `parliament of the 4th republic` | 438 |
+| 2 | `the 4th republic of ghana` | 434 |
+| 3 | `of the 4th republic of` | 434 |
+| 4 | `4th republic of ghana zaa` | 348 |
+| 5 | `republic of ghana zaa africa` | 341 |
+
+**2-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `a _` | 295,811 |
+| 2 | `e _` | 179,385 |
+| 3 | `_ a` | 141,327 |
+| 4 | `_ n` | 88,607 |
+| 5 | `a n` | 84,293 |
+
+**3-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ a _` | 75,684 |
+| 2 | `_ l a` | 47,173 |
+| 3 | `l a _` | 44,354 |
+| 4 | `_ n a` | 42,956 |
+| 5 | `a ŋ _` | 41,497 |
+
+**4-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ l a _` | 40,459 |
+| 2 | `n a ŋ _` | 25,681 |
+| 3 | `_ n a ŋ` | 24,499 |
+| 4 | `_ d a _` | 21,223 |
+| 5 | `_ k a _` | 20,083 |
+
+**5-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ n a ŋ _` | 24,351 |
+| 2 | `e _ l a _` | 16,215 |
+| 3 | `_ a n e _` | 12,136 |
+| 4 | `g h a n a` | 10,185 |
+| 5 | `_ g h a n` | 9,611 |
 
 
 ### Key Findings
 
-- **Best Perplexity:** 2-gram with 414
+- **Best Perplexity:** 2-gram (subword) with 261
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~39% of corpus
+- **Coverage:** Top-1000 patterns cover ~30% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 
 ---
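The coverage columns above can be recomputed from the shipped parquet counts. A hedged sketch; the `count` column name is an assumption, not documented in this commit:

```python
# Sketch: top-N coverage of the subword bigram table (assumed schema: ngram, count).
import pandas as pd

df = pd.read_parquet("models/subword_ngram/dga_2gram_subword.parquet")
counts = df["count"].sort_values(ascending=False)  # "count" is an assumed column

total = counts.sum()
for n in (100, 1000):
    print(f"top-{n} coverage: {counts.head(n).sum() / total:.1%}")
```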
 
@@ -181,55 +251,86 @@ Ullo e la yiri naŋ taa noba k...`
 
 ![Markov Entropy](visualizations/markov_entropy.png)
 
+![Markov Contexts](visualizations/markov_contexts.png)
+
 ![Markov Branching](visualizations/markov_branching.png)
 
 ### Results
 
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.5605 | 1.475 | 4.56 | 110,912 | 44.0% |
-| **1** | 1.0147 | 2.021 | 8.63 | 1,148 | 0.0% |
-| **2** | 0.3023 | 1.233 | 1.85 | 505,122 | 69.8% |
-| **2** | 1.1781 | 2.263 | 7.58 | 9,899 | 0.0% |
-| **3** | 0.1398 | 1.102 | 1.30 | 935,548 | 86.0% |
-| **3** | 0.9413 | 1.920 | 4.43 | 75,047 | 5.9% |
-| **4** | 0.0660 🏆 | 1.047 | 1.12 | 1,213,249 | 93.4% |
-| **4** | 0.6668 🏆 | 1.588 | 2.67 | 332,542 | 33.3% |
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.7877 | 1.726 | 5.30 | 76,185 | 21.2% |
+| **1** | Subword | 0.9123 | 1.882 | 7.16 | 1,143 | 8.8% |
+| **2** | Word | 0.2777 | 1.212 | 1.73 | 402,963 | 72.2% |
+| **2** | Subword | 0.9272 | 1.902 | 5.67 | 8,182 | 7.3% |
+| **3** | Word | 0.1241 | 1.090 | 1.24 | 697,461 | 87.6% |
+| **3** | Subword | 0.8532 | 1.807 | 4.16 | 46,384 | 14.7% |
+| **4** | Word | 0.0565 🏆 | 1.040 | 1.09 | 865,473 | 94.4% |
+| **4** | Subword | 0.6504 | 1.570 | 2.74 | 193,130 | 35.0% |
 
-### Generated Text Samples
+### Generated Text Samples (Word-based)
 
-Below are text samples generated from each Markov chain model:
+Below are text samples generated from each word-based Markov chain model:
 
 **Context Size 1:**
 
-1. `. https : / en . co . archive . https : / o da ba`
-2. `/ www . http : / www . g . com / / mps / mps`
-3. `a yi 1969 2007 entɛnɛte zagekpoŋpaatiare - nkrumah aboahnational democratic congress2016 - christi...`
+1. `a dudu taa la taa la martha hyer spainunited states be la rev r b enfuomo`
+2. `la 27 june how lucky philip dube akon kanye kanye west african universities african cinema hosts`
+3. `o teŋkpoŋ geogarapi a 21 december ane o ba meŋ da bonwuoraa dagaaba naŋ be`
 
 **Context Size 2:**
 
-1. `: / / doi / 10 . 1independentbawa mamshie ali4 , 13825 . 7 ( 4 )`
-2. `/ / www . premiumtimesng . com / books ? id = 100267 baba da paale la`
-3. `https : / / northpad . ng / entertainment / movies / emelia - brobbey - abeiku`
+1. `la a ghana ports ane coastal eŋgyinia poɔ o da nyeε gyerema aŋa pɔge ko a paalikaara`
+2. `e la desekyere ayi eŋɛ twifo atti morkwa desekyere a o south sɛŋ ne fumesua a o`
+3. `ka o fãã a kyɛ a na toɔ di a kogi ne 14 391 vootuu ka lɛ`
 
 **Context Size 3:**
 
-1. `: / / www . bellanaija . com / pages / 2020 / 06 / c_137803189 . htm`
-2. `https : / / en . wikipedia . org / web / 20230324002112 / https : / /`
-3. `. com / books ? id = 97 gɔɔloŋ asibiti gɔɔloŋ e la sankrite yelbie poɔ te seŋ`
+1. `naŋ be a gaana paaloo mr hackman owusu agyeman la a diplomats mine naŋ baare knust aliu mahama`
+2. `e la a is bolgatanga munisipal naŋ taa tensɔgɔ yɛlloŋ naŋ na baŋ pare pie ne anuu te`
+3. `o e la neŋkpoŋ naŋ kaara a naasaala mine nimikpɛ kyaare ne a silla ane goryeo saŋa naŋ`
 
 **Context Size 4:**
 
-1. `https : / / www . ghanaweb . com / ghanahomepage / sportsarchive / i - have - built`
-2. `: / / www . modernghana . com / news / 1016574 / voter - register - hajia -`
-3. `/ / www . birimnorth . ghanadistricts . gov . gh / index . php ? option = com_content`
+1. `o da e la business development officer of fonak technologies ltd and chief executive officer of the ...`
+2. `of the 4th republic of ghana zaa africa parliament of the 4th republic of ghana zaa africa parliamen...`
+3. `parliament of the 4th republic of ghana zaa africa parliament of the 4th republic of ghana zaa afric...`
+
+
+### Generated Text Samples (Subword-based)
+
+Below are text samples generated from each subword-based Markov chain model:
+
+**Context Size 1:**
+
+1. `_erwiamarles_a_l`
+2. `anest,_nelica_a_`
+3. `e_pra,_ssi_aaɡba`
+
+**Context Size 2:**
+
+1. `a_nund_te_8_me._o`
+2. `e_a_e_a_tho/_ⓘ_in`
+3. `_a_garebɔloolijew`
+
+**Context Size 3:**
+
+1. `_a_baŋ_bebiri_daga`
+2. `_la_bare_poɔ._a_yu`
+3. `la_kology._oble_ma`
+
+**Context Size 4:**
+
+1. `_la_doŋ_kaa_naŋ_naŋ`
+2. `naŋ_be_a_kaŋa_naŋ_b`
+3. `_naŋ_be_tigiri_a_de`
 
 
 ### Key Findings
 
-- **Best Predictability:** Context-4 with 93.4% predictability
+- **Best Predictability:** Context-4 (word) with 94.4% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (332,542 contexts)
+- **Memory Trade-off:** Larger contexts require more storage (193,130 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 
 ---
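The generated samples above come from weighted sampling over the stored transitions. A toy sketch of the same idea, assuming (hypothetically) that each parquet row holds a `context` string, a `next` token, and a `count`:

```python
# Sketch: stochastic generation from the context-2 word Markov table.
import random

import pandas as pd

df = pd.read_parquet("models/word_markov/dga_markov_ctx2_word.parquet")

def sample_next(context: str) -> str | None:
    """Draw the next word with probability proportional to its count."""
    rows = df[df["context"] == context]  # column names are assumed
    if rows.empty:
        return None
    return random.choices(list(rows["next"]), weights=list(rows["count"]))[0]

words = ["e", "la"]  # seed with a frequent bigram from the tables above
for _ in range(15):
    nxt = sample_next(" ".join(words[-2:]))
    if nxt is None:
        break
    words.append(nxt)
print(" ".join(words))
```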
 
@@ -245,26 +346,26 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 40,845 |
-| Total Tokens | 1,334,061 |
-| Mean Frequency | 32.66 |
-| Median Frequency | 3 |
-| Frequency Std Dev | 571.47 |
+| Vocabulary Size | 33,219 |
+| Total Tokens | 1,069,636 |
+| Mean Frequency | 32.20 |
+| Median Frequency | 4 |
+| Frequency Std Dev | 610.71 |
 
 ### Most Common Words
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | a | 77,980 |
-| 2 | la | 41,636 |
-| 3 | o | 29,377 |
-| 4 | naŋ | 24,628 |
-| 5 | da | 21,300 |
-| 6 | ka | 20,399 |
-| 7 | ba | 17,358 |
-| 8 | e | 16,509 |
-| 9 | poɔ | 14,690 |
-| 10 | ane | 12,209 |
+| 1 | a | 77,430 |
+| 2 | la | 41,562 |
+| 3 | o | 29,242 |
+| 4 | naŋ | 24,554 |
+| 5 | da | 21,295 |
+| 6 | ka | 20,388 |
+| 7 | ba | 17,329 |
+| 8 | e | 16,396 |
+| 9 | poɔ | 14,743 |
+| 10 | ane | 12,198 |
 
 ### Least Common Words (from vocabulary)
 
@@ -277,32 +378,32 @@ Below are text samples generated from each Markov chain model:
 | 5 | jaʋ | 2 |
 | 6 | daahe | 2 |
 | 7 | tigrihi | 2 |
-| 8 | dglw | 2 |
-| 9 | pileehi | 2 |
+| 8 | pileehi | 2 |
+| 9 | revive | 2 |
 | 10 | ekewaolu | 2 |
 
 ### Zipf's Law Analysis
 
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.1521 |
-| R² (Goodness of Fit) | 0.997705 |
+| Zipf Coefficient | 1.1395 |
+| R² (Goodness of Fit) | 0.997636 |
 | Adherence Quality | **excellent** |
 
 ### Coverage Analysis
 
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 47.4% |
-| Top 1,000 | 73.4% |
-| Top 5,000 | 87.5% |
-| Top 10,000 | 92.3% |
+| Top 100 | 51.3% |
+| Top 1,000 | 75.0% |
+| Top 5,000 | 88.7% |
+| Top 10,000 | 93.4% |
 
 ### Key Findings
 
-- **Zipf Compliance:** R²=0.9977 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 47.4% of corpus
-- **Long Tail:** 30,845 words needed for remaining 7.7% coverage
+- **Zipf Compliance:** R²=0.9976 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 51.3% of corpus
+- **Long Tail:** 23,219 words needed for remaining 6.6% coverage
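The Zipf coefficient and R² above are a least-squares fit of log-frequency against log-rank. A self-contained sketch using just the top-10 frequencies from the Most Common Words table (the report's fit presumably runs over the full vocabulary):

```python
# Zipf's law: frequency ≈ C / rank^s, so log f = log C - s · log r.
import numpy as np

freqs = np.array([77430, 41562, 29242, 24554, 21295,
                  20388, 17329, 16396, 14743, 12198], dtype=float)
ranks = np.arange(1, len(freqs) + 1, dtype=float)

slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), 1)
residual = np.log(freqs) - (slope * np.log(ranks) + intercept)
r2 = 1 - (residual @ residual) / np.sum((np.log(freqs) - np.log(freqs).mean()) ** 2)
print(f"zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.4f}")
```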
 
 ---
 ## 5. Word Embeddings Evaluation
 
@@ -315,24 +416,114 @@ Below are text samples generated from each Markov chain model:
 
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
 
-### Model Comparison
+
+### 5.1 Cross-Lingual Alignment
+
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+
+
+### 5.2 Model Comparison
 
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 15,785 | 32 | 3.653 | 0.840 | 0.8544 🏆 |
-| **mono_64d** | 15,785 | 64 | 4.073 | 0.797 | 0.7925 |
-| **mono_128d** | 15,785 | 128 | 4.340 | 0.752 | 0.5386 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.8588 🏆 | 0.3392 | N/A | N/A |
+| **mono_64d** | 64 | 0.7947 | 0.2830 | N/A | N/A |
+| **mono_128d** | 128 | 0.5119 | 0.2439 | N/A | N/A |
+| **aligned_32d** | 32 | 0.8588 | 0.3417 | 0.0440 | 0.3200 |
+| **aligned_64d** | 64 | 0.7947 | 0.2829 | 0.1180 | 0.4400 |
+| **aligned_128d** | 128 | 0.5119 | 0.2497 | 0.2020 | 0.5080 |
 
 ### Key Findings
 
-- **Best Isotropy:** mono_32d with 0.8544 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 15,785 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
+- **Best Isotropy:** mono_32d with 0.8588 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.2901. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 20.2% R@1 in cross-lingual retrieval.
+- **Recommendation:** 128d aligned for best cross-lingual performance
+
+ ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **-0.253** | Low formulaic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
460
+
461
+ #### Productive Prefixes
462
+ | Prefix | Examples |
463
+ |--------|----------|
464
+
465
+ #### Productive Suffixes
466
+ | Suffix | Examples |
467
+ |--------|----------|
468
+ | `-e` | service, kpeɛmine, dɔre |
469
+ | `-re` | dɔre, core, sefaare |
470
+ | `-ng` | providing, serving, keeling |
471
+
472
+ ### 6.3 Bound Stems (Lexical Roots)
473
+
474
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
475
+
476
+ | Stem | Cohesion | Substitutability | Examples |
477
+ |------|----------|------------------|----------|
478
+ | `aare` | 1.84x | 72 contexts | zaare, daare, gaare |
479
+ | `igyi` | 2.52x | 15 contexts | rigyiŋ, firigyi, irigyiŋ |
480
+ | `aalo` | 1.78x | 43 contexts | gaalo, maalo, saalo |
481
+ | `atio` | 2.19x | 20 contexts | matio, nation, station |
482
+ | `eɛre` | 1.75x | 39 contexts | jeɛre, weɛre, neɛre |
483
+ | `paal` | 1.62x | 50 contexts | paali, paale, paalo |
484
+ | `tion` | 1.99x | 22 contexts | motion, nation, action |
485
+ | `aale` | 1.53x | 47 contexts | laale, waale, paale |
486
+ | `aloŋ` | 2.09x | 16 contexts | baloŋ, zaloŋ, yaloŋ |
487
+ | `yaar` | 1.73x | 28 contexts | yaari, yaaro, yaara |
488
+ | `rigy` | 2.15x | 14 contexts | rigyiŋ, firigyi, irigyiŋ |
489
+ | `irig` | 2.40x | 9 contexts | irigiŋ, irigin, firigyi |
490
+
491
+ ### 6.4 Affix Compatibility (Co-occurrence)
492
+
493
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
494
+
495
+ *No significant affix co-occurrences detected.*
496
+
497
+
498
+ ### 6.5 Recursive Morpheme Segmentation
499
+
500
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
501
+
502
+ | Word | Suggested Split | Confidence | Stem |
503
+ |------|-----------------|------------|------|
504
+ | pɔgesarre | **`pɔgesar-re`** | 4.5 | `pɔgesar` |
505
+ | lomboring | **`lombori-ng`** | 4.5 | `lombori` |
506
+ | counselling | **`counselli-ng`** | 1.5 | `counselli` |
507
+ | processing | **`processi-ng`** | 1.5 | `processi` |
508
+ | containing | **`containi-ng`** | 1.5 | `containi` |
509
+ | sasefaare | **`sasefaa-re`** | 1.5 | `sasefaa` |
510
+ | schoolboarding | **`schoolboardi-ng`** | 1.5 | `schoolboardi` |
511
+ | parodying | **`parodyi-ng`** | 1.5 | `parodyi` |
512
+ | transforming | **`transformi-ng`** | 1.5 | `transformi` |
513
+ | derbyshire | **`derbyshi-re`** | 1.5 | `derbyshi` |
514
+ | dankwasere | **`dankwase-re`** | 1.5 | `dankwase` |
515
+ | bonyɔgere | **`bonyɔge-re`** | 1.5 | `bonyɔge` |
516
+ | sɛgebikparre | **`sɛgebikpar-re`** | 1.5 | `sɛgebikpar` |
517
+ | nimbitɔɔre | **`nimbitɔɔ-re`** | 1.5 | `nimbitɔɔ` |
518
+ | chongqing | **`chongqi-ng`** | 1.5 | `chongqi` |
519
+
520
+ ### 6.6 Linguistic Interpretation
521
+
522
+ > **Automated Insight:**
523
+ The language Southern Dagaare shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
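The substitutability criterion behind 6.2-6.5 can be illustrated in a few lines: a split is kept when the candidate stem recurs with other endings. A toy sketch with illustrative thresholds (the report's cohesion scoring and recursion are more involved):

```python
# Toy substitutability: a stem is 'productive' if it appears in >= 2 words.
from collections import Counter

vocab = ["paali", "paale", "paalo", "gaalo", "maalo", "saalo", "zaare", "daare"]

stem_counts = Counter()
for w in vocab:
    for i in range(3, len(w)):          # proper prefixes of length >= 3
        stem_counts[w[:i]] += 1

for w in vocab:
    splits = [i for i in range(3, len(w)) if stem_counts[w[:i]] >= 2]
    if splits:
        i = max(splits)                  # prefer the longest recurring stem
        print(f"{w} -> {w[:i]}-{w[i:]}")
```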
 
 ---
-## 6. Summary & Recommendations
+## 7. Summary & Recommendations
 
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 
@@ -340,11 +531,12 @@ Below are text samples generated from each Markov chain model:
 
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (3.55x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (414) |
-| Markov | **Context-4** | Highest predictability (93.4%) |
+| Tokenizer | **64k BPE** | Best compression (4.08x) |
+| N-gram | **2-gram** | Lowest perplexity (261) |
+| Markov | **Context-4** | Highest predictability (94.4%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
+
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
 
@@ -534,7 +726,8 @@ If you use these models in your research, please cite:
 author = {Kamali, Omar},
 title = {Wikilangs: Open NLP Models for Wikipedia Languages},
 year = {2025},
-publisher = {HuggingFace},
+doi = {10.5281/zenodo.18073153},
+publisher = {Zenodo},
 url = {https://huggingface.co/wikilangs}
 institution = {Omneity Labs}
 }
@@ -550,7 +743,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
 
-*Report Date: 2025-12-30 08:23:36*
+*Report Date: 2026-01-04 02:08:16*
models/embeddings/aligned/dga_128d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6b3631e0fff8545f469b01e2d1bf05841ab0a5045a798fc48069ca41d6ddbb5
+size 1039825434
models/embeddings/aligned/dga_128d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "dga", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/dga_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:50f4789b7295eb1824c65a5fc955a71d61ee81554fdd3b77dd40c4df7a61ccd0
+size 65664
models/embeddings/aligned/dga_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "dga",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 7985,
+  "vocab_size": 15210
+}
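The `.bin` entries in this commit are Git LFS pointers (spec, oid, size), not the weights themselves. A sketch of fetching the actual payload with `huggingface_hub`; the repository id is assumed, not stated in this diff:

```python
# Sketch: resolve an LFS-backed file to a local path via the HF Hub cache.
from huggingface_hub import hf_hub_download

local_path = hf_hub_download(
    repo_id="wikilangs/dga",  # assumed repository id
    filename="models/embeddings/aligned/dga_32d.bin",
)
print(local_path)  # the pointer's sha256 oid identifies the downloaded payload
```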
models/embeddings/aligned/dga_32d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:101c1d2e7f82529eaf1f624fc1f9a8869242fb730c465d01982e7b740f585935
+size 260144154
models/embeddings/aligned/dga_32d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "dga", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/dga_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fd3094746af59172715cb6615ae22d62cc55fe723a0d16231abd61b92c73bb80
+size 4224
models/embeddings/aligned/dga_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "dga",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 7985,
+  "vocab_size": 15210
+}
models/embeddings/aligned/dga_64d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29f3396a864b1787c765f6c869438d427adba6e4948ae28a99867bbd4d90e2fb
+size 520037914
models/embeddings/aligned/dga_64d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "dga", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/dga_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cea0c0561c8e8d885a8feafb46ff0dedc92937ebd78baa60de23058af684584f
+size 16512
models/embeddings/aligned/dga_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "dga",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 7985,
+  "vocab_size": 15210
+}
models/embeddings/monolingual/dga_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7509b16c6a1db795616de5a687579f0267cd55400806146ebbfe23b2a6d8f297
-size 1040426603
+oid sha256:b6b3631e0fff8545f469b01e2d1bf05841ab0a5045a798fc48069ca41d6ddbb5
+size 1039825434
models/embeddings/monolingual/dga_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
-  "vocab_size": 15785
+  "vocab_size": 15210
 }
models/embeddings/monolingual/dga_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:862e8483779b7224adc263c4c32f80bb8f0dc89346b4a8cfdd45d099f0430193
-size 260303723
+oid sha256:101c1d2e7f82529eaf1f624fc1f9a8869242fb730c465d01982e7b740f585935
+size 260144154
models/embeddings/monolingual/dga_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
-  "vocab_size": 15785
+  "vocab_size": 15210
 }
models/embeddings/monolingual/dga_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:faaff3cdeef7a76b81e7498d5eb82465a34ce7a38b590ccb1a5445bffbd06514
-size 520344683
+oid sha256:29f3396a864b1787c765f6c869438d427adba6e4948ae28a99867bbd4d90e2fb
+size 520037914
models/embeddings/monolingual/dga_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
  },
-  "vocab_size": 15785
+  "vocab_size": 15210
 }
models/subword_markov/dga_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:07c24bc03d2999772af4c9745355efe203472de45ad5f349aa0cc8cd6156fb1a
-size 81060
+oid sha256:a7e8de70a919223af91ca351c9ed325b8e2c0ee422d41a69e6f1bef4ded50698
+size 67833
models/subword_markov/dga_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "dga",
-  "unique_contexts": 1148,
-  "total_transitions": 7959013
+  "unique_contexts": 1143,
+  "total_transitions": 5878828
 }
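Across the four subword chains updated below, unique contexts grow sharply while total transitions stay flat, which is the memory/determinism trade-off the README's Markov section reports. A small sketch over these metadata files:

```python
# Sketch: contexts vs. transitions for the subword Markov chains (ctx 1-4).
import json

for ctx in (1, 2, 3, 4):
    path = f"models/subword_markov/dga_markov_ctx{ctx}_subword_metadata.json"
    with open(path) as f:
        meta = json.load(f)
    per_ctx = meta["total_transitions"] / meta["unique_contexts"]
    print(f"ctx={ctx}: {meta['unique_contexts']:>7,} contexts, "
          f"{per_ctx:>8,.1f} transitions/context")
```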
models/subword_markov/dga_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bf941de32e0df793abaef699e301d13eb4023e24e890e01414ca3ebda3697799
-size 572662
+oid sha256:3599bce6aa7738f80109f4d74bf545ab75787a2610ffae7b7763601b36ebee98
+size 381039
models/subword_markov/dga_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "dga",
-  "unique_contexts": 9899,
-  "total_transitions": 7955160
+  "unique_contexts": 8182,
+  "total_transitions": 5875008
 }
models/subword_markov/dga_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d5bd7c429d0f0635b00f49c078b1e4fce9dd6c30ff4a614d27a0532abae6563f
-size 2220058
+oid sha256:5026e56875be843fc1fbfe1a36536b26cb2c9c9cdfb98e4fa07636aaa6914e2e
+size 1490586
models/subword_markov/dga_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "dga",
-  "unique_contexts": 75047,
-  "total_transitions": 7951307
+  "unique_contexts": 46384,
+  "total_transitions": 5871188
 }
models/subword_markov/dga_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e574d35b50270c25b7bd9ed81e900819439048790abe395c7af08ce0b3b472d1
-size 6646209
+oid sha256:01808c29d38deae3defece7b7d6b0efeac4a0c57fe5d5b1ca86a2beeeffbe8ce
+size 4121587
models/subword_markov/dga_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "dga",
-  "unique_contexts": 332542,
-  "total_transitions": 7947454
+  "unique_contexts": 193130,
+  "total_transitions": 5867368
 }
models/subword_ngram/dga_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7467ad1bbdee47d2e747687cb934192f0f44c6afbe55e161f910c81d2bc20743
-size 58133
+oid sha256:20a83d1edcf4a2ddadc72fb861c65adeed34b4d63a0d4609cca706b147f4f503
+size 41340
models/subword_ngram/dga_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "dga",
-  "unique_ngrams": 4448,
-  "total_ngrams": 7959013
+  "unique_ngrams": 3102,
+  "total_ngrams": 5878828
 }
models/subword_ngram/dga_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:84fcd4fb5b56554f194dd0224de34e80518c4d52e7dba5e4c9b5dad0ded264bd
-size 461841
+oid sha256:5511afecfbb618e55a0e6dff14ea074b950aef7de5b692c6e8dfcccaf518e4df
+size 288911
models/subword_ngram/dga_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "dga",
-  "unique_ngrams": 38114,
-  "total_ngrams": 7955160
+  "unique_ngrams": 23753,
+  "total_ngrams": 5875008
 }
models/subword_ngram/dga_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2019d612b5a5c51fe72836d6abbd1c862ce2913fa72bb070212fada37b7ad396
-size 2081564
+oid sha256:cd207924ad307a120e4f9d7355057e3e6066c566c73b67fe1f5b9f8ef7c24647
+size 1287525
models/subword_ngram/dga_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "dga",
-  "unique_ngrams": 189158,
-  "total_ngrams": 7951307
+  "unique_ngrams": 113607,
+  "total_ngrams": 5871188
 }
models/subword_ngram/dga_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:888e7f1e2521411b8a80a6c6daf7570775e9c0ca432daa7cd95d56f130243a84
+size 3003073
models/subword_ngram/dga_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+{
+  "n": 5,
+  "variant": "subword",
+  "language": "dga",
+  "unique_ngrams": 261669,
+  "total_ngrams": 5867368
+}
models/tokenizer/dga_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:04292c72d179ce67422f703caa41e18226a29d592998052882ca08730b90939c
-size 498726
+oid sha256:3a34e8b91d3051420bfb8c4ba27722b99d278a4978aef5762486618f61b6c9cb
+size 504858
models/tokenizer/dga_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/dga_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c6adbb6a2b032fa7b557424a5edd13dc3704143f7d186901b32123bcf787d26e
-size 762316
+oid sha256:826110fdb06bda52c24e93da98774bf4100a3debe58994bc1c86d2a34ed118db
+size 770121
models/tokenizer/dga_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/dga_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:695c7b0dd869dc5f9126a73cf711decd919d032aa8892d7438104ceea5a7c753
-size 1323388
+oid sha256:1c5171a543ae4e21cd31b367763559376ac764272d2694e29a4337d23bc699fe
+size 1346382
models/tokenizer/dga_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/dga_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ce78da224fb5b29f03d6213868be8683d4052fc0e52e7e0cbbd27666c375e14
-size 369134
+oid sha256:f812eb174504a46a3df3548ce2b36943ee8709f15da981d20e06bcd090f36da8
+size 372649
models/tokenizer/dga_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/dga_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1bd94786291299be72a5be7e116f391249bdc68e0bded026706ea3b22708c4c8
-size 698852
+oid sha256:fb56e1a5ae64d3c05131485aff4c1f555eb471d296f0e0dfd449a3ac2a4307d8
+size 566914
models/vocabulary/dga_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
 {
   "language": "dga",
-  "vocabulary_size": 40845,
+  "vocabulary_size": 33219,
+  "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.07889417385692979,
+    "type_token_ratio": 0.06853071308718046,
     "coverage": {
-      "top_100": 0.4500698726906755,
-      "top_1000": 0.6972710476345139,
-      "top_5000": 0.8312848740226014,
-      "top_10000": 0.8771366014664006
+      "top_100": 0.49309902585584753,
+      "top_1000": 0.7210536107323922,
+      "top_5000": 0.8525060013355275,
+      "top_10000": 0.89766138896653
     },
-    "hapax_count": 69921,
-    "hapax_ratio": 0.631249661448459,
-    "total_documents": 3853
+    "hapax_count": 43033,
+    "hapax_ratio": 0.564352410428579,
+    "total_documents": 3820
   }
 }
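The updated coverage numbers in this metadata can be checked against the parquet payload. A hedged sketch; the `word` and `frequency` column names are assumptions:

```python
# Sketch: cross-check vocabulary_size and top-100 coverage from the parquet.
import json

import pandas as pd

vocab = pd.read_parquet("models/vocabulary/dga_vocabulary.parquet")
with open("models/vocabulary/dga_vocabulary_metadata.json") as f:
    meta = json.load(f)

print(len(vocab), meta["vocabulary_size"])  # expect 33,219 after this commit

top_100 = vocab["frequency"].nlargest(100).sum()  # assumed column name
print(top_100 / vocab["frequency"].sum())         # expect ≈ 0.4931
```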
models/word_markov/dga_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:00aeb4a119714d9b31ec3351d405875ff617499e74aa5c4cc29b957a118b9b57
-size 4397669
+oid sha256:868b4ce1ba4db3c0a6352234f8049f7c35a4e8393e16dde6bd5fbacdf7fcd405
+size 2926334
models/word_markov/dga_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "dga",
-  "unique_contexts": 110912,
-  "total_transitions": 1863288
+  "unique_contexts": 76185,
+  "total_transitions": 1108849
 }
models/word_markov/dga_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1945878f20d05022c1bb511b96c0631e6268a2fbdaaa18299e4c5632860c4063
-size 9997328
+oid sha256:ac10270f52d31bd0366bc6e60776fc1eeb7019c509213cb098107ff1ffa4e9ff
+size 7424049
models/word_markov/dga_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "dga",
-  "unique_contexts": 505122,
-  "total_transitions": 1859435
+  "unique_contexts": 402963,
+  "total_transitions": 1105029
 }