ONNX export of LFM2.5-VL-1.6B for cross-platform inference.
| Encoder | Decoder | Size | Platform | Use Case |
|---|---|---|---|---|
| FP16 | Q4 | ~1.5GB | WebGPU, Server | Recommended for most uses |
| FP16 | FP16 | ~3.2GB | Server | Higher quality |
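The two rows in the table map onto concrete files from the repository layout shown below. A small illustrative sketch of that mapping (the `VARIANTS` dictionary and its keys are our own naming, not part of the repo):

```python
# Illustrative mapping from the variant table to repo files (dictionary and key names are ours).
VARIANTS = {
    "recommended": {   # FP16 encoder + Q4 decoder, ~1.5GB
        "embed_tokens": "onnx/embed_tokens_fp16.onnx",
        "embed_images": "onnx/embed_images_fp16.onnx",
        "decoder": "onnx/decoder_q4.onnx",
    },
    "high_quality": {  # FP16 encoder + FP16 decoder, ~3.2GB
        "embed_tokens": "onnx/embed_tokens_fp16.onnx",
        "embed_images": "onnx/embed_images_fp16.onnx",
        "decoder": "onnx/decoder_fp16.onnx",
    },
}
```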
```
onnx/
├── embed_tokens.onnx        # Token embeddings (FP32)
├── embed_tokens_fp16.onnx   # Token embeddings (FP16)
├── embed_images.onnx        # Vision encoder (FP32)
├── embed_images_fp16.onnx   # Vision encoder (FP16)
├── embed_images_q4.onnx     # Vision encoder (Q4)
├── embed_images_q8.onnx     # Vision encoder (Q8)
├── decoder.onnx             # Language decoder (FP32)
├── decoder_fp16.onnx        # Language decoder (FP16)
├── decoder_q4.onnx          # Language decoder (Q4)
└── decoder_q8.onnx          # Language decoder (Q8)
```
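If you want to confirm the layout (or spot any external `.onnx_data` payloads) without cloning, the repo can be listed with `huggingface_hub`; a minimal sketch:

```python
from huggingface_hub import list_repo_files

# Print every file under onnx/ in the model repo, including any .onnx_data companions.
for f in sorted(list_repo_files("LiquidAI/LFM2.5-VL-1.6B-ONNX")):
    if f.startswith("onnx/"):
        print(f)
```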
Install the Python dependencies:

```bash
pip install onnxruntime transformers pillow torch huggingface_hub
# or with GPU support:
pip install onnxruntime-gpu transformers pillow torch huggingface_hub
```
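After installing, it can be worth checking which execution providers your ONNX Runtime build exposes (CPU only, or CUDA with `onnxruntime-gpu`); a quick sanity check:

```python
import onnxruntime as ort

# CPUExecutionProvider is always available; CUDAExecutionProvider shows up with onnxruntime-gpu.
print("onnxruntime", ort.__version__)
print(ort.get_available_providers())
```

The walkthrough below downloads the recommended fp16 encoder + q4 decoder combination and runs image-to-text generation with a manually managed KV cache.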
```python
import numpy as np
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor
from PIL import Image

# Download model files (fp16 encoder + q4 decoder recommended)
model_id = "LiquidAI/LFM2.5-VL-1.6B-ONNX"
embed_tokens_path = hf_hub_download(model_id, "onnx/embed_tokens_fp16.onnx")
embed_images_path = hf_hub_download(model_id, "onnx/embed_images_fp16.onnx")
decoder_path = hf_hub_download(model_id, "onnx/decoder_q4.onnx")

# Load ONNX sessions
embed_tokens = ort.InferenceSession(embed_tokens_path)
embed_images = ort.InferenceSession(embed_images_path)
decoder = ort.InferenceSession(decoder_path)

# Load processor
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# Prepare input
image = Image.open("photo.jpg")
messages = [{"role": "user", "content": [
    {"type": "image"},
    {"type": "text", "text": "What is in this image?"}
]}]

# Process inputs
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(images=[image], text=prompt, return_tensors="pt")

# Convert to numpy with correct dtypes
pixel_values = inputs["pixel_values"].numpy().astype(np.float32)
pixel_attention_mask = inputs["pixel_attention_mask"].numpy().astype(np.int64)
spatial_shapes = inputs["spatial_shapes"].numpy().astype(np.int64)
input_ids = inputs["input_ids"].numpy().astype(np.int64)

# Get image embeddings
image_outputs = embed_images.run(None, {
    "pixel_values": pixel_values,
    "pixel_attention_mask": pixel_attention_mask,
    "spatial_shapes": spatial_shapes,
})
image_embeds = image_outputs[0]

# Get token embeddings
token_outputs = embed_tokens.run(None, {"input_ids": input_ids})
token_embeds = token_outputs[0]

# Replace <image> tokens with image embeddings
image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
image_positions = np.where(input_ids[0] == image_token_id)[0]
for i, pos in enumerate(image_positions):
    if i < len(image_embeds):
        token_embeds[0, pos] = image_embeds[i]

# Initialize KV cache for stateful decoding
ONNX_DTYPE = {"tensor(float)": np.float32, "tensor(float16)": np.float16, "tensor(int64)": np.int64}
cache = {}
for inp in decoder.get_inputs():
    if inp.name in {"inputs_embeds", "attention_mask", "position_ids"}:
        continue
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    for i, d in enumerate(inp.shape):
        if isinstance(d, str) and "sequence" in d.lower():
            shape[i] = 0
    cache[inp.name] = np.zeros(shape, dtype=ONNX_DTYPE.get(inp.type, np.float32))

# Generate tokens
seq_len = token_embeds.shape[1]
generated_tokens = []
for step in range(100): # max tokens
    if step == 0:
        embeds = token_embeds.astype(np.float32)
    else:
        last_token = np.array([[generated_tokens[-1]]], dtype=np.int64)
        embeds = embed_tokens.run(None, {"input_ids": last_token})[0].astype(np.float32)
    attn_mask = np.ones((1, seq_len + len(generated_tokens)), dtype=np.int64)
    feed = {"inputs_embeds": embeds, "attention_mask": attn_mask, **cache}
    outputs = decoder.run(None, feed)
    next_token = int(np.argmax(outputs[0][0, -1]))
    generated_tokens.append(next_token)
    # Update cache
    for i, out in enumerate(decoder.get_outputs()[1:], 1):
        name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
        if name in cache:
            cache[name] = outputs[i]
    if next_token == processor.tokenizer.eos_token_id:
        break

print(processor.tokenizer.decode(generated_tokens, skip_special_tokens=True))
```
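The loop above decodes greedily by taking the `argmax` of the last-token logits. If you want sampled rather than deterministic output, that line can be swapped for temperature sampling; a minimal sketch (the `temperature` and `top_k` values are arbitrary choices):

```python
import numpy as np

def sample_next_token(last_logits: np.ndarray, temperature: float = 0.7, top_k: int = 50) -> int:
    """Sample a token id from the last-position logits instead of taking the argmax."""
    scaled = last_logits.astype(np.float64) / temperature
    top_ids = np.argsort(scaled)[-top_k:]              # keep only the top_k candidates
    probs = np.exp(scaled[top_ids] - scaled[top_ids].max())
    probs /= probs.sum()
    return int(np.random.choice(top_ids, p=probs))

# Inside the generation loop, replace the greedy line with:
# next_token = sample_next_token(outputs[0][0, -1])
```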
For in-browser inference, install the JavaScript dependencies:

```bash
npm install onnxruntime-web @huggingface/transformers
```
WebGPU is required for browser inference. To enable:
1. Open `chrome://flags/#enable-unsafe-webgpu`, enable the flag, and restart the browser
2. Check `chrome://gpu` for "WebGPU" status
3. Confirm `navigator.gpu.requestAdapter()` returns an adapter in the DevTools console

```js
import * as ort from "onnxruntime-web/webgpu";
import { AutoTokenizer } from "@huggingface/transformers";

// Check WebGPU availability
if (!navigator.gpu) {
  throw new Error("WebGPU not available. Enable at chrome://flags/#enable-unsafe-webgpu");
}
const adapter = await navigator.gpu.requestAdapter();
if (!adapter) {
  throw new Error("WebGPU adapter not found. Check chrome://gpu for status.");
}

ort.env.wasm.numThreads = 1;

const modelId = "LiquidAI/LFM2.5-VL-1.6B-ONNX";
const modelBase = `https://huggingface.co/${modelId}/resolve/main`;

// Load tokenizer
const tokenizer = await AutoTokenizer.from_pretrained(modelId);

// Load ONNX sessions with external data
async function loadSession(name) {
  const onnxPath = `${modelBase}/onnx/${name}.onnx`;
  const dataPath = `${modelBase}/onnx/${name}.onnx_data`;
  return ort.InferenceSession.create(onnxPath, {
    executionProviders: ["webgpu"],
    externalData: [{ path: `${name}.onnx_data`, data: dataPath }],
  });
}

const embedTokens = await loadSession("embed_tokens_fp16");
const embedImages = await loadSession("embed_images_fp16");
const decoder = await loadSession("decoder_q4");

// Model config (from config.json)
const hiddenSize = 1536;
const numKVHeads = 12;
const headDim = 128;

// Get text embeddings helper
async function getTextEmbeddings(ids) {
  const tensor = new ort.Tensor("int64", new BigInt64Array(ids.map(BigInt)), [1, ids.length]);
  const out = await embedTokens.run({ input_ids: tensor });
  return out.inputs_embeds;
}

// Initialize KV cache
function initCache() {
  const cache = {};
  for (const name of decoder.inputNames) {
    if (name.startsWith("past_conv")) {
      cache[name] = new ort.Tensor("float32", new Float32Array(hiddenSize * 3), [1, hiddenSize, 3]);
    } else if (name.startsWith("past_key_values")) {
      cache[name] = new ort.Tensor("float32", new Float32Array(0), [1, numKVHeads, 0, headDim]);
    }
  }
  return cache;
}

// Update cache from outputs
function updateCache(cache, outputs) {
  for (const [name, tensor] of Object.entries(outputs)) {
    if (name.startsWith("present_conv")) {
      cache[name.replace("present_conv", "past_conv")] = tensor;
    } else if (name.startsWith("present.")) {
      cache[name.replace("present.", "past_key_values.")] = tensor;
    }
  }
}

// Build prompt and tokenize (example text-only prompt)
const messages = [{ role: "user", content: "Describe what WebGPU is in one sentence." }];
const prompt = tokenizer.apply_chat_template(messages, { add_generation_prompt: true, tokenize: false });
const inputIds = tokenizer.encode(prompt);

// Get embeddings (for VL: merge image embeddings at <image> token positions)
let inputsEmbeds = await getTextEmbeddings(inputIds);

// Generation loop
const cache = initCache();
const eosTokenId = tokenizer.eos_token_id;
const generatedTokens = [];
let curLen = inputsEmbeds.dims[1];
let embeds = inputsEmbeds;

for (let step = 0; step < 256; step++) {
  const attentionMask = new ort.Tensor("int64", new BigInt64Array(curLen).fill(1n), [1, curLen]);
  const outputs = await decoder.run({ inputs_embeds: embeds, attention_mask: attentionMask, ...cache });

  // Greedy decode: argmax of last token logits
  // (manual loop avoids spreading a vocab-sized array into Math.max)
  const logits = outputs.logits;
  const vocabSize = logits.dims[2];
  const lastLogits = logits.data.slice((logits.dims[1] - 1) * vocabSize);
  let nextToken = 0;
  for (let i = 1; i < vocabSize; i++) {
    if (lastLogits[i] > lastLogits[nextToken]) nextToken = i;
  }
  generatedTokens.push(nextToken);
  if (nextToken === eosTokenId) break;

  updateCache(cache, outputs);
  embeds = await getTextEmbeddings([nextToken]);
  curLen++;
}

console.log(tokenizer.decode(generatedTokens, { skip_special_tokens: true }));
```
Notes:
- Recommended combination: `embed_images_fp16.onnx` + `decoder_q4.onnx`. For higher quality, use `embed_images_fp16.onnx` + `decoder_fp16.onnx`.
- Large model files come with external data files (`.onnx_data`) that are loaded automatically.
- In the browser, int64 inputs (such as `input_ids` and the attention mask) must be passed as `BigInt64Array` data.

This model is released under the LFM 1.0 License.