This model is the GPTQ-Int4 quantized thinker part of Qwen3-Omni-30B-A3B-Instruct. ASR data was used to calibrate the quantization.
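The card does not state which GPTQ toolchain produced this checkpoint. Purely as an illustration of what text-side ASR calibration can look like, here is a minimal sketch with the open-source GPTQModel library; the base model ID, calibration samples, group size, and batch size are assumptions, not the actual recipe, and GPTQModel support for this architecture is not guaranteed.

```python
# Illustrative only: the actual quantization recipe for this checkpoint is not published.
from gptqmodel import GPTQModel, QuantizeConfig

base_model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"   # assumed base checkpoint
calibration_texts = [                             # hypothetical ASR transcript samples
    "hello world this is a sample transcript",
    # ... a few hundred transcript lines from your ASR corpus ...
]

quant_config = QuantizeConfig(bits=4, group_size=128)   # GPTQ-Int4; group size is an assumption
model = GPTQModel.load(base_model, quant_config)
model.quantize(calibration_texts, batch_size=2)         # GPTQ calibration passes
model.save("Qwen3-omni-ASR-GPTQ-Int4")
```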
## Usage (Transformers)
```python
from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch

model_path = "ReopenAI/Qwen3-omni-ASR-GPTQ-Int4"

model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)

conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a speech recognition model."},
            # {"type": "text", "text": "You are a helpful assistant."},
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]

# Whether to use the audio track of video inputs (audio-only here, so False).
USE_AUDIO_IN_VIDEO = False

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
# print("audio duration (s): ", len(audios[0]) / 16000, text)
inputs = processor(text=text, audio=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = inputs.to(model.device).to(model.dtype)

text_ids = model.generate(**inputs, temperature=0.01)
text = processor.batch_decode(text_ids[:, inputs["input_ids"].shape[1]:],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)
```
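For multiple recordings, batching follows the same pattern. Below is an untested sketch that reuses the `model`, `processor`, and `USE_AUDIO_IN_VIDEO` objects above and assumes the processor accepts a list of conversations as in the upstream Qwen-Omni examples; the file names are placeholders.

```python
# Batched transcription sketch; assumes the same model/processor objects as above.
def build_conv(audio_path):
    return [
        {"role": "system", "content": [{"type": "text", "text": "You are a speech recognition model."}]},
        {"role": "user", "content": [
            {"type": "audio", "audio": audio_path},
            {"type": "text", "text": "Transcribe the audio into text."},
        ]},
    ]

conversations = [build_conv(p) for p in ["a.wav", "b.wav"]]  # placeholder file names
texts = processor.apply_chat_template(conversations, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversations, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = processor(text=texts, audio=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = inputs.to(model.device).to(model.dtype)
out_ids = model.generate(**inputs, temperature=0.01)
print(processor.batch_decode(out_ids[:, inputs["input_ids"].shape[1]:],
                             skip_special_tokens=True, clean_up_tokenization_spaces=False))
```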
## vLLM (strongly recommended)
```bash
unzip vllm-0.13.0.zip
cd vllm-0.13.0
pip install -e .
```
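After the editable install, a quick sanity check (a suggestion, not part of the original instructions) that Python resolves the unpacked tree rather than a previously installed wheel:

```python
import vllm
print(vllm.__version__, vllm.__file__)  # should point into the unpacked vllm-0.13.0 tree
```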
### Server
```bash
vllm serve "ReopenAI/Qwen3-omni-ASR-GPTQ-Int4" \
  --trust-remote-code \
  --tensor-parallel-size 1 \
  --served-model-name Qwen3_omni \
  --host 0.0.0.0 \
  --seed 1234 \
  --max-num-seqs 32 \
  --gpu-memory-utilization 0.95 \
  --limit-mm-per-prompt '{"image":1,"video":1,"audio":5}' \
  --max-model-len 1280 \
  --max-num-batched-tokens 32768
```
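Once the server is running, you can confirm the endpoint and the served model name before wiring up the client below. This sketch assumes the server is reachable on vLLM's default port 8000 on the same host:

```python
from openai import OpenAI

# Assumes the server above is reachable at vLLM's default port 8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
print([m.id for m in client.models.list().data])  # expect ['Qwen3_omni']
```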
### Client
```python
from openai import OpenAI
import base64, io
import numpy as np
import soundfile as sf
import librosa

# vLLM's OpenAI-compatible endpoint, e.g. "http://<server>:8000/v1"
client = OpenAI(base_url="your url", api_key="EMPTY", timeout=60)
MODEL = "Qwen3_omni"

# Load the audio, resample to 16 kHz mono, and encode it as a base64 PCM16 WAV.
y, sr = librosa.load("test.wav", sr=16000, mono=True)
pcm16 = (np.clip(y, -1, 1) * 32767).astype(np.int16)
buf = io.BytesIO()
sf.write(buf, pcm16, 16000, format="WAV", subtype="PCM_16")
wav_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

messages = [
    {"role": "system", "content": "You are a speech recognition model."},
    {
        "role": "user",
        "content": [
            {"type": "input_audio", "input_audio": {"data": wav_b64, "format": "wav"}},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]

resp = client.chat.completions.create(
    model=MODEL,
    messages=messages,
    max_tokens=160,
    temperature=1e-2,
    logprobs=True,
    top_logprobs=1,
    top_p=0.1,
    extra_body={"top_k": 1},
)
text = resp.choices[0].message.content
print("text: ", [text])
```