This model is the GPTQ-Int4 quantized thinker part of Qwen3-Omni-30B-A3B-Instruct. ASR data was used to calibrate the quantization.
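
As a quick sanity check, the quantization settings recorded in the checkpoint can be inspected before loading. The sketch below only reads config.json; where the quantization block lives and which keys it contains depend on the tool used for the GPTQ export, so treat the field names as assumptions.

import json
from huggingface_hub import hf_hub_download

# Fetch only config.json and print its quantization block (GPTQ exports typically
# record fields such as "quant_method", "bits", and "group_size", but names vary by tool).
cfg_path = hf_hub_download("ReopenAI/Qwen3-omni-ASR-GPTQ-Int4", "config.json")
with open(cfg_path) as f:
    cfg = json.load(f)
print(cfg.get("quantization_config"))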

# Usage with transformers
from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch

model_path = "ReopenAI/Qwen3-omni-ASR-GPTQ-Int4"
model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)


conversation = [
    {
    "role": "system",
    "content": [
        {"type": "text", "text": "You are a speech recognition model."}
        #{"type": "text", "text": "You are a helpful assistant."}
    ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]

# Whether to use the audio track of video inputs (not needed for audio-only ASR)
USE_AUDIO_IN_VIDEO = False

# Preparation for inference
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
#print("audios: ", len(audios[0]) / 16000, text)
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = inputs.to(model.device).to(model.dtype)

# Near-greedy decoding; temperature only takes effect when sampling is enabled
text_ids = model.generate(**inputs, do_sample=True, temperature=0.01)
text = processor.batch_decode(text_ids[:, inputs["input_ids"].shape[1] :],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)
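
# To transcribe several files with the same loaded model, the steps above can be wrapped
# in a small helper. This is a minimal sketch reusing the `model` and `processor` objects
# created above; the `transcribe` name and the 256-token cap are illustrative choices,
# not part of the original card.
def transcribe(audio_path: str) -> str:
    conv = [
        {"role": "system", "content": [{"type": "text", "text": "You are a speech recognition model."}]},
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": "Transcribe the audio into text."},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conv, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conv, use_audio_in_video=False)
    batch = processor(text=prompt, audio=audios, images=images, videos=videos,
                      return_tensors="pt", padding=True, use_audio_in_video=False)
    batch = batch.to(model.device).to(model.dtype)
    out = model.generate(**batch, do_sample=False, max_new_tokens=256)  # illustrative cap
    return processor.batch_decode(out[:, batch["input_ids"].shape[1]:],
                                  skip_special_tokens=True)[0]

for path in ["test.wav"]:  # replace with your own file list
    print(path, "->", transcribe(path))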
# vLLM (strongly recommended)
# Install vLLM from source:
unzip vllm-0.13.0.zip
cd vllm-0.13.0
pip install -e .
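
# After installation, a quick import check (run in Python) confirms which vLLM build will
# serve the model; this is an optional sanity check, not part of the original card.
import vllm
print(vllm.__version__)  # should report the version you just built, e.g. 0.13.0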

# Server
vllm serve "ReopenAI/Qwen3-omni-ASR-GPTQ-Int4" \
  --trust-remote-code \
  --tensor-parallel-size 1 \
  --served-model-name Qwen3_omni \
  --host 0.0.0.0 \
  --seed 1234 \
  --max-num-seqs 32 \
  --gpu-memory-utilization 0.95 \
  --limit-mm-per-prompt '{"image":1,"video":1,"audio":5}' \
  --max-model-len 1280 \
  --max-num-batched-tokens 32768
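
# Once the server is up it exposes an OpenAI-compatible API (on port 8000 by default);
# listing the models is a quick way to confirm the served name. The localhost URL below
# is an assumption about where you started the server.
from openai import OpenAI

check = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
print([m.id for m in check.models.list().data])  # expect ["Qwen3_omni"]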


# Client
from openai import OpenAI
import base64, io
import numpy as np
import soundfile as sf
import librosa

client = OpenAI(base_url="your url", api_key="EMPTY", timeout=60)  # e.g. "http://<host>:8000/v1"
MODEL = "Qwen3_omni"

# Load the audio at 16 kHz mono, convert it to 16-bit PCM WAV, and base64-encode it
y, sr = librosa.load("test.wav", sr=16000, mono=True)
pcm16 = (np.clip(y, -1, 1) * 32767).astype(np.int16)
buf = io.BytesIO()
sf.write(buf, pcm16, 16000, format="WAV", subtype="PCM_16")
wav_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

messages=[
        { "role": "system", "content": "You are a speech recognition model."},
        {
            "role": "user",
            "content": [
                {"type": "input_audio", "input_audio": {"data": wav_b64, "format": "wav"}},
                {"type": "text", "text": "Transcribe the audio into text."},
            ],
        },
]

resp = client.chat.completions.create(
    model=MODEL,
    messages=messages,
    max_tokens=160,
    temperature=1e-2,  # near-greedy sampling for stable transcriptions
    logprobs=True,
    top_logprobs=1,
    top_p=0.1,
    extra_body={"top_k": 1},  # vLLM-specific sampling parameter passed through the OpenAI client
)

text = resp.choices[0].message.content
print("text: ", [text])