from transformers import Qwen3OmniMoeThinkerForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import torch

model_path = "./"
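
# Load the thinker model in bfloat16 with SDPA attention, placed automatically
# across the available devices, along with its matching processor.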
model = Qwen3OmniMoeThinkerForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="sdpa",
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_path)
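
# Chat-style conversation: a system prompt plus a user turn carrying the audio file
# and the transcription instruction.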
conversation = [
    {
        "role": "system",
        "content": [
            {"type": "text", "text": "You are a speech recognition model."}
        ],
    },
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "test.wav"},
            {"type": "text", "text": "Transcribe the audio into text."},
        ],
    },
]
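
# No video inputs in this example, so audio tracks are not extracted from videos.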
USE_AUDIO_IN_VIDEO = False
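
# Render the chat template to a prompt string and collect the multimodal inputs
# referenced in the conversation.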
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
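
# Encode the prompt and audio into model inputs, then move them to the model's device and dtype.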
inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
inputs = inputs.to(model.device).to(model.dtype)
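
# A very low temperature keeps decoding effectively greedy for a stable transcription.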
text_ids = model.generate(**inputs, temperature=0.01)
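
# Strip the prompt tokens and decode only the newly generated ids.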
text = processor.batch_decode(text_ids[:, inputs["input_ids"].shape[1]:],
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
print(text)