humair025 committed
Commit 4cba748 · verified · 1 parent: 157475f

Update app.py

Files changed (1)
  1. app.py +40 -30
app.py CHANGED
@@ -5,62 +5,68 @@ from soprano import SopranoTTS
 from scipy.io.wavfile import write as wav_write
 import tempfile
 import os
-import spaces
 
+# Detect device
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-print(DEVICE)
+print(f"Using device: {DEVICE}")
 
-# Load model once
+# Load model once - works on both CUDA and CPU
 model = SopranoTTS(
-    backend="auto",
+    backend="auto",  # Will automatically choose best backend for device
     device=DEVICE,
-    cache_size_mb=100,
+    cache_size_mb=100,  # Only relevant for CUDA
     decoder_batch_size=1,
 )
 
 SAMPLE_RATE = 32000
 
-
-@spaces.GPU
+# Remove @spaces.GPU decorator - not needed for CPU support
 def tts_stream(text, temperature, top_p, repetition_penalty, state):
     if not text.strip():
         yield None, state
         return
-
+
     out = model.infer(
         text,
        temperature=temperature,
         top_p=top_p,
         repetition_penalty=repetition_penalty,
     )
-
+
     audio_np = out.cpu().numpy()
     yield (SAMPLE_RATE, audio_np), audio_np
 
-
 def save_audio(state):
     if state is None or len(state) == 0:
         return None
+
     fd, path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     wav_write(path, SAMPLE_RATE, state)
     return path
 
-
 with gr.Blocks() as demo:
     state_audio = gr.State(None)
-
+
     with gr.Row():
         with gr.Column():
-            gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
-
+            gr.Markdown(
+                f"# Soprano Demo\n\n"
+                f"**Running on: {DEVICE.upper()}**\n\n"
+                "Soprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, "
+                "high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** "
+                "and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\n"
+                "Github: https://github.com/ekwek1/soprano\n\n"
+                "Model Weights: https://huggingface.co/ekwek/Soprano-80M"
+            )
+
             text_in = gr.Textbox(
                 label="Input Text",
                 placeholder="Enter text to synthesize...",
                 value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                 lines=4,
             )
-
+
             with gr.Accordion("Advanced options", open=False):
                 temperature = gr.Slider(
                     0.0, 1.0, value=0.3, step=0.05, label="Temperature"
@@ -71,37 +77,41 @@ with gr.Blocks() as demo:
                 repetition_penalty = gr.Slider(
                     1.0, 2.0, value=1.2, step=0.05, label="Repetition penalty"
                 )
-
+
             gen_btn = gr.Button("Generate")
-
+
         with gr.Column():
             audio_out = gr.Audio(
                 label="Output Audio",
                 autoplay=True,
                 streaming=False,
             )
-            #download_btn = gr.Button("Download")
-            #file_out = gr.File(label="Download file")
+
+            download_btn = gr.Button("Download")
+            file_out = gr.File(label="Download file")
+
             gr.Markdown(
                 "Usage tips:\n\n"
                 "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
-                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
-                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.\n"
-                "- Avoid improper grammar such as not using contractions, multiple spaces, etc."
+                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. "
+                "Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
+                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. "
+                "You may also change the sampling settings for more varied results.\n"
+                "- Avoid improper grammar such as not using contractions, multiple spaces, etc.\n\n"
+                f"**Note:** {'GPU acceleration active' if DEVICE == 'cuda' else 'Running on CPU - generation may be slower'}"
             )
-
-
+
     gen_btn.click(
         fn=tts_stream,
         inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
         outputs=[audio_out, state_audio],
     )
-
-    #download_btn.click(
-    #    fn=save_audio,
-    #    inputs=[state_audio],
-    #    outputs=[file_out],
-    #)
+
+    download_btn.click(
+        fn=save_audio,
+        inputs=[state_audio],
+        outputs=[file_out],
+    )
 
 demo.queue()
 demo.launch()
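
For reference, a minimal standalone sketch of the CPU path this commit enables. It is not part of the commit; it only reuses the SopranoTTS constructor arguments and infer() keyword arguments that appear in app.py above, so treat the exact API surface as an assumption. The temperature and repetition_penalty values match the slider defaults in app.py; the top_p value is an arbitrary example.

import torch
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write

# Same device selection and output sample rate as app.py.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SAMPLE_RATE = 32000

# Constructor arguments copied from app.py; the commit notes that
# cache_size_mb is only relevant for CUDA.
model = SopranoTTS(
    backend="auto",
    device=DEVICE,
    cache_size_mb=100,
    decoder_batch_size=1,
)

# infer() keyword arguments as used in tts_stream(); temperature and
# repetition_penalty are the app's slider defaults, top_p is an example value.
out = model.infer(
    "Soprano is an extremely lightweight text to speech model.",
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.2,
)

# app.py treats the result as a torch tensor and calls .cpu().numpy() on it
# before handing it to gr.Audio or wav_write.
wav_write("soprano_test.wav", SAMPLE_RATE, out.cpu().numpy())

On a machine without CUDA, DEVICE resolves to "cpu", which is the configuration this commit targets by dropping the spaces import and the @spaces.GPU decorator.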