-
Notifications
You must be signed in to change notification settings - Fork 70
Expand file tree
/
Copy pathcli.py
More file actions
143 lines (123 loc) · 3.92 KB
/
cli.py
File metadata and controls
143 lines (123 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
"""
Pocket TTS - Local text-to-speech using Kyutai's Pocket TTS model.

Usage:
    pocket-tts "Your text here" [--output FILE] [--voice VOICE] [--speed FLOAT]

Installation:
    pip install pocket-tts

Note: You must accept the model license at:
    https://huggingface.co/kyutai/pocket-tts
"""
import argparse
import os
import sys
from pathlib import Path
# The third-party packages are imported inside a guard so that a missing
# dependency yields installation instructions rather than a traceback.
try:
    from pocket_tts import TTSModel
    import scipy.io.wavfile
except ImportError:
    _install_help = (
        "❌ Pocket TTS not installed.",
        "Install with: pip install pocket-tts",
        "",
        "⚠️ Accept the model license first:",
        " https://huggingface.co/kyutai/pocket-tts",
    )
    # One print call; the output is the same five lines, each newline-terminated.
    print("\n".join(_install_help))
    sys.exit(1)
# Built-in voice presets; these are the values accepted by --voice and the
# names printed by --list-voices.
VOICES = [
    "alba",
    "marius",
    "javert",
    "jean",
    "fantine",
    "cosette",
    "eponine",
    "azelma",
]
def main(argv=None):
    """Run the Pocket TTS command-line interface.

    Depending on the parsed options this lists the built-in voices, launches
    the ``pocket-tts serve`` command, or synthesizes the given text into a
    WAV file.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``.

    Exits (via ``sys.exit``):
        0 after ``--list-voices``; the server command's exit status after
        ``--serve``; 1 when no text was supplied, the server command cannot
        be found, or synthesis raises. Returns normally after a successful
        synthesis.
    """
    # Build the voice list in the help text from VOICES so the two cannot
    # drift apart.
    epilog = (
        "\n"
        "Available voices:\n"
        f"  {', '.join(VOICES)}\n"
        "\n"
        "Examples:\n"
        '  pocket-tts "Hello world"\n'
        '  pocket-tts "Hello" --voice alba --output hello.wav\n'
        '  pocket-tts "Text-to-speech is awesome" --speed 1.2\n'
    )
    parser = argparse.ArgumentParser(
        description="Pocket TTS - Local text-to-speech",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog,
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Text to convert to speech",
    )
    parser.add_argument(
        "-o", "--output",
        default="output.wav",
        help="Output WAV file (default: output.wav)",
    )
    parser.add_argument(
        "-v", "--voice",
        default="alba",
        choices=VOICES,
        help="Voice preset (default: alba)",
    )
    parser.add_argument(
        "-s", "--speed",
        type=float,
        default=1.0,
        help="Speech speed 0.5-2.0 (default: 1.0)",
    )
    parser.add_argument(
        "--voice-file",
        help="Use custom WAV file for voice cloning",
    )
    parser.add_argument(
        "--serve",
        action="store_true",
        help="Start local TTS server",
    )
    parser.add_argument(
        "--list-voices",
        action="store_true",
        help="List available voices",
    )
    args = parser.parse_args(argv)

    if args.list_voices:
        print("🎤 Available voices:")
        for voice in VOICES:
            print(f" - {voice}")
        print("")
        print("Or use --voice-file /path/to/voice.wav for custom voice cloning")
        sys.exit(0)

    if args.serve:
        # Imported here because only this branch needs it.
        import subprocess

        print("🚀 Starting Pocket TTS server on http://localhost:8000")
        try:
            # Argument list, no shell; the child's exit status is propagated
            # instead of unconditionally reporting success.
            completed = subprocess.run(["pocket-tts", "serve"], check=False)
        except FileNotFoundError:
            print("❌ Error: 'pocket-tts' command not found on PATH.")
            sys.exit(1)
        except KeyboardInterrupt:
            # Ctrl-C is the usual way to stop a foreground server; exit
            # without a traceback using the conventional SIGINT status.
            sys.exit(130)
        sys.exit(completed.returncode)

    if not args.text:
        parser.print_help()
        print("\n💡 Tip: pocket-tts \"Hello, world!\"")
        sys.exit(1)

    if args.speed != 1.0:
        # The speed value is parsed but never passed to the model below, so
        # tell the user rather than silently ignoring the option.
        print(
            "⚠️ --speed is currently not applied; generating at normal speed.",
            file=sys.stderr,
        )

    print("🔊 Generating speech...")
    print(f"📝 Text: {args.text[:60]}{'...' if len(args.text) > 60 else ''}")
    print(f"🎤 Voice: {args.voice}")

    try:
        # Load model
        tts_model = TTSModel.load_model()

        # Get voice state: a user-supplied file takes precedence over presets.
        if args.voice_file:
            voice_state = tts_model.get_state_for_audio_prompt(args.voice_file)
            print(f"🎭 Using custom voice from: {args.voice_file}")
        else:
            # NOTE(review): every preset is mapped to
            # "<voice>-mackenna/casual.wav"; confirm this path exists in the
            # voices repository for each entry in VOICES.
            voice_state = tts_model.get_state_for_audio_prompt(
                f"hf://kyutai/tts-voices/{args.voice}-mackenna/casual.wav"
            )

        # Generate audio
        audio = tts_model.generate_audio(voice_state, args.text)

        # Save audio. The file imports the module as ``import scipy.io.wavfile``,
        # so the fully qualified name is required; a bare ``wavfile`` is undefined.
        scipy.io.wavfile.write(args.output, tts_model.sample_rate, audio.numpy())

        print(f"✅ Saved to: {args.output}")
        print(f"📊 Sample rate: {tts_model.sample_rate} Hz")
        print(f"📏 Audio length: {len(audio) / tts_model.sample_rate:.2f}s")
    except Exception as e:
        # Top-level boundary of the CLI: report the failure and exit non-zero
        # instead of showing a traceback.
        print(f"❌ Error: {e}")
        print("")
        print("Make sure you've accepted the model license at:")
        print(" https://huggingface.co/kyutai/pocket-tts")
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()