gemini tts
概要
- 複数の話者の音声を生成することができる
コード
from google import genai
from google.genai import types
import wave
# Set up the wave file to save the output:
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
with wave.open(filename, "wb") as wf:
wf.setnchannels(channels)
wf.setsampwidth(sample_width)
wf.setframerate(rate)
wf.writeframes(pcm)
client = genai.Client()
prompt = """TTS the following conversation between Joe and Jane in Japanese:
Joe: ジェーン、今日はどんな感じ?
Jane: 悪くないわね、あなたはどう?"""
response = client.models.generate_content(
model="gemini-2.5-pro-preview-tts",
contents=prompt,
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
speaker_voice_configs=[
types.SpeakerVoiceConfig(
speaker="Joe",
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name="Puck",
)
),
),
types.SpeakerVoiceConfig(
speaker="Jane",
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name="Zephyr",
)
),
),
]
)
),
),
)
data = response.candidates[0].content.parts[0].inline_data.data
file_name = "out.wav"
wave_file(file_name, data)