pyannote
date: 2025-07-26
excerpt: Overview and usage of pyannote
Overview and usage of pyannote
Overview
- A library that makes it easy to use speech models hosted on Hugging Face
- Supports voice activity detection (used below to strip silent regions) and speaker diarization
Installation
$ pip install pyannote.audio
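The pretrained pipelines used below are gated models on Hugging Face, so an access token is required. A minimal sketch of the two usual options, assuming a token created at huggingface.co (the hf_XXXX... values here and below are placeholders): log in once with the CLI, or call huggingface_hub.login at the top of a script instead of passing use_auth_token everywhere.
$ huggingface-cli login
# or, in Python:
from huggingface_hub import login
login(token="hf_XXXXXXXXXXXXXX")  # placeholder; substitute your own token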
Usage by use case
Silence removal (voice activity detection)
from pyannote.audio import Pipeline
import torch
import torchaudio
import soundfile as sf

# You must first accept the terms of use for these models:
# https://huggingface.co/pyannote/voice-activity-detection
# https://huggingface.co/pyannote/segmentation
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                    use_auth_token="hf_XXXXXXXXXXXXXX")
vad = pipeline("input.wav")
# Merge adjacent/overlapping speech regions into a clean timeline
segments = vad.get_timeline().support()

wave, sr = torchaudio.load("input.wav")
# Keep only the speech regions and concatenate them along the time axis
output = torch.cat([wave[:, int(s.start * sr):int(s.end * sr)] for s in segments], dim=1)
# soundfile expects (frames, channels), hence the transpose
sf.write("output.wav", output.T.numpy(), sr)
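To sanity-check the detection before cutting the audio, the timeline can be iterated directly; each element is a pyannote.core Segment with start and end times in seconds. A minimal sketch:
for s in segments:
    print(f"speech: {s.start:.2f}s - {s.end:.2f}s")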
Speaker diarization
from pyannote.audio import Pipeline

pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="hf_XXXXXXXXXXXXXXX")

# To run on GPU:
# import torch
# pipeline.to(torch.device("cuda"))

annotation = pipeline("input.wav",
                      num_speakers=2)  # fix the speaker count (min_speakers / max_speakers also work)

for turn, _, speaker in annotation.itertracks(yield_label=True):
    print(f"{turn.start:.1f}s — {turn.end:.1f}s: speaker {speaker}")
########################################################
# Post-processing: extract solo audio for each speaker
########################################################
from pydub import AudioSegment
from collections import defaultdict
from pyannote.core import Timeline

audio = AudioSegment.from_file("input.wav")
solo_tracks = defaultdict(list)  # speaker -> list[AudioSegment]

for spk in annotation.labels():  # e.g. SPEAKER_00, SPEAKER_01, ...
    # Timeline of this speaker's speech
    spk_tl = annotation.label_timeline(spk)  # Timeline
    # Timeline of all other speakers, merged via set operations
    others = Timeline()
    for other in annotation.labels():
        if other == spk:
            continue
        others = others.union(annotation.label_timeline(other))
    # extrude = spk_tl - others (cut out the overlapping parts)
    solo_tl = spk_tl.extrude(others)
    # Slice the remaining regions out of the audio as pydub segments
    for segment in solo_tl:
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)
        solo_tracks[spk].append(audio[start_ms:end_ms])
# Concatenate in chronological order and export as MP3
for spk, segs in solo_tracks.items():
    if not segs:
        continue
    merged = segs[0]
    for seg in segs[1:]:
        merged += seg
    outfile = f"{spk}_solo.mp3"
    merged.export(outfile, format="mp3", bitrate="192k")
    print(f"Exported {outfile} ({len(merged)/1000:.1f}s)")