Przejdź do głównej zawartości

Speaker Diarization

Speaker Diarization to proces identyfikacji i segmentacji audio według mówców. W Vista rozróżniamy:

  • Lekarz (SPEAKER_00)
  • Właściciel (SPEAKER_01)
  • Opcjonalnie: dodatkowy personel

%% Diarization pipeline: raw audio -> voice activity detection -> speaker
%% embeddings -> clustering -> speaker-labelled output.
sequenceDiagram
%% Pipeline stages, declared in the order they are drawn left to right.
participant Audio as 🎤 Audio File
participant VAD as Voice Activity Detection
participant EMB as Speaker Embeddings
participant CLU as Clustering
participant OUT as 📝 Labeled Output
%% Step 1: the raw audio is passed to VAD, which keeps only the speech segments.
Audio->>VAD: Raw audio
VAD->>VAD: Detect speech segments
%% Step 2: speech-only segments are turned into speaker embedding vectors.
VAD->>EMB: Speech-only segments
EMB->>EMB: Extract speaker embeddings
Note over EMB: Neural network extracts<br/>voice characteristics
%% Step 3: embedding vectors are clustered so that segments are grouped by speaker.
EMB->>CLU: Embedding vectors
CLU->>CLU: Cluster similar voices
Note over CLU: Group segments by speaker
%% Step 4: each cluster becomes a SPEAKER_NN label attached to time ranges.
CLU->>OUT: Labeled segments
Note over OUT: SPEAKER_00: 0:00-0:15<br/>SPEAKER_01: 0:15-0:30

/// Per-speaker summary of a diarization run: the speaker's identifier, an
/// optional human-readable role label, total speech time and a confidence score.
///
/// Derives `Debug`, `Serialize` and `Deserialize`.
#[derive(Debug, Serialize, Deserialize)]
pub struct SpeakerInfo {
pub id: String, // Speaker identifier, e.g. "SPEAKER_00", "SPEAKER_01"
pub label: Option<String>, // Optional role label, e.g. "Lekarz" (vet), "Właściciel" (owner)
pub total_speech_time: f64, // Total seconds of speech
pub confidence: f32, // Clustering confidence (value range not specified here)
}
/// One span of transcribed speech attributed to a single speaker.
///
/// Derives `Debug`, `Serialize` and `Deserialize`.
#[derive(Debug, Serialize, Deserialize)]
pub struct SpeakerSegment {
pub speaker_id: String, // Speaker identifier; presumably matches `SpeakerInfo::id` - confirm
pub start: f64, // Start time in seconds
pub end: f64, // End time in seconds
pub text: String, // Transcribed text
}
/// Complete output of a diarization run: the per-speaker summaries, the
/// speaker-attributed segments and an overall quality rating.
///
/// Derives `Debug`, `Serialize` and `Deserialize`.
#[derive(Debug, Serialize, Deserialize)]
pub struct DiarizationResult {
pub speakers: Vec<SpeakerInfo>, // One entry per speaker
pub segments: Vec<SpeakerSegment>, // Speaker-attributed segments (ordering not specified here)
pub quality: DiarizationQuality, // Overall quality rating of this result
}
/// Coarse three-level rating of how trustworthy a diarization result is.
///
/// Derives `Debug`, `Serialize` and `Deserialize`.
#[derive(Debug, Serialize, Deserialize)]
pub enum DiarizationQuality {
High, // Clear separation, high confidence
Medium, // Some overlap, moderate confidence
Low, // Significant overlap, low confidence
}

-- Stores one row per diarized speech segment: who spoke, when, and what was said.
-- Every row belongs to a visit (required) and may reference a recording (optional).
CREATE TABLE speaker_segments (
id TEXT PRIMARY KEY,
visit_id TEXT NOT NULL REFERENCES visits(visit_id),
recording_id TEXT REFERENCES recordings(id),
-- Speaker identification
speaker_id TEXT NOT NULL, -- Diarization identifier, e.g. "SPEAKER_00"
speaker_label TEXT, -- Optional role label, e.g. "Lekarz" (vet), "Właściciel" (owner)
-- Timing
start_time REAL NOT NULL, -- Seconds from start
end_time REAL NOT NULL, -- Presumably also seconds from start - confirm
-- Content
text TEXT, -- Segment text; nullable
confidence REAL, -- Nullable confidence score (value range not specified here)
-- Audit
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
-- Secondary indexes on speaker_segments, one each for lookups by visit,
-- by recording and by speaker identifier.
CREATE INDEX idx_speaker_segments_visit ON speaker_segments(visit_id);
CREATE INDEX idx_speaker_segments_recording ON speaker_segments(recording_id);
CREATE INDEX idx_speaker_segments_speaker ON speaker_segments(speaker_id);

| Provider | Diarization | Quality |
| --- | --- | --- |
| LibraxisAI | ✅ Yes | High |
| MLX Whisper | ❌ No | - |
| OpenAI Whisper | ❌ No | - |

src/components/audio/SpeakerLabels.tsx
/** Props accepted by the `SpeakerLabels` component. */
interface SpeakerLabelsProps {
/** Diarized segments; the component groups them by `speaker_id`. */
segments: SpeakerSegment[];
/** Called with the speaker's id and the newly chosen label when the selection changes. */
onLabelChange: (speakerId: string, label: string) => void;
}
/**
 * Renders one row per distinct speaker found in `segments`: an avatar, a role
 * selector and the speaker's accumulated speaking time.
 *
 * @param segments - Diarized segments; grouped by `speaker_id`.
 * @param onLabelChange - Invoked with `(speakerId, label)` when a role is picked.
 */
export const SpeakerLabels: React.FC<SpeakerLabelsProps> = ({
  segments,
  onLabelChange,
}) => {
  // Group by speaker; recomputed only when `segments` changes.
  const speakers = useMemo(() => {
    const grouped = groupBy(segments, 'speaker_id');
    return Object.entries(grouped).map(([id, segs]) => ({
      id,
      // Take the label from the first segment that carries one, so the selector
      // reflects a role that has already been assigned to this speaker.
      label: segs.find((s) => s.speaker_label)?.speaker_label,
      // Speaking time is the sum of the segment durations (end - start).
      totalTime: segs.reduce((sum, s) => sum + (s.end - s.start), 0),
      segments: segs,
    }));
  }, [segments]);

  return (
    <div className="space-y-4">
      {speakers.map((speaker) => (
        <div key={speaker.id} className="flex items-center gap-4">
          <SpeakerAvatar speakerId={speaker.id} />
          <Select
            // Fall back to the raw speaker id while no role has been assigned.
            value={speaker.label ?? speaker.id}
            onChange={(label) => onLabelChange(speaker.id, label)}
          >
            <SelectItem value="Lekarz">🩺 Lekarz</SelectItem>
            <SelectItem value="Właściciel">👤 Właściciel</SelectItem>
            <SelectItem value="Asystent">👥 Asystent</SelectItem>
          </Select>
          <span className="text-sm text-gray-500">
            {formatDuration(speaker.totalTime)} mówienia
          </span>
        </div>
      ))}
    </div>
  );
};

src/components/audio/TranscriptWithSpeakers.tsx
/** Props accepted by the `TranscriptWithSpeakers` component. */
interface TranscriptWithSpeakersProps {
/** Segments to render, in the order given. */
segments: SpeakerSegment[];
/** Optional playback position, compared against each segment's `start` and `end`. */
currentTime?: number; // For highlighting during playback
}
/**
 * Renders the transcript as a vertical list of segments, each showing a
 * speaker badge, the segment text and its time range. When `currentTime` is
 * provided, every segment whose range contains it is highlighted.
 *
 * @param segments - Segments to render, in the order given.
 * @param currentTime - Optional playback position used for highlighting.
 */
export const TranscriptWithSpeakers: React.FC<TranscriptWithSpeakersProps> = ({
  segments,
  currentTime,
}) => {
  // A segment counts as playing when the position lies within [start, end],
  // both ends inclusive; without a playback position nothing is highlighted.
  const isPlaying = (entry: SpeakerSegment): boolean => {
    if (currentTime === undefined) {
      return false;
    }
    return entry.start <= currentTime && entry.end >= currentTime;
  };

  const rows = segments.map((entry, position) => (
    <div
      key={position}
      className={cn(
        'flex gap-3 p-2 rounded',
        isPlaying(entry) && 'bg-blue-50 dark:bg-blue-900/20'
      )}
    >
      <div className="flex-shrink-0">
        <SpeakerBadge
          speakerId={entry.speaker_id}
          label={entry.speaker_label}
        />
      </div>
      <div className="flex-1">
        <p className="text-sm">{entry.text}</p>
        <span className="text-xs text-gray-400">
          {formatTime(entry.start)} - {formatTime(entry.end)}
        </span>
      </div>
    </div>
  ));

  return <div className="space-y-3">{rows}</div>;
};

| Factor | Impact on Quality |
| --- | --- |
| Overlapping speech | Reduces accuracy |
| Background noise | Reduces accuracy |
| Similar voices | Makes clustering harder |
| Short utterances | Less data for embeddings |
| Clear turn-taking | Improves accuracy |
/**
 * Rates a diarization result as 'High', 'Medium' or 'Low' from two signals:
 * the mean per-speaker confidence and the number of segments per minute of
 * total speech time.
 *
 * @param result - Diarization output; reads `speakers[].total_speech_time`,
 *   `speakers[].confidence` and `segments.length`.
 * @returns 'High' when mean confidence > 0.85 and fewer than 10 segments per
 *   minute; 'Medium' when mean confidence > 0.7 and fewer than 20 segments
 *   per minute; otherwise 'Low'. A result with no speakers or no positive
 *   speech time is always 'Low'.
 */
const assessDiarizationQuality = (result: DiarizationResult): DiarizationQuality => {
  const { speakers, segments } = result;

  const totalTime = speakers.reduce((sum, s) => sum + s.total_speech_time, 0);

  // Without speakers or a positive duration both ratios below are undefined
  // (0/0, n/0) or negative, so rate the result 'Low' explicitly instead of
  // relying on NaN/Infinity comparisons. `!(x > 0)` also catches NaN.
  if (speakers.length === 0 || !(totalTime > 0)) {
    return 'Low';
  }

  // Check average confidence
  const avgConfidence =
    speakers.reduce((sum, s) => sum + s.confidence, 0) / speakers.length;

  // Check segment count (too many switches = noisy).
  // NOTE(review): this counts segments, not actual speaker changes, and
  // normalizes by total speech time rather than recording length - confirm
  // that this is the intended metric.
  const switchesPerMinute = segments.length / (totalTime / 60);

  if (avgConfidence > 0.85 && switchesPerMinute < 10) {
    return 'High';
  } else if (avgConfidence > 0.7 && switchesPerMinute < 20) {
    return 'Medium';
  } else {
    return 'Low';
  }
};

Użytkownicy mogą ręcznie korygować błędną diaryzację:

/**
 * Merge segments from same speaker.
 *
 * NOTE(review): the body is elided (`// ...`) in this document, so the exact
 * merge rules are not visible here.
 *
 * @param segments - The segment list to operate on.
 * @param indices - Presumably positions within `segments` of the entries to
 *   merge - confirm against the real implementation.
 * @returns A segment list; presumably `segments` with the selected entries
 *   combined - confirm.
 */
const mergeSegments = (segments: SpeakerSegment[], indices: number[]): SpeakerSegment[] => {
// ...
};
/**
 * Split segment (when two speakers are incorrectly merged).
 *
 * NOTE(review): the body is elided (`// ...`) in this document, so how the
 * text and speaker ids are divided is not visible here.
 *
 * @param segment - The segment to split.
 * @param splitTime - Presumably a time in seconds between `segment.start` and
 *   `segment.end` - confirm against the real implementation.
 * @returns The segments resulting from the split.
 */
const splitSegment = (segment: SpeakerSegment, splitTime: number): SpeakerSegment[] => {
// ...
};
/**
 * Reassigns a segment to a different speaker by calling `invoke` with the
 * `update_speaker_segment` command.
 *
 * @param segmentId - Identifier of the segment to update.
 * @param newSpeakerId - Speaker id to assign; sent under the `speakerId` key.
 * @returns The promise returned by `invoke`.
 */
const reassignSpeaker = (segmentId: string, newSpeakerId: string): Promise<void> => {
  const payload = { segmentId, speakerId: newSpeakerId };
  return invoke('update_speaker_segment', payload);
};