
Audio Engines

flowchart LR
    subgraph Recording
        MIC[Microphone] --> CAP[Audio Capture]
        CAP --> BUF[Ring Buffer]
        BUF --> WAV[WAV Encoder]
    end
    subgraph Processing
        WAV --> STT[Speech-to-Text]
        STT --> DIAR[Diarization]
        DIAR --> SEG[Segmentation]
    end
    subgraph Storage
        WAV --> FS[File System]
        SEG --> DB[(SQLite)]
    end

Vista uses cpal (Cross-Platform Audio Library) for audio recording.

src-tauri/src/audio/capture.rs
use cpal::traits::{DeviceTrait, HostTrait, StreamTrait};
use std::sync::{Arc, RwLock};

pub struct AudioCapture {
    stream: Option<cpal::Stream>,
    buffer: Arc<RwLock<RingBuffer>>,
    config: AudioConfig,
}

#[derive(Debug, Clone)]
pub struct AudioConfig {
    pub sample_rate: u32,     // Default: 16000 Hz (optimal for STT)
    pub channels: u16,        // Default: 1 (mono)
    pub bits_per_sample: u16, // Default: 16
    pub buffer_size: usize,   // Default: 4096 samples
}

impl AudioCapture {
    pub fn start(&mut self) -> Result<(), AudioError> {
        let host = cpal::default_host();
        let device = host.default_input_device()
            .ok_or(AudioError::NoInputDevice)?;
        let config = cpal::StreamConfig {
            channels: self.config.channels,
            sample_rate: cpal::SampleRate(self.config.sample_rate),
            buffer_size: cpal::BufferSize::Fixed(self.config.buffer_size as u32),
        };
        // Clone the shared buffer handle for the audio callback thread.
        let buffer = self.buffer.clone();
        let stream = device.build_input_stream(
            &config,
            move |data: &[f32], _| {
                buffer.write().unwrap().push_samples(data);
            },
            |err| eprintln!("Audio stream error: {}", err),
            None, // no timeout
        )?;
        stream.play()?;
        self.stream = Some(stream);
        Ok(())
    }
}
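The defaults noted in the struct comments map naturally onto a `Default` implementation. A minimal sketch; this impl is not shown in the source:

impl Default for AudioConfig {
    fn default() -> Self {
        Self {
            sample_rate: 16_000, // 16 kHz covers speech and keeps STT input small
            channels: 1,         // mono is enough; diarization runs on the mixed signal
            bits_per_sample: 16,
            buffer_size: 4096,   // about 256 ms at 16 kHz
        }
    }
}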
| Format | Extension | Use Case | Compression |
|---|---|---|---|
| WAV | .wav | Recording, archival | None (lossless) |
| WebM/Opus | .webm | Browser recording | Lossy (efficient) |
| AAC | .m4a | iOS compatibility | Lossy |
| FLAC | .flac | High-quality archival | Lossless |
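The WAV Encoder stage itself is not shown on this page. In Rust it is commonly implemented with the hound crate; the helper below is a sketch under that assumption, not the confirmed implementation:

use hound::{SampleFormat, WavSpec, WavWriter};

// Hypothetical helper: write captured f32 samples as 16-bit PCM WAV.
pub fn write_wav(path: &std::path::Path, samples: &[f32]) -> Result<(), hound::Error> {
    let spec = WavSpec {
        channels: 1,
        sample_rate: 16_000,
        bits_per_sample: 16,
        sample_format: SampleFormat::Int,
    };
    let mut writer = WavWriter::create(path, spec)?;
    for &s in samples {
        // Convert normalized f32 in [-1.0, 1.0] to i16 PCM.
        writer.write_sample((s.clamp(-1.0, 1.0) * i16::MAX as f32) as i16)?;
    }
    writer.finalize()
}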
// Audio files stored in app data directory
// ~/Library/Application Support/Vista/recordings/
use std::path::PathBuf;
use tauri::Config;

pub fn get_recording_path(recording_id: &str) -> PathBuf {
    let app_dir = tauri::api::path::app_data_dir(&Config::default())
        .expect("Failed to get app data dir");
    app_dir
        .join("recordings")
        .join(format!("{}.wav", recording_id))
}

VAD (Voice Activity Detection) automatically detects speech in the audio signal.

use std::collections::VecDeque;

pub struct VADConfig {
    pub threshold_db: f32,            // Default: -45 dB
    pub min_speech_duration_ms: u32,  // Default: 250 ms
    pub min_silence_duration_ms: u32, // Default: 1500 ms
    pub smoothing_window: u32,        // Default: 3 frames
}

pub struct VoiceActivityDetector {
    config: VADConfig,
    state: VADState,
    energy_history: VecDeque<f32>,
}

impl VoiceActivityDetector {
    pub fn process_frame(&mut self, samples: &[f32]) -> VADResult {
        // 1. Calculate RMS energy
        let energy = calculate_rms(samples);
        let energy_db = 20.0 * energy.log10();
        // 2. Smooth energy over window
        self.energy_history.push_back(energy_db);
        if self.energy_history.len() > self.config.smoothing_window as usize {
            self.energy_history.pop_front();
        }
        let smoothed_energy = self.energy_history.iter().sum::<f32>()
            / self.energy_history.len() as f32;
        // 3. Compare to threshold
        let is_speech = smoothed_energy > self.config.threshold_db;
        // 4. State machine for min durations
        self.update_state(is_speech)
    }
}
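`calculate_rms` and `update_state` are used above but not defined in this excerpt. A typical RMS helper, sketched under that assumption:

// Root-mean-square energy of one frame. Returns an epsilon floor so that
// log10 in the dB conversion above never receives zero.
fn calculate_rms(samples: &[f32]) -> f32 {
    if samples.is_empty() {
        return f32::EPSILON;
    }
    let mean_square = samples.iter().map(|s| s * s).sum::<f32>() / samples.len() as f32;
    mean_square.sqrt().max(f32::EPSILON)
}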
// Frontend events
interface VADEvents {
    onSpeechStart: () => void;
    onSpeechEnd: () => void;
    onSilenceDetected: (duration: number) => void;
    onEnergyLevel: (level: number) => void;
}
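These frontend callbacks presumably correspond to Tauri events emitted from the Rust side. A hedged sketch; the event names and the VADResult variants below are assumptions, not confirmed by this page:

use tauri::Manager;

// Hypothetical bridge: forwards VAD state changes to the webview.
// The VADResult variants are assumed; the page does not define them.
fn emit_vad_event(app: &tauri::AppHandle, result: &VADResult) {
    match result {
        VADResult::SpeechStart => { let _ = app.emit_all("vad:speech-start", ()); }
        VADResult::SpeechEnd => { let _ = app.emit_all("vad:speech-end", ()); }
        VADResult::Silence { duration_ms } => { let _ = app.emit_all("vad:silence", *duration_ms); }
        VADResult::Energy { level_db } => { let _ = app.emit_all("vad:energy", *level_db); }
    }
}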

| Provider | Languages | Accuracy | Latency | Diarization |
|---|---|---|---|---|
| LibraxisAI | PL, EN | 95%+ | ~2 s/min | |
| MLX Whisper | PL, EN | 90%+ | ~3 s/min | |
| OpenAI Whisper | 50+ languages | 95%+ | ~4 s/min | |
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionRequest {
    pub audio_path: String,
    pub language: String,
    pub provider: TranscriptionProvider,
    pub options: TranscriptionOptions,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionOptions {
    pub enable_timestamps: bool,       // Word-level timestamps
    pub enable_diarization: bool,      // Speaker identification
    pub num_speakers: Option<u8>,      // Expected number of speakers
    pub vocabulary_boost: Vec<String>, // Medical terms to prioritize
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionResponse {
    pub text: String,
    pub segments: Vec<TranscriptionSegment>,
    pub language_detected: String,
    pub duration_seconds: f64,
    pub confidence: f32,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionSegment {
    pub id: u32,
    pub start: f64,              // Start time in seconds
    pub end: f64,                // End time in seconds
    pub text: String,
    pub speaker: Option<String>, // Speaker ID if diarization enabled
    pub confidence: f32,
}
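To reach the frontend, a TranscriptionRequest would typically be wrapped in a Tauri command. A sketch under that assumption; the command name and the `transcribe_audio` wiring are illustrative:

// Hypothetical command exposing transcription to the webview.
#[tauri::command]
async fn transcribe_recording(req: TranscriptionRequest) -> Result<TranscriptionResponse, String> {
    let audio = load_audio(std::path::Path::new(&req.audio_path)).map_err(|e| e.to_string())?;
    transcribe_audio(&audio, &req.options)
        .await
        .map_err(|e| e.to_string())
}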

Diarization identifies who is speaking in a recording (the veterinarian vs. the owner).

sequenceDiagram
    participant Audio
    participant VAD as Voice Activity Detection
    participant EMB as Speaker Embeddings
    participant CLU as Clustering
    participant OUT as Output
    Audio->>VAD: Audio segments
    VAD->>EMB: Speech segments
    EMB->>EMB: Extract speaker embeddings
    EMB->>CLU: Embedding vectors
    CLU->>CLU: Cluster into speakers
    CLU->>OUT: Labeled segments
#[derive(Debug, Serialize, Deserialize)]
pub struct DiarizationResult {
    pub speakers: Vec<SpeakerInfo>,
    pub segments: Vec<SpeakerSegment>,
    pub quality: DiarizationQuality,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct SpeakerInfo {
    pub id: String,            // "SPEAKER_00", "SPEAKER_01"
    pub label: Option<String>, // "Lekarz", "Właściciel"
    pub total_speech_time: f64,
    pub confidence: f32,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct SpeakerSegment {
    pub speaker_id: String,
    pub start: f64,
    pub end: f64,
    pub text: String,
}

#[derive(Debug, Serialize, Deserialize)]
pub enum DiarizationQuality {
    High,   // Clear separation, high confidence
    Medium, // Some overlap, moderate confidence
    Low,    // Significant overlap, low confidence
}
CREATE TABLE speaker_segments (
    id TEXT PRIMARY KEY,
    visit_id TEXT NOT NULL REFERENCES visits(visit_id),
    speaker_id TEXT NOT NULL,  -- "SPEAKER_00"
    speaker_label TEXT,        -- "Lekarz"
    start_time REAL NOT NULL,
    end_time REAL NOT NULL,
    text TEXT,
    confidence REAL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_speaker_segments_visit ON speaker_segments(visit_id);
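Writing a DiarizationResult into this table might look like the sketch below (assuming sqlx; speaker_label and confidence are left for a later backfill, since SpeakerSegment does not carry them):

// Sketch: persist each labeled segment as one row.
pub async fn store_speaker_segments(
    pool: &sqlx::SqlitePool,
    visit_id: &str,
    result: &DiarizationResult,
) -> Result<(), sqlx::Error> {
    for seg in &result.segments {
        let id = uuid::Uuid::new_v4().to_string();
        sqlx::query(
            "INSERT INTO speaker_segments (id, visit_id, speaker_id, start_time, end_time, text)
             VALUES (?, ?, ?, ?, ?, ?)",
        )
        .bind(&id)
        .bind(visit_id)
        .bind(&seg.speaker_id)
        .bind(seg.start)
        .bind(seg.end)
        .bind(&seg.text)
        .execute(pool)
        .await?;
    }
    Ok(())
}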

pub async fn process_visit_audio(
    recording_id: &str,
    options: ProcessingOptions,
) -> Result<ProcessingResult, AudioError> {
    let start = std::time::Instant::now();
    // 1. Load audio file
    let audio_path = get_recording_path(recording_id);
    let audio_data = load_audio(&audio_path)?;
    // 2. Transcription
    let transcription = transcribe_audio(&audio_data, &options.stt_options).await?;
    // 3. Diarization (if enabled)
    let diarization = if options.enable_diarization {
        Some(diarize_audio(&audio_data, &transcription).await?)
    } else {
        None
    };
    // 4. Merge transcription with diarization
    let segments = merge_transcription_diarization(
        &transcription.segments,
        diarization.as_ref(),
    );
    // 5. Store results
    store_transcript(recording_id, &transcription).await?;
    if let Some(diar) = &diarization {
        store_diarization(recording_id, diar).await?;
    }
    Ok(ProcessingResult {
        transcript: transcription.text,
        segments,
        diarization,
        processing_time_ms: start.elapsed().as_millis(),
    })
}
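`merge_transcription_diarization` is referenced above without a body. A common approach assigns each transcription segment the speaker whose diarization turn overlaps it most in time; the sketch below is one such implementation, not necessarily the original:

fn merge_transcription_diarization(
    transcript: &[TranscriptionSegment],
    diarization: Option<&DiarizationResult>,
) -> Vec<SpeakerSegment> {
    transcript
        .iter()
        .map(|t| {
            // Pick the diarization turn with maximum time overlap, if any.
            let speaker_id = diarization
                .and_then(|d| {
                    d.segments
                        .iter()
                        .max_by(|a, b| {
                            let ov_a = (a.end.min(t.end) - a.start.max(t.start)).max(0.0);
                            let ov_b = (b.end.min(t.end) - b.start.max(t.start)).max(0.0);
                            ov_a.partial_cmp(&ov_b).unwrap_or(std::cmp::Ordering::Equal)
                        })
                        .map(|s| s.speaker_id.clone())
                })
                .unwrap_or_else(|| "SPEAKER_00".to_string());
            SpeakerSegment {
                speaker_id,
                start: t.start,
                end: t.end,
                text: t.text.clone(),
            }
        })
        .collect()
}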
// Audio processing runs in background via jobs table
pub async fn queue_audio_processing(
    pool: &SqlitePool,
    recording_id: &str,
    options: ProcessingOptions,
) -> Result<String, Error> {
    let job_id = uuid::Uuid::new_v4().to_string();
    sqlx::query!(
        "INSERT INTO jobs (id, type, key, status, payload_json)
         VALUES (?, 'audio_processing', ?, 'queued', ?)",
        job_id,
        recording_id,
        serde_json::to_string(&options)?,
    )
    .execute(pool)
    .await?;
    // Job will be picked up by background worker
    Ok(job_id)
}
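The background worker referred to in the comment could poll the jobs table on a fixed interval. A minimal sketch, assuming the jobs schema above and a tokio runtime:

// Hypothetical worker loop: claims queued audio jobs and processes them.
pub async fn run_audio_worker(pool: sqlx::SqlitePool) {
    loop {
        // Claim the oldest queued audio job, if any.
        let job: Option<(String, String, String)> = sqlx::query_as(
            "SELECT id, key, payload_json FROM jobs
             WHERE type = 'audio_processing' AND status = 'queued'
             ORDER BY rowid LIMIT 1",
        )
        .fetch_optional(&pool)
        .await
        .ok()
        .flatten();

        let Some((job_id, recording_id, payload)) = job else {
            tokio::time::sleep(std::time::Duration::from_secs(1)).await;
            continue;
        };

        let status = match serde_json::from_str::<ProcessingOptions>(&payload) {
            Ok(options) => match process_visit_audio(&recording_id, options).await {
                Ok(_) => "done",
                Err(_) => "failed",
            },
            Err(_) => "failed", // unparseable payload; don't retry forever
        };

        let _ = sqlx::query("UPDATE jobs SET status = ? WHERE id = ?")
            .bind(status)
            .bind(&job_id)
            .execute(&pool)
            .await;
    }
}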

pub struct AudioPreprocessor {
    // Noise reduction
    noise_gate_threshold: f32,
    // Normalization
    target_loudness: f32,
    // Filtering
    highpass_freq: f32,
    lowpass_freq: f32,
}

impl AudioPreprocessor {
    pub fn process(&self, samples: &mut [f32]) {
        // 1. Apply noise gate
        self.apply_noise_gate(samples);
        // 2. High-pass filter (remove rumble)
        self.apply_highpass(samples);
        // 3. Low-pass filter (remove hiss)
        self.apply_lowpass(samples);
        // 4. Normalize loudness
        self.normalize(samples);
    }
}
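The individual filter implementations are not shown. For reference, a first-order high-pass of the kind `apply_highpass` might use; the coefficient math and in-place design are assumptions:

// One-pole high-pass: y[n] = a * (y[n-1] + x[n] - x[n-1]).
// Attenuates content below cutoff_hz (e.g. low-frequency rumble).
fn highpass_in_place(samples: &mut [f32], cutoff_hz: f32, sample_rate: f32) {
    let rc = 1.0 / (2.0 * std::f32::consts::PI * cutoff_hz);
    let dt = 1.0 / sample_rate;
    let a = rc / (rc + dt);
    let mut prev_x = samples.first().copied().unwrap_or(0.0);
    let mut prev_y = 0.0f32;
    for s in samples.iter_mut() {
        let x = *s;
        let y = a * (prev_y + x - prev_x);
        prev_x = x;
        prev_y = y;
        *s = y;
    }
}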
#[derive(Debug, Serialize)]
pub struct AudioQualityReport {
    pub duration_seconds: f64,
    pub sample_rate: u32,
    pub channels: u16,
    // Quality metrics
    pub signal_to_noise_ratio: f32, // dB
    pub peak_level: f32,            // dBFS
    pub average_level: f32,         // dBFS
    pub clipping_detected: bool,
    pub silence_percentage: f32,
    // Recommendations
    pub quality_score: u8, // 0-100
    pub issues: Vec<QualityIssue>,
}

#[derive(Debug, Serialize)]
pub enum QualityIssue {
    TooQuiet,
    TooLoud,
    Clipping,
    ExcessiveNoise,
    ExcessiveSilence,
}
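Two of these metrics can be computed directly from the sample buffer. A sketch; the 0.1% clipped-sample threshold is an assumed heuristic:

// Peak level in dBFS plus a simple clipping check: flag the recording if
// more than 0.1% of samples sit at (or within rounding of) full scale.
fn peak_and_clipping(samples: &[f32]) -> (f32, bool) {
    let peak = samples.iter().fold(0.0f32, |m, s| m.max(s.abs()));
    let peak_dbfs = if peak > 0.0 { 20.0 * peak.log10() } else { f32::NEG_INFINITY };
    let clipped = samples.iter().filter(|s| s.abs() >= 0.999).count();
    (peak_dbfs, clipped > samples.len() / 1000)
}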

pub async fn cleanup_old_recordings(
    pool: &SqlitePool,
    retention_days: i32,
) -> Result<CleanupResult, Error> {
    let cutoff = chrono::Utc::now() - chrono::Duration::days(retention_days as i64);
    // 1. Find recordings older than retention period
    let old_recordings = sqlx::query_as!(
        Recording,
        "SELECT * FROM recordings
         WHERE created_at < ? AND status = 'done'",
        cutoff.to_rfc3339()
    )
    .fetch_all(pool)
    .await?;
    let mut deleted_count = 0;
    let mut freed_bytes = 0u64;
    for recording in old_recordings {
        // 2. Delete file from filesystem
        if let Ok(metadata) = std::fs::metadata(&recording.path) {
            freed_bytes += metadata.len();
            std::fs::remove_file(&recording.path)?;
        }
        // 3. Update database record
        sqlx::query!(
            "UPDATE recordings SET status = 'deleted', path = NULL WHERE id = ?",
            recording.id
        )
        .execute(pool)
        .await?;
        deleted_count += 1;
    }
    Ok(CleanupResult {
        deleted_count,
        freed_bytes,
    })
}
// In user preferences
interface AudioRetentionSettings {
    retentionDays: number;        // 7, 14, 30, 90, 365
    deleteOnVisitFinalize: boolean;
    keepTranscriptsOnly: boolean; // Delete audio, keep text
}