
WhisperX Integration

Vista supports multiple STT providers with automatic failover:

| Provider | Languages | Accuracy | Latency | Diarization |
| --- | --- | --- | --- | --- |
| LibraxisAI | PL, EN | 95%+ | ~2s/min | Yes |
| MLX Whisper | PL, EN | 90%+ | ~3s/min | No |
| OpenAI Whisper | 50+ languages | 95%+ | ~4s/min | No |

```mermaid
flowchart TD
    REQ[🎤 Audio File] --> DETECT{Detect Best Provider}
    DETECT -->|Test 1| LIB{LibraxisAI Available?}
    LIB -->|Yes| LIB_USE[Use LibraxisAI]
    LIB -->|No| MLX{Local MLX Available?}
    MLX -->|Yes| MLX_USE[Use MLX Whisper]
    MLX -->|No| OAI{OpenAI Key Set?}
    OAI -->|Yes| OAI_USE[Use OpenAI Whisper]
    OAI -->|No| FAIL[❌ No STT Available]
    LIB_USE --> RESULT[📝 Transcript]
    MLX_USE --> RESULT
    OAI_USE --> RESULT
```
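
The same chain can be sketched client-side. A minimal orchestrator, assuming the three `transcribeWith…` helpers defined in the sections below (the authoritative detection logic lives in the Rust command shown later):

```ts
// Client-side sketch of the fallback chain from the flowchart above.
// Assumes the transcribeWithLibraxis / transcribeWithMLX / transcribeWithOpenAI
// helpers defined in the following sections.
const transcribeWithFailover = async (
  audioFile: File,
  libraxisKey: string,
  openaiKey?: string,
  language: string = 'pl'
): Promise<TranscriptionResponse> => {
  try {
    return await transcribeWithLibraxis(audioFile, libraxisKey, language);
  } catch {
    // LibraxisAI unreachable; fall through to the local MLX server
  }
  try {
    return await transcribeWithMLX(audioFile);
  } catch {
    // No local MLX server; last resort is OpenAI
  }
  if (!openaiKey) {
    throw new Error('No STT provider available'); // FAIL branch in the flowchart
  }
  return transcribeWithOpenAI(audioFile, openaiKey);
};
```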

```ts
const LIBRAXIS_STT = 'https://stt.libraxis.cloud/v1/transcribe/file';

// Limits:
//   Max file size: 50MB
//   Supported formats: mp3, wav, webm, mp4, flac
//   Languages: pl, en (with auto-detection)

const transcribeWithLibraxis = async (
  audioFile: File,
  apiKey: string, // LibraxisAI API key
  language: string = 'pl'
): Promise<TranscriptionResponse> => {
  const formData = new FormData();
  formData.append('file', audioFile);
  formData.append('language', language);
  formData.append('enable_diarization', 'true');
  formData.append('word_timestamps', 'true');

  const response = await fetch(LIBRAXIS_STT, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${apiKey}`,
    },
    body: formData,
  });
  return response.json();
};
```
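
With `enable_diarization` and `word_timestamps` enabled, each returned segment carries a speaker label and timing (see the `TranscriptionResponse` shape further down). A small usage sketch that renders a speaker-attributed transcript:

```ts
// Render "[12.3s] speaker_0: ..." lines from a diarized response.
// Field names follow the TranscriptionResponse structs documented below.
const formatTranscript = (result: TranscriptionResponse): string =>
  result.segments
    .map((s) => `[${s.start.toFixed(1)}s] ${s.speaker ?? 'unknown'}: ${s.text.trim()}`)
    .join('\n');

// audioFile and apiKey are supplied by the caller; language defaults to 'pl'.
const result = await transcribeWithLibraxis(audioFile, apiKey);
console.log(formatTranscript(result));
```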

```bash
# Install
pip install mlx-whisper

# Start the server
mlx_whisper.server --model mlx-community/whisper-large-v3-mlx --port 1911
```
```ts
const MLX_STT = 'http://localhost:1911/v1/audio/transcriptions';

const transcribeWithMLX = async (audioFile: File): Promise<TranscriptionResponse> => {
  const formData = new FormData();
  formData.append('file', audioFile);
  formData.append('model', 'whisper-large-v3');
  formData.append('language', 'pl');
  formData.append('response_format', 'verbose_json');

  const response = await fetch(MLX_STT, {
    method: 'POST',
    body: formData,
  });
  return response.json();
};
```
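
Before routing to the local server it helps to know whether anything is listening on port 1911. The probe below is a sketch only: mlx-whisper does not document a dedicated health endpoint here, so it time-boxes a HEAD request and treats any HTTP response as proof the server is up.

```ts
// Time-boxed reachability probe for the local MLX server. Any HTTP response
// (even 404/405) means something is listening on :1911; a network error or
// the 2-second timeout means it is not.
const isMLXAvailable = async (): Promise<boolean> => {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 2000);
  try {
    await fetch(MLX_STT, { method: 'HEAD', signal: controller.signal });
    return true;
  } catch {
    return false;
  } finally {
    clearTimeout(timer);
  }
};
```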

```ts
const OPENAI_STT = 'https://api.openai.com/v1/audio/transcriptions';

const transcribeWithOpenAI = async (
  audioFile: File,
  openaiApiKey: string
): Promise<TranscriptionResponse> => {
  const formData = new FormData();
  formData.append('file', audioFile);
  formData.append('model', 'whisper-1');
  formData.append('language', 'pl');
  formData.append('response_format', 'verbose_json');
  formData.append('timestamp_granularities[]', 'word');
  formData.append('timestamp_granularities[]', 'segment');

  const response = await fetch(OPENAI_STT, {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${openaiApiKey}`,
    },
    body: formData,
  });
  return response.json();
};
```
| Model | Cost |
| --- | --- |
| whisper-1 | $0.006 / minute |
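
At that rate cost scales linearly with audio length, so a per-visit estimate is a one-liner:

```ts
// whisper-1 pricing from the table above: $0.006 per minute of audio.
const WHISPER_USD_PER_MINUTE = 0.006;

const estimateWhisperCost = (durationSeconds: number): number =>
  (durationSeconds / 60) * WHISPER_USD_PER_MINUTE;

estimateWhisperCost(15 * 60); // a 15-minute consultation costs about $0.09
```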

src-tauri/src/commands/audio/stt/api.rs
```rust
use std::time::Duration;

#[tauri::command]
pub async fn detect_best_stt_endpoint() -> Result<SttEndpoint, String> {
    // Test order: LibraxisAI → MLX → OpenAI
    if test_libraxis_stt().await.is_ok() {
        return Ok(SttEndpoint::Libraxis);
    }
    if test_mlx_stt().await.is_ok() {
        return Ok(SttEndpoint::LocalMLX);
    }
    if test_openai_stt().await.is_ok() {
        return Ok(SttEndpoint::OpenAI);
    }
    Ok(SttEndpoint::None) // All services unavailable
}

async fn test_libraxis_stt() -> Result<(), Error> {
    let client = reqwest::Client::new();
    let response = client
        .get("https://stt.libraxis.cloud/health")
        .timeout(Duration::from_secs(5))
        .send()
        .await?;
    if response.status().is_success() {
        Ok(())
    } else {
        Err(Error::ServiceUnavailable)
    }
}
```

```rust
use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionRequest {
    pub audio_path: String,
    pub language: String,
    pub provider: TranscriptionProvider,
    pub options: TranscriptionOptions,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionOptions {
    pub enable_timestamps: bool,       // Word-level timestamps
    pub enable_diarization: bool,      // Speaker identification
    pub num_speakers: Option<u8>,      // Expected number of speakers
    pub vocabulary_boost: Vec<String>, // Medical terms to prioritize
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionResponse {
    pub text: String, // Full transcript text
    pub segments: Vec<TranscriptionSegment>,
    pub language_detected: String,
    pub duration_seconds: f64,
    pub confidence: f32,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct TranscriptionSegment {
    pub id: u32,
    pub start: f64, // Start time in seconds
    pub end: f64,   // End time in seconds
    pub text: String,
    pub speaker: Option<String>, // Speaker ID if diarization enabled
    pub confidence: f32,
    pub words: Option<Vec<WordTimestamp>>,
}

#[derive(Debug, Serialize, Deserialize)]
pub struct WordTimestamp {
    pub word: String,
    pub start: f64,
    pub end: f64,
    pub confidence: f32,
}
```
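
The TypeScript snippets above all resolve to a `TranscriptionResponse`; a frontend mirror of these Rust structs (serde keeps the snake_case field names by default, so the TS side matches them one-to-one):

```ts
// Frontend mirror of the Rust response/option structs above.
interface WordTimestamp {
  word: string;
  start: number;
  end: number;
  confidence: number;
}

interface TranscriptionSegment {
  id: number;
  start: number; // start time in seconds
  end: number; // end time in seconds
  text: string;
  speaker?: string; // present when diarization is enabled
  confidence: number;
  words?: WordTimestamp[];
}

interface TranscriptionResponse {
  text: string; // full transcript text
  segments: TranscriptionSegment[];
  language_detected: string;
  duration_seconds: number;
  confidence: number;
}

interface TranscriptionOptions {
  enable_timestamps: boolean; // word-level timestamps
  enable_diarization: boolean; // speaker identification
  num_speakers?: number; // expected number of speakers
  vocabulary_boost: string[]; // medical terms to prioritize
}
```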

```rust
#[tauri::command]
pub async fn start_transcription(
    db: State<'_, Database>,
    recording_id: String,
    provider: String, // "auto", "libraxis", "mlx", "openai"
    language: String, // "pl", "en"
) -> Result<TranscriptionJob, String> {
    // 1. Get recording metadata
    let recording = get_recording(&db, &recording_id).await?;

    // 2. Select provider
    let selected_provider = if provider == "auto" {
        detect_best_stt_endpoint().await?
    } else {
        SttEndpoint::from_str(&provider).map_err(|e| e.to_string())?
    };

    // 3. Create job record
    let job_id = uuid::Uuid::new_v4().to_string();
    let payload = serde_json::to_string(&TranscriptionJobPayload {
        provider: selected_provider,
        language,
    })
    .map_err(|e| e.to_string())?;
    sqlx::query!(
        "INSERT INTO jobs (id, type, key, status, payload_json)
         VALUES (?, 'transcription', ?, 'queued', ?)",
        job_id,
        recording_id,
        payload
    )
    .execute(&db.pool)
    .await
    .map_err(|e| e.to_string())?;

    // 4. Update recording status
    sqlx::query!(
        "UPDATE recordings SET status = 'transcribing' WHERE id = ?",
        recording_id
    )
    .execute(&db.pool)
    .await
    .map_err(|e| e.to_string())?;

    Ok(TranscriptionJob {
        job_id,
        status: "queued".to_string(),
    })
}
```
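
From the UI this command is reached through Tauri's `invoke`. A sketch, assuming Tauri v2 import paths; note that Tauri exposes the snake_case Rust arguments as camelCase on the JS side:

```ts
import { invoke } from '@tauri-apps/api/core';

interface TranscriptionJob {
  job_id: string;
  status: string; // "queued"
}

// recordingId is a hypothetical id of an existing recording row.
const job = await invoke<TranscriptionJob>('start_transcription', {
  recordingId: 'rec-42',
  provider: 'auto',
  language: 'pl',
});
console.log(`transcription job ${job.job_id}: ${job.status}`);
```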

Vista can boost the likelihood that medical terms are recognized correctly:

```ts
const veterinaryVocabulary = [
  // Diagnoses
  'zapalenie', 'infekcja', 'nowotwór', 'alergia',
  // Procedures
  'kastracja', 'sterylizacja', 'biopsja', 'USG',
  // Medications
  'antybiotyk', 'szczepionka', 'znieczulenie',
  // Anatomy
  'wątroba', 'nerka', 'serce', 'płuca',
];

// Include in transcription request
const options: TranscriptionOptions = {
  enable_timestamps: true,
  enable_diarization: true,
  vocabulary_boost: veterinaryVocabulary,
};
```
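
Since the LibraxisAI request passes each option as a separate multipart field, mapping a `TranscriptionOptions` object onto FormData takes a few lines. Sketch only: `enable_diarization` and `word_timestamps` match the request shown earlier, while `vocabulary_boost` as a wire field name is an assumption.

```ts
// Map TranscriptionOptions onto the multipart fields used by the
// LibraxisAI request. The 'vocabulary_boost' field name is hypothetical;
// the other two match the transcribeWithLibraxis example above.
const appendOptions = (form: FormData, options: TranscriptionOptions): void => {
  form.append('word_timestamps', String(options.enable_timestamps));
  form.append('enable_diarization', String(options.enable_diarization));
  if (options.vocabulary_boost.length > 0) {
    form.append('vocabulary_boost', JSON.stringify(options.vocabulary_boost));
  }
};
```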