-
Notifications
You must be signed in to change notification settings - Fork 282
Add live audio transcription streaming support to Foundry Local Rust SDK #613
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
fd22350
ab41774
2d6eb8c
5862384
1b23343
d8459b2
0f8ae7a
d358aa6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,65 @@ | ||
| # Codex Feedback: Rust Live Audio Streaming Review | ||
|
|
||
| ## Outcome | ||
|
|
||
| The live-streaming feature is **functionally working end-to-end**: | ||
|
|
||
| **Microphone -> Rust SDK -> core.dll -> onnxruntime.dll / onnxruntime-genai.dll** | ||
|
|
||
| The runtime path was validated (including device detection, session start/stop, and no native errors during streaming flow). | ||
|
|
||
| --- | ||
|
|
||
| ## API Parity Comparison (Rust vs C#) | ||
|
|
||
| ### ✅ Matching areas | ||
|
|
||
| 1. Factory method exists in both SDKs: | ||
| - C#: `CreateLiveTranscriptionSession()` | ||
| - Rust: `create_live_transcription_session()` | ||
|
|
||
| 2. Core command flow is aligned: | ||
| - `audio_stream_start` | ||
| - `audio_stream_push` (binary payload path) | ||
| - `audio_stream_stop` | ||
|
|
||
| 3. Session lifecycle shape exists in both: | ||
| - start -> append/push -> stream results -> stop | ||
|
|
||
| 4. Settings coverage is aligned: | ||
| - sample rate, channels, bits per sample, language, queue capacity | ||
|
|
||
| 5. **[RESOLVED]** Cancellation semantics: | ||
| - Rust now accepts `Option<CancellationToken>` on `start()`, `append()`, `stop()` | ||
| - `stop()` uses cancel-safe pattern matching C# `StopAsync` | ||
|
|
||
| 6. **[RESOLVED]** Response surface shape: | ||
| - Rust response now has `content: Vec<ContentPart>` with `text`/`transcript` fields | ||
| - Callers use `result.content[0].text` — identical to C# `Content[0].Text` | ||
|
|
||
| 7. **[RESOLVED]** Disposal contract: | ||
| - `Drop` performs synchronous best-effort `audio_stream_stop` | ||
|
|
||
| --- | ||
|
|
||
| ### Remaining minor differences (by design) | ||
|
|
||
| 1. **Stream accessor is single-take** — Rust `get_transcription_stream()` moves the receiver out (one call per session). C# returns `IAsyncEnumerable` from the channel reader directly. Functionally equivalent. | ||
|
|
||
| 2. **Cancellation token type** — Rust uses `tokio_util::sync::CancellationToken`; C# uses `System.Threading.CancellationToken`. Both serve the same purpose with idiomatic patterns. | ||
|
|
||
| --- | ||
|
|
||
| ## Reliability / Safety Notes | ||
|
|
||
| 1. FFI binary pointer handling for empty slices uses `std::ptr::null()` to avoid dangling-pointer risk. | ||
| 2. Native session cleanup on drop includes best-effort `audio_stream_stop` to reduce leak risk. | ||
| 3. Cancel-safe stop always completes native session cleanup even if cancellation fires. | ||
|
|
||
| --- | ||
|
|
||
| ## Final Assessment | ||
|
|
||
| - **Feature status**: Working | ||
| - **E2E path**: Verified (microphone → SDK → core.dll → onnxruntime-genai) | ||
| - **Parity status**: API-identical to C# (cancellation, response envelope, disposal) | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,11 @@ | ||||||
| [package] | ||||||
| name = "live-audio-transcription-example" | ||||||
| version = "0.1.0" | ||||||
| edition = "2021" | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| description = "Live audio transcription (streaming) example using the Foundry Local Rust SDK" | ||||||
|
|
||||||
| [dependencies] | ||||||
| foundry-local-sdk = { path = "../../../sdk/rust" } | ||||||
| tokio = { version = "1", features = ["rt-multi-thread", "macros"] } | ||||||
| tokio-stream = "0.1" | ||||||
| cpal = "0.15" | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,277 @@ | ||
| // Live Audio Transcription — Foundry Local Rust SDK Example | ||
| // | ||
| // Demonstrates real-time microphone-to-text using: | ||
| // Microphone (cpal) → SDK → Core (NativeAOT DLL) → onnxruntime-genai (StreamingProcessor) | ||
| // | ||
| // Usage: | ||
| // cargo run # Live microphone transcription (press ENTER to stop) | ||
| // cargo run -- --synth # Use synthetic 440Hz sine wave instead of microphone | ||
|
|
||
| use std::env; | ||
| use std::io::{self, Write}; | ||
| use std::sync::Arc; | ||
|
|
||
| use cpal::traits::{DeviceTrait, HostTrait, StreamTrait}; | ||
| use foundry_local_sdk::{FoundryLocalConfig, FoundryLocalManager}; | ||
| use tokio_stream::StreamExt; | ||
|
|
||
| const ALIAS: &str = "nemotron"; | ||
|
|
||
| #[tokio::main] | ||
| async fn main() -> Result<(), Box<dyn std::error::Error>> { | ||
| let use_synth = env::args().any(|a| a == "--synth"); | ||
|
|
||
| println!("==========================================================="); | ||
| println!(" Foundry Local -- Live Audio Transcription Demo (Rust)"); | ||
| println!("==========================================================="); | ||
| println!(); | ||
|
|
||
| // ── 1. Resolve e2e-test-pkgs path ──────────────────────────────────── | ||
| let exe_dir = env::current_exe()?.parent().unwrap().to_path_buf(); | ||
|
|
||
| let manifest_dir = env!("CARGO_MANIFEST_DIR"); | ||
| let e2e_pkgs = std::path::PathBuf::from(manifest_dir) | ||
| .join("..") | ||
| .join("e2e-test-pkgs"); | ||
|
|
||
| let (core_path, model_cache_dir) = if e2e_pkgs.exists() { | ||
| let core = e2e_pkgs | ||
| .canonicalize() | ||
| .expect("Failed to canonicalize e2e-test-pkgs path"); | ||
| let models = core.join("models"); | ||
| println!("Using e2e-test-pkgs:"); | ||
| println!(" Core DLLs: {}", core.display()); | ||
| println!(" Models: {}", models.display()); | ||
| ( | ||
| core.to_string_lossy().into_owned(), | ||
| models.to_string_lossy().into_owned(), | ||
| ) | ||
| } else { | ||
| println!("Using default paths (exe directory)"); | ||
| ( | ||
| exe_dir.to_string_lossy().into_owned(), | ||
| exe_dir.join("models").to_string_lossy().into_owned(), | ||
| ) | ||
| }; | ||
|
|
||
| // ── 2. Initialise the manager ──────────────────────────────────────── | ||
| let config = FoundryLocalConfig::new("foundry_local_samples") | ||
| .library_path(&core_path) | ||
| .model_cache_dir(&model_cache_dir) | ||
| .additional_setting("Bootstrap", "false"); | ||
|
|
||
| let manager = FoundryLocalManager::create(config)?; | ||
| println!("✓ FoundryLocalManager initialized\n"); | ||
|
|
||
| // ── 3. Get the nemotron model ──────────────────────────────────────── | ||
| let model = manager.catalog().get_model(ALIAS).await?; | ||
| println!("Model: {} (id: {})", model.alias(), model.id()); | ||
|
|
||
| if !model.is_cached().await? { | ||
| println!("Downloading model..."); | ||
| model | ||
| .download(Some(|progress: f64| { | ||
| print!("\r {progress:.1}%"); | ||
| io::stdout().flush().ok(); | ||
| })) | ||
| .await?; | ||
| println!(); | ||
| } | ||
|
|
||
| println!("Loading model..."); | ||
| model.load().await?; | ||
| println!("✓ Model loaded\n"); | ||
|
|
||
| // ── 4. Create live transcription session ───────────────────────────── | ||
| let audio_client = model.create_audio_client(); | ||
| let session = Arc::new(audio_client.create_live_transcription_session()); | ||
|
|
||
| println!("Starting live transcription session..."); | ||
| session.start(None).await?; | ||
| println!("✓ Session started\n"); | ||
|
|
||
| // ── 5. Start reading transcription results in background ───────────── | ||
| let mut stream = session.get_transcription_stream()?; | ||
| let read_task = tokio::spawn(async move { | ||
| let mut count = 0usize; | ||
| while let Some(result) = stream.next().await { | ||
| match result { | ||
| Ok(r) => { | ||
| let text = &r.content[0].text; | ||
| if r.is_final { | ||
| println!(); | ||
| println!(" [FINAL] {text}"); | ||
| io::stdout().flush().ok(); | ||
| } else if !text.is_empty() { | ||
| print!("{text}"); | ||
| io::stdout().flush().ok(); | ||
| } | ||
| count += 1; | ||
| } | ||
| Err(e) => { | ||
| eprintln!("\n [ERROR] Stream error: {e}"); | ||
| break; | ||
| } | ||
| } | ||
| } | ||
| count | ||
| }); | ||
|
|
||
| if use_synth { | ||
| // ── 6a. Synthetic audio mode ───────────────────────────────────── | ||
| println!("Generating synthetic PCM audio (440Hz sine wave, 3 seconds)...\n"); | ||
|
|
||
| println!("==========================================================="); | ||
| println!(" PUSHING AUDIO → SDK → Core → onnxruntime-genai"); | ||
| println!("===========================================================\n"); | ||
|
|
||
| let pcm_data = generate_sine_wave_pcm(16000, 3, 440.0); | ||
| let chunk_size = 16000 / 10 * 2; // 100ms chunks | ||
| let mut chunks_pushed = 0; | ||
| for offset in (0..pcm_data.len()).step_by(chunk_size) { | ||
| let end = std::cmp::min(offset + chunk_size, pcm_data.len()); | ||
| session.append(&pcm_data[offset..end], None).await?; | ||
| chunks_pushed += 1; | ||
| } | ||
| println!("Pushed {chunks_pushed} chunks ({} bytes)", pcm_data.len()); | ||
| } else { | ||
| // ── 6b. Live microphone mode ───────────────────────────────────── | ||
| let host = cpal::default_host(); | ||
| let device = host | ||
| .default_input_device() | ||
| .expect("No input audio device available"); | ||
| println!("Microphone: {}", device.name().unwrap_or_default()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is default? |
||
|
|
||
| // Query the device's default input config and adapt | ||
| let default_config = device.default_input_config()?; | ||
| println!( | ||
| "Device default: {} Hz, {} ch, {:?}", | ||
| default_config.sample_rate().0, | ||
| default_config.channels(), | ||
| default_config.sample_format() | ||
| ); | ||
|
|
||
| let device_rate = default_config.sample_rate().0; | ||
| let device_channels = default_config.channels(); | ||
| let mic_config: cpal::StreamConfig = default_config.into(); | ||
|
|
||
| let session_for_mic = Arc::clone(&session); | ||
| let rt = tokio::runtime::Handle::current(); | ||
|
|
||
| // Build the stream with the device's native sample format (f32) | ||
| // and convert to 16kHz/16-bit/mono PCM for the SDK | ||
| let input_stream = device.build_input_stream( | ||
| &mic_config, | ||
| move |data: &[f32], _: &cpal::InputCallbackInfo| { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we move this conversion logic into a named function (e.g. |
||
| // Step 1: Mix to mono if stereo+ | ||
| let mono: Vec<f32> = if device_channels > 1 { | ||
| data.chunks(device_channels as usize) | ||
| .map(|frame| frame.iter().sum::<f32>() / device_channels as f32) | ||
| .collect() | ||
| } else { | ||
| data.to_vec() | ||
| }; | ||
|
|
||
| // Step 2: Resample to 16kHz if device rate differs | ||
| let resampled = if device_rate != 16000 { | ||
| resample(&mono, device_rate, 16000) | ||
| } else { | ||
| mono | ||
| }; | ||
|
|
||
| // Step 3: Convert f32 → i16 → little-endian bytes | ||
| let bytes: Vec<u8> = resampled | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can it be recorded in this format or? We need to show low latency here, and cannot afford iter() which is slow in Rust. |
||
| .iter() | ||
| .flat_map(|&s| { | ||
| let clamped = s.clamp(-1.0, 1.0); | ||
| let sample = (clamped * i16::MAX as f32) as i16; | ||
| sample.to_le_bytes() | ||
| }) | ||
| .collect(); | ||
|
|
||
| if !bytes.is_empty() { | ||
| let session_ref = Arc::clone(&session_for_mic); | ||
| rt.spawn(async move { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider not doing this, and utilizing channels. It will be faster + you can remove unnecessary Arc clone above |
||
| if let Err(e) = session_ref.append(&bytes, None).await { | ||
| eprintln!("Append error: {e}"); | ||
| } | ||
| }); | ||
| } | ||
| }, | ||
| |err| eprintln!("Microphone stream error: {err}"), | ||
| None, | ||
| )?; | ||
|
|
||
| input_stream.play()?; | ||
|
|
||
| println!(); | ||
| println!("==========================================================="); | ||
| println!(" LIVE TRANSCRIPTION ACTIVE"); | ||
| println!(" Speak into your microphone."); | ||
| println!(" Transcription appears in real-time."); | ||
| println!(" Press ENTER to stop recording."); | ||
| println!("==========================================================="); | ||
| println!(); | ||
|
|
||
| // Block until user presses ENTER | ||
| let mut line = String::new(); | ||
| io::stdin().read_line(&mut line)?; | ||
|
|
||
| drop(input_stream); | ||
| println!("Microphone stopped."); | ||
| } | ||
|
|
||
| // ── 7. Stop session and wait for results ───────────────────────────── | ||
| println!("\nStopping session (flushing remaining audio)..."); | ||
| session.stop(None).await?; | ||
| println!("✓ Session stopped\n"); | ||
|
|
||
| let result_count = read_task.await?; | ||
|
|
||
| println!("==========================================================="); | ||
| println!(" Total transcription results: {result_count}"); | ||
| println!("==========================================================="); | ||
|
|
||
| // ── 8. Cleanup ─────────────────────────────────────────────────────── | ||
| println!("\nUnloading model..."); | ||
| model.unload().await?; | ||
| println!("Done."); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
/// Generate synthetic PCM audio: a half-amplitude sine wave at `frequency` Hz,
/// sampled at `sample_rate` Hz, encoded as 16-bit signed little-endian mono.
fn generate_sine_wave_pcm(sample_rate: i32, duration_seconds: i32, frequency: f64) -> Vec<u8> {
    let total_samples = (sample_rate * duration_seconds) as usize;
    // Half-scale amplitude leaves headroom and avoids clipping.
    let amplitude = i16::MAX as f64 * 0.5;

    (0..total_samples)
        .flat_map(|n| {
            let t = n as f64 / sample_rate as f64;
            let sample = (amplitude * (2.0 * std::f64::consts::PI * frequency * t).sin()) as i16;
            sample.to_le_bytes()
        })
        .collect()
}
|
|
||
/// Simple linear-interpolation resampler (e.g. 48kHz → 16kHz).
///
/// Each output sample sits at fractional position `i * from_rate / to_rate`
/// in the input and is interpolated between its two neighbours; positions
/// past the end are clamped to the last input sample.
fn resample(input: &[f32], from_rate: u32, to_rate: u32) -> Vec<f32> {
    if from_rate == to_rate {
        return input.to_vec();
    }
    let step = from_rate as f64 / to_rate as f64;
    let out_len = (input.len() as f64 / step).ceil() as usize;
    let last = input.len().saturating_sub(1);

    (0..out_len)
        .map(|i| {
            let pos = i as f64 * step;
            let idx = pos as usize;
            let frac = (pos - idx as f64) as f32;
            let s0 = input[idx.min(last)];
            let s1 = input[(idx + 1).min(last)];
            s0 + (s1 - s0) * frac
        })
        .collect()
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's remove this file