//! MFCC (Mel-Frequency Cepstral Coefficients) from FFT magnitude frames. //! //! Computed from existing STFT magnitudes — no additional FFT needed: //! mel filterbank (26 bands) -> log energy -> DCT-II -> first 13 coefficients. //! Aggregated as mean + variance across frames. /// Number of mel filterbank bands. const NUM_MEL_FILTERS: usize = 26; /// Number of MFCC coefficients to keep from DCT. pub const NUM_MFCC: usize = 13; /// Aggregated MFCC features across all frames. #[derive(Debug, Clone)] pub struct MfccFeatures { pub means: [f64; NUM_MFCC], pub variances: [f64; NUM_MFCC], } impl Default for MfccFeatures { fn default() -> Self { Self { means: [0.0; NUM_MFCC], variances: [0.0; NUM_MFCC], } } } /// Sparse representation of a triangular mel filter. struct MelFilter { start_bin: usize, weights: Vec, } /// Precomputed mel filterbank for a given FFT size and sample rate. struct MelFilterbank { filters: Vec, } /// Convert frequency in Hz to the mel scale: `2595 * log10(1 + hz/700)`. /// The mel scale approximates human pitch perception — equal mel intervals /// sound equally spaced to the ear, even though the Hz intervals grow wider /// at higher frequencies. fn hz_to_mel(hz: f64) -> f64 { 2595.0 * (1.0 + hz / 700.0).log10() } /// Inverse mel scale: convert mel value back to Hz. fn mel_to_hz(mel: f64) -> f64 { 700.0 * (10.0_f64.powf(mel / 2595.0) - 1.0) } /// Build a mel-spaced triangular filterbank for MFCC computation. /// /// Creates `num_filters` overlapping triangular filters spanning 0 Hz to Nyquist, /// spaced uniformly on the mel scale. Each filter has a rising slope from its left /// edge to its center and a falling slope from center to right edge — this smoothly /// bins FFT magnitudes into perceptually meaningful frequency bands. fn build_mel_filterbank(fft_size: usize, sample_rate: u32, num_filters: usize) -> MelFilterbank { let spectrum_len = fft_size / 2 + 1; let freq_bin_hz = sample_rate as f64 / fft_size as f64; let mel_low = hz_to_mel(0.0); let mel_high = hz_to_mel(sample_rate as f64 / 2.0); // num_filters + 2 points define the edges of all triangular filters. let num_points = num_filters + 2; let mel_points: Vec = (0..num_points) .map(|i| mel_low + (mel_high - mel_low) * i as f64 / (num_points - 1) as f64) .collect(); let hz_points: Vec = mel_points.iter().map(|&m| mel_to_hz(m)).collect(); let bin_points: Vec = hz_points .iter() .map(|&hz| ((hz / freq_bin_hz).round() as usize).min(spectrum_len - 1)) .collect(); let mut filters = Vec::with_capacity(num_filters); for i in 0..num_filters { let start = bin_points[i]; let center = bin_points[i + 1]; let end = bin_points[i + 2]; if start >= end || center <= start || center >= end { filters.push(MelFilter { start_bin: 0, weights: Vec::new(), }); continue; } let mut weights = vec![0.0; end - start]; // Rising slope: start -> center for bin in start..center { weights[bin - start] = (bin - start) as f64 / (center - start) as f64; } // Falling slope: center -> end for bin in center..end { weights[bin - start] = (end - bin) as f64 / (end - center) as f64; } filters.push(MelFilter { start_bin: start, weights, }); } MelFilterbank { filters } } /// Compute aggregated MFCC features from per-frame magnitude spectra. /// /// Each frame is a magnitude spectrum from the STFT. Applies a mel filterbank, /// takes log energies, applies DCT-II, and aggregates mean + variance across frames. pub fn compute_mfccs( magnitude_frames: &[Vec], sample_rate: u32, fft_size: usize, ) -> MfccFeatures { if magnitude_frames.is_empty() { return MfccFeatures::default(); } let filterbank = build_mel_filterbank(fft_size, sample_rate, NUM_MEL_FILTERS); let mut all_mfccs: Vec<[f64; NUM_MFCC]> = Vec::with_capacity(magnitude_frames.len()); for frame in magnitude_frames { // Apply mel filterbank let mut mel_energies = [0.0f64; NUM_MEL_FILTERS]; for (i, filter) in filterbank.filters.iter().enumerate() { let mut energy = 0.0; for (j, &w) in filter.weights.iter().enumerate() { let bin = filter.start_bin + j; if bin < frame.len() { energy += w * frame[bin]; } } mel_energies[i] = energy.max(1e-10); } // Log energies let log_energies: [f64; NUM_MEL_FILTERS] = std::array::from_fn(|i| mel_energies[i].ln()); // DCT-II: keep first NUM_MFCC coefficients let n = NUM_MEL_FILTERS as f64; let mut mfccs = [0.0f64; NUM_MFCC]; for (k, coeff) in mfccs.iter_mut().enumerate() { let mut sum = 0.0; for (i, &le) in log_energies.iter().enumerate() { sum += le * (std::f64::consts::PI * k as f64 * (2.0 * i as f64 + 1.0) / (2.0 * n)) .cos(); } *coeff = sum * 2.0; } all_mfccs.push(mfccs); } // Mean across frames let num_frames = all_mfccs.len() as f64; let mut means = [0.0f64; NUM_MFCC]; for mfccs in &all_mfccs { for (i, &v) in mfccs.iter().enumerate() { means[i] += v; } } for m in &mut means { *m /= num_frames; } // Variance across frames let mut variances = [0.0f64; NUM_MFCC]; if all_mfccs.len() > 1 { for mfccs in &all_mfccs { for (i, &v) in mfccs.iter().enumerate() { let diff = v - means[i]; variances[i] += diff * diff; } } for v in &mut variances { *v /= num_frames; } } MfccFeatures { means, variances } } #[cfg(test)] mod tests { use super::*; #[test] fn hz_mel_roundtrip() { for &hz in &[0.0, 100.0, 440.0, 1000.0, 8000.0, 22050.0] { let mel = hz_to_mel(hz); let back = mel_to_hz(mel); assert!( (hz - back).abs() < 0.01, "roundtrip failed for {hz}: got {back}" ); } } #[test] fn filterbank_has_correct_count() { let fb = build_mel_filterbank(1024, 44100, NUM_MEL_FILTERS); assert_eq!(fb.filters.len(), NUM_MEL_FILTERS); } #[test] fn empty_frames_returns_default() { let result = compute_mfccs(&[], 44100, 1024); assert_eq!(result.means, [0.0; NUM_MFCC]); assert_eq!(result.variances, [0.0; NUM_MFCC]); } #[test] fn single_frame_has_zero_variance() { let frame = vec![1.0; 513]; // spectrum_len for fft_size=1024 let result = compute_mfccs(&[frame], 44100, 1024); assert_eq!(result.variances, [0.0; NUM_MFCC]); } #[test] fn different_frames_have_nonzero_variance() { let frame1 = vec![1.0; 513]; let frame2 = vec![2.0; 513]; let result = compute_mfccs(&[frame1, frame2], 44100, 1024); assert!(result.variances.iter().any(|&v| v > 0.0)); } #[test] fn mel_scale_is_monotonic() { let freqs = [100.0, 200.0, 500.0, 1000.0, 2000.0, 5000.0, 10000.0]; for pair in freqs.windows(2) { assert!(hz_to_mel(pair[1]) > hz_to_mel(pair[0])); } } #[test] fn mfcc_values_are_finite() { // Random-ish magnitude spectrum let frame: Vec = (0..513).map(|i| (i as f64 * 0.01).sin().abs() + 0.001).collect(); let result = compute_mfccs(&[frame.clone(), frame], 44100, 1024); for &m in &result.means { assert!(m.is_finite(), "non-finite mean: {m}"); } for &v in &result.variances { assert!(v.is_finite(), "non-finite variance: {v}"); } } }