//! Near-duplicate detection via peak envelope fingerprinting. //! //! Computes a downsampled peak envelope (100 Hz), quantizes to bytes, and stores //! as a BLOB. Compares envelopes via normalized cross-correlation to find //! re-encoded or slightly trimmed copies of the same sample. use crate::db::Database; use crate::error::{unix_now, CoreError, Result}; use crate::vp_tree::VpTree; use tracing::instrument; /// Peak envelope sample rate: 100 peak values per second of audio. const ENVELOPE_RATE: u32 = 100; /// Correlation score below this threshold indicates a near-duplicate. /// 0.0 = identical envelopes, 1.0 = completely different. const DUPLICATE_THRESHOLD: f64 = 0.05; /// A stored fingerprint: peak envelope at fixed rate + metadata. pub struct Fingerprint { pub hash: String, pub envelope: Vec, pub sample_rate: u32, } /// A near-duplicate search result. pub struct DuplicateResult { pub hash: String, /// Distance score: 0.0 = identical, 1.0 = completely different. pub score: f64, /// Detected time offset in milliseconds (positive = b starts later). pub offset_ms: i64, } /// Compute a peak envelope from mono f32 samples. /// /// Divides audio into blocks of `sample_rate / ENVELOPE_RATE` samples, /// takes the absolute peak of each block, normalizes to global max, /// and quantizes to 0..255. #[instrument(skip_all)] pub fn compute_envelope(samples: &[f32], sample_rate: u32) -> Vec { if samples.is_empty() || sample_rate == 0 { return Vec::new(); } let block_size = (sample_rate / ENVELOPE_RATE) as usize; if block_size == 0 { return Vec::new(); } let num_blocks = samples.len().div_ceil(block_size); let mut peaks: Vec = Vec::with_capacity(num_blocks); for chunk in samples.chunks(block_size) { let peak = chunk.iter().fold(0.0f32, |acc, &s| acc.max(s.abs())); peaks.push(peak); } let global_max = peaks.iter().fold(0.0f32, |acc, &p| acc.max(p)); if global_max < f32::EPSILON { return vec![0u8; peaks.len()]; } peaks .iter() .map(|&p| (p / global_max * 255.0).round().clamp(0.0, 255.0) as u8) .collect() } /// Save a fingerprint to the database. #[instrument(skip_all)] pub fn save_fingerprint(db: &Database, fp: &Fingerprint) -> Result<()> { let now = unix_now(); db.conn().execute( "INSERT OR REPLACE INTO fingerprints (hash, envelope, sample_rate, generated_at) VALUES (?1, ?2, ?3, ?4)", rusqlite::params![fp.hash, fp.envelope, fp.sample_rate, now], )?; Ok(()) } /// Load a fingerprint from the database. #[instrument(skip_all)] pub fn load_fingerprint(db: &Database, hash: &str) -> Result { db.conn() .query_row( "SELECT hash, envelope, sample_rate FROM fingerprints WHERE hash = ?1", [hash], |row| { Ok(Fingerprint { hash: row.get(0)?, envelope: row.get(1)?, sample_rate: row.get(2)?, }) }, ) .map_err(|_| CoreError::SampleNotFound(hash.to_string())) } /// Find near-duplicates of the given sample by comparing peak envelopes (linear scan). /// /// Loads all fingerprints, computes envelope distance against the reference, /// filters by threshold, and returns the top `limit` results sorted by score. /// /// O(n) linear scan. For indexed sub-linear queries, use [`FingerprintIndex`]. #[instrument(skip_all, fields(hash = %hash))] pub fn find_near_duplicates( db: &Database, hash: &str, limit: usize, ) -> Result> { let reference = load_fingerprint(db, hash)?; let mut stmt = db.conn().prepare( "SELECT hash, envelope, sample_rate FROM fingerprints WHERE hash != ?1", )?; let others: Vec = stmt .query_map([hash], |row| { Ok(Fingerprint { hash: row.get(0)?, envelope: row.get(1)?, sample_rate: row.get(2)?, }) })? .collect::, _>>()?; let mut results: Vec = others .iter() .filter_map(|other| { let (score, offset_ms) = envelope_distance(&reference.envelope, &other.envelope); if score <= DUPLICATE_THRESHOLD { Some(DuplicateResult { hash: other.hash.clone(), score, offset_ms, }) } else { None } }) .collect(); results.sort_by(|a, b| a.score.total_cmp(&b.score)); results.truncate(limit); Ok(results) } /// Compute normalized cross-correlation distance between two envelopes. /// /// Returns `(score, offset_ms)` where score is `1.0 - best_correlation` /// (0.0 = identical, 1.0 = uncorrelated) and offset_ms is the time shift /// at which the best alignment was found. #[instrument(skip_all)] pub fn envelope_distance(a: &[u8], b: &[u8]) -> (f64, i64) { if a.is_empty() || b.is_empty() { return (1.0, 0); } let len_ratio = a.len().max(b.len()) as f64 / a.len().min(b.len()) as f64; if len_ratio > 2.0 { return (1.0, 0); } // Ensure `longer` is the longer envelope let (longer, shorter, sign) = if a.len() >= b.len() { (a, b, 1i64) } else { (b, a, -1i64) }; let len_diff = longer.len() - shorter.len(); // 50 blocks = 0.5s tolerance at 100 Hz envelope rate let max_shift = len_diff + 50; let mut best_corr = f64::MIN; let mut best_offset: i64 = 0; // Slide shorter over longer at each offset from -max_shift to +max_shift. // Negative offset means shorter starts before longer (we compare a prefix of shorter // against the start of longer). let shift_range = max_shift as i64; for offset in -shift_range..=shift_range { // Compute overlap region indices let a_start = offset.max(0) as usize; let b_start = (-offset).max(0) as usize; let a_end = (longer.len() as i64).min(shorter.len() as i64 + offset).max(0) as usize; let b_end = (shorter.len() as i64).min(longer.len() as i64 - offset).max(0) as usize; let n = a_end.saturating_sub(a_start).min(b_end.saturating_sub(b_start)); if n < 10 { continue; // Too little overlap for meaningful correlation } let a_slice = &longer[a_start..a_start + n]; let b_slice = &shorter[b_start..b_start + n]; let corr = normalized_cross_correlation(a_slice, b_slice); if corr > best_corr { best_corr = corr; best_offset = offset; } } if best_corr < f64::MIN + 1.0 { return (1.0, 0); } let score = 1.0 - best_corr.clamp(0.0, 1.0); // Convert block offset to milliseconds (1 block = 1000/ENVELOPE_RATE ms = 10ms) let offset_ms = best_offset * sign * 10; (score, offset_ms) } /// Normalized cross-correlation between two u8 slices of equal length. fn normalized_cross_correlation(a: &[u8], b: &[u8]) -> f64 { let n = a.len() as f64; let mean_a: f64 = a.iter().map(|&v| v as f64).sum::() / n; let mean_b: f64 = b.iter().map(|&v| v as f64).sum::() / n; let mut sum_ab = 0.0; let mut sum_a2 = 0.0; let mut sum_b2 = 0.0; for (&va, &vb) in a.iter().zip(b.iter()) { let da = va as f64 - mean_a; let db = vb as f64 - mean_b; sum_ab += da * db; sum_a2 += da * da; sum_b2 += db * db; } let denom = (sum_a2 * sum_b2).sqrt(); if denom < f64::EPSILON { return 1.0; // Both constant → identical } sum_ab / denom } // --- VP-tree index for fast near-duplicate search --- /// Number of bins for compact envelope summary features. const SUMMARY_BINS: usize = 16; /// Search radius in summary feature space. Generous enough to capture all /// near-duplicates (including trimmed/offset variants) while filtering out /// the vast majority of non-duplicates. const SUMMARY_SEARCH_RADIUS: f64 = 1.0; /// Entry stored in the VP-tree: hash + full envelope + compact summary features. pub struct FingerprintEntry { pub(crate) hash: String, pub(crate) envelope: Vec, pub(crate) features: [f64; SUMMARY_BINS], } /// Compute compact features from an envelope: 16 mean-amplitude bins in [0, 1]. /// /// Divides the envelope into 16 equal segments regardless of length, so the /// features are comparable across different sample durations. fn compact_features(envelope: &[u8]) -> [f64; SUMMARY_BINS] { let mut features = [0.0f64; SUMMARY_BINS]; if envelope.is_empty() { return features; } let len = envelope.len(); for (bin, feat) in features.iter_mut().enumerate() { let start = bin * len / SUMMARY_BINS; let end = (bin + 1) * len / SUMMARY_BINS; if start >= end { continue; } let sum: f64 = envelope[start..end].iter().map(|&v| v as f64).sum(); *feat = sum / ((end - start) as f64 * 255.0); } features } /// Euclidean distance between compact feature vectors. Proper metric. fn summary_distance(a: &FingerprintEntry, b: &FingerprintEntry) -> f64 { let mut sum = 0.0; for i in 0..SUMMARY_BINS { let d = a.features[i] - b.features[i]; sum += d * d; } sum.sqrt() } /// Pre-built VP-tree index for fast near-duplicate fingerprint search. /// /// Uses a two-phase approach: /// 1. VP-tree on compact 16-bin envelope features (Euclidean metric, O(log n) query) /// 2. Full normalized cross-correlation verification on candidates /// /// Build cost: O(n log n) with cheap Euclidean distance (< 1s for 100K fingerprints). /// Query cost: O(log n + candidates) — typically < 5ms for 100K fingerprints. pub struct FingerprintIndex { tree: VpTree, } impl FingerprintIndex { /// Build an index from all fingerprints in the database. #[instrument(skip_all)] /// Load raw fingerprint data from the database (fast, just I/O). pub fn load_data(db: &Database) -> Result> { let mut stmt = db.conn().prepare( "SELECT hash, envelope FROM fingerprints", )?; let entries: Vec = stmt .query_map([], |row| { let hash: String = row.get(0)?; let envelope: Vec = row.get(1)?; let features = compact_features(&envelope); Ok(FingerprintEntry { hash, envelope, features, }) })? .collect::, _>>()?; Ok(entries) } /// Build the index from pre-loaded data (CPU-intensive, no DB needed). pub fn build_from_data(entries: Vec) -> Self { let tree = VpTree::build(entries, summary_distance); Self { tree } } pub fn build(db: &Database) -> Result { let entries = Self::load_data(db)?; Ok(Self::build_from_data(entries)) } /// Number of fingerprints in the index. pub fn len(&self) -> usize { self.tree.len() } /// Whether the index is empty. pub fn is_empty(&self) -> bool { self.tree.is_empty() } /// Find near-duplicates of the given sample. /// /// Phase 1: VP-tree range query on compact features to find candidates. /// Phase 2: Full NCC verification on candidates to confirm near-duplicates. #[instrument(skip_all, fields(hash = %hash))] pub fn find_near_duplicates( &self, hash: &str, envelope: &[u8], limit: usize, ) -> Vec { let query_entry = FingerprintEntry { hash: hash.to_string(), envelope: envelope.to_vec(), features: compact_features(envelope), }; // Phase 1: find candidates with similar compact features. let candidates = self .tree .find_within(&query_entry, SUMMARY_SEARCH_RADIUS, summary_distance); // Phase 2: verify with full NCC. let mut results: Vec = candidates .into_iter() .filter(|c| self.tree.get(c.index).hash != hash) .filter_map(|c| { let entry = self.tree.get(c.index); let (score, offset_ms) = envelope_distance(envelope, &entry.envelope); if score <= DUPLICATE_THRESHOLD { Some(DuplicateResult { hash: entry.hash.clone(), score, offset_ms, }) } else { None } }) .collect(); results.sort_by(|a, b| a.score.total_cmp(&b.score)); results.truncate(limit); results } } #[cfg(test)] mod tests { use super::*; use crate::test_helpers::insert_fake_sample; #[test] fn compute_envelope_basic() { // Ascending ramp from 0.0 to 1.0 over 1 second at 1000 Hz // block_size = 1000/100 = 10 samples per block let samples: Vec = (0..1000).map(|i| i as f32 / 999.0).collect(); let envelope = compute_envelope(&samples, 1000); assert_eq!(envelope.len(), 100); // First block peak should be much lower than last block peak assert!(envelope[0] < envelope[99]); // Last block should be 255 (the global max) assert_eq!(envelope[99], 255); } #[test] fn compute_envelope_silence() { let samples = vec![0.0f32; 4400]; let envelope = compute_envelope(&samples, 44100); assert!(!envelope.is_empty()); assert!(envelope.iter().all(|&v| v == 0)); } #[test] fn identical_envelopes_score_zero() { let env: Vec = (0..200).map(|i| (i % 256) as u8).collect(); let (score, _offset) = envelope_distance(&env, &env); assert!(score < 0.001, "Expected ~0.0, got {score}"); } #[test] fn different_envelopes_score_high() { // Ascending vs descending — should be anti-correlated let a: Vec = (0..200).map(|i| i as u8).collect(); let b: Vec = (0..200).map(|i| (199 - i) as u8).collect(); let (score, _) = envelope_distance(&a, &b); assert!( score > DUPLICATE_THRESHOLD, "Expected > {DUPLICATE_THRESHOLD}, got {score}" ); } #[test] fn trimmed_version_detected() { // Original: 300 blocks. Trimmed: skip first 20 blocks. let original: Vec = (0..300).map(|i| ((i * 7) % 256) as u8).collect(); let trimmed: Vec = original[20..].to_vec(); let (score, offset) = envelope_distance(&original, &trimmed); assert!( score < DUPLICATE_THRESHOLD, "Expected < {DUPLICATE_THRESHOLD}, got {score}" ); // Offset should be approximately 20 blocks * 10ms = 200ms assert!( (offset.abs() - 200).unsigned_abs() < 50, "Expected offset ~200ms, got {offset}ms" ); } #[test] fn save_and_load_roundtrip() { let db = crate::db::Database::open_in_memory().unwrap(); insert_fake_sample(&db, "aabbccdd"); let envelope = vec![10u8, 20, 30, 40, 50]; let fp = Fingerprint { hash: "aabbccdd".to_string(), envelope: envelope.clone(), sample_rate: 44100, }; save_fingerprint(&db, &fp).unwrap(); let loaded = load_fingerprint(&db, "aabbccdd").unwrap(); assert_eq!(loaded.envelope, envelope); assert_eq!(loaded.sample_rate, 44100); } #[test] fn find_near_duplicates_returns_matches() { let db = crate::db::Database::open_in_memory().unwrap(); insert_fake_sample(&db, "ref_hash"); insert_fake_sample(&db, "dup_hash"); insert_fake_sample(&db, "diff_hash"); let env_ref: Vec = (0..200).map(|i| ((i * 3) % 256) as u8).collect(); // Near-duplicate: identical envelope let env_dup = env_ref.clone(); // Different: unrelated pattern let env_diff: Vec = (0..200).map(|i| ((i * 13 + 100) % 256) as u8).collect(); save_fingerprint( &db, &Fingerprint { hash: "ref_hash".to_string(), envelope: env_ref, sample_rate: 44100, }, ) .unwrap(); save_fingerprint( &db, &Fingerprint { hash: "dup_hash".to_string(), envelope: env_dup, sample_rate: 44100, }, ) .unwrap(); save_fingerprint( &db, &Fingerprint { hash: "diff_hash".to_string(), envelope: env_diff, sample_rate: 44100, }, ) .unwrap(); let results = find_near_duplicates(&db, "ref_hash", 50).unwrap(); assert_eq!(results.len(), 1, "Should find exactly 1 near-duplicate"); assert_eq!(results[0].hash, "dup_hash"); assert!(results[0].score < DUPLICATE_THRESHOLD); } // --- compact_features tests --- #[test] fn compact_features_empty() { let f = compact_features(&[]); assert!(f.iter().all(|&v| v == 0.0)); } #[test] fn compact_features_uniform() { // All 128 → each bin mean should be 128/255 ≈ 0.502 let env = vec![128u8; 160]; let f = compact_features(&env); for &v in &f { assert!((v - 128.0 / 255.0).abs() < 0.01, "Expected ~0.502, got {v}"); } } #[test] fn compact_features_ascending() { // Ascending ramp: first bins should be lower, last bins higher. let env: Vec = (0..160).map(|i| (i * 255 / 159) as u8).collect(); let f = compact_features(&env); assert!(f[0] < f[15], "First bin should be less than last bin"); } #[test] fn compact_features_identical_envelopes_same_features() { let env: Vec = (0..300).map(|i| ((i * 7) % 256) as u8).collect(); let f1 = compact_features(&env); let f2 = compact_features(&env); for i in 0..SUMMARY_BINS { assert!((f1[i] - f2[i]).abs() < f64::EPSILON); } } // --- FingerprintIndex tests --- #[test] fn index_build_empty() { let db = crate::db::Database::open_in_memory().unwrap(); let idx = FingerprintIndex::build(&db).unwrap(); assert!(idx.is_empty()); assert_eq!(idx.len(), 0); } #[test] fn index_finds_duplicate() { let db = crate::db::Database::open_in_memory().unwrap(); insert_fake_sample(&db, "ref_hash"); insert_fake_sample(&db, "dup_hash"); insert_fake_sample(&db, "diff_hash"); let env_ref: Vec = (0..200).map(|i| ((i * 3) % 256) as u8).collect(); let env_dup = env_ref.clone(); let env_diff: Vec = (0..200).map(|i| ((i * 13 + 100) % 256) as u8).collect(); for (h, e) in [("ref_hash", &env_ref), ("dup_hash", &env_dup), ("diff_hash", &env_diff)] { save_fingerprint( &db, &Fingerprint { hash: h.to_string(), envelope: e.clone(), sample_rate: 44100, }, ) .unwrap(); } let idx = FingerprintIndex::build(&db).unwrap(); assert_eq!(idx.len(), 3); let results = idx.find_near_duplicates("ref_hash", &env_ref, 50); assert_eq!(results.len(), 1, "Should find exactly 1 near-duplicate"); assert_eq!(results[0].hash, "dup_hash"); assert!(results[0].score < DUPLICATE_THRESHOLD); } #[test] fn index_excludes_self() { let db = crate::db::Database::open_in_memory().unwrap(); insert_fake_sample(&db, "only"); let env: Vec = (0..200).map(|i| ((i * 3) % 256) as u8).collect(); save_fingerprint( &db, &Fingerprint { hash: "only".to_string(), envelope: env.clone(), sample_rate: 44100, }, ) .unwrap(); let idx = FingerprintIndex::build(&db).unwrap(); let results = idx.find_near_duplicates("only", &env, 50); assert!(results.is_empty(), "Should not return self as a duplicate"); } #[test] fn index_matches_linear_scan() { // Verify the index produces the same results as the linear scan. let db = crate::db::Database::open_in_memory().unwrap(); insert_fake_sample(&db, "a"); insert_fake_sample(&db, "b"); insert_fake_sample(&db, "c"); let env_a: Vec = (0..200).map(|i| ((i * 3) % 256) as u8).collect(); let env_b = env_a.clone(); // duplicate let env_c: Vec = (0..200).map(|i| ((i * 13 + 100) % 256) as u8).collect(); for (h, e) in [("a", &env_a), ("b", &env_b), ("c", &env_c)] { save_fingerprint( &db, &Fingerprint { hash: h.to_string(), envelope: e.clone(), sample_rate: 44100, }, ) .unwrap(); } let linear = find_near_duplicates(&db, "a", 50).unwrap(); let idx = FingerprintIndex::build(&db).unwrap(); let indexed = idx.find_near_duplicates("a", &env_a, 50); assert_eq!(linear.len(), indexed.len()); for (l, i) in linear.iter().zip(indexed.iter()) { assert_eq!(l.hash, i.hash); assert!((l.score - i.score).abs() < 1e-10); } } }