Skip to main content

max / audiofiles

21.4 KB · 652 lines History Blame Raw
1 //! Near-duplicate detection via peak envelope fingerprinting.
2 //!
3 //! Computes a downsampled peak envelope (100 Hz), quantizes to bytes, and stores
4 //! as a BLOB. Compares envelopes via normalized cross-correlation to find
5 //! re-encoded or slightly trimmed copies of the same sample.
6
7 use crate::db::Database;
8 use crate::error::{unix_now, CoreError, Result};
9 use crate::vp_tree::VpTree;
10 use tracing::instrument;
11
12 /// Peak envelope sample rate: 100 peak values per second of audio.
13 const ENVELOPE_RATE: u32 = 100;
14
15 /// Correlation score below this threshold indicates a near-duplicate.
16 /// 0.0 = identical envelopes, 1.0 = completely different.
17 const DUPLICATE_THRESHOLD: f64 = 0.05;
18
19 /// A stored fingerprint: peak envelope at fixed rate + metadata.
20 pub struct Fingerprint {
21 pub hash: String,
22 pub envelope: Vec<u8>,
23 pub sample_rate: u32,
24 }
25
26 /// A near-duplicate search result.
27 pub struct DuplicateResult {
28 pub hash: String,
29 /// Distance score: 0.0 = identical, 1.0 = completely different.
30 pub score: f64,
31 /// Detected time offset in milliseconds (positive = b starts later).
32 pub offset_ms: i64,
33 }
34
35 /// Compute a peak envelope from mono f32 samples.
36 ///
37 /// Divides audio into blocks of `sample_rate / ENVELOPE_RATE` samples,
38 /// takes the absolute peak of each block, normalizes to global max,
39 /// and quantizes to 0..255.
40 #[instrument(skip_all)]
41 pub fn compute_envelope(samples: &[f32], sample_rate: u32) -> Vec<u8> {
42 if samples.is_empty() || sample_rate == 0 {
43 return Vec::new();
44 }
45
46 let block_size = (sample_rate / ENVELOPE_RATE) as usize;
47 if block_size == 0 {
48 return Vec::new();
49 }
50
51 let num_blocks = samples.len().div_ceil(block_size);
52 let mut peaks: Vec<f32> = Vec::with_capacity(num_blocks);
53
54 for chunk in samples.chunks(block_size) {
55 let peak = chunk.iter().fold(0.0f32, |acc, &s| acc.max(s.abs()));
56 peaks.push(peak);
57 }
58
59 let global_max = peaks.iter().fold(0.0f32, |acc, &p| acc.max(p));
60 if global_max < f32::EPSILON {
61 return vec![0u8; peaks.len()];
62 }
63
64 peaks
65 .iter()
66 .map(|&p| (p / global_max * 255.0).round().clamp(0.0, 255.0) as u8)
67 .collect()
68 }
69
70 /// Save a fingerprint to the database.
71 #[instrument(skip_all)]
72 pub fn save_fingerprint(db: &Database, fp: &Fingerprint) -> Result<()> {
73 let now = unix_now();
74 db.conn().execute(
75 "INSERT OR REPLACE INTO fingerprints (hash, envelope, sample_rate, generated_at)
76 VALUES (?1, ?2, ?3, ?4)",
77 rusqlite::params![fp.hash, fp.envelope, fp.sample_rate, now],
78 )?;
79 Ok(())
80 }
81
82 /// Load a fingerprint from the database.
83 #[instrument(skip_all)]
84 pub fn load_fingerprint(db: &Database, hash: &str) -> Result<Fingerprint> {
85 db.conn()
86 .query_row(
87 "SELECT hash, envelope, sample_rate FROM fingerprints WHERE hash = ?1",
88 [hash],
89 |row| {
90 Ok(Fingerprint {
91 hash: row.get(0)?,
92 envelope: row.get(1)?,
93 sample_rate: row.get(2)?,
94 })
95 },
96 )
97 .map_err(|_| CoreError::SampleNotFound(hash.to_string()))
98 }
99
100 /// Find near-duplicates of the given sample by comparing peak envelopes (linear scan).
101 ///
102 /// Loads all fingerprints, computes envelope distance against the reference,
103 /// filters by threshold, and returns the top `limit` results sorted by score.
104 ///
105 /// O(n) linear scan. For indexed sub-linear queries, use [`FingerprintIndex`].
106 #[instrument(skip_all, fields(hash = %hash))]
107 pub fn find_near_duplicates(
108 db: &Database,
109 hash: &str,
110 limit: usize,
111 ) -> Result<Vec<DuplicateResult>> {
112 let reference = load_fingerprint(db, hash)?;
113
114 let mut stmt = db.conn().prepare(
115 "SELECT hash, envelope, sample_rate FROM fingerprints WHERE hash != ?1",
116 )?;
117 let others: Vec<Fingerprint> = stmt
118 .query_map([hash], |row| {
119 Ok(Fingerprint {
120 hash: row.get(0)?,
121 envelope: row.get(1)?,
122 sample_rate: row.get(2)?,
123 })
124 })?
125 .collect::<std::result::Result<Vec<_>, _>>()?;
126
127 let mut results: Vec<DuplicateResult> = others
128 .iter()
129 .filter_map(|other| {
130 let (score, offset_ms) = envelope_distance(&reference.envelope, &other.envelope);
131 if score <= DUPLICATE_THRESHOLD {
132 Some(DuplicateResult {
133 hash: other.hash.clone(),
134 score,
135 offset_ms,
136 })
137 } else {
138 None
139 }
140 })
141 .collect();
142
143 results.sort_by(|a, b| a.score.total_cmp(&b.score));
144 results.truncate(limit);
145 Ok(results)
146 }
147
148 /// Compute normalized cross-correlation distance between two envelopes.
149 ///
150 /// Returns `(score, offset_ms)` where score is `1.0 - best_correlation`
151 /// (0.0 = identical, 1.0 = uncorrelated) and offset_ms is the time shift
152 /// at which the best alignment was found.
153 #[instrument(skip_all)]
154 pub fn envelope_distance(a: &[u8], b: &[u8]) -> (f64, i64) {
155 if a.is_empty() || b.is_empty() {
156 return (1.0, 0);
157 }
158
159 let len_ratio = a.len().max(b.len()) as f64 / a.len().min(b.len()) as f64;
160 if len_ratio > 2.0 {
161 return (1.0, 0);
162 }
163
164 // Ensure `longer` is the longer envelope
165 let (longer, shorter, sign) = if a.len() >= b.len() {
166 (a, b, 1i64)
167 } else {
168 (b, a, -1i64)
169 };
170
171 let len_diff = longer.len() - shorter.len();
172 // 50 blocks = 0.5s tolerance at 100 Hz envelope rate
173 let max_shift = len_diff + 50;
174
175 let mut best_corr = f64::MIN;
176 let mut best_offset: i64 = 0;
177
178 // Slide shorter over longer at each offset from -max_shift to +max_shift.
179 // Negative offset means shorter starts before longer (we compare a prefix of shorter
180 // against the start of longer).
181 let shift_range = max_shift as i64;
182 for offset in -shift_range..=shift_range {
183 // Compute overlap region indices
184 let a_start = offset.max(0) as usize;
185 let b_start = (-offset).max(0) as usize;
186 let a_end = (longer.len() as i64).min(shorter.len() as i64 + offset).max(0) as usize;
187 let b_end = (shorter.len() as i64).min(longer.len() as i64 - offset).max(0) as usize;
188
189 let n = a_end.saturating_sub(a_start).min(b_end.saturating_sub(b_start));
190 if n < 10 {
191 continue; // Too little overlap for meaningful correlation
192 }
193
194 let a_slice = &longer[a_start..a_start + n];
195 let b_slice = &shorter[b_start..b_start + n];
196
197 let corr = normalized_cross_correlation(a_slice, b_slice);
198 if corr > best_corr {
199 best_corr = corr;
200 best_offset = offset;
201 }
202 }
203
204 if best_corr < f64::MIN + 1.0 {
205 return (1.0, 0);
206 }
207
208 let score = 1.0 - best_corr.clamp(0.0, 1.0);
209 // Convert block offset to milliseconds (1 block = 1000/ENVELOPE_RATE ms = 10ms)
210 let offset_ms = best_offset * sign * 10;
211 (score, offset_ms)
212 }
213
214 /// Normalized cross-correlation between two u8 slices of equal length.
215 fn normalized_cross_correlation(a: &[u8], b: &[u8]) -> f64 {
216 let n = a.len() as f64;
217
218 let mean_a: f64 = a.iter().map(|&v| v as f64).sum::<f64>() / n;
219 let mean_b: f64 = b.iter().map(|&v| v as f64).sum::<f64>() / n;
220
221 let mut sum_ab = 0.0;
222 let mut sum_a2 = 0.0;
223 let mut sum_b2 = 0.0;
224
225 for (&va, &vb) in a.iter().zip(b.iter()) {
226 let da = va as f64 - mean_a;
227 let db = vb as f64 - mean_b;
228 sum_ab += da * db;
229 sum_a2 += da * da;
230 sum_b2 += db * db;
231 }
232
233 let denom = (sum_a2 * sum_b2).sqrt();
234 if denom < f64::EPSILON {
235 return 1.0; // Both constant → identical
236 }
237
238 sum_ab / denom
239 }
240
241 // --- VP-tree index for fast near-duplicate search ---
242
243 /// Number of bins for compact envelope summary features.
244 const SUMMARY_BINS: usize = 16;
245
246 /// Search radius in summary feature space. Generous enough to capture all
247 /// near-duplicates (including trimmed/offset variants) while filtering out
248 /// the vast majority of non-duplicates.
249 const SUMMARY_SEARCH_RADIUS: f64 = 1.0;
250
251 /// Entry stored in the VP-tree: hash + full envelope + compact summary features.
252 pub struct FingerprintEntry {
253 pub(crate) hash: String,
254 pub(crate) envelope: Vec<u8>,
255 pub(crate) features: [f64; SUMMARY_BINS],
256 }
257
258 /// Compute compact features from an envelope: 16 mean-amplitude bins in [0, 1].
259 ///
260 /// Divides the envelope into 16 equal segments regardless of length, so the
261 /// features are comparable across different sample durations.
262 fn compact_features(envelope: &[u8]) -> [f64; SUMMARY_BINS] {
263 let mut features = [0.0f64; SUMMARY_BINS];
264 if envelope.is_empty() {
265 return features;
266 }
267 let len = envelope.len();
268 for (bin, feat) in features.iter_mut().enumerate() {
269 let start = bin * len / SUMMARY_BINS;
270 let end = (bin + 1) * len / SUMMARY_BINS;
271 if start >= end {
272 continue;
273 }
274 let sum: f64 = envelope[start..end].iter().map(|&v| v as f64).sum();
275 *feat = sum / ((end - start) as f64 * 255.0);
276 }
277 features
278 }
279
280 /// Euclidean distance between compact feature vectors. Proper metric.
281 fn summary_distance(a: &FingerprintEntry, b: &FingerprintEntry) -> f64 {
282 let mut sum = 0.0;
283 for i in 0..SUMMARY_BINS {
284 let d = a.features[i] - b.features[i];
285 sum += d * d;
286 }
287 sum.sqrt()
288 }
289
290 /// Pre-built VP-tree index for fast near-duplicate fingerprint search.
291 ///
292 /// Uses a two-phase approach:
293 /// 1. VP-tree on compact 16-bin envelope features (Euclidean metric, O(log n) query)
294 /// 2. Full normalized cross-correlation verification on candidates
295 ///
296 /// Build cost: O(n log n) with cheap Euclidean distance (< 1s for 100K fingerprints).
297 /// Query cost: O(log n + candidates) — typically < 5ms for 100K fingerprints.
298 pub struct FingerprintIndex {
299 tree: VpTree<FingerprintEntry>,
300 }
301
302 impl FingerprintIndex {
303 /// Build an index from all fingerprints in the database.
304 #[instrument(skip_all)]
305 /// Load raw fingerprint data from the database (fast, just I/O).
306 pub fn load_data(db: &Database) -> Result<Vec<FingerprintEntry>> {
307 let mut stmt = db.conn().prepare(
308 "SELECT hash, envelope FROM fingerprints",
309 )?;
310 let entries: Vec<FingerprintEntry> = stmt
311 .query_map([], |row| {
312 let hash: String = row.get(0)?;
313 let envelope: Vec<u8> = row.get(1)?;
314 let features = compact_features(&envelope);
315 Ok(FingerprintEntry {
316 hash,
317 envelope,
318 features,
319 })
320 })?
321 .collect::<std::result::Result<Vec<_>, _>>()?;
322 Ok(entries)
323 }
324
325 /// Build the index from pre-loaded data (CPU-intensive, no DB needed).
326 pub fn build_from_data(entries: Vec<FingerprintEntry>) -> Self {
327 let tree = VpTree::build(entries, summary_distance);
328 Self { tree }
329 }
330
331 pub fn build(db: &Database) -> Result<Self> {
332 let entries = Self::load_data(db)?;
333 Ok(Self::build_from_data(entries))
334 }
335
336 /// Number of fingerprints in the index.
337 pub fn len(&self) -> usize {
338 self.tree.len()
339 }
340
341 /// Whether the index is empty.
342 pub fn is_empty(&self) -> bool {
343 self.tree.is_empty()
344 }
345
346 /// Find near-duplicates of the given sample.
347 ///
348 /// Phase 1: VP-tree range query on compact features to find candidates.
349 /// Phase 2: Full NCC verification on candidates to confirm near-duplicates.
350 #[instrument(skip_all, fields(hash = %hash))]
351 pub fn find_near_duplicates(
352 &self,
353 hash: &str,
354 envelope: &[u8],
355 limit: usize,
356 ) -> Vec<DuplicateResult> {
357 let query_entry = FingerprintEntry {
358 hash: hash.to_string(),
359 envelope: envelope.to_vec(),
360 features: compact_features(envelope),
361 };
362
363 // Phase 1: find candidates with similar compact features.
364 let candidates = self
365 .tree
366 .find_within(&query_entry, SUMMARY_SEARCH_RADIUS, summary_distance);
367
368 // Phase 2: verify with full NCC.
369 let mut results: Vec<DuplicateResult> = candidates
370 .into_iter()
371 .filter(|c| self.tree.get(c.index).hash != hash)
372 .filter_map(|c| {
373 let entry = self.tree.get(c.index);
374 let (score, offset_ms) = envelope_distance(envelope, &entry.envelope);
375 if score <= DUPLICATE_THRESHOLD {
376 Some(DuplicateResult {
377 hash: entry.hash.clone(),
378 score,
379 offset_ms,
380 })
381 } else {
382 None
383 }
384 })
385 .collect();
386
387 results.sort_by(|a, b| a.score.total_cmp(&b.score));
388 results.truncate(limit);
389 results
390 }
391 }
392
393 #[cfg(test)]
394 mod tests {
395 use super::*;
396 use crate::test_helpers::insert_fake_sample;
397
398 #[test]
399 fn compute_envelope_basic() {
400 // Ascending ramp from 0.0 to 1.0 over 1 second at 1000 Hz
401 // block_size = 1000/100 = 10 samples per block
402 let samples: Vec<f32> = (0..1000).map(|i| i as f32 / 999.0).collect();
403 let envelope = compute_envelope(&samples, 1000);
404 assert_eq!(envelope.len(), 100);
405 // First block peak should be much lower than last block peak
406 assert!(envelope[0] < envelope[99]);
407 // Last block should be 255 (the global max)
408 assert_eq!(envelope[99], 255);
409 }
410
411 #[test]
412 fn compute_envelope_silence() {
413 let samples = vec![0.0f32; 4400];
414 let envelope = compute_envelope(&samples, 44100);
415 assert!(!envelope.is_empty());
416 assert!(envelope.iter().all(|&v| v == 0));
417 }
418
419 #[test]
420 fn identical_envelopes_score_zero() {
421 let env: Vec<u8> = (0..200).map(|i| (i % 256) as u8).collect();
422 let (score, _offset) = envelope_distance(&env, &env);
423 assert!(score < 0.001, "Expected ~0.0, got {score}");
424 }
425
426 #[test]
427 fn different_envelopes_score_high() {
428 // Ascending vs descending — should be anti-correlated
429 let a: Vec<u8> = (0..200).map(|i| i as u8).collect();
430 let b: Vec<u8> = (0..200).map(|i| (199 - i) as u8).collect();
431 let (score, _) = envelope_distance(&a, &b);
432 assert!(
433 score > DUPLICATE_THRESHOLD,
434 "Expected > {DUPLICATE_THRESHOLD}, got {score}"
435 );
436 }
437
438 #[test]
439 fn trimmed_version_detected() {
440 // Original: 300 blocks. Trimmed: skip first 20 blocks.
441 let original: Vec<u8> = (0..300).map(|i| ((i * 7) % 256) as u8).collect();
442 let trimmed: Vec<u8> = original[20..].to_vec();
443 let (score, offset) = envelope_distance(&original, &trimmed);
444 assert!(
445 score < DUPLICATE_THRESHOLD,
446 "Expected < {DUPLICATE_THRESHOLD}, got {score}"
447 );
448 // Offset should be approximately 20 blocks * 10ms = 200ms
449 assert!(
450 (offset.abs() - 200).unsigned_abs() < 50,
451 "Expected offset ~200ms, got {offset}ms"
452 );
453 }
454
455 #[test]
456 fn save_and_load_roundtrip() {
457 let db = crate::db::Database::open_in_memory().unwrap();
458 insert_fake_sample(&db, "aabbccdd");
459 let envelope = vec![10u8, 20, 30, 40, 50];
460 let fp = Fingerprint {
461 hash: "aabbccdd".to_string(),
462 envelope: envelope.clone(),
463 sample_rate: 44100,
464 };
465 save_fingerprint(&db, &fp).unwrap();
466 let loaded = load_fingerprint(&db, "aabbccdd").unwrap();
467 assert_eq!(loaded.envelope, envelope);
468 assert_eq!(loaded.sample_rate, 44100);
469 }
470
471 #[test]
472 fn find_near_duplicates_returns_matches() {
473 let db = crate::db::Database::open_in_memory().unwrap();
474 insert_fake_sample(&db, "ref_hash");
475 insert_fake_sample(&db, "dup_hash");
476 insert_fake_sample(&db, "diff_hash");
477
478 let env_ref: Vec<u8> = (0..200).map(|i| ((i * 3) % 256) as u8).collect();
479 // Near-duplicate: identical envelope
480 let env_dup = env_ref.clone();
481 // Different: unrelated pattern
482 let env_diff: Vec<u8> = (0..200).map(|i| ((i * 13 + 100) % 256) as u8).collect();
483
484 save_fingerprint(
485 &db,
486 &Fingerprint {
487 hash: "ref_hash".to_string(),
488 envelope: env_ref,
489 sample_rate: 44100,
490 },
491 )
492 .unwrap();
493 save_fingerprint(
494 &db,
495 &Fingerprint {
496 hash: "dup_hash".to_string(),
497 envelope: env_dup,
498 sample_rate: 44100,
499 },
500 )
501 .unwrap();
502 save_fingerprint(
503 &db,
504 &Fingerprint {
505 hash: "diff_hash".to_string(),
506 envelope: env_diff,
507 sample_rate: 44100,
508 },
509 )
510 .unwrap();
511
512 let results = find_near_duplicates(&db, "ref_hash", 50).unwrap();
513 assert_eq!(results.len(), 1, "Should find exactly 1 near-duplicate");
514 assert_eq!(results[0].hash, "dup_hash");
515 assert!(results[0].score < DUPLICATE_THRESHOLD);
516 }
517
518 // --- compact_features tests ---
519
520 #[test]
521 fn compact_features_empty() {
522 let f = compact_features(&[]);
523 assert!(f.iter().all(|&v| v == 0.0));
524 }
525
526 #[test]
527 fn compact_features_uniform() {
528 // All 128 → each bin mean should be 128/255 ≈ 0.502
529 let env = vec![128u8; 160];
530 let f = compact_features(&env);
531 for &v in &f {
532 assert!((v - 128.0 / 255.0).abs() < 0.01, "Expected ~0.502, got {v}");
533 }
534 }
535
536 #[test]
537 fn compact_features_ascending() {
538 // Ascending ramp: first bins should be lower, last bins higher.
539 let env: Vec<u8> = (0..160).map(|i| (i * 255 / 159) as u8).collect();
540 let f = compact_features(&env);
541 assert!(f[0] < f[15], "First bin should be less than last bin");
542 }
543
544 #[test]
545 fn compact_features_identical_envelopes_same_features() {
546 let env: Vec<u8> = (0..300).map(|i| ((i * 7) % 256) as u8).collect();
547 let f1 = compact_features(&env);
548 let f2 = compact_features(&env);
549 for i in 0..SUMMARY_BINS {
550 assert!((f1[i] - f2[i]).abs() < f64::EPSILON);
551 }
552 }
553
554 // --- FingerprintIndex tests ---
555
556 #[test]
557 fn index_build_empty() {
558 let db = crate::db::Database::open_in_memory().unwrap();
559 let idx = FingerprintIndex::build(&db).unwrap();
560 assert!(idx.is_empty());
561 assert_eq!(idx.len(), 0);
562 }
563
564 #[test]
565 fn index_finds_duplicate() {
566 let db = crate::db::Database::open_in_memory().unwrap();
567 insert_fake_sample(&db, "ref_hash");
568 insert_fake_sample(&db, "dup_hash");
569 insert_fake_sample(&db, "diff_hash");
570
571 let env_ref: Vec<u8> = (0..200).map(|i| ((i * 3) % 256) as u8).collect();
572 let env_dup = env_ref.clone();
573 let env_diff: Vec<u8> = (0..200).map(|i| ((i * 13 + 100) % 256) as u8).collect();
574
575 for (h, e) in [("ref_hash", &env_ref), ("dup_hash", &env_dup), ("diff_hash", &env_diff)] {
576 save_fingerprint(
577 &db,
578 &Fingerprint {
579 hash: h.to_string(),
580 envelope: e.clone(),
581 sample_rate: 44100,
582 },
583 )
584 .unwrap();
585 }
586
587 let idx = FingerprintIndex::build(&db).unwrap();
588 assert_eq!(idx.len(), 3);
589
590 let results = idx.find_near_duplicates("ref_hash", &env_ref, 50);
591 assert_eq!(results.len(), 1, "Should find exactly 1 near-duplicate");
592 assert_eq!(results[0].hash, "dup_hash");
593 assert!(results[0].score < DUPLICATE_THRESHOLD);
594 }
595
596 #[test]
597 fn index_excludes_self() {
598 let db = crate::db::Database::open_in_memory().unwrap();
599 insert_fake_sample(&db, "only");
600
601 let env: Vec<u8> = (0..200).map(|i| ((i * 3) % 256) as u8).collect();
602 save_fingerprint(
603 &db,
604 &Fingerprint {
605 hash: "only".to_string(),
606 envelope: env.clone(),
607 sample_rate: 44100,
608 },
609 )
610 .unwrap();
611
612 let idx = FingerprintIndex::build(&db).unwrap();
613 let results = idx.find_near_duplicates("only", &env, 50);
614 assert!(results.is_empty(), "Should not return self as a duplicate");
615 }
616
617 #[test]
618 fn index_matches_linear_scan() {
619 // Verify the index produces the same results as the linear scan.
620 let db = crate::db::Database::open_in_memory().unwrap();
621 insert_fake_sample(&db, "a");
622 insert_fake_sample(&db, "b");
623 insert_fake_sample(&db, "c");
624
625 let env_a: Vec<u8> = (0..200).map(|i| ((i * 3) % 256) as u8).collect();
626 let env_b = env_a.clone(); // duplicate
627 let env_c: Vec<u8> = (0..200).map(|i| ((i * 13 + 100) % 256) as u8).collect();
628
629 for (h, e) in [("a", &env_a), ("b", &env_b), ("c", &env_c)] {
630 save_fingerprint(
631 &db,
632 &Fingerprint {
633 hash: h.to_string(),
634 envelope: e.clone(),
635 sample_rate: 44100,
636 },
637 )
638 .unwrap();
639 }
640
641 let linear = find_near_duplicates(&db, "a", 50).unwrap();
642 let idx = FingerprintIndex::build(&db).unwrap();
643 let indexed = idx.find_near_duplicates("a", &env_a, 50);
644
645 assert_eq!(linear.len(), indexed.len());
646 for (l, i) in linear.iter().zip(indexed.iter()) {
647 assert_eq!(l.hash, i.hash);
648 assert!((l.score - i.score).abs() < 1e-10);
649 }
650 }
651 }
652