Skip to main content

max / audiofiles

Release DB lock before VP-tree build, batch export tag query

The similarity and fingerprint indexes build a VP-tree lazily on first query. Previously the DB mutex was held across the entire build, which is CPU-bound and blocks every other backend operation. The build is now split into load_data (under lock, fast I/O) + build_from_data (no lock, CPU-intensive).

enrich_with_tags now issues one chunked IN query per 500 hashes instead of one query per item.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-14 19:23 UTC
Commit: f148f11f6972fbf667b106485cdcea8bbf82ec55
Parent: 3297df2
4 files changed, +84 insertions, -19 deletions
@@ -539,13 +539,20 @@ impl Backend for DirectBackend {
539 539 hash: &str,
540 540 limit: usize,
541 541 ) -> BackendResult<Vec<similarity::SimilarResult>> {
542 - let db = self.db.lock();
543 542 // Build VP-tree index lazily on first query.
543 + // Load data under DB lock, release lock, then build tree (CPU-intensive).
544 544 let mut idx = self.similarity_index.lock();
545 545 if idx.is_none() {
546 - *idx = Some(similarity::SimilarityIndex::build(&db)?);
546 + let data = {
547 + let db = self.db.lock();
548 + similarity::SimilarityIndex::load_data(&db)?
549 + };
550 + *idx = Some(similarity::SimilarityIndex::build_from_data(data));
547 551 }
548 - let features = similarity::load_features(&db, hash)?;
552 + let features = {
553 + let db = self.db.lock();
554 + similarity::load_features(&db, hash)?
555 + };
549 556 Ok(idx.as_ref().unwrap().find_similar(hash, &features, limit))
550 557 }
551 558
@@ -554,13 +561,20 @@ impl Backend for DirectBackend {
554 561 hash: &str,
555 562 limit: usize,
556 563 ) -> BackendResult<Vec<fingerprint::DuplicateResult>> {
557 - let db = self.db.lock();
558 564 // Build VP-tree index lazily on first query.
565 + // Load data under DB lock, release lock, then build tree (CPU-intensive).
559 566 let mut idx = self.fingerprint_index.lock();
560 567 if idx.is_none() {
561 - *idx = Some(fingerprint::FingerprintIndex::build(&db)?);
568 + let entries = {
569 + let db = self.db.lock();
570 + fingerprint::FingerprintIndex::load_data(&db)?
571 + };
572 + *idx = Some(fingerprint::FingerprintIndex::build_from_data(entries));
562 573 }
563 - let reference = fingerprint::load_fingerprint(&db, hash)?;
574 + let reference = {
575 + let db = self.db.lock();
576 + fingerprint::load_fingerprint(&db, hash)?
577 + };
564 578 Ok(idx
565 579 .as_ref()
566 580 .unwrap()
@@ -202,9 +202,42 @@ fn map_export_item(row: &rusqlite::Row) -> rusqlite::Result<Option<ExportItem>>
202 202
203 203 /// Populate the `tags` field on each export item by querying the database.
204 204 pub fn enrich_with_tags(db: &Database, items: &mut [ExportItem]) {
205 + if items.is_empty() {
206 + return;
207 + }
208 +
209 + // Batch query: fetch all tags for all hashes in one statement.
210 + let hashes: Vec<String> = items.iter().map(|i| i.hash.to_string()).collect();
211 + let mut tag_map = std::collections::HashMap::<String, Vec<String>>::new();
212 +
213 + // SQLite variable limit is 999 in older builds; chunk to stay safe.
214 + for chunk in hashes.chunks(500) {
215 + let placeholders: String = chunk.iter().enumerate()
216 + .map(|(i, _)| format!("?{}", i + 1))
217 + .collect::<Vec<_>>()
218 + .join(", ");
219 + let sql = format!(
220 + "SELECT sample_hash, tag FROM tags WHERE sample_hash IN ({}) ORDER BY tag",
221 + placeholders,
222 + );
223 + if let Ok(mut stmt) = db.conn().prepare(&sql) {
224 + let params: Vec<&dyn rusqlite::types::ToSql> = chunk
225 + .iter()
226 + .map(|h| h as &dyn rusqlite::types::ToSql)
227 + .collect();
228 + if let Ok(rows) = stmt.query_map(params.as_slice(), |row| {
229 + Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
230 + }) {
231 + for row in rows.flatten() {
232 + tag_map.entry(row.0).or_default().push(row.1);
233 + }
234 + }
235 + }
236 + }
237 +
205 238 for item in items.iter_mut() {
206 - if let Ok(t) = tags::get_sample_tags(db, &item.hash) {
207 - item.tags = t;
239 + if let Some(tags) = tag_map.remove(item.hash.as_str()) {
240 + item.tags = tags;
208 241 }
209 242 }
210 243 }
@@ -249,10 +249,10 @@ const SUMMARY_BINS: usize = 16;
249 249 const SUMMARY_SEARCH_RADIUS: f64 = 1.0;
250 250
251 251 /// Entry stored in the VP-tree: hash + full envelope + compact summary features.
252 - struct FingerprintEntry {
253 - hash: String,
254 - envelope: Vec<u8>,
255 - features: [f64; SUMMARY_BINS],
252 + pub struct FingerprintEntry {
253 + pub(crate) hash: String,
254 + pub(crate) envelope: Vec<u8>,
255 + pub(crate) features: [f64; SUMMARY_BINS],
256 256 }
257 257
258 258 /// Compute compact features from an envelope: 16 mean-amplitude bins in [0, 1].
@@ -302,7 +302,8 @@ pub struct FingerprintIndex {
302 302 impl FingerprintIndex {
303 303 /// Build an index from all fingerprints in the database.
304 304 #[instrument(skip_all)]
305 - pub fn build(db: &Database) -> Result<Self> {
305 + /// Load raw fingerprint data from the database (fast, just I/O).
306 + pub fn load_data(db: &Database) -> Result<Vec<FingerprintEntry>> {
306 307 let mut stmt = db.conn().prepare(
307 308 "SELECT hash, envelope FROM fingerprints",
308 309 )?;
@@ -318,10 +319,18 @@ impl FingerprintIndex {
318 319 })
319 320 })?
320 321 .collect::<std::result::Result<Vec<_>, _>>()?;
322 + Ok(entries)
323 + }
321 324
325 + /// Build the index from pre-loaded data (CPU-intensive, no DB needed).
326 + pub fn build_from_data(entries: Vec<FingerprintEntry>) -> Self {
322 327 let tree = VpTree::build(entries, summary_distance);
328 + Self { tree }
329 + }
323 330
324 - Ok(Self { tree })
331 + pub fn build(db: &Database) -> Result<Self> {
332 + let entries = Self::load_data(db)?;
333 + Ok(Self::build_from_data(entries))
325 334 }
326 335
327 336 /// Number of fingerprints in the index.
@@ -328,7 +328,8 @@ pub struct SimilarityIndex {
328 328 impl SimilarityIndex {
329 329 /// Build an index from all analysed samples in the database.
330 330 #[instrument(skip_all)]
331 - pub fn build(db: &Database) -> Result<Self> {
331 + /// Load raw feature data from the database (fast, just I/O).
332 + pub fn load_data(db: &Database) -> Result<Vec<(String, FeatureVector)>> {
332 333 let mut stmt = db.conn().prepare(
333 334 "SELECT hash, bpm, duration, lufs, spectral_centroid, spectral_flatness,
334 335 spectral_rolloff, zero_crossing_rate, onset_strength,
@@ -356,15 +357,18 @@ impl SimilarityIndex {
356 357 ))
357 358 })?
358 359 .collect::<std::result::Result<Vec<_>, _>>()?;
360 + Ok(all)
361 + }
359 362
363 + /// Build the index from pre-loaded data (CPU-intensive, no DB needed).
364 + pub fn build_from_data(all: Vec<(String, FeatureVector)>) -> Self {
360 365 if all.is_empty() {
361 - return Ok(Self {
366 + return Self {
362 367 tree: VpTree::build(vec![], entry_distance),
363 368 ranges: NormRanges::default(),
364 - });
369 + };
365 370 }
366 371
367 - // Compute ranges from the full dataset (fixed, not per-query).
368 372 let ranges = compute_ranges_all(&all);
369 373
370 374 let entries: Vec<SimilarityEntry> = all
@@ -377,7 +381,12 @@ impl SimilarityIndex {
377 381
378 382 let tree = VpTree::build(entries, entry_distance);
379 383
380 - Ok(Self { tree, ranges })
384 + Self { tree, ranges }
385 + }
386 +
387 + pub fn build(db: &Database) -> Result<Self> {
388 + let all = Self::load_data(db)?;
389 + Ok(Self::build_from_data(all))
381 390 }
382 391
383 392 /// Number of samples in the index.