Skip to main content

max / makenotwork

15.0 KB · 388 lines History Blame Raw
1 //! Content file export: ZIP archive of audio, covers, videos, versions, and insertions.
2 //!
3 //! Writes the ZIP to a temporary file and uploads via S3 multipart upload,
4 //! so peak memory is O(single_file) regardless of total export size.
5
6 use std::io::Write;
7
8 use axum::{
9 extract::{Query, State},
10 http::header::HeaderMap,
11 response::{IntoResponse, Response},
12 };
13 use serde::Deserialize;
14 use zip::write::SimpleFileOptions;
15
16 use crate::{
17 auth::AuthUser,
18 db,
19 error::{AppError, Result, ResultExt},
20 helpers::is_htmx_request,
21 templates::ExportContentReadyTemplate,
22 AppState,
23 };
24
25 use super::export_error_html;
26
27 /// Max content exports running at once. Each can move up to 2 GB through a
28 /// synchronous zip on the blocking pool; without a cap a burst could saturate
29 /// the blocking pool and stall unrelated `spawn_blocking` work. Excess exports
30 /// queue on the semaphore instead.
31 const MAX_CONCURRENT_EXPORTS: usize = 3;
32 static EXPORT_LIMITER: tokio::sync::Semaphore =
33 tokio::sync::Semaphore::const_new(MAX_CONCURRENT_EXPORTS);
34
35 /// Query parameters for the content export endpoint.
36 #[derive(Deserialize)]
37 pub(in crate::routes::api) struct ContentExportQuery {
38 /// When set, only export files from this project (useful for large
39 /// catalogs or to stay within the 2GB per-export memory limit).
40 pub project_id: Option<db::ProjectId>,
41 }
42
43 /// Export content files as a ZIP archive uploaded to S3.
44 ///
45 /// Collects audio, covers, version downloads, and insertion clips,
46 /// bundles them with a README.txt manifest, uploads to S3 as a
47 /// temporary export, and returns a presigned download link.
48 ///
49 /// Pass `?project_id=<uuid>` to limit the export to a single project
50 /// (insertions are user-scoped and always excluded from per-project exports).
51 #[tracing::instrument(skip_all, name = "exports::export_content")]
52 pub(in crate::routes::api) async fn export_content(
53 State(state): State<AppState>,
54 headers: HeaderMap,
55 Query(query): Query<ContentExportQuery>,
56 AuthUser(user): AuthUser,
57 ) -> Result<Response> {
58 let is_htmx = is_htmx_request(&headers);
59
60 // Hold a concurrency permit for the lifetime of the export so a burst can't
61 // saturate the blocking pool. Acquired before any DB/S3 work, so a queued
62 // request holds no connection while it waits.
63 let _export_permit = EXPORT_LIMITER
64 .acquire()
65 .await
66 .expect("export limiter semaphore is never closed");
67
68 let s3 = state.s3.as_ref().ok_or_else(|| {
69 AppError::ServiceUnavailable("File storage is not configured".to_string())
70 })?;
71
72 // Collect all S3 keys from items, versions, and insertions
73 let item_keys = db::items::get_user_s3_keys(&state.db, user.id).await?;
74 let version_keys = db::versions::get_user_version_s3_keys(&state.db, user.id).await?;
75
76 // Build the list of (s3_key, zip_path, db_size) triples. The DB-known file
77 // size lets us enforce the per-file/total caps without a per-file S3 HEAD
78 // round-trip (these columns are written at upload-confirm time). `None` only
79 // for legacy rows missing the size; those fall through to the post-download
80 // total guard.
81 let mut files: Vec<(String, String, Option<i64>)> = Vec::new();
82
83 for item in &item_keys {
84 if let Some(pid) = query.project_id
85 && item.project_id != pid
86 {
87 continue;
88 }
89 let slug = item.project_slug.as_str();
90 let title = sanitize_filename(&item.title);
91 if let Some(ref key) = item.audio_s3_key {
92 let ext = extension_from_key(key);
93 files.push((key.clone(), format!("projects/{}/{}.{}", slug, title, ext), item.audio_file_size_bytes));
94 }
95 if let Some(ref key) = item.cover_s3_key {
96 let ext = extension_from_key(key);
97 files.push((key.clone(), format!("projects/{}/{}-cover.{}", slug, title, ext), item.cover_file_size_bytes));
98 }
99 if let Some(ref key) = item.video_s3_key {
100 let ext = extension_from_key(key);
101 files.push((key.clone(), format!("projects/{}/{}-video.{}", slug, title, ext), item.video_file_size_bytes));
102 }
103 }
104
105 for ver in &version_keys {
106 if let Some(pid) = query.project_id
107 && ver.project_id != pid
108 {
109 continue;
110 }
111 if let Some(ref key) = ver.s3_key {
112 let slug = ver.project_slug.as_str();
113 let title = sanitize_filename(&ver.item_title);
114 let fname = ver.file_name.as_deref().unwrap_or("file");
115 files.push((key.clone(), format!("projects/{}/{}/v{}-{}", slug, title, ver.version_number, fname), ver.file_size_bytes));
116 }
117 }
118
119 // Insertions are user-scoped (not project-scoped), so only include
120 // them when exporting all content (no project_id filter).
121 if query.project_id.is_none() {
122 let insertions = db::content_insertions::list_insertions(&state.db, user.id).await?;
123 for ins in &insertions {
124 let ext = extension_from_key(&ins.storage_key);
125 let title = sanitize_filename(&ins.title);
126 files.push((ins.storage_key.clone(), format!("insertions/{}.{}", title, ext), Some(ins.file_size)));
127 }
128 }
129
130 if files.is_empty() {
131 if is_htmx {
132 return Ok(export_error_html("No content files to export."));
133 }
134 return Err(AppError::BadRequest("No content files to export.".to_string()));
135 }
136
137 // Write ZIP to a temporary file, downloading files one at a time.
138 // Peak memory is O(largest_single_file) — the ZIP itself lives on disk.
139 let s3_clone = s3.clone();
140 let username = user.username.to_string();
141
142 let tmp_dir = tempfile::tempdir()
143 .context("create temp dir for export")?;
144 let zip_path = tmp_dir.path().join("export.zip");
145
146 {
147 // The `zip` crate's IO is synchronous; a single `write_all` of up to
148 // 500 MB (compression is Stored, so this is raw disk IO) would stall a
149 // tokio worker. Every blocking zip operation below runs on the blocking
150 // pool via `spawn_blocking`; the writer is moved in and handed back out
151 // each step. S3 downloads stay async, and peak memory is still
152 // O(largest_single_file) — one file is in RAM at a time.
153 let create_path = zip_path.clone();
154 let mut zip = tokio::task::spawn_blocking(
155 move || -> std::result::Result<_, std::io::Error> {
156 let zip_file = std::fs::File::create(&create_path)?;
157 Ok(zip::ZipWriter::new(std::io::BufWriter::new(zip_file)))
158 },
159 )
160 .await
161 .context("join zip create task")?
162 .context("create export zip file")?;
163
164 let options = SimpleFileOptions::default()
165 .compression_method(zip::CompressionMethod::Stored);
166
167 let mut manifest: Vec<(String, i64)> = Vec::new();
168 let mut total_size: u64 = 0;
169 const MAX_TOTAL_SIZE: u64 = 2 * 1024 * 1024 * 1024; // 2 GB
170 const MAX_FILE_SIZE: u64 = 500 * 1024 * 1024; // 500 MB per file
171 let mut skipped: Vec<String> = Vec::new();
172
173 for (s3_key, zip_path_entry, db_size) in &files {
174 // Per-file size pre-check BEFORE downloading so a single 20 GB video
175 // can't blow the heap before the post-download total check fires.
176 // The size comes from the DB column written at upload-confirm — no
177 // S3 HEAD round-trip. The post-download total guard below is the
178 // backstop for any row with a missing (None) size.
179 if let Some(size) = db_size {
180 let size = (*size).max(0) as u64;
181 if size > MAX_FILE_SIZE {
182 skipped.push(format!(
183 "{} (exceeds 500 MB per-file export cap)",
184 zip_path_entry
185 ));
186 continue;
187 }
188 if total_size + size > MAX_TOTAL_SIZE {
189 let msg = "Content export exceeds 2 GB limit. Try exporting a single project instead.";
190 if is_htmx {
191 return Ok(export_error_html(msg));
192 }
193 return Err(AppError::BadRequest(msg.to_string()));
194 }
195 }
196
197 match s3_clone.download_object(s3_key).await {
198 Ok(data) => {
199 total_size += data.len() as u64;
200 if total_size > MAX_TOTAL_SIZE {
201 let msg = "Content export exceeds 2 GB limit. Try exporting a single project instead.";
202 if is_htmx {
203 return Ok(export_error_html(msg));
204 }
205 return Err(AppError::BadRequest(msg.to_string()));
206 }
207 let file_size = data.len() as i64;
208 // Move writer + file bytes onto the blocking pool, write, get
209 // the writer back. `data` drops inside the task afterward, so
210 // only one file is in RAM at a time.
211 let entry = zip_path_entry.clone();
212 zip = tokio::task::spawn_blocking(
213 move || -> std::result::Result<_, zip::result::ZipError> {
214 zip.start_file(&entry, options)?;
215 zip.write_all(&data)?;
216 Ok(zip)
217 },
218 )
219 .await
220 .context("join zip write task")?
221 .context("write file into export zip")?;
222 manifest.push((zip_path_entry.clone(), file_size));
223 }
224 Err(e) => {
225 tracing::warn!("Failed to download S3 key {}: {}", s3_key, e);
226 skipped.push(zip_path_entry.clone());
227 }
228 }
229 }
230
231 if manifest.is_empty() {
232 let msg = "Could not download any files from storage. Please try again later.";
233 if is_htmx {
234 return Ok(export_error_html(msg));
235 }
236 return Err(AppError::Storage(msg.to_string()));
237 }
238
239 // Build README.txt as the last ZIP entry (cheap string work, async side)
240 let now = chrono::Utc::now();
241 let mut readme = format!(
242 "Makenot.work Content Export\n\
243 Creator: {}\n\
244 Exported: {}\n\
245 Files: {}\n\n\
246 Manifest:\n",
247 username,
248 now.format("%Y-%m-%d %H:%M:%S UTC"),
249 manifest.len(),
250 );
251 for (path, size) in &manifest {
252 readme.push_str(&format!(" {} ({})\n", path, crate::helpers::format_file_size(*size)));
253 }
254 if !skipped.is_empty() {
255 readme.push_str(&format!("\nSkipped ({} files could not be downloaded):\n", skipped.len()));
256 for path in &skipped {
257 readme.push_str(&format!(" {}\n", path));
258 }
259 }
260 readme.push_str("\nNote: Git repositories are not included in this export.\n");
261 readme.push_str("Clone them separately: git clone https://makenot.work/source/<username>/<repo>.git\n");
262
263 // Append README, finalize the central directory, and flush the buffer to
264 // disk — all blocking — off the runtime before the upload reads the file.
265 tokio::task::spawn_blocking(move || -> std::result::Result<(), zip::result::ZipError> {
266 zip.start_file("README.txt", options)?;
267 zip.write_all(readme.as_bytes())?;
268 let buf = zip.finish()?;
269 // Flush BufWriter so all bytes hit the OS file before we upload it.
270 buf.into_inner().map_err(|e| e.into_error())?;
271 Ok(())
272 })
273 .await
274 .context("join zip finalize task")?
275 .context("finalize export zip")?;
276 }
277
278 // Upload ZIP to S3 via multipart upload (streams from disk in 10 MB parts)
279 let timestamp = chrono::Utc::now().format("%Y%m%d-%H%M%S");
280 let export_key = format!("{}/exports/content-{}.zip", user.id, timestamp);
281 if let Err(e) = s3.upload_multipart(&export_key, "application/zip", &zip_path).await {
282 tracing::error!(error = ?e, "Failed to upload content export ZIP to S3");
283 if is_htmx {
284 return Ok(export_error_html("Failed to prepare download. Please try again."));
285 }
286 return Err(e);
287 }
288
289 // Generate presigned download URL (1 hour)
290 let download_url = match s3.presign_download(&export_key, Some(3600)).await {
291 Ok(url) => url,
292 Err(e) => {
293 tracing::error!(error = ?e, "Failed to generate presigned URL for content export");
294 if is_htmx {
295 return Ok(export_error_html("Export created but download link failed. Please try again."));
296 }
297 return Err(e);
298 }
299 };
300
301 if is_htmx {
302 return Ok(ExportContentReadyTemplate { download_url }.into_response());
303 }
304
305 // Direct API call: redirect to presigned URL
306 Response::builder()
307 .status(303)
308 .header("Location", &download_url)
309 .body("".into())
310 .context("build export redirect response")
311 }
312
/// Extract the file extension from an S3 key
/// (e.g. `"user/item/audio/track.mp3"` -> `"mp3"`).
///
/// Only the final `/`-separated segment is inspected, so a dot in a directory
/// name is never mistaken for an extension separator. Returns `"bin"` when the
/// final segment has no dot or ends with one, so callers always get a usable,
/// slash-free extension.
fn extension_from_key(key: &str) -> &str {
    let leaf = key.rsplit('/').next().unwrap_or(key);
    match leaf.rsplit_once('.') {
        Some((_, ext)) if !ext.is_empty() => ext,
        _ => "bin",
    }
}

/// Sanitize a title for use as a filename in the ZIP archive.
///
/// Every character that is not alphanumeric, `-`, `_`, or a space becomes `_`;
/// surrounding whitespace is then trimmed. If nothing is left (empty or
/// whitespace-only title) the result is `"untitled"`, so archive paths never
/// contain an empty component.
fn sanitize_filename(name: &str) -> String {
    let mapped: String = name
        .chars()
        .map(|c| {
            if c.is_alphanumeric() || matches!(c, '-' | '_' | ' ') {
                c
            } else {
                '_'
            }
        })
        .collect();
    match mapped.trim() {
        "" => "untitled".to_string(),
        trimmed => trimmed.to_string(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn extension_from_key_mp3() {
        assert_eq!(extension_from_key("user/item/audio/track.mp3"), "mp3");
    }

    #[test]
    fn extension_from_key_nested_path() {
        assert_eq!(extension_from_key("a/b/c/file.tar.gz"), "gz");
    }

    #[test]
    fn extension_from_key_no_dot_returns_bin() {
        // No dot in the final segment: fall back instead of leaking the key.
        assert_eq!(extension_from_key("user/item/audio/noext"), "bin");
    }

    #[test]
    fn extension_from_key_dot_in_directory_only() {
        // A dot in a directory name is not an extension separator.
        assert_eq!(extension_from_key("user/item.v2/audio/noext"), "bin");
    }

    #[test]
    fn extension_from_key_empty_returns_bin() {
        assert_eq!(extension_from_key(""), "bin");
    }

    #[test]
    fn extension_from_key_dot_only() {
        // A trailing dot carries no extension.
        assert_eq!(extension_from_key("file."), "bin");
    }

    #[test]
    fn sanitize_filename_passthrough() {
        assert_eq!(sanitize_filename("My Track"), "My Track");
    }

    #[test]
    fn sanitize_filename_special_chars() {
        assert_eq!(sanitize_filename("hello/world:2"), "hello_world_2");
    }

    #[test]
    fn sanitize_filename_preserves_hyphens_underscores() {
        assert_eq!(sanitize_filename("my-file_name"), "my-file_name");
    }

    #[test]
    fn sanitize_filename_trims_whitespace() {
        assert_eq!(sanitize_filename(" padded "), "padded");
    }

    #[test]
    fn sanitize_filename_empty() {
        assert_eq!(sanitize_filename(""), "untitled");
    }

    #[test]
    fn sanitize_filename_whitespace_only() {
        assert_eq!(sanitize_filename("   "), "untitled");
    }

    #[test]
    fn sanitize_filename_all_special() {
        assert_eq!(sanitize_filename("@#$%"), "____");
    }
}
388