liuyang committed · a4ab88e
Parent(s): a095ed4

Add AudioJob integration to app.py with UI for running audio jobs and handling manifests. Updated requirements.txt to include webrtcvad and boto3.

Files changed:
- app.py +58 -0
- audiojob.py +1016 -0
- requirements.txt +3 -1
app.py
CHANGED
@@ -2,6 +2,11 @@ import gradio as gr
 import requests
 import tempfile
 import os
+import json
+import traceback
+
+# AudioJob integration
+from audiojob import AudioJobRunner, LocalStorageAdapter
 from pydub import AudioSegment
 from typing import Optional, Tuple
 import logging
@@ -240,6 +245,59 @@ with gr.Blocks(title="Audio Editor", theme=gr.themes.Soft()) as demo:
         outputs=[audio_output, status_output]
     )
 
+    with gr.Tab("AudioJob Runner"):
+        gr.Markdown("### AudioJob: preprocess -> split (inspect manifest)")
+        with gr.Row():
+            with gr.Column():
+                aj_source_input = gr.Textbox(
+                    label="Source URI",
+                    placeholder="e.g. /abs/path/to/file.wav or s3://bucket/key",
+                    info="Source URI for AudioJobRunner"
+                )
+                aj_manifest_input = gr.Textbox(
+                    label="Manifest JSON (optional)",
+                    placeholder="Paste existing manifest JSON to resume (optional)",
+                    lines=10
+                )
+                aj_s3_prefix = gr.Textbox(
+                    label="S3 Prefix",
+                    placeholder="Optional prefix for uploaded working copies (e.g. jobs/)",
+                    info="Uploaded keys will be prefixed with this value",
+                )
+                aj_run_button = gr.Button("Run AudioJob", variant="primary")
+            with gr.Column():
+                aj_output = gr.Textbox(label="AudioJob Output (manifest)", lines=30, interactive=False)
+
+    def run_audiojob_ui(source_uri: str, manifest_json: str, s3_prefix: str) -> str:
+        try:
+            manifest = None
+            if manifest_json and manifest_json.strip():
+                manifest = json.loads(manifest_json)
+
+            work_root = tempfile.mkdtemp(prefix="audiojob_")
+            storage = LocalStorageAdapter()
+            # allow presets from top-level presets if desired; using defaults here
+            runner = AudioJobRunner(
+                manifest=manifest,
+                source_uri=None if manifest else source_uri,
+                work_root=work_root,
+                storage=storage,
+                presets={
+                    # Read bucket and endpoint from environment where possible
+                    "s3_bucket": os.environ.get("S3_BUCKET"),
+                    "s3_region": "auto",
+                    "s3_prefix": s3_prefix or "",
+                    "s3_endpoint": os.environ.get("S3_ENDPOINT", "")
+                }
+            )
+            out_manifest = runner.run_until_split()
+            return json.dumps(out_manifest, ensure_ascii=False, indent=2)
+        except Exception as e:
+            tb = traceback.format_exc()
+            return f"Error: {e}\n\n{tb}"
+
+    aj_run_button.click(fn=run_audiojob_ui, inputs=[aj_source_input, aj_manifest_input, aj_s3_prefix], outputs=[aj_output])
+
 # Launch the app
 if __name__ == "__main__":
     demo.launch()
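For reference, the same flow that the new tab drives can be exercised from a plain Python session. The following is a minimal sketch mirroring run_audiojob_ui above, assuming a local WAV file and no S3 configuration (the /tmp/example.wav path and the chunk_target_ms override are only illustrative):

import json
import tempfile

from audiojob import AudioJobRunner, LocalStorageAdapter

# New job from a local file; work_root holds the working copies and the split plan
runner = AudioJobRunner(
    manifest=None,
    source_uri="/tmp/example.wav",              # hypothetical input path
    work_root=tempfile.mkdtemp(prefix="audiojob_"),
    storage=LocalStorageAdapter(),
    presets={"chunk_target_ms": 60000},         # same override style as the UI/CLI
)
manifest = runner.run_until_split()
print(json.dumps(manifest["stages"]["split"]["plan"], indent=2))

The manifest returned here is the same JSON the Gradio textbox displays, so it can be saved and pasted back into the "Manifest JSON (optional)" field to resume a job.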
audiojob.py
ADDED
@@ -0,0 +1,1016 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Audio preprocess runner v2 (until split), checkpointable & resumable.

Key upgrades:
- Robust subprocess with retries/backoff/timeout
- Content-addressed cache for preprocess outputs
- Unique temp files + cleanup
- Explicit stream mapping (-map 0:a:0)
- Filter availability detection with graceful fallbacks
- VAD streams PCM from ffmpeg (no giant temp WAV), with progress updates
- Split plan exposes per-channel source_uris (for virtual slicing)
- Stage error breadcrumbs + atomic manifest rev bumps
- Pluggable StorageAdapter (LocalStorageAdapter provided)

Author: you
"""

import os
import re
import io
import sys
import json
import math
import time
import shutil
import hashlib
import tempfile
import datetime as dt
import subprocess
import uuid
from typing import Optional, Dict, Any, List, Tuple, BinaryIO

# ============================================================
# Storage Adapters
# ============================================================

class StorageAdapter:
    """Abstract interface for reading/writing blobs and metadata."""
    def exists(self, uri: str) -> bool: raise NotImplementedError
    def open_read(self, uri: str) -> BinaryIO: raise NotImplementedError
    def open_write(self, uri: str) -> BinaryIO: raise NotImplementedError
    def save_json(self, uri: str, obj: dict) -> None:
        with self.open_write(uri) as f:
            f.write(json.dumps(obj, ensure_ascii=False, indent=2).encode("utf-8"))
    def load_json(self, uri: str) -> dict:
        with self.open_read(uri) as f:
            return json.loads(f.read().decode("utf-8"))
    def stat(self, uri: str) -> Dict[str, Any]:
        """Return {'bytes': int|None, 'sha256': str|None, 'etag': str|None} where possible."""
        return {"bytes": None, "sha256": None, "etag": None}
    def presign(self, uri: str, method: str = "GET", ttl: int = 3600) -> str:
        """Return a URL suitable for HTTP reads/writes. Local adapter may return a file:// path."""
        return uri

class LocalStorageAdapter(StorageAdapter):
    """Treats absolute paths as 'uris' (file system)."""
    def exists(self, uri: str) -> bool:
        return os.path.exists(uri)
    def open_read(self, uri: str) -> BinaryIO:
        return open(uri, "rb")
    def open_write(self, uri: str) -> BinaryIO:
        os.makedirs(os.path.dirname(uri), exist_ok=True)
        return open(uri, "wb")
    def stat(self, uri: str) -> Dict[str, Any]:
        if not os.path.exists(uri):
            return {"bytes": None, "sha256": None, "etag": None}
        st = os.stat(uri)
        return {"bytes": st.st_size, "sha256": None, "etag": None}
    def presign(self, uri: str, method: str = "GET", ttl: int = 3600) -> str:
        return uri  # consumers should handle file paths

# S3/R2 adapter (requires boto3):
class S3LikeStorageAdapter(StorageAdapter):
    """S3-like storage adapter using boto3. Exposes simple operations for
    checking existence, reading, writing (via temp files) and presigning URLs.

    Usage:
        adapter = S3LikeStorageAdapter(bucket="my-bucket", region_name="us-east-1")
        adapter.upload_file(local_path, key)
        url = adapter.presign(key, "GET", ttl=3600)
    """
    def __init__(self, bucket: str, region_name: Optional[str] = None,
                 aws_access_key_id: Optional[str] = None,
                 aws_secret_access_key: Optional[str] = None,
                 aws_session_token: Optional[str] = None,
                 endpoint_url: Optional[str] = None):
        try:
            import boto3
        except Exception:
            raise RuntimeError("boto3 is required for S3LikeStorageAdapter but is not installed")
        session_kwargs = {}
        if aws_access_key_id and aws_secret_access_key:
            session_kwargs.update({
                "aws_access_key_id": aws_access_key_id,
                "aws_secret_access_key": aws_secret_access_key,
            })
        if aws_session_token:
            session_kwargs["aws_session_token"] = aws_session_token

        session = boto3.session.Session(**session_kwargs)
        client_kwargs = {"region_name": region_name}
        if endpoint_url:
            client_kwargs["endpoint_url"] = endpoint_url
        self.s3 = session.client("s3", **client_kwargs)
        self.bucket = bucket

    def exists(self, key: str) -> bool:
        try:
            self.s3.head_object(Bucket=self.bucket, Key=key)
            return True
        except Exception:
            return False

    def open_read(self, key: str) -> BinaryIO:
        # Download into memory (caller should avoid very large files via this API)
        obj = self.s3.get_object(Bucket=self.bucket, Key=key)
        body = obj["Body"].read()
        return io.BytesIO(body)

    def open_write(self, key: str) -> BinaryIO:
        # Provide a temp file that will be uploaded on close
        tmp = tempfile.NamedTemporaryFile(delete=False)

        class _S3Writer(io.BufferedWriter):
            def __init__(self, tmp_file_path: str, outer: "S3LikeStorageAdapter", key: str):
                self._tmp_path = tmp_file_path
                self._outer = outer
                self._key = key
                # unbuffered raw handle so BufferedWriter wraps a raw file object
                f = open(tmp_file_path, "r+b", buffering=0)
                super().__init__(f)

            def close(self):
                try:
                    super().close()
                finally:
                    # Upload using boto3's upload_file which handles multipart for large files
                    try:
                        self._outer.s3.upload_file(self._tmp_path, self._outer.bucket, self._key)
                    finally:
                        try:
                            os.remove(self._tmp_path)
                        except Exception:
                            pass

        tmp_path = tmp.name
        tmp.close()
        return _S3Writer(tmp_path, self, key)

    def stat(self, key: str) -> Dict[str, Any]:
        try:
            r = self.s3.head_object(Bucket=self.bucket, Key=key)
            return {"bytes": int(r.get("ContentLength", 0)), "sha256": None, "etag": r.get("ETag")}
        except Exception:
            return {"bytes": None, "sha256": None, "etag": None}

    def presign(self, key: str, method: str = "GET", ttl: int = 3600) -> str:
        # Return a presigned URL for the given key
        params = {"Bucket": self.bucket, "Key": key}
        http_method = method.upper()
        return self.s3.generate_presigned_url(
            ClientMethod="get_object" if http_method == "GET" else "put_object",
            Params=params,
            ExpiresIn=int(ttl),
        )

    def upload_file(self, local_path: str, key: str) -> str:
        """Upload a local file to the S3 bucket under `key`.
        Uses boto3's managed uploader which supports multipart uploads for large files.
        Returns the uploaded key on success.
        """
        self.s3.upload_file(local_path, self.bucket, key)
        return key

# ============================================================
# Utilities
# ============================================================

def utc_now_iso() -> str:
    return dt.datetime.utcnow().replace(microsecond=0).isoformat() + "Z"

def sha256_file(path: str, chunk_size: int = 1024 * 1024) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

def clamp(v: float, lo: float, hi: float) -> float:
    return max(lo, min(hi, v))

def sec_to_hms(seconds: float) -> str:
    s = max(0.0, float(seconds))
    h = int(s // 3600)
    m = int((s % 3600) // 60)
    ss = s - h * 3600 - m * 60
    return f"{h:02d}:{m:02d}:{ss:06.3f}"

def float_or_none(x: Any) -> Optional[float]:
    try: return float(x)
    except Exception: return None

# Robust subprocess with retries/backoff/timeout
def run(cmd: List[str], timeout: Optional[int] = None) -> Tuple[int, str, str]:
    proc = subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
    )
    try:
        out, err = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        out, err = proc.communicate()
        return 124, out, err + "\nTIMEOUT"
    return proc.returncode, out, err

def run_with_retry(cmd: List[str], retries: int = 3, timeout: Optional[int] = None, backoff: float = 1.5) -> str:
    last = None
    for i in range(retries):
        code, out, err = run(cmd, timeout)
        if code == 0:
            return out
        last = (code, err)
        time.sleep(backoff ** i)
    raise RuntimeError(f"Command failed after {retries} attempts: {' '.join(cmd)}\n{last}")

# ============================================================
# Defaults / presets
# ============================================================

DEFAULT_PRESETS = {
    "materialize_chunks": False,           # virtual slicing by default
    "sample_rate_target": 16000,
    "container_target": "flac",
    "channel_policy": "auto",              # auto | split | downmix | keep
    "normalize": "light",                  # none | light | r128
    "denoise": "auto",                     # none | light | auto
    "chunk_policy": "vad_fallback_fixed",
    "chunk_target_ms": 1800000,
    "overlap_ms": 300,
    "vad_aggressiveness": 2,               # 0..3 if webrtcvad available
    "highpass_hz": 60,
    "max_gain_db": 6.0,
    "min_mean_dbfs_for_gain": -30.0,
    "stereo_side_mid_threshold_db": 20.0,  # side <= mid - 20 dB => mono OK
    "ff_timeout_sec": 600,                 # per ffmpeg/ffprobe call
    "ff_retries": 3,
    # Optional S3 uploader settings (for working copy uploads only)
    "s3_bucket": None,
    "s3_region": None,
    "s3_prefix": "",
    "s3_endpoint": None,
}

# ============================================================
# Main runner
# ============================================================

class AudioJobRunner:
    """
    Drives the workflow until 'split' is complete (transcription not included).
    Now with retries, content-addressed cache, streaming VAD, and StorageAdapter.

    Usage:
        storage = LocalStorageAdapter()
        runner = AudioJobRunner(
            manifest=None,
            source_uri="/abs/path/to/audio.wav",  # or r2://bucket/key if your adapter supports it
            work_root="/tmp/jobwork",
            storage=storage,
            presets={"chunk_target_ms": 45000}
        )
        manifest = runner.run_until_split()
    """

    def __init__(
        self,
        manifest: Optional[Dict[str, Any]],
        source_uri: Optional[str],  # should always be a URL; upstream should convert the file key to a URL with the proper domain
        work_root: str,
        storage: StorageAdapter,
        presets: Optional[Dict[str, Any]] = None,
    ):
        self.storage = storage
        self.work_root = os.path.abspath(work_root)
        os.makedirs(self.work_root, exist_ok=True)

        self.presets = dict(DEFAULT_PRESETS)
        if presets:
            self.presets.update(presets)

        # Detect tools & filters
        self.tool_versions = self._detect_tool_versions()
        self.filter_caps = self._detect_filter_caps()

        # Initialize or load manifest
        if manifest is None:
            if not source_uri:
                raise ValueError("source_uri is required for a new job.")
            self.manifest = self._init_manifest(source_uri)
        else:
            self.manifest = manifest
            self.manifest.setdefault("tool_versions", self.tool_versions)

        self.manifest.setdefault("version", "2.0")
        self.manifest.setdefault("rev", 0)
        self._touch_updated()

    # --------------------------------------------------------
    # Public API
    # --------------------------------------------------------

    def run_until_split(self) -> Dict[str, Any]:
        try:
            if self._stage_status("probe") != "done":
                self._run_probe()

            if self._stage_status("preprocess") != "done":
                self._run_preprocess()

            if self._stage_status("split") != "done":
                self._run_split_plan()

            return self.manifest

        except Exception as e:
            # Leave breadcrumb on the current running stage if any
            for stage in ("split", "preprocess", "probe"):
                if self._stage_status(stage) == "running":
                    self._set_stage(stage, "failed", 0.0, {"last_error": str(e), "ended_at": utc_now_iso()})
                    break
            raise

    # --------------------------------------------------------
    # Manifest helpers
    # --------------------------------------------------------

    def _init_manifest(self, source_uri: str) -> Dict[str, Any]:
        job_id = str(uuid.uuid4())
        m = {
            "version": "2.0",
            "rev": 0,
            "job_id": job_id,
            "created_at": utc_now_iso(),
            "updated_at": utc_now_iso(),
            "source": {
                "uri": source_uri,
                "sha256": None,
                "etag": None,
                "bytes": None,
                "container": None,
                "codec": None,
                "duration_ms": None,
                "sample_rate": None,
                "channels": None,
            },
            "tool_versions": self.tool_versions,
            "policy": dict(self.presets),
            "stages": {
                "probe": {"status": "pending", "progress": 0.0},
                "preprocess": {"status": "pending", "progress": 0.0},
                "split": {"status": "pending", "progress": 0.0},
                "transcribe": {"status": "pending", "progress": 0.0},
            },
            "stitch": {"status": "pending", "progress": 0.0},
            "outputs": {
                "transcript_uri": None,
                "srt_uri": None,
                "vtt_uri": None,
                "txt_uri": None,
                "qc": {"passed": None, "issues": []},
            },
        }
        return m

    def _touch_updated(self):
        self.manifest["updated_at"] = utc_now_iso()
        self.manifest["rev"] = int(self.manifest.get("rev", 0)) + 1

    def _stage_status(self, name: str) -> str:
        return self.manifest.get("stages", {}).get(name, {}).get("status", "pending")

    def _set_stage(self, name: str, status: str, progress: float, extra: Dict[str, Any] = None):
        st = self.manifest["stages"].setdefault(name, {})
        st["status"] = status
        st["progress"] = clamp(progress, 0.0, 1.0)
        if extra:
            st.update(extra)
        self._touch_updated()

    # --------------------------------------------------------
    # Tool/Filter detection
    # --------------------------------------------------------

    def _detect_tool_versions(self) -> Dict[str, str]:
        vers = {}
        for tool in ("ffmpeg", "ffprobe"):
            try:
                out = run_with_retry([tool, "-version"], retries=1, timeout=10)
                first = out.splitlines()[0]
                m = re.search(r"version\s+([^\s]+)", first)
                vers[tool] = m.group(1) if m else first
            except Exception:
                vers[tool] = "unknown"
        try:
            import webrtcvad  # noqa
            vers["webrtcvad"] = "installed"
        except Exception:
            vers["webrtcvad"] = "missing"
        return vers

    def _detect_filter_caps(self) -> Dict[str, bool]:
        caps = {"arnndn": False, "adeclip": False, "highpass": True}
        try:
            out = run_with_retry(["ffmpeg", "-hide_banner", "-filters"], retries=1, timeout=10)
            for name in list(caps.keys()):
                if f" {name} " in out:
                    caps[name] = True
        except Exception:
            pass
        return caps

    # --------------------------------------------------------
    # Optional S3 uploader for working copies only
    # --------------------------------------------------------

    def _get_s3_uploader(self) -> Optional[S3LikeStorageAdapter]:
        bucket = self.presets.get("s3_bucket")
        if not bucket:
            return None
        region = self.presets.get("s3_region")
        # "s3_prefix" stays in presets; it is applied when keys are generated in _maybe_upload_working_to_s3
        endpoint = self.presets.get("s3_endpoint")
        try:
            return S3LikeStorageAdapter(bucket=bucket, region_name=region, endpoint_url=endpoint)
        except Exception:
            return None

    def _maybe_upload_working_to_s3(self, working: Dict[str, Any], local_map: Dict[str, str]) -> None:
        uploader = self._get_s3_uploader()
        if not uploader:
            return
        prefix = str(self.presets.get("s3_prefix", "")).strip()
        jobid = self.manifest.get("job_id", "")
        for chan, local_path in local_map.items():
            ext = os.path.splitext(local_path)[1].lstrip(".") or working.get("format", "flac")
            key = f"{jobid}_{'main' if chan == 'mono' else ('ch1' if chan == 'L' else 'ch2')}.{ext}"
            if prefix:
                key = os.path.join(prefix, key)
            try:
                uploader.upload_file(local_path, key)
                working["uris_remote"][chan] = key
            except Exception:
                # best-effort; continue without failing the job
                pass

    # --------------------------------------------------------
    # Stage: Probe
    # --------------------------------------------------------

    def _run_probe(self):
        self._set_stage("probe", "running", 0.05, {"started_at": utc_now_iso()})

        src = self.manifest["source"]["uri"]

        # Try to stat source (size/etag); SHA for local files
        st = self.storage.stat(src)
        self.manifest["source"]["bytes"] = st.get("bytes")
        self.manifest["source"]["etag"] = st.get("etag")

        if os.path.isabs(src) and os.path.isfile(src):
            try:
                self.manifest["source"]["sha256"] = sha256_file(src)
            except Exception:
                self.manifest["source"]["sha256"] = None

        info = self._ffprobe_streams(src)
        fmt = info.get("format", {})
        streams = info.get("streams", [])
        audio = next((s for s in streams if s.get("codec_type") == "audio"), {})

        self.manifest["source"].update({
            "container": fmt.get("format_name"),
            "codec": audio.get("codec_name"),
            "duration_ms": int(float(fmt.get("duration", 0)) * 1000) if fmt.get("duration") else None,
            "sample_rate": int(audio.get("sample_rate", 0)) if audio.get("sample_rate") else None,
            "channels": int(audio.get("channels", 0)) if audio.get("channels") else None,
        })
        self._set_stage("probe", "running", 0.6)

        # Stereo assessment (if 2ch)
        if self.manifest["source"]["channels"] == 2:
            stereo_metrics = self._stereo_metrics(src)
        else:
            stereo_metrics = {
                "rms_L": None, "rms_R": None,
                "mid_rms_db": None, "side_rms_db": None,
                "max_dbfs": None, "clipping_pct": None,
                "near_silent_channel": False, "corr": None,
                "recommended_mode": "mono" if self.manifest["source"]["channels"] == 1 else "as_is"
            }

        actions = self._decide_actions(stereo_metrics)
        self.manifest["stages"]["probe"].update({
            "metrics": {
                "rms_dbfs_L": stereo_metrics.get("rms_L"),
                "rms_dbfs_R": stereo_metrics.get("rms_R"),
                "max_dbfs": stereo_metrics.get("max_dbfs"),
                "clipping_pct": stereo_metrics.get("clipping_pct"),
                "stereo": stereo_metrics,
            },
            "actions": actions,
        })
        self._set_stage("probe", "done", 1.0, {"ended_at": utc_now_iso()})

    def _ffprobe_streams(self, uri: str) -> Dict[str, Any]:
        cmd = ["ffprobe", "-v", "error", "-select_streams", "a:0", "-show_streams",
               "-show_format", "-of", "json", uri]
        out = run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
        return json.loads(out)

    def _stereo_metrics(self, uri: str) -> Dict[str, Any]:
        # Unique temp files to avoid collisions
        base = os.path.join(tempfile.gettempdir(), f"stmetrics_{uuid.uuid4().hex}")
        L_txt, R_txt, MID_txt, SIDE_txt = [base + s for s in (".L.txt", ".R.txt", ".MID.txt", ".SIDE.txt")]

        try:
            # astats + mid/side: derive four mono streams (L, R, mid, side) from the first
            # audio stream and dump per-stream astats metadata to temp files via ametadata=print.
            cmd = [
                "ffmpeg", "-nostdin", "-hide_banner", "-v", "error",
                "-i", uri,
                "-filter_complex",
                (
                    "[0:a:0]asplit=4[l0][r0][m0][s0];"
                    "[l0]pan=mono|c0=c0,astats=metadata=1,ametadata=print:file={L}[lo];"
                    "[r0]pan=mono|c0=c1,astats=metadata=1,ametadata=print:file={R}[ro];"
                    "[m0]pan=mono|c0=0.5*c0+0.5*c1,astats=metadata=1,ametadata=print:file={MID}[mo];"
                    "[s0]pan=mono|c0=0.5*c0-0.5*c1,astats=metadata=1,ametadata=print:file={SIDE}[so]"
                ).format(L=L_txt, R=R_txt, MID=MID_txt, SIDE=SIDE_txt),
                "-map", "[lo]", "-map", "[ro]", "-map", "[mo]", "-map", "[so]",
                "-f", "null", "-"
            ]
            run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])

            # volumedetect reports through the log (stderr) at info level, so capture stderr
            rc, vd_out, vd_err = run([
                "ffmpeg", "-nostdin", "-hide_banner", "-v", "info",
                "-i", uri, "-map", "0:a:0", "-af", "volumedetect", "-f", "null", "-"
            ], timeout=self.presets["ff_timeout_sec"])
            vd = (vd_out or "") + (vd_err or "")

            def parse_overall_rms(txt_path: str) -> Optional[float]:
                if not os.path.exists(txt_path): return None
                with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
                    data = f.read()
                # ametadata=print writes keys like "lavfi.astats.Overall.RMS_level=-23.5";
                # the last occurrence carries the cumulative value for the whole stream
                m = re.findall(r"lavfi\.astats\.Overall\.RMS_level=([-\d\.]+)", data)
                return float(m[-1]) if m else None

            def parse_max_dbfs(vol_text: str) -> Optional[float]:
                m = re.findall(r"max_volume:\s*([-\d\.]+)\s*dB", vol_text)
                return float(m[-1]) if m else None

            def parse_clipping(vol_text: str) -> Optional[float]:
                # volumedetect prints a level histogram; samples in the 0 dB bucket indicate clipping
                m = re.findall(r"histogram_0db:\s*(\d+)", vol_text)
                return 100.0 if (m and int(m[-1]) > 0) else 0.0

            rms_L = parse_overall_rms(L_txt)
            rms_R = parse_overall_rms(R_txt)
            mid_rms = parse_overall_rms(MID_txt)
            side_rms = parse_overall_rms(SIDE_txt)

            # Decide near-silent channel
            near_silent = False
            if rms_L is not None and rms_R is not None:
                if (rms_L < -45.0 and (rms_R - rms_L) > 15.0) or (rms_R < -45.0 and (rms_L - rms_R) > 15.0):
                    near_silent = True

            # Recommended mode
            rec_mode = "split"
            thr = float(self.presets["stereo_side_mid_threshold_db"])
            if (mid_rms is not None and side_rms is not None and (side_rms <= (mid_rms - thr))) or near_silent:
                rec_mode = "mono"

            return {
                "rms_L": rms_L, "rms_R": rms_R,
                "mid_rms_db": mid_rms, "side_rms_db": side_rms,
                "max_dbfs": parse_max_dbfs(vd), "clipping_pct": parse_clipping(vd),
                "near_silent_channel": near_silent, "corr": None,
                "recommended_mode": rec_mode
            }
        finally:
            for p in (L_txt, R_txt, MID_txt, SIDE_txt):
                try:
                    if os.path.exists(p): os.remove(p)
                except Exception:
                    pass

    def _decide_actions(self, stereo_metrics: Dict[str, Any]) -> Dict[str, Any]:
        src_ch = self.manifest["source"]["channels"] or 1
        policy = self.manifest["policy"]

        # Channel policy
        if policy.get("channel_policy") == "auto":
            if src_ch == 1:
                ch_pol = "downmix"
            else:
                ch_pol = "split" if stereo_metrics.get("recommended_mode") == "split" else "downmix"
        else:
            ch_pol = policy.get("channel_policy")

        # Denoise: auto -> light if very quiet
        denoise = policy.get("denoise", "auto")
        if denoise == "auto":
            rms_L = stereo_metrics.get("rms_L")
            rms_R = stereo_metrics.get("rms_R")
            denoise_flag = bool((rms_L and rms_L < -35.0) or (rms_R and rms_R < -35.0))
            denoise = "light" if denoise_flag else "none"

        # Normalize
        normalize = policy.get("normalize", "light")

        return {
            "will_resample": True,
            "will_split_stereo": (ch_pol == "split"),
            "will_downmix": (ch_pol == "downmix"),
            "will_denoise": (denoise == "light" and self.filter_caps.get("arnndn", False)),
            "will_normalize": (normalize != "none"),
            "channel_policy": ch_pol,
            "normalize_mode": normalize,
            "denoise_mode": denoise if self.filter_caps.get("arnndn", False) else "none",
        }

    # --------------------------------------------------------
    # Stage: Preprocess (with content-addressed cache)
    # --------------------------------------------------------

    def _run_preprocess(self):
        self._set_stage("preprocess", "running", 0.05, {"started_at": utc_now_iso()})

        src = self.manifest["source"]["uri"]
        actions = self.manifest["stages"]["probe"]["actions"]
        policy = self.manifest["policy"]

        # Build filtergraph with soft-fallbacks
        filters = []
        if self.filter_caps.get("highpass", True):
            filters.append(f"highpass=f={int(policy.get('highpass_hz', 60))}")
        if self.filter_caps.get("adeclip", False):
            filters.append("adeclip")
        if actions["will_denoise"] and self.filter_caps.get("arnndn", False):
            filters.append("arnndn")

        # Gentle gain if needed
        metrics = self.manifest["stages"]["probe"].get("metrics", {})
        mean_db = None
        if metrics.get("stereo", {}).get("mid_rms_db") is not None:
            mean_db = metrics["stereo"]["mid_rms_db"]
        elif metrics.get("rms_dbfs_L") is not None and metrics.get("rms_dbfs_R") is not None:
            mean_db = (metrics["rms_dbfs_L"] + metrics["rms_dbfs_R"]) / 2.0
        if mean_db is not None and mean_db < float(self.presets["min_mean_dbfs_for_gain"]):
            target_boost = min(float(self.presets["max_gain_db"]),
                               abs(float(self.presets["min_mean_dbfs_for_gain"]) - mean_db))
            filters.append(f"volume={target_boost:.1f}dB")

        filtergraph = ",".join(filters) if filters else "anull"
        sr = int(policy["sample_rate_target"])
        target_container = policy["container_target"].lower()
        ch_policy = actions["channel_policy"]

        # Compute idempotency key BEFORE encoding (source hash/etag + params)
        idem_src = self.manifest["source"].get("sha256") or self.manifest["source"].get("etag") or self.manifest["source"]["uri"]
        idem_payload = json.dumps({
            "src": idem_src, "filter": filtergraph, "sr": sr,
            "fmt": target_container, "ch_policy": ch_policy,
            "tools": self.tool_versions
        }, sort_keys=True).encode("utf-8")
        idem_key = hashlib.sha256(idem_payload).hexdigest()

        # Content-addressed working dir
        base_dir = os.path.join(self.work_root, self.manifest["job_id"], idem_key)
        os.makedirs(base_dir, exist_ok=True)

        def out_path(name: str) -> str:
            return os.path.join(base_dir, f"{name}.{target_container}")

        # Note: Do not store local paths in manifest. Only store remote keys.
        working = {"format": target_container, "sample_rate": sr, "channel_map": [], "uris_remote": {}, "filtergraph": filtergraph}
        produced_any = False

        try:
            if ch_policy == "split" and (self.manifest["source"]["channels"] == 2):
                # L/R mono outputs: select each channel with pan, then apply the shared filtergraph
                outL, outR = out_path("ch1"), out_path("ch2")
                if not (self.storage.exists(outL) and self.storage.exists(outR)):
                    cmd = [
                        "ffmpeg", "-nostdin", "-hide_banner", "-y", "-v", "error",
                        "-i", self.storage.presign(src),
                        "-map", "0:a:0", "-af", f"pan=mono|c0=c0,{filtergraph}", "-ar", str(sr), outL,
                        "-map", "0:a:0", "-af", f"pan=mono|c0=c1,{filtergraph}", "-ar", str(sr), outR
                    ]
                    run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
                    produced_any = True
                working["channel_map"] = ["L", "R"]
                # Upload to S3/R2 if configured; keep local files but do not store local paths in manifest
                self._maybe_upload_working_to_s3(working, {"L": outL, "R": outR})

            else:
                # Single mono output
                outM = out_path("main")
                if not self.storage.exists(outM):
                    cmd = [
                        "ffmpeg", "-nostdin", "-hide_banner", "-y", "-v", "error",
                        "-i", self.storage.presign(src), "-map", "0:a:0",
                        "-ac", "1", "-ar", str(sr), "-af", filtergraph, outM
                    ]
                    run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
                    produced_any = True
                working["channel_map"] = ["mono"]
                self._maybe_upload_working_to_s3(working, {"mono": outM})

            self.manifest["stages"]["preprocess"].update({
                "idempotency_key": idem_key, "working": working, "ended_at": utc_now_iso()
            })
            self._set_stage("preprocess", "done", 1.0)

        except Exception as e:
            self._set_stage("preprocess", "failed", 0.0, {"last_error": str(e), "ended_at": utc_now_iso()})
            raise

    # --------------------------------------------------------
    # Stage: Split plan (virtual by default, VAD streaming)
    # --------------------------------------------------------

    def _run_split_plan(self):
        self._set_stage("split", "running", 0.05, {"started_at": utc_now_iso()})

        policy = self.manifest["policy"]
        chunk_target = int(policy["chunk_target_ms"])
        overlap = int(policy["overlap_ms"])
        materialize = bool(policy["materialize_chunks"])

        work = self.manifest["stages"]["preprocess"]["working"]
        channels = work["channel_map"]

        chunks: List[Dict[str, Any]] = []
        total_chunks = 0
        plan_source_uris = {}
        proc_source: Dict[str, str] = {}

        try:
            for chan in channels:
                # Decide source for processing: local file if it exists, else remote if available.
                # Build the expected local path based on the idempotent working dir.
                base_dir = os.path.join(self.work_root, self.manifest["job_id"], self.manifest["stages"]["preprocess"]["idempotency_key"])
                fname = "main" if chan == "mono" else ("ch1" if chan == "L" else "ch2")
                local_candidate = os.path.join(base_dir, f"{fname}.{work['format']}")
                remote_key = work.get("uris_remote", {}).get(chan)

                # Expose preferred source to downstream: presigned remote if available, else local path
                if remote_key:
                    try:
                        uploader = self._get_s3_uploader()
                        plan_source_uris[chan] = uploader.presign(remote_key, "GET") if uploader else local_candidate
                    except Exception:
                        plan_source_uris[chan] = local_candidate
                else:
                    plan_source_uris[chan] = local_candidate

                # Choose ffmpeg input: local if present, else presigned remote, else raw key (ffmpeg may support s3/http)
                if os.path.exists(local_candidate):
                    ffmpeg_src = local_candidate
                elif remote_key:
                    try:
                        uploader = self._get_s3_uploader()
                        ffmpeg_src = uploader.presign(remote_key, "GET") if uploader else remote_key
                    except Exception:
                        ffmpeg_src = remote_key
                else:
                    ffmpeg_src = local_candidate  # may not exist; will fail predictably

                # cache processing source for materialization stage
                proc_source[chan] = ffmpeg_src

                info = self._ffprobe_streams(ffmpeg_src)
                dur_ms = int(float(info.get("format", {}).get("duration", 0.0)) * 1000)

                # Build ranges via streaming VAD if requested/available
                ranges = self._build_chunks_vad_or_fixed_streaming(ffmpeg_src, dur_ms, chunk_target, overlap)
                for idx, (start_ms, dur) in enumerate(ranges):
                    chunks.append({"idx": idx if len(channels) == 1 else f"{idx}{chan}",
                                   "chan": chan, "start_ms": int(start_ms), "dur_ms": int(dur),
                                   "status": "queued"})
                total_chunks += len(ranges)
                # Rough progress update per channel
                self._set_stage("split", "running", clamp(0.05 + 0.4 * (total_chunks / max(1, total_chunks)), 0.05, 0.9))

            plan = {
                "mode": "materialized" if materialize else "virtual",
                "channels": channels,
                "source_uris": plan_source_uris,  # <--- expose per-channel sources
                "chunk_policy": policy["chunk_policy"],
                "chunk_target_ms": chunk_target,
                "overlap_ms": overlap,
                "total_chunks": total_chunks,
                "chunks": chunks[:2000] if total_chunks <= 2000 else [],  # avoid bloating manifest
            }

            if materialize:
                # Accurate-seek pattern for better boundaries:
                # - Fast seek before -i
                # - Fine seek after -i (optional) + atrim
                out_dir = os.path.join(self.work_root, self.manifest["job_id"], "chunks")
                os.makedirs(out_dir, exist_ok=True)
                for c in chunks:
                    st_s = c["start_ms"] / 1000.0
                    du_s = c["dur_ms"] / 1000.0
                    chan = c["chan"]
                    outp = os.path.join(out_dir, f"chunk_{c['idx']}.flac")
                    inp = proc_source.get(chan, None) or plan_source_uris.get(chan)
                    # fast seek near the start, then fine trim with atrim to be exact
                    cmd = [
                        "ffmpeg", "-nostdin", "-hide_banner", "-y", "-v", "error",
                        "-ss", sec_to_hms(max(0.0, st_s - 0.05)),  # fast seek slightly earlier
                        "-i", inp, "-map", "0:a:0",
                        "-ss", "0.05", "-t", f"{du_s:.3f}",
                        "-af", f"atrim=start=0:end={du_s:.3f}",
                        "-ac", "1", "-ar", str(self.presets["sample_rate_target"]),
                        outp
                    ]
                    run_with_retry(cmd, retries=self.presets["ff_retries"], timeout=self.presets["ff_timeout_sec"])
                    c["uri"] = outp
                    c["status"] = "ready"

            # Save plan
            self.manifest["stages"]["split"]["plan"] = plan
            self._set_stage("split", "done", 1.0, {"ended_at": utc_now_iso()})

            # Initialize downstream counters
            self.manifest["stages"]["transcribe"].update({
                "status": "pending",
                "progress": 0.0,
                "chunks": {"total": total_chunks, "done": 0, "running": 0, "failed": 0, "queued": total_chunks},
                "per_chunk": []
            })

        except Exception as e:
            self._set_stage("split", "failed", 0.0, {"last_error": str(e), "ended_at": utc_now_iso()})
            raise

    # --- VAD (streaming from ffmpeg to avoid big temp files) ---

    def _build_chunks_vad_or_fixed_streaming(self, src_uri: str, dur_ms: int, target_ms: int, overlap_ms: int) -> List[Tuple[int, int]]:
        use_vad = (self.tool_versions.get("webrtcvad") == "installed") and \
                  (self.manifest["policy"].get("chunk_policy", "").startswith("vad"))
        if not use_vad:
            return self._fixed_chunks(dur_ms, target_ms, overlap_ms)

        # Decode via ffmpeg to s16le PCM @16k mono on stdout, feed WebRTC VAD
        try:
            import webrtcvad
            vad = webrtcvad.Vad(int(self.presets["vad_aggressiveness"]))
            frame_ms = 30
            bytes_per_frame = int(16000 * 2 * frame_ms / 1000)

            # Start ffmpeg process
            cmd = [
                "ffmpeg", "-nostdin", "-hide_banner", "-v", "error",
                "-i", src_uri, "-map", "0:a:0", "-ac", "1", "-ar", "16000",
                "-f", "s16le", "-"  # raw PCM to stdout
            ]
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            speech_regions: List[Tuple[int, int]] = []
            in_speech = False
            seg_start = 0
            frames_read = 0
            last_progress_emit = time.time()

            while True:
                chunk = proc.stdout.read(bytes_per_frame)
                if chunk is None or len(chunk) == 0:
                    break
                if len(chunk) < bytes_per_frame:
                    break  # tail

                t_ms = frames_read * frame_ms
                frames_read += 1
                is_speech = vad.is_speech(chunk, 16000)
                if is_speech and not in_speech:
                    in_speech = True
                    seg_start = t_ms
                elif not is_speech and in_speech:
                    in_speech = False
                    speech_regions.append((seg_start, t_ms - seg_start))

                # emit progress occasionally (decode-based approximation)
                if time.time() - last_progress_emit > 0.5 and dur_ms:
                    prog = clamp(0.05 + 0.8 * (t_ms / dur_ms), 0.05, 0.95)
                    self._set_stage("split", "running", prog)
                    last_progress_emit = time.time()

            # drain/close the decoder before packing regions
            proc.stdout.close()
            proc.wait()

            # finalize region if still in speech
            if in_speech:
                speech_regions.append((seg_start, max(0, dur_ms - seg_start)))

            # Build chunks by packing islands to ~target_ms
            chunks: List[Tuple[int, int]] = []
            if not speech_regions:
                return self._fixed_chunks(dur_ms, target_ms, overlap_ms)

            cur_start = None
            cur_end = None
            max_len = target_ms + 500
            gap_allow = 300
            # Pack complete speech islands into chunks. We never split an
            # individual speech island; max_len is only used to decide when
            # to merge adjacent islands. If a single island exceeds max_len
            # it will remain intact in its own chunk.
            for s, d in speech_regions:
                e = s + d
                if cur_start is None:
                    cur_start, cur_end = s, e
                    continue
                # merge only if small gap and combined length stays within max_len
                if (s - cur_end) <= gap_allow and (e - cur_start) <= max_len:
                    cur_end = e
                else:
                    # finalize current chunk as the full span of islands
                    chunks.append((cur_start, cur_end - cur_start))
                    cur_start = s
                    cur_end = e

            # finalize last chunk
            if cur_start is not None and (cur_end - cur_start) > 250:
                chunks.append((cur_start, cur_end - cur_start))

            # Normalize with overlap shift but preserve complete islands.
            # We shift the start earlier by overlap_ms for non-first chunks but
            # keep the full island coverage in the duration.
            normalized: List[Tuple[int, int]] = []
            for i, (s, d) in enumerate(chunks):
                if i == 0:
                    normalized.append((max(0, s), d))
                else:
                    s2 = max(0, s - overlap_ms)
                    normalized.append((s2, (s + d) - s2))

            return self._cap_chunks(normalized, dur_ms)

        except Exception:
            # If anything fails in the VAD pipeline, fall back to fixed chunking
            return self._fixed_chunks(dur_ms, target_ms, overlap_ms)

    def _fixed_chunks(self, dur_ms: int, target_ms: int, overlap_ms: int) -> List[Tuple[int, int]]:
        chunks: List[Tuple[int, int]] = []
        if dur_ms <= 0: return chunks
        step = max(1, target_ms - overlap_ms)
        start = 0
        while start < dur_ms:
            length = min(target_ms, dur_ms - start)
            chunks.append((start, length))
            if length < target_ms: break
            start += step
        return chunks

    def _cap_chunks(self, chunks: List[Tuple[int, int]], dur_ms: int) -> List[Tuple[int, int]]:
        capped = []
        for s, d in chunks:
            s2 = clamp(s, 0, max(0, dur_ms - 1))
            d2 = clamp(d, 1, dur_ms - s2)
            capped.append((int(s2), int(d2)))
        return capped


# ------------------------------------------------------
# Example CLI (optional)
# ------------------------------------------------------
if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser(description="Audio preprocess runner v2 (until split).")
    ap.add_argument("source", help="Local path or URL/storage URI to audio")
    ap.add_argument("work_root", help="Working root directory")
    ap.add_argument("--manifest", help="Path to existing manifest.json to resume", default=None)
    ap.add_argument("--chunk_ms", type=int, default=60000)
    ap.add_argument("--overlap_ms", type=int, default=300)
    ap.add_argument("--materialize", action="store_true")
    args = ap.parse_args()

    presets = {
        "chunk_target_ms": args.chunk_ms,
        "overlap_ms": args.overlap_ms,
        "materialize_chunks": args.materialize
    }

    storage = LocalStorageAdapter()

    manifest = None
    if args.manifest and os.path.exists(args.manifest):
        with open(args.manifest, "r", encoding="utf-8") as f:
            manifest = json.load(f)

    runner = AudioJobRunner(
        manifest=manifest,
        source_uri=None if manifest else args.source,
        work_root=args.work_root,
        storage=storage,
        presets=presets
    )
    out_manifest = runner.run_until_split()

    out_path = os.path.join(args.work_root, "manifest.json")
    os.makedirs(args.work_root, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out_manifest, f, ensure_ascii=False, indent=2)
    print(f"Saved manifest -> {out_path}")
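Because every stage records its status in the manifest, a failed or interrupted job can be resumed by feeding the saved manifest back in, exactly as the CLI's --manifest flag does. A minimal sketch, assuming a manifest.json written to the same work_root by a previous run (the /tmp/jobwork path is only illustrative):

import json
import os

from audiojob import AudioJobRunner, LocalStorageAdapter

work_root = "/tmp/jobwork"  # assumed to match the original run
with open(os.path.join(work_root, "manifest.json"), "r", encoding="utf-8") as f:
    manifest = json.load(f)

# Stages already marked "done" are skipped; the first non-done stage re-runs
runner = AudioJobRunner(manifest=manifest, source_uri=None,
                        work_root=work_root, storage=LocalStorageAdapter())
manifest = runner.run_until_split()

The content-addressed working directory means a resumed preprocess also reuses any encoded working copies that are still on disk for the same source and parameters.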
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
 gradio>=5.39.0
 requests>=2.31.0
 pydub>=0.25.1
-ffmpeg-python>=0.2.0
+ffmpeg-python>=0.2.0
+webrtcvad
+boto3>=1.34.0
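webrtcvad and boto3 are imported lazily in audiojob.py (VAD chunking and the S3 adapter degrade gracefully when they are missing), and ffmpeg/ffprobe must be on PATH. A small pre-flight check along the lines of _detect_tool_versions, as a sketch:

import shutil

def preflight() -> dict:
    # external binaries required by the runner
    report = {tool: bool(shutil.which(tool)) for tool in ("ffmpeg", "ffprobe")}
    # optional Python dependencies added in this commit
    for mod in ("webrtcvad", "boto3"):
        try:
            __import__(mod)
            report[mod] = True
        except ImportError:
            report[mod] = False
    return report

print(preflight())  # e.g. {'ffmpeg': True, 'ffprobe': True, 'webrtcvad': True, 'boto3': True}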