$ cat node-template.py
Video Creation From Speech
// Generates a talking-head video by animating a portrait image driven by an audio clip. Uses the SONIC model via a native GPU service with face detection, Whisper audio conditioning, dual guidance, and RIFE frame interpolation.
Process
Video
template.py
"""Node template: generate a talking-head video from a portrait image and speech audio.

The script reads an execution request as JSON on stdin, starts the
``native-video-speech-service`` Docker container, posts the image and audio
to its ``/generate`` endpoint, writes the returned video to ``OUTPUT_DIR``
and prints a JSON result on stdout.  Diagnostics and errors go to stderr.
The container is always removed before the script exits.
"""
import json
import os
import subprocess
import sys
import time
import traceback

try:
    import requests
except ImportError:
    # The execution environment may not ship requests; install it on demand.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests

NATIVE_VIDEO_SPEECH_SERVICE_URL = os.getenv(
    "NATIVE_VIDEO_SPEECH_SERVICE_URL", "http://native-video-speech-service:8106"
)
_EMBLEMA_VERSION = os.getenv("EMBLEMA_VERSION", "dev")
NATIVE_VIDEO_SPEECH_SERVICE_IMAGE = os.getenv(
    "NATIVE_VIDEO_SPEECH_SERVICE_IMAGE",
    f"emblema/native-video-speech-service:{_EMBLEMA_VERSION}",
)
HF_CACHE_HOST_PATH = os.getenv("HF_CACHE_HOST_PATH", "/root/.cache/huggingface")
CONTAINER_NAME = "native-video-speech-service"
INPUT_DIR = "/data/input"
OUTPUT_DIR = "/data/output"

# Timing budgets, in seconds.
_HEALTH_TIMEOUT_S = 360      # total time allowed for the service to report healthy
_HEALTH_INTERVAL_S = 3       # pause between two health probes
_HEALTH_PROBE_TIMEOUT_S = 5  # timeout of a single health probe
_GENERATE_TIMEOUT_S = 2520   # timeout passed to the /generate request


def _wait_until_healthy(health_url, timeout=_HEALTH_TIMEOUT_S, interval=_HEALTH_INTERVAL_S):
    """Poll ``health_url`` until it answers HTTP 200.

    Raises:
        RuntimeError: if no 200 response is seen within ``timeout`` seconds.
    """
    started = time.monotonic()
    deadline = started + timeout
    # A wall-clock deadline also counts the time spent inside each probe,
    # so the overall wait cannot exceed the budget by much.
    while time.monotonic() < deadline:
        try:
            probe = requests.get(health_url, timeout=_HEALTH_PROBE_TIMEOUT_S)
            if probe.status_code == 200:
                waited = time.monotonic() - started
                print(f"Container healthy (waited {waited:.0f}s).", file=sys.stderr)
                return
        except requests.RequestException:
            # Connection refused, probe timeout, etc.: the service is simply
            # not ready yet, keep polling until the deadline.
            pass
        time.sleep(interval)

    raise RuntimeError(f"Container did not become healthy within {timeout}s")


def start_container():
    """Create and start native-video-speech-service, removing any stale container first.

    Blocks until the service's ``/health`` endpoint returns 200.

    Raises:
        RuntimeError: if ``docker run`` fails or the service does not become
            healthy in time.
    """
    # Best-effort removal of a leftover container with the same name.
    subprocess.run(
        ["docker", "rm", "-f", CONTAINER_NAME],
        capture_output=True, text=True
    )

    print(f"Creating container {CONTAINER_NAME}...", file=sys.stderr)
    run_cmd = [
        "docker", "run", "-d",
        "--name", CONTAINER_NAME,
        "--network", "emblema",
        "--gpus", "all",
        "-e", "PORT=8106",
        "-e", "DEVICE=cuda",
        # No "=value": the docker CLI copies HF_TOKEN from its own environment,
        # which keeps the secret out of the process argument list.
        "-e", "HF_TOKEN",
        "-e", f"SONIC_REPO_ID={os.getenv('SONIC_REPO_ID', 'LeonJoe13/Sonic')}",
        "-e", f"ENABLE_RIFE={os.getenv('ENABLE_RIFE', 'true')}",
        "-v", f"{HF_CACHE_HOST_PATH}:/root/.cache/huggingface",
        NATIVE_VIDEO_SPEECH_SERVICE_IMAGE,
    ]
    docker_env = dict(os.environ, HF_TOKEN=os.getenv("HUGGINGFACE_TOKEN", ""))
    result = subprocess.run(run_cmd, capture_output=True, text=True, env=docker_env)
    if result.returncode != 0:
        print(f"docker run failed (exit {result.returncode}): {result.stderr}", file=sys.stderr)
        raise RuntimeError(f"Failed to start container: {result.stderr}")

    _wait_until_healthy(f"{NATIVE_VIDEO_SPEECH_SERVICE_URL}/health")


def stop_container():
    """Remove the container (best effort; never raises)."""
    try:
        result = subprocess.run(
            ["docker", "rm", "-f", CONTAINER_NAME],
            capture_output=True, text=True, timeout=30
        )
    except Exception as e:
        # Cleanup runs from a ``finally`` block; it must not mask the real error.
        print(f"Warning: failed to remove container: {e}", file=sys.stderr)
        return

    if result.returncode == 0:
        print(f"Container {CONTAINER_NAME} removed.", file=sys.stderr)
    else:
        print(
            f"Warning: failed to remove container: {result.stderr.strip()}",
            file=sys.stderr,
        )


def main():
    """Run one generation request read from stdin.

    Reads ``{"inputs": {...}}`` from stdin; ``image`` and ``audio`` are file
    names relative to ``INPUT_DIR`` and are required.  On success prints
    ``{"video": "speech_video.mp4"}`` to stdout.  On any failure prints a
    JSON object with ``error``, ``errorType`` and ``traceback`` to stderr
    and exits with status 1.
    """
    try:
        execution_input = json.loads(sys.stdin.read())
        # Treat a missing or null "inputs" entry as empty.
        inputs = execution_input.get("inputs") or {}

        image = inputs.get("image", "")
        audio = inputs.get("audio", "")
        resolution = inputs.get("resolution", "768x512")
        min_resolution = inputs.get("min_resolution", 512)
        dynamic_scale = inputs.get("dynamic_scale", 1.0)
        crop = inputs.get("crop", False)
        seed = inputs.get("seed", 72589)
        inference_steps = inputs.get("inference_steps", 25)
        enable_rife = inputs.get("enable_rife", True)

        if not image:
            raise ValueError("Input image is required")
        if not audio:
            raise ValueError("Input audio is required")

        image_path = os.path.join(INPUT_DIR, image)
        audio_path = os.path.join(INPUT_DIR, audio)

        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Input image not found: {image_path}")
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Input audio not found: {audio_path}")

        os.makedirs(OUTPUT_DIR, exist_ok=True)

        try:
            # Start-up happens inside the try so that a failed or timed-out
            # health check still removes the container it created.
            start_container()

            # Send image and audio to service.
            # NOTE(review): the part content types are fixed regardless of the
            # actual file format; confirm whether the service inspects them.
            with open(image_path, "rb") as img_f, open(audio_path, "rb") as aud_f:
                resp = requests.post(
                    f"{NATIVE_VIDEO_SPEECH_SERVICE_URL}/generate",
                    files={
                        "image": (os.path.basename(image_path), img_f, "image/png"),
                        "audio": (os.path.basename(audio_path), aud_f, "audio/wav"),
                    },
                    # Form fields are sent as strings; booleans are lower-cased.
                    data={
                        "resolution": str(resolution),
                        "min_resolution": str(min_resolution),
                        "dynamic_scale": str(dynamic_scale),
                        "crop": str(crop).lower(),
                        "seed": str(seed),
                        "inference_steps": str(inference_steps),
                        "enable_rife": str(enable_rife).lower(),
                    },
                    timeout=_GENERATE_TIMEOUT_S,
                )

            if resp.status_code != 200:
                # Prefer the structured error body, fall back to raw text.
                try:
                    error_detail = resp.json()
                except Exception:
                    error_detail = resp.text
                raise RuntimeError(
                    f"Video speech service returned {resp.status_code}: {error_detail}"
                )

            # Save result
            out_filename = "speech_video.mp4"
            out_path = os.path.join(OUTPUT_DIR, out_filename)
            with open(out_path, "wb") as f:
                f.write(resp.content)

            inference_time = resp.headers.get("X-Inference-Time-Ms", "unknown")
            print(f"Video generated: time={inference_time}ms", file=sys.stderr)

            output = {
                "video": out_filename,
            }
            print(json.dumps(output, indent=2))

        finally:
            stop_container()

    except Exception as e:
        # Top-level boundary: report every failure as JSON on stderr.
        error_output = {
            "error": str(e),
            "errorType": type(e).__name__,
            "traceback": traceback.format_exc(),
        }
        print(json.dumps(error_output), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()