$ cat node-template.py
Image Vision Legacy
// Analyzes images using Florence-2 computer vision. Generates detailed captions, OCR text extraction, and image descriptions. Output is text that can be connected to prompt inputs of other nodes.
Process
Image
template.py
"""ComfyUI node template: run a Florence-2 vision task on a single image.

Reads an execution request as JSON on stdin, uploads the referenced image to
ComfyUI, submits the Florence-2 workflow, polls until it finishes, and prints
``{"text": ...}`` as JSON on stdout. Any failure is reported as a JSON object
on stderr and the process exits with status 1.
"""

import copy
import json
import mimetypes
import os
import random
import sys
import time
import traceback

try:
    import requests
except ImportError:
    # Installed on first use by _get_requests(), so importing this module has
    # no side effects and an install failure reaches main()'s error handler.
    requests = None

# Base URL of the ComfyUI server. A trailing slash is stripped so the
# f"{COMFYUI_API_URL}/..." endpoints below never contain a double slash.
COMFYUI_API_URL = os.getenv("COMFYUI_API_URL", "http://192.168.1.39:8188").rstrip("/")
INPUT_DIR = "/data/input"

# Vision tasks accepted by main(). Kept as a list so the validation error
# message renders exactly as before.
VALID_TASKS = [
    "more_detailed_caption", "caption", "ocr",
    "ocr_with_region", "object_detection", "dense_region_caption",
]

# Workflow template in ComfyUI API format, keyed by node id. Values of the
# form ["<node id>", <index>] link an input to another node's output.
# build_workflow() deep-copies this and overrides node "7", the task and the
# seed, so the template itself is never mutated.
WORKFLOW = {
    "2": {
        "inputs": {
            "text_input": "",
            "task": "more_detailed_caption",
            "fill_mask": True,
            "keep_model_loaded": False,
            "max_new_tokens": 1024,
            "num_beams": 3,
            "do_sample": True,
            "output_mask_select": "",
            "seed": 732209118079946,
            "image": ["7", 0],
            "florence2_model": ["3", 0],
        },
        "class_type": "Florence2Run",
        "_meta": {"title": "Florence2Run"},
    },
    "3": {
        "inputs": {
            "model": "gokaygokay/Florence-2-Flux-Large",
            "precision": "fp16",
            "attention": "sdpa",
            "convert_to_safetensors": False,
        },
        "class_type": "DownloadAndLoadFlorence2Model",
        "_meta": {"title": "DownloadAndLoadFlorence2Model"},
    },
    # Placeholder loader; build_workflow() always replaces it with LoadImage.
    "7": {
        "inputs": {
            "directory": "",
            "image_load_cap": 0,
            "skip_first_images": 0,
            "select_every_nth": 1,
        },
        "class_type": "VHS_LoadImagesPath",
        "_meta": {"title": "Load Images (Path)"},
    },
    "6": {
        "inputs": {
            "text": ["2", 2],
        },
        "class_type": "ShowText|pysssss",
        "_meta": {"title": "Show Text"},
    },
}


def _get_requests():
    """Return the ``requests`` module, installing it with pip on first use.

    Raises:
        subprocess.CalledProcessError: If the pip install fails.
    """
    global requests
    if requests is None:
        import importlib
        import subprocess

        # pip writes progress to stdout; route it to stderr so stdout carries
        # only the JSON result printed by main().
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "requests"],
            stdout=sys.stderr,
        )
        importlib.invalidate_caches()
        requests = importlib.import_module("requests")
    return requests


def upload_image_to_comfyui(local_path: str) -> str:
    """Upload a local image to ComfyUI and return the uploaded filename.

    Args:
        local_path: Path of the image file on the local filesystem.

    Returns:
        The ``name`` field of the upload response.

    Raises:
        OSError: If the file cannot be opened.
        requests.HTTPError: If the upload endpoint returns an error status.
    """
    http = _get_requests()
    filename = os.path.basename(local_path)
    # Send the real content type where it can be guessed from the extension;
    # fall back to the previously hard-coded "image/png" otherwise.
    content_type = mimetypes.guess_type(filename)[0] or "image/png"
    with open(local_path, "rb") as f:
        resp = http.post(
            f"{COMFYUI_API_URL}/upload/image",
            files={"image": (filename, f, content_type)},
            timeout=30,
        )
    resp.raise_for_status()
    return resp.json()["name"]


def build_workflow(comfyui_image_name: str, task: str) -> dict:
    """Return a copy of WORKFLOW with the user inputs injected.

    Args:
        comfyui_image_name: Image name as returned by the upload step.
        task: Vision task to set on the Florence2Run node.

    Returns:
        A new workflow dict; the module-level template is left untouched.
    """
    wf = copy.deepcopy(WORKFLOW)

    # Set input image: swap VHS_LoadImagesPath for LoadImage.
    wf["7"] = {
        "inputs": {"image": comfyui_image_name},
        "class_type": "LoadImage",
        "_meta": {"title": "Load Image"},
    }

    # Set vision task.
    wf["2"]["inputs"]["task"] = task

    # Randomize seed.
    wf["2"]["inputs"]["seed"] = random.randint(0, 2**31 - 1)

    return wf


def submit_prompt(workflow: dict) -> str:
    """Submit a workflow to ComfyUI and return its prompt id.

    Args:
        workflow: Workflow dict as produced by build_workflow().

    Returns:
        The ``prompt_id`` field of the response.

    Raises:
        RuntimeError: If the server responds with a non-200 status, or with
            200 but a non-empty ``node_errors`` field.
    """
    http = _get_requests()
    resp = http.post(
        f"{COMFYUI_API_URL}/prompt",
        json={"prompt": workflow},
        timeout=30,
    )
    if resp.status_code != 200:
        try:
            error_detail = resp.json()
        except Exception:
            # Body is not JSON; report the raw text instead.
            error_detail = resp.text
        if isinstance(error_detail, dict):
            detail_text = json.dumps(error_detail, indent=2)
        else:
            detail_text = error_detail
        raise RuntimeError(
            f"ComfyUI /prompt returned {resp.status_code}: {detail_text}"
        )
    data = resp.json()

    # ComfyUI returns 200 even when nodes have validation errors.
    node_errors = data.get("node_errors", {})
    if node_errors:
        raise RuntimeError(
            f"ComfyUI workflow has node errors: {json.dumps(node_errors, indent=2)}"
        )

    return data["prompt_id"]


def wait_for_result(prompt_id: str, timeout: int = 600, poll_interval: int = 2) -> dict:
    """Poll ComfyUI history until the prompt completes with outputs.

    Args:
        prompt_id: Id returned by submit_prompt().
        timeout: Maximum number of seconds to keep polling.
        poll_interval: Seconds to sleep between polls.

    Returns:
        The history entry for ``prompt_id`` once it is completed and has
        non-empty ``outputs``.

    Raises:
        RuntimeError: If the entry reports an error status, or stays
            completed-without-outputs for ``max_empty_retries`` polls.
        TimeoutError: If the prompt does not complete within ``timeout``.
        requests.HTTPError: If a history request returns an error status.
    """
    http = _get_requests()
    # Monotonic clock: the deadline must not move when the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    empty_complete_retries = 0
    max_empty_retries = 3  # grace period for output serialization lag

    while time.monotonic() < deadline:
        resp = http.get(
            f"{COMFYUI_API_URL}/history/{prompt_id}",
            timeout=10,
        )
        resp.raise_for_status()
        history = resp.json()

        if prompt_id in history:
            prompt_data = history[prompt_id]
            status = prompt_data.get("status", {})

            if status.get("status_str") == "error":
                messages = status.get("messages", [])
                raise RuntimeError(
                    f"ComfyUI prompt failed: {json.dumps(messages, indent=2)}"
                )

            if status.get("completed", False):
                if prompt_data.get("outputs"):
                    return prompt_data

                # Completed but no outputs: retry briefly for race condition.
                empty_complete_retries += 1
                if empty_complete_retries >= max_empty_retries:
                    raise RuntimeError(
                        "ComfyUI prompt completed but produced no outputs. "
                        "This usually means a node failed silently (missing custom node or model). "
                        f"Status: {json.dumps(status, indent=2)}"
                    )

        time.sleep(poll_interval)

    raise TimeoutError(f"ComfyUI prompt {prompt_id} did not complete within {timeout}s")


def _format_text(text_data) -> str:
    """Render an output value as text, joining list items with newlines."""
    if isinstance(text_data, list):
        return "\n".join(str(t) for t in text_data)
    return str(text_data)


def _text_candidates(outputs: dict):
    """Yield ``(node_output, keys)`` pairs in lookup priority order."""
    # Florence2Run node "2" may expose a text output.
    yield outputs.get("2"), ("text",)
    # Show Text node "6" may expose a text or string output.
    yield outputs.get("6"), ("text", "string")
    # Fallback: any node output carrying text data.
    for node_output in outputs.values():
        yield node_output, ("text", "string", "TEXT", "STRING")


def extract_text_output(prompt_data: dict) -> str:
    """Extract text from a ComfyUI history entry.

    Checks node "2" (``text``), then node "6" (``text``, ``string``), then
    every node output for ``text``/``string``/``TEXT``/``STRING``; the first
    match wins. List values are joined with newlines, anything else is passed
    through ``str()``.

    Args:
        prompt_data: History entry as returned by wait_for_result().

    Returns:
        The extracted text.

    Raises:
        RuntimeError: If no node output contains one of the expected keys.
    """
    outputs = prompt_data.get("outputs", {})

    for node_output, keys in _text_candidates(outputs):
        # Skip missing nodes and outputs that are not key/value mappings.
        if not isinstance(node_output, dict):
            continue
        for key in keys:
            if key in node_output:
                return _format_text(node_output[key])

    raise RuntimeError(
        f"No text output found in ComfyUI response. Available outputs: {json.dumps(outputs, indent=2)}"
    )


def main():
    """Run one execution: read the request from stdin, print the result JSON.

    On success prints ``{"text": ...}`` to stdout. On any exception prints a
    JSON object with ``error``, ``errorType`` and ``traceback`` to stderr and
    exits with status 1.
    """
    try:
        input_json = sys.stdin.read()
        execution_input = json.loads(input_json)
        inputs = execution_input.get("inputs", {})

        image = inputs.get("image", "")
        task = inputs.get("task", "more_detailed_caption")

        if not image:
            raise ValueError("Image is required")

        if task not in VALID_TASKS:
            raise ValueError(f"Invalid task '{task}'. Must be one of: {VALID_TASKS}")

        # Upload input image to ComfyUI.
        # NOTE(review): `image` is joined without a containment check, so an
        # absolute path or "../" escapes INPUT_DIR; confirm inputs are trusted.
        local_path = os.path.join(INPUT_DIR, image)
        if not os.path.exists(local_path):
            raise FileNotFoundError(f"Input image not found: {local_path}")
        comfyui_image_name = upload_image_to_comfyui(local_path)

        # Build and submit workflow.
        workflow = build_workflow(comfyui_image_name, task)
        prompt_id = submit_prompt(workflow)

        # Wait for completion and extract text.
        prompt_data = wait_for_result(prompt_id)
        result_text = extract_text_output(prompt_data)

        # Log metadata to stderr.
        print(f"prompt_id={prompt_id}, task={task}", file=sys.stderr)

        # Flat output: keys match OUTPUT_SCHEMA.
        # portType is "Text", so output the actual content, not a filename.
        output = {
            "text": result_text,
        }
        print(json.dumps(output, indent=2))

    except Exception as e:
        # Top-level boundary: report every failure in a structured form.
        error_output = {
            "error": str(e),
            "errorType": type(e).__name__,
            "traceback": traceback.format_exc(),
        }
        print(json.dumps(error_output), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()