$ cat node-template.py

F

Fetch URL as Markdown

// Fetch a batch of public URLs (1-50) and return per-URL Markdown ready for LLM consumption. Scrapes HTML and converts to Markdown via crawl4ai + Playwright + Chromium. Wire directly from `Web Search` — the urls input accepts the ranked URL array as-is. Returns parallel arrays (markdown, title, error) plus success_count and failed_count. On success each markdown item is prefixed with a Source/title header. Failure is encoded by empty markdown. Pairs with: `Web Search`.

Process
Integration
#web #fetch #url #markdown #research #internet #search #scrape #crawl #html #page-content #crawl4ai #batch #array
template.py
1"""2Fetch URL as Markdown — array form (v2.0.0).34Accepts an array of URLs and returns three parallel arrays (markdown, title,5error) plus success_count and failed_count. Wire directly from Web Search's6urls output.78Success/failure is encoded by emptiness: markdown[i] is non-empty on success9(decorated with a Source/title header) and empty on failure. error[i] carries10the reason when failed and is empty when succeeded. success_count and11failed_count are pre-computed aggregates over markdown emptiness for fast12downstream branching without iterating the arrays.1314Per-item soft-fail invariant: any single fetch that raises an SDK exception15(BackendTimeout, BackendError, RemoteError) OR returns success:false from the16sidecar is surfaced as data in the per-item slot. The node exits 0 unless17input validation fails (programmer error, exit 1).1819Concurrency: up to 4 URLs in flight via ThreadPoolExecutor.map, which preserves20input order so results[i] always corresponds to urls[i]. gais-web caps at 421process-wide as well, so client-side concurrency above 4 would just queue.2223Markdown decoration: on a successful fetch the per-item markdown is prefixed24with a single header line carrying the source URL and the extracted title.25This makes each item self-describing so downstream LLM consumers see the26provenance inline without having to cross-reference the parallel arrays. The27prefix is only added when the body is non-empty (failed fetches stay empty so28`not markdown[i]` remains a valid quick-failure check).2930No truncation: the full markdown body from gais-web flows through. 
The31sidecar still applies its own 1 MB cap upstream, but the signal is not32surfaced here — the template's contract is "full text or empty (failed)".33Worst-case stdout payload at N=50 can therefore exceed Temporal's 2 MB gRPC34default; this is a deliberate trade-off for faithful content delivery.35"""3637from __future__ import annotations3839import json40import sys41import traceback42from concurrent.futures import ThreadPoolExecutor43from typing import Any4445from gais import Gais464748MAX_URLS = 5049MAX_WORKERS = 4505152def parse_urls(value: Any) -> list[str]:53    """Lenient parse: accept list, or JSON-encoded string of a list.5455    Mirrors the parse_json helper in odoo-rpc — the established convention56    for Json-typed inputs in this codebase. Rejects anything that is not a57    list or a JSON string that decodes to a list.58    """59    if value is None:60        raise ValueError("Required input 'urls' not provided")61    if isinstance(value, list):62        return value63    if isinstance(value, str):64        stripped = value.strip()65        if not stripped:66            raise ValueError("Required input 'urls' is empty")67        try:68            decoded = json.loads(stripped)69        except json.JSONDecodeError as exc:70            raise ValueError(f"urls is a string but not valid JSON: {exc}") from exc71        if not isinstance(decoded, list):72            raise ValueError(73                f"urls (string) decoded to {type(decoded).__name__}, expected list"74            )75        return decoded76    raise ValueError(77        f"urls must be a list or a JSON string of a list, got {type(value).__name__}"78    )798081def validate_urls(urls: list[Any]) -> list[str]:82    """Reject empty, over-cap, or non-string inputs with a clear error."""83    if not urls:84        raise ValueError("urls cannot be empty")85    if len(urls) > MAX_URLS:86        raise ValueError(87            f"urls exceeds max {MAX_URLS} (got {len(urls)}). 
"88            f"Cap web-search's num_results upstream or split the batch."89        )90    for idx, item in enumerate(urls):91        if not isinstance(item, str) or not item.strip():92            raise ValueError(93                f"urls[{idx}] must be a non-empty string, got {type(item).__name__}"94            )95    return urls969798def decorate_markdown(url: str, title: str, markdown: str) -> str:99    """Prepend a Source/title header to the markdown body.100101    The prefix is a single bold-title + link-to-source line followed by a blank102    line, so it composes cleanly with whatever heading the page itself emits103    below. Title can be empty (sidecar best-effort) — fall back to URL-only.104105    Returns the original body unchanged when it is empty (a failed fetch106    should NOT gain a misleading provenance header).107    """108    if not markdown:109        return markdown110    safe_title = (title or "").strip()111    if safe_title:112        header = f"**{safe_title}** — [Source]({url})"113    else:114        header = f"[Source]({url})"115    return f"{header}\n\n{markdown}"116117118def fetch_one(url: str, timeout_seconds: int, idx: int, total: int) -> dict[str, Any]:119    """Fetch one URL. 
NEVER raises — exceptions become soft-fail data.120121    Returns 3 fields per item: markdown (empty on failure), title, error.122    Success/failure is encoded by markdown emptiness; the explicit success123    bool was removed as redundant.124    """125    try:126        print(127            f"[fetch-url-markdown][{idx + 1}/{total}] start url={url!r} "128            f"timeout={timeout_seconds}s",129            file=sys.stderr,130        )131        result = Gais.web.fetch(url=url, timeout_seconds=timeout_seconds)132        md = result.metadata133        markdown = md.get("markdown", "") or ""134        title = md.get("title", "") or ""135        success = bool(md.get("success", False))136        error = md.get("error", "") or ""137138        # Soft-fail unification: a sidecar success=False with an empty body139        # MUST collapse markdown to "" so `not markdown[i]` is the single140        # source of truth for "this item failed". A sidecar success=False141        # with a non-empty body (rare) is treated as failed too.142        if not success:143            markdown = ""144145        # Decorate non-empty successful bodies with provenance.146        if markdown:147            markdown = decorate_markdown(url, title, markdown)148149        print(150            f"[fetch-url-markdown][{idx + 1}/{total}] done "151            f"markdown_chars={len(markdown)} error={error!r}",152            file=sys.stderr,153        )154        return {155            "markdown": markdown,156            "title": title,157            "error": error,158        }159    except Exception as exc:  # noqa: BLE001 — soft-fail by contract160        err_label = f"{type(exc).__name__}: {exc}"161        print(162            f"[fetch-url-markdown][{idx + 1}/{total}] FAILED {err_label}",163            file=sys.stderr,164        )165        return {166            "markdown": "",167            "title": "",168            "error": err_label,169        }170171172def process(inputs: dict[str, Any]) -> 
dict[str, Any]:173    """Run the batch fetch and return the 5-field output dict.174175    Pure function over the input dict. Raises ValueError on input validation176    failures (programmer error); per-item SDK exceptions are caught inside177    fetch_one and surfaced as soft-fail data (empty markdown + populated178    error). Extracted from main() so tests can drive it without stdin/stdout179    monkey-patching.180    """181    urls = validate_urls(parse_urls(inputs.get("urls")))182    timeout_seconds = int(inputs.get("timeout_seconds") or 30)183    if not (5 <= timeout_seconds <= 120):184        raise ValueError(185            f"timeout_seconds must be between 5 and 120, got {timeout_seconds}"186        )187188    total = len(urls)189    print(190        f"[fetch-url-markdown] batch start n={total} "191        f"timeout={timeout_seconds}s workers={MAX_WORKERS}",192        file=sys.stderr,193    )194195    # ThreadPoolExecutor.map preserves order: results[i] <-> urls[i].196    # gais-web's process-wide semaphore caps concurrency at 4 server-side,197    # so MAX_WORKERS=4 client-side aligns perfectly.198    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:199        results = list(200            pool.map(201                lambda iu: fetch_one(iu[1], timeout_seconds, iu[0], total),202                enumerate(urls),203            )204        )205206    # Pivot: list-of-dicts -> 3 parallel arrays. 
Order preserved by pool.map.207    markdown_arr = [r["markdown"] for r in results]208    title_arr = [r["title"] for r in results]209    error_arr = [r["error"] for r in results]210211    # Aggregates derived from markdown emptiness (the single failure signal212    # after dropping the redundant success[] and the no-longer-meaningful213    # truncated[] arrays).214    success_count = sum(1 for m in markdown_arr if m)215    failed_count = total - success_count216217    print(218        f"[fetch-url-markdown] batch done n={total} success={success_count} "219        f"failed={failed_count}",220        file=sys.stderr,221    )222223    return {224        "markdown": markdown_arr,225        "title": title_arr,226        "error": error_arr,227        "success_count": success_count,228        "failed_count": failed_count,229    }230231232def main() -> None:233    try:234        envelope = json.loads(sys.stdin.read() or "{}")235        inputs: dict[str, Any] = (236            envelope.get("inputs", {}) if isinstance(envelope, dict) else {}237        )238        output = process(inputs)239        json.dump(output, sys.stdout)240    except Exception as e:241        error_payload = {242            "error": str(e),243            "errorType": type(e).__name__,244            "traceback": traceback.format_exc(),245        }246        print(json.dumps(error_payload), file=sys.stderr)247        sys.exit(1)248249250if __name__ == "__main__":251    main()252

$ git log --oneline

v2.0.0
HEAD
2026-05-25
v1.0.3 2026-05-25
v1.0.2 2026-05-22
v1.0.1 2026-05-22
v1.0.0 2026-05-21