Skip to content

runtime

runtime

Container runtime gateway wrapping the Podman CLI.

Provides module-level functions for container lifecycle operations (state queries, GPU args, log streaming, port allocation, etc.).

All functions accept plain parameters (strings, paths) — no terok-specific types like ProjectConfig. Container naming is orchestration policy and lives in the caller.

GpuConfigError(message, *, hint=_CDI_HINT)

Bases: RuntimeError

CDI/NVIDIA misconfiguration detected during container launch.

Store the CDI hint alongside the standard error message.

Source code in src/terok_sandbox/runtime.py
def __init__(self, message: str, *, hint: str = _CDI_HINT) -> None:
    """Store the CDI *hint* alongside the standard error *message*."""
    self.hint = hint
    super().__init__(message)

podman_userns_args()

Return user namespace args for rootless podman so UID 1000 maps correctly.

Maps the host user to container UID/GID 1000, the conventional non-root dev user in terok container images.

Source code in src/terok_sandbox/runtime.py
def podman_userns_args() -> list[str]:
    """Return user namespace args for rootless podman so UID 1000 maps correctly.

    Maps the host user to container UID/GID 1000, the conventional non-root
    ``dev`` user in terok container images.
    """
    if os.geteuid() == 0:
        return []
    return ["--userns=keep-id:uid=1000,gid=1000"]

check_gpu_error(exc)

Raise :class:GpuConfigError if exc looks like a CDI/NVIDIA issue.

Does nothing if the error does not match any known CDI patterns.

Source code in src/terok_sandbox/runtime.py
def check_gpu_error(exc: subprocess.CalledProcessError) -> None:
    """Raise :class:`GpuConfigError` if *exc* looks like a CDI/NVIDIA issue.

    Does nothing if the error does not match any known CDI patterns.
    """
    stderr = (exc.stderr or b"").decode(errors="replace")
    if any(pat in stderr for pat in _CDI_ERROR_PATTERNS):
        msg = f"Container launch failed (GPU misconfiguration):\n{stderr.strip()}\n\n{_CDI_HINT}"
        raise GpuConfigError(msg) from exc

redact_env_args(cmd)

Return a copy of cmd with sensitive -e KEY=VALUE args redacted.

Handles the two-arg form (-e KEY=VALUE) produced by :meth:~.sandbox.Sandbox.run. Does not handle --env, -e=KEY=VALUE, or --env=KEY=VALUE — callers passing sensitive values via extra_args must pre-redact them.

Source code in src/terok_sandbox/runtime.py
def redact_env_args(cmd: list[str]) -> list[str]:
    """Return a copy of *cmd* with sensitive ``-e KEY=VALUE`` args redacted.

    Handles the two-arg form (``-e KEY=VALUE``) produced by
    :meth:`~.sandbox.Sandbox.run`.  Does not handle ``--env``,
    ``-e=KEY=VALUE``, or ``--env=KEY=VALUE`` — callers passing sensitive
    values via ``extra_args`` must pre-redact them.
    """
    out: list[str] = []
    redact_next = False
    for arg in cmd:
        if redact_next:
            key, _, _val = arg.partition("=")
            if _SENSITIVE_KEY_RE.search(key) or key in _ALWAYS_REDACT_KEYS:
                out.append(f"{key}=<redacted>")
            else:
                out.append(arg)
            redact_next = False
        elif arg == "-e":
            out.append(arg)
            redact_next = True
        else:
            out.append(arg)
    return out

get_project_container_states(name_prefix)

Return {container_name: state} for all containers matching name_prefix.

Uses a single podman ps -a call with a name filter instead of per-container podman inspect calls. Returns an empty dict when podman is unavailable.

Source code in src/terok_sandbox/runtime.py
def get_project_container_states(name_prefix: str) -> dict[str, str]:
    """Return ``{container_name: state}`` for all containers matching *name_prefix*.

    Uses a single ``podman ps -a`` call with a name filter instead of
    per-container ``podman inspect`` calls.  Returns an empty dict when
    podman is unavailable.
    """
    try:
        out = subprocess.check_output(
            [
                "podman",
                "ps",
                "-a",
                "--filter",
                f"name=^{name_prefix}-",
                "--format",
                "{{.Names}} {{.State}}",
                "--no-trunc",
            ],
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return {}

    result: dict[str, str] = {}
    for line in out.strip().splitlines():
        parts = line.split(None, 1)
        if len(parts) == 2:
            result[parts[0]] = parts[1].lower()
    return result

get_container_state(cname)

Return container state ('running', 'exited', ...) or None if not found.

Source code in src/terok_sandbox/runtime.py
def get_container_state(cname: str) -> str | None:
    """Return container state ('running', 'exited', ...) or ``None`` if not found."""
    try:
        out = subprocess.check_output(
            ["podman", "inspect", "-f", "{{.State.Status}}", cname],
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
        return out.lower() if out else None
    except (subprocess.CalledProcessError, FileNotFoundError):
        return None

is_container_running(cname)

Return True if the named container is currently running.

Source code in src/terok_sandbox/runtime.py
def is_container_running(cname: str) -> bool:
    """Return ``True`` if the named container is currently running."""
    try:
        out = subprocess.check_output(
            ["podman", "inspect", "-f", "{{.State.Running}}", cname],
            stderr=subprocess.DEVNULL,
            text=True,
        ).strip()
    except (subprocess.CalledProcessError, FileNotFoundError):
        return False
    return out.lower() == "true"

stop_task_containers(container_names)

Best-effort podman rm -f of the given containers.

Ignores all errors so that task deletion succeeds even when podman is absent or the containers are already gone.

Source code in src/terok_sandbox/runtime.py
def stop_task_containers(container_names: list[str]) -> None:
    """Best-effort ``podman rm -f`` of the given containers.

    Ignores all errors so that task deletion succeeds even when podman is
    absent or the containers are already gone.
    """
    for name in container_names:
        try:
            log_debug(f"stop_containers: podman rm -f {name} (start)")
            subprocess.run(
                ["podman", "rm", "-f", name],
                check=False,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=120,
            )
            log_debug(f"stop_containers: podman rm -f {name} (done)")
        except Exception as exc:
            log_debug(f"stop_containers: podman rm -f {name} failed: {exc}")

gpu_run_args(*, enabled=False)

Return additional podman run args to enable NVIDIA GPU passthrough.

The caller is responsible for determining whether GPUs are enabled (e.g. by reading project configuration). This function only maps the boolean flag to the appropriate podman CLI arguments.

Source code in src/terok_sandbox/runtime.py
def gpu_run_args(*, enabled: bool = False) -> list[str]:
    """Return additional ``podman run`` args to enable NVIDIA GPU passthrough.

    The caller is responsible for determining whether GPUs are enabled
    (e.g. by reading project configuration).  This function only maps
    the boolean flag to the appropriate podman CLI arguments.
    """
    if not enabled:
        return []

    return [
        "--device",
        "nvidia.com/gpu=all",
        "-e",
        "NVIDIA_VISIBLE_DEVICES=all",
        "-e",
        "NVIDIA_DRIVER_CAPABILITIES=all",
    ]

stream_initial_logs(container_name, timeout_sec, ready_check)

Stream logs until ready marker is seen or timeout.

Returns True if the ready marker was found, False on timeout.

Source code in src/terok_sandbox/runtime.py
def stream_initial_logs(
    container_name: str,
    timeout_sec: float | None,
    ready_check: Callable[[str], bool],
) -> bool:
    """Stream logs until ready marker is seen or timeout.

    Returns ``True`` if the ready marker was found, ``False`` on timeout.
    """
    import select
    import sys
    import threading
    import time

    holder: list[bool] = [False]
    stop_event = threading.Event()
    proc_holder: list[subprocess.Popen | None] = [None]

    def _stream_logs() -> None:
        """Follow container logs in a thread, setting *holder[0]* on ready."""
        try:
            proc = subprocess.Popen(
                ["podman", "logs", "-f", container_name],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            proc_holder[0] = proc
            start_time = time.time()
            buf = b""

            while not stop_event.is_set():
                if timeout_sec is not None and time.time() - start_time >= timeout_sec:
                    break
                if proc.poll() is not None:
                    remaining = proc.stdout.read()
                    if remaining:
                        buf += remaining
                    break
                try:
                    ready, _, _ = select.select([proc.stdout], [], [], 0.2)
                    if not ready:
                        continue
                    chunk = proc.stdout.read1(4096) if hasattr(proc.stdout, "read1") else b""
                    if not chunk:
                        continue
                    buf += chunk
                except Exception as exc:
                    log_warning(f"_stream_initial_logs read error: {exc}")
                    break

                while b"\n" in buf:
                    raw_line, buf = buf.split(b"\n", 1)
                    line = raw_line.decode("utf-8", errors="replace").strip()
                    if line:
                        print(line, file=sys.stdout, flush=True)
                        if ready_check(line):
                            holder[0] = True
                            proc.terminate()
                            return

            if buf:
                line = buf.decode("utf-8", errors="replace").strip()
                if line:
                    print(line, file=sys.stdout, flush=True)
                    if ready_check(line):
                        holder[0] = True

            proc.terminate()
        except Exception as exc:
            log_warning(f"_stream_initial_logs error: {exc}")

    stream_thread = threading.Thread(target=_stream_logs)
    stream_thread.start()
    stream_thread.join(timeout_sec)

    if stream_thread.is_alive():
        stop_event.set()
        proc = proc_holder[0]
        if proc is not None:
            proc.terminate()
        stream_thread.join(timeout=5)

    return holder[0]

wait_for_exit(cname, timeout_sec=None)

Wait for a container to exit and return its exit code.

Returns 124 on timeout, 1 if podman is not found.

Source code in src/terok_sandbox/runtime.py
def wait_for_exit(cname: str, timeout_sec: float | None = None) -> int:
    """Wait for a container to exit and return its exit code.

    Returns 124 on timeout, 1 if podman is not found.
    """
    try:
        proc = subprocess.run(
            ["podman", "wait", cname],
            check=False,
            capture_output=True,
            timeout=timeout_sec,
        )
        stdout = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout
        if stdout:
            return int(stdout)
        return proc.returncode
    except subprocess.TimeoutExpired:
        return 124
    except (FileNotFoundError, ValueError):
        return 1

reserve_free_port(host='127.0.0.1')

Reserve a TCP port on host and return (socket, port).

The socket stays open — the caller holds the reservation until they close it (typically right before binding the actual service). Useful for Python-native servers that can accept a pre-bound socket.

Source code in src/terok_sandbox/runtime.py
def reserve_free_port(host: str = "127.0.0.1") -> tuple[socket.socket, int]:
    """Reserve a TCP port on *host* and return ``(socket, port)``.

    The socket stays open — the caller holds the reservation until they
    close it (typically right before binding the actual service).  Useful
    for Python-native servers that can accept a pre-bound socket.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind((host, 0))
        return s, s.getsockname()[1]
    except BaseException:
        s.close()
        raise

find_free_port(host='127.0.0.1')

Find and return a free TCP port on host.

Releases the socket immediately — there is a small race window before the caller binds the port. This is the standard approach when passing a port number to an external process (e.g. podman run -p).

Source code in src/terok_sandbox/runtime.py
def find_free_port(host: str = "127.0.0.1") -> int:
    """Find and return a free TCP port on *host*.

    Releases the socket immediately — there is a small race window before
    the caller binds the port.  This is the standard approach when passing
    a port number to an external process (e.g. ``podman run -p``).
    """
    s, port = reserve_free_port(host)
    s.close()
    return port

bypass_network_args(gate_port)

Return podman network args for running without shield.

Replicates the networking that terok-shield's OCI hook normally provides (allowing the container to reach host.containers.internal for the gate server) but without nftables rules, annotations, or cap-drops.

This is a dangerous fallback for environments where shield can't run. All egress is unfiltered.

Source code in src/terok_sandbox/runtime.py
def bypass_network_args(gate_port: int) -> list[str]:
    """Return podman network args for running without shield.

    Replicates the networking that terok-shield's OCI hook normally provides
    (allowing the container to reach ``host.containers.internal`` for the gate
    server) but without nftables rules, annotations, or cap-drops.

    This is a **dangerous fallback** for environments where shield can't run.
    All egress is unfiltered.
    """
    if os.geteuid() == 0:
        return []
    if _detect_rootless_network_mode() == "slirp4netns":
        return [
            "--network",
            "slirp4netns:allow_host_loopback=true",
            "--add-host",
            f"host.containers.internal:{_SLIRP_GATEWAY}",
        ]
    return [
        "--network",
        f"pasta:--map-host-loopback,{_PASTA_HOST_LOOPBACK_MAP}",
        "--add-host",
        f"host.containers.internal:{_PASTA_HOST_LOOPBACK_MAP}",
    ]