From 5fa5f047dad3cbe3a574edd43d206e6da3f01007 Mon Sep 17 00:00:00 2001 From: Saak Date: Thu, 25 Jun 2026 21:46:48 -0400 Subject: [PATCH 1/2] Removed previous hardcoded host/port addresses and implemented local and AWS EC2 instance creation --- .dockerignore | 21 + .gitignore | 8 + README.md | 8 +- .../config/global_controller.ec2_smoke.yaml | 33 + examples/config/global_controller.yaml | 62 +- examples/ec2/agents/example_agent.py | 3 + examples/ec2/agents/example_agent.yaml | 12 + examples/ec2/config/global_controller.yaml | 31 + examples/ec2/docker/generic-agent.Dockerfile | 16 + .../ec2/docker/global-controller.Dockerfile | 16 + pyproject.toml | 1 + requirements.txt | 1 + tests/README.md | 42 +- tests/__init__.py | 1 + tests/global_controller/__init__.py | 1 + .../test_runtime_backed_controller.py | 242 +++++ tests/live/__init__.py | 1 + tests/live/test_full_local_deploy.py | 180 ++++ tests/live/test_local_docker_runtime.py | 94 ++ tests/local/__init__.py | 1 + tests/local/test_port_allocation.py | 127 +++ tests/routing/__init__.py | 1 + tests/routing/test_routing_publication.py | 114 +++ tests/runtime/__init__.py | 1 + tests/runtime/test_runtime_lifecycle.py | 145 +++ tests/support/__init__.py | 1 + tests/support/runtime_fakes.py | 199 ++++ tests/test_cli_build.py | 163 ++++ tests/test_cli_deploy.py | 137 +++ tests/test_integration.py | 68 +- tests/test_performance.py | 69 ++ tests/test_runtime_manager.py | 904 ++++++++++++++++++ tests/test_stateful_affinity.py | 48 +- ventis/cli.py | 186 +++- ventis/controller/agent_spec_loader.py | 24 + .../cloud_provider_logic/EC2/__init__.py | 16 + .../cloud_provider_logic/EC2/_runtime.py | 338 +++++++ ventis/controller/global_controller.py | 809 +++++++--------- ventis/controller/runtime_manager.py | 470 +++++++++ .../templates/config/global_controller.yaml | 35 +- 40 files changed, 4053 insertions(+), 576 deletions(-) create mode 100644 .dockerignore create mode 100644 examples/config/global_controller.ec2_smoke.yaml create mode 100644 examples/ec2/agents/example_agent.py create mode 100644 examples/ec2/agents/example_agent.yaml create mode 100644 examples/ec2/config/global_controller.yaml create mode 100644 examples/ec2/docker/generic-agent.Dockerfile create mode 100644 examples/ec2/docker/global-controller.Dockerfile create mode 100644 tests/__init__.py create mode 100644 tests/global_controller/__init__.py create mode 100644 tests/global_controller/test_runtime_backed_controller.py create mode 100644 tests/live/__init__.py create mode 100644 tests/live/test_full_local_deploy.py create mode 100644 tests/live/test_local_docker_runtime.py create mode 100644 tests/local/__init__.py create mode 100644 tests/local/test_port_allocation.py create mode 100644 tests/routing/__init__.py create mode 100644 tests/routing/test_routing_publication.py create mode 100644 tests/runtime/__init__.py create mode 100644 tests/runtime/test_runtime_lifecycle.py create mode 100644 tests/support/__init__.py create mode 100644 tests/support/runtime_fakes.py create mode 100644 tests/test_cli_build.py create mode 100644 tests/test_cli_deploy.py create mode 100644 tests/test_runtime_manager.py create mode 100644 ventis/controller/agent_spec_loader.py create mode 100644 ventis/controller/cloud_provider_logic/EC2/__init__.py create mode 100644 ventis/controller/cloud_provider_logic/EC2/_runtime.py create mode 100644 ventis/controller/runtime_manager.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..a5c47a7 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,21 @@ +.git +.omx +.pytest_cache +.venv +.env +.env.* +!.env.example +__pycache__ +*.pyc +*.pyo +*.pyd +*.swp +*.swo +*~ +._* +Thumbs.db +.DS_Store +AWSCLIV2.pkg +docker_container +grpc_stubs +stubs diff --git a/.gitignore b/.gitignore index ee8f58f..db416f0 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ env/ # OS files .DS_Store Thumbs.db +._* # Generated stubs stubs/ @@ -32,3 +33,10 @@ docker_container/ # Logs *.log + +# Local env / machine artifacts +.env +.env.* +!.env.example +AWSCLIV2.pkg +.python-version diff --git a/README.md b/README.md index f643ee5..37f0a43 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ cp -r ../examples/* ./ ## Deployment Guide #### Step 1: Configure the Global Controller -Edit `examples/config/global_controller.yaml` to list the agents you want to deploy, their hosts, ports, and resource limits. +Edit `config/global_controller.yaml` in your project directory to list the agents you want to deploy, their `provider`, `replicas`, and resource limits. #### Step 2: Build the project ```bash @@ -89,10 +89,10 @@ Upon running the deploy command, ventis automatically generates a REST API endpo Users can send requests to this endpoint to trigger the workflow. For this example, workflow to send a request - ```bash -curl -X POST http://localhost:8080/finance_workflow/run \ +curl -X POST http://localhost:8080/main \ -H "Content-Type: application/json" \ -d '{ - "query": "What is the current stock price of Apple?" + "ticker": "AAPL" }' ``` The request is asynchronous. To get the result, you use the following URL- @@ -105,7 +105,7 @@ curl http://localhost:8080/status/ Remove all generated stub and gRPC files: ```bash -make clean +ventis clean ``` ### Harnessing the power of Ventis diff --git a/examples/config/global_controller.ec2_smoke.yaml b/examples/config/global_controller.ec2_smoke.yaml new file mode 100644 index 0000000..80e63a3 --- /dev/null +++ b/examples/config/global_controller.ec2_smoke.yaml @@ -0,0 +1,33 @@ +# Cheapest EC2 smoke test: +# - exactly 2 agent instances +# - generic agent image + bind-mounted project code +# - t2.nano for lowest-cost MVP validation + +agents: + - name: MarketResearchAgent + replicas: 2 + redis_port: 6379 + resources: + cpu: 1 + memory: 256 + entrypoint: agents/market_agent.py + provider: EC2 + +poll_interval: 5 + +redis: + host: localhost + port: 6379 + db: 0 + +ec2: + region: us-east-1 + ami_id: ami-0123456789abcdef0 + instance_type: t2.nano + subnet_id: subnet-0123456789abcdef0 + security_group_ids: + - sg-0123456789abcdef0 + ssh_user: ubuntu + key_name: ventis-key + agent_image: ventis-agent-base + remote_project_dir: /opt/ventis/project diff --git a/examples/config/global_controller.yaml b/examples/config/global_controller.yaml index 8e8539d..25d2ad1 100644 --- a/examples/config/global_controller.yaml +++ b/examples/config/global_controller.yaml @@ -3,70 +3,70 @@ agents: - name: FinanceAgent - # Replicas can be placed on different hosts with explicit host/port. - # Alternatively, use `replicas: 3` as shorthand to launch 3 instances - # on the same host with sequential ports starting from `port`. - replicas: - - host: localhost - port: 8051 - - host: localhost - port: 8052 - redis_port: 6379 # Redis port on this node - # user: sagarwal # SSH user for remote hosts (omit for localhost) - # Stateful agents get session affinity: all calls within a single - # request_id are routed to the same instance. + replicas: 2 + redis_port: 6379 # Redis port on this node stateful: true - # Resource limits per instance resources: - cpu: 1 # Number of CPU cores - memory: 512 # Memory in MB - # Path to the agent entrypoint script + cpu: 1 + memory: 512 entrypoint: agents/finance_agent.py + provider: local - name: MarketResearchAgent - host: localhost - port: 8053 - redis_port: 6379 - # user: sagarwal replicas: 1 + redis_port: 6379 resources: cpu: 1 memory: 512 entrypoint: agents/market_agent.py + provider: local - name: VllmAgent - host: localhost - port: 8054 - redis_port: 6379 replicas: 1 + redis_port: 6379 resources: cpu: 2 memory: 2048 entrypoint: agents/vllm_agent.py + provider: local - name: Workflow - host: localhost - port: 8050 # LC gRPC port (exposed to host) + replicas: 1 type: workflow - api_port: 8080 # Flask REST API port (exposed to host) redis_port: 6379 - replicas: 1 workflow_file: workflows/example_workflow.py + provider: local -# Polling interval in seconds poll_interval: 5 -# Redis connection redis: host: localhost port: 6379 db: 0 +# EC2 defaults for `provider: EC2` replicas. +# MVP expects the subnet/security group/network path to already allow +# controller -> public_ip:50051 for agent gRPC and controller -> public_ip:6379 +# for the per-instance Redis container. Restrict both inbound rules to the +# global controller host's source IP. +# +# ec2: +# region: us-east-1 +# ami_id: ami-0123456789abcdef0 +# instance_type: t2.nano +# subnet_id: subnet-0123456789abcdef0 +# security_group_ids: +# - sg-0123456789abcdef0 +# ssh_user: ubuntu +# key_name: ventis-key +# agent_image: ventis-agent-base +# remote_project_dir: /opt/ventis/project + # Docker image registry (optional). # If set, `ventis deploy` will push images to this registry locally and # pull them on remote nodes before starting containers. # If omitted, images are shipped to remote nodes via `docker save | ssh ... docker load`. # # registry: -# url: myregistry.example.com:5000 # Registry host:port -# user: sagarwal # (optional) SSH user used when pulling on remote nodes +# url: myregistry.example.com:5000 +# user: sagarwal diff --git a/examples/ec2/agents/example_agent.py b/examples/ec2/agents/example_agent.py new file mode 100644 index 0000000..969f2e9 --- /dev/null +++ b/examples/ec2/agents/example_agent.py @@ -0,0 +1,3 @@ +class ExampleAgent: + def hello(self, name: str) -> str: + return f"Hello, {name}!" diff --git a/examples/ec2/agents/example_agent.yaml b/examples/ec2/agents/example_agent.yaml new file mode 100644 index 0000000..a0fe00b --- /dev/null +++ b/examples/ec2/agents/example_agent.yaml @@ -0,0 +1,12 @@ +agent: + name: ExampleAgent + methods: + - name: hello + input_schema: + type: object + properties: + name: + type: string + required: [name] + output_schema: + type: string diff --git a/examples/ec2/config/global_controller.yaml b/examples/ec2/config/global_controller.yaml new file mode 100644 index 0000000..c940337 --- /dev/null +++ b/examples/ec2/config/global_controller.yaml @@ -0,0 +1,31 @@ +# Global controller config for this EC2 example. + +agents: + - name: ExampleAgent + replicas: 2 + redis_port: 6379 + resources: + cpu: 1 + memory: 256 + entrypoint: agents/example_agent.py + provider: EC2 + +poll_interval: 5 + +redis: + host: localhost + port: 6379 + db: 0 + +ec2: + region: us-east-1 + ami_id: ami-08f44e8eca9095668 + instance_type: t2.nano + subnet_id: subnet-0638ac6d79d488124 + security_group_ids: + - sg-08d81e58ac5818c60 + ssh_user: ec2-user + ssh_private_key_path: /home/ec2-user/.ssh/saakec2.pem + key_name: saakec2 + agent_image: ventis-agent-base + remote_project_dir: /opt/ventis/project diff --git a/examples/ec2/docker/generic-agent.Dockerfile b/examples/ec2/docker/generic-agent.Dockerfile new file mode 100644 index 0000000..b99f5e4 --- /dev/null +++ b/examples/ec2/docker/generic-agent.Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /workspace + +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt + +COPY . /workspace + +CMD ["python", "-m", "ventis.controller.local_controller"] diff --git a/examples/ec2/docker/global-controller.Dockerfile b/examples/ec2/docker/global-controller.Dockerfile new file mode 100644 index 0000000..a7da0db --- /dev/null +++ b/examples/ec2/docker/global-controller.Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /workspace + +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update \ + && apt-get install -y --no-install-recommends build-essential docker.io openssh-client \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt + +COPY . /workspace + +CMD ["python", "-m", "ventis.cli", "deploy"] diff --git a/pyproject.toml b/pyproject.toml index 16e5a56..b0562a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "0.1.0" description = "Distributed agent orchestration framework" requires-python = ">=3.10" dependencies = [ + "boto3", "grpcio", "grpcio-tools", "redis", diff --git a/requirements.txt b/requirements.txt index de56b8e..203234d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +boto3 grpcio grpcio-tools redis diff --git a/tests/README.md b/tests/README.md index c609a0e..1720b8c 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,38 +1,4 @@ -# Ventis Testing & Load Analysis Tools - -This directory contains an automated end-to-end testing suite for Ventis. It is designed to verify both functional correctness and concurrent performance of the distributed agent architecture. - -## 1. Automated Test Runner (`run_tests.sh`) -This script automates the entire testing lifecycle by interacting with the `ventis` CLI: -1. Scaffolds a new temporary project using `ventis new-project`. -2. Compiles the project using `ventis build`. -3. Launches the project using `ventis deploy` in the background. -4. Waits for the GlobalController and all agent sidecars to become healthy. -5. Runs the Python integration and performance scripts. -6. **Cleanup:** Automatically terminates the deployment and cleans up the temporary directory upon success or failure. - -To run the complete suite: -```bash -./run_tests.sh -``` - -## 2. Functional Integration Validation (`test_integration.py`) -Verifies that Ventis correctly passes data and dependencies between chained agents. -- Dispatches a single query to the deployed `/main` endpoint. -- Polls the `/status` endpoint until completion. -- Validates the output payload structure and ensures that data successfully flowed through `FinanceAgent`, `MarketResearchAgent`, and `VllmAgent`. - -To run manually against an already-deployed Ventis instance: -```bash -python test_integration.py -``` - -## 3. High-Concurrency Stress Test (`test_performance.py`) -Evaluates the robustness and scalability of the Ventis Redis routing and Docker architecture under load. Using `concurrent.futures`, this script models N concurrent users actively polling Ventis simultaneously. - -It produces an analytical report summarizing throughput, dropped requests, and latency percentiles. - -To run manually against an already-deployed Ventis instance (e.g. 50 requests across 10 concurrent virtual users): -```bash -python test_performance.py --concurrent 10 --total 50 -``` +# Tests +Fast suite from repo root: `python3 -m pytest tests` +Full local live smoke: `VENTIS_RUN_FULL_LOCAL=1 python3 -m unittest tests.live.test_full_local_deploy` +Small live Docker smoke: `VENTIS_RUN_LIVE_DOCKER=1 python3 -m unittest tests.live.test_local_docker_runtime` diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/global_controller/__init__.py b/tests/global_controller/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/global_controller/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/global_controller/test_runtime_backed_controller.py b/tests/global_controller/test_runtime_backed_controller.py new file mode 100644 index 0000000..ae3eb5d --- /dev/null +++ b/tests/global_controller/test_runtime_backed_controller.py @@ -0,0 +1,242 @@ +import json +import unittest +from types import SimpleNamespace +from unittest.mock import patch + +from tests.support.runtime_fakes import FakeRedis +from tests.support.runtime_fakes import install_grpc_stubs +from tests.support.runtime_fakes import make_global_controller +from tests.support.runtime_fakes import make_instance + +install_grpc_stubs() + +from ventis.controller.global_controller import GlobalController + + +class GlobalControllerRuntimeBackedTests(unittest.TestCase): + def test_wait_for_healthy_reads_local_status_from_host_redis(self): + instance = make_instance("Alpha", 0, host="localhost", host_port=8000) + controller = make_global_controller([instance]) + controller.node_redis["localhost"] = FakeRedis() + controller.node_redis["localhost"].set( + "controller:host.docker.internal:8000:status", + "healthy", + ) + + GlobalController._wait_for_healthy(controller, timeout=1, interval=0) + + self.assertEqual(controller._last_status, {("localhost", "8000"): "healthy"}) + + def test_poll_controllers_calls_healthy_hook_for_runtime_instance(self): + instance = make_instance("Beta", 0, host="localhost", host_port=8001) + controller = make_global_controller([instance]) + controller.node_redis["localhost"] = FakeRedis() + controller.node_redis["localhost"].set( + "controller:host.docker.internal:8001:status", + "healthy", + ) + + GlobalController._poll_controllers(controller) + + self.assertEqual(controller._healthy_calls, [("Beta", "localhost", "8001")]) + self.assertEqual(controller._unhealthy_calls, []) + + def test_poll_controllers_calls_unhealthy_hook_for_missing_status(self): + instance = make_instance("Beta", 0, host="localhost", host_port=8001) + controller = make_global_controller([instance]) + controller.node_redis["localhost"] = FakeRedis() + + GlobalController._poll_controllers(controller) + + self.assertEqual(controller._healthy_calls, []) + self.assertEqual(controller._unhealthy_calls, [("Beta", "localhost", "8001")]) + self.assertEqual(controller._last_status[("localhost", "8001")], "unknown") + + def test_poll_controllers_uses_remote_host_as_status_key(self): + instance = make_instance("Remote", 0, host="10.0.0.7", host_port=9000) + controller = make_global_controller([instance]) + controller.node_redis["10.0.0.7"] = FakeRedis() + controller.node_redis["10.0.0.7"].set( + "controller:10.0.0.7:9000:status", + "healthy", + ) + + GlobalController._poll_controllers(controller) + + self.assertEqual(controller._healthy_calls, [("Remote", "10.0.0.7", "9000")]) + + def test_trigger_cleanup_broadcasts_to_runtime_endpoints(self): + instance = make_instance("Gamma", 0, host="localhost", host_port=8002) + controller = make_global_controller([instance]) + controller.redis.sadd("request:completed", "req-1") + messages = [] + + class Stub: + def Cleanup(self, message): + messages.append(message) + + controller._get_lc_stub = lambda endpoint: Stub() + + GlobalController._trigger_cleanup(controller) + + self.assertEqual([json.loads(message.resonse) for message in messages], [{"request_id": "req-1"}]) + self.assertEqual(controller.redis.smembers("request:completed"), set()) + + def test_trigger_cleanup_noops_without_completed_requests(self): + instance = make_instance("Gamma", 0, host="localhost", host_port=8002) + controller = make_global_controller([instance]) + calls = [] + controller._get_lc_stub = lambda endpoint: calls.append(endpoint) + + GlobalController._trigger_cleanup(controller) + + self.assertEqual(calls, []) + + def test_stop_docker_agents_delegates_to_runtime_manager(self): + instance = make_instance("Delta", 0, host="localhost", host_port=8003) + controller = make_global_controller([instance]) + controller.containers = {"Delta": [instance["runtime_id"]]} + removed = [] + controller.runtime_manager.remove_instance = lambda instance_id: removed.append(instance_id) + + GlobalController._stop_docker_agents(controller) + + self.assertEqual(removed, ["local:Delta:0"]) + self.assertEqual(controller._run_cmd_calls, []) + self.assertEqual(controller.containers, {}) + + def test_agent_host_key_maps_localhost_for_container_status(self): + controller = make_global_controller([]) + + self.assertEqual( + GlobalController._agent_host_key(controller, "localhost"), + "host.docker.internal", + ) + self.assertEqual( + GlobalController._agent_host_key(controller, "10.0.0.4"), + "10.0.0.4", + ) + + def test_launch_redis_containers_uses_runtime_nodes(self): + controller = make_global_controller([]) + controller.runtime_manager.list_runtime_nodes = lambda agent_specs=None: { + "localhost": {"user": None, "redis_port": 6379}, + "10.0.0.7": {"user": "ubuntu", "redis_port": 6380}, + } + created_clients = [] + + with patch( + "ventis.controller.global_controller.RedisClient", + side_effect=lambda **kwargs: created_clients.append(kwargs) or FakeRedis(), + ): + GlobalController._launch_redis_containers(controller) + + self.assertEqual( + controller.redis_containers, + { + "localhost": "ventis-redis-localhost", + "10.0.0.7": "ventis-redis-10-0-0-7", + }, + ) + self.assertEqual(created_clients, [{"host": "localhost", "port": 6379}, {"host": "10.0.0.7", "port": 6380}]) + + def test_ensure_host_redis_reuses_existing_client(self): + controller = make_global_controller([]) + existing = FakeRedis() + controller.node_redis["localhost"] = existing + + result = GlobalController.ensure_host_redis(controller, "localhost") + + self.assertIs(result, existing) + self.assertEqual(controller._run_cmd_calls, []) + + def test_ensure_host_redis_exits_when_docker_run_fails(self): + controller = make_global_controller([]) + controller._run_cmd = lambda cmd, host, user=None: SimpleNamespace( + returncode=1, + stdout="", + stderr="docker failed", + ) + + with self.assertRaises(SystemExit): + GlobalController.ensure_host_redis(controller, "localhost") + + def test_ssh_base_cmd_uses_configured_private_key(self): + controller = make_global_controller([]) + controller.config = {"ec2": {"ssh_private_key_path": "/tmp/test.pem"}} + + self.assertEqual( + GlobalController._ssh_base_cmd(controller), + [ + "ssh", + "-o", + "StrictHostKeyChecking=no", + "-o", + "ConnectTimeout=10", + "-i", + "/tmp/test.pem", + ], + ) + + def test_run_cmd_uses_ssh_options_for_remote_host(self): + controller = make_global_controller([]) + controller.config = {"ec2": {"ssh_private_key_path": "/tmp/test.pem"}} + + with patch("subprocess.run", return_value=SimpleNamespace(returncode=0, stdout="", stderr="")) as run_mock: + GlobalController._run_cmd(controller, ["docker", "ps"], "10.0.0.7", "ec2-user") + + run_mock.assert_called_once_with( + [ + "ssh", + "-o", + "StrictHostKeyChecking=no", + "-o", + "ConnectTimeout=10", + "-i", + "/tmp/test.pem", + "ec2-user@10.0.0.7", + "sudo docker ps", + ], + capture_output=True, + text=True, + ) + + def test_ensure_host_redis_prepares_remote_docker_first(self): + controller = make_global_controller([]) + calls = [] + controller._ensure_remote_docker = lambda host, user=None: calls.append( + ("prep", host, user) + ) or SimpleNamespace(returncode=0, stdout="", stderr="") + controller._run_cmd = lambda cmd, host, user=None: calls.append( + ("run", cmd, host, user) + ) or SimpleNamespace(returncode=0, stdout="", stderr="") + + with patch( + "ventis.controller.global_controller.RedisClient", + side_effect=lambda **kwargs: FakeRedis(), + ): + GlobalController.ensure_host_redis(controller, "10.0.0.7", "ec2-user", 6380) + + self.assertEqual(calls[0], ("prep", "10.0.0.7", "ec2-user")) + self.assertEqual(calls[1][0], "run") + + def test_wait_for_remote_ssh_retries_until_success(self): + controller = make_global_controller([]) + controller.config = {"ec2": {"ssh_private_key_path": "/tmp/test.pem"}} + results = [ + SimpleNamespace(returncode=255, stdout="", stderr="Connection timed out"), + SimpleNamespace(returncode=0, stdout="", stderr=""), + ] + + with patch("subprocess.run", side_effect=results) as run_mock: + with patch("time.sleep", return_value=None): + result = GlobalController._wait_for_remote_ssh( + controller, "10.0.0.7", "ec2-user", timeout=5, interval=0 + ) + + self.assertEqual(result.returncode, 0) + self.assertEqual(run_mock.call_count, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/live/__init__.py b/tests/live/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/live/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/live/test_full_local_deploy.py b/tests/live/test_full_local_deploy.py new file mode 100644 index 0000000..c411527 --- /dev/null +++ b/tests/live/test_full_local_deploy.py @@ -0,0 +1,180 @@ +import json +import os +import shutil +import signal +import subprocess +import sys +import tempfile +import time +import unittest +import urllib.error +import urllib.request + +import yaml + +from ventis.utils.redis_client import RedisClient + + +RUN_FULL_LOCAL = os.environ.get("VENTIS_RUN_FULL_LOCAL") == "1" +REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + + +def _docker_available(): + if shutil.which("docker") is None: + return False + result = subprocess.run( + ["docker", "ps"], + capture_output=True, + text=True, + check=False, + ) + return result.returncode == 0 + + +def _run_ventis(args, cwd, timeout=180): + env = os.environ.copy() + env["PYTHONPATH"] = f"{REPO_ROOT}{os.pathsep}{env.get('PYTHONPATH', '')}" + return subprocess.run( + [sys.executable, "-m", "ventis.cli", *args], + cwd=cwd, + env=env, + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + + +def _request_json(method, url, payload=None, timeout=5): + data = None if payload is None else json.dumps(payload).encode("utf-8") + request = urllib.request.Request( + url, + data=data, + method=method, + headers={"Content-Type": "application/json"}, + ) + with urllib.request.urlopen(request, timeout=timeout) as response: + return response.status, json.loads(response.read().decode("utf-8")) + + +def _wait_for_http(url, timeout=60): + deadline = time.time() + timeout + while time.time() < deadline: + try: + _request_json("POST", url, {"name": "Probe"}, timeout=2) + return + except (urllib.error.URLError, TimeoutError, ConnectionError): + time.sleep(1) + raise TimeoutError(f"{url} did not become reachable within {timeout}s") + + +def _wait_for_done(request_id, timeout=60): + deadline = time.time() + timeout + while time.time() < deadline: + _status, payload = _request_json( + "GET", + f"http://localhost:8080/status/{request_id}", + timeout=5, + ) + if payload.get("status") == "done": + return payload + if payload.get("status") == "error": + raise AssertionError(payload.get("error")) + time.sleep(1) + raise TimeoutError(f"request {request_id} did not finish within {timeout}s") + + +@unittest.skipUnless( + RUN_FULL_LOCAL, + "set VENTIS_RUN_FULL_LOCAL=1 to run the full local build/deploy smoke test", +) +class FullLocalDeployTests(unittest.TestCase): + """Build, deploy, and exercise a local-only generated Ventis project.""" + + def setUp(self): + if not _docker_available(): + raise unittest.SkipTest("Docker daemon is not available") + self.tmpdir = tempfile.mkdtemp(prefix="ventis_full_local_") + self.project_name = "local_smoke" + self.project_dir = os.path.join(self.tmpdir, self.project_name) + self.deploy = None + + def tearDown(self): + if self.deploy and self.deploy.poll() is None: + self.deploy.send_signal(signal.SIGTERM) + try: + self.deploy.wait(timeout=20) + except subprocess.TimeoutExpired: + self.deploy.kill() + self.deploy.wait(timeout=10) + if self.deploy and self.deploy.stdout: + self.deploy.stdout.close() + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def test_generated_local_project_builds_deploys_and_routes(self): + result = _run_ventis(["new-project", self.project_name], cwd=self.tmpdir) + self.assertEqual(result.returncode, 0, result.stderr) + + self._force_local_only_config() + + result = _run_ventis(["build"], cwd=self.project_dir, timeout=300) + self.assertEqual(result.returncode, 0, result.stderr) + + self.deploy = subprocess.Popen( + [sys.executable, "-m", "ventis.cli", "deploy"], + cwd=self.project_dir, + env={ + **os.environ, + "PYTHONPATH": f"{REPO_ROOT}{os.pathsep}{os.environ.get('PYTHONPATH', '')}", + }, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + _wait_for_http("http://localhost:8080/main", timeout=90) + self._assert_routing_metadata() + + status, submitted = _request_json( + "POST", + "http://localhost:8080/main", + {"name": "LocalSmoke"}, + timeout=5, + ) + self.assertEqual(status, 202) + + completed = _wait_for_done(submitted["request_id"], timeout=90) + self.assertEqual(completed["status"], "done") + self.assertEqual( + completed["result"], + {"greeting": "Hello, LocalSmoke! I'm the ExampleAgent."}, + ) + + def _force_local_only_config(self): + config_path = os.path.join(self.project_dir, "config", "global_controller.yaml") + with open(config_path, "r") as f: + config = yaml.safe_load(f) + + for agent in config["agents"]: + agent["provider"] = "local" + agent["replicas"] = 1 + agent.setdefault("resources", {}).pop("gpu", None) + + with open(config_path, "w") as f: + yaml.safe_dump(config, f, sort_keys=False) + + def _assert_routing_metadata(self): + redis = RedisClient(host="localhost", port=6379) + services = redis.smembers("routing_table:services") + self.assertIn("ExampleAgent", services) + self.assertIn("Workflow", services) + + raw_endpoints = redis.hget("routing_table:endpoints", "ExampleAgent") + self.assertIsNotNone(raw_endpoints) + endpoints = json.loads(raw_endpoints) + self.assertEqual(len(endpoints), 1) + self.assertTrue(endpoints[0].startswith("host.docker.internal:")) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/live/test_local_docker_runtime.py b/tests/live/test_local_docker_runtime.py new file mode 100644 index 0000000..f6ee172 --- /dev/null +++ b/tests/live/test_local_docker_runtime.py @@ -0,0 +1,94 @@ +import os +import shutil +import subprocess +import tempfile +import textwrap +import unittest + +from tests.support.runtime_fakes import FakeRedis +from ventis.controller.runtime_manager import RuntimeManager + + +RUN_LIVE_DOCKER = os.environ.get("VENTIS_RUN_LIVE_DOCKER") == "1" + + +class LiveDockerController: + def __init__(self): + self.redis = FakeRedis() + self.node_redis = {} + self.redis_containers = {} + self.containers = {} + self.config = {"redis": {"host": "localhost", "port": 6379}} + + def _run_cmd(self, cmd, host, user=None): + if host not in ("localhost", "127.0.0.1"): + raise AssertionError(f"live local test cannot run remote command on {host}") + return subprocess.run(cmd, capture_output=True, text=True, check=False) + + def _ensure_image_on_host(self, image, host, user): + return None + + +@unittest.skipUnless( + RUN_LIVE_DOCKER, + "set VENTIS_RUN_LIVE_DOCKER=1 to run live local Docker runtime tests", +) +class LocalDockerRuntimeLiveTests(unittest.TestCase): + image = "ventis-liveprobe" + + @classmethod + def setUpClass(cls): + if shutil.which("docker") is None: + raise unittest.SkipTest("docker CLI is not available") + + with tempfile.TemporaryDirectory() as tmpdir: + dockerfile = os.path.join(tmpdir, "Dockerfile") + with open(dockerfile, "w") as f: + f.write( + textwrap.dedent( + """ + FROM alpine:3.20 + CMD ["sh", "-c", "sleep 600"] + """ + ).strip() + ) + result = subprocess.run( + ["docker", "build", "-t", cls.image, tmpdir], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + raise unittest.SkipTest(f"failed to build live Docker image: {result.stderr}") + + @classmethod + def tearDownClass(cls): + subprocess.run(["docker", "rm", "-f", "ventis-local-liveprobe-0"], capture_output=True, text=True) + subprocess.run(["docker", "rmi", "-f", cls.image], capture_output=True, text=True) + + def test_local_runtime_launches_container_and_writes_routing_metadata(self): + controller = LiveDockerController() + manager = RuntimeManager(controller, controller.redis) + self.addCleanup(lambda: manager.remove_instance("local:LiveProbe:0")) + + instances = manager.ensure_instances( + [{"name": "LiveProbe", "provider": "local", "replicas": 1}] + ) + + self.assertEqual(len(instances), 1) + self.assertEqual(instances[0]["endpoint"], "localhost:8000") + self.assertEqual( + controller.redis.hgetall("agent_instance:local:LiveProbe:0")["runtime_id"], + "ventis-local-liveprobe-0", + ) + inspect_result = subprocess.run( + ["docker", "inspect", "ventis-local-liveprobe-0"], + capture_output=True, + text=True, + check=False, + ) + self.assertEqual(inspect_result.returncode, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/local/__init__.py b/tests/local/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/local/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/local/test_port_allocation.py b/tests/local/test_port_allocation.py new file mode 100644 index 0000000..d5649d7 --- /dev/null +++ b/tests/local/test_port_allocation.py @@ -0,0 +1,127 @@ +import unittest + +from tests.support.runtime_fakes import FakeController +from tests.support.runtime_fakes import make_instance +from ventis.controller.runtime_manager import allocate_host_port +from ventis.controller.runtime_manager import resolve_local_replica_placements as resolve_replica_placements +from ventis.controller.runtime_manager import RuntimeManager + + +class LocalPlacementTests(unittest.TestCase): + def test_resolve_replica_placements_uses_dynamic_ports(self): + placements = resolve_replica_placements({"name": "Alpha", "replicas": 2}) + + self.assertEqual( + placements, + [ + {"host": "localhost", "host_port": None}, + {"host": "localhost", "host_port": None}, + ], + ) + + def test_resolve_replica_placements_defaults_to_one_replica(self): + placements = resolve_replica_placements({"name": "Alpha"}) + + self.assertEqual(placements, [{"host": "localhost", "host_port": None}]) + + def test_resolve_replica_placements_accepts_numeric_string_replicas(self): + placements = resolve_replica_placements({"name": "Alpha", "replicas": "3"}) + + self.assertEqual(len(placements), 3) + self.assertTrue(all(item["host_port"] is None for item in placements)) + + def test_resolve_replica_placements_allows_zero_replicas(self): + placements = resolve_replica_placements({"name": "Alpha", "replicas": 0}) + + self.assertEqual(placements, []) + + def test_resolve_replica_placements_rejects_legacy_static_config(self): + legacy_specs = [ + {"name": "Legacy", "host": "localhost", "replicas": 1}, + {"name": "Legacy", "port": 9000, "replicas": 1}, + {"name": "Legacy", "replicas": [{"host": "localhost", "port": 9000}]}, + ] + + for spec in legacy_specs: + with self.subTest(spec=spec): + with self.assertRaisesRegex(ValueError, "Legacy YAML host/port"): + resolve_replica_placements(spec) + + +class LocalPortAllocationTests(unittest.TestCase): + def setUp(self): + self.controller = FakeController() + self.manager = RuntimeManager(self.controller, self.controller.redis) + + def write_instance(self, instance): + key = self.manager._instance_key( + instance["provider"], + instance["agent_name"], + int(instance["replica_index"]), + ) + self.controller.redis.hset_multiple(key, instance) + + def test_allocate_host_port_uses_first_available_port(self): + self.write_instance(make_instance("Alpha", 0, host_port=8000)) + self.write_instance(make_instance("Beta", 0, host_port=8002)) + + port = allocate_host_port(self.manager, "localhost") + + self.assertEqual(port, 8001) + + def test_allocate_host_port_ignores_different_hosts(self): + self.write_instance(make_instance("Remote", 0, host="10.0.0.5", host_port=8000)) + + port = allocate_host_port(self.manager, "localhost") + + self.assertEqual(port, 8000) + + def test_allocate_host_port_skips_contiguous_used_ports(self): + for index, port in enumerate((8000, 8001, 8002, 8003)): + self.write_instance(make_instance(f"Agent{index}", 0, host_port=port)) + + port = allocate_host_port(self.manager, "localhost") + + self.assertEqual(port, 8004) + + def test_allocate_host_port_tracks_ports_per_host(self): + self.write_instance(make_instance("Local", 0, host="localhost", host_port=8000)) + self.write_instance(make_instance("Remote", 0, host="10.0.0.5", host_port=8000)) + + local_port = allocate_host_port(self.manager, "localhost") + remote_port = allocate_host_port(self.manager, "10.0.0.5") + + self.assertEqual(local_port, 8001) + self.assertEqual(remote_port, 8001) + + def test_allocate_host_port_handles_string_ports_in_records(self): + instance = make_instance("Alpha", 0, host_port=8000) + instance["host_port"] = "8000" + self.write_instance(instance) + + port = allocate_host_port(self.manager, "localhost") + + self.assertEqual(port, 8001) + + def test_allocate_host_port_ignores_replaced_instance(self): + existing = make_instance("Alpha", 0, host_port=8000) + self.write_instance(existing) + + port = allocate_host_port( + self.manager, + "localhost", + ignore_instance_id="local:Alpha:0", + ) + + self.assertEqual(port, 8000) + + def test_requested_host_port_is_respected(self): + self.write_instance(make_instance("Alpha", 0, host_port=8000)) + + port = allocate_host_port(self.manager, "localhost", requested_host_port=9100) + + self.assertEqual(port, 9100) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/routing/__init__.py b/tests/routing/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/routing/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/routing/test_routing_publication.py b/tests/routing/test_routing_publication.py new file mode 100644 index 0000000..2049d65 --- /dev/null +++ b/tests/routing/test_routing_publication.py @@ -0,0 +1,114 @@ +import json +import unittest + +from tests.support.runtime_fakes import FakeController +from tests.support.runtime_fakes import FakeRedis +from tests.support.runtime_fakes import make_instance +from ventis.controller.runtime_manager import RuntimeManager + + +class RoutingPublicationTests(unittest.TestCase): + def setUp(self): + self.controller = FakeController() + self.manager = RuntimeManager(self.controller, self.controller.redis) + + def write_instance(self, instance): + self.manager._write_instance(instance) + self.controller.redis.sadd( + f"agent:{instance['agent_name']}:instances", + self.manager._instance_id_from_record(instance), + ) + + def test_publish_routing_snapshot_orders_endpoints_by_replica_index(self): + self.write_instance(make_instance("Alpha", 1, host_port=8001)) + self.write_instance(make_instance("Alpha", 0, host_port=8000)) + + self.manager.publish_routing_snapshot([{"name": "Alpha", "stateful": True}]) + + self.assertEqual( + json.loads(self.controller.redis.hget("routing_table:endpoints", "Alpha")), + ["host.docker.internal:8000", "host.docker.internal:8001"], + ) + self.assertEqual(self.controller.redis.hget("routing_table:stateful", "Alpha"), "true") + self.assertEqual(self.controller.redis.smembers("routing_table:services"), {"Alpha"}) + + def test_publish_routing_snapshot_removes_endpoint_when_no_instances_exist(self): + redis = self.controller.redis + redis.sadd("routing_table:services", "Alpha") + redis.hset("routing_table:endpoints", "Alpha", json.dumps(["localhost:8000"])) + + self.manager.publish_routing_snapshot([{"name": "Alpha", "stateful": False}]) + + self.assertEqual(redis.smembers("routing_table:services"), {"Alpha"}) + self.assertIsNone(redis.hget("routing_table:endpoints", "Alpha")) + + def test_publish_routing_snapshot_clears_stale_service_metadata(self): + redis = self.controller.redis + redis.sadd("routing_table:services", "Old", "Keep") + redis.hset("routing_table:endpoints", "Old", json.dumps(["localhost:9000"])) + redis.hset("routing_table:stateful", "Old", "true") + redis.hset("routing_table:stateful", "Keep", "true") + + self.manager.publish_routing_snapshot([{"name": "Keep", "stateful": False}]) + + self.assertEqual(redis.smembers("routing_table:services"), {"Keep"}) + self.assertIsNone(redis.hget("routing_table:endpoints", "Old")) + self.assertIsNone(redis.hget("routing_table:stateful", "Old")) + self.assertIsNone(redis.hget("routing_table:stateful", "Keep")) + + def test_publish_routing_snapshot_targets_each_node_redis(self): + node_a = FakeRedis() + node_b = FakeRedis() + self.controller.node_redis = {"localhost": node_a, "127.0.0.1": node_b} + self.write_instance(make_instance("Alpha", 0, host="localhost", host_port=8000)) + self.write_instance(make_instance("Beta", 0, host="127.0.0.1", host_port=8001)) + + self.manager.publish_routing_snapshot( + [ + {"name": "Alpha", "stateful": True}, + {"name": "Beta", "stateful": False}, + ] + ) + + for redis in (node_a, node_b): + self.assertEqual(redis.smembers("routing_table:services"), {"Alpha", "Beta"}) + self.assertEqual(redis.hget("routing_table:stateful", "Alpha"), "true") + self.assertEqual( + json.loads(redis.hget("routing_table:endpoints", "Alpha")), + ["host.docker.internal:8000"], + ) + self.assertEqual( + json.loads(redis.hget("routing_table:endpoints", "Beta")), + ["host.docker.internal:8001"], + ) + + self.assertIsNone(self.controller.redis.hget("routing_table:endpoints", "Alpha")) + + def test_routing_targets_fall_back_to_central_redis(self): + self.assertEqual(self.manager._routing_redis_targets(), [self.controller.redis]) + + def test_routing_targets_use_node_redis_when_present(self): + node = FakeRedis() + self.controller.node_redis = {"localhost": node} + + self.assertEqual(self.manager._routing_redis_targets(), [node]) + + def test_publish_policy_rules_writes_to_all_targets(self): + self.controller.node_redis = {"localhost": FakeRedis(), "other": FakeRedis()} + rules = [{"match": {"role": "admin"}, "access": "all"}] + + count = self.manager.publish_policy_rules(rules) + + self.assertEqual(count, 2) + for redis in self.controller.node_redis.values(): + self.assertEqual(json.loads(redis.get("policy:rules")), rules) + + def test_publish_policy_rules_writes_empty_rule_list(self): + count = self.manager.publish_policy_rules([]) + + self.assertEqual(count, 1) + self.assertEqual(json.loads(self.controller.redis.get("policy:rules")), []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/runtime/__init__.py b/tests/runtime/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/runtime/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/runtime/test_runtime_lifecycle.py b/tests/runtime/test_runtime_lifecycle.py new file mode 100644 index 0000000..84dfca1 --- /dev/null +++ b/tests/runtime/test_runtime_lifecycle.py @@ -0,0 +1,145 @@ +import unittest + +from tests.support.runtime_fakes import FakeController +from tests.support.runtime_fakes import make_instance +from ventis.controller.runtime_manager import RuntimeManager + + +class RuntimeLifecycleTests(unittest.TestCase): + def setUp(self): + self.controller = FakeController() + self.manager = RuntimeManager(self.controller, self.controller.redis) + + def write_instance(self, instance): + key = self.manager._instance_key( + instance["provider"], + instance["agent_name"], + int(instance["replica_index"]), + ) + self.controller.redis.hset_multiple(key, instance) + self.controller.redis.sadd( + f"agent:{instance['agent_name']}:instances", + self.manager._instance_id_from_record(instance), + ) + + def docker_run_calls(self): + return [call for call in self.controller.run_calls if call[0][:2] == ["docker", "run"]] + + def test_existing_runtime_is_reused_without_new_docker_run(self): + instance = make_instance("Alpha", 0, host_port=8000) + self.controller.runtime_ids.add(instance["runtime_id"]) + self.write_instance(instance) + + self.manager.ensure_instances([{"name": "Alpha", "provider": "local", "replicas": 1}]) + + self.assertEqual(self.docker_run_calls(), []) + self.assertEqual( + self.controller.redis.hgetall("agent_instance:local:Alpha:0")["endpoint"], + "localhost:8000", + ) + + def test_missing_runtime_is_recreated_on_previous_port(self): + self.write_instance(make_instance("Beta", 0, host_port=9100)) + + self.manager.ensure_instances([{"name": "Beta", "provider": "local", "replicas": 1}]) + + docker_run = self.docker_run_calls()[0][0] + self.assertIn("9100:50051", docker_run) + self.assertEqual( + self.controller.redis.hgetall("agent_instance:local:Beta:0")["endpoint"], + "localhost:9100", + ) + + def test_new_instances_get_incrementing_ports(self): + self.manager.ensure_instances([{"name": "Beta", "provider": "local", "replicas": 3}]) + + endpoints = [ + self.controller.redis.hgetall(f"agent_instance:local:Beta:{index}")["endpoint"] + for index in range(3) + ] + + self.assertEqual(endpoints, ["localhost:8000", "localhost:8001", "localhost:8002"]) + + def test_stale_runtime_is_removed_before_recreate(self): + stale = make_instance("Beta", 0, host_port=8000) + stale["runtime_id"] = "stale-runtime" + self.write_instance(stale) + + self.manager.ensure_instances([{"name": "Beta", "provider": "local", "replicas": 1}]) + + remove_calls = [call for call in self.controller.run_calls if call[0][:3] == ["docker", "rm", "-f"]] + self.assertEqual(remove_calls[0][0], ["docker", "rm", "-f", "stale-runtime"]) + + def test_recreate_updates_controller_tracking_once(self): + self.manager.ensure_instances([{"name": "Beta", "provider": "local", "replicas": 1}]) + self.manager.ensure_instances([{"name": "Beta", "provider": "local", "replicas": 1}]) + + self.assertEqual(self.controller.containers["Beta"], ["ventis-local-beta-0"]) + + def test_remove_instance_deletes_record_membership_and_runtime_tracking(self): + instance = make_instance("Gamma", 0, host_port=8000) + self.controller.runtime_ids.add(instance["runtime_id"]) + self.controller.containers = {"Gamma": [instance["runtime_id"]]} + self.write_instance(instance) + + self.manager.remove_instance("local:Gamma:0") + + self.assertEqual(self.controller.redis.hgetall("agent_instance:local:Gamma:0"), {}) + self.assertEqual(self.controller.redis.smembers("agent:Gamma:instances"), set()) + self.assertEqual(self.controller.containers["Gamma"], []) + self.assertFalse(self.controller.runtime_ids) + + def test_remove_missing_instance_is_noop(self): + self.manager.remove_instance("local:Missing:0") + + self.assertEqual(self.controller.run_calls, []) + + def test_runtime_exists_returns_false_without_runtime_id(self): + self.assertFalse(self.manager._runtime_exists({"host": "localhost"})) + + def test_launch_container_sets_runtime_environment_and_resources(self): + spec = { + "name": "Worker", + "provider": "local", + "replicas": 1, + "redis_port": 6380, + "resources": {"cpu": 2, "memory": 1024, "gpu": 1}, + } + + self.manager.ensure_instances([spec]) + + docker_run = self.docker_run_calls()[0][0] + self.assertIn("8000:50051", docker_run) + self.assertIn("VENTIS_AGENT_HOST=host.docker.internal", docker_run) + self.assertIn("VENTIS_AGENT_PORT=8000", docker_run) + self.assertIn("VENTIS_REDIS_HOST=host.docker.internal", docker_run) + self.assertIn("VENTIS_REDIS_PORT=6380", docker_run) + self.assertIn("--cpus", docker_run) + self.assertIn("2", docker_run) + self.assertIn("--memory", docker_run) + self.assertIn("1024m", docker_run) + self.assertIn("--gpus", docker_run) + self.assertIn("1", docker_run) + + def test_launch_container_raises_when_docker_run_fails(self): + def failing_run(cmd, host, user=None): + return type("Result", (), {"returncode": 1, "stdout": "", "stderr": "boom"})() + + self.controller._run_cmd = failing_run + + with self.assertRaisesRegex(RuntimeError, "Failed to launch"): + self.manager.ensure_instances([{"name": "Broken", "provider": "local", "replicas": 1}]) + + def test_workflow_container_exposes_runtime_managed_api_port(self): + self.manager.ensure_instances( + [{"name": "Workflow", "provider": "local", "type": "workflow", "replicas": 1}] + ) + + docker_run = self.docker_run_calls()[0][0] + + self.assertIn("8000:50051", docker_run) + self.assertIn("8080:8080", docker_run) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/support/__init__.py b/tests/support/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/tests/support/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/support/runtime_fakes.py b/tests/support/runtime_fakes.py new file mode 100644 index 0000000..dba4901 --- /dev/null +++ b/tests/support/runtime_fakes.py @@ -0,0 +1,199 @@ +import json +import sys +from types import ModuleType +from types import SimpleNamespace + + +def _is_local_host(host): + return host in {"localhost", "127.0.0.1"} + + +def _container_routing_host(host): + return "host.docker.internal" if _is_local_host(host) else host + + +def install_grpc_stubs(): + local_controler_pb2 = ModuleType("local_controler_pb2") + + class JsonResponse: + def __init__(self, resonse): + self.resonse = resonse + + local_controler_pb2.JsonResponse = JsonResponse + sys.modules.setdefault("local_controler_pb2", local_controler_pb2) + + local_controler_pb2_grpc = ModuleType("local_controler_pb2_grpc") + local_controler_pb2_grpc.LocalControllerStub = type("LocalControllerStub", (), {}) + local_controler_pb2_grpc.__file__ = "local_controler_pb2_grpc.py" + sys.modules.setdefault("local_controler_pb2_grpc", local_controler_pb2_grpc) + sys.modules.setdefault("grpc", ModuleType("grpc")) + + +class FakeRedis: + def __init__(self): + self.strings = {} + self.hashes = {} + self.sets = {} + self.client = self + + def set(self, key, value): + self.strings[key] = value + + def get(self, key): + return self.strings.get(key) + + def delete(self, *keys): + for key in keys: + self.strings.pop(key, None) + self.hashes.pop(key, None) + self.sets.pop(key, None) + + def hset(self, name, field, value): + self.hashes.setdefault(name, {})[field] = value + + def hset_multiple(self, name, mapping): + self.hashes.setdefault(name, {}).update(mapping) + + def hget(self, name, field): + return self.hashes.get(name, {}).get(field) + + def hgetall(self, name): + return dict(self.hashes.get(name, {})) + + def hdel(self, name, field): + self.hashes.setdefault(name, {}).pop(field, None) + + def sadd(self, name, *values): + self.sets.setdefault(name, set()).update(values) + + def srem(self, name, *values): + members = self.sets.setdefault(name, set()) + members.difference_update(values) + + def smembers(self, name): + return set(self.sets.get(name, set())) + + def scan_keys(self, pattern): + prefix = pattern.rstrip("*") + keys = set(self.strings) | set(self.hashes) | set(self.sets) + return [key for key in sorted(keys) if key.startswith(prefix)] + + +class FakeController: + def __init__(self): + self.redis = FakeRedis() + self.node_redis = {} + self.redis_containers = {} + self.containers = {} + self.runtime_ids = set() + self.run_calls = [] + self.shipped_images = [] + self.synced_projects = [] + self.config = { + "redis": {"host": "redis.internal", "port": 6379}, + "ec2": { + "ami_id": "ami-123456", + "instance_type": "t3.small", + "subnet_id": "subnet-123456", + "security_group_ids": ["sg-123456"], + "ssh_user": "ubuntu", + "region": "us-east-1", + }, + } + + def _run_cmd(self, cmd, host, user=None): + self.run_calls.append((cmd, host, user)) + if cmd[:2] == ["docker", "inspect"]: + runtime_id = cmd[2] + return SimpleNamespace( + returncode=0 if runtime_id in self.runtime_ids else 1, + stdout="", + stderr="missing", + ) + if cmd[:3] == ["docker", "rm", "-f"]: + self.runtime_ids.discard(cmd[3]) + return SimpleNamespace(returncode=0, stdout="", stderr="") + if cmd[:2] == ["docker", "run"]: + runtime_id = cmd[cmd.index("--name") + 1] + self.runtime_ids.add(runtime_id) + return SimpleNamespace(returncode=0, stdout=f"{runtime_id}\n", stderr="") + return SimpleNamespace(returncode=0, stdout="", stderr="") + + def _ensure_image_on_host(self, image, host, user): + if not _is_local_host(host): + self.shipped_images.append((image, host, user)) + + def _sync_project_to_host(self, host, user, remote_dir): + if not _is_local_host(host): + self.synced_projects.append((host, user, remote_dir)) + + def ensure_host_redis(self, host, user=None, redis_port=6379): + if host not in self.node_redis: + self.redis_containers[host] = f"ventis-redis-{host.replace('.', '-')}" + self.node_redis[host] = FakeRedis() + return self.node_redis[host] + + def _ensure_remote_docker(self, host, user=None): + return SimpleNamespace(returncode=0, stdout="", stderr="") + + +def make_instance(agent_name, replica_index, host="localhost", host_port=8000, provider="local"): + runtime_id = f"ventis-{provider.lower()}-{agent_name.lower()}-{replica_index}" + routing_host = _container_routing_host(host) + return { + "agent_name": agent_name, + "provider": provider, + "replica_index": str(replica_index), + "host": host, + "host_port": str(host_port), + "container_port": "50051", + "endpoint": f"{host}:{host_port}", + "redis_host": routing_host, + "redis_port": "6379", + "runtime_id": runtime_id, + } + + +def make_global_controller(instances): + install_grpc_stubs() + + from ventis.controller.global_controller import GlobalController + + controller = GlobalController.__new__(GlobalController) + controller.controllers = [] + controller.redis = FakeRedis() + controller.node_redis = {} + controller.redis_containers = {} + controller.containers = {} + controller._last_status = {} + controller._lc_stubs = {} + controller._healthy_calls = [] + controller._unhealthy_calls = [] + controller._run_cmd_calls = [] + + controller.runtime_manager = SimpleNamespace( + list_instances=lambda agent_name=None: list(instances), + list_runtime_nodes=lambda agent_specs=None: {}, + _user_for_instance=lambda instance: instance.get("user"), + _instance_id_from_record=lambda instance: ( + f"{instance['provider']}:{instance['agent_name']}:{instance['replica_index']}" + ), + ) + controller._on_controller_healthy = ( + lambda name, host, port: controller._healthy_calls.append((name, host, port)) + ) + controller._on_controller_unhealthy = ( + lambda name, host, port: controller._unhealthy_calls.append((name, host, port)) + ) + controller._run_cmd = ( + lambda cmd, host, user=None: controller._run_cmd_calls.append((cmd, host, user)) + or SimpleNamespace(returncode=0, stdout="", stderr="") + ) + controller._ensure_remote_docker = ( + lambda host, user=None: SimpleNamespace(returncode=0, stdout="", stderr="") + ) + return controller + + +def cleanup_payloads(stub_calls): + return [json.loads(message.resonse) for message in stub_calls] diff --git a/tests/test_cli_build.py b/tests/test_cli_build.py new file mode 100644 index 0000000..d732db5 --- /dev/null +++ b/tests/test_cli_build.py @@ -0,0 +1,163 @@ +import os +import tempfile +import unittest +from types import SimpleNamespace +from unittest.mock import patch + +import yaml + +from ventis import cli + + +class CliBuildTests(unittest.TestCase): + def setUp(self): + self._repo_cwd = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + os.chdir(self._repo_cwd) + + def _write_config(self, root): + config_dir = os.path.join(root, "config") + docker_dir = os.path.join(root, "docker") + os.makedirs(config_dir, exist_ok=True) + os.makedirs(docker_dir, exist_ok=True) + + with open(os.path.join(config_dir, "global_controller.yaml"), "w") as f: + yaml.safe_dump({"agents": []}, f, sort_keys=False) + + for name in ("generic-agent.Dockerfile", "global-controller.Dockerfile"): + with open(os.path.join(docker_dir, name), "w") as f: + f.write("FROM scratch\n") + + def _write_ec2_config(self, root): + config_dir = os.path.join(root, "config") + os.makedirs(config_dir, exist_ok=True) + with open(os.path.join(config_dir, "global_controller.yaml"), "w") as f: + yaml.safe_dump( + { + "agents": [{"name": "ExampleAgent", "provider": "EC2", "entrypoint": "agents/example_agent.py"}], + "ec2": { + "region": "us-east-1", + "ami_id": "ami-123", + "instance_type": "t2.nano", + "subnet_id": "subnet-123", + "security_group_ids": ["sg-123"], + "ssh_user": "ec2-user", + "agent_image": "ec2-agent-base", + "controller_image": "ec2-global-controller", + }, + }, + f, + sort_keys=False, + ) + + def test_build_uses_linux_amd64_platform_and_builds_controller_images(self): + commands = [] + + def _capture_run(cmd, check=True, **_kwargs): + commands.append(cmd) + return SimpleNamespace(returncode=0) + + with tempfile.TemporaryDirectory() as tmpdir: + self._write_config(tmpdir) + config_path = os.path.join(tmpdir, "config", "global_controller.yaml") + args = SimpleNamespace(config=config_path) + + with patch("subprocess.run", side_effect=_capture_run): + with patch("os.getcwd", return_value=tmpdir): + with patch.dict(os.environ, {}, clear=False): + cli.cmd_build(args) + + docker_builds = [cmd for cmd in commands if cmd[:2] == ["docker", "build"]] + self.assertEqual(len(docker_builds), 2) + self.assertEqual( + docker_builds[0], + [ + "docker", + "build", + "--platform", + "linux/amd64", + "-f", + os.path.join(tmpdir, "docker", "generic-agent.Dockerfile"), + "-t", + "ventis-agent-base", + tmpdir, + ], + ) + self.assertEqual( + docker_builds[1], + [ + "docker", + "build", + "--platform", + "linux/amd64", + "-f", + os.path.join(tmpdir, "docker", "global-controller.Dockerfile"), + "-t", + "ventis-global-controller", + tmpdir, + ], + ) + + def test_build_respects_platform_override(self): + commands = [] + + def _capture_run(cmd, check=True, **_kwargs): + commands.append(cmd) + return SimpleNamespace(returncode=0) + + with tempfile.TemporaryDirectory() as tmpdir: + self._write_config(tmpdir) + config_path = os.path.join(tmpdir, "config", "global_controller.yaml") + args = SimpleNamespace(config=config_path) + + with patch("subprocess.run", side_effect=_capture_run): + with patch("os.getcwd", return_value=tmpdir): + with patch.dict(os.environ, {"VENTIS_DOCKER_PLATFORM": "linux/arm64"}, clear=False): + cli.cmd_build(args) + + docker_builds = [cmd for cmd in commands if cmd[:2] == ["docker", "build"]] + self.assertTrue(docker_builds) + self.assertTrue(all(cmd[2:4] == ["--platform", "linux/arm64"] for cmd in docker_builds)) + + def test_ec2_build_uses_resolved_config_and_custom_image_names(self): + commands = [] + + def _capture_run(cmd, check=True, **_kwargs): + commands.append(cmd) + return SimpleNamespace(returncode=0) + + with tempfile.TemporaryDirectory() as tmpdir: + self._write_config(tmpdir) + self._write_ec2_config(tmpdir) + args = SimpleNamespace(config="config/global_controller.yaml") + os.chdir(tmpdir) + try: + with patch("subprocess.run", side_effect=_capture_run): + with patch("os.getcwd", return_value=tmpdir): + with patch("shutil.which", return_value="/usr/bin/docker"): + with patch("ventis.cli._running_on_ec2", return_value=True): + cli.cmd_build(args) + finally: + os.chdir(self._repo_cwd) + + docker_builds = [cmd for cmd in commands if cmd[:2] == ["docker", "build"]] + self.assertIn("ec2-agent-base", docker_builds[0]) + self.assertIn("ec2-global-controller", docker_builds[1]) + + def test_ec2_build_fails_fast_without_docker(self): + with tempfile.TemporaryDirectory() as tmpdir: + self._write_config(tmpdir) + self._write_ec2_config(tmpdir) + args = SimpleNamespace(config="config/global_controller.yaml") + os.chdir(tmpdir) + try: + with patch("os.getcwd", return_value=tmpdir): + with patch("shutil.which", return_value=None): + with patch("ventis.cli._running_on_ec2", return_value=True): + with self.assertRaisesRegex(RuntimeError, "EC2 translation for `ventis build` requires local Docker"): + cli.cmd_build(args) + finally: + os.chdir(self._repo_cwd) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_cli_deploy.py b/tests/test_cli_deploy.py new file mode 100644 index 0000000..fb662ab --- /dev/null +++ b/tests/test_cli_deploy.py @@ -0,0 +1,137 @@ +import os +import tempfile +import unittest +from types import SimpleNamespace +from unittest.mock import patch + +import yaml + +from ventis import cli + + +class _FakeController: + def __init__(self, config_path): + self.config_path = config_path + self.cleanup_calls = 0 + self.launch_calls = 0 + self.wait_calls = 0 + self.run_calls = 0 + + def cleanup(self): + self.cleanup_calls += 1 + + def launch_agents(self): + self.launch_calls += 1 + + def _wait_for_healthy(self): + self.wait_calls += 1 + + def run(self): + self.run_calls += 1 + + +class CliDeployTests(unittest.TestCase): + def setUp(self): + self._repo_cwd = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + os.chdir(self._repo_cwd) + + def _write_config(self, root, *, provider="local", include_ec2=False, ssh_key_path=None): + config_dir = os.path.join(root, "config") + grpc_dir = os.path.join(root, "grpc_stubs") + os.makedirs(config_dir, exist_ok=True) + os.makedirs(grpc_dir, exist_ok=True) + + with open(os.path.join(grpc_dir, "local_controler_pb2.py"), "w") as f: + f.write("MESSAGE = 'ok'\n") + with open(os.path.join(grpc_dir, "local_controler_pb2_grpc.py"), "w") as f: + f.write("SERVICE = 'ok'\n") + + config = { + "agents": [{"name": "ExampleAgent", "provider": provider}], + "redis": {"host": "localhost", "port": 6379, "db": 0}, + } + if include_ec2: + config["ec2"] = { + "region": "us-east-1", + "ami_id": "ami-123", + "instance_type": "t2.nano", + "subnet_id": "subnet-123", + "security_group_ids": ["sg-123"], + "ssh_user": "ec2-user", + } + if ssh_key_path is not None: + config["ec2"]["ssh_private_key_path"] = ssh_key_path + + with open(os.path.join(config_dir, "global_controller.yaml"), "w") as f: + yaml.safe_dump(config, f, sort_keys=False) + + def test_ec2_deploy_uses_global_controller_config_and_runs_controller(self): + created = [] + + def _factory(config_path): + controller = _FakeController(config_path) + created.append(controller) + return controller + + with tempfile.TemporaryDirectory() as tmpdir: + self._write_config(tmpdir, provider="EC2", include_ec2=True) + args = SimpleNamespace(config="config/global_controller.yaml") + os.chdir(tmpdir) + try: + with patch("ventis.controller.global_controller.GlobalController", side_effect=_factory): + with patch("ventis.cli._running_on_ec2", return_value=True): + with patch("ventis.cli._docker_available", return_value=True): + cli.cmd_deploy(args) + finally: + os.chdir(self._repo_cwd) + + self.assertEqual(len(created), 1) + self.assertEqual(created[0].config_path, "config/global_controller.yaml") + self.assertEqual(created[0].launch_calls, 1) + self.assertEqual(created[0].wait_calls, 1) + self.assertEqual(created[0].run_calls, 1) + + def test_ec2_deploy_preflight_stops_before_controller_run(self): + with tempfile.TemporaryDirectory() as tmpdir: + missing_key = os.path.join(tmpdir, "missing.pem") + self._write_config(tmpdir, provider="EC2", include_ec2=True, ssh_key_path=missing_key) + args = SimpleNamespace(config="config/global_controller.yaml") + os.chdir(tmpdir) + try: + with patch("ventis.controller.global_controller.GlobalController") as controller_cls: + with patch("ventis.cli._running_on_ec2", return_value=True): + with patch("ventis.cli._docker_available", return_value=True): + with self.assertRaisesRegex(RuntimeError, "ssh_private_key_path does not exist"): + cli.cmd_deploy(args) + finally: + os.chdir(self._repo_cwd) + + controller_cls.assert_not_called() + + def test_non_ec2_deploy_keeps_existing_behavior(self): + created = [] + + def _factory(config_path): + controller = _FakeController(config_path) + created.append(controller) + return controller + + with tempfile.TemporaryDirectory() as tmpdir: + self._write_config(tmpdir, provider="local", include_ec2=False) + args = SimpleNamespace(config="config/global_controller.yaml") + os.chdir(tmpdir) + try: + with patch("ventis.controller.global_controller.GlobalController", side_effect=_factory): + with patch("ventis.cli._running_on_ec2", return_value=False): + cli.cmd_deploy(args) + finally: + os.chdir(self._repo_cwd) + + self.assertEqual(created[0].config_path, "config/global_controller.yaml") + self.assertEqual(created[0].launch_calls, 1) + self.assertEqual(created[0].wait_calls, 1) + self.assertEqual(created[0].run_calls, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_integration.py b/tests/test_integration.py index fac98fd..48d582e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -1,6 +1,9 @@ import requests -import time import sys +import time +import unittest +from types import SimpleNamespace +from unittest.mock import patch def run_integration_test(): base_url = "http://localhost:8080" @@ -49,3 +52,66 @@ def run_integration_test(): if __name__ == "__main__": run_integration_test() + + +class IntegrationScriptTests(unittest.TestCase): + def test_run_integration_test_exits_zero_on_expected_result(self): + submit_response = SimpleNamespace( + status_code=202, + json=lambda: {"request_id": "req-1"}, + ) + status_response = SimpleNamespace( + json=lambda: { + "status": "done", + "result": { + "company_name": "MSFT Corp", + "competitors": "This is an LLM generated response to competitors", + "stock_price": 100.0, + }, + } + ) + + with patch("tests.test_integration.requests.post", return_value=submit_response): + with patch("tests.test_integration.requests.get", return_value=status_response): + with self.assertRaises(SystemExit) as raised: + run_integration_test() + + self.assertEqual(raised.exception.code, 0) + + def test_run_integration_test_exits_one_on_submit_failure(self): + submit_response = SimpleNamespace(status_code=500, text="boom") + + with patch("tests.test_integration.requests.post", return_value=submit_response): + with self.assertRaises(SystemExit) as raised: + run_integration_test() + + self.assertEqual(raised.exception.code, 1) + + def test_run_integration_test_exits_one_on_workflow_error(self): + submit_response = SimpleNamespace( + status_code=202, + json=lambda: {"request_id": "req-1"}, + ) + status_response = SimpleNamespace(json=lambda: {"status": "error", "error": "boom"}) + + with patch("tests.test_integration.requests.post", return_value=submit_response): + with patch("tests.test_integration.requests.get", return_value=status_response): + with self.assertRaises(SystemExit) as raised: + run_integration_test() + + self.assertEqual(raised.exception.code, 1) + + def test_run_integration_test_times_out(self): + submit_response = SimpleNamespace( + status_code=202, + json=lambda: {"request_id": "req-1"}, + ) + status_response = SimpleNamespace(json=lambda: {"status": "running"}) + + with patch("tests.test_integration.requests.post", return_value=submit_response): + with patch("tests.test_integration.requests.get", return_value=status_response): + with patch("tests.test_integration.time.sleep", return_value=None): + with self.assertRaises(SystemExit) as raised: + run_integration_test() + + self.assertEqual(raised.exception.code, 1) diff --git a/tests/test_performance.py b/tests/test_performance.py index d1f876a..32c65ad 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -4,6 +4,9 @@ import sys import requests import statistics +import unittest +from types import SimpleNamespace +from unittest.mock import patch def dispatch_request(session, base_url, payload): start_time = time.time() @@ -87,3 +90,69 @@ def run_performance_test(concurrent_users, total_requests): args = parser.parse_args() run_performance_test(args.concurrent, args.total) + + +class PerformanceScriptTests(unittest.TestCase): + def test_dispatch_request_returns_request_id_on_success(self): + response = SimpleNamespace( + raise_for_status=lambda: None, + json=lambda: {"request_id": "req-1"}, + ) + session = SimpleNamespace(post=lambda *args, **kwargs: response) + + result = dispatch_request(session, "http://test", {"ticker": "MSFT"}) + + self.assertEqual(result["status"], "dispatched") + self.assertEqual(result["req_id"], "req-1") + + def test_dispatch_request_returns_error_on_exception(self): + session = SimpleNamespace(post=lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("boom"))) + + result = dispatch_request(session, "http://test", {"ticker": "MSFT"}) + + self.assertEqual(result["status"], "error") + self.assertIn("boom", result["error"]) + + def test_poll_request_returns_done_status(self): + response = SimpleNamespace(json=lambda: {"status": "done"}) + session = SimpleNamespace(get=lambda *args, **kwargs: response) + + result = poll_request(session, "http://test", "req-1") + + self.assertEqual(result["status"], "done") + + def test_poll_request_returns_error_on_exception(self): + session = SimpleNamespace(get=lambda *args, **kwargs: (_ for _ in ()).throw(RuntimeError("boom"))) + + result = poll_request(session, "http://test", "req-1") + + self.assertEqual(result["status"], "error") + self.assertIn("boom", result["error"]) + + def test_run_performance_test_exits_zero_when_all_requests_complete(self): + class Session: + def post(self, *args, **kwargs): + return SimpleNamespace( + raise_for_status=lambda: None, + json=lambda: {"request_id": "req-1"}, + ) + + def get(self, *args, **kwargs): + return SimpleNamespace(json=lambda: {"status": "done"}) + + with patch("tests.test_performance.requests.Session", return_value=Session()): + with self.assertRaises(SystemExit) as raised: + run_performance_test(concurrent_users=1, total_requests=1) + + self.assertEqual(raised.exception.code, 0) + + def test_run_performance_test_exits_one_when_dispatch_fails(self): + class Session: + def post(self, *args, **kwargs): + raise RuntimeError("boom") + + with patch("tests.test_performance.requests.Session", return_value=Session()): + with self.assertRaises(SystemExit) as raised: + run_performance_test(concurrent_users=1, total_requests=1) + + self.assertEqual(raised.exception.code, 1) diff --git a/tests/test_runtime_manager.py b/tests/test_runtime_manager.py new file mode 100644 index 0000000..d11ed36 --- /dev/null +++ b/tests/test_runtime_manager.py @@ -0,0 +1,904 @@ +import json +import sys +import unittest +from types import ModuleType +from types import SimpleNamespace +from unittest.mock import patch + +local_controler_pb2 = ModuleType("local_controler_pb2") + + +class _JsonResponse: + def __init__(self, resonse): + self.resonse = resonse + + +local_controler_pb2.JsonResponse = _JsonResponse +sys.modules.setdefault("local_controler_pb2", local_controler_pb2) + +local_controler_pb2_grpc = ModuleType("local_controler_pb2_grpc") +local_controler_pb2_grpc.LocalControllerStub = type("LocalControllerStub", (), {}) +local_controler_pb2_grpc.__file__ = "local_controler_pb2_grpc.py" +sys.modules.setdefault("local_controler_pb2_grpc", local_controler_pb2_grpc) +sys.modules.setdefault("grpc", ModuleType("grpc")) + +from ventis.controller.global_controller import GlobalController +from ventis.controller.runtime_manager import RuntimeManager +from ventis.controller.cloud_provider_logic.EC2 import _runtime as ec2_runtime_impl + + +def _is_local_host(host): + return host in {"localhost", "127.0.0.1"} + + +class FakeRedis: + def __init__(self): + self.strings = {} + self.hashes = {} + self.sets = {} + self.client = self + + def set(self, key, value): + self.strings[key] = value + + def get(self, key): + return self.strings.get(key) + + def delete(self, *keys): + for key in keys: + self.strings.pop(key, None) + self.hashes.pop(key, None) + self.sets.pop(key, None) + + def hset(self, name, field, value): + self.hashes.setdefault(name, {})[field] = value + + def hset_multiple(self, name, mapping): + self.hashes.setdefault(name, {}).update(mapping) + + def hget(self, name, field): + return self.hashes.get(name, {}).get(field) + + def hgetall(self, name): + return dict(self.hashes.get(name, {})) + + def hdel(self, name, field): + self.hashes.setdefault(name, {}).pop(field, None) + + def sadd(self, name, *values): + self.sets.setdefault(name, set()).update(values) + + def srem(self, name, *values): + members = self.sets.setdefault(name, set()) + members.difference_update(values) + + def smembers(self, name): + return set(self.sets.get(name, set())) + + def scan_keys(self, pattern): + prefix = pattern.rstrip("*") + keys = set(self.strings) | set(self.hashes) | set(self.sets) + return [key for key in sorted(keys) if key.startswith(prefix)] + + +class FakeController: + def __init__(self): + self.redis = FakeRedis() + self.node_redis = {} + self.redis_containers = {} + self.containers = {} + self.runtime_ids = set() + self.run_calls = [] + self.shipped_images = [] + self.synced_projects = [] + self.config = { + "redis": {"host": "redis.internal", "port": 6379}, + "ec2": { + "ami_id": "ami-123456", + "instance_type": "t3.small", + "subnet_id": "subnet-123456", + "security_group_ids": ["sg-123456"], + "ssh_user": "ubuntu", + "region": "us-east-1", + "key_name": "ventis-key", + "controller_health_timeout": 1, + "public_ip_timeout": 1, + }, + } + + def _run_cmd(self, cmd, host, user=None): + self.run_calls.append((cmd, host, user)) + if cmd[:2] == ["docker", "inspect"]: + runtime_id = cmd[2] + return SimpleNamespace( + returncode=0 if runtime_id in self.runtime_ids else 1, + stdout="", + stderr="missing", + ) + if cmd[:3] == ["docker", "rm", "-f"]: + self.runtime_ids.discard(cmd[3]) + return SimpleNamespace(returncode=0, stdout="", stderr="") + if cmd[:2] == ["docker", "run"]: + runtime_id = cmd[cmd.index("--name") + 1] + self.runtime_ids.add(runtime_id) + return SimpleNamespace(returncode=0, stdout=f"{runtime_id}\n", stderr="") + return SimpleNamespace(returncode=0, stdout="", stderr="") + + def _ensure_image_on_host(self, image, host, user): + if _is_local_host(host): + return + self.shipped_images.append((image, host, user)) + + def _sync_project_to_host(self, host, user, remote_dir): + if _is_local_host(host): + return + self.synced_projects.append((host, user, remote_dir)) + + def _ensure_remote_docker(self, host, user=None): + return SimpleNamespace(returncode=0, stdout="", stderr="") + + def ensure_host_redis(self, host, user=None, redis_port=6379, ssh_host=None): + if host not in self.node_redis: + self._run_cmd( + [ + "docker", "run", "-d", + "--name", f"ventis-redis-{host.replace('.', '-')}", + "-p", f"{redis_port}:6379", + "redis:alpine", + ], + ssh_host or host, + user, + ) + self.redis_containers[host] = f"ventis-redis-{host.replace('.', '-')}" + self.node_redis[host] = FakeRedis() + return self.node_redis[host] + + +class FakeWaiter: + def __init__(self): + self.calls = [] + + def wait(self, InstanceIds): + self.calls.append(list(InstanceIds)) + + +class FakeEC2Client: + def __init__(self, public_ip="54.10.20.30", private_ip="10.0.0.30"): + self.public_ip = public_ip + self.private_ip = private_ip + self.run_requests = [] + self.terminate_requests = [] + self.waiter = FakeWaiter() + self.instances = {} + + def run_instances(self, **kwargs): + self.run_requests.append(kwargs) + instance_id = f"i-test{len(self.run_requests)}" + self.instances[instance_id] = { + "InstanceId": instance_id, + "State": {"Name": "running"}, + "PrivateIpAddress": self.private_ip, + "PublicIpAddress": self.public_ip, + } + return {"Instances": [{"InstanceId": instance_id}]} + + def get_waiter(self, name): + assert name == "instance_running" + return self.waiter + + def describe_instances(self, InstanceIds): + reservations = [] + for instance_id in InstanceIds: + if instance_id in self.instances: + reservations.append({"Instances": [self.instances[instance_id]]}) + return {"Reservations": reservations} + + def terminate_instances(self, InstanceIds): + self.terminate_requests.append(list(InstanceIds)) + return {} + + +class FakeSession: + def __init__(self, client, region_name="us-east-1", credentials=True): + self._client = client + self.region_name = region_name + self._credentials = object() if credentials else None + self.client_calls = [] + + def get_credentials(self): + return self._credentials + + def client(self, service_name, region_name=None): + self.client_calls.append((service_name, region_name)) + return self._client + + +class RuntimeManagerTests(unittest.TestCase): + def setUp(self): + self.fake_ec2_client = FakeEC2Client() + self.fake_ec2_session = FakeSession(self.fake_ec2_client) + self.ec2_session_patch = patch.object( + ec2_runtime_impl.boto3, + "Session", + side_effect=lambda **_kwargs: self.fake_ec2_session, + ) + self.health_patch = patch.object(ec2_runtime_impl, "_check_controller_health", return_value=True) + self.ec2_session_patch.start() + self.health_patch.start() + + def tearDown(self): + self.health_patch.stop() + self.ec2_session_patch.stop() + + def test_ensure_instances_creates_missing_instances_and_writes_redis_records(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "Alpha", + "provider": "local", + "replicas": 2, + "redis_port": 6379, + } + ] + ) + + self.assertEqual( + controller.redis.smembers("agent:Alpha:instances"), + {"local:Alpha:0", "local:Alpha:1"}, + ) + self.assertEqual( + controller.redis.hgetall("agent_instance:local:Alpha:0"), + { + "agent_name": "Alpha", + "provider": "local", + "replica_index": "0", + "host": "localhost", + "host_port": "8000", + "container_port": "50051", + "endpoint": "localhost:8000", + "redis_host": "host.docker.internal", + "redis_port": "6379", + "runtime_id": "ventis-local-alpha-0", + }, + ) + self.assertEqual( + json.loads(controller.redis.hget("routing_table:endpoints", "Alpha")), + ["host.docker.internal:8000", "host.docker.internal:8001"], + ) + self.assertEqual(controller.redis.hget("routing_table:stateful", "Alpha"), None) + self.assertEqual(controller.redis.smembers("routing_table:services"), {"Alpha"}) + + def test_stable_instance_ids_include_provider(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "Beta", + "provider": "EC2", + "replicas": 1, + "redis_port": 6380, + } + ] + ) + + self.assertIn("EC2:Beta:0", controller.redis.smembers("agent:Beta:instances")) + self.assertEqual( + controller.redis.hgetall("agent_instance:EC2:Beta:0")["provider"], + "EC2", + ) + self.assertEqual( + controller.redis.hgetall("agent_instance:EC2:Beta:0")["host_port"], + "50051", + ) + + def test_stateful_and_services_metadata_are_synced(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.sync_routing_metadata( + [ + {"name": "Sticky", "stateful": True}, + {"name": "Plain", "stateful": False}, + ] + ) + + self.assertEqual(controller.redis.hget("routing_table:stateful", "Sticky"), "true") + self.assertEqual(controller.redis.hget("routing_table:stateful", "Plain"), None) + self.assertEqual( + controller.redis.smembers("routing_table:services"), + {"Sticky", "Plain"}, + ) + + def test_sync_routing_metadata_removes_stale_service_and_stateful_flags(self): + controller = FakeController() + redis = controller.redis + manager = RuntimeManager(controller, redis) + + redis.sadd("routing_table:services", "Old", "Keep") + redis.hset("routing_table:stateful", "Old", "true") + redis.hset("routing_table:stateful", "Keep", "true") + + manager.sync_routing_metadata([{"name": "Keep", "stateful": False}]) + + self.assertEqual(redis.smembers("routing_table:services"), {"Keep"}) + self.assertEqual(redis.hget("routing_table:stateful", "Old"), None) + self.assertEqual(redis.hget("routing_table:stateful", "Keep"), None) + + def test_stale_redis_instance_gets_recreated_when_runtime_is_missing(self): + controller = FakeController() + redis = controller.redis + manager = RuntimeManager(controller, redis) + + redis.hset_multiple( + "agent_instance:local:Gamma:0", + { + "agent_name": "Gamma", + "provider": "local", + "replica_index": "0", + "host": "localhost", + "host_port": "8000", + "container_port": "50051", + "endpoint": "localhost:8000", + "redis_host": "host.docker.internal", + "redis_port": "6379", + "runtime_id": "stale-runtime", + }, + ) + redis.sadd("agent:Gamma:instances", "local:Gamma:0") + + manager.ensure_instances( + [ + { + "name": "Gamma", + "provider": "local", + "replicas": 1, + "redis_port": 6379, + } + ] + ) + + self.assertEqual( + redis.hgetall("agent_instance:local:Gamma:0")["runtime_id"], + "ventis-local-gamma-0", + ) + self.assertTrue(any(call[0][:2] == ["docker", "run"] for call in controller.run_calls)) + + def test_existing_runtime_is_reused_without_relaunch(self): + controller = FakeController() + redis = controller.redis + controller.runtime_ids.add("ventis-local-epsilon-0") + manager = RuntimeManager(controller, redis) + + redis.hset_multiple( + "agent_instance:local:Epsilon:0", + { + "agent_name": "Epsilon", + "provider": "local", + "replica_index": "0", + "host": "localhost", + "host_port": "8000", + "container_port": "50051", + "endpoint": "localhost:8000", + "redis_host": "host.docker.internal", + "redis_port": "6379", + "runtime_id": "ventis-local-epsilon-0", + }, + ) + redis.sadd("agent:Epsilon:instances", "local:Epsilon:0") + + manager.ensure_instances( + [ + { + "name": "Epsilon", + "provider": "local", + "replicas": 1, + "redis_port": 6379, + } + ] + ) + + self.assertFalse(any(call[0][:2] == ["docker", "run"] for call in controller.run_calls)) + self.assertEqual( + json.loads(redis.hget("routing_table:endpoints", "Epsilon")), + ["host.docker.internal:8000"], + ) + + def test_local_provider_allocates_ports_without_yaml_port(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "Zeta", + "provider": "local", + "replicas": 2, + "redis_port": 6379, + } + ] + ) + + self.assertEqual( + json.loads(controller.redis.hget("routing_table:endpoints", "Zeta")), + ["host.docker.internal:8000", "host.docker.internal:8001"], + ) + self.assertEqual( + controller.redis.hgetall("agent_instance:local:Zeta:0")["host_port"], + "8000", + ) + self.assertEqual( + controller.redis.hgetall("agent_instance:local:Zeta:1")["host_port"], + "8001", + ) + + def test_ec2_launch_includes_redis_and_agent_env(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "Delta", + "provider": "EC2", + "replicas": 1, + "redis_port": 6380, + } + ] + ) + + docker_run, host, user = next( + (cmd, host, user) + for cmd, host, user in controller.run_calls + if cmd[:2] == ["docker", "run"] and "ventis-delta" in cmd + ) + self.assertEqual(host, "54.10.20.30") + self.assertEqual(user, "ubuntu") + self.assertIn("VENTIS_REDIS_HOST=10.0.0.30", docker_run) + self.assertIn("VENTIS_REDIS_PORT=6380", docker_run) + self.assertIn("VENTIS_AGENT_HOST=10.0.0.30", docker_run) + self.assertIn("VENTIS_AGENT_PORT=50051", docker_run) + self.assertEqual(controller.shipped_images, [("ventis-delta", "54.10.20.30", "ubuntu")]) + self.assertEqual( + controller.redis.hgetall("agent_instance:EC2:Delta:0")["endpoint"], + "10.0.0.30:50051", + ) + self.assertIn("10.0.0.30", controller.node_redis) + self.assertEqual( + controller.redis.hgetall("agent_instance:EC2:Delta:0")["redis_host"], + "10.0.0.30", + ) + + def test_ec2_agent_entrypoint_uses_generic_image_and_bind_mount(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "Mounted", + "provider": "EC2", + "replicas": 1, + "redis_port": 6380, + "entrypoint": "agents/mounted_agent.py", + } + ] + ) + + docker_run, host, user = next( + (cmd, host, user) + for cmd, host, user in controller.run_calls + if cmd[:2] == ["docker", "run"] and "ventis-ec2-mounted-0" in cmd + ) + self.assertEqual((host, user), ("54.10.20.30", "ubuntu")) + self.assertIn("ventis-agent-base", docker_run) + self.assertIn("/opt/ventis/project:/workspace", docker_run) + self.assertIn("VENTIS_AGENT_NAME=Mounted", docker_run) + self.assertIn("VENTIS_AGENT_FILE=agents/mounted_agent.py", docker_run) + self.assertEqual( + controller.synced_projects, + [("54.10.20.30", "ubuntu", "/opt/ventis/project")], + ) + self.assertEqual( + controller.shipped_images, + [("ventis-agent-base", "54.10.20.30", "ubuntu")], + ) + + def test_mixed_local_and_ec2_publish_routing_snapshot_to_host_redis(self): + controller = FakeController() + controller.node_redis["localhost"] = FakeRedis() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "LocalA", + "provider": "local", + "replicas": 1, + "redis_port": 6379, + "stateful": True, + }, + { + "name": "RemoteB", + "provider": "EC2", + "replicas": 1, + "redis_port": 6380, + }, + ] + ) + manager.publish_policy_rules([{"match": {"role": "admin"}, "access": "all"}]) + + self.assertIn("local:LocalA:0", controller.redis.smembers("agent:LocalA:instances")) + self.assertIn("EC2:RemoteB:0", controller.redis.smembers("agent:RemoteB:instances")) + self.assertIsNone(controller.redis.hget("routing_table:endpoints", "LocalA")) + + for host in ("localhost", "10.0.0.30"): + host_redis = controller.node_redis[host] + self.assertEqual( + host_redis.smembers("routing_table:services"), + {"LocalA", "RemoteB"}, + ) + self.assertEqual(host_redis.hget("routing_table:stateful", "LocalA"), "true") + self.assertEqual( + json.loads(host_redis.hget("routing_table:endpoints", "LocalA")), + ["host.docker.internal:8000"], + ) + self.assertEqual( + json.loads(host_redis.hget("routing_table:endpoints", "RemoteB")), + ["10.0.0.30:50051"], + ) + self.assertEqual( + json.loads(host_redis.get("policy:rules")), + [{"match": {"role": "admin"}, "access": "all"}], + ) + + def test_ec2_validate_config_fails_when_required_fields_are_missing(self): + controller = FakeController() + controller.config["ec2"].pop("ami_id") + ec2_runtime_impl._set_controller(controller) + + with self.assertRaisesRegex(ValueError, "Missing EC2 config"): + ec2_runtime_impl.validate_config() + + def test_ec2_create_instance_tags_waits_and_returns_runtime_id_with_instance_id(self): + controller = FakeController() + ec2_runtime_impl._set_controller(controller) + + instance = ec2_runtime_impl.create_instance( + {"name": "Tagged", "provider": "EC2", "redis_port": 6390}, + 2, + ) + + request = self.fake_ec2_client.run_requests[0] + self.assertEqual(request["ImageId"], "ami-123456") + self.assertEqual(request["KeyName"], "ventis-key") + tags = request["TagSpecifications"][0]["Tags"] + self.assertIn({"Key": "Name", "Value": "ventis-Tagged-2"}, tags) + self.assertIn({"Key": "VentisManaged", "Value": "true"}, tags) + self.assertIn({"Key": "VentisReplica", "Value": "2"}, tags) + self.assertEqual(self.fake_ec2_client.waiter.calls, [["i-test1"]]) + self.assertEqual(instance["host"], "10.0.0.30") + self.assertEqual(instance["endpoint"], "10.0.0.30:50051") + self.assertIn("--i-test1", instance["runtime_id"]) + + def test_ec2_create_instance_terminates_when_health_check_fails(self): + controller = FakeController() + ec2_runtime_impl._set_controller(controller) + + with patch.object(ec2_runtime_impl, "_check_controller_health", side_effect=TimeoutError("boom")): + with self.assertRaises(TimeoutError): + ec2_runtime_impl.create_instance({"name": "Broken", "provider": "EC2"}, 0) + + self.assertEqual(self.fake_ec2_client.terminate_requests, [["i-test1"]]) + + def test_workflow_launch_uses_runtime_managed_api_port(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.ensure_instances( + [ + { + "name": "Workflow", + "provider": "local", + "type": "workflow", + "replicas": 1, + "redis_port": 6379, + } + ] + ) + + docker_run = next( + cmd for cmd, _host, _user in controller.run_calls if cmd[:2] == ["docker", "run"] + ) + self.assertIn("8000:50051", docker_run) + self.assertIn("8080:8080", docker_run) + + def test_list_instances_returns_all_records_sorted_by_key(self): + controller = FakeController() + redis = controller.redis + manager = RuntimeManager(controller, redis) + redis.hset_multiple( + "agent_instance:local:Beta:0", + { + "agent_name": "Beta", + "provider": "local", + "replica_index": "0", + "host": "localhost", + "host_port": "8001", + "endpoint": "localhost:8001", + "runtime_id": "ventis-local-beta-0", + }, + ) + redis.hset_multiple( + "agent_instance:local:Alpha:0", + { + "agent_name": "Alpha", + "provider": "local", + "replica_index": "0", + "host": "localhost", + "host_port": "8000", + "endpoint": "localhost:8000", + "runtime_id": "ventis-local-alpha-0", + }, + ) + + self.assertEqual( + [instance["agent_name"] for instance in manager.list_instances()], + ["Alpha", "Beta"], + ) + + def test_list_instances_for_agent_ignores_missing_records(self): + controller = FakeController() + redis = controller.redis + manager = RuntimeManager(controller, redis) + redis.sadd("agent:Alpha:instances", "local:Alpha:0", "local:Alpha:1") + redis.hset_multiple( + "agent_instance:local:Alpha:0", + { + "agent_name": "Alpha", + "provider": "local", + "replica_index": "0", + "host": "localhost", + "host_port": "8000", + "endpoint": "localhost:8000", + "runtime_id": "ventis-local-alpha-0", + }, + ) + + self.assertEqual(len(manager.list_instances("Alpha")), 1) + + def test_remove_missing_instance_is_noop(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + manager.remove_instance("local:Missing:0") + + self.assertEqual(controller.run_calls, []) + + def test_instance_ids_are_stable_and_provider_scoped(self): + self.assertEqual( + RuntimeManager._instance_id("local", "Alpha", 2), + "local:Alpha:2", + ) + self.assertEqual( + RuntimeManager._instance_key("EC2", "Alpha", 2), + "agent_instance:EC2:Alpha:2", + ) + + +class GlobalControllerRuntimeBackedTests(unittest.TestCase): + def make_controller(self, instances): + controller = GlobalController.__new__(GlobalController) + controller.controllers = [] + controller.redis = FakeRedis() + controller.node_redis = {} + controller.redis_containers = {} + controller._last_status = {} + controller._lc_stubs = {} + controller.containers = {} + controller.runtime_manager = SimpleNamespace( + list_instances=lambda agent_name=None: list(instances), + _user_for_instance=lambda instance: "ubuntu", + ) + controller._healthy_calls = [] + controller._unhealthy_calls = [] + controller._on_controller_healthy = ( + lambda name, host, port: controller._healthy_calls.append((name, host, port)) + ) + controller._on_controller_unhealthy = ( + lambda name, host, port: controller._unhealthy_calls.append((name, host, port)) + ) + controller._run_cmd_calls = [] + + def _run_cmd(cmd, host, user=None): + controller._run_cmd_calls.append((cmd, host, user)) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + controller._run_cmd = _run_cmd + controller._ensure_remote_docker = ( + lambda host, user=None: SimpleNamespace(returncode=0, stdout="", stderr="") + ) + return controller + + def test_wait_for_healthy_uses_runtime_manager_instances(self): + instance = { + "agent_name": "Alpha", + "host": "localhost", + "host_port": "8000", + "endpoint": "localhost:8000", + "runtime_id": "ventis-local-alpha-0", + } + controller = self.make_controller([instance]) + controller.node_redis["localhost"] = FakeRedis() + controller.node_redis["localhost"].set( + "controller:host.docker.internal:8000:status", + "healthy", + ) + + GlobalController._wait_for_healthy(controller, timeout=1, interval=0) + + self.assertEqual(controller._last_status, {("localhost", "8000"): "healthy"}) + + def test_poll_controllers_uses_runtime_manager_instances(self): + instance = { + "agent_name": "Beta", + "host": "localhost", + "host_port": "8000", + "endpoint": "localhost:8000", + "runtime_id": "ventis-local-beta-0", + } + controller = self.make_controller([instance]) + controller.node_redis["localhost"] = FakeRedis() + controller.node_redis["localhost"].set( + "controller:host.docker.internal:8000:status", + "healthy", + ) + + GlobalController._poll_controllers(controller) + + self.assertEqual(controller._healthy_calls, [("Beta", "localhost", "8000")]) + self.assertEqual(controller._unhealthy_calls, []) + + def test_trigger_cleanup_uses_runtime_manager_instances(self): + instance = { + "agent_name": "Gamma", + "host": "localhost", + "host_port": "9301", + "endpoint": "localhost:9301", + "runtime_id": "ventis-local-gamma-0", + } + controller = self.make_controller([instance]) + controller.redis.sadd("request:completed", "req-1") + seen = [] + + class Stub: + def Cleanup(self, message): + seen.append(json.loads(message.resonse)) + + controller._get_lc_stub = lambda endpoint: Stub() + + GlobalController._trigger_cleanup(controller) + + self.assertEqual(seen, [{"request_id": "req-1"}]) + self.assertEqual(controller.redis.smembers("request:completed"), set()) + + def test_stop_docker_agents_uses_runtime_manager_instances(self): + instance = { + "agent_name": "Delta", + "host": "10.0.0.7", + "host_port": "9401", + "endpoint": "10.0.0.7:9401", + "runtime_id": "ventis-ec2-delta-0", + "provider": "EC2", + "replica_index": "0", + } + controller = self.make_controller([instance]) + controller.containers = {"Delta": ["ventis-ec2-delta-0"]} + removed = [] + controller.runtime_manager.remove_instance = lambda instance_id: removed.append(instance_id) + controller.runtime_manager._instance_id_from_record = ( + lambda item: f"{item['provider']}:{item['agent_name']}:{item['replica_index']}" + ) + + GlobalController._stop_docker_agents(controller) + + self.assertEqual(removed, ["EC2:Delta:0"]) + self.assertEqual(controller._run_cmd_calls, []) + self.assertEqual(controller.containers, {}) + + def test_launch_redis_containers_keeps_central_redis(self): + controller = self.make_controller([]) + central_redis = controller.redis + controller.runtime_manager = SimpleNamespace( + list_runtime_nodes=lambda agent_specs=None: { + "localhost": {"user": None, "redis_port": 6379} + } + ) + + with patch("ventis.controller.global_controller.RedisClient", side_effect=lambda **_kwargs: FakeRedis()): + GlobalController._launch_redis_containers(controller) + + self.assertIs(controller.redis, central_redis) + self.assertIn("localhost", controller.node_redis) + self.assertIsNot(controller.node_redis["localhost"], central_redis) + + def test_ensure_host_redis_republishes_policy_rules_for_new_node(self): + controller = self.make_controller([]) + published = [] + controller._load_policy_rules = lambda: [{"match": {"role": "admin"}, "access": "all"}] + controller.runtime_manager = SimpleNamespace( + list_runtime_nodes=lambda agent_specs=None: {}, + publish_policy_rules=lambda rules: published.append(rules), + ) + + with patch("ventis.controller.global_controller.RedisClient", side_effect=lambda **_kwargs: FakeRedis()): + GlobalController.ensure_host_redis(controller, "54.10.20.30", "ubuntu", 6380) + + self.assertEqual( + published, + [[{"match": {"role": "admin"}, "access": "all"}]], + ) + self.assertIn("54.10.20.30", controller.node_redis) + + def test_ensure_host_redis_waits_for_redis_before_registering_node(self): + controller = self.make_controller([]) + waited = [] + controller._load_policy_rules = lambda: [] + controller.runtime_manager = SimpleNamespace( + list_runtime_nodes=lambda agent_specs=None: {}, + publish_policy_rules=lambda rules: None, + ) + + with patch("ventis.controller.global_controller.RedisClient", side_effect=lambda **_kwargs: FakeRedis()): + with patch.object( + GlobalController, + "_wait_for_redis", + side_effect=lambda redis_client, host, port: waited.append((redis_client, host, port)), + ): + GlobalController.ensure_host_redis(controller, "54.10.20.30", "ubuntu", 6380) + + self.assertEqual(len(waited), 1) + self.assertEqual(waited[0][1:], ("54.10.20.30", 6380)) + self.assertIs(controller.node_redis["54.10.20.30"], waited[0][0]) + + def test_wait_for_redis_timeout_mentions_ec2_security_group(self): + class FailingRedis: + def set(self, *_args): + raise TimeoutError("Timeout connecting to server") + + controller = self.make_controller([]) + + with self.assertRaisesRegex(TimeoutError, "security group allows inbound TCP 6380"): + GlobalController._wait_for_redis( + controller, + FailingRedis(), + "54.10.20.30", + 6380, + timeout=0, + interval=0, + ) + + +if __name__ == "__main__": + unittest.main() + + +class LegacyConfigRejectionTests(unittest.TestCase): + def test_runtime_manager_rejects_legacy_host_port_yaml(self): + controller = FakeController() + manager = RuntimeManager(controller, controller.redis) + + with self.assertRaisesRegex(ValueError, "Legacy YAML host/port"): + manager.ensure_instances([ + { + "name": "Legacy", + "provider": "local", + "host": "localhost", + "port": 9000, + "replicas": [{"host": "localhost", "port": 9000}], + } + ]) diff --git a/tests/test_stateful_affinity.py b/tests/test_stateful_affinity.py index c781696..fad248e 100644 --- a/tests/test_stateful_affinity.py +++ b/tests/test_stateful_affinity.py @@ -17,16 +17,12 @@ import json import os -import sys import uuid import pytest +from redis.exceptions import RedisError -# Ensure project paths are importable -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "utils")) -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "ventis", "controller")) - -from redis_client import RedisClient +from ventis.utils.redis_client import RedisClient # ------------------------------------------------------------------ # @@ -44,6 +40,11 @@ def redis(): """Provide a RedisClient connected to a test Redis, flushed before each test.""" client = RedisClient(host=REDIS_HOST, port=REDIS_PORT) + try: + client.client.ping() + except RedisError as exc: + pytest.skip(f"Redis is not available at {REDIS_HOST}:{REDIS_PORT}: {exc}") + # Flush only keys in the test namespace to avoid clobbering production data for key in client.scan_keys("routing_table:*"): client.delete(key) @@ -111,6 +112,14 @@ def test_stateful_flag_written(self, redis): assert redis.hget(ROUTING_STATEFUL_KEY, "StatefulAgent") == "true" assert redis.hget(ROUTING_STATEFUL_KEY, "StatelessAgent") is None + def test_empty_endpoint_list_is_stored_as_empty_json_list(self, redis): + """An agent with no endpoints should still store valid JSON.""" + _seed_routing_table(redis, [ + {"name": "EmptyAgent", "endpoints": []}, + ]) + + assert json.loads(redis.hget(ROUTING_ENDPOINTS_KEY, "EmptyAgent")) == [] + # ------------------------------------------------------------------ # # Tests — Affinity resolution # @@ -214,6 +223,33 @@ def test_unknown_service_returns_none(self, redis): ]) assert self._resolve_endpoint(redis, "NonExistent", uuid.uuid4().hex) is None + def test_empty_endpoint_list_returns_none(self, redis): + """A known service with no endpoints should not route anywhere.""" + _seed_routing_table(redis, [ + {"name": "Empty", "endpoints": []}, + ]) + + assert self._resolve_endpoint(redis, "Empty", uuid.uuid4().hex) is None + + def test_stateful_without_request_id_does_not_write_affinity(self, redis): + """Stateful routing only becomes sticky when a request_id is supplied.""" + _seed_routing_table(redis, [ + {"name": "FA", "endpoints": ["h:5001", "h:5002"], "stateful": True}, + ]) + + assert self._resolve_endpoint(redis, "FA", None) in ["h:5001", "h:5002"] + assert redis.scan_keys("affinity:*") == [] + + def test_existing_affinity_binding_is_used_even_if_endpoint_order_changes(self, redis): + """Existing sticky bindings should win over later endpoint list order.""" + rid = uuid.uuid4().hex + redis.hset(f"affinity:{rid}", "FA", "h:5002") + _seed_routing_table(redis, [ + {"name": "FA", "endpoints": ["h:5001", "h:5002"], "stateful": True}, + ]) + + assert self._resolve_endpoint(redis, "FA", rid) == "h:5002" + # ------------------------------------------------------------------ # # Tests — Cleanup # diff --git a/ventis/cli.py b/ventis/cli.py index ebbdd88..f2b6189 100644 --- a/ventis/cli.py +++ b/ventis/cli.py @@ -14,9 +14,21 @@ import shutil import subprocess import sys +import urllib.error +import urllib.request logging.basicConfig(level=logging.INFO) logger = logging.getLogger("ventis") +DEFAULT_DOCKER_PLATFORM = "linux/amd64" +DEFAULT_CONFIG_PATH = "config/global_controller.yaml" +EC2_REQUIRED_CONFIG_KEYS = ( + "ami_id", + "instance_type", + "subnet_id", + "security_group_ids", + "ssh_user", + "region", +) # ------------------------------------------------------------------ # @@ -40,6 +52,119 @@ def _load_config(config_path): return yaml.safe_load(f) +def _docker_platform(): + """Return the target Docker platform for portable runtime images.""" + return os.environ.get("VENTIS_DOCKER_PLATFORM", DEFAULT_DOCKER_PLATFORM) + + +def _docker_build_cmd(*args): + """Build a Docker build command with an explicit target platform.""" + return ["docker", "build", "--platform", _docker_platform(), *args] + + +def _truthy_env(name): + value = os.environ.get(name, "") + return value.lower() in {"1", "true", "yes", "on"} + + +def _running_on_ec2(): + """Best-effort EC2 detection with test and escape-hatch env overrides.""" + if _truthy_env("VENTIS_DISABLE_EC2_TRANSLATION"): + return False + if _truthy_env("VENTIS_FORCE_EC2"): + return True + + request = urllib.request.Request( + "http://169.254.169.254/latest/meta-data/instance-id", + headers={"X-aws-ec2-metadata-token-ttl-seconds": "60"}, + ) + try: + with urllib.request.urlopen(request, timeout=0.2) as response: + return bool(response.read(32)) + except (urllib.error.URLError, TimeoutError, OSError, ValueError): + return False + + +def _docker_available(): + if not shutil.which("docker"): + return False + + try: + result = subprocess.run( + ["docker", "info"], + capture_output=True, + text=True, + check=False, + ) + except OSError: + return False + + return result.returncode == 0 + + +def _require_docker_for_ec2(command_name): + if _docker_available(): + return + raise RuntimeError( + f"EC2 translation for `ventis {command_name}` requires local Docker on the EC2 host, " + "but Docker is unavailable or unreachable." + ) + + +def _uses_ec2_agents(config): + return any(agent.get("provider", "local").upper() == "EC2" for agent in config.get("agents", [])) + + +def _log_ec2_translation(command_name, config_path, config): + if _running_on_ec2() and _uses_ec2_agents(config): + logger.info( + "EC2 translation active for `ventis %s` using config: %s", + command_name, + config_path, + ) + + +def _ec2_build_image_names(config): + ec2_cfg = config.get("ec2", {}) + return { + "generic_agent": ec2_cfg.get("agent_image", "ventis-agent-base"), + "global_controller": ec2_cfg.get("controller_image", "ventis-global-controller"), + } + + +def _ensure_grpc_stubs_importable(project_dir): + grpc_stubs_dir = os.path.join(project_dir, "grpc_stubs") + if grpc_stubs_dir not in sys.path: + sys.path.insert(0, grpc_stubs_dir) + + try: + __import__("local_controler_pb2") + __import__("local_controler_pb2_grpc") + except ImportError as exc: + raise RuntimeError( + "EC2 deploy preflight failed: generated grpc_stubs are missing or not importable. " + "Run `ventis build` on this host first." + ) from exc + + +def _preflight_ec2_deploy(config, project_dir): + ec2_cfg = config.get("ec2", {}) + missing = [key for key in EC2_REQUIRED_CONFIG_KEYS if not ec2_cfg.get(key)] + if missing: + raise RuntimeError( + f"EC2 deploy preflight failed: missing ec2 config keys: {', '.join(sorted(missing))}" + ) + + ssh_key_path = ec2_cfg.get("ssh_private_key_path") + if ssh_key_path and not os.path.isfile(ssh_key_path): + raise RuntimeError( + f"EC2 deploy preflight failed: ssh_private_key_path does not exist: {ssh_key_path}" + ) + + _require_docker_for_ec2("deploy") + _ensure_grpc_stubs_importable(project_dir) + + # ------------------------------------------------------------------ # # ventis new-project # # ------------------------------------------------------------------ # @@ -89,6 +214,11 @@ def cmd_build(args): sys.exit(1) config = _load_config(config_path) + ec2_translation = _running_on_ec2() + if ec2_translation: + _require_docker_for_ec2("build") + _log_ec2_translation("build", config_path, config) + image_names = _ec2_build_image_names(config) if ec2_translation else {} agents = config.get("agents", []) project_dir = os.getcwd() package_dir = _get_package_dir() @@ -138,7 +268,46 @@ def cmd_build(args): ], check=True) # -------------------------------------------------------------- # - # Step 3: Generate Docker contexts and build images # + # Step 3: Build the generic agent image used by EC2 runtimes # + # -------------------------------------------------------------- # + generic_agent_dockerfile = os.path.join(project_dir, "docker", "generic-agent.Dockerfile") + if os.path.isfile(generic_agent_dockerfile): + generic_agent_image = image_names.get("generic_agent", "ventis-agent-base") + logger.info( + "Building generic agent image: %s (platform=%s)", + generic_agent_image, + _docker_platform(), + ) + subprocess.run( + _docker_build_cmd("-f", generic_agent_dockerfile, "-t", generic_agent_image, project_dir), + check=True, + ) + else: + logger.warning("Generic agent Dockerfile not found: %s", generic_agent_dockerfile) + + global_controller_dockerfile = os.path.join(project_dir, "docker", "global-controller.Dockerfile") + if os.path.isfile(global_controller_dockerfile): + global_controller_image = image_names.get("global_controller", "ventis-global-controller") + logger.info( + "Building global controller image: %s (platform=%s)", + global_controller_image, + _docker_platform(), + ) + subprocess.run( + _docker_build_cmd( + "-f", + global_controller_dockerfile, + "-t", + global_controller_image, + project_dir, + ), + check=True, + ) + else: + logger.warning("Global controller Dockerfile not found: %s", global_controller_dockerfile) + + # -------------------------------------------------------------- # + # Step 4: Generate Docker contexts and build images # # -------------------------------------------------------------- # for agent_cfg in agents: agent_name = agent_cfg["name"] @@ -167,7 +336,7 @@ def cmd_build(args): image_name = f"ventis-{agent_name.lower()}" logger.info("Building Docker image: %s", image_name) - subprocess.run(["docker", "build", "-t", image_name, docker_context], check=True) + subprocess.run(_docker_build_cmd("-t", image_name, docker_context), check=True) else: # Agent container @@ -207,7 +376,7 @@ def cmd_build(args): image_name = f"ventis-{agent_name.lower()}" logger.info("Building Docker image: %s", image_name) - subprocess.run(["docker", "build", "-t", image_name, docker_context], check=True) + subprocess.run(_docker_build_cmd("-t", image_name, docker_context), check=True) logger.info("Build complete.") @@ -229,12 +398,19 @@ def cmd_deploy(args): logger.error("Config file not found: %s", config_path) sys.exit(1) + config = _load_config(config_path) + project_dir = os.getcwd() + # Ensure imports resolve package_dir = _get_package_dir() repo_root = os.path.dirname(package_dir) sys.path.insert(0, repo_root) # Add the project's grpc_stubs to path so global controller can find them - sys.path.insert(0, os.path.join(os.getcwd(), "grpc_stubs")) + sys.path.insert(0, os.path.join(project_dir, "grpc_stubs")) + + if _running_on_ec2(): + _preflight_ec2_deploy(config, project_dir) + _log_ec2_translation("deploy", config_path, config) from ventis.controller.global_controller import GlobalController @@ -251,7 +427,7 @@ def _signal_handler(sig, frame): atexit.register(controller.cleanup) logger.info("Deploying from config: %s", config_path) - controller.launch_docker_agents() + controller.launch_agents() controller._wait_for_healthy() controller.run() diff --git a/ventis/controller/agent_spec_loader.py b/ventis/controller/agent_spec_loader.py new file mode 100644 index 0000000..f3b0fa9 --- /dev/null +++ b/ventis/controller/agent_spec_loader.py @@ -0,0 +1,24 @@ +import json + +import yaml + + +def write_agent_specs(config_path, redis_client): + """Read global_controller.yaml and write agent specs to Redis.""" + with open(config_path, "r") as f: + config = yaml.safe_load(f) or {} + + for agent in config.get("agents", []): + name = agent["name"] + spec_class = "WorkflowSpec" if agent.get("type") == "workflow" else "AgentSpec" + redis_client.hset_multiple( + f"agent:{name}:", + { + "class": spec_class, + "resources": json.dumps(agent.get("resources", {})), + "replicas": json.dumps(agent.get("replicas", 1)), + "stateful": json.dumps(agent.get("stateful", False)), + "redis_port": str(agent.get("redis_port", 6379)), + "provider": agent.get("provider", "local"), + }, + ) diff --git a/ventis/controller/cloud_provider_logic/EC2/__init__.py b/ventis/controller/cloud_provider_logic/EC2/__init__.py new file mode 100644 index 0000000..e39dd19 --- /dev/null +++ b/ventis/controller/cloud_provider_logic/EC2/__init__.py @@ -0,0 +1,16 @@ +"""EC2 runtime package.""" + + +__all__ = [ + "resolve_replica_placements", + "validate_config", + "create_instance", + "_wait_for_instance_ready", + "_get_instance_host", + "_bootstrap_instance", + "_check_controller_health", + "_build_instance_record", + "bootstrap_instance", + "terminate_instance", + "provision_instance", +] diff --git a/ventis/controller/cloud_provider_logic/EC2/_runtime.py b/ventis/controller/cloud_provider_logic/EC2/_runtime.py new file mode 100644 index 0000000..75a01a2 --- /dev/null +++ b/ventis/controller/cloud_provider_logic/EC2/_runtime.py @@ -0,0 +1,338 @@ +import logging +import socket +import time + +import boto3 +from botocore.exceptions import ClientError + +logger = logging.getLogger(__name__) + +CONTAINER_PORT = 50051 +_controller = None +DEFAULT_REMOTE_PROJECT_DIR = "/opt/ventis/project" +DEFAULT_AGENT_IMAGE = "ventis-agent-base" + + +def _set_controller(controller): + global _controller + _controller = controller + + +def _require_controller(): + if _controller is None: + raise RuntimeError("EC2 runtime controller is not configured.") + return _controller + + +def _ec2_config(): + return _require_controller().config.get("ec2", {}) + + +def _redis_config(): + return _require_controller().config.get("redis", {}) + + +def _session(): + cfg = _ec2_config() + kwargs = {"region_name": cfg.get("region")} + if cfg.get("profile"): + kwargs["profile_name"] = cfg["profile"] + if cfg.get("aws_access_key_id"): + kwargs["aws_access_key_id"] = cfg["aws_access_key_id"] + if cfg.get("aws_secret_access_key"): + kwargs["aws_secret_access_key"] = cfg["aws_secret_access_key"] + if cfg.get("aws_session_token"): + kwargs["aws_session_token"] = cfg["aws_session_token"] + return boto3.Session(**kwargs) + + +def _ec2_client(): + return _session().client("ec2", region_name=_ec2_config()["region"]) + + +def _instance_id_from_runtime_id(runtime_id): + if "--" not in runtime_id: + raise ValueError(f"Invalid EC2 runtime id: {runtime_id}") + return runtime_id.rsplit("--", 1)[1] + + +def _describe_instance(instance_id): + response = _ec2_client().describe_instances(InstanceIds=[instance_id]) + for reservation in response.get("Reservations", []): + for instance in reservation.get("Instances", []): + if instance.get("InstanceId") == instance_id: + return instance + return None + + +def _preferred_instance_host(instance): + if not instance: + return None + return instance.get("PrivateIpAddress") or instance.get("PublicIpAddress") + + +def _ssh_instance_host(instance): + if not instance: + return None + return instance.get("PublicIpAddress") or instance.get("PrivateIpAddress") + + +def _instance_name(agent_name, replica_index): + return f"ventis-ec2-{agent_name.lower()}-{replica_index}" + + +def _runtime_id(agent_name, replica_index, instance_id): + return f"ventis-ec2-{agent_name.lower()}-{replica_index}--{instance_id}" + +def validate_config(): + cfg = _ec2_config() + required = [ + "ami_id", + "instance_type", + "subnet_id", + "security_group_ids", + "ssh_user", + "region", + ] + missing = [field for field in required if not cfg.get(field)] + if missing: + raise ValueError(f"Missing EC2 config: {', '.join(sorted(missing))}") + if not isinstance(cfg["security_group_ids"], list) or not all(cfg["security_group_ids"]): + raise ValueError("EC2 security_group_ids must be a non-empty list.") + + session = _session() + if not session.region_name: + raise ValueError("EC2 region must be configured.") + + credentials = session.get_credentials() + if credentials is None: + raise ValueError("AWS credentials are not available for the EC2 runtime.") + + _ec2_client() + return cfg + + +def create_instance(spec, replica_index): + provisioned = provision_instance(spec, replica_index) + return bootstrap_instance(provisioned, spec, replica_index) + + +def provision_instance(spec, replica_index): + cfg = validate_config() + client = _ec2_client() + agent_name = spec["name"] + tags = [ + {"Key": "Name", "Value": f"ventis-{agent_name}-{replica_index}"}, + {"Key": "VentisManaged", "Value": "true"}, + {"Key": "VentisProvider", "Value": "EC2"}, + {"Key": "VentisAgent", "Value": agent_name}, + {"Key": "VentisReplica", "Value": str(replica_index)}, + ] + request = { + "ImageId": cfg["ami_id"], + "InstanceType": cfg["instance_type"], + "SubnetId": cfg["subnet_id"], + "SecurityGroupIds": cfg["security_group_ids"], + "MinCount": 1, + "MaxCount": 1, + "TagSpecifications": [ + {"ResourceType": "instance", "Tags": tags + [{"Key": "CreatedBy", "Value": "EC2 Fast Launch"}]}, + {"ResourceType": "volume", "Tags": [{"Key": "CreatedBy", "Value": "EC2 Fast Launch"}]}, + ], + } + if cfg.get("key_name"): + request["KeyName"] = cfg["key_name"] + + try: + response = client.run_instances(**request) + except ClientError as exc: + error = exc.response.get("Error", {}) + if error.get("Code") == "UnauthorizedOperation": + raise RuntimeError( + "EC2 launch failed: controller IAM role is missing ec2:RunInstances " + "(and likely related EC2 permissions such as DescribeInstances and CreateTags)." + ) from exc + raise + instance_id = response["Instances"][0]["InstanceId"] + runtime_id = _runtime_id(agent_name, replica_index, instance_id) + instance = _wait_for_instance_ready(runtime_id) + host = _preferred_instance_host(instance) + ssh_host = _ssh_instance_host(instance) + if not host: + raise RuntimeError(f"EC2 instance {instance_id} does not have a reachable IP address.") + if not ssh_host: + raise RuntimeError(f"EC2 instance {instance_id} does not have an SSH-reachable IP address.") + redis_cfg = _redis_config() + return { + "host": host, + "ssh_host": ssh_host, + "runtime_id": runtime_id, + "ec2_instance_id": instance_id, + "user": cfg["ssh_user"], + "redis_port": spec.get("redis_port", redis_cfg.get("port", 6379)), + } + + +def bootstrap_instance(provisioned, spec, replica_index, redis_host=None, redis_port=None): + host = provisioned["host"] + ssh_host = provisioned.get("ssh_host", host) + runtime_id = provisioned["runtime_id"] + try: + _bootstrap_instance( + host, + spec, + replica_index, + ssh_host=ssh_host, + redis_host=redis_host, + redis_port=redis_port, + ) + endpoint = f"{host}:{CONTAINER_PORT}" + _check_controller_health(endpoint) + return _build_instance_record( + spec, + replica_index, + host, + runtime_id, + redis_host=redis_host, + redis_port=redis_port, + ec2_instance_id=provisioned.get("ec2_instance_id"), + ) + except Exception: + logger.exception("EC2 runtime bootstrap failed for %s", runtime_id) + try: + terminate_instance(runtime_id) + except Exception: + logger.warning("Leaving failed EC2 instance %s running for manual cleanup.", runtime_id) + raise + + +def _wait_for_instance_ready(runtime_id): + instance_id = _instance_id_from_runtime_id(runtime_id) + client = _ec2_client() + client.get_waiter("instance_running").wait(InstanceIds=[instance_id]) + deadline = time.time() + _ec2_config().get("public_ip_timeout", 120) + while time.time() < deadline: + instance = _describe_instance(instance_id) + if _preferred_instance_host(instance) and _ssh_instance_host(instance): + return instance + time.sleep(2) + raise TimeoutError(f"EC2 instance {instance_id} never received usable network addresses.") + + +def _get_instance_host(runtime_id): + instance_id = _instance_id_from_runtime_id(runtime_id) + instance = _describe_instance(instance_id) + host = _preferred_instance_host(instance) + if not host: + raise RuntimeError(f"EC2 instance {instance_id} does not have a usable runtime IP.") + return host + + +def _bootstrap_instance(host, spec, replica_index, ssh_host=None, redis_host=None, redis_port=None): + cfg = _ec2_config() + controller = _require_controller() + agent_name = spec["name"] + ctrl_type = spec.get("type", "agent") + entrypoint = spec.get("entrypoint") + use_generic_agent_image = ctrl_type != "workflow" and bool(entrypoint) + image = cfg.get("agent_image", DEFAULT_AGENT_IMAGE) if use_generic_agent_image else f"ventis-{agent_name.lower()}" + remote_project_dir = cfg.get("remote_project_dir", DEFAULT_REMOTE_PROJECT_DIR) + redis_cfg = _redis_config() + ssh_host = ssh_host or host + redis_host = redis_host or redis_cfg.get("host", "localhost") + redis_port = redis_port or spec.get("redis_port", redis_cfg.get("port", 6379)) + + prep_result = controller._ensure_remote_docker(ssh_host, cfg["ssh_user"]) + if prep_result.returncode != 0: + raise RuntimeError(f"Failed to prepare Docker on {ssh_host}: {prep_result.stderr.strip()}") + if use_generic_agent_image: + controller._sync_project_to_host(ssh_host, cfg["ssh_user"], remote_project_dir) + controller._ensure_image_on_host(image, ssh_host, cfg["ssh_user"]) + + cmd = [ + "docker", + "run", + "-d", + "-it", + "--restart", + "unless-stopped", + "--add-host=host.docker.internal:host-gateway", + "--name", + _instance_name(agent_name, replica_index), + "-p", + f"{CONTAINER_PORT}:{CONTAINER_PORT}", + "-e", + f"VENTIS_REDIS_HOST={redis_host}", + "-e", + f"VENTIS_REDIS_PORT={redis_port}", + "-e", + f"VENTIS_AGENT_HOST={host}", + "-e", + f"VENTIS_AGENT_PORT={CONTAINER_PORT}", + ] + if use_generic_agent_image: + cmd.extend([ + "-w", + "/workspace", + "-v", + f"{remote_project_dir}:/workspace", + "-e", + f"VENTIS_AGENT_NAME={agent_name}", + "-e", + f"VENTIS_AGENT_FILE={entrypoint}", + ]) + cmd.append(image) + result = controller._run_cmd(cmd, ssh_host, cfg["ssh_user"]) + if result.returncode != 0: + raise RuntimeError( + f"Failed to launch EC2 runtime container for {agent_name} on {ssh_host}: {result.stderr.strip()}" + ) + return result + + +def _check_controller_health(endpoint): + host, port = endpoint.split(":") + deadline = time.time() + _ec2_config().get("controller_health_timeout", 180) + while time.time() < deadline: + try: + with socket.create_connection((host, int(port)), timeout=2): + return True + except OSError: + time.sleep(2) + raise TimeoutError(f"LocalController never became reachable at {endpoint}.") + + +def _build_instance_record( + spec, + replica_index, + host, + runtime_id, + redis_host=None, + redis_port=None, + ec2_instance_id=None, +): + redis_cfg = _redis_config() + redis_host = redis_host or redis_cfg.get("host", "localhost") + redis_port = redis_port or spec.get("redis_port", redis_cfg.get("port", 6379)) + return { + "agent_name": spec["name"], + "provider": "EC2", + "replica_index": str(replica_index), + "host": host, + "host_port": str(CONTAINER_PORT), + "container_port": str(CONTAINER_PORT), + "endpoint": f"{host}:{CONTAINER_PORT}", + "redis_host": redis_host, + "redis_port": str(redis_port), + "runtime_id": runtime_id, + "ec2_instance_id": ec2_instance_id or _instance_id_from_runtime_id(runtime_id), + } + + +def terminate_instance(runtime_id): + instance_id = _instance_id_from_runtime_id(runtime_id) + client = _ec2_client() + try: + client.terminate_instances(InstanceIds=[instance_id]) + except Exception: + logger.exception("Failed to terminate EC2 instance %s", instance_id) diff --git a/ventis/controller/global_controller.py b/ventis/controller/global_controller.py index 58655c1..c5eedf3 100644 --- a/ventis/controller/global_controller.py +++ b/ventis/controller/global_controller.py @@ -5,6 +5,7 @@ import atexit import logging import signal +import shlex import subprocess import threading import time @@ -14,6 +15,8 @@ import yaml +from ventis.controller.agent_spec_loader import write_agent_specs +from ventis.controller.runtime_manager import RuntimeManager from ventis.utils.redis_client import RedisClient # Add generated grpc_stubs from the local project to the path @@ -22,13 +25,18 @@ import local_controler_pb2_grpc import grpc -print(f"DEBUG: Loading gRPC stubs from: {local_controler_pb2_grpc.__file__}") -print(f"DEBUG: LocalControllerStub attributes: {[a for a in dir(local_controler_pb2_grpc.LocalControllerStub) if not a.startswith('_')]}") - logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +def _is_local_host(host): + return host in {"localhost", "127.0.0.1"} + + +def _container_routing_host(host): + return "host.docker.internal" if _is_local_host(host) else host + + class GlobalController(object): """ Daemon that manages a routing table across multiple local controller instances. @@ -60,27 +68,25 @@ def __init__(self, config_path): self.cleanup_interval = self.config.get("cleanup_interval", 10) self.controllers = self.config.get("agents", []) self.running = False - self.processes = {} # name -> [Popen, ...] self.containers = {} # name -> [container_name, ...] self.redis_containers = {} # host -> container_name self.node_redis = {} # host -> RedisClient self._last_status = {} # (host, port) -> last known status self._lc_stubs = {} # endpoint -> gRPC stub self._shipped_images = set() # (image, host) already shipped this session - - # Registry config: if set, use push/pull; otherwise fall back to SSH pipe - registry_cfg = self.config.get("registry", {}) - self.registry_url = registry_cfg.get("url") # e.g. "myregistry.example.com:5000" - self.registry_user = registry_cfg.get("user") + self._synced_projects = set() # (host, remote_dir) synced this session + self.runtime_manager = RuntimeManager(self) # Clean up any stale containers from previous runs self._cleanup_stale_containers() - # Launch Redis on each unique node, then write routing table and policies + # Launch Redis on each unique local node, then write central specs and + # publish routing/policy snapshots to host Redis instances. self._launch_redis_containers() - self._build_routing_table() + write_agent_specs(self.config_path, self.redis) self._write_resource_specs() self._load_and_write_policies() + self.runtime_manager.publish_routing_snapshot(self.controllers) logger.info("Global controller initialized with %d controller(s).", len(self.controllers)) # Start background cleanup thread @@ -92,26 +98,21 @@ def __init__(self, config_path): # ------------------------------------------------------------------ # def _cleanup_stale_containers(self): - """Remove any containers from previous runs before launching new ones.""" - logger.info("Checking for stale containers from previous runs...") + """Remove only Redis containers from previous runs. + + ponytail: agent containers are now reused by RuntimeManager, so startup + cleanup must not delete them preemptively. + """ + logger.info("Checking for stale Redis containers from previous runs...") - # Collect all expected container names and the hosts they run on - # { host: (user, [container_names]) } host_containers = {} for ctrl in self.controllers: user = ctrl.get("user") - placements = self._get_replica_placements(ctrl) - name = ctrl["name"] - - for i, (host, port) in enumerate(placements): + for host in self.runtime_manager.list_runtime_nodes([ctrl]): if host not in host_containers: host_containers[host] = (user, set()) - - # Redis container for this host host_containers[host][1].add(f"ventis-redis-{host.replace('.', '-')}") - # Agent container - host_containers[host][1].add(f"ventis-{name.lower()}-{i}") # Try to remove each one on its respective host for host, (user, container_names) in host_containers.items(): @@ -123,7 +124,7 @@ def _cleanup_stale_containers(self): except Exception: pass # Container didn't exist, that's fine - logger.info("Stale container cleanup complete.") + logger.info("Stale Redis container cleanup complete.") # ------------------------------------------------------------------ # # Config # @@ -136,116 +137,32 @@ def _load_config(config_path): return yaml.safe_load(f) def reload_config(self): - """Reload the config file and rebuild the routing table.""" + """Reload the config file and refresh routing metadata.""" logger.info("Reloading config from %s", self.config_path) self.config = self._load_config(self.config_path) self.controllers = self.config.get("agents", []) self.poll_interval = self.config.get("poll_interval", 5) - self._build_routing_table() - - # ------------------------------------------------------------------ # - # Routing table # - # ------------------------------------------------------------------ # - - @staticmethod - def _get_replica_placements(ctrl): - """Normalize the ``replicas`` field into a list of (host, port) tuples. - - Supports two formats in the YAML config: - - 1. **Integer shorthand** — ``replicas: 3`` - Uses the agent's ``host`` and sequential ports starting at ``port``. - - 2. **Explicit list** — each entry specifies its own ``host`` and ``port``: - :: - - replicas: - - host: node1 - port: 8051 - - host: node2 - port: 8052 - """ - replicas = ctrl.get("replicas", 1) - default_host = ctrl.get("host", "localhost") - base_port = ctrl.get("port", 50051) - - if isinstance(replicas, int): - return [(default_host, base_port + i) for i in range(replicas)] - elif isinstance(replicas, list): - return [ - (r.get("host", default_host), r.get("port", base_port)) - for r in replicas - ] - else: - return [(default_host, base_port)] - - def _build_routing_table(self): - """Write the routing table to Redis on every node. - - For each agent, stores a JSON list of all replica endpoints under - ``routing_table:endpoints``. Agents marked ``stateful: true`` are - recorded in ``routing_table:stateful`` so local controllers can - enforce session affinity. - """ - endpoints_table = {} # name → JSON list of endpoints - stateful_table = {} # name → "true" (only for stateful agents) - - for ctrl in self.controllers: - name = ctrl["name"] - stateful = ctrl.get("stateful", False) - placements = self._get_replica_placements(ctrl) - - endpoints = [] - for host, port in placements: - # Use host.docker.internal for localhost so Docker containers - # can reach each other through the host's port mappings. - rt_host = "host.docker.internal" if host in ("localhost", "127.0.0.1") else host - endpoints.append(f"{rt_host}:{port}") - - endpoints_table[name] = json.dumps(endpoints) - if stateful: - stateful_table[name] = "true" - - # Write to every node's Redis so each local controller can look up - # the full routing table from its own Redis instance. - targets = list(self.node_redis.values()) if self.node_redis else [self.redis] - for redis_client in targets: - if endpoints_table: - redis_client.hset_multiple(self.ROUTING_ENDPOINTS_KEY, endpoints_table) - if stateful_table: - redis_client.hset_multiple(self.ROUTING_STATEFUL_KEY, stateful_table) - - existing = redis_client.smembers(self.SERVICES_SET_KEY) - for stale in existing - set(endpoints_table.keys()): - redis_client.srem(self.SERVICES_SET_KEY, stale) - for name in endpoints_table.keys(): - redis_client.sadd(self.SERVICES_SET_KEY, name) - - logger.info("Routing table written to %d Redis instance(s): %s", - len(targets), endpoints_table) - self._on_routing_table_updated(endpoints_table) + self.runtime_manager.publish_routing_snapshot(self.controllers) def _write_resource_specs(self): """Write the per-agent resource specs to Redis.""" for ctrl in self.controllers: name = ctrl["name"] resources = ctrl.get("resources", {}) - placements = self._get_replica_placements(ctrl) - self.redis.hset_multiple(f"agent:{name}:resources", { "cpu": str(resources.get("cpu", 1)), "memory": str(resources.get("memory", 512)), - "replicas": str(len(placements)), + "replicas": str(int(ctrl.get("replicas", 1))), }) - def _load_and_write_policies(self): - """Load policy rules from config/policy.yaml and write to all Redis instances.""" + def _load_policy_rules(self): + """Load policy rules from config/policy.yaml.""" config_dir = os.path.dirname(os.path.abspath(self.config_path)) policy_path = os.path.join(config_dir, "policy.yaml") if not os.path.isfile(policy_path): logger.info("No policy file found at %s, skipping policy setup.", policy_path) - return + return [] with open(policy_path, "r") as f: policy_config = yaml.safe_load(f) @@ -255,23 +172,18 @@ def _load_and_write_policies(self): # Sort rules by specificity: most match keys first # This way the local controller can iterate and use the first matching rule. rules.sort(key=lambda r: len(r.get("match", {})), reverse=True) + return rules - rules_json = json.dumps(rules) - - # Write to every node's Redis - targets = list(self.node_redis.values()) if self.node_redis else [self.redis] - for redis_client in targets: - redis_client.set(self.POLICY_RULES_KEY, rules_json) - - logger.info("Policy rules written to %d Redis instance(s): %d rule(s)", len(targets), len(rules)) + def _load_and_write_policies(self): + """Load policy rules and publish them to every host Redis.""" + rules = self._load_policy_rules() + target_count = self.runtime_manager.publish_policy_rules(rules) - def get_routing_table(self): - """Read the current routing table from Redis.""" - return self.redis.hgetall(self.ROUTING_TABLE_KEY) + logger.info("Policy rules written to %d Redis instance(s): %d rule(s)", target_count, len(rules)) - def get_endpoint(self, service_name): - """Look up the endpoint for a given service.""" - return self.redis.hget(self.ROUTING_TABLE_KEY, service_name) + # Routing reads are direct Redis calls now that RuntimeManager owns publication: + # - self.redis.hgetall(self.ROUTING_ENDPOINTS_KEY) + # - self.redis.hget(self.ROUTING_ENDPOINTS_KEY, service_name) def get_node_redis(self, host): """Get the RedisClient for a specific node.""" @@ -289,76 +201,86 @@ def _launch_redis_containers(self): redis:alpine container per host. Creates a RedisClient instance for each node so the global controller can query any node's Redis. """ - # Collect unique nodes from all replica placements - nodes = {} - for ctrl in self.controllers: - user = ctrl.get("user") - redis_port = ctrl.get("redis_port", 6379) - for host, _port in self._get_replica_placements(ctrl): - if host not in nodes: - nodes[host] = { - "user": user, - "redis_port": redis_port, - } + nodes = self.runtime_manager.list_runtime_nodes() for host, node_cfg in nodes.items(): redis_port = node_cfg["redis_port"] user = node_cfg["user"] - container_name = f"ventis-redis-{host.replace('.', '-')}" + self.ensure_host_redis(host, user, redis_port) - cmd = [ - "docker", "run", "-d", - "--name", container_name, - "-p", f"{redis_port}:6379", - "redis:alpine", - ] - - try: - result = self._run_cmd(cmd, host, user) - if result.returncode == 0: - self.redis_containers[host] = container_name - logger.info( - "Launched Redis container %s on %s:%d", - container_name, host, redis_port, - ) - else: - logger.critical( - "Failed to launch Redis on %s: %s", - host, result.stderr.strip(), - ) - sys.exit(1) - except FileNotFoundError: - logger.critical("Docker is not installed or not in PATH. Cannot launch Redis.") - sys.exit(1) - except Exception as e: - logger.critical("Failed to launch Redis on %s: %s", host, e) - sys.exit(1) + logger.info("Redis launched on %d node(s).", len(self.redis_containers)) - # Create a RedisClient for this node - # For localhost, connect directly; for remote, connect via host IP - connect_host = "localhost" if host in ("localhost", "127.0.0.1") else host - self.node_redis[host] = RedisClient( - host=connect_host, port=redis_port, + def ensure_host_redis(self, host, user=None, redis_port=6379, ssh_host=None): + """Launch/register the Redis container used by controllers on one host.""" + if host in self.node_redis: + return self.node_redis[host] + + ssh_host = ssh_host or host + prep_result = self._ensure_remote_docker(ssh_host, user) + if prep_result.returncode != 0: + logger.critical( + "Failed to prepare Docker on %s: %s", + ssh_host, prep_result.stderr.strip(), ) + sys.exit(1) - # Update the primary redis client to the local node's Redis - if "localhost" in self.node_redis: - self.redis = self.node_redis["localhost"] + container_name = f"ventis-redis-{host.replace('.', '-')}" + cmd = [ + "docker", "run", "-d", + "--name", container_name, + "-p", f"{redis_port}:6379", + "redis:alpine", + ] - logger.info("Redis launched on %d node(s).", len(self.redis_containers)) + try: + result = self._run_cmd(cmd, ssh_host, user) + if result.returncode != 0: + logger.critical( + "Failed to launch Redis on %s: %s", + ssh_host, result.stderr.strip(), + ) + sys.exit(1) + except FileNotFoundError: + logger.critical("Docker is not installed or not in PATH. Cannot launch Redis.") + sys.exit(1) + except Exception as e: + logger.critical("Failed to launch Redis on %s: %s", ssh_host, e) + sys.exit(1) + + self.redis_containers[host] = container_name + connect_host = "localhost" if _is_local_host(host) else host + node_redis = RedisClient(host=connect_host, port=redis_port) + self._wait_for_redis(node_redis, connect_host, redis_port) + self.node_redis[host] = node_redis + publish_policy_rules = getattr(self.runtime_manager, "publish_policy_rules", None) + if publish_policy_rules: + publish_policy_rules(self._load_policy_rules()) + logger.info("Launched Redis container %s on %s:%d", container_name, host, redis_port) + return self.node_redis[host] + + def _wait_for_redis(self, redis_client, host, port, timeout=30, interval=1): + """Wait until Redis accepts commands, surfacing network issues clearly.""" + deadline = time.time() + timeout + last_error = None + while time.time() < deadline: + try: + redis_client.set("__ventis_redis_healthcheck__", "ok") + return + except Exception as exc: + last_error = exc + time.sleep(interval) + + raise TimeoutError( + f"Timed out connecting to Redis at {host}:{port}. " + "For EC2 runtimes, ensure the instance security group allows inbound " + f"TCP {port} from the global controller host." + ) from last_error def _stop_redis_containers(self): """Stop and remove all launched Redis containers.""" + nodes = self.runtime_manager.list_runtime_nodes() for host, container_name in self.redis_containers.items(): - # Find the SSH user for this host by checking all replica placements - user = None - for ctrl in self.controllers: - for rhost, _rport in self._get_replica_placements(ctrl): - if rhost == host: - user = ctrl.get("user") - break - if user is not None: - break + user = nodes.get(host, {}).get("user") try: self._run_cmd(["docker", "stop", container_name], host, user) self._run_cmd(["docker", "rm", container_name], host, user) @@ -379,7 +301,7 @@ def _get_node_redis_for(self, host): def _agent_host_key(self, host): """Return the host string as seen by Docker containers (for status key matching).""" - return "host.docker.internal" if host in ("localhost", "127.0.0.1") else host + return _container_routing_host(host) def _wait_for_healthy(self, timeout=30, interval=2): """ @@ -390,12 +312,10 @@ def _wait_for_healthy(self, timeout=30, interval=2): interval: Seconds between checks. """ deadline = time.time() + timeout - # Build a list of every individual replica to check - pending = [] - for c in self.controllers: - name = c["name"] - for host, port in self._get_replica_placements(c): - pending.append((name, host, port)) + pending = [ + (instance["agent_name"], instance["host"], instance["host_port"]) + for instance in self.runtime_manager.list_instances() + ] logger.info("Waiting for %d replica(s) to become healthy (timeout=%ds)...", len(pending), timeout) @@ -441,33 +361,34 @@ def run(self): def _poll_controllers(self): """Check the health of each registered controller replica via its node's Redis.""" - for ctrl in self.controllers: - name = ctrl["name"] - for host, port in self._get_replica_placements(ctrl): - node_redis = self._get_node_redis_for(host) - agent_host = self._agent_host_key(host) - status_key = f"controller:{agent_host}:{port}:status" - - status = node_redis.get(status_key) or "unknown" - prev = self._last_status.get((host, port)) - - if status != prev: - if status == "healthy": - logger.info("Controller %s (%s:%s) is now healthy.", name, host, port) - self._on_controller_healthy(name, host, port) - else: - logger.warning( - "Controller %s (%s:%s) status changed: %s -> %s", - name, host, port, prev or "(none)", status, - ) - self._on_controller_unhealthy(name, host, port) - self._last_status[(host, port)] = status + for instance in self.runtime_manager.list_instances(): + name = instance["agent_name"] + host = instance["host"] + port = instance["host_port"] + node_redis = self._get_node_redis_for(host) + agent_host = self._agent_host_key(host) + status_key = f"controller:{agent_host}:{port}:status" + + status = node_redis.get(status_key) or "unknown" + prev = self._last_status.get((host, port)) + + if status != prev: + if status == "healthy": + logger.info("Controller %s (%s:%s) is now healthy.", name, host, port) + self._on_controller_healthy(name, host, port) + else: + logger.warning( + "Controller %s (%s:%s) status changed: %s -> %s", + name, host, port, prev or "(none)", status, + ) + self._on_controller_unhealthy(name, host, port) + self._last_status[(host, port)] = status + else: + # No change — healthy stays quiet, unhealthy stays quiet too + if status == "healthy": + self._on_controller_healthy(name, host, port) else: - # No change — healthy stays quiet, unhealthy stays quiet too - if status == "healthy": - self._on_controller_healthy(name, host, port) - else: - self._on_controller_unhealthy(name, host, port) + self._on_controller_unhealthy(name, host, port) # ------------------------------------------------------------------ # # Extensibility hooks — override in subclasses # @@ -513,115 +434,21 @@ def _trigger_cleanup(self): for request_id in completed: logger.info("Triggering cleanup for completed request %s", request_id) - for ctrl in self.controllers: - for host, port in self._get_replica_placements(ctrl): - endpoint = f"{host}:{port}" - try: - stub = self._get_lc_stub(endpoint) - payload = json.dumps({"request_id": request_id}) - stub.Cleanup(local_controler_pb2.JsonResponse(resonse=payload)) - logger.debug("Sent Cleanup for request %s to %s", request_id, endpoint) - except Exception as e: - logger.warning("Failed to trigger cleanup on %s: %s", endpoint, e) + for instance in self.runtime_manager.list_instances(): + endpoint = instance["endpoint"] + try: + stub = self._get_lc_stub(endpoint) + payload = json.dumps({"request_id": request_id}) + stub.Cleanup(local_controler_pb2.JsonResponse(resonse=payload)) + logger.debug("Sent Cleanup for request %s to %s", request_id, endpoint) + except Exception as e: + logger.warning("Failed to trigger cleanup on %s: %s", endpoint, e) # Remove from completed set after broadcast self.redis.srem("request:completed", request_id) # ------------------------------------------------------------------ # - # Agent launching # - # ------------------------------------------------------------------ # - def launch_agents(self): - """ - Launch all agents defined in the config. - - For each controller entry, spawn `replicas` number of subprocesses - using the configured entrypoint script. Each replica gets assigned - a port starting from the controller's base port. - """ - project_root = os.path.abspath( - os.path.join(os.path.dirname(__file__), "..", "..") - ) - - for ctrl in self.controllers: - name = ctrl["name"] - placements = self._get_replica_placements(ctrl) - entrypoint = ctrl.get("entrypoint") - - if not entrypoint: - logger.warning("No entrypoint for %s, skipping launch.", name) - continue - - entrypoint_path = os.path.join(project_root, entrypoint) - if not os.path.isfile(entrypoint_path): - logger.error("Entrypoint not found: %s", entrypoint_path) - continue - - self.processes[name] = [] - for host, port in placements: - proc = self._launch_single_agent(name, entrypoint_path, port, ctrl, host) - if proc: - self.processes[name].append(proc) - - total = sum(len(procs) for procs in self.processes.values()) - logger.info("Launched %d agent process(es) across %d service(s).", - total, len(self.processes)) - - def _launch_single_agent(self, name, entrypoint_path, port, ctrl, host=None): - """ - Launch a single agent subprocess. - - Args: - name: Service/agent name. - entrypoint_path: Absolute path to the agent script. - port: Port number for this instance. - ctrl: Full controller config dict. - - Returns: - The Popen object, or None on failure. - """ - resources = ctrl.get("resources", {}) - env = os.environ.copy() - env["VENTIS_AGENT_NAME"] = name - env["VENTIS_AGENT_PORT"] = str(port) - env["VENTIS_AGENT_CPU"] = str(resources.get("cpu", 1)) - env["VENTIS_AGENT_MEMORY"] = str(resources.get("memory", 512)) - - try: - proc = subprocess.Popen( - [sys.executable, entrypoint_path, "--port", str(port)], - env=env, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - logger.info("Launched %s (pid=%d) on port %d", name, proc.pid, port) - - # Record status in Redis - host = host or ctrl.get("host", "localhost") - self.redis.set(f"controller:{host}:{port}:status", "healthy") - self.redis.set(f"controller:{host}:{port}:pid", str(proc.pid)) - - return proc - except Exception as e: - logger.error("Failed to launch %s on port %d: %s", name, port, e) - return None - - def _stop_agents(self): - """Terminate all launched agent subprocesses.""" - for name, procs in self.processes.items(): - for proc in procs: - if proc.poll() is None: # still running - logger.info("Terminating %s (pid=%d)", name, proc.pid) - proc.terminate() - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - logger.warning("Killing %s (pid=%d)", name, proc.pid) - proc.kill() - self.processes.clear() - logger.info("All agent processes stopped.") - - # ------------------------------------------------------------------ # - # Docker launching # + # Runtime launching # # ------------------------------------------------------------------ # def _run_cmd(self, cmd, host, user=None): @@ -636,79 +463,192 @@ def _run_cmd(self, cmd, host, user=None): Returns: subprocess.CompletedProcess """ - is_local = host in ("localhost", "127.0.0.1") + is_local = _is_local_host(host) if is_local: return subprocess.run(cmd, capture_output=True, text=True) else: - ssh_target = f"{user}@{host}" if user else host + ssh_target = self._ssh_target(host, user) remote_cmd = " ".join(cmd) + if cmd and cmd[0] == "docker": + remote_cmd = f"sudo {remote_cmd}" return subprocess.run( - ["ssh", ssh_target, remote_cmd], + [*self._ssh_base_cmd(), ssh_target, remote_cmd], capture_output=True, text=True, ) + def _ssh_target(self, host, user=None): + """Build the SSH target string for a remote host.""" + return f"{user}@{host}" if user else host + + def _ssh_base_cmd(self): + """Return the shared SSH command prefix for remote EC2 operations.""" + ec2_cfg = getattr(self, "config", {}).get("ec2", {}) + cmd = [ + "ssh", + "-o", + "StrictHostKeyChecking=no", + "-o", + "ConnectTimeout=10", + ] + ssh_key_path = ec2_cfg.get("ssh_private_key_path") + if ssh_key_path: + cmd.extend(["-i", ssh_key_path]) + return cmd + + def _run_remote_script(self, host, script, user=None): + """Run a shell script on a remote host over SSH.""" + if _is_local_host(host): + return subprocess.run(["bash", "-lc", script], capture_output=True, text=True) + return subprocess.run( + [*self._ssh_base_cmd(), self._ssh_target(host, user), "bash", "-lc", shlex.quote(script)], + capture_output=True, + text=True, + ) + + def _ensure_remote_docker(self, host, user=None): + """Install and start Docker on a remote host if needed.""" + if _is_local_host(host): + return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="") + wait_result = self._wait_for_remote_ssh(host, user) + if wait_result.returncode != 0: + return wait_result + script = """ +set -e +if ! command -v docker >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y docker.io + elif command -v dnf >/dev/null 2>&1; then + sudo dnf install -y docker + elif command -v yum >/dev/null 2>&1; then + sudo yum install -y docker + else + echo unsupported-package-manager >&2 + exit 1 + fi +fi +sudo systemctl enable --now docker || sudo service docker start +""".strip() + return self._run_remote_script(host, script, user) + + def _wait_for_remote_ssh(self, host, user=None, timeout=120, interval=2): + """Wait until a remote host accepts SSH connections.""" + if _is_local_host(host): + return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="") + + deadline = time.time() + timeout + last_result = None + while time.time() < deadline: + result = subprocess.run( + [*self._ssh_base_cmd(), self._ssh_target(host, user), "true"], + capture_output=True, + text=True, + ) + if result.returncode == 0: + return result + + last_result = result + stderr = (result.stderr or "").lower() + if "permission denied" in stderr: + return result + + time.sleep(interval) + + return last_result or subprocess.CompletedProcess( + args=[], + returncode=1, + stdout="", + stderr=f"Timed out waiting for SSH on {host}", + ) + def _ensure_image_on_host(self, image, host, user): """ Ensure `image` is available on `host` before running a container. - Strategy: - - If a registry is configured: tag + push to registry, then pull on - the remote host via SSH. - - Fallback (no registry): stream the image over SSH using - ``docker save | ssh ... docker load``. - Images are only shipped once per (image, host) pair per session. Does nothing for localhost. """ - if host in ("localhost", "127.0.0.1"): + if _is_local_host(host): return # already on the local Docker engine if (image, host) in self._shipped_images: logger.debug("Image %s already shipped to %s this session, skipping.", image, host) return - if self.registry_url: - self._ship_image_registry(image, host, user) - else: - self._ship_image_ssh(image, host, user) - + self._ship_image_ssh(image, host, user) self._shipped_images.add((image, host)) - def _ship_image_registry(self, image, host, user): + def _sync_project_to_host(self, host, user, remote_dir): """ - Push image to the configured registry on the local host, then - instruct the remote host to pull it. + Mirror the current project directory to a fixed remote path. + + ponytail: this is a full-tree tar sync for MVP simplicity; replace with + rsync or artifact packaging later if transfer size matters. """ - remote_tag = f"{self.registry_url}/{image}:latest" - logger.info("Tagging %s -> %s for registry push...", image, remote_tag) - subprocess.run(["docker", "tag", image, remote_tag], check=True) + if _is_local_host(host): + return - logger.info("Pushing %s to registry %s...", image, self.registry_url) - subprocess.run(["docker", "push", remote_tag], check=True) + sync_key = (host, remote_dir) + if sync_key in self._synced_projects: + logger.debug("Project already synced to %s:%s this session, skipping.", host, remote_dir) + return - logger.info("Pulling %s on %s from registry...", image, host) - result = self._run_cmd(["docker", "pull", remote_tag], host, user) - if result.returncode != 0: + ssh_target = self._ssh_target(host, user) + remote_cmd = ( + f"sudo rm -rf {shlex.quote(remote_dir)} && " + f"sudo mkdir -p {shlex.quote(remote_dir)} && " + f"sudo tar -xzf - -C {shlex.quote(remote_dir)}" + ) + tar_cmd = [ + "tar", + "--exclude=.git", + "--exclude=.omx", + "--exclude=.pytest_cache", + "--exclude=.venv", + "--exclude=__pycache__", + "--exclude=docker_container", + "-czf", + "-", + ".", + ] + + logger.info("Syncing project to %s:%s...", host, remote_dir) + tar_proc = subprocess.Popen( + tar_cmd, + cwd=os.getcwd(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + load_proc = subprocess.Popen( + [*self._ssh_base_cmd(), ssh_target, remote_cmd], + stdin=tar_proc.stdout, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + tar_proc.stdout.close() + _, stderr = load_proc.communicate() + tar_stderr = tar_proc.stderr.read().decode().strip() + tar_proc.stderr.close() + tar_proc.wait() + + if tar_proc.returncode != 0: raise RuntimeError( - f"Failed to pull {remote_tag} on {host}: {result.stderr.strip()}" + f"Failed to create project archive for {host}: {tar_stderr or 'tar failed'}" + ) + if load_proc.returncode != 0: + raise RuntimeError( + f"Failed to sync project to {host}:{remote_dir}: {stderr.decode().strip()}" ) - # Retag to the expected short name on the remote so container launch works - retag_result = self._run_cmd( - ["docker", "tag", remote_tag, f"{image}:latest"], - host, user, - ) - if retag_result.returncode != 0: - logger.warning("Failed to retag %s on %s: %s", remote_tag, host, retag_result.stderr.strip()) - - logger.info("Image %s is ready on %s (via registry).", image, host) + self._synced_projects.add(sync_key) + logger.info("Project synced to %s:%s.", host, remote_dir) def _ship_image_ssh(self, image, host, user): """ Stream image to remote host using `docker save | ssh docker load`. Used as a fallback when no registry is configured. """ - ssh_target = f"{user}@{host}" if user else host + ssh_target = self._ssh_target(host, user) logger.info( "Shipping image %s to %s via SSH pipe (no registry configured)...", image, host, @@ -718,7 +658,7 @@ def _ship_image_ssh(self, image, host, user): stdout=subprocess.PIPE, ) load_proc = subprocess.Popen( - ["ssh", ssh_target, "docker load"], + [*self._ssh_base_cmd(), ssh_target, "sudo docker load"], stdin=save_proc.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE, @@ -733,126 +673,37 @@ def _ship_image_ssh(self, image, host, user): ) logger.info("Image %s shipped to %s successfully.", image, host) - def launch_docker_agents(self): - """ - Launch all agents as Docker containers. - - For each agent in the config, runs `docker run` either locally or - via SSH on the specified host. Spawns `replicas` containers per agent, - each on an incrementing port from the base port. - - Assumes Docker images are pre-built (via `make docker`). - Image name convention: ventis- - """ - for ctrl in self.controllers: - name = ctrl["name"] - default_host = ctrl.get("host", "localhost") - user = ctrl.get("user") - resources = ctrl.get("resources", {}) - ctrl_type = ctrl.get("type", "agent") - placements = self._get_replica_placements(ctrl) - - image = f"ventis-{name.lower()}" - self.containers[name] = [] - - for i, (host, port) in enumerate(placements): - container_name = f"ventis-{name.lower()}-{i}" - - # Containers can't reach host's Redis via "localhost"; - # use host.docker.internal to route to the Docker host. - redis_host_for_container = "host.docker.internal" if host in ("localhost", "127.0.0.1") else host - - cmd = [ - "docker", "run", "-d", "-it", - "--add-host=host.docker.internal:host-gateway", - "--name", container_name, - "-p", f"{port}:50051", - "-e", f"VENTIS_AGENT_PORT={port}", - "-e", f"VENTIS_AGENT_HOST={'host.docker.internal' if host in ('localhost', '127.0.0.1') else host}", - "-e", f"VENTIS_REDIS_HOST={redis_host_for_container}", - "-e", f"VENTIS_REDIS_PORT={ctrl.get('redis_port', 6379)}", - ] - - # Workflow containers also expose the REST API port - if ctrl_type == "workflow": - api_port = ctrl.get("api_port", 8080) - cmd.extend(["-p", f"{api_port}:8080"]) - - # Apply resource limits - cpu = resources.get("cpu") - memory = resources.get("memory") - gpu = resources.get("gpu") - if cpu: - cmd.extend(["--cpus", str(cpu)]) - if memory: - cmd.extend(["--memory", f"{memory}m"]) - if gpu: - # Provide the specific count or identifier requested (e.g., '1', '2', 'all') - cmd.extend(["--gpus", str(gpu)]) - - cmd.append(image) - - try: - # Ensure the image exists on the target host before launching - replica_user = user # TODO: per-replica user support - self._ensure_image_on_host(image, host, replica_user) - result = self._run_cmd(cmd, host, replica_user) - if result.returncode == 0: - container_id = result.stdout.strip()[:12] - self.containers[name].append(container_name) - logger.info( - "Launched container %s (%s) on %s:%d", - container_name, container_id, host, port, - ) - else: - logger.critical( - "Failed to launch %s on %s:%d: %s", - container_name, host, port, result.stderr.strip(), - ) - # Remove the failed container left in "Created" state - self._run_cmd(["docker", "rm", "-f", container_name], host, user) - self._stop_docker_agents() - self._stop_redis_containers() - sys.exit(1) - except FileNotFoundError: - logger.critical("Docker is not installed or not in PATH. Cannot launch agents.") - self._stop_redis_containers() - sys.exit(1) - except Exception as e: - logger.critical( - "Failed to launch %s on %s:%d: %s", - container_name, host, port, e, - ) - self._run_cmd(["docker", "rm", "-f", container_name], host, user) - self._stop_docker_agents() - self._stop_redis_containers() - sys.exit(1) - - total = sum(len(c) for c in self.containers.values()) - logger.info("Launched %d Docker container(s) across %d service(s).", - total, len(self.containers)) + def launch_agents(self): + """Create or reuse agent containers through RuntimeManager.""" + try: + self.containers = {} + instances = self.runtime_manager.ensure_instances(self.controllers) + total = len(instances) + logger.info( + "Ensured %d Docker container(s) across %d service(s).", + total, + len(self.containers), + ) + except FileNotFoundError: + logger.critical("Docker is not installed or not in PATH. Cannot launch agents.") + self._stop_redis_containers() + sys.exit(1) + except Exception: + logger.exception("Failed to ensure agent runtimes") + self._stop_docker_agents() + self._stop_redis_containers() + sys.exit(1) def _stop_docker_agents(self): - """Stop and remove all launched Docker containers.""" - for ctrl in self.controllers: - name = ctrl["name"] - user = ctrl.get("user") - placements = self._get_replica_placements(ctrl) - - containers_for_agent = self.containers.get(name, []) - for i, container_name in enumerate(containers_for_agent): - # Match each container to its placement host - if i < len(placements): - host = placements[i][0] - else: - host = ctrl.get("host", "localhost") - - try: - self._run_cmd(["docker", "stop", container_name], host, user) - self._run_cmd(["docker", "rm", container_name], host, user) - logger.info("Stopped and removed %s on %s", container_name, host) - except Exception as e: - logger.warning("Failed to stop %s on %s: %s", container_name, host, e) + """Stop and remove all managed runtimes.""" + for instance in list(self.runtime_manager.list_instances()): + try: + self.runtime_manager.remove_instance( + self.runtime_manager._instance_id_from_record(instance) + ) + logger.info("Removed runtime %s", instance["runtime_id"]) + except Exception as e: + logger.warning("Failed to remove runtime %s: %s", instance["runtime_id"], e) self.containers.clear() logger.info("All Docker containers stopped.") @@ -903,6 +754,6 @@ def _signal_handler(sig, frame): signal.signal(signal.SIGTERM, _signal_handler) atexit.register(controller.cleanup) - controller.launch_docker_agents() + controller.launch_agents() controller._wait_for_healthy() controller.run() diff --git a/ventis/controller/runtime_manager.py b/ventis/controller/runtime_manager.py new file mode 100644 index 0000000..7e485f6 --- /dev/null +++ b/ventis/controller/runtime_manager.py @@ -0,0 +1,470 @@ +import json +import logging + +from ventis.controller.cloud_provider_logic.EC2 import _runtime as ec2_runtime + +logger = logging.getLogger(__name__) +DEFAULT_HOST = "localhost" +DEFAULT_HOST_PORT_START = 8000 + + +def _is_local_host(host): + return host in {"localhost", "127.0.0.1"} + + +def _container_routing_host(host): + return "host.docker.internal" if _is_local_host(host) else host + + +def reject_legacy_replica_shape(agent_spec): + if "host" in agent_spec or "port" in agent_spec or isinstance(agent_spec.get("replicas"), list): + raise ValueError("Legacy YAML host/port replica placement is no longer supported; use provider with integer replicas only.") + + +def resolve_local_replica_placements(agent_spec): + reject_legacy_replica_shape(agent_spec) + replicas = int(agent_spec.get("replicas", 1)) + return [{"host": DEFAULT_HOST, "host_port": None} for _ in range(replicas)] + + +def allocate_host_port(runtime_manager, host, requested_host_port=None, ignore_instance_id=None): + if requested_host_port is not None: + return int(requested_host_port) + + used_ports = set() + for instance in runtime_manager.list_instances(): + if ignore_instance_id and runtime_manager._instance_id_from_record(instance) == ignore_instance_id: + continue + if instance.get("host") != host: + continue + host_port = instance.get("host_port") + if host_port is not None: + used_ports.add(int(host_port)) + + host_port = DEFAULT_HOST_PORT_START + while host_port in used_ports: + host_port += 1 + return host_port + + +class RuntimeManager: + """Create, reuse, and publish agent runtimes.""" + + ROUTING_ENDPOINTS_KEY = "routing_table:endpoints" + ROUTING_STATEFUL_KEY = "routing_table:stateful" + SERVICES_SET_KEY = "routing_table:services" + CONTAINER_PORT = 50051 + WORKFLOW_API_PORT = 8080 + + def __init__(self, controller, redis_client=None): + self.controller = controller + self._redis = redis_client + + @property + def redis(self): + return self._redis or self.controller.redis + + def ensure_instances(self, agent_specs): + instances = [] + self._agent_specs = list(agent_specs) + self.publish_routing_snapshot(self._agent_specs) + ec2_runtime._set_controller(self.controller) + + for agent_spec in agent_specs: + agent_name = agent_spec["name"] + provider = agent_spec.get("provider", "local") + self.controller.containers.setdefault(agent_name, []) + + if provider.upper() == "EC2": + ec2_runtime.validate_config() + for replica_index in range(int(agent_spec.get("replicas", 1))): + instance_id = self._instance_id(provider, agent_name, replica_index) + key = self._instance_key(provider, agent_name, replica_index) + instance = self.redis.hgetall(key) + if instance: + self.remove_instance(instance_id) + provisioned = ec2_runtime.provision_instance(agent_spec, replica_index) + redis_port = int(agent_spec.get("redis_port", provisioned.get("redis_port", 6379))) + self.controller.ensure_host_redis( + provisioned["host"], + provisioned.get("user"), + redis_port, + ssh_host=provisioned.get("ssh_host"), + ) + self.publish_routing_snapshot(self._agent_specs) + instance = ec2_runtime.bootstrap_instance( + provisioned, + agent_spec, + replica_index, + redis_host=provisioned["host"], + redis_port=redis_port, + ) + self._write_instance(instance) + self._add_instance_to_agent(agent_name, instance_id) + self._track_runtime(agent_name, instance["runtime_id"]) + self.publish_routing_snapshot(self._agent_specs) + instances.append(instance) + continue + + for replica_index, placement in enumerate(self._replica_placements(agent_spec)): + host = placement["host"] + host_port = placement.get("host_port") + instance_id = self._instance_id(provider, agent_name, replica_index) + key = self._instance_key(provider, agent_name, replica_index) + instance = self.redis.hgetall(key) + + if instance and self._runtime_exists(instance) and self._placement_matches(instance, host, host_port): + pass + else: + if instance: + self.remove_instance(instance_id) + instance = self._create_instance( + agent_spec=agent_spec, + host=host, + host_port=host_port, + replica_index=replica_index, + instance_id=instance_id, + previous_instance=instance, + ) + self._write_instance(instance) + + self._add_instance_to_agent(agent_name, instance_id) + self._track_runtime(agent_name, instance["runtime_id"]) + self.publish_routing_snapshot(self._agent_specs) + instances.append(instance) + + return instances + + def _write_instance(self, instance): + key = self._instance_key( + instance["provider"], + instance["agent_name"], + int(instance["replica_index"]), + ) + mapping = { + "agent_name": instance["agent_name"], + "provider": instance["provider"], + "replica_index": str(instance["replica_index"]), + "host": instance["host"], + "host_port": str(instance["host_port"]), + "container_port": str(instance["container_port"]), + "endpoint": instance["endpoint"], + "redis_host": instance["redis_host"], + "redis_port": str(instance["redis_port"]), + "runtime_id": instance["runtime_id"], + } + if instance.get("ec2_instance_id"): + mapping["ec2_instance_id"] = instance["ec2_instance_id"] + self.redis.hset_multiple(key, mapping) + + def _add_instance_to_agent(self, agent_name, instance_id): + self.redis.sadd(f"agent:{agent_name}:instances", instance_id) + + def _publish_endpoint(self, instance): + self.publish_routing_snapshot(self._current_agent_specs()) + + def remove_instance(self, instance_id): + key = f"agent_instance:{instance_id}" + instance = self.redis.hgetall(key) + if not instance: + return + + self._destroy_runtime(instance) + self.redis.delete(key) + self.redis.srem(f"agent:{instance['agent_name']}:instances", instance_id) + + self.publish_routing_snapshot(self._current_agent_specs()) + + containers = self.controller.containers.get(instance["agent_name"], []) + self.controller.containers[instance["agent_name"]] = [ + runtime_id + for runtime_id in containers + if runtime_id != instance["runtime_id"] + ] + + def list_instances(self, agent_name=None): + if agent_name: + instance_ids = sorted(self.redis.smembers(f"agent:{agent_name}:instances")) + instances = [] + for instance_id in instance_ids: + instance = self.redis.hgetall(f"agent_instance:{instance_id}") + if instance: + instances.append(instance) + return instances + + instances = [] + for key in sorted(self.redis.scan_keys("agent_instance:*")): + instance = self.redis.hgetall(key) + if instance: + instances.append(instance) + return instances + + def _create_instance( + self, + agent_spec, + host, + host_port, + replica_index, + instance_id=None, + previous_instance=None, + ): + provider = agent_spec.get("provider", "local") + host_port = self._resolve_host_port( + agent_spec, + host, + host_port, + instance_id=instance_id, + previous_instance=previous_instance, + ) + return self._create_local_instance(agent_spec, host, host_port, replica_index) + + def _create_local_instance(self, agent_spec, host, host_port, replica_index): + return self._launch_container( + agent_spec=agent_spec, + host=host, + host_port=host_port, + replica_index=replica_index, + ensure_remote_image=False, + ) + + def _launch_container(self, agent_spec, host, host_port, replica_index, ensure_remote_image): + agent_name = agent_spec["name"] + provider = agent_spec.get("provider", "local") + user = agent_spec.get("user") + resources = agent_spec.get("resources", {}) + ctrl_type = agent_spec.get("type", "agent") + image = f"ventis-{agent_name.lower()}" + runtime_id = f"ventis-{provider.lower()}-{agent_name.lower()}-{replica_index}" + redis_host = _container_routing_host(host) + agent_host = redis_host + + cmd = [ + "docker", + "run", + "-d", + "-it", + "--add-host=host.docker.internal:host-gateway", + "--name", + runtime_id, + "-p", + f"{host_port}:{self.CONTAINER_PORT}", + "-e", + f"VENTIS_AGENT_PORT={host_port}", + "-e", + f"VENTIS_AGENT_HOST={agent_host}", + "-e", + f"VENTIS_REDIS_HOST={redis_host}", + "-e", + f"VENTIS_REDIS_PORT={agent_spec.get('redis_port', 6379)}", + ] + + if ctrl_type == "workflow": + cmd.extend(["-p", f"{self.WORKFLOW_API_PORT}:8080"]) + + cpu = resources.get("cpu") + memory = resources.get("memory") + gpu = resources.get("gpu") + if cpu: + cmd.extend(["--cpus", str(cpu)]) + if memory: + cmd.extend(["--memory", f"{memory}m"]) + if gpu: + cmd.extend(["--gpus", str(gpu)]) + + cmd.append(image) + + if ensure_remote_image: + self.controller._ensure_image_on_host(image, host, user) + + result = self.controller._run_cmd(cmd, host, user) + if result.returncode != 0: + raise RuntimeError( + f"Failed to launch {runtime_id} on {host}:{host_port}: {result.stderr.strip()}" + ) + + instance = { + "agent_name": agent_name, + "provider": provider, + "replica_index": str(replica_index), + "host": host, + "host_port": str(host_port), + "container_port": str(self.CONTAINER_PORT), + "endpoint": f"{host}:{host_port}", + "redis_host": redis_host, + "redis_port": str(agent_spec.get("redis_port", 6379)), + "runtime_id": runtime_id, + } + logger.info("Runtime ready: %s -> %s", runtime_id, instance["endpoint"]) + return instance + + def _runtime_exists(self, instance): + runtime_id = instance.get("runtime_id") + if not runtime_id: + return False + if instance.get("provider", "local").upper() == "EC2": + try: + return bool(ec2_runtime._get_instance_host(runtime_id)) + except Exception: + return False + host = instance.get("host", "localhost") + result = self.controller._run_cmd( + ["docker", "inspect", runtime_id], + host, + self._user_for_instance(instance), + ) + return result.returncode == 0 + + def _destroy_runtime(self, instance): + runtime_id = instance.get("runtime_id") + if not runtime_id: + return + if instance.get("provider", "local").upper() == "EC2": + ec2_runtime.terminate_instance(runtime_id) + host = instance.get("host") + if host: + getattr(self.controller, "redis_containers", {}).pop(host, None) + getattr(self.controller, "node_redis", {}).pop(host, None) + return + result = self.controller._run_cmd( + ["docker", "rm", "-f", runtime_id], + instance.get("host", "localhost"), + self._user_for_instance(instance), + ) + if result.returncode != 0: + logger.warning("Failed to remove runtime %s: %s", runtime_id, result.stderr.strip()) + + def _track_runtime(self, agent_name, runtime_id): + containers = self.controller.containers.setdefault(agent_name, []) + if runtime_id not in containers: + containers.append(runtime_id) + + def _replica_placements(self, agent_spec): + provider = agent_spec.get("provider", "local") + if provider.upper() == "EC2": + reject_legacy_replica_shape(agent_spec) + return [None] * int(agent_spec.get("replicas", 1)) + return resolve_local_replica_placements(agent_spec) + + def list_runtime_nodes(self, agent_specs=None): + nodes = {} + for agent_spec in agent_specs or getattr(self.controller, "controllers", []): + if agent_spec.get("provider", "local").upper() == "EC2": + continue + user = agent_spec.get("user") + redis_port = agent_spec.get("redis_port", 6379) + for placement in self._replica_placements(agent_spec): + host = placement["host"] + nodes.setdefault(host, {"user": user, "redis_port": redis_port}) + return nodes + + def _resolve_host_port( + self, + agent_spec, + host, + requested_host_port, + instance_id=None, + previous_instance=None, + ): + if previous_instance and previous_instance.get("host") == host and previous_instance.get("host_port"): + requested_host_port = previous_instance["host_port"] + return allocate_host_port( + self, + host, + requested_host_port=requested_host_port, + ignore_instance_id=instance_id, + ) + + def _placement_matches(self, instance, host, host_port): + if instance.get("host") != host: + return False + if host_port is None: + return True + return str(instance.get("host_port")) == str(host_port) + + def _routing_endpoint_for(self, instance): + if instance.get("provider", "local").lower() == "local" and _is_local_host(instance.get("host")): + return f"host.docker.internal:{instance['host_port']}" + return instance["endpoint"] + + @staticmethod + def _instance_id(provider, agent_name, replica_index): + return f"{provider}:{agent_name}:{replica_index}" + + @classmethod + def _instance_key(cls, provider, agent_name, replica_index): + return f"agent_instance:{cls._instance_id(provider, agent_name, replica_index)}" + + def sync_routing_metadata(self, agent_specs): + self.publish_routing_snapshot(agent_specs) + + def publish_routing_snapshot(self, agent_specs): + """Copy routing metadata derived from central records to host Redis.""" + services = {agent_spec["name"] for agent_spec in agent_specs} + stateful = { + agent_spec["name"] + for agent_spec in agent_specs + if agent_spec.get("stateful", False) + } + + for redis_client in self._routing_redis_targets(): + existing_services = redis_client.smembers(self.SERVICES_SET_KEY) + for stale in existing_services - services: + redis_client.srem(self.SERVICES_SET_KEY, stale) + self._hdel(redis_client, self.ROUTING_STATEFUL_KEY, stale) + self._hdel(redis_client, self.ROUTING_ENDPOINTS_KEY, stale) + for service in services: + redis_client.sadd(self.SERVICES_SET_KEY, service) + if service in stateful: + redis_client.hset(self.ROUTING_STATEFUL_KEY, service, "true") + else: + self._hdel(redis_client, self.ROUTING_STATEFUL_KEY, service) + endpoints = [ + self._routing_endpoint_for(item) + for item in sorted( + self.list_instances(service), + key=lambda item: int(item["replica_index"]), + ) + ] + if endpoints: + redis_client.hset( + self.ROUTING_ENDPOINTS_KEY, + service, + json.dumps(endpoints), + ) + else: + self._hdel(redis_client, self.ROUTING_ENDPOINTS_KEY, service) + + def publish_policy_rules(self, rules): + rules_json = json.dumps(rules) + targets = self._routing_redis_targets() + for redis_client in targets: + redis_client.set("policy:rules", rules_json) + return len(targets) + + def _hdel(self, redis_client, name, field): + if hasattr(redis_client, "client"): + redis_client.client.hdel(name, field) + return + if hasattr(redis_client, "hdel"): + redis_client.hdel(name, field) + + def _instance_id_from_record(self, instance): + return self._instance_id( + instance["provider"], + instance["agent_name"], + int(instance["replica_index"]), + ) + + def _user_for_instance(self, instance): + agent_name = instance.get("agent_name") + for agent_spec in getattr(self.controller, "controllers", []): + if agent_spec.get("name") == agent_name: + return agent_spec.get("user") + return None + + def _current_agent_specs(self): + return getattr(self, "_agent_specs", getattr(self.controller, "controllers", [])) + + def _routing_redis_targets(self): + targets = list(getattr(self.controller, "node_redis", {}).values()) + return targets or [self.redis] diff --git a/ventis/templates/config/global_controller.yaml b/ventis/templates/config/global_controller.yaml index 74167bf..f4378cb 100644 --- a/ventis/templates/config/global_controller.yaml +++ b/ventis/templates/config/global_controller.yaml @@ -3,40 +3,51 @@ agents: - name: ExampleAgent - host: localhost - port: 8051 - redis_port: 6379 replicas: 1 + redis_port: 6379 resources: cpu: 1 memory: 512 entrypoint: agents/example_agent.py + provider: local - name: VllmAgent - host: localhost - port: 8052 - redis_port: 6379 replicas: 1 + redis_port: 6379 resources: cpu: 2 memory: 2048 gpu: 1 entrypoint: agents/vllm_agent.py + provider: local - name: Workflow - host: localhost - port: 8050 + replicas: 1 type: workflow - api_port: 8080 redis_port: 6379 - replicas: 1 workflow_file: workflows/example_workflow.py + provider: local -# Polling interval in seconds poll_interval: 5 -# Redis connection redis: host: localhost port: 6379 db: 0 + +# EC2 defaults for `provider: EC2` replicas. +# Keep them here so `config/global_controller.yaml` stays the only source of truth. +# +ec2: + # For the current EC2 runtime path, the instance security group must allow + # inbound TCP 50051 and 6379 from the global controller host's source IP. + region: us-east-1 + ami_id: ami-0123456789abcdef0 + instance_type: t2.nano + subnet_id: subnet-0123456789abcdef0 + security_group_ids: + - sg-0123456789abcdef0 + ssh_user: ubuntu + key_name: ventis-key + agent_image: ventis-agent-base + remote_project_dir: /opt/ventis/project From 2adedd8a73270c57744f8d7def6d994dfac30e4d Mon Sep 17 00:00:00 2001 From: Saak Date: Mon, 29 Jun 2026 15:18:45 -0400 Subject: [PATCH 2/2] Tweaked testing command --- tests/README.md | 43 +++++++++++++++++++++++++++++++++++++++---- tests/run_tests.sh | 7 +++++-- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/tests/README.md b/tests/README.md index 1720b8c..51bdbe1 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,4 +1,39 @@ -# Tests -Fast suite from repo root: `python3 -m pytest tests` -Full local live smoke: `VENTIS_RUN_FULL_LOCAL=1 python3 -m unittest tests.live.test_full_local_deploy` -Small live Docker smoke: `VENTIS_RUN_LIVE_DOCKER=1 python3 -m unittest tests.live.test_local_docker_runtime` +# Ventis Testing & Load Analysis Tools + +This directory contains an automated end-to-end testing suite for Ventis. It is designed to verify both functional correctness and concurrent performance of the distributed agent architecture. + +## 1. Automated Test Runner (`run_tests.sh`) +This script automates the entire testing lifecycle by interacting with the `ventis` CLI: +0. Runs the repo pytest suite. +1. Scaffolds a new temporary project using `ventis new-project`. +2. Compiles the project using `ventis build`. +3. Launches the project using `ventis deploy` in the background. +4. Waits for the GlobalController and all agent sidecars to become healthy. +5. Runs the Python integration and performance scripts. +6. **Cleanup:** Automatically terminates the deployment and cleans up the temporary directory upon success or failure. + +To run the complete suite: +```bash +./run_tests.sh +``` + +## 2. Functional Integration Validation (`test_integration.py`) +Verifies that Ventis correctly passes data and dependencies between chained agents. +- Dispatches a single query to the deployed `/main` endpoint. +- Polls the `/status` endpoint until completion. +- Validates the output payload structure and ensures that data successfully flowed through `FinanceAgent`, `MarketResearchAgent`, and `VllmAgent`. + +To run manually against an already-deployed Ventis instance: +```bash +python test_integration.py +``` + +## 3. High-Concurrency Stress Test (`test_performance.py`) +Evaluates the robustness and scalability of the Ventis Redis routing and Docker architecture under load. Using `concurrent.futures`, this script models N concurrent users actively polling Ventis simultaneously. + +It produces an analytical report summarizing throughput, dropped requests, and latency percentiles. + +To run manually against an already-deployed Ventis instance (e.g. 50 requests across 10 concurrent virtual users): +```bash +python test_performance.py --concurrent 10 --total 50 +``` diff --git a/tests/run_tests.sh b/tests/run_tests.sh index e9e1dc7..3f1ac30 100755 --- a/tests/run_tests.sh +++ b/tests/run_tests.sh @@ -5,6 +5,11 @@ echo "===========================================" echo " Ventis Integration & Performance Tests" echo "===========================================" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" + +echo ">> 0. Running pytest suite..." +python -m pytest "$SCRIPT_DIR" || exit 1 + TEST_DIR="/tmp/ventis_test_env_$$" PROJECT_NAME="ventis_test" @@ -44,8 +49,6 @@ sleep 5 echo ">> Deployment healthy! Running test suite." ORIG_CWD=$(pwd) -# Assuming the script was called from inside the ventis repo root -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" echo "-------------------------------------------" echo ">> Running Integration Tests..."