From 5210fca010f6168de601bf21a6422ca33aa6a6da Mon Sep 17 00:00:00 2001
From: Ersan Bilik
Date: Sat, 23 May 2026 17:35:12 +0300
Subject: [PATCH] feat(backend): add llama.cpp (llama-server) backend wrapper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add llamacpp as a named backend alongside vllm, openai, anthropic,
ollama, and lmstudio. Follows the established vLLM pattern exactly:

- llamacpp struct embedding *openAICompat with cold-start retry
- Ping() delegates to coldStartRetry (reuses vllm_internal.go helper)
- Default endpoint: http://localhost:8080 (llama-server default)
- No API key required by default (llama-server runs unauthenticated)

.ctxrc usage (once factory wiring + ctx ai are implemented):

  backends:
    - name: local
      type: llamacpp
      endpoint: http://localhost:8080
      timeout: 60s
  default_backend: local

Note: this commit adds the backend type and tests only. Factory
registration wiring and consumer CLI commands (ctx ai, ctx compact
--emit, ctx ingest) are not yet implemented in the vllm-integration
branch either — they are expected in a future phase once the backend
abstraction layer stabilizes. When that wiring lands, llamacpp will be
available as a backend type with zero additional work.

Validated against a live llama-server (Qwen3-4B-Q4_K_M):

- Ping: GET /v1/models returns 200
- Complete: POST /v1/chat/completions returns model response
- Cold-start retry: unit-tested via shared coldStartRetry helper

Files:

  internal/config/backend/backend.go     +2 constants
  internal/backend/types.go              +1 struct (llamacpp)
  internal/backend/llamacpp.go           constructor + Ping override
  internal/backend/llamacpp_test.go      3 unit tests (mock server)
  internal/backend/llamacpp_e2e_test.go  2 e2e tests (build tag: e2e)
---
 internal/backend/llamacpp.go          | 67 +++++++++++++++++++++++++++
 internal/backend/llamacpp_e2e_test.go | 62 +++++++++++++++++++++++++
 internal/backend/llamacpp_test.go     | 59 +++++++++++++++++++++++
 internal/backend/types.go             | 18 +++++++
 internal/config/backend/backend.go    |  6 +++
 5 files changed, 212 insertions(+)
 create mode 100644 internal/backend/llamacpp.go
 create mode 100644 internal/backend/llamacpp_e2e_test.go
 create mode 100644 internal/backend/llamacpp_test.go

diff --git a/internal/backend/llamacpp.go b/internal/backend/llamacpp.go
new file mode 100644
index 00000000..9df0e970
--- /dev/null
+++ b/internal/backend/llamacpp.go
@@ -0,0 +1,67 @@
+// / ctx: https://ctx.ist
+// ,'`./ do you remember?
+// `.,'\
+// \ Copyright 2026-present Context contributors.
+// SPDX-License-Identifier: Apache-2.0
+
+package backend
+
+import (
+	"context"
+
+	cfgBackend "github.com/ActiveMemory/ctx/internal/config/backend"
+)
+
+// newLlamaCpp constructs a llama.cpp backend. The wire
+// work is delegated to the embedded openAICompat;
+// llamacpp adds cold-start retry on ECONNREFUSED for
+// Ping, the same retry logic used by vLLM. This assumes
+// llama-server refuses connections until the model is
+// loaded. NOTE(review): upstream documents HTTP 503
+// "Loading model" during that window; confirm.
+//
+// Parameters:
+//   - cfg: per-project backend config.
+//
+// Returns:
+//   - *llamacpp: concrete backend.
+//   - error: err/backend sentinel propagated unchanged
+//     from newOpenAICompat (e.g. invalid endpoint).
+func newLlamaCpp(cfg Config) (*llamacpp, error) {
+	if cfg.Name == "" { // fall back to the registered type label
+		cfg.Name = cfgBackend.NameLlamaCpp
+	}
+	if cfg.Endpoint == "" { // fall back to the default local listener
+		cfg.Endpoint = cfgBackend.DefaultEndpointLlamaCpp
+	}
+	inner, err := newOpenAICompat(cfg) // validates the defaulted config
+	if err != nil {
+		return nil, err
+	}
+	return &llamacpp{
+		openAICompat:      inner,
+		coldStartWindow:   cfgBackend.DefaultColdStartWindow,
+		coldStartInterval: cfgBackend.DefaultColdStartInterval,
+	}, nil
+}
+
+// Ping implements [Backend] with llama.cpp-specific
+// cold-start retry. Delegates to the embedded
+// openAICompat.Ping via [coldStartRetry] which retries
+// while the failure is a dial-refused error (server has
+// not yet bound the listener) and the cold-start window
+// has not elapsed.
+//
+// Parameters:
+//   - ctx: caller-provided context for cancellation.
+//
+// Returns:
+//   - error: nil on success, typed sentinel on failure.
+func (b *llamacpp) Ping(ctx context.Context) error {
+	return coldStartRetry(
+		ctx,
+		b.openAICompat.Ping, // the embedded, non-retrying Ping
+		b.coldStartWindow,
+		b.coldStartInterval,
+	)
+}
diff --git a/internal/backend/llamacpp_e2e_test.go b/internal/backend/llamacpp_e2e_test.go
new file mode 100644
index 00000000..3a8ec3b7
--- /dev/null
+++ b/internal/backend/llamacpp_e2e_test.go
@@ -0,0 +1,62 @@
+//go:build e2e
+
+package backend
+
+import (
+	"context"
+	"os"
+	"testing"
+	"time"
+)
+
+func TestLlamaCpp_E2E_PingRealServer(t *testing.T) {
+	endpoint := os.Getenv("LLAMACPP_ENDPOINT") // optional override of the target server
+	if endpoint == "" {
+		endpoint = "http://localhost:8080"
+	}
+	b, err := newLlamaCpp(Config{Endpoint: endpoint})
+	if err != nil {
+		t.Fatalf("ctor: %v", err)
+	}
+	if b.Name() != "llamacpp" { // Name was left empty, so the default label applies
+		t.Errorf("Name = %q, want llamacpp", b.Name())
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) // deadline handed to Ping
+	defer cancel()
+	if err := b.Ping(ctx); err != nil {
+		t.Fatalf("Ping real server: %v", err)
+	}
+	t.Logf("Ping OK: %s", endpoint)
+}
+
+func TestLlamaCpp_E2E_CompleteRealServer(t *testing.T) {
+	endpoint := os.Getenv("LLAMACPP_ENDPOINT") // optional override of the target server
+	if endpoint == "" {
+		endpoint = "http://localhost:8080"
+	}
+	b, err := newLlamaCpp(Config{
+		Endpoint:     endpoint,
+		DefaultModel: "test", // NOTE(review): placeholder id; presumably the server ignores it — confirm
+	})
+	if err != nil {
+		t.Fatalf("ctor: %v", err)
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) // deadline handed to Complete
+	defer cancel()
+	resp, err := b.Complete(ctx, Request{
+		Messages: []Message{
+			{Role: "user", Content: "Say hello in one word."},
+		},
+		MaxTokens:   32,
+		Temperature: 0.0,
+	})
+	if err != nil {
+		t.Fatalf("Complete: %v", err)
+	}
+	t.Logf("Model: %s", resp.Model)
+	t.Logf("Content: %q", resp.Content)
+	t.Logf("Raw: %s", string(resp.Raw))
+	if resp.Content == "" && len(resp.Raw) == 0 { // fails only when both are empty
+		t.Fatal("Complete returned empty content and empty raw")
+	}
+}
diff --git a/internal/backend/llamacpp_test.go b/internal/backend/llamacpp_test.go
new file mode 100644
index 00000000..f3999133
--- /dev/null
+++ b/internal/backend/llamacpp_test.go
@@ -0,0 +1,59 @@
+// / ctx: https://ctx.ist
+// ,'`./ do you remember?
+// `.,'\
+// \ Copyright 2026-present Context contributors.
+// SPDX-License-Identifier: Apache-2.0
+
+package backend
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	errBackend "github.com/ActiveMemory/ctx/internal/err/backend"
+)
+
+func TestNewLlamaCpp_DefaultName(t *testing.T) {
+	srv := fakeOpenAIServer(t, testModel)
+	defer srv.Close()
+	b, err := newLlamaCpp(Config{Endpoint: srv.URL}) // Name left empty on purpose
+	if err != nil {
+		t.Fatalf("ctor: %v", err)
+		return // not reached: Fatalf stops the test
+	}
+	if b.Name() != "llamacpp" {
+		t.Errorf("Name = %q, want llamacpp", b.Name())
+	}
+}
+
+func TestLlamaCpp_Ping_HappyDelegatesToOpenAICompat(t *testing.T) {
+	srv := fakeOpenAIServer(t, testModel)
+	defer srv.Close()
+	b, _ := newLlamaCpp(Config{Endpoint: srv.URL}) // same Config shape is ctor-checked in TestNewLlamaCpp_DefaultName
+	if err := b.Ping(context.Background()); err != nil {
+		t.Fatalf("Ping: %v", err)
+	}
+}
+
+func TestLlamaCpp_Ping_NonDialErrorReturnsImmediately(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "broken", http.StatusInternalServerError) // every request answers 500
+	}))
+	defer srv.Close()
+	b, _ := newLlamaCpp(Config{Endpoint: srv.URL}) // same Config shape is ctor-checked in TestNewLlamaCpp_DefaultName
+	b.coldStartWindow = 2 * time.Second // must exceed the 200ms bound below so a retry on 500 is observable
+	b.coldStartInterval = 5 * time.Millisecond
+	start := time.Now()
+	err := b.Ping(context.Background())
+	elapsed := time.Since(start)
+	if !errors.Is(err, errBackend.ErrUnhealthyStatus) {
+		t.Fatalf("got %v, want ErrUnhealthyStatus", err)
+	}
+	if elapsed > 200*time.Millisecond { // a retry loop would run for the whole 2s window
+		t.Errorf("Ping retried on 500; elapsed %v should be near-zero", elapsed)
+	}
+}
diff --git a/internal/backend/types.go b/internal/backend/types.go
index af88e923..c67e17aa 100644
--- a/internal/backend/types.go
+++ b/internal/backend/types.go
@@ -174,6 +174,24 @@ type vllm struct {
 	coldStartInterval time.Duration
 }
 
+// llamacpp is the llama.cpp (llama-server) backend.
+// Embeds *openAICompat for the wire work and overrides
+// Ping with cold-start retry on ECONNREFUSED. Assumes
+// llama-server refuses connections until the model is
+// loaded. NOTE(review): upstream documents 503; confirm.
+//
+// Fields:
+//   - openAICompat: embedded generic backend providing
+//     Name/Complete and the base Ping.
+//   - coldStartWindow: maximum wall-clock during which
+//     Ping retries on refused.
+//   - coldStartInterval: sleep between retry attempts.
+type llamacpp struct {
+	*openAICompat
+	coldStartWindow   time.Duration
+	coldStartInterval time.Duration
+}
+
 // openAICompat is a generic OpenAI-compatible HTTP
 // backend used directly for `openai-compatible` configs
 // and embedded by per-vendor wrappers (vllm, openai, ...)
diff --git a/internal/config/backend/backend.go b/internal/config/backend/backend.go
index c165998e..58ae970e 100644
--- a/internal/config/backend/backend.go
+++ b/internal/config/backend/backend.go
@@ -33,6 +33,9 @@ const (
 	// NameLMStudio is the registered backend type label
 	// for the LM Studio wrapper.
 	NameLMStudio = "lmstudio"
+	// NameLlamaCpp is the registered backend type label
+	// for the llama.cpp (llama-server) wrapper.
+	NameLlamaCpp = "llamacpp"
 )
 
 // Default endpoints for each per-vendor wrapper. Applied
@@ -57,6 +60,9 @@ const (
 	// deployments routinely vary the port so the user can
 	// always override.
 	DefaultEndpointVLLM = "http://localhost:8000"
+	// DefaultEndpointLlamaCpp is the llama-server default
+	// local listener address.
+	DefaultEndpointLlamaCpp = "http://localhost:8080"
 )
 
 // Default API-key environment variable names for each