ActiveMemory · bilersan · May 23, 2026
@@ -0,0 +1,67 @@
+//   /    ctx:                         https://ctx.ist
+// ,'`./    do you remember?
+// `.,'\
+//   \    Copyright 2026-present Context contributors.
+//                 SPDX-License-Identifier: Apache-2.0
+
+package backend
+
+import (
+	"context"
+
+	cfgBackend "github.com/ActiveMemory/ctx/internal/config/backend"
+)
+
+// newLlamaCpp builds the llama.cpp (llama-server)
+// backend. Request/response handling is left to an
+// embedded openAICompat; the wrapper only carries the
+// cold-start retry settings that Ping consumes. Empty
+// Name and Endpoint fields are filled with the
+// llama.cpp defaults before the config is passed on.
+//
+// Parameters:
+//   - cfg: per-project backend config; received by
+//     value, so the caller's copy is not modified.
+//
+// Returns:
+//   - *llamacpp: concrete backend, nil on error.
+//   - error: the error from newOpenAICompat, returned
+//     unchanged, when it rejects the config.
+func newLlamaCpp(cfg Config) (*llamacpp, error) {
+	// Fall back to the llama.cpp defaults for any
+	// field the caller left unset.
+	if cfg.Name == "" {
+		cfg.Name = cfgBackend.NameLlamaCpp
+	}
+	if cfg.Endpoint == "" {
+		cfg.Endpoint = cfgBackend.DefaultEndpointLlamaCpp
+	}
+	base, err := newOpenAICompat(cfg)
+	if err != nil {
+		return nil, err
+	}
+	lc := &llamacpp{openAICompat: base}
+	lc.coldStartWindow = cfgBackend.DefaultColdStartWindow
+	lc.coldStartInterval = cfgBackend.DefaultColdStartInterval
+	return lc, nil
+}
+
+// Ping implements [Backend] for llama.cpp. It does not
+// call the embedded openAICompat.Ping directly; the
+// method is handed to [coldStartRetry] together with
+// this backend's cold-start window and retry interval.
+// coldStartRetry is expected to keep retrying while
+// the failure is a dial-refused error (the server has
+// not bound its listener yet) and the window has not
+// elapsed.
+//
+// Parameters:
+//   - ctx: caller-provided context for cancellation.
+//
+// Returns:
+//   - error: nil on success, typed sentinel on failure.
+func (b *llamacpp) Ping(ctx context.Context) error {
+	basePing := b.openAICompat.Ping
+	window, interval := b.coldStartWindow, b.coldStartInterval
+	return coldStartRetry(ctx, basePing, window, interval)
+}
@@ -0,0 +1,62 @@
+//go:build e2e
+
+package backend
+
+import (
+	"context"
+	"os"
+	"testing"
+	"time"
+)
+
+// TestLlamaCpp_E2E_PingRealServer pings a running
+// llama-server. The address comes from
+// LLAMACPP_ENDPOINT, falling back to
+// http://localhost:8080 when the variable is empty.
+func TestLlamaCpp_E2E_PingRealServer(t *testing.T) {
+	addr := os.Getenv("LLAMACPP_ENDPOINT")
+	if addr == "" {
+		addr = "http://localhost:8080"
+	}
+	be, err := newLlamaCpp(Config{Endpoint: addr})
+	if err != nil {
+		t.Fatalf("ctor: %v", err)
+	}
+	if got := be.Name(); got != "llamacpp" {
+		t.Errorf("Name = %q, want llamacpp", got)
+	}
+	pingCtx, stop := context.WithTimeout(context.Background(), 10*time.Second)
+	defer stop()
+	err = be.Ping(pingCtx)
+	if err != nil {
+		t.Fatalf("Ping real server: %v", err)
+	}
+	t.Logf("Ping OK: %s", addr)
+}
+
+// TestLlamaCpp_E2E_CompleteRealServer sends one short
+// completion request to a running llama-server and
+// logs the reply. It fails only when the call errors
+// or when both Content and Raw come back empty.
+func TestLlamaCpp_E2E_CompleteRealServer(t *testing.T) {
+	addr := os.Getenv("LLAMACPP_ENDPOINT")
+	if addr == "" {
+		addr = "http://localhost:8080"
+	}
+	cfg := Config{Endpoint: addr, DefaultModel: "test"}
+	be, err := newLlamaCpp(cfg)
+	if err != nil {
+		t.Fatalf("ctor: %v", err)
+	}
+	reqCtx, stop := context.WithTimeout(context.Background(), 30*time.Second)
+	defer stop()
+	req := Request{
+		Messages:    []Message{{Role: "user", Content: "Say hello in one word."}},
+		MaxTokens:   32,
+		Temperature: 0.0,
+	}
+	resp, err := be.Complete(reqCtx, req)
+	if err != nil {
+		t.Fatalf("Complete: %v", err)
+	}
+	t.Logf("Model: %s", resp.Model)
+	t.Logf("Content: %q", resp.Content)
+	t.Logf("Raw: %s", string(resp.Raw))
+	// Either a decoded message or a raw payload counts
+	// as a usable reply.
+	if resp.Content != "" || len(resp.Raw) != 0 {
+		return
+	}
+	t.Fatal("Complete returned empty content and empty raw")
+}
@@ -0,0 +1,59 @@
+//   /    ctx:                         https://ctx.ist
+// ,'`./    do you remember?
+// `.,'\
+//   \    Copyright 2026-present Context contributors.
+//                 SPDX-License-Identifier: Apache-2.0
+
+package backend
+
+import (
+	"context"
+	"errors"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	errBackend "github.com/ActiveMemory/ctx/internal/err/backend"
+)
+
+// TestNewLlamaCpp_DefaultName checks that a config
+// without a Name yields a backend that reports
+// "llamacpp".
+func TestNewLlamaCpp_DefaultName(t *testing.T) {
+	srv := fakeOpenAIServer(t, testModel)
+	defer srv.Close()
+	be, err := newLlamaCpp(Config{Endpoint: srv.URL})
+	if err != nil {
+		// Fatalf stops the test here, so be is never
+		// dereferenced when construction failed.
+		t.Fatalf("ctor: %v", err)
+	}
+	if got := be.Name(); got != "llamacpp" {
+		t.Errorf("Name = %q, want llamacpp", got)
+	}
+}
+
+// TestLlamaCpp_Ping_HappyDelegatesToOpenAICompat checks
+// that Ping succeeds against a healthy fake
+// OpenAI-compatible server.
+func TestLlamaCpp_Ping_HappyDelegatesToOpenAICompat(t *testing.T) {
+	srv := fakeOpenAIServer(t, testModel)
+	defer srv.Close()
+	b, err := newLlamaCpp(Config{Endpoint: srv.URL})
+	if err != nil {
+		// Report the constructor failure directly rather
+		// than panicking on a nil backend in Ping below.
+		t.Fatalf("ctor: %v", err)
+	}
+	if err := b.Ping(context.Background()); err != nil {
+		t.Fatalf("Ping: %v", err)
+	}
+}
+
+// TestLlamaCpp_Ping_NonDialErrorReturnsImmediately
+// checks that an HTTP 500 from the server is surfaced
+// as ErrUnhealthyStatus without Ping spending time in
+// the cold-start retry loop.
+func TestLlamaCpp_Ping_NonDialErrorReturnsImmediately(t *testing.T) {
+	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
+		http.Error(w, "broken", http.StatusInternalServerError)
+	}))
+	defer srv.Close()
+	b, err := newLlamaCpp(Config{Endpoint: srv.URL})
+	if err != nil {
+		t.Fatalf("ctor: %v", err)
+	}
+	// The window must sit well above the elapsed-time
+	// threshold asserted below. With a shorter window a
+	// regression that retried on 500 would still finish
+	// under the threshold and go unnoticed.
+	b.coldStartWindow = 2 * time.Second
+	b.coldStartInterval = 5 * time.Millisecond
+	start := time.Now()
+	err = b.Ping(context.Background())
+	elapsed := time.Since(start)
+	if !errors.Is(err, errBackend.ErrUnhealthyStatus) {
+		t.Fatalf("got %v, want ErrUnhealthyStatus", err)
+	}
+	if elapsed > 200*time.Millisecond {
+		t.Errorf("Ping retried on 500; elapsed %v should be near-zero", elapsed)
+	}
+}
@@ -174,6 +174,24 @@ type vllm struct {
 	coldStartInterval time.Duration
 }
 
+// llamacpp is the llama.cpp (llama-server) backend.
+// It embeds *openAICompat for the wire work and
+// overrides Ping to add cold-start retry on
+// ECONNREFUSED: llama-server does not bind its listener
+// until the model is fully loaded, and the OS returns
+// ECONNREFUSED during that window.
+//
+// Fields:
+//   - openAICompat: embedded generic backend providing
+//     Name/Complete and the base Ping.
+//   - coldStartWindow: maximum wall-clock time during
+//     which Ping keeps retrying a refused connection.
+//   - coldStartInterval: sleep between retry attempts.
+type llamacpp struct {
+	*openAICompat
+	coldStartWindow   time.Duration
+	coldStartInterval time.Duration
+}
+
 // openAICompat is a generic OpenAI-compatible HTTP
 // backend used directly for `openai-compatible` configs
 // and embedded by per-vendor wrappers (vllm, openai, ...)

@@ -33,6 +33,9 @@ const (
 	// NameLMStudio is the registered backend type label
 	// for the LM Studio wrapper.
 	NameLMStudio = "lmstudio"
+	// NameLlamaCpp is the registered backend type label
+	// for the llama.cpp (llama-server) wrapper.
+	NameLlamaCpp = "llamacpp"
 )
 
 // Default endpoints for each per-vendor wrapper. Applied
@@ -57,6 +60,9 @@ const (
 	// deployments routinely vary the port so the user can
 	// always override.
 	DefaultEndpointVLLM = "http://localhost:8000"
+	// DefaultEndpointLlamaCpp is the llama-server default
+	// local listener address.
+	DefaultEndpointLlamaCpp = "http://localhost:8080"
 )
 
 // Default API-key environment variable names for each