Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions internal/backend/llamacpp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// / ctx: https://ctx.ist
// ,'`./ do you remember?
// `.,'\
// \ Copyright 2026-present Context contributors.
// SPDX-License-Identifier: Apache-2.0

package backend

import (
"context"

cfgBackend "github.com/ActiveMemory/ctx/internal/config/backend"
)

// newLlamaCpp builds the llama.cpp (llama-server)
// backend. All wire-level work is handled by the embedded
// openAICompat; this wrapper only layers cold-start retry
// on top of Ping. llama-server does not bind its HTTP
// listener until the model weights are fully loaded, so
// the OS answers ECONNREFUSED during that window; the
// same cold-start retry logic used by vLLM applies here.
//
// Parameters:
//   - cfg: per-project backend config. An empty Name or
//     Endpoint is replaced with the llama.cpp default
//     before the config is handed to newOpenAICompat.
//
// Returns:
//   - *llamacpp: concrete backend, nil on error.
//   - error: the error from newOpenAICompat, passed
//     through unchanged (typed err/backend sentinel on
//     validation failure such as an invalid endpoint).
func newLlamaCpp(cfg Config) (*llamacpp, error) {
	// cfg is a by-value copy, so filling in defaults here
	// never modifies the caller's Config.
	if cfg.Name == "" {
		cfg.Name = cfgBackend.NameLlamaCpp
	}
	if cfg.Endpoint == "" {
		cfg.Endpoint = cfgBackend.DefaultEndpointLlamaCpp
	}

	base, err := newOpenAICompat(cfg)
	if err != nil {
		return nil, err
	}

	b := new(llamacpp)
	b.openAICompat = base
	b.coldStartWindow = cfgBackend.DefaultColdStartWindow
	b.coldStartInterval = cfgBackend.DefaultColdStartInterval
	return b, nil
}

// Ping implements [Backend] with the llama.cpp-specific
// cold-start retry. The actual health probe is the
// embedded openAICompat.Ping; [coldStartRetry] wraps it
// and keeps retrying while the failure is a dial-refused
// error (server has not yet bound the listener) and the
// cold-start window has not elapsed.
//
// Parameters:
//   - ctx: caller-provided context for cancellation.
//
// Returns:
//   - error: nil on success, typed sentinel on failure.
func (b *llamacpp) Ping(ctx context.Context) error {
	// Name the embedded field explicitly: a bare b.Ping
	// would resolve to this method and recurse.
	probe := b.openAICompat.Ping
	window, interval := b.coldStartWindow, b.coldStartInterval
	return coldStartRetry(ctx, probe, window, interval)
}
62 changes: 62 additions & 0 deletions internal/backend/llamacpp_e2e_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
//go:build e2e

package backend

import (
"context"
"os"
"testing"
"time"
)

// TestLlamaCpp_E2E_PingRealServer pings a live llama-server.
// The target comes from LLAMACPP_ENDPOINT and falls back to
// http://localhost:8080 when the variable is unset or empty.
func TestLlamaCpp_E2E_PingRealServer(t *testing.T) {
	target := "http://localhost:8080"
	if fromEnv := os.Getenv("LLAMACPP_ENDPOINT"); fromEnv != "" {
		target = fromEnv
	}

	lc, err := newLlamaCpp(Config{Endpoint: target})
	if err != nil {
		t.Fatalf("ctor: %v", err)
	}
	if name := lc.Name(); name != "llamacpp" {
		t.Errorf("Name = %q, want llamacpp", name)
	}

	// Bound the whole probe, including any cold-start retries.
	pingCtx, stop := context.WithTimeout(context.Background(), 10*time.Second)
	defer stop()

	if pingErr := lc.Ping(pingCtx); pingErr != nil {
		t.Fatalf("Ping real server: %v", pingErr)
	}
	t.Logf("Ping OK: %s", target)
}

// TestLlamaCpp_E2E_CompleteRealServer sends one short chat
// request to a live llama-server and requires that the
// response carries either parsed content or a raw payload.
// The target comes from LLAMACPP_ENDPOINT and falls back to
// http://localhost:8080 when the variable is unset or empty.
func TestLlamaCpp_E2E_CompleteRealServer(t *testing.T) {
	target := "http://localhost:8080"
	if fromEnv := os.Getenv("LLAMACPP_ENDPOINT"); fromEnv != "" {
		target = fromEnv
	}

	lc, err := newLlamaCpp(Config{
		Endpoint:     target,
		DefaultModel: "test",
	})
	if err != nil {
		t.Fatalf("ctor: %v", err)
	}

	callCtx, stop := context.WithTimeout(context.Background(), 30*time.Second)
	defer stop()

	req := Request{
		Messages: []Message{
			{Role: "user", Content: "Say hello in one word."},
		},
		MaxTokens:   32,
		Temperature: 0.0,
	}
	resp, err := lc.Complete(callCtx, req)
	if err != nil {
		t.Fatalf("Complete: %v", err)
	}

	// Log everything first so a failure below still shows
	// what the server sent back.
	t.Logf("Model: %s", resp.Model)
	t.Logf("Content: %q", resp.Content)
	t.Logf("Raw: %s", string(resp.Raw))

	if resp.Content != "" || len(resp.Raw) != 0 {
		return
	}
	t.Fatal("Complete returned empty content and empty raw")
}
59 changes: 59 additions & 0 deletions internal/backend/llamacpp_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// / ctx: https://ctx.ist
// ,'`./ do you remember?
// `.,'\
// \ Copyright 2026-present Context contributors.
// SPDX-License-Identifier: Apache-2.0

package backend

import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"

errBackend "github.com/ActiveMemory/ctx/internal/err/backend"
)

// TestNewLlamaCpp_DefaultName checks that a Config without
// a Name yields a backend that reports "llamacpp".
func TestNewLlamaCpp_DefaultName(t *testing.T) {
	srv := fakeOpenAIServer(t, testModel)
	defer srv.Close()

	b, err := newLlamaCpp(Config{Endpoint: srv.URL})
	if err != nil {
		// Fatalf stops this test goroutine, so no explicit
		// return is needed after it.
		t.Fatalf("ctor: %v", err)
	}
	if got := b.Name(); got != "llamacpp" {
		t.Errorf("Name = %q, want llamacpp", got)
	}
}

// TestLlamaCpp_Ping_HappyDelegatesToOpenAICompat checks that
// Ping succeeds against the server returned by
// fakeOpenAIServer.
func TestLlamaCpp_Ping_HappyDelegatesToOpenAICompat(t *testing.T) {
	srv := fakeOpenAIServer(t, testModel)
	defer srv.Close()

	b, err := newLlamaCpp(Config{Endpoint: srv.URL})
	if err != nil {
		// Report the constructor failure directly instead of
		// letting a nil backend panic inside Ping.
		t.Fatalf("ctor: %v", err)
	}
	if err := b.Ping(context.Background()); err != nil {
		t.Fatalf("Ping: %v", err)
	}
}

// TestLlamaCpp_Ping_NonDialErrorReturnsImmediately checks
// that an HTTP 500 from a reachable server is surfaced as
// ErrUnhealthyStatus without going through the cold-start
// retry loop.
func TestLlamaCpp_Ping_NonDialErrorReturnsImmediately(t *testing.T) {
	// The server is listening, so the dial succeeds; every
	// request is answered with a 500.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		http.Error(w, "broken", http.StatusInternalServerError)
	}))
	defer srv.Close()

	b, err := newLlamaCpp(Config{Endpoint: srv.URL})
	if err != nil {
		// Report the constructor failure directly instead of
		// letting a nil backend panic below.
		t.Fatalf("ctor: %v", err)
	}

	// The retry window must be well above the elapsed-time
	// threshold asserted below. With a window shorter than
	// the threshold, a Ping that wrongly retried on 500
	// would still finish in time and the check would pass.
	const maxElapsed = 200 * time.Millisecond
	b.coldStartWindow = 1 * time.Second
	b.coldStartInterval = 5 * time.Millisecond

	start := time.Now()
	err = b.Ping(context.Background())
	elapsed := time.Since(start)

	if !errors.Is(err, errBackend.ErrUnhealthyStatus) {
		t.Fatalf("got %v, want ErrUnhealthyStatus", err)
	}
	if elapsed > maxElapsed {
		t.Errorf("Ping retried on 500; elapsed %v should be near-zero", elapsed)
	}
}
18 changes: 18 additions & 0 deletions internal/backend/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,24 @@ type vllm struct {
coldStartInterval time.Duration
}

// llamacpp wraps the generic OpenAI-compatible backend
// for llama.cpp's llama-server. The wire work comes from
// the embedded *openAICompat; the wrapper overrides Ping
// to add cold-start retry on ECONNREFUSED, because
// llama-server does not bind its listener until the model
// is fully loaded and the OS refuses connections during
// that window.
//
// Fields:
//   - openAICompat: embedded generic backend providing
//     Name/Complete and the base Ping.
//   - coldStartWindow: maximum wall-clock during which
//     Ping retries on refused.
//   - coldStartInterval: sleep between retry attempts.
type llamacpp struct {
	*openAICompat

	coldStartWindow, coldStartInterval time.Duration
}

// openAICompat is a generic OpenAI-compatible HTTP
// backend used directly for `openai-compatible` configs
// and embedded by per-vendor wrappers (vllm, openai, ...)
Expand Down
6 changes: 6 additions & 0 deletions internal/config/backend/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ const (
// NameLMStudio is the registered backend type label
// for the LM Studio wrapper.
NameLMStudio = "lmstudio"
// NameLlamaCpp is the registered backend type label
// for the llama.cpp (llama-server) wrapper.
NameLlamaCpp = "llamacpp"
)

// Default endpoints for each per-vendor wrapper. Applied
Expand All @@ -57,6 +60,9 @@ const (
// deployments routinely vary the port so the user can
// always override.
DefaultEndpointVLLM = "http://localhost:8000"
// DefaultEndpointLlamaCpp is the llama-server default
// local listener address.
DefaultEndpointLlamaCpp = "http://localhost:8080"
)

// Default API-key environment variable names for each
Expand Down