Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 41 additions & 8 deletions src/bonfire/knowledge/memory.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2026 BonfireAI

"""In-memory vault backend for testing.
"""In-memory vault backend.

Stores entries in a list. Query uses substring matching on content.
No embeddings, no external dependencies. Implements VaultBackend protocol.
Stores entries in a list with O(1)-indexed dedup and a lowercase-content
cache for fast substring query. No embeddings, no external dependencies.
Implements VaultBackend protocol.

Originally documented as "for testing," but the knowledge factory returns
this backend as the SHIPPING DEFAULT when knowledge is unconfigured
(``enabled=False`` or ``backend="memory"``). Production users hit this
class on every ``bonfire scan`` unless they explicitly opt into LanceDB.
The performance characteristics matter accordingly.
"""

from __future__ import annotations
Expand All @@ -18,15 +25,36 @@


class InMemoryVaultBackend:
"""In-memory vault for tests. No embeddings, substring matching."""
"""In-memory vault. Substring matching, no embeddings.

Shipping default when knowledge is unconfigured. O(1) ``exists()`` via a
side hash-set index; ``query()`` reads pre-computed lowercase content
from a side cache so it does NOT allocate ``e.content.lower()`` per
entry per call.

The public ``_entries: list[VaultEntry]`` attribute is preserved as the
canonical iteration / inspection surface; ``_hashes`` and
``_lower_content`` are private indexes maintained at ``store()`` time.
All three are append-only — ``store()`` does NOT dedup on hash collision
(the backend's caller is responsible for the ``exists()``-then-``store()``
guard pattern).
"""

def __init__(self) -> None:
self._entries: list[VaultEntry] = []
# O(1) ``exists()`` — set of all stored content hashes.
self._hashes: set[str] = set()
# ``query()`` lower-case cache, keyed by ``entry_id``. Populated at
# ``store()``; read in the substring-match hot loop so we never call
# ``e.content.lower()`` per entry per query.
self._lower_content: dict[str, str] = {}

async def store(self, entry: VaultEntry) -> str:
if not entry.content_hash:
entry = entry.model_copy(update={"content_hash": compute_hash(entry.content)})
self._entries.append(entry)
self._hashes.add(entry.content_hash)
self._lower_content[entry.entry_id] = entry.content.lower()
return entry.entry_id

async def query(
Expand All @@ -39,15 +67,20 @@ async def query(
candidates = self._entries
if entry_type is not None:
candidates = [e for e in candidates if e.entry_type == entry_type]
query_lower = query.lower()
query_words = query_lower.split()
scored = [(e, sum(1 for w in query_words if w in e.content.lower())) for e in candidates]
query_words = query.lower().split()
# Read from the lower-case cache populated at store() — no per-entry
# ``.lower()`` allocation in this hot path.
lower_content = self._lower_content
scored = [
(e, sum(1 for w in query_words if w in lower_content[e.entry_id])) for e in candidates
]
scored = [(e, s) for e, s in scored if s > 0]
scored.sort(key=lambda x: x[1], reverse=True)
return [e for e, _ in scored[:limit]]

async def exists(self, content_hash: str) -> bool:
return any(e.content_hash == content_hash for e in self._entries)
# O(1) via side index; replaces the prior O(N) ``any(...)`` scan.
return content_hash in self._hashes

async def get_by_source(self, source_path: str) -> list[VaultEntry]:
return [e for e in self._entries if e.source_path == source_path]
194 changes: 194 additions & 0 deletions tests/unit/test_knowledge_memory_performance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# SPDX-License-Identifier: Apache-2.0
# Copyright 2026 BonfireAI

"""Knight RED tests — InMemoryVaultBackend performance (linear ingest).

The in-memory backend ships as the default when knowledge is unconfigured —
its docstring says "for testing" but the factory returns it for the bare
``enabled=False`` and ``backend="memory"`` paths. Production users hit it
unless they explicitly opt into LanceDB.

The pre-fix implementation:

- ``exists(content_hash)`` is ``any(e.content_hash == hash for e in self._entries)``
— linear scan over the entire entries list per call.
- ``query(...)`` allocates a fresh ``e.content.lower()`` per entry per call —
hot-path string allocation that grows linearly with vault size.

The ingest paths (``KnowledgeIngestConsumer._store``, the various
``knowledge.ingest`` entry points, ``TechScanner.scan_and_store``) ALL call
``exists()`` before ``store()``. Stacked together: ingesting N entries
requires N linear-scan exists checks ⇒ **O(N²) ingest** for an "in-memory
backend marked for testing" that's actually the shipping default.

This Knight pins:

1. **Wall-clock cap on 10,000-entry ingest.** Linear behavior should complete
well under 1 second; the pre-fix quadratic shape takes 10s+ on this CPU.
The test asserts < 1.0 s with the standard ``exists``-then-``store``
guard pattern that production callers use.

2. **Correctness regression guards** ensuring the perf fix (hash-set +
lower-cache indexes) doesn't break the existing semantics already pinned
by ``test_knowledge_memory.py``. Specifically: the ``_entries`` list MUST
still grow on every ``store()`` even when ``content_hash`` collides with
a previously-stored entry — the existing
``test_store_appends_to_entries`` contract requires append-on-every-store,
not dedup-on-hash.

The fix Warrior is constrained to:

- Add a ``_hashes: set[str]`` index for O(1) ``exists()``.
- Cache ``content.lower()`` at insert (keyed by ``entry_id``) so ``query()``
doesn't re-allocate per call.
- PRESERVE ``_entries: list[VaultEntry]`` as the public attribute — the
innovative-lens conformance test in ``test_knowledge_memory.py`` asserts
``backend._entries == []`` and ``isinstance(backend._entries, list)``
directly.
- Update the class docstring to acknowledge the production-default usage
(acceptance criterion #3 in the ticket).
"""

from __future__ import annotations

import time

from bonfire.knowledge.memory import InMemoryVaultBackend
from bonfire.protocols import VaultEntry


class TestIngestScalesLinearly:
"""Acceptance criterion #4 — ingest of 10k entries completes in < 1 s.

Models the production-default ingest path: every entry is ``exists()``-
checked before ``store()`` (mirroring ``KnowledgeIngestConsumer._store``,
``knowledge.ingest.ingest_*``, and ``TechScanner.scan_and_store`` — the
four callers the ticket enumerates).
"""

async def test_10k_entry_ingest_under_one_second(self) -> None:
"""Pre-fix: ~10s+ (quadratic). Post-fix: well under 1s (linear).

The wall-clock cap is generous on purpose — the post-fix shape is
dominated by Python attribute access + Pydantic instantiation, NOT
by the dedup index. A 10x margin over expected wall-clock time
keeps the test stable across CPUs and CI runners.
"""
backend = InMemoryVaultBackend()
n = 10_000

# Pre-construct entries so the loop measures *backend operations*, not
# Pydantic instantiation. Use distinct content + distinct hash so
# exists() always returns False and store() always fires (worst case
# for the quadratic shape).
entries = [
VaultEntry(
content=f"entry content body {i}",
entry_type="code_chunk",
content_hash=f"hash-{i:08d}",
)
for i in range(n)
]

start = time.perf_counter()
for entry in entries:
# Production guard pattern: exists-check then store-on-miss.
if not await backend.exists(entry.content_hash):
await backend.store(entry)
elapsed = time.perf_counter() - start

# Confirm we actually stored everything (sanity guard against the
# exists/store contract drifting).
assert len(backend._entries) == n

# Linear cap: the post-fix shape is O(N); quadratic pre-fix would
# take ~10s+ for N=10_000 on this CPU.
assert elapsed < 1.0, (
f"10k-entry ingest took {elapsed:.3f}s — expected sub-second under "
"the linear-time fix; the pre-fix quadratic shape takes 10s+"
)

async def test_query_with_large_vault_does_not_allocate_per_entry_lower(
self,
) -> None:
"""Pre-fix: query() does e.content.lower() per entry per call.

Post-fix: lowercase is cached at insert time (keyed by entry_id).
Verifies the fix by timing 100 queries over a 5k-entry vault — pre-
fix this is ~5s+; post-fix it should be sub-second.
"""
backend = InMemoryVaultBackend()
n = 5_000
for i in range(n):
await backend.store(
VaultEntry(
content=f"content body number {i} alpha beta gamma",
entry_type="code_chunk",
content_hash=f"hash-{i:08d}",
)
)

start = time.perf_counter()
for _ in range(100):
results = await backend.query("alpha beta gamma", limit=10)
assert len(results) == 10
elapsed = time.perf_counter() - start

assert elapsed < 2.0, (
f"100 queries over 5k-entry vault took {elapsed:.3f}s — expected "
"sub-second with lower-cache; pre-fix per-entry .lower() takes 5s+"
)


class TestPerfFixPreservesCorrectness:
"""Regression guards — the perf fix must NOT break existing semantics."""

async def test_entries_list_grows_on_every_store_even_with_hash_collision(
self,
) -> None:
"""``_entries`` list appends on every store, regardless of hash dedup.

Test contract from ``test_knowledge_memory.py::test_store_appends_to_entries``:
a single store yields ``len(_entries) == 1``. By extension, two stores
yield ``len(_entries) == 2`` even when both share a content_hash —
the in-memory backend does NOT dedup on store; only ``exists()`` is
the contract for "have we seen this hash."
"""
backend = InMemoryVaultBackend()
e1 = VaultEntry(content="same body", entry_type="code_chunk", content_hash="dup-hash")
e2 = VaultEntry(content="same body", entry_type="code_chunk", content_hash="dup-hash")
await backend.store(e1)
await backend.store(e2)
assert len(backend._entries) == 2, (
"perf fix must not silently dedup on store — _entries grows on every store"
)

async def test_exists_returns_true_for_any_stored_hash(self) -> None:
"""O(1) exists check returns True for hashes from any prior store."""
backend = InMemoryVaultBackend()
for i in range(50):
await backend.store(
VaultEntry(
content=f"c{i}",
entry_type="code_chunk",
content_hash=f"h-{i}",
)
)
for i in range(50):
assert await backend.exists(f"h-{i}") is True
assert await backend.exists("h-not-stored") is False

async def test_query_lower_cache_handles_uppercase_content(self) -> None:
"""Cached lowercase must match query.lower() — Unicode-safe."""
backend = InMemoryVaultBackend()
await backend.store(
VaultEntry(
content="ALPHA Beta Gamma",
entry_type="code_chunk",
content_hash="upper-1",
)
)
# Query is lowercase — must still find the uppercase-stored entry.
results = await backend.query("alpha beta")
assert len(results) == 1
assert results[0].content == "ALPHA Beta Gamma"
Loading