diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a860452..5a0d025 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -364,7 +364,7 @@ jobs: echo "::endgroup::" echo "::group::build unittest binary for android" - make build/unittest ${{ matrix.make }} SQLITE_AMALGAM=${SQLITE_DIR}/sqlite3.c + make build/unittest ${{ matrix.make }} SQLITE_AMALGAM=${SQLITE_DIR}/sqlite3.c DEFINES="-DTEST_SQLITE_EXTENSION" echo "::endgroup::" echo "::group::build e2e binary for android" @@ -406,12 +406,12 @@ jobs: - name: unix test sqlite-memory if: matrix.skip_test != true && matrix.os != 'windows-2022' && matrix.name != 'android' - run: ${{ matrix.name == 'linux-musl' && matrix.arch == 'arm64' && 'docker exec alpine' || '' }} make test ${{ matrix.make && matrix.make || ''}} + run: ${{ matrix.name == 'linux-musl' && matrix.arch == 'arm64' && 'docker exec alpine' || '' }} make test ${{ matrix.make && matrix.make || ''}} DEFINES="-DTEST_SQLITE_EXTENSION" - name: windows test sqlite-memory if: matrix.skip_test != true && matrix.name == 'windows' shell: msys2 {0} - run: make test ${{ matrix.make && matrix.make || ''}} + run: make test ${{ matrix.make && matrix.make || ''}} DEFINES="-DTEST_SQLITE_EXTENSION" - name: unix e2e sqlite-memory if: matrix.skip_test != true && matrix.variant != 'local' && matrix.os != 'windows-2022' && matrix.name != 'android' diff --git a/API.md b/API.md index e77760f..694e16d 100644 --- a/API.md +++ b/API.md @@ -564,8 +564,8 @@ typedef struct { **`dbmem_embedding_result_t` struct:** ```c typedef struct { - int n_tokens; // Number of tokens processed - int n_tokens_truncated; // Tokens that were truncated (0 if none) + int n_tokens; // Number of processed tokens (0 if unknown) + bool truncated; // True when the input was truncated before embedding int n_embd; // Embedding dimension float *embedding; // Embedding vector (engine-owned, valid until next call or free) } dbmem_embedding_result_t; @@ -574,6 +574,7 @@ typedef struct { **Notes:** - Works regardless of `DBMEM_OMIT_LOCAL_ENGINE` / `DBMEM_OMIT_REMOTE_ENGINE` compile flags - The `embedding` buffer in `dbmem_embedding_result_t` must remain valid until the next `compute` call or `free` — it is engine-owned, not copied by the caller +- `n_tokens` is metadata about the processed input when the engine can provide it; `truncated` is a boolean flag, not a truncated-token count - Only one custom provider can be registered per connection at a time; registering again replaces the previous one - The provider struct is copied by value; the caller does not need to keep it alive after registration @@ -596,7 +597,7 @@ static int my_compute(void *engine, const char *text, int text_len, void *xdata, // ... fill vec with your embedding ... result->n_embd = e->dimension; result->n_tokens = text_len / 4; - result->n_tokens_truncated = 0; + result->truncated = false; result->embedding = vec; return 0; } @@ -769,6 +770,21 @@ FROM dbmem_content WHERE last_accessed > 0 ORDER BY last_accessed DESC LIMIT 10; + +-- Tokens consumed and truncation per context +-- (n_tokens / truncated were added in schema version 2) +SELECT + COALESCE(c.context, '(none)') as context, + SUM(v.n_tokens) as tokens_processed, + SUM(v.truncated) as truncated_chunks +FROM dbmem_vault v +JOIN dbmem_content c ON c.hash = v.hash +GROUP BY c.context; + +-- Chunks that the embedding model truncated on input +SELECT hash, seq, length, n_tokens +FROM dbmem_vault +WHERE truncated = 1; ``` --- diff --git a/Makefile b/Makefile index c3fb299..1e5ad22 100644 --- a/Makefile +++ b/Makefile @@ -561,7 +561,7 @@ ifeq ($(PLATFORM),windows) else unzip -o $(CURL_ZIP) -d $(CURL_DIR)/src/. endif - cd $(CURL_SRC) && ./configure \ + cd $(CURL_SRC) && env -u LDFLAGS -u CPPFLAGS -u CFLAGS -u LIBS ./configure \ --without-libpsl \ --disable-alt-svc \ --disable-ares \ diff --git a/src/dbmem-embed.h b/src/dbmem-embed.h index 0ddc66d..15c1c1a 100644 --- a/src/dbmem-embed.h +++ b/src/dbmem-embed.h @@ -17,7 +17,7 @@ typedef struct dbmem_remote_engine_t dbmem_remote_engine_t; // Embedding result structure (always one embedding per call) typedef struct { int n_tokens; // Number of tokens processed - int n_tokens_truncated; // Number of tokens truncated (0 if none) + bool truncated; // True when the input was truncated before embedding int n_embd; // Embedding dimension float *embedding; // Pointer to embedding (points to engine's buffer, do not free) } embedding_result_t; diff --git a/src/dbmem-lembed.c b/src/dbmem-lembed.c index ce0f0e1..e3c842f 100644 --- a/src/dbmem-lembed.c +++ b/src/dbmem-lembed.c @@ -223,9 +223,9 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex } // Handle token overflow: truncate to max context size - int n_tokens_truncated = 0; + bool truncated = false; if (n_tokens > engine->n_ctx) { - n_tokens_truncated = n_tokens - engine->n_ctx; + truncated = true; n_tokens = engine->n_ctx; } @@ -275,7 +275,7 @@ int dbmem_local_compute_embedding (dbmem_local_engine_t *engine, const char *tex // Fill result result->n_tokens = n_tokens; - result->n_tokens_truncated = n_tokens_truncated; + result->truncated = truncated; result->n_embd = engine->n_embd; result->embedding = engine->embedding; diff --git a/src/dbmem-rembed.c b/src/dbmem-rembed.c index eb36d69..8432c0e 100644 --- a/src/dbmem-rembed.c +++ b/src/dbmem-rembed.c @@ -210,6 +210,62 @@ static int set_json_error_message (dbmem_remote_engine_t *engine) { return -1; } +static int dbmem_json_skip_token (const jsmntok_t *tokens, int index) { + int next = index + 1; + + if (tokens[index].type == JSMN_ARRAY) { + for (int i = 0; i < tokens[index].size; i++) { + next = dbmem_json_skip_token(tokens, next); + } + return next; + } + + if (tokens[index].type == JSMN_OBJECT) { + for (int i = 0; i < tokens[index].size; i++) { + next += 1; // skip key token + next = dbmem_json_skip_token(tokens, next); + } + return next; + } + + return next; +} + +static bool dbmem_json_token_equals (const char *json, const jsmntok_t *token, const char *text) { + size_t len = strlen(text); + size_t token_len = (size_t)(token->end - token->start); + return token_len == len && memcmp(json + token->start, text, len) == 0; +} + +static int dbmem_json_object_find (const char *json, const jsmntok_t *tokens, int object_index, const char *key) { + if (object_index < 0 || tokens[object_index].type != JSMN_OBJECT) return -1; + + int index = object_index + 1; + for (int i = 0; i < tokens[object_index].size; i++) { + int key_index = index; + int value_index = key_index + 1; + + if (tokens[key_index].type != JSMN_STRING) return -1; + if (dbmem_json_token_equals(json, &tokens[key_index], key)) return value_index; + + index = dbmem_json_skip_token(tokens, value_index); + } + + return -1; +} + +static bool dbmem_json_parse_bool (const char *json, const jsmntok_t *token) { + size_t len = (size_t)(token->end - token->start); + return token->type == JSMN_PRIMITIVE && len == 4 && memcmp(json + token->start, "true", 4) == 0; +} + +#if ENABLE_DBMEM_DEBUG_EMBEDDING +static void dbmem_remote_debug_log_response(dbmem_remote_engine_t *engine, long http_code) { + const char *response = engine->data ? engine->data : ""; + DEBUG_DBMEM_ALWAYS("[dbmem-rembed] vectors.space response (HTTP %ld): %s", http_code, response); +} +#endif + // MARK: - dbmem_remote_engine_t *dbmem_remote_engine_init (void *ctx, const char *provider, const char *model, char err_msg[DBMEM_ERRBUF_SIZE]) { @@ -450,6 +506,10 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t sqlite3_free(response_data); #endif +#if ENABLE_DBMEM_DEBUG_EMBEDDING + dbmem_remote_debug_log_response(engine, http_code); +#endif + if (http_code != 200) { return set_json_error_message(engine); } @@ -482,27 +542,65 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t int n_embd = 0; int prompt_tokens = 0; int estimated_prompt_tokens = 0; + int exact_prompt_tokens = 0; + bool truncated = false; int emb_start = -1; size_t emb_count = 0; - for (int i = 0; i < ntokens - 1; i++) { - if (tokens[i].type != JSMN_STRING) continue; - int klen = tokens[i].end - tokens[i].start; - const char *key = engine->data + tokens[i].start; - - if (klen == 9 && memcmp(key, "embedding", 9) == 0 && tokens[i + 1].type == JSMN_ARRAY) { - if (tokens[i + 1].size <= 0) { - dbmem_context_set_error(engine->context, "Invalid embedding array size in API response"); - return -1; - } - emb_count = (size_t)tokens[i + 1].size; - emb_start = i + 2; - } else if (klen == 16 && memcmp(key, "output_dimension", 16) == 0) { - n_embd = atoi(engine->data + tokens[i + 1].start); - } else if (klen == 13 && memcmp(key, "prompt_tokens", 13) == 0 && tokens[i + 1].type == JSMN_PRIMITIVE) { - prompt_tokens = atoi(engine->data + tokens[i + 1].start); - } else if (klen == 23 && memcmp(key, "estimated_prompt_tokens", 23) == 0) { - estimated_prompt_tokens = atoi(engine->data + tokens[i + 1].start); + if (tokens[0].type != JSMN_OBJECT) { + dbmem_context_set_error(engine->context, "Invalid API response shape"); + return -1; + } + + int output_dimension_index = dbmem_json_object_find(engine->data, tokens, 0, "output_dimension"); + if (output_dimension_index >= 0 && tokens[output_dimension_index].type == JSMN_PRIMITIVE) { + n_embd = atoi(engine->data + tokens[output_dimension_index].start); + } + + int data_index = dbmem_json_object_find(engine->data, tokens, 0, "data"); + if (data_index < 0 || tokens[data_index].type != JSMN_ARRAY || tokens[data_index].size <= 0) { + dbmem_context_set_error(engine->context, "Missing embedding data in API response"); + return -1; + } + + int item_index = data_index + 1; + if (tokens[item_index].type != JSMN_OBJECT) { + dbmem_context_set_error(engine->context, "Invalid embedding item in API response"); + return -1; + } + + int embedding_index = dbmem_json_object_find(engine->data, tokens, item_index, "embedding"); + if (embedding_index < 0 || tokens[embedding_index].type != JSMN_ARRAY) { + dbmem_context_set_error(engine->context, "Missing embedding data in API response"); + return -1; + } + if (tokens[embedding_index].size <= 0) { + dbmem_context_set_error(engine->context, "Invalid embedding array size in API response"); + return -1; + } + emb_count = (size_t)tokens[embedding_index].size; + emb_start = embedding_index + 1; + + int truncated_index = dbmem_json_object_find(engine->data, tokens, item_index, "truncated"); + if (truncated_index >= 0) { + truncated = dbmem_json_parse_bool(engine->data, &tokens[truncated_index]); + } + + int usage_index = dbmem_json_object_find(engine->data, tokens, 0, "usage"); + if (usage_index >= 0 && tokens[usage_index].type == JSMN_OBJECT) { + int prompt_tokens_index = dbmem_json_object_find(engine->data, tokens, usage_index, "prompt_tokens"); + if (prompt_tokens_index >= 0 && tokens[prompt_tokens_index].type == JSMN_PRIMITIVE) { + prompt_tokens = atoi(engine->data + tokens[prompt_tokens_index].start); + } + + int exact_prompt_tokens_index = dbmem_json_object_find(engine->data, tokens, usage_index, "exact_prompt_tokens"); + if (exact_prompt_tokens_index >= 0 && tokens[exact_prompt_tokens_index].type == JSMN_PRIMITIVE) { + exact_prompt_tokens = atoi(engine->data + tokens[exact_prompt_tokens_index].start); + } + + int estimated_prompt_tokens_index = dbmem_json_object_find(engine->data, tokens, usage_index, "estimated_prompt_tokens"); + if (estimated_prompt_tokens_index >= 0 && tokens[estimated_prompt_tokens_index].type == JSMN_PRIMITIVE) { + estimated_prompt_tokens = atoi(engine->data + tokens[estimated_prompt_tokens_index].start); } } @@ -534,12 +632,12 @@ int dbmem_remote_compute_embedding (dbmem_remote_engine_t *engine, const char *t // Fill result result->n_embd = n_embd; - result->n_tokens = prompt_tokens; - result->n_tokens_truncated = (estimated_prompt_tokens > prompt_tokens) ? estimated_prompt_tokens - prompt_tokens : 0; + result->n_tokens = exact_prompt_tokens > 0 ? exact_prompt_tokens : (estimated_prompt_tokens > 0 ? estimated_prompt_tokens : prompt_tokens); + result->truncated = truncated; result->embedding = engine->embedding; // Update statistics - engine->total_tokens_processed += prompt_tokens; + engine->total_tokens_processed += result->n_tokens; engine->total_embeddings_generated++; return 0; diff --git a/src/sqlite-memory.c b/src/sqlite-memory.c index f8708c8..5663b97 100644 --- a/src/sqlite-memory.c +++ b/src/sqlite-memory.c @@ -60,6 +60,9 @@ SQLITE_EXTENSION_INIT1 #define DBMEM_SETTINGS_KEY_EMBEDDING_CACHE "embedding_cache" #define DBMEM_SETTINGS_KEY_CACHE_MAX_ENTRIES "cache_max_entries" #define DBMEM_SETTINGS_KEY_SEARCH_OVERSAMPLE "search_oversample" +#define DBMEM_SETTINGS_KEY_SCHEMA_VERSION "schema_version" + +#define DBMEM_SCHEMA_VERSION 2 // default values from https://docs.openclaw.ai/concepts/memory #define DEFAULT_CHARS_PER_TOKEN 4 // Approximate number of characters per token (GPT ≈ 4, Claude ≈ 3.5) @@ -358,6 +361,105 @@ void dbmem_settings_load (sqlite3 *db, dbmem_context *ctx) { // MARK: - Database - +static bool dbmem_database_column_exists (sqlite3 *db, const char *table, const char *column, int *out_rc) { + char sql[256]; + snprintf(sql, sizeof(sql), "PRAGMA table_info(%s);", table); + + sqlite3_stmt *vm = NULL; + int rc = sqlite3_prepare_v2(db, sql, -1, &vm, NULL); + if (rc != SQLITE_OK) { + if (out_rc) *out_rc = rc; + return false; + } + + bool exists = false; + while ((rc = sqlite3_step(vm)) == SQLITE_ROW) { + const char *name = (const char *)sqlite3_column_text(vm, 1); + if (name && strcmp(name, column) == 0) { + exists = true; + break; + } + } + + if (rc == SQLITE_DONE || rc == SQLITE_ROW) rc = SQLITE_OK; + sqlite3_finalize(vm); + if (out_rc) *out_rc = rc; + return exists; +} + +static int dbmem_database_add_column_if_missing (sqlite3 *db, const char *table, const char *column, const char *alter_sql) { + int rc = SQLITE_OK; + if (dbmem_database_column_exists(db, table, column, &rc)) return SQLITE_OK; + if (rc != SQLITE_OK) return rc; + return sqlite3_exec(db, alter_sql, NULL, NULL, NULL); +} + +static int dbmem_database_schema_version (sqlite3 *db, int *version) { + static const char *sql = "SELECT value FROM dbmem_settings WHERE key=?1 LIMIT 1;"; + + *version = 0; + + sqlite3_stmt *vm = NULL; + int rc = sqlite3_prepare_v2(db, sql, -1, &vm, NULL); + if (rc != SQLITE_OK) goto cleanup; + + rc = sqlite3_bind_text(vm, 1, DBMEM_SETTINGS_KEY_SCHEMA_VERSION, -1, SQLITE_STATIC); + if (rc != SQLITE_OK) goto cleanup; + + rc = sqlite3_step(vm); + if (rc == SQLITE_ROW) { + *version = sqlite3_column_int(vm, 0); + rc = SQLITE_OK; + } else if (rc == SQLITE_DONE) { + rc = SQLITE_OK; + } + +cleanup: + if (vm) sqlite3_finalize(vm); + return rc; +} + +static int dbmem_database_set_schema_version (sqlite3 *db, int version) { + return dbmem_settings_write_int(db, DBMEM_SETTINGS_KEY_SCHEMA_VERSION, version); +} + +static int dbmem_database_migrate_v1_to_v2 (sqlite3 *db) { + int rc = dbmem_database_add_column_if_missing(db, "dbmem_vault", "n_tokens", + "ALTER TABLE dbmem_vault ADD COLUMN n_tokens INTEGER NOT NULL DEFAULT 0;"); + if (rc != SQLITE_OK) return rc; + + rc = dbmem_database_add_column_if_missing(db, "dbmem_vault", "truncated", + "ALTER TABLE dbmem_vault ADD COLUMN truncated INTEGER NOT NULL DEFAULT 0;"); + if (rc != SQLITE_OK) return rc; + + rc = dbmem_database_add_column_if_missing(db, "dbmem_cache", "n_tokens", + "ALTER TABLE dbmem_cache ADD COLUMN n_tokens INTEGER NOT NULL DEFAULT 0;"); + if (rc != SQLITE_OK) return rc; + + return dbmem_database_add_column_if_missing(db, "dbmem_cache", "truncated", + "ALTER TABLE dbmem_cache ADD COLUMN truncated INTEGER NOT NULL DEFAULT 0;"); +} + +static int dbmem_database_migrate (sqlite3 *db) { + int version = 0; + int rc = dbmem_database_schema_version(db, &version); + if (rc != SQLITE_OK) return rc; + + if (version > DBMEM_SCHEMA_VERSION) return SQLITE_MISMATCH; + if (version <= 0) version = 1; + + if (version < 2) { + rc = dbmem_database_migrate_v1_to_v2(db); + if (rc != SQLITE_OK) return rc; + version = 2; + rc = dbmem_database_set_schema_version(db, version); + if (rc != SQLITE_OK) return rc; + } + + if (version != DBMEM_SCHEMA_VERSION) return SQLITE_MISMATCH; + return SQLITE_OK; +} + static int dbmem_database_init (sqlite3 *db) { const char *sql = "CREATE TABLE IF NOT EXISTS dbmem_settings (key TEXT PRIMARY KEY, value TEXT);"; int rc = sqlite3_exec(db, sql, NULL, NULL, NULL); @@ -367,14 +469,17 @@ static int dbmem_database_init (sqlite3 *db) { rc = sqlite3_exec(db, sql, NULL, NULL, NULL); if (rc != SQLITE_OK) return rc; - sql = "CREATE TABLE IF NOT EXISTS dbmem_vault (hash TEXT NOT NULL, seq INTEGER NOT NULL, embedding BLOB NOT NULL, offset INTEGER NOT NULL, length INTEGER NOT NULL, PRIMARY KEY (hash, seq));"; + sql = "CREATE TABLE IF NOT EXISTS dbmem_vault (hash TEXT NOT NULL, seq INTEGER NOT NULL, embedding BLOB NOT NULL, offset INTEGER NOT NULL, length INTEGER NOT NULL, n_tokens INTEGER NOT NULL DEFAULT 0, truncated INTEGER NOT NULL DEFAULT 0, PRIMARY KEY (hash, seq));"; rc = sqlite3_exec(db, sql, NULL, NULL, NULL); if (rc != SQLITE_OK) return rc; - sql = "CREATE TABLE IF NOT EXISTS dbmem_cache (text_hash TEXT NOT NULL, provider TEXT NOT NULL, model TEXT NOT NULL, embedding BLOB NOT NULL, dimension INTEGER NOT NULL, PRIMARY KEY (text_hash, provider, model));"; + sql = "CREATE TABLE IF NOT EXISTS dbmem_cache (text_hash TEXT NOT NULL, provider TEXT NOT NULL, model TEXT NOT NULL, embedding BLOB NOT NULL, dimension INTEGER NOT NULL, n_tokens INTEGER NOT NULL DEFAULT 0, truncated INTEGER NOT NULL DEFAULT 0, PRIMARY KEY (text_hash, provider, model));"; rc = sqlite3_exec(db, sql, NULL, NULL, NULL); if (rc != SQLITE_OK) return rc; + rc = dbmem_database_migrate(db); + if (rc != SQLITE_OK) return rc; + sql = "CREATE VIRTUAL TABLE IF NOT EXISTS dbmem_vault_fts USING fts5 (content, hash UNINDEXED, seq UNINDEXED, context UNINDEXED);"; rc = sqlite3_exec(db, sql, NULL, NULL, NULL); if (rc != SQLITE_OK) { @@ -495,7 +600,7 @@ static int dbmem_database_add_entry (dbmem_context *ctx, sqlite3 *db, uint64_t h } static int dbmem_database_add_chunk (dbmem_context *ctx, embedding_result_t *result, size_t offset, size_t length, size_t index) { - static const char *sql = "INSERT INTO dbmem_vault (hash, seq, embedding, offset, length) VALUES (?1, ?2, ?3, ?4, ?5);"; + static const char *sql = "INSERT INTO dbmem_vault (hash, seq, embedding, offset, length, n_tokens, truncated) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7);"; sqlite3_stmt *vm = NULL; int rc = sqlite3_prepare_v2(ctx->db, sql, -1, &vm, NULL); @@ -515,6 +620,12 @@ static int dbmem_database_add_chunk (dbmem_context *ctx, embedding_result_t *res rc = sqlite3_bind_int64(vm, 5, (sqlite3_int64)length); if (rc != SQLITE_OK) goto cleanup; + + rc = sqlite3_bind_int(vm, 6, result->n_tokens); + if (rc != SQLITE_OK) goto cleanup; + + rc = sqlite3_bind_int(vm, 7, result->truncated ? 1 : 0); + if (rc != SQLITE_OK) goto cleanup; rc = sqlite3_step(vm); if (rc == SQLITE_DONE) rc = SQLITE_OK; @@ -642,7 +753,7 @@ int dbmem_context_custom_compute (dbmem_context *ctx, const char *text, int text int rc = ctx->custom_provider.compute(ctx->custom_engine, text, text_len, ctx->custom_provider.xdata, &cr); if (rc != 0) return rc; result->n_tokens = cr.n_tokens; - result->n_tokens_truncated = cr.n_tokens_truncated; + result->truncated = cr.truncated; result->n_embd = cr.n_embd; result->embedding = cr.embedding; return 0; @@ -1249,7 +1360,7 @@ static void dbmem_get_option (sqlite3_context *context, int argc, sqlite3_value static void dbmem_dump_embeding (const embedding_result_t *result) { printf("{\n"); printf(" \"n_tokens\": %d,\n", result->n_tokens); - printf(" \"n_tokens_truncated\": %d,\n", result->n_tokens_truncated); + printf(" \"truncated\": %s,\n", result->truncated ? "true" : "false"); printf(" \"n_embd\": %d,\n", result->n_embd); printf(" \"embedding\": ["); @@ -1267,7 +1378,7 @@ static void dbmem_dump_embeding (const embedding_result_t *result) { // MARK: - Embedding Cache - static bool dbmem_cache_lookup (dbmem_context *ctx, uint64_t text_hash, embedding_result_t *result) { - static const char *sql = "SELECT embedding, dimension FROM dbmem_cache WHERE text_hash=?1 AND provider=?2 AND model=?3 LIMIT 1;"; + static const char *sql = "SELECT embedding, dimension, n_tokens, truncated FROM dbmem_cache WHERE text_hash=?1 AND provider=?2 AND model=?3 LIMIT 1;"; if (!ctx->provider || !ctx->model) return false; @@ -1300,8 +1411,8 @@ static bool dbmem_cache_lookup (dbmem_context *ctx, uint64_t text_hash, embeddin memcpy(ctx->cache_buffer, blob, blob_bytes); result->embedding = ctx->cache_buffer; result->n_embd = dimension; - result->n_tokens = 0; - result->n_tokens_truncated = 0; + result->n_tokens = sqlite3_column_int(vm, 2); + result->truncated = sqlite3_column_int(vm, 3) != 0; found = true; cleanup: @@ -1337,7 +1448,7 @@ static void dbmem_cache_evict (dbmem_context *ctx) { } static void dbmem_cache_store (dbmem_context *ctx, uint64_t text_hash, const embedding_result_t *result) { - static const char *sql = "INSERT OR REPLACE INTO dbmem_cache (text_hash, provider, model, embedding, dimension) VALUES (?1, ?2, ?3, ?4, ?5);"; + static const char *sql = "INSERT OR REPLACE INTO dbmem_cache (text_hash, provider, model, embedding, dimension, n_tokens, truncated) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7);"; if (!ctx->provider || !ctx->model) return; @@ -1350,6 +1461,8 @@ static void dbmem_cache_store (dbmem_context *ctx, uint64_t text_hash, const emb sqlite3_bind_text(vm, 3, ctx->model, -1, SQLITE_STATIC); sqlite3_bind_blob(vm, 4, result->embedding, result->n_embd * (int)sizeof(float), SQLITE_STATIC); sqlite3_bind_int(vm, 5, result->n_embd); + sqlite3_bind_int(vm, 6, result->n_tokens); + sqlite3_bind_int(vm, 7, result->truncated ? 1 : 0); sqlite3_step(vm); diff --git a/src/sqlite-memory.h b/src/sqlite-memory.h index b8059bd..77e17e9 100644 --- a/src/sqlite-memory.h +++ b/src/sqlite-memory.h @@ -26,7 +26,7 @@ extern "C" { #endif -#define SQLITE_DBMEMORY_VERSION "1.1.0" +#define SQLITE_DBMEMORY_VERSION "1.2.0" // public API SQLITE_DBMEMORY_API int sqlite3_memory_init (sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi); @@ -39,7 +39,7 @@ typedef struct dbmem_context dbmem_context; typedef struct { int n_tokens; - int n_tokens_truncated; + bool truncated; int n_embd; float *embedding; // Engine-owned buffer, valid until next call or free } dbmem_embedding_result_t; diff --git a/test/e2e.c b/test/e2e.c index 41d97ed..cffbbe6 100644 --- a/test/e2e.c +++ b/test/e2e.c @@ -58,12 +58,13 @@ static int tests_failed = 0; #define TEST(name) static void test_##name(void) #define RUN_TEST(name) do { \ + int _failed_before = tests_failed; \ printf(" Running %s... ", #name); \ fflush(stdout); \ test_##name(); \ tests_run++; \ tests_passed++; \ - printf("PASSED\n"); \ + if (tests_failed == _failed_before) printf("PASSED\n"); \ } while(0) #define ASSERT(cond) do { \ @@ -120,6 +121,33 @@ static void create_test_file(const char *path, const char *content) { } } +static int get_vault_metadata(const char *hash, int *chunk_count, int *min_tokens, int *min_truncated, int *max_truncated) { + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, + "SELECT COUNT(*), COALESCE(MIN(n_tokens), 0), COALESCE(MIN(truncated), 0), COALESCE(MAX(truncated), 0) " + "FROM dbmem_vault WHERE hash = ?1;", + -1, &stmt, NULL); + if (rc != SQLITE_OK) return rc; + + rc = sqlite3_bind_text(stmt, 1, hash, -1, SQLITE_STATIC); + if (rc != SQLITE_OK) { + sqlite3_finalize(stmt); + return rc; + } + + rc = sqlite3_step(stmt); + if (rc == SQLITE_ROW) { + if (chunk_count) *chunk_count = sqlite3_column_int(stmt, 0); + if (min_tokens) *min_tokens = sqlite3_column_int(stmt, 1); + if (min_truncated) *min_truncated = sqlite3_column_int(stmt, 2); + if (max_truncated) *max_truncated = sqlite3_column_int(stmt, 3); + rc = SQLITE_OK; + } + + sqlite3_finalize(stmt); + return rc; +} + // ============================================================================ // Phase 1: Setup // ============================================================================ @@ -242,6 +270,24 @@ TEST(verify_embedding) { sqlite3_finalize(stmt); } +// Verify remote embedding metadata is persisted on the stored chunk. +TEST(verify_embedding_metadata) { + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, + "SELECT n_tokens, truncated FROM dbmem_vault LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + ASSERT(sqlite3_step(stmt) == SQLITE_ROW); + + int n_tokens = sqlite3_column_int(stmt, 0); + int truncated = sqlite3_column_int(stmt, 1); + sqlite3_finalize(stmt); + + ASSERT(n_tokens > 0); + ASSERT(truncated == 0); + printf("(n_tokens=%d, truncated=%d) ", n_tokens, truncated); +} + // memory_add_text with context (triggers remote embedding) TEST(memory_add_text_context) { ASSERT_SQL_OK(db, "SELECT memory_add_text('SQLite is a C-language library that implements a small, fast, self-contained SQL database engine.', 'test-context');"); @@ -424,6 +470,753 @@ TEST(memory_search_statement_reuse) { sqlite3_finalize(stmt); } +// ============================================================================ +// Phase 4b: Long-text chunking + multi-section retrieval +// ============================================================================ + +// A long text with 4 clearly distinct sections, each tagged with a unique +// anchor token so we can verify both (a) the chunker covers the whole text +// and (b) section-specific queries retrieve the matching chunk. +#define LONG_TEXT_ANCHOR_COOKING "ZANZIBAR-PASTA" +#define LONG_TEXT_ANCHOR_KERNEL "QUOKKA-SCHEDULER" +#define LONG_TEXT_ANCHOR_VIOLIN "TARANTELLA-BRIDGE" +#define LONG_TEXT_ANCHOR_ASTRO "BETELGEUSE-PARALLAX" + +static const char *LONG_TEXT = + // Section 1 - cooking + "Cooking pasta well begins with abundant salted water at a rolling boil. " + "The " LONG_TEXT_ANCHOR_COOKING " technique calls for finishing the noodles " + "directly in the sauce, ladling in starchy cooking water until the emulsion " + "clings to each strand. Timing matters more than the package suggests: pull " + "the pasta a minute early and let the residual heat do the rest. " + "Salt aggressively. Stir often. Reserve water before draining. Toss vigorously. " + "Salt aggressively. Stir often. Reserve water before draining. Toss vigorously. " + "\n\n" + // Section 2 - kernel scheduling + "Operating system schedulers balance throughput against latency under load. " + "The " LONG_TEXT_ANCHOR_KERNEL " design favors short interactive tasks by " + "boosting their effective priority for a brief window after a wakeup event, " + "then decaying that boost as CPU time accumulates. This avoids starving " + "background batch work while keeping UI threads responsive. " + "Run queues, vruntime, and load balancing across cores all interact here. " + "Run queues, vruntime, and load balancing across cores all interact here. " + "\n\n" + // Section 3 - violin + "A violin's tone depends as much on setup as on the maker. The " + LONG_TEXT_ANCHOR_VIOLIN " is shaped from well-aged maple and positioned to " + "transmit string vibration to the top plate without damping the upper " + "partials. Soundpost placement, tailgut tension, and bow rosin all subtly " + "shift the instrument's voice. " + "Maple, spruce, varnish, and time. Maple, spruce, varnish, and time. " + "\n\n" + // Section 4 - astronomy + "Measuring stellar distances requires careful baseline geometry. The " + LONG_TEXT_ANCHOR_ASTRO " measurement is challenging because the star is a " + "pulsating red supergiant whose photosphere is not well defined. Modern " + "interferometry combined with Gaia astrometry has narrowed the uncertainty " + "but not eliminated it. " + "Parallax, redshift, standard candles, distance ladder. " + "Parallax, redshift, standard candles, distance ladder. "; + +// Structural: long text produces multiple chunks that fully cover the input, +// every chunk has a valid embedding, and chunk offsets are well-formed. +TEST(memory_add_long_text_chunking) { + // Force raw-text chunking so the chunk count is determined by + // max_tokens/overlay_tokens, not by markdown structure. + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 1);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 80);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 16);"); + + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, "SELECT memory_add_text(?1, 'long-text');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, LONG_TEXT, -1, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + ASSERT(sqlite3_step(stmt) == SQLITE_ROW); + sqlite3_finalize(stmt); + + char hash[DBMEM_HASH_STR_MAXLEN] = {0}; + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'long-text' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(hash, sizeof(hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + ASSERT(strlen(hash) == DBMEM_HASH_HEX_LEN); + + char sql[256]; + snprintf(sql, sizeof(sql), + "SELECT COUNT(*) FROM dbmem_vault WHERE hash = '%s';", hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + int chunk_count = result_int; + ASSERT(chunk_count >= 3); + + snprintf(sql, sizeof(sql), + "SELECT seq, offset, length, embedding, n_tokens, truncated FROM dbmem_vault " + "WHERE hash = '%s' ORDER BY seq;", hash); + rc = sqlite3_prepare_v2(db, sql, -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + + int prev_seq = -1; + int prev_offset = -1; + int last_offset = 0, last_length = 0; + int seen = 0; + while (sqlite3_step(stmt) == SQLITE_ROW) { + int seq = sqlite3_column_int(stmt, 0); + int offset = sqlite3_column_int(stmt, 1); + int length = sqlite3_column_int(stmt, 2); + int bytes = sqlite3_column_bytes(stmt, 3); + int n_tokens = sqlite3_column_int(stmt, 4); + int truncated = sqlite3_column_int(stmt, 5); + + ASSERT(seq == prev_seq + 1); + ASSERT(offset >= prev_offset); + ASSERT(length > 0); + ASSERT(bytes == EXPECTED_DIMENSION * (int)sizeof(float)); + ASSERT(n_tokens > 0); + ASSERT(truncated == 0); + + prev_seq = seq; + prev_offset = offset; + last_offset = offset; + last_length = length; + seen++; + } + sqlite3_finalize(stmt); + ASSERT(seen == chunk_count); + + int total = (int)strlen(LONG_TEXT); + // Allow small tail slack for trailing-whitespace trimming by the parser. + ASSERT(last_offset + last_length >= total - 8); + + printf("(%d chunks covering %d bytes) ", chunk_count, total); + + // Restore defaults for downstream tests. + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 400);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 80);"); +} + +// Retrieval: each section is reachable by a query phrase from that section. +// Asserts on anchor-token presence in the top-3 snippets, not absolute +// ranking, so minor embedding drift will not flake the test. +TEST(memory_search_long_text_sections) { + struct { const char *query; const char *anchor; } cases[] = { + { "finishing pasta in the sauce with starchy water", LONG_TEXT_ANCHOR_COOKING }, + { "boosting interactive task priority after wakeup", LONG_TEXT_ANCHOR_KERNEL }, + { "soundpost placement and string vibration", LONG_TEXT_ANCHOR_VIOLIN }, + { "measuring stellar distance with parallax", LONG_TEXT_ANCHOR_ASTRO }, + }; + int n_cases = (int)(sizeof(cases) / sizeof(cases[0])); + + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.0);"); + + int matched = 0; + for (int i = 0; i < n_cases; i++) { + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, + "SELECT snippet FROM memory_search(?1, 3);", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, cases[i].query, -1, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + + int found = 0; + while (sqlite3_step(stmt) == SQLITE_ROW) { + const char *snippet = (const char *)sqlite3_column_text(stmt, 0); + if (snippet && strstr(snippet, cases[i].anchor)) { found = 1; break; } + } + sqlite3_finalize(stmt); + + if (!found) { + printf("FAILED\n Query '%s' did not retrieve anchor '%s' in top 3\n", + cases[i].query, cases[i].anchor); + tests_failed++; + tests_passed--; + return; + } + matched++; + } + + // Surface aggregate per-chunk metadata for the underlying long-text + // corpus (one row in dbmem_content, multiple chunks in dbmem_vault). + char long_text_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + sqlite3_stmt *hstmt = NULL; + int hrc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'long-text' LIMIT 1;", + -1, &hstmt, NULL); + int chunk_count = 0, min_tokens = 0, min_truncated = 0, max_truncated = 0; + if (hrc == SQLITE_OK && sqlite3_step(hstmt) == SQLITE_ROW) { + snprintf(long_text_hash, sizeof(long_text_hash), "%s", + (const char *)sqlite3_column_text(hstmt, 0)); + sqlite3_finalize(hstmt); + get_vault_metadata(long_text_hash, &chunk_count, &min_tokens, + &min_truncated, &max_truncated); + } else { + if (hstmt) sqlite3_finalize(hstmt); + } + + printf("(%d/%d sections retrieved; %d chunks min_n_tok=%d any_trunc=%d) ", + matched, n_cases, chunk_count, min_tokens, max_truncated); + + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.7);"); +} + +// ============================================================================ +// Phase 4c: Single-chunk near the provider token ceiling +// ============================================================================ + +// Control test for memory_search_truncation_signature below: same setup +// (single-chunk-everything, pure-vector ranking, leading-mosaics + tail- +// vents text alongside a short vent reference) but the long text is sized +// to land *under* vectors.space's 1024-token batch ceiling. Expectations: +// +// 1) The long chunk embeds successfully (no provider rejection). +// 2) Stored as exactly one chunk in dbmem_vault. +// 3) A tail-topic query retrieves both the short reference and the long +// chunk in the top-10 — confirming the tail was included in the +// embedding when the input fit in one batch. +// +// Sized at ~5200 bytes. Empirical calibration: 7159 / 9346 / 10075 bytes +// all rejected with the same "input (1026 tokens)" template (so "1026" is +// not a real count — just an "exceeded" sentinel). 7159 / 1024 ≈ 7.0 +// chars-per-token actual ratio for this filler, so 5200 bytes ≈ ~740 +// tokens — clear of the 1024 ceiling. +TEST(memory_search_under_token_limit) { + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 1);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 2048);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('vector_weight', 1.0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('text_weight', 0.0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.0);"); + + static const char *SHORT_REF = + "Hydrothermal vents on the deep ocean floor sustain chemosynthetic " + "microbial ecosystems independent of sunlight. Tubeworms and " + "thermophilic archaea metabolize sulfur compounds emitted by the " + "vent fluids in total darkness."; + + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, + "SELECT memory_add_text(?1, 'under-limit-short');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, SHORT_REF, -1, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + ASSERT(sqlite3_step(stmt) == SQLITE_ROW); + sqlite3_finalize(stmt); + + static const char *MOSAIC_LEAD = + "Andalusian zellige mosaics from medieval Granada and Cordoba feature " + "interlocking geometric tiles arranged in repeating decagonal motifs " + "of cobalt and ochre glaze. "; + static const char *MOSAIC_FILLER = + "Master craftsmen historically cut tesserae from glazed terracotta " + "and fit them into intricate patterns whose mathematical foundations " + "anticipate aperiodic tilings by centuries; pigments include lapis " + "lazuli, copper carbonate, and iron oxides. "; + static const char *VENT_TAIL = + " And entirely separately, deep ocean hydrothermal vents host " + "chemosynthetic communities of microbial mats, tubeworms, and " + "thermophilic archaea metabolizing sulfur compounds in total darkness."; + + size_t cap = 16 * 1024; + char *long_text = (char *)malloc(cap); + ASSERT(long_text != NULL); + size_t pos = 0; + int n = snprintf(long_text + pos, cap - pos, "%s", MOSAIC_LEAD); + pos += (size_t)n; + while (pos < 5000 + && pos + strlen(MOSAIC_FILLER) + strlen(VENT_TAIL) + 4 < cap) { + n = snprintf(long_text + pos, cap - pos, "%s", MOSAIC_FILLER); + if (n <= 0) break; + pos += (size_t)n; + } + n = snprintf(long_text + pos, cap - pos, "%s", VENT_TAIL); + pos += (size_t)n; + int long_text_len = (int)pos; + + rc = sqlite3_prepare_v2(db, + "SELECT memory_add_text(?1, 'under-limit-long');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, long_text, long_text_len, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + printf("FAILED\n memory_add_text(%d bytes) returned rc=%d\n sqlite error: %s\n", + long_text_len, rc, sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + free(long_text); + tests_failed++; + tests_passed--; + return; + } + sqlite3_finalize(stmt); + free(long_text); + + char short_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + char long_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'under-limit-short' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(short_hash, sizeof(short_hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'under-limit-long' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(long_hash, sizeof(long_hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + + // Single chunk, length around ~5KB but under the rejection threshold. + char sql[256]; + snprintf(sql, sizeof(sql), + "SELECT COUNT(*) FROM dbmem_vault WHERE hash = '%s';", long_hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + ASSERT(result_int == 1); + + snprintf(sql, sizeof(sql), + "SELECT length FROM dbmem_vault WHERE hash = '%s' LIMIT 1;", long_hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + int long_chunk_bytes = result_int; + ASSERT(long_chunk_bytes > 4500); + + int chunk_count = 0, min_tokens = 0, min_truncated = 0, max_truncated = 0; + int short_n_tokens = 0, short_truncated = 0; + int long_n_tokens = 0, long_truncated = 0; + + rc = get_vault_metadata(short_hash, &chunk_count, &min_tokens, &min_truncated, &max_truncated); + ASSERT(rc == SQLITE_OK); + ASSERT(chunk_count == 1); + ASSERT(min_tokens > 0); + ASSERT(min_truncated == 0 && max_truncated == 0); + short_n_tokens = min_tokens; + short_truncated = max_truncated; + + rc = get_vault_metadata(long_hash, &chunk_count, &min_tokens, &min_truncated, &max_truncated); + ASSERT(rc == SQLITE_OK); + ASSERT(chunk_count == 1); + ASSERT(min_tokens > 0); + ASSERT(min_truncated == 0 && max_truncated == 0); + long_n_tokens = min_tokens; + long_truncated = max_truncated; + + // Same query as the truncation test; with the full chunk embedded we + // expect both the short ref and the long chunk to surface in top-10. + rc = sqlite3_prepare_v2(db, + "SELECT hash, ranking FROM memory_search(" + " 'chemosynthesis around deep-sea volcanic vents', 10);", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + + int short_rank = -1, long_rank = -1; + double short_score = 0.0, long_score = 0.0; + int row = 0; + while (sqlite3_step(stmt) == SQLITE_ROW) { + const char *hash = (const char *)sqlite3_column_text(stmt, 0); + double rank = sqlite3_column_double(stmt, 1); + if (hash && strcmp(hash, short_hash) == 0) { + short_rank = row; short_score = rank; + } + if (hash && strcmp(hash, long_hash) == 0) { + long_rank = row; long_score = rank; + } + row++; + } + sqlite3_finalize(stmt); + + ASSERT(short_rank >= 0); + ASSERT(long_rank >= 0); + + printf("(short: n_tok=%d trunc=%d rank=%d score=%.3f; long: %d bytes n_tok=%d trunc=%d rank=%d score=%.3f) ", + short_n_tokens, short_truncated, short_rank, short_score, + long_chunk_bytes, long_n_tokens, long_truncated, long_rank, long_score); + + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 400);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 80);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('vector_weight', 0.6);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('text_weight', 0.4);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.7);"); +} + +// ============================================================================ +// Phase 4d: Model-level truncation behavioral signature +// ============================================================================ + +// When a single chunk exceeds the embedding model's input context window +// (embeddinggemma-300m: ~2048 tokens), the service truncates and returns an +// embedding that only represents the leading portion. The truncated flag is +// persisted on dbmem_vault, and this test also checks the observable search +// behavior: +// +// 1) Store a SHORT reference (fully embedded) entirely about topic T. +// 2) Store a LONG single-chunk document whose LEADING ~10KB is about an +// unrelated topic and whose final ~250 bytes (well past the 2048-token +// window) introduce topic T. +// 3) Search for topic T with pure-vector ranking. +// +// If the long chunk's embedding includes the tail, both should rank in the +// same neighborhood. If truncated, the long chunk's embedding only encodes +// the unrelated leading topic and ranks far below the short reference (or +// drops out of the top-K entirely). +TEST(memory_search_truncation_signature) { + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 1);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 3000);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('vector_weight', 1.0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('text_weight', 0.0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.0);"); + + // Short reference (~50 tokens), fully embedded, entirely about the topic. + // Trailing sentence differs per test so memory_add_text's content-hash + // idempotency doesn't collapse this insert into a no-op of an earlier + // test's identical SHORT_REF. + static const char *SHORT_REF = + "Hydrothermal vents on the deep ocean floor sustain chemosynthetic " + "microbial ecosystems independent of sunlight. Tubeworms and " + "thermophilic archaea metabolize sulfur compounds emitted by the " + "vent fluids in total darkness. Truncation-signature reference."; + + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, + "SELECT memory_add_text(?1, 'trunc-short');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, SHORT_REF, -1, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + ASSERT(sqlite3_step(stmt) == SQLITE_ROW); + sqlite3_finalize(stmt); + + // Build ~10KB single-chunk text: leading + filler about Andalusian + // mosaics, then a final ~250-byte tail introducing hydrothermal vents. + // ~10KB / ~4 chars-per-token ≈ 2500 tokens — past gemma's 2048 window. + static const char *MOSAIC_LEAD = + "Andalusian zellige mosaics from medieval Granada and Cordoba feature " + "interlocking geometric tiles arranged in repeating decagonal motifs " + "of cobalt and ochre glaze. "; + static const char *MOSAIC_FILLER = + "Master craftsmen historically cut tesserae from glazed terracotta " + "and fit them into intricate patterns whose mathematical foundations " + "anticipate aperiodic tilings by centuries; pigments include lapis " + "lazuli, copper carbonate, and iron oxides. "; + static const char *VENT_TAIL = + " And entirely separately, deep ocean hydrothermal vents host " + "chemosynthetic communities of microbial mats, tubeworms, and " + "thermophilic archaea metabolizing sulfur compounds in total darkness."; + + size_t cap = 16 * 1024; + char *long_text = (char *)malloc(cap); + ASSERT(long_text != NULL); + size_t pos = 0; + int n = snprintf(long_text + pos, cap - pos, "%s", MOSAIC_LEAD); + pos += (size_t)n; + while (pos < 9800 + && pos + strlen(MOSAIC_FILLER) + strlen(VENT_TAIL) + 4 < cap) { + n = snprintf(long_text + pos, cap - pos, "%s", MOSAIC_FILLER); + if (n <= 0) break; + pos += (size_t)n; + } + n = snprintf(long_text + pos, cap - pos, "%s", VENT_TAIL); + pos += (size_t)n; + int long_text_len = (int)pos; + + rc = sqlite3_prepare_v2(db, + "SELECT memory_add_text(?1, 'trunc-long');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, long_text, long_text_len, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + printf("FAILED\n memory_add_text(%d bytes) returned rc=%d\n sqlite error: %s\n", + long_text_len, rc, sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + free(long_text); + tests_failed++; + tests_passed--; + return; + } + sqlite3_finalize(stmt); + free(long_text); + + // Capture both hashes. + char short_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + char long_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'trunc-short' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(short_hash, sizeof(short_hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'trunc-long' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(long_hash, sizeof(long_hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + + // Confirm the long content stored as one chunk past gemma's window. + char sql[256]; + snprintf(sql, sizeof(sql), + "SELECT COUNT(*) FROM dbmem_vault WHERE hash = '%s';", long_hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + ASSERT(result_int == 1); + + snprintf(sql, sizeof(sql), + "SELECT length FROM dbmem_vault WHERE hash = '%s' LIMIT 1;", long_hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + int long_chunk_bytes = result_int; + // ~2048 tokens × ~4 chars/token = ~8192 chars; chunk must clearly exceed. + ASSERT(long_chunk_bytes > 9000); + + int chunk_count = 0, min_tokens = 0, min_truncated = 0, max_truncated = 0; + int short_n_tokens = 0, short_truncated = 0; + int long_n_tokens = 0, long_truncated = 0; + + rc = get_vault_metadata(short_hash, &chunk_count, &min_tokens, &min_truncated, &max_truncated); + ASSERT(rc == SQLITE_OK); + ASSERT(chunk_count == 1); + ASSERT(min_tokens > 0); + ASSERT(min_truncated == 0 && max_truncated == 0); + short_n_tokens = min_tokens; + short_truncated = max_truncated; + + rc = get_vault_metadata(long_hash, &chunk_count, &min_tokens, &min_truncated, &max_truncated); + ASSERT(rc == SQLITE_OK); + ASSERT(chunk_count == 1); + ASSERT(min_tokens > 0); + ASSERT(min_truncated == 1 && max_truncated == 1); + long_n_tokens = min_tokens; + long_truncated = max_truncated; + + // Query for the topic that appears throughout the short reference and + // only in the *tail* of the long chunk. Paraphrased so any residual FTS + // contribution would match both texts roughly equally. + rc = sqlite3_prepare_v2(db, + "SELECT hash, ranking FROM memory_search(" + " 'chemosynthesis around deep-sea volcanic vents', 10);", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + + int short_rank = -1, long_rank = -1; + double short_score = 0.0, long_score = 0.0; + int row = 0; + while (sqlite3_step(stmt) == SQLITE_ROW) { + const char *hash = (const char *)sqlite3_column_text(stmt, 0); + double rank = sqlite3_column_double(stmt, 1); + if (hash && strcmp(hash, short_hash) == 0) { + short_rank = row; short_score = rank; + } + if (hash && strcmp(hash, long_hash) == 0) { + long_rank = row; long_score = rank; + } + row++; + } + sqlite3_finalize(stmt); + + ASSERT(short_rank >= 0); + if (long_rank == -1) { + printf("(short: n_tok=%d trunc=%d rank=%d score=%.3f; long: %d bytes n_tok=%d trunc=%d absent from top-10) ", + short_n_tokens, short_truncated, short_rank, short_score, + long_chunk_bytes, long_n_tokens, long_truncated); + } else { + // With a fully-embedded long chunk we'd expect comparable rankings; + // truncation pushes the long chunk strictly below the short ref. + ASSERT(short_rank < long_rank); + printf("(short: n_tok=%d trunc=%d rank=%d score=%.3f; long: %d bytes n_tok=%d trunc=%d rank=%d score=%.3f) ", + short_n_tokens, short_truncated, short_rank, short_score, + long_chunk_bytes, long_n_tokens, long_truncated, long_rank, long_score); + } + + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 400);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 80);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('vector_weight', 0.6);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('text_weight', 0.4);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.7);"); +} + +// ============================================================================ +// Phase 4e: Truncation signature near the model context window (~2000 tok) +// ============================================================================ + +// Same shape as memory_search_truncation_signature, but with a long text +// sized at ~19500 bytes / ~9.8 chars-per-token ≈ ~1990 tokens — close to +// embeddinggemma-300m's documented 2048-token context window. Useful for +// observing how the provider behaves further past the 1024-token batch +// ceiling: same rejection error, a different message, or (if the batch +// size is raised on the server) a successful embed where truncation +// actually occurs. +TEST(memory_search_truncation_near_model_context) { + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 1);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 6000);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('vector_weight', 1.0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('text_weight', 0.0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.0);"); + + // Trailing sentence differs from the other tests' SHORT_REFs so the + // content-hash idempotency in memory_add_text doesn't collapse the insert. + static const char *SHORT_REF = + "Hydrothermal vents on the deep ocean floor sustain chemosynthetic " + "microbial ecosystems independent of sunlight. Tubeworms and " + "thermophilic archaea metabolize sulfur compounds emitted by the " + "vent fluids in total darkness. Near-context reference."; + + sqlite3_stmt *stmt = NULL; + int rc = sqlite3_prepare_v2(db, + "SELECT memory_add_text(?1, 'trunc-large-short');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, SHORT_REF, -1, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + ASSERT(sqlite3_step(stmt) == SQLITE_ROW); + sqlite3_finalize(stmt); + + static const char *MOSAIC_LEAD = + "Andalusian zellige mosaics from medieval Granada and Cordoba feature " + "interlocking geometric tiles arranged in repeating decagonal motifs " + "of cobalt and ochre glaze. "; + static const char *MOSAIC_FILLER = + "Master craftsmen historically cut tesserae from glazed terracotta " + "and fit them into intricate patterns whose mathematical foundations " + "anticipate aperiodic tilings by centuries; pigments include lapis " + "lazuli, copper carbonate, and iron oxides. "; + static const char *VENT_TAIL = + " And entirely separately, deep ocean hydrothermal vents host " + "chemosynthetic communities of microbial mats, tubeworms, and " + "thermophilic archaea metabolizing sulfur compounds in total darkness."; + + size_t cap = 32 * 1024; + char *long_text = (char *)malloc(cap); + ASSERT(long_text != NULL); + size_t pos = 0; + int n = snprintf(long_text + pos, cap - pos, "%s", MOSAIC_LEAD); + pos += (size_t)n; + while (pos < 19300 + && pos + strlen(MOSAIC_FILLER) + strlen(VENT_TAIL) + 4 < cap) { + n = snprintf(long_text + pos, cap - pos, "%s", MOSAIC_FILLER); + if (n <= 0) break; + pos += (size_t)n; + } + n = snprintf(long_text + pos, cap - pos, "%s", VENT_TAIL); + pos += (size_t)n; + int long_text_len = (int)pos; + + rc = sqlite3_prepare_v2(db, + "SELECT memory_add_text(?1, 'trunc-large-long');", -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_bind_text(stmt, 1, long_text, long_text_len, SQLITE_STATIC); + ASSERT(rc == SQLITE_OK); + rc = sqlite3_step(stmt); + if (rc != SQLITE_ROW) { + printf("FAILED\n memory_add_text(%d bytes) returned rc=%d\n sqlite error: %s\n", + long_text_len, rc, sqlite3_errmsg(db)); + sqlite3_finalize(stmt); + free(long_text); + tests_failed++; + tests_passed--; + return; + } + sqlite3_finalize(stmt); + free(long_text); + + char short_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + char long_hash[DBMEM_HASH_STR_MAXLEN] = {0}; + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'trunc-large-short' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(short_hash, sizeof(short_hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + + rc = sqlite3_prepare_v2(db, + "SELECT hash FROM dbmem_content WHERE context = 'trunc-large-long' LIMIT 1;", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW); + snprintf(long_hash, sizeof(long_hash), "%s", (const char *)sqlite3_column_text(stmt, 0)); + sqlite3_finalize(stmt); + + char sql[256]; + snprintf(sql, sizeof(sql), + "SELECT COUNT(*) FROM dbmem_vault WHERE hash = '%s';", long_hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + ASSERT(result_int == 1); + + snprintf(sql, sizeof(sql), + "SELECT length FROM dbmem_vault WHERE hash = '%s' LIMIT 1;", long_hash); + result_int = 0; + sqlite3_exec(db, sql, capture_int, NULL, NULL); + int long_chunk_bytes = result_int; + ASSERT(long_chunk_bytes > 18000); + + int chunk_count = 0, min_tokens = 0, min_truncated = 0, max_truncated = 0; + int short_n_tokens = 0, short_truncated = 0; + int long_n_tokens = 0, long_truncated = 0; + + rc = get_vault_metadata(short_hash, &chunk_count, &min_tokens, &min_truncated, &max_truncated); + ASSERT(rc == SQLITE_OK); + ASSERT(chunk_count == 1); + ASSERT(min_tokens > 0); + ASSERT(min_truncated == 0 && max_truncated == 0); + short_n_tokens = min_tokens; + short_truncated = max_truncated; + + rc = get_vault_metadata(long_hash, &chunk_count, &min_tokens, &min_truncated, &max_truncated); + ASSERT(rc == SQLITE_OK); + ASSERT(chunk_count == 1); + ASSERT(min_tokens > 0); + ASSERT(min_truncated == 1 && max_truncated == 1); + long_n_tokens = min_tokens; + long_truncated = max_truncated; + + rc = sqlite3_prepare_v2(db, + "SELECT hash, ranking FROM memory_search(" + " 'chemosynthesis around deep-sea volcanic vents', 10);", + -1, &stmt, NULL); + ASSERT(rc == SQLITE_OK); + + int short_rank = -1, long_rank = -1; + double short_score = 0.0, long_score = 0.0; + int row = 0; + while (sqlite3_step(stmt) == SQLITE_ROW) { + const char *hash = (const char *)sqlite3_column_text(stmt, 0); + double rank = sqlite3_column_double(stmt, 1); + if (hash && strcmp(hash, short_hash) == 0) { + short_rank = row; short_score = rank; + } + if (hash && strcmp(hash, long_hash) == 0) { + long_rank = row; long_score = rank; + } + row++; + } + sqlite3_finalize(stmt); + + ASSERT(short_rank >= 0); + if (long_rank == -1) { + printf("(short: n_tok=%d trunc=%d rank=%d score=%.3f; long: %d bytes n_tok=%d trunc=%d absent from top-10) ", + short_n_tokens, short_truncated, short_rank, short_score, + long_chunk_bytes, long_n_tokens, long_truncated); + } else { + ASSERT(short_rank < long_rank); + printf("(short: n_tok=%d trunc=%d rank=%d score=%.3f; long: %d bytes n_tok=%d trunc=%d rank=%d score=%.3f) ", + short_n_tokens, short_truncated, short_rank, short_score, + long_chunk_bytes, long_n_tokens, long_truncated, long_rank, long_score); + } + + ASSERT_SQL_OK(db, "SELECT memory_set_option('skip_semantic', 0);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('max_tokens', 400);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('overlay_tokens', 80);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('vector_weight', 0.6);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('text_weight', 0.4);"); + ASSERT_SQL_OK(db, "SELECT memory_set_option('min_score', 0.7);"); +} + // ============================================================================ // Phase 5: Deletion // ============================================================================ @@ -531,6 +1324,7 @@ int main(void) { // Phase 3: Content Management (network calls) RUN_TEST(memory_add_text); RUN_TEST(verify_embedding); + RUN_TEST(verify_embedding_metadata); RUN_TEST(memory_add_text_context); RUN_TEST(memory_add_text_idempotent); #ifndef DBMEM_OMIT_IO @@ -543,6 +1337,19 @@ int main(void) { RUN_TEST(memory_search_ranking); RUN_TEST(memory_search_statement_reuse); + // Phase 4b: Long-text chunking + multi-section retrieval + RUN_TEST(memory_add_long_text_chunking); + RUN_TEST(memory_search_long_text_sections); + + // Phase 4c: Single-chunk near (under) the provider token ceiling + RUN_TEST(memory_search_under_token_limit); + + // Phase 4d: Model-level truncation behavioral signature + RUN_TEST(memory_search_truncation_signature); + + // Phase 4e: Same shape, but text size pushed near the model context window + RUN_TEST(memory_search_truncation_near_model_context); + // Phase 5: Deletion RUN_TEST(memory_delete); RUN_TEST(memory_delete_context); diff --git a/test/unittest.c b/test/unittest.c index 8cc012a..e05d600 100644 --- a/test/unittest.c +++ b/test/unittest.c @@ -1589,7 +1589,7 @@ TEST(sqlite_schema_has_timestamps) { ASSERT(db != NULL); // Check that schema includes created_at column - char sql[256]; + char sql[512]; int rc = exec_get_text(db, "SELECT sql FROM sqlite_master WHERE name='dbmem_content';", sql, sizeof(sql)); @@ -1603,12 +1603,75 @@ TEST(sqlite_schema_has_timestamps) { sql, sizeof(sql)); ASSERT_EQ(rc, SQLITE_OK); ASSERT(strstr(sql, "hash TEXT NOT NULL") != NULL); + ASSERT(strstr(sql, "n_tokens") != NULL); + ASSERT(strstr(sql, "truncated") != NULL); rc = exec_get_text(db, "SELECT sql FROM sqlite_master WHERE name='dbmem_cache';", sql, sizeof(sql)); ASSERT_EQ(rc, SQLITE_OK); ASSERT(strstr(sql, "text_hash TEXT NOT NULL") != NULL); + ASSERT(strstr(sql, "n_tokens") != NULL); + ASSERT(strstr(sql, "truncated") != NULL); + + sqlite3_int64 schema_version = 0; + rc = exec_get_int(db, "SELECT value FROM dbmem_settings WHERE key = 'schema_version';", &schema_version); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(schema_version, 2); + + sqlite3_close(db); +} + +TEST(sqlite_schema_migrates_embedding_metadata) { + sqlite3 *db = NULL; + int rc = sqlite3_open(":memory:", &db); + ASSERT_EQ(rc, SQLITE_OK); + + rc = sqlite3_exec(db, + "CREATE TABLE dbmem_settings (key TEXT PRIMARY KEY, value TEXT);" + "INSERT INTO dbmem_settings (key, value) VALUES ('schema_version', '1');" + "CREATE TABLE dbmem_vault (hash TEXT NOT NULL, seq INTEGER NOT NULL, embedding BLOB NOT NULL, offset INTEGER NOT NULL, length INTEGER NOT NULL, PRIMARY KEY (hash, seq));" + "CREATE TABLE dbmem_cache (text_hash TEXT NOT NULL, provider TEXT NOT NULL, model TEXT NOT NULL, embedding BLOB NOT NULL, dimension INTEGER NOT NULL, PRIMARY KEY (text_hash, provider, model));", + NULL, NULL, NULL); + ASSERT_EQ(rc, SQLITE_OK); + + rc = sqlite3_memory_init(db, NULL, NULL); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 count = 0; + rc = exec_get_int(db, "SELECT COUNT(*) FROM pragma_table_info('dbmem_vault') WHERE name = 'n_tokens';", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM pragma_table_info('dbmem_vault') WHERE name = 'truncated';", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM pragma_table_info('dbmem_cache') WHERE name = 'n_tokens';", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = exec_get_int(db, "SELECT COUNT(*) FROM pragma_table_info('dbmem_cache') WHERE name = 'truncated';", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 1); + + rc = sqlite3_exec(db, + "INSERT INTO dbmem_vault (hash, seq, embedding, offset, length) VALUES (printf('%016x', 900), 0, X'00000000', 0, 4);" + "INSERT INTO dbmem_cache (text_hash, provider, model, embedding, dimension) VALUES (printf('%016x', 901), 'dummy', 'model', X'00000000', 1);", + NULL, NULL, NULL); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_vault WHERE hash = printf('%016x', 900);", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_cache WHERE text_hash = printf('%016x', 901);", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 0); + + rc = exec_get_int(db, "SELECT value FROM dbmem_settings WHERE key = 'schema_version';", &count); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(count, 2); sqlite3_close(db); } @@ -2115,9 +2178,15 @@ TEST(sqlite_sync_directory_skips_unchanged) { mkdir_p(test_dir); create_test_file(file, content); - // Compute the hash and pre-insert the entry - uint64_t hash = dbmem_hash_compute(content, strlen(content)); - int rc = insert_fake_content(db, hash, file, "notes", (sqlite3_int64)strlen(content)); + // Compute the hash from disk so Windows text-mode newline translation + // cannot make the pre-inserted hash differ from memory_add_directory(). + int64_t len = 0; + char *buf = dbmem_file_read(file, &len); + ASSERT(buf != NULL); + uint64_t hash = dbmem_hash_compute(buf, (size_t)len); + dbmemory_free(buf); + + int rc = insert_fake_content(db, hash, file, "notes", len); ASSERT_EQ(rc, SQLITE_OK); // Sync — file exists with matching hash, should be skipped @@ -2178,7 +2247,7 @@ TEST(sqlite_cache_table_exists) { ASSERT(db != NULL); // Check that dbmem_cache table exists - char sql[256]; + char sql[512]; int rc = exec_get_text(db, "SELECT sql FROM sqlite_master WHERE name='dbmem_cache';", sql, sizeof(sql)); @@ -2189,6 +2258,8 @@ TEST(sqlite_cache_table_exists) { ASSERT(strstr(sql, "model") != NULL); ASSERT(strstr(sql, "embedding") != NULL); ASSERT(strstr(sql, "dimension") != NULL); + ASSERT(strstr(sql, "n_tokens") != NULL); + ASSERT(strstr(sql, "truncated") != NULL); sqlite3_close(db); } @@ -2514,12 +2585,20 @@ static int dummy_compute(void *engine, const char *text, int text_len, void *xda dummy_engine_t *e = (dummy_engine_t *)engine; e->compute_count++; result->n_tokens = text_len / 4; - result->n_tokens_truncated = 0; + result->truncated = false; result->n_embd = e->dimension; result->embedding = e->embedding; return 0; } +static int truncated_dummy_compute(void *engine, const char *text, int text_len, void *xdata, dbmem_embedding_result_t *result) { + int rc = dummy_compute(engine, text, text_len, xdata, result); + if (rc != 0) return rc; + result->n_tokens = 3; + result->truncated = true; + return 0; +} + static void dummy_free(void *engine, void *xdata) { UNUSED_PARAM(xdata); free(engine); @@ -2666,6 +2745,87 @@ TEST(sqlite_custom_provider_add_text) { ASSERT_EQ(rc, SQLITE_OK); ASSERT(result >= 1); + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 7); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 0); + + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_cache LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 7); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_cache LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 0); + + rc = exec_get_int(db, "SELECT memory_clear();", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_add_text('Hello world, this is a test.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT(result >= 1); + + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 7); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 0); + + sqlite3_close(db); +} + +TEST(sqlite_custom_provider_persists_truncated_metadata) { + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + dbmem_provider_t prov = { .init = dummy_init, .compute = truncated_dummy_compute, .free = dummy_free }; + int rc = sqlite3_memory_register_provider(db, "truncdummy", &prov); + ASSERT_EQ(rc, SQLITE_OK); + + sqlite3_int64 result = 0; + rc = exec_get_int(db, "SELECT memory_set_model('truncdummy', 'test-model');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_add_text('This custom provider reports truncation.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT(result >= 1); + + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 3); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 1); + + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_cache LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 3); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_cache LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 1); + + rc = exec_get_int(db, "SELECT memory_clear();", &result); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_add_text('This custom provider reports truncation.');", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT(result >= 1); + + rc = exec_get_int(db, "SELECT n_tokens FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 3); + + rc = exec_get_int(db, "SELECT truncated FROM dbmem_vault LIMIT 1;", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(result, 1); + sqlite3_close(db); } @@ -2828,6 +2988,7 @@ static void tracking_free(void *engine, void *xdata) { free(engine); } +#ifndef DBMEM_OMIT_REMOTE_ENGINE TEST(sqlite_set_model_releases_previous_engine_on_class_switch) { sqlite3 *db = open_test_db(); ASSERT(db != NULL); @@ -2857,6 +3018,37 @@ TEST(sqlite_set_model_releases_previous_engine_on_class_switch) { sqlite3_close(db); ASSERT_EQ(state.free_count, 1); } +#else +TEST(sqlite_set_model_failed_remote_switch_keeps_custom_engine) { + sqlite3 *db = open_test_db(); + ASSERT(db != NULL); + + sqlite3_int64 result = 0; + int rc = exec_get_int(db, "SELECT memory_set_apikey('test-key');", &result); + ASSERT_EQ(rc, SQLITE_OK); + + tracking_free_state_t state = {0}; + dbmem_provider_t prov = { .init = tracking_init, .compute = tracking_compute, .free = tracking_free, .xdata = &state }; + rc = sqlite3_memory_register_provider(db, "tracker", &prov); + ASSERT_EQ(rc, SQLITE_OK); + + rc = exec_get_int(db, "SELECT memory_set_model('tracker', 'm1');", &result); + ASSERT_EQ(rc, SQLITE_OK); + ASSERT_EQ(state.free_count, 0); + + sqlite3_stmt *stmt = NULL; + rc = sqlite3_prepare_v2(db, "SELECT memory_set_model('openai', 'text-embedding-3-small');", -1, &stmt, NULL); + ASSERT_EQ(rc, SQLITE_OK); + rc = sqlite3_step(stmt); + ASSERT_EQ(rc, SQLITE_ERROR); + sqlite3_finalize(stmt); + + ASSERT_EQ(state.free_count, 0); + + sqlite3_close(db); + ASSERT_EQ(state.free_count, 1); +} +#endif #endif // TEST_SQLITE_EXTENSION @@ -2957,6 +3149,7 @@ int main(int argc, char *argv[]) { RUN_TEST(sqlite_memory_delete_nonexistent); RUN_TEST(sqlite_memory_delete_context_nonexistent); RUN_TEST(sqlite_schema_has_timestamps); + RUN_TEST(sqlite_schema_migrates_embedding_metadata); RUN_TEST(sqlite_direct_insert_with_timestamp); RUN_TEST(sqlite_memory_delete_direct); RUN_TEST(sqlite_memory_delete_context_direct); @@ -2997,12 +3190,17 @@ int main(int argc, char *argv[]) { RUN_TEST(sqlite_custom_provider_register); RUN_TEST(sqlite_custom_provider_set_model); RUN_TEST(sqlite_custom_provider_add_text); + RUN_TEST(sqlite_custom_provider_persists_truncated_metadata); RUN_TEST(sqlite_mdx_preprocessing_applies_only_to_mdx_files); RUN_TEST(sqlite_custom_provider_null_callbacks); RUN_TEST(sqlite_custom_provider_init_error); RUN_TEST(sqlite_custom_provider_apikey_passed); RUN_TEST(sqlite_set_model_failed_reindex_preserves_existing_rows); +#ifndef DBMEM_OMIT_REMOTE_ENGINE RUN_TEST(sqlite_set_model_releases_previous_engine_on_class_switch); +#else + RUN_TEST(sqlite_set_model_failed_remote_switch_keeps_custom_engine); +#endif #endif printf("\n=== Results ===\n");