diff --git a/README.md b/README.md index eac66c7..df424a2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ ## Python° -![Python Logo](app/static/python.png) +![Python°](app/static/python.png) > Production-ready, open-source FastAPI application with PostgreSQL and blazing-fast full-text search. @@ -23,7 +23,7 @@ This project provides a scalable API backend using FastAPI and PostgreSQL, featu #### Install & Use -### 1. Clone & Setup Environment +#### 1. Clone & Setup Environment ```bash git clone https://github.com/goldlabelapps/python.git @@ -59,7 +59,7 @@ FastAPI auto-generates interactive docs: - `GET /prospects` — Paginated prospects - `POST /prospects/process` — Bulk CSV ingestion -## Full-Text Search (tsvector) +#### Full-Text Search (tsvector) The `prospects` table includes a `search_vector` column (type: tsvector) computed from all text fields on insert/update. A GIN index enables fast, scalable full-text search: @@ -72,15 +72,14 @@ SELECT * FROM prospects WHERE search_vector @@ plainto_tsquery('english', 'searc - The GIN index (`idx_prospects_search_vector`) enables efficient search across large datasets. -## Processing Large CSV Files +#### Processing Large CSV Files The `/prospects/process` endpoint supports robust ingestion of large CSVs (e.g., 1300+ rows, 300KB+), following the same normalization and insertion pattern as `/prospects/seed` but optimized for scale. -## Contributing +#### Contributing Contributions welcome. Please open issues or submit pull requests. - -## License +#### License This project is licensed under the MIT License. See [LICENSE](LICENSE) for details. diff --git a/app/__init__.py b/app/__init__.py index d85556c..49f2f7b 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,4 @@ """Python - FastAPI, Postgres, tsvector""" # Current Version -__version__ = "2.2.9" +__version__ = "1" diff --git a/app/api/prompt/__init__.py b/app/api/prompt/__init__.py index b35910c..de2e21d 100644 --- a/app/api/prompt/__init__.py +++ b/app/api/prompt/__init__.py @@ -1,5 +1,4 @@ """Prompt Routes""" from .prompt import router as prompt_router -from .linkedin import router as linkedin_router from .empty import router as empty_router diff --git a/app/api/prompt/linkedin.py b/app/api/prompt/linkedin.py deleted file mode 100644 index ec919a7..0000000 --- a/app/api/prompt/linkedin.py +++ /dev/null @@ -1,201 +0,0 @@ -import os - -from fastapi import APIRouter, Depends, HTTPException - -from app.utils.api_key_auth import get_api_key -from app.utils.db import get_db_connection_direct -from app.utils.make_meta import make_meta - -router = APIRouter() - - -@router.post("/prompt/linkedin") -def linkedin_prompt_success(payload: dict, api_key: str = Depends(get_api_key)) -> dict: - """POST /prompt/linkedin: return cached completion or create a new Gemini analysis.""" - linkedin_url = (payload.get("linkedin_url") or payload.get("linkedinUrl") or "").strip() - if not linkedin_url: - raise HTTPException(status_code=400, detail="Missing 'linkedin_url' in request body.") - - prompt = (payload.get("prompt") or "").strip() - if not prompt: - prompt = ( - "Analyse this LinkedIn profile URL and provide a concise summary of the person, " - "their role, company, seniority, likely responsibilities, and notable signals. " - f"LinkedIn URL: {linkedin_url}" - ) - - gemini_api_key = os.getenv("GEMINI_API_KEY") - if not gemini_api_key: - raise HTTPException(status_code=500, detail="Gemini API key not configured.") - - conn = None - cur = None - try: - conn = get_db_connection_direct() - cur = conn.cursor() - cur.execute( - """ - SELECT EXISTS ( - SELECT 1 - FROM information_schema.columns - WHERE table_schema = 'public' - AND table_name = 'prompt' - AND column_name = 'search_vector' - ); - """ - ) - exists_row = cur.fetchone() - has_search_vector = bool(exists_row and exists_row[0]) - - if has_search_vector: - cur.execute( - """ - SELECT id, prompt, completion, time, model, data - FROM prompt - WHERE ( - COALESCE(data->>'linkedin_url', data->>'linkedinUrl') = %s - OR search_vector @@ plainto_tsquery('english', %s) - OR prompt ILIKE %s - ) - ORDER BY id DESC - LIMIT 1; - """, - (linkedin_url, linkedin_url, f"%{linkedin_url}%"), - ) - else: - cur.execute( - """ - SELECT id, prompt, completion, time, model, data - FROM prompt - WHERE (COALESCE(data->>'linkedin_url', data->>'linkedinUrl') = %s OR prompt ILIKE %s) - ORDER BY id DESC - LIMIT 1; - """, - (linkedin_url, f"%{linkedin_url}%"), - ) - row = cur.fetchone() - - if row: - cur.close() - conn.close() - cur = None - conn = None - return { - "meta": make_meta("success", "LinkedIn URL already analysed"), - "data": { - "cached": True, - "prompt_id": row[0], - "linkedin_url": linkedin_url, - "prompt": row[1], - "completion": row[2], - "time": row[3].isoformat() if row[3] else None, - "model": row[4], - }, - } - - cur.close() - conn.close() - cur = None - conn = None - - import json - import logging - import time as time_mod - from app import __version__ - from google import genai - - client = genai.Client(api_key=gemini_api_key) - model_names = [ - "models/gemini-flash-latest", - "models/gemini-1.5-pro", - "models/gemini-1.5-flash", - "models/gemini-1.0-pro", - "models/gemini-pro", - "models/gemini-pro-vision", - ] - response = None - completion = None - used_model = None - errors = {} - start_time = time_mod.time() - for model_name in model_names: - try: - response = client.models.generate_content(model=model_name, contents=prompt) - completion = getattr(response, "text", None) - if completion: - used_model = model_name - break - except Exception as model_exc: - errors[model_name] = str(model_exc) - continue - - duration = time_mod.time() - start_time - if not completion: - error_details = " | ".join([f"{name}: {message}" for name, message in errors.items()]) - raise Exception( - "No available Gemini model succeeded for generate_content with your API key. " - f"Details: {error_details}" - ) - - record_id = None - record_data = { - "version": __version__, - "linkedin_url": linkedin_url, - } - try: - conn = get_db_connection_direct() - cur = conn.cursor() - data_blob = json.dumps(record_data) - if has_search_vector: - cur.execute( - """ - INSERT INTO prompt (prompt, completion, duration, model, data, search_vector) - VALUES (%s, %s, %s, %s, %s, to_tsvector('english', %s || ' ' || %s)) - RETURNING id; - """, - (prompt, completion, duration, used_model, data_blob, prompt, completion) - ) - else: - cur.execute( - """ - INSERT INTO prompt (prompt, completion, duration, model, data) - VALUES (%s, %s, %s, %s, %s) - RETURNING id; - """, - (prompt, completion, duration, used_model, data_blob) - ) - record_id_row = cur.fetchone() - record_id = record_id_row[0] if record_id_row else None - conn.commit() - cur.close() - conn.close() - cur = None - conn = None - except Exception as db_exc: - logging.error(f"Failed to insert prompt record: {db_exc}") - - return { - "meta": make_meta("success", f"Gemini completion received from {used_model}"), - "data": { - "cached": False, - "id": record_id, - "linkedin_url": linkedin_url, - "prompt": prompt, - "completion": completion, - "duration": duration, - "model": used_model, - "record_data": record_data, - }, - } - except HTTPException: - raise - except Exception as e: - return { - "meta": make_meta("error", f"Gemini API error: {str(e)}"), - "data": {}, - } - finally: - if cur: - cur.close() - if conn: - conn.close() diff --git a/app/api/prompt/prompt.py b/app/api/prompt/prompt.py index 9e5f7a4..023f03e 100644 --- a/app/api/prompt/prompt.py +++ b/app/api/prompt/prompt.py @@ -1,4 +1,5 @@ import os +import hashlib from fastapi import APIRouter, HTTPException, Depends from app.utils.make_meta import make_meta from app.utils.db import get_db_connection_direct @@ -41,15 +42,84 @@ def get_prompt_table_metadata(api_key: str = Depends(get_api_key)) -> dict: @router.post("/prompt") def llm_post(payload: dict) -> dict: - """POST /prompt: send prompt to Gemini, returns completion google-genai SDK.""" - prompt = payload.get("prompt") + """POST /prompt: send prompt to Gemini with DB-backed caching.""" + prompt = (payload.get("prompt") or "").strip() if not prompt: raise HTTPException(status_code=400, detail="Missing 'prompt' in request body.") + api_key = os.getenv("GEMINI_API_KEY") if not api_key: raise HTTPException(status_code=500, detail="Gemini API key not configured.") + + prompt_hash = hashlib.sha256(prompt.encode("utf-8")).hexdigest() + conn = None + cur = None import logging try: + conn = get_db_connection_direct() + cur = conn.cursor() + cur.execute( + """ + SELECT EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'prompt' + AND column_name = 'search_vector' + ); + """ + ) + exists_row = cur.fetchone() + has_search_vector = bool(exists_row and exists_row[0]) + + # Fast/safe cache hit: exact prompt hash or exact prompt text. + cur.execute( + """ + SELECT id, prompt, completion, time, model + FROM prompt + WHERE COALESCE(data->>'prompt_hash', '') = %s OR prompt = %s + ORDER BY id DESC + LIMIT 1; + """, + (prompt_hash, prompt), + ) + row = cur.fetchone() + + # Fallback cache hit when tsvector exists and query terms match strongly. + if not row and has_search_vector: + cur.execute( + """ + SELECT id, prompt, completion, time, model, + ts_rank_cd(search_vector, plainto_tsquery('english', %s)) AS rank + FROM prompt + WHERE search_vector @@ plainto_tsquery('english', %s) + ORDER BY rank DESC, id DESC + LIMIT 1; + """, + (prompt, prompt), + ) + rank_row = cur.fetchone() + if rank_row and rank_row[5] is not None and float(rank_row[5]) >= 0.35: + row = rank_row[:5] + + cur.close() + conn.close() + cur = None + conn = None + + if row: + return { + "meta": make_meta("success", "Prompt returned from cache"), + "data": { + "cached": True, + "prompt_id": row[0], + "prompt": row[1], + "completion": row[2], + "time": row[3].isoformat() if row[3] else None, + "model": row[4], + }, + } + from google import genai import time as time_mod client = genai.Client(api_key=api_key) @@ -85,17 +155,31 @@ def llm_post(payload: dict) -> dict: try: import json from app import __version__ - data_blob = json.dumps({"version": __version__}) + record_data = { + "version": __version__, + "prompt_hash": prompt_hash, + } + data_blob = json.dumps(record_data) conn = get_db_connection_direct() cur = conn.cursor() - cur.execute( - """ - INSERT INTO prompt (prompt, completion, duration, data, model) - VALUES (%s, %s, %s, %s, %s) - RETURNING id; - """, - (prompt, completion, duration, data_blob, used_model) - ) + if has_search_vector: + cur.execute( + """ + INSERT INTO prompt (prompt, completion, duration, data, model, search_vector) + VALUES (%s, %s, %s, %s, %s, to_tsvector('english', %s || ' ' || %s)) + RETURNING id; + """, + (prompt, completion, duration, data_blob, used_model, prompt, completion) + ) + else: + cur.execute( + """ + INSERT INTO prompt (prompt, completion, duration, data, model) + VALUES (%s, %s, %s, %s, %s) + RETURNING id; + """, + (prompt, completion, duration, data_blob, used_model) + ) record_id_row = cur.fetchone() record_id = record_id_row[0] if record_id_row else None conn.commit() @@ -105,8 +189,23 @@ def llm_post(payload: dict) -> dict: # Log DB error but do not fail the API response logging.error(f"Failed to insert prompt record: {db_exc}") meta = make_meta("success", f"Gemini completion received from {used_model}") - return {"meta": meta, "data": {"id": record_id, "prompt": prompt, "completion": completion}} + return { + "meta": meta, + "data": { + "cached": False, + "id": record_id, + "prompt": prompt, + "completion": completion, + "duration": duration, + "model": used_model, + }, + } except Exception as e: meta = make_meta("error", f"Gemini API error: {str(e)}") return {"meta": meta, "data": {}} + finally: + if cur: + cur.close() + if conn: + conn.close() diff --git a/app/api/root.py b/app/api/root.py index 0ef3392..773c766 100644 --- a/app/api/root.py +++ b/app/api/root.py @@ -31,7 +31,6 @@ def root() -> dict: "name": "Prompt°", "endpoints": [ {"name": "list", "url": f"{base_url}/prompt"}, - {"name": "linkedin", "url": f"{base_url}/prompt/linkedin"}, ] }, { diff --git a/app/api/routes.py b/app/api/routes.py index e933002..82234f3 100644 --- a/app/api/routes.py +++ b/app/api/routes.py @@ -9,7 +9,6 @@ from app.utils.health import router as health_router from app.utils.notify.resend import router as resend_router from app.api.prompt.prompt import router as prompt_router -from app.api.prompt.linkedin import router as linkedin_router from app.api.prompt.empty import router as prompts_empty_router from app.api.prospects.prospects import router as prospects_router from app.api.orders.orders import router as orders_router @@ -19,7 +18,6 @@ router.include_router(resend_router) router.include_router(health_router) router.include_router(prompt_router) -router.include_router(linkedin_router) router.include_router(prompts_empty_router) router.include_router(prospects_router) router.include_router(orders_router) diff --git a/pytest_output.txt b/pytest_output.txt deleted file mode 100644 index 8bf1707..0000000 --- a/pytest_output.txt +++ /dev/null @@ -1,23 +0,0 @@ -============================= test session starts ============================== -platform darwin -- Python 3.11.15, pytest-9.0.2, pluggy-1.6.0 -- /opt/homebrew/opt/python@3.11/bin/python3.11 -cachedir: .pytest_cache -rootdir: /Users/milky/My Drive/GitHub/python -plugins: anyio-4.12.1, Faker-40.12.0 -collecting ... collected 14 items - -tests/test_health.py::test_health_endpoint PASSED [ 7%] -tests/test_health.py::test_health_meta_keys PASSED [ 14%] -tests/test_make_meta.py::test_make_meta_basic PASSED [ 21%] -tests/test_make_meta.py::test_make_meta_default_base_url PASSED [ 28%] -tests/test_make_meta.py::test_make_meta_time_is_int PASSED [ 35%] -tests/test_orders.py::test_get_orders_root PASSED [ 42%] -tests/test_orders.py::test_orders_search_param PASSED [ 50%] -tests/test_orders.py::test_get_queue PASSED [ 57%] -tests/test_orders.py::test_orders_returns_list PASSED [ 64%] -tests/test_prospects.py::test_get_prospects_root PASSED [ 71%] -tests/test_prospects.py::test_prospects_returns_list PASSED [ 78%] -tests/test_resend.py::test_resend_post_email PASSED [ 85%] -tests/test_routes.py::test_root_returns_welcome_message PASSED [ 92%] -tests/test_routes.py::test_health_returns_ok PASSED [100%] - -============================= 14 passed in 10.17s ============================== diff --git a/queue_output.txt b/queue_output.txt deleted file mode 100644 index 224df1e..0000000 --- a/queue_output.txt +++ /dev/null @@ -1,11 +0,0 @@ -id,first_name,last_name,url,email_address,company,position,connected_on,created,updated,hidden,collection,group -22,,,,,,,,1776513092,1776513092,False,prospects,linkedin -23,,,,,,,,1776513092,1776513092,False,prospects,linkedin -24,,,,,,,,1776513092,1776513092,False,prospects,linkedin -25,,,,,,,,1776513092,1776513092,False,prospects,linkedin -26,,,,,,,,1776513092,1776513092,False,prospects,linkedin -27,,,,,,,,1776513092,1776513092,False,prospects,linkedin -28,,,,,,,,1776513092,1776513092,False,prospects,linkedin -29,,,,,,,,1776513092,1776513092,False,prospects,linkedin -30,,,,,,,,1776513092,1776513092,False,prospects,linkedin -21,,,,,,,,1776513092,1776513092,False,prospects,linkedin diff --git "a/Python\302\260.postman_collection.json" "b/tests/Python\302\260.json" similarity index 100% rename from "Python\302\260.postman_collection.json" rename to "tests/Python\302\260.json" diff --git a/tests/test_prompt.py b/tests/test_prompt.py new file mode 100644 index 0000000..0129520 --- /dev/null +++ b/tests/test_prompt.py @@ -0,0 +1,135 @@ +import json +from fastapi.testclient import TestClient + +from app.main import app + +client = TestClient(app) + + +class _MockCursor: + def __init__(self, fetchone_values): + self._fetchone_values = list(fetchone_values) + self._fetchall_values = [] + self.executed = [] + + def execute(self, query, params=None): + self.executed.append((query, params)) + + def fetchone(self): + if self._fetchone_values: + return self._fetchone_values.pop(0) + return None + + def fetchall(self): + if self._fetchall_values: + return self._fetchall_values.pop(0) + return [] + + def close(self): + return None + + +class _MockConnection: + def __init__(self, fetchone_values): + self.cursor_obj = _MockCursor(fetchone_values) + self.committed = False + + def cursor(self): + return self.cursor_obj + + def commit(self): + self.committed = True + + def close(self): + return None + + +class _FakeResponse: + def __init__(self, text): + self.text = text + + +class _FakeModels: + def generate_content(self, model, contents): + return _FakeResponse(f"fresh completion for: {contents}") + + +class _FakeGenAIClient: + def __init__(self, api_key): + self.api_key = api_key + self.models = _FakeModels() + + +def test_prompt_post_returns_cached_response(monkeypatch): + # Keep auth open for tests and avoid needing a real API key. + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + # 1) search_vector column exists + # 2) exact cache row found by prompt hash / prompt + conn = _MockConnection( + fetchone_values=[ + (True,), + (101, "Who is Ada Lovelace?", "Cached answer", None, "models/gemini-1.5-pro"), + ] + ) + + from app.api.prompt import prompt as prompt_module + + monkeypatch.setattr(prompt_module, "get_db_connection_direct", lambda: conn) + + response = client.post("/prompt", json={"prompt": "Who is Ada Lovelace?"}) + assert response.status_code == 200 + + payload = response.json() + assert payload["meta"]["severity"] == "success" + assert payload["data"]["cached"] is True + assert payload["data"]["prompt_id"] == 101 + assert payload["data"]["completion"] == "Cached answer" + + # Ensure only lookup queries ran; no insert commit should happen. + assert conn.committed is False + + +def test_prompt_post_cache_miss_calls_gemini_and_saves(monkeypatch): + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + # 1) search_vector column exists + # 2) no exact cache row + # 3) no tsvector fallback cache row + # 4) insert returning id + conn = _MockConnection( + fetchone_values=[ + (True,), + None, + None, + (202,), + ] + ) + + from app.api.prompt import prompt as prompt_module + + monkeypatch.setattr(prompt_module, "get_db_connection_direct", lambda: conn) + + # Patch google.genai client constructor used in route. + import google.genai as genai + + monkeypatch.setattr(genai, "Client", _FakeGenAIClient) + + response = client.post("/prompt", json={"prompt": "Explain quicksort"}) + assert response.status_code == 200 + + payload = response.json() + assert payload["meta"]["severity"] == "success" + assert payload["data"]["cached"] is False + assert payload["data"]["id"] == 202 + assert payload["data"]["prompt"] == "Explain quicksort" + assert payload["data"]["completion"].startswith("fresh completion for: Explain quicksort") + + # Insert path should commit. + assert conn.committed is True + + +def test_prompt_post_missing_prompt_returns_400(): + response = client.post("/prompt", json={}) + assert response.status_code == 400 + assert response.json()["detail"] == "Missing 'prompt' in request body." diff --git a/tests/test_queue.py b/tests/test_queue.py index 4f2cd33..da23849 100644 --- a/tests/test_queue.py +++ b/tests/test_queue.py @@ -12,12 +12,14 @@ def test_get_queue(): assert "meta" in data assert "data" in data queue_data = data["data"] - assert "in_queue" in queue_data - assert "collections" in queue_data - assert "groups" in queue_data + assert "filters" in queue_data assert "filtered" in queue_data assert "total" in queue_data assert "next" in queue_data + assert "collections" in queue_data["filters"] + assert "groups" in queue_data["filters"] + assert "collectionFilter" in queue_data["filters"] + assert "groupFilter" in queue_data["filters"] meta = data["meta"] assert meta["severity"] == "success" assert meta["title"] == "Queue table info"