From b852592fea770ff3cf65dbb4ca37a3dc79f539ae Mon Sep 17 00:00:00 2001
From: Robayed Ashraf <robayedashraf@gmail.com>
Date: Wed, 6 May 2026 05:22:01 +1000
Subject: [PATCH] Add RAGAS evaluation system with golden dataset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- eval/golden.jsonl: 30-question golden dataset from "Attention Is All
  You Need" (15 factual, 8 reasoning, 5 multi_hop, 2 out_of_scope)
- eval/run.py: evaluation runner — calls DocuMind pipeline directly,
  computes RAGAS metrics with Gemini 2.5 Flash as judge, prints
  per-question table, saves timestamped JSON + overwrites latest.json,
  auto-updates README scores after each run
- eval/results/latest.json: latest evaluation results (30 questions)
- eval/EVALUATION_GUIDE.md: dataset format, usage, cost estimates
- Makefile: run / ui / test / lint / eval / update-readme targets
- requirements.txt: bump ragas 0.0.22 → >=0.2.0
- README.md: evaluation section with live scores, split code blocks
- Remove legacy eval/ragas_eval.py, eval/run_eval.py, eval/test_queries.json
---
 .env.example             |   1 +
 .gitignore               |   8 +-
 Makefile                 |  28 ++
 README.md                | 132 +++++++---
 eval/EVALUATION_GUIDE.md | 136 ++++++++++
 eval/golden.jsonl        |  30 +++
 eval/ragas_eval.py       | 150 -----------
 eval/results/latest.json | 550 +++++++++++++++++++++++++++++++++++++++
 eval/run.py              | 392 ++++++++++++++++++++++++++++
 eval/run_eval.py         | 101 -------
 eval/test_queries.json   |  27 --
 requirements.txt         |   2 +-
 12 files changed, 1243 insertions(+), 314 deletions(-)
 create mode 100644 Makefile
 create mode 100644 eval/EVALUATION_GUIDE.md
 create mode 100644 eval/golden.jsonl
 delete mode 100644 eval/ragas_eval.py
 create mode 100644 eval/results/latest.json
 create mode 100644 eval/run.py
 delete mode 100644 eval/run_eval.py
 delete mode 100644 eval/test_queries.json

diff --git a/.env.example b/.env.example
index 1f99be1..1e7a7b0 100644
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,7 @@
 ENVIRONMENT=local
 CHROMA_DIR=chroma_db
 STORAGE_DIR=storage
+ANONYMIZED_TELEMETRY=false
 
 # Google Gemini API key — required for LLM-powered answer generation
 GOOGLE_API_KEY=your-google-api-key-here
diff --git a/.gitignore b/.gitignore
index 5ba35e6..6cf86a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,10 +32,9 @@ data/
 *.log
 logs/
 
-# Evaluation output
-eval/results.json
-eval/report.json
-eval/report.md
+# Evaluation output — timestamped runs stay local; latest.json is committed
+eval/results/[0-9]*.json
+eval/__pycache__/
 
 # Credentials & keys
 *.pem
@@ -45,3 +44,4 @@ eval/report.md
 
 # Test artifacts
 testfile.pdf
+attention.pdf
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2beff67
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,28 @@
+.PHONY: run ui test lint eval update-readme
+
+run:  ## Serve app.main:app with uvicorn (auto-reload) on port 8000
+	uvicorn app.main:app --reload --port 8000
+
+ui:  ## Launch the Streamlit UI (ui/streamlit_app.py) on port 8501
+	streamlit run ui/streamlit_app.py --server.port 8501
+
+test:  ## Run the pytest suite in quiet mode
+	pytest -q
+
+lint:  ## Lint the whole repository with ruff
+	ruff check .
+
+# Run RAGAS evaluation (eval/run.py) against the golden dataset.
+# Requires: DOC_ID env var (the indexed doc_id for "Attention Is All You Need")
+#           GOOGLE_API_KEY env var
+# Example:  DOC_ID=abc123 make eval   (fails fast with exit 1 when DOC_ID is empty)
+eval:
+	@if [ -z "$(DOC_ID)" ]; then \
+		echo "ERROR: DOC_ID is not set.  Run: DOC_ID=<your_doc_id> make eval" >&2; \
+		exit 1; \
+	fi
+	python eval/run.py --doc-id "$(DOC_ID)"
+
+# Refresh README.md scores from the latest file in eval/results/ (no eval run needed).
+update-readme:
+	python eval/run.py --update-readme
diff --git a/README.md b/README.md
index a770b2b..c9cb55a 100644
--- a/README.md
+++ b/README.md
@@ -94,16 +94,20 @@ Query ──► Vector Dense Search ─┘
 
 ### Option 1 — Docker (Recommended)
 
+**1. Clone the repo**
 ```bash
-# 1. Clone the repo
 git clone https://github.com/robayedl/documind.git
 cd documind
+```
 
-# 2. Set your API key
+**2. Set your API key**
+```bash
 cp .env.example .env
 # Edit .env and add: GOOGLE_API_KEY=your_key_here
+```
 
-# 3. Launch everything
+**3. Launch everything**
+```bash
 docker compose up --build
 ```
 
@@ -115,28 +119,37 @@ docker compose up --build
 
 ### Option 2 — Local Development
 
-**Prerequisites:** Python 3.12, pip
-
+**1. Clone & enter directory**
 ```bash
-# 1. Clone & enter directory
 git clone https://github.com/robayedl/documind.git
 cd documind
+```
 
-# 2. Create virtual environment
+**2. Create virtual environment**
+```bash
 python -m venv .venv
-source .venv/bin/activate      # Windows: .venv\Scripts\activate
+source .venv/bin/activate
+```
+> Windows: `.venv\Scripts\activate`
 
-# 3. Install dependencies
+**3. Install dependencies**
+```bash
 pip install -r requirements.txt
+```
 
-# 4. Configure environment
+**4. Configure environment**
+```bash
 cp .env.example .env
 # Edit .env and set GOOGLE_API_KEY
+```
 
-# 5. Start backend (terminal 1)
+**5. Start backend** _(terminal 1)_
+```bash
 uvicorn app.main:app --reload --port 8000
+```
 
-# 6. Start UI (terminal 2)
+**6. Start UI** _(terminal 2)_
+```bash
 streamlit run ui/streamlit_app.py --server.port 8501
 ```
 
@@ -155,20 +168,26 @@ streamlit run ui/streamlit_app.py --server.port 8501
 
 ### Example: Upload and query a PDF
 
+**Upload**
 ```bash
-# Upload
 DOC_ID=$(curl -s -F "file=@document.pdf" http://localhost:8000/documents \
   | python3 -c "import sys,json; print(json.load(sys.stdin)['doc_id'])")
+```
 
-# Index
+**Index**
+```bash
 curl -s -X POST http://localhost:8000/documents/$DOC_ID/index
+```
 
-# Query (full response)
+**Query (full response)**
+```bash
 curl -s -X POST http://localhost:8000/query \
   -H "Content-Type: application/json" \
   -d "{\"doc_id\": \"$DOC_ID\", \"question\": \"What is this document about?\"}"
+```
 
-# Query (streaming)
+**Query (streaming)**
+```bash
 curl -N -X POST http://localhost:8000/query/stream \
   -H "Content-Type: application/json" \
   -d "{\"doc_id\": \"$DOC_ID\", \"question\": \"Summarize the key points.\"}"
@@ -190,36 +209,85 @@ curl -N -X POST http://localhost:8000/query/stream \
 
 ## Running Tests
 
+**Run full test suite**
 ```bash
-# Run full test suite
 pytest -q
+```
 
-# Run with coverage
+**Run with coverage**
+```bash
 pytest --cov=rag --cov=app -q
+```
 
-# Lint
+**Lint**
+```bash
 ruff check .
 ```
 
 ---
 
-## RAGAS Evaluation
+## Evaluation
+
+DocuMind ships with a 30-question golden dataset built from **"Attention Is All You Need"** (Vaswani et al., 2017) and an automated [RAGAS](https://docs.ragas.io) evaluation runner that uses **Gemini 2.5 Flash** as the judge model.
+
+### Latest Evaluation Results
+
+<!-- EVAL-RESULTS-START -->
+| Metric | Score | |
+|---|---|---|
+| `faithfulness` | 0.848 | ████████████████ |
+| `answer_relevancy` | 0.681 | █████████████ |
+| `context_precision` | 0.803 | ████████████████ |
+| `context_recall` | 0.750 | ███████████████ |
+
+_Evaluated on 30 questions · 2026-05-05 · full results in [`eval/results/latest.json`](eval/results/latest.json)_
+<!-- EVAL-RESULTS-END -->
+
+### Metrics
+
+| Metric | What it measures |
+|---|---|
+| `faithfulness` | Are all claims in the answer grounded in the retrieved context? |
+| `answer_relevancy` | Is the answer actually relevant to the question? |
+| `context_precision` | Are the most relevant chunks ranked highest? |
+| `context_recall` | Does the context cover everything needed to answer the question? |
+
+### Running the evaluation
+
+**1. Start the server, upload, and index the PDF** _(server only needed for this step)_
+```bash
+uvicorn app.main:app --reload --port 8000
+```
+```bash
+DOC_ID=$(curl -s -F "file=@attention.pdf" http://localhost:8000/documents \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['doc_id'])")
+curl -s -X POST http://localhost:8000/documents/$DOC_ID/index
+```
+> Once indexed, the server can be stopped — `make eval` calls the pipeline directly.
 
-Evaluate pipeline quality with [RAGAS](https://docs.ragas.io) metrics: faithfulness, answer relevancy, context precision, and context recall.
+**2. Run full evaluation**
+```bash
+DOC_ID=$DOC_ID make eval
+```
+> ~10–15 min · ~$0.05–$0.15 in Gemini API calls (30 questions × ~5 LLM calls each for pipeline + RAGAS scoring)
 
+**Quick smoke-test** _(5 questions)_
 ```bash
-# 1. Upload and index your PDF (see API section above)
+python eval/run.py --doc-id $DOC_ID --limit 5
+```
 
-# 2. Update eval/test_queries.json with your doc_id and ground truth answers
+**Filter by category**
+```bash
+python eval/run.py --doc-id $DOC_ID --category factual
+```
 
-# 3. Run evaluation
-python eval/run_eval.py --doc-id <your_doc_id>
+After each run the scores above are automatically updated in this file. To refresh them from the latest result without re-running evaluation:
 
-# 4. View results
-cat eval/results.json
+```bash
+make update-readme
 ```
 
-**Pass criteria:** `faithfulness ≥ 0.70` and `answer_relevancy ≥ 0.70`
+See [eval/EVALUATION_GUIDE.md](eval/EVALUATION_GUIDE.md) for dataset format, how to add entries, and cost estimates.
 
 ---
 
@@ -255,9 +323,10 @@ documind/
 │       └── pdf_viewer.py    # Inline PDF viewer
 │
 ├── eval/
-│   ├── test_queries.json    # Sample Q&A pairs for evaluation
-│   ├── ragas_eval.py        # RAGAS metric computation
-│   └── run_eval.py          # CLI runner + results table
+│   ├── golden.jsonl         # 30-question golden dataset (Attention Is All You Need)
+│   ├── run.py               # RAGAS runner — per-question table, JSON output, README update
+│   ├── results/             # latest.json (committed) + per-run JSONs (<UTC-timestamp>.json, git-ignored)
+│   └── EVALUATION_GUIDE.md  # Dataset format, field definitions, usage, cost estimates
 │
 ├── tests/
 │   ├── test_agent.py        # Agent node + graph integration tests
@@ -267,6 +336,7 @@ documind/
 ├── .github/workflows/ci.yml # CI: lint + test on push/PR
 ├── Dockerfile               # Python 3.12 slim image
 ├── docker-compose.yml       # Multi-service: api + streamlit
+├── Makefile                 # run / ui / test / lint / eval / update-readme targets
 └── requirements.txt
 ```
 
diff --git a/eval/EVALUATION_GUIDE.md b/eval/EVALUATION_GUIDE.md
new file mode 100644
index 0000000..389c2c5
--- /dev/null
+++ b/eval/EVALUATION_GUIDE.md
@@ -0,0 +1,136 @@
+# DocuMind Evaluation
+
+This directory contains the golden dataset and RAGAS evaluation scripts for measuring DocuMind's RAG pipeline quality.
+
+---
+
+## Golden Dataset — `golden.jsonl`
+
+Each line is a JSON object with the following fields:
+
+| Field | Type | Description |
+|---|---|---|
+| `question` | `str` | The user question to ask the pipeline |
+| `expected_answer` | `str` | Ground-truth reference answer |
+| `source_doc` | `str` | Human-readable name of the source document |
+| `source_page` | `int` | Approximate page number in the source PDF (`-1` for out-of-scope) |
+| `category` | `str` | One of `factual`, `reasoning`, `multi_hop`, `out_of_scope` |
+
+### Category definitions
+
+| Category | Description | Count |
+|---|---|---|
+| `factual` | Direct lookup of a specific fact stated in the document | 15 |
+| `reasoning` | Requires understanding cause-and-effect or the "why" behind a design choice | 8 |
+| `multi_hop` | Answer requires connecting information from two or more parts of the document | 5 |
+| `out_of_scope` | Question is unrelated to the document; the system should refuse gracefully | 2 |
+
+The current dataset is built from **"Attention Is All You Need"** (Vaswani et al., 2017).
+
+---
+
+## Adding new entries
+
+1. Open `eval/golden.jsonl` in any text editor.
+2. Append one JSON object per line (no trailing commas; each line is independent).
+3. Use an existing entry as a template:
+
+```jsonl
+{"question": "...", "expected_answer": "...", "source_doc": "...", "source_page": N, "category": "factual"}
+```
+
+4. Keep `expected_answer` concise and factually accurate — it is used as the RAGAS `reference` string.
+5. For out-of-scope questions set `source_page` to `-1`.
+
+---
+
+## Running the evaluation — `run.py`
+
+### Prerequisites
+
+1. The PDF must be uploaded and indexed via the API. Note the `doc_id` returned by the upload call (`POST /documents`), then index it with `POST /documents/{doc_id}/index`. The server does **not** need to be running during evaluation — `run.py` calls the pipeline directly.
+2. `GOOGLE_API_KEY` must be set in your environment (used as the RAGAS judge LLM).
+
+### Usage
+
+```bash
+# Basic run (requires --doc-id)
+python eval/run.py --doc-id <your_doc_id>
+
+# Limit to a subset of entries (useful for quick smoke tests)
+python eval/run.py --doc-id <your_doc_id> --limit 5
+
+# Filter by category
+python eval/run.py --doc-id <your_doc_id> --category factual
+
+# Override the golden dataset path
+python eval/run.py --doc-id <your_doc_id> --golden path/to/custom.jsonl
+
+# Save results to a custom directory
+python eval/run.py --doc-id <your_doc_id> --results-dir eval/results
+```
+
+Or via Make:
+
+```bash
+DOC_ID=<your_doc_id> make eval
+```
+
+To refresh the root README scores from the latest saved result without re-running evaluation:
+
+```bash
+make update-readme
+# or directly:
+python eval/run.py --update-readme
+```
+
+### Output
+
+- **Console:** per-question table showing question, RAGAS scores, and pass/fail, followed by aggregate means.
+- **File:** `eval/results/<UTC-timestamp>.json` with full per-question and aggregate data; the same content also overwrites `eval/results/latest.json`.
+- **README.md:** the `Latest Evaluation Results` table in the root README is automatically updated with the new scores.
+
+### Expected runtime and cost
+
+| Dataset size | Approximate wall time | Approximate Gemini API cost |
+|---|---|---|
+| 5 questions | ~2 min | < $0.02 |
+| 30 questions (full) | ~10–15 min | ~$0.05–$0.15 |
+
+Cost depends on answer and context length. Each question triggers:
+- 1 DocuMind pipeline call (Gemini generation + grading)
+- 4–8 Gemini calls for RAGAS metrics (faithfulness, answer relevancy, context precision, context recall)
+
+---
+
+## Metrics
+
+| Metric | What it measures | Ideal |
+|---|---|---|
+| `faithfulness` | Are all claims in the answer supported by retrieved contexts? | → 1.0 |
+| `answer_relevancy` | Is the answer relevant to the question (not off-topic)? | → 1.0 |
+| `context_precision` | Are retrieved chunks ranked with the most relevant ones first? | → 1.0 |
+| `context_recall` | Does the retrieved context cover all the information needed for the reference answer? | → 1.0 |
+
+---
+
+## Results directory
+
+`eval/results/` stores one JSON file per evaluation run, named by UTC timestamp, plus `latest.json`, which is overwritten with the most recent run. Each file contains:
+
+```json
+{
+  "timestamp": "2025-...",
+  "doc_id": "...",
+  "golden_file": "eval/golden.jsonl",
+  "n_total": 30,
+  "n_evaluated": 30,
+  "per_question": [...],
+  "means": {
+    "faithfulness": 0.92,
+    "answer_relevancy": 0.88,
+    "context_precision": 0.85,
+    "context_recall": 0.79
+  }
+}
+```
diff --git a/eval/golden.jsonl b/eval/golden.jsonl
new file mode 100644
index 0000000..00ac617
--- /dev/null
+++ b/eval/golden.jsonl
@@ -0,0 +1,30 @@
+{"question": "What is the title of the paper introduced in this document?", "expected_answer": "The paper is titled 'Attention Is All You Need'.", "source_doc": "Attention Is All You Need", "source_page": 1, "category": "factual"}
+{"question": "Who are the authors of the paper?", "expected_answer": "The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin.", "source_doc": "Attention Is All You Need", "source_page": 1, "category": "factual"}
+{"question": "What is the name of the novel model architecture proposed in this paper?", "expected_answer": "The proposed architecture is called the Transformer, a model based solely on attention mechanisms that dispenses with recurrence and convolutions entirely.", "source_doc": "Attention Is All You Need", "source_page": 2, "category": "factual"}
+{"question": "How many encoder layers does the base Transformer model have?", "expected_answer": "The base Transformer model has N=6 stacked encoder layers.", "source_doc": "Attention Is All You Need", "source_page": 3, "category": "factual"}
+{"question": "What is the model dimension d_model used in the base Transformer?", "expected_answer": "The base model uses d_model = 512.", "source_doc": "Attention Is All You Need", "source_page": 3, "category": "factual"}
+{"question": "How many attention heads are used in the multi-head attention of the base model?", "expected_answer": "The base model uses h = 8 parallel attention heads.", "source_doc": "Attention Is All You Need", "source_page": 5, "category": "factual"}
+{"question": "What is the inner-layer dimensionality of the feed-forward network in the base model?", "expected_answer": "The inner-layer dimensionality of the position-wise feed-forward networks is d_ff = 2048.", "source_doc": "Attention Is All You Need", "source_page": 5, "category": "factual"}
+{"question": "What BLEU score did the big Transformer achieve on WMT 2014 English-to-German translation?", "expected_answer": "The big Transformer model achieved 28.4 BLEU on the WMT 2014 English-to-German translation task.", "source_doc": "Attention Is All You Need", "source_page": 8, "category": "factual"}
+{"question": "What BLEU score did the big Transformer achieve on WMT 2014 English-to-French translation?", "expected_answer": "The big Transformer model achieved 41.0 BLEU on the WMT 2014 English-to-French translation task.", "source_doc": "Attention Is All You Need", "source_page": 8, "category": "factual"}
+{"question": "What optimizer is used to train the Transformer?", "expected_answer": "The Adam optimizer is used, with beta_1 = 0.9, beta_2 = 0.98, and epsilon = 10^-9.", "source_doc": "Attention Is All You Need", "source_page": 7, "category": "factual"}
+{"question": "What is the dropout rate applied to the base Transformer model?", "expected_answer": "A dropout rate of P_drop = 0.1 is applied to the output of each sub-layer before the residual connection and layer normalization, and also to the sums of the embeddings and positional encodings in both encoder and decoder stacks.", "source_doc": "Attention Is All You Need", "source_page": 7, "category": "factual"}
+{"question": "How many warmup steps are used in the learning rate schedule?", "expected_answer": "The learning rate schedule uses warmup_steps = 4000.", "source_doc": "Attention Is All You Need", "source_page": 7, "category": "factual"}
+{"question": "What types of attention are used in the Transformer architecture?", "expected_answer": "The Transformer uses three types of attention: encoder self-attention (each position attends to all positions in the encoder), decoder self-attention (each position attends to all previous positions), and encoder-decoder attention (each decoder position attends to all encoder positions).", "source_doc": "Attention Is All You Need", "source_page": 5, "category": "factual"}
+{"question": "What is label smoothing and what value is used in the paper?", "expected_answer": "Label smoothing is a regularization technique where the model is penalized for being too confident. The paper uses label smoothing of value epsilon_ls = 0.1, which hurts perplexity as the model learns to be more unsure, but improves accuracy and BLEU score.", "source_doc": "Attention Is All You Need", "source_page": 7, "category": "factual"}
+{"question": "What dataset is used for English constituency parsing experiments?", "expected_answer": "The Wall Street Journal (WSJ) portion of the Penn Treebank is used for English constituency parsing, containing approximately 40,000 training sentences.", "source_doc": "Attention Is All You Need", "source_page": 9, "category": "factual"}
+{"question": "Why does the Transformer use positional encodings?", "expected_answer": "Since the Transformer contains no recurrence or convolution, it has no built-in sense of token order. Positional encodings are added to the input embeddings to inject information about the relative or absolute position of each token in the sequence, allowing the model to make use of sequence order.", "source_doc": "Attention Is All You Need", "source_page": 6, "category": "reasoning"}
+{"question": "Why does the paper scale the dot product by 1 over the square root of d_k in scaled dot-product attention?", "expected_answer": "For large values of d_k, the dot products grow large in magnitude, pushing the softmax function into regions with extremely small gradients. Dividing by the square root of d_k counteracts this effect, preventing vanishing gradients and stabilizing training.", "source_doc": "Attention Is All You Need", "source_page": 4, "category": "reasoning"}
+{"question": "What is the advantage of multi-head attention over single-head attention?", "expected_answer": "Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging would inhibit this. By projecting queries, keys, and values h times with different learned linear projections, the model captures richer relational structure.", "source_doc": "Attention Is All You Need", "source_page": 4, "category": "reasoning"}
+{"question": "Why does the Transformer enable more parallelization than RNN-based architectures?", "expected_answer": "RNNs must process tokens sequentially because each hidden state depends on the previous one, preventing parallel computation across positions during training. The Transformer's self-attention operates over all positions simultaneously in O(1) sequential operations per layer, enabling full parallelization within each layer.", "source_doc": "Attention Is All You Need", "source_page": 2, "category": "reasoning"}
+{"question": "What is the purpose of masking in the decoder's self-attention layer?", "expected_answer": "The masking prevents positions in the decoder from attending to subsequent positions. This preserves the auto-regressive property: when generating the output at position i, the model can only attend to known outputs at positions less than i, preventing it from seeing future tokens during training.", "source_doc": "Attention Is All You Need", "source_page": 3, "category": "reasoning"}
+{"question": "Why do the authors use sine and cosine functions for positional encodings rather than learned embeddings?", "expected_answer": "Sine and cosine functions allow the model to generalize to sequence lengths longer than those seen during training, since each position's encoding is a deterministic function of its index. They also hypothesize that fixed sinusoidal encodings allow the model to easily learn to attend by relative positions.", "source_doc": "Attention Is All You Need", "source_page": 6, "category": "reasoning"}
+{"question": "Why do the authors claim the Transformer is more efficient for long sequences than convolutional layers?", "expected_answer": "A single convolutional layer does not connect all pairs of input and output positions when the kernel width is smaller than the sequence length; multiple layers are needed to capture distant dependencies. Self-attention connects all positions in a single step with constant path length, making long-range dependencies easier to learn.", "source_doc": "Attention Is All You Need", "source_page": 6, "category": "reasoning"}
+{"question": "How does residual dropout and label smoothing together improve the model despite making individual metrics worse?", "expected_answer": "Residual dropout prevents co-adaptation of features and reduces overfitting. Label smoothing explicitly discourages the model from assigning full probability to any one token, making it more uncertain and improving calibration. While label smoothing increases perplexity (since it penalizes confident predictions), both techniques together improve translation quality as measured by BLEU because they produce better-calibrated, more generalizable models.", "source_doc": "Attention Is All You Need", "source_page": 7, "category": "reasoning"}
+{"question": "Given that the base model uses d_model=512 and h=8 attention heads, what is the dimensionality of each head's queries and keys?", "expected_answer": "Each head operates on d_k = d_v = d_model / h = 512 / 8 = 64 dimensions for both keys/queries and values.", "source_doc": "Attention Is All You Need", "source_page": 5, "category": "multi_hop"}
+{"question": "How does the learning rate formula relate warmup steps, model dimension, and training step number?", "expected_answer": "The learning rate formula is: lrate = d_model^(-0.5) * min(step_num^(-0.5), step_num * warmup_steps^(-1.5)). Using d_model=512 and warmup_steps=4000, the rate increases linearly for the first 4000 steps, then decreases proportionally to the inverse square root of the step number.", "source_doc": "Attention Is All You Need", "source_page": 7, "category": "multi_hop"}
+{"question": "How does encoder-decoder attention differ from encoder self-attention in terms of where queries, keys, and values come from?", "expected_answer": "In encoder self-attention, queries, keys, and values all come from the output of the previous encoder layer, so each position can attend to every position in the encoder. In encoder-decoder attention, the queries come from the previous decoder layer while the keys and values come from the encoder output, allowing every decoder position to attend over all positions in the input sequence.", "source_doc": "Attention Is All You Need", "source_page": 5, "category": "multi_hop"}
+{"question": "How do the training configurations of the base and big Transformer models differ, and how does this relate to their BLEU scores?", "expected_answer": "The base model uses d_model=512, h=8, d_ff=2048, dropout=0.1 and is trained for 100,000 steps with a single machine and 8 GPUs. The big model uses d_model=1024, h=16, d_ff=4096, dropout=0.3 and is trained for 300,000 steps. The big model achieves 28.4 BLEU (English-German) versus the base model's 27.3 BLEU, demonstrating that larger capacity and more training improve performance at the cost of compute.", "source_doc": "Attention Is All You Need", "source_page": 8, "category": "multi_hop"}
+{"question": "How does the Transformer's use of attention with reduced dimensionality per head compare computationally to a single-head attention with full d_model dimensions?", "expected_answer": "Multi-head attention projects the d_model=512 queries, keys, and values h=8 times into d_k=d_v=64 dimensional subspaces, performs attention in parallel on each, and concatenates. The total computational cost is similar to single-head attention at full dimensionality (since h * d_k = d_model), but gains the expressiveness of multiple independent attention functions attending to different subspaces.", "source_doc": "Attention Is All You Need", "source_page": 4, "category": "multi_hop"}
+{"question": "What was the annual revenue of Google in 2017?", "expected_answer": "I do not know based on the provided document. The paper does not contain any information about Google's financial performance.", "source_doc": "Attention Is All You Need", "source_page": -1, "category": "out_of_scope"}
+{"question": "What is the capital city of France?", "expected_answer": "I do not know based on the provided document. The paper does not discuss geography or capital cities.", "source_doc": "Attention Is All You Need", "source_page": -1, "category": "out_of_scope"}
diff --git a/eval/ragas_eval.py b/eval/ragas_eval.py
deleted file mode 100644
index ccfcbac..0000000
--- a/eval/ragas_eval.py
+++ /dev/null
@@ -1,150 +0,0 @@
-from __future__ import annotations
-
-import os
-import sys
-import warnings
-from pathlib import Path
-from typing import Any
-
-import requests
-
-# Suppress third-party deprecation noise
-warnings.filterwarnings("ignore")
-
-# Ensure project root is on sys.path when run directly
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from dotenv import load_dotenv
-load_dotenv()
-
-from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain_huggingface import HuggingFaceEmbeddings
-from ragas import EvaluationDataset, SingleTurnSample, evaluate
-from ragas.embeddings import LangchainEmbeddingsWrapper
-from ragas.llms import LangchainLLMWrapper
-from ragas.metrics import (
-    AnswerRelevancy,
-    ContextPrecision,
-    ContextRecall,
-    Faithfulness,
-)
-
-API_BASE = os.getenv("EVAL_API_BASE", "http://localhost:8000")
-
-
-def _make_llm() -> LangchainLLMWrapper:
-    api_key = os.getenv("GOOGLE_API_KEY")
-    if not api_key:
-        raise RuntimeError("GOOGLE_API_KEY is not set.")
-    return LangchainLLMWrapper(
-        ChatGoogleGenerativeAI(
-            model="gemini-2.5-flash",
-            google_api_key=api_key,
-            temperature=0,
-        )
-    )
-
-
-def _make_embeddings() -> LangchainEmbeddingsWrapper:
-    return LangchainEmbeddingsWrapper(
-        HuggingFaceEmbeddings(
-            model_name="sentence-transformers/all-MiniLM-L6-v2",
-            model_kwargs={"device": "cpu"},
-            encode_kwargs={"normalize_embeddings": True},
-        )
-    )
-
-
-def _query_backend(question: str, doc_id: str) -> dict[str, Any]:
-    """Call the /query endpoint and return answer + citation dicts."""
-    resp = requests.post(
-        f"{API_BASE}/query",
-        json={"doc_id": doc_id, "question": question},
-        timeout=120,
-    )
-    resp.raise_for_status()
-    data = resp.json()
-    return {
-        "answer": data.get("answer", ""),
-        "citations": data.get("citations", []),
-    }
-
-
-def evaluate_rag(
-    test_data: list[dict[str, str]],
-    doc_id: str | None = None,
-    verbose: bool = True,
-) -> dict[str, float]:
-    """
-    Run RAGAS evaluation over a list of test samples.
-
-    Each item in test_data must have:
-        question     – the user question
-        ground_truth – the expected answer
-        doc_id       – which document to query (overridden by the doc_id arg)
-    """
-    llm = _make_llm()
-    emb = _make_embeddings()
-
-    metrics = [
-        Faithfulness(llm=llm),
-        AnswerRelevancy(llm=llm, embeddings=emb),
-        ContextPrecision(llm=llm),
-        ContextRecall(llm=llm),
-    ]
-
-    samples: list[SingleTurnSample] = []
-
-    for i, item in enumerate(test_data, start=1):
-        q = item["question"]
-        gt = item["ground_truth"]
-        did = doc_id or item.get("doc_id", "")
-
-        if verbose:
-            print(f"  [{i}/{len(test_data)}] {q[:70]}…")
-
-        try:
-            result = _query_backend(q, did)
-            answer = result["answer"]
-            contexts = [
-                c.get("text") or c.get("ref", "")
-                for c in result["citations"]
-            ] or ["No context retrieved."]
-        except Exception as e:
-            print(f"    ! Query failed: {e}")
-            answer = ""
-            contexts = ["Query failed."]
-
-        samples.append(
-            SingleTurnSample(
-                user_input=q,
-                response=answer,
-                retrieved_contexts=contexts,
-                reference=gt,
-            )
-        )
-
-    dataset = EvaluationDataset(samples=samples)
-
-    if verbose:
-        print("\nRunning RAGAS evaluation…")
-
-    result = evaluate(
-        dataset=dataset,
-        metrics=metrics,
-        raise_exceptions=False,
-        show_progress=verbose,
-    )
-
-    scores: dict[str, float] = {}
-    for m in metrics:
-        val = result[m.name]
-        if val is None:
-            scores[m.name] = 0.0
-        elif isinstance(val, list):
-            valid = [v for v in val if v is not None]
-            scores[m.name] = round(sum(valid) / len(valid), 4) if valid else 0.0
-        else:
-            scores[m.name] = round(float(val), 4)
-
-    return scores
diff --git a/eval/results/latest.json b/eval/results/latest.json
new file mode 100644
index 0000000..74815d3
--- /dev/null
+++ b/eval/results/latest.json
@@ -0,0 +1,550 @@
+{
+  "timestamp": "2026-05-05T18:44:54.373813+00:00",
+  "doc_id": "580f792fed554919883a4a4ad2f518bc",
+  "golden_file": "/Users/robayedashraf/Downloads/documind/eval/golden.jsonl",
+  "n_total": 30,
+  "n_evaluated": 30,
+  "per_question": [
+    {
+      "index": 1,
+      "question": "What is the title of the paper introduced in this document?",
+      "category": "factual",
+      "source_page": 1,
+      "expected_answer": "The paper is titled 'Attention Is All You Need'.",
+      "generated_answer": "The title of the paper is \"Attention Is All You Need\".",
+      "retrieved_contexts": [
+        "Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit \nGoogle Brain Google Brain Google Research Google Research\navaswani@google.com noam@google.com nikip@google.com usz@google.com\nLlion Jones Aidan N. Gomez ukasz Kaiser \nGoogle Research University of Toronto Google Brain\nllion@google.com aidan@cs.toronto.edu lukaszkaiser@google.com\nIllia Polosukhin \nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8961412755237484,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 2,
+      "question": "Who are the authors of the paper?",
+      "category": "factual",
+      "source_page": 1,
+      "expected_answer": "The authors are Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin.",
+      "generated_answer": "I do not know based on the provided document. The document describes the contributions of Niki, Llion, Lukasz, and Aidan to the work, but it does not explicitly state that they are the authors of the paper.",
+      "retrieved_contexts": [
+        "attention and the parameter-free position representation and became the other person involved in nearly every\ndetail. Niki designed, implemented, tuned and evaluated countless model variants in our original codebase and\ntensor2tensor. Llion also experimented with novel model variants, was responsible for our initial codebase, and\nefficient inference and visualizations. Lukasz and Aidan spent countless long days designing various parts of and\nimplementing tensor2tensor, replacing our earlier codebase, greatly improving results and massively accelerating\nour research.\n Work performed while at Google Brain.\n Work performed while at Google Research.\n31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA.\n3202\nguA\n2\n]LC.sc[\n7v26730.6071:viXra"
+      ],
+      "scores": {
+        "faithfulness": 0.6666666666666666,
+        "answer_relevancy": 0.0,
+        "context_precision": 0.0,
+        "context_recall": 0.0
+      }
+    },
+    {
+      "index": 3,
+      "question": "What is the name of the novel model architecture proposed in this paper?",
+      "category": "factual",
+      "source_page": 2,
+      "expected_answer": "The proposed architecture is called the Transformer, a model based solely on attention mechanisms that dispenses with recurrence and convolutions entirely.",
+      "generated_answer": "The novel model architecture proposed in this paper is the Transformer.",
+      "retrieved_contexts": [
+        "are used in conjunction with a recurrent network.\nIn this work we propose the Transformer, a model architecture eschewing recurrence and instead\nrelying entirely on an attention mechanism to draw global dependencies between input and output.\nThe Transformer allows for significantly more parallelization and can reach a new state of the art in\ntranslation quality after being trained for as little as twelve hours on eight P100 GPUs.\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n[16], ByteNet [18] and ConvS2S [9], all of which use convolutional neural networks as basic building\nblock, computing hidden representations in parallel for all input and output positions. In these models,"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.9689922786860943,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 4,
+      "question": "How many encoder layers does the base Transformer model have?",
+      "category": "factual",
+      "source_page": 3,
+      "expected_answer": "The base Transformer model has N=6 stacked encoder layers.",
+      "generated_answer": "The encoder is composed of a stack of N = 6 identical layers.",
+      "retrieved_contexts": [
+        "Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two\nsub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\nwise fully connected feed-forward network. We employ a residual connection [11] around each of\nthe two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is\nLayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8050120529873822,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 5,
+      "question": "What is the model dimension d_model used in the base Transformer?",
+      "category": "factual",
+      "source_page": 3,
+      "expected_answer": "The base model uses d_model = 512.",
+      "generated_answer": "The model dimension d_model used in the base Transformer is 512. This is stated in Table 3 for the 'base' model under the 'd_model' column, and also mentioned as \"all sub-layers in the model, as well as the embedding layers, produce outputs of dimension d = 512\" and \"The positional encodings have the same dimension d_model as the embeddings\".",
+      "retrieved_contexts": [
+        "Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base\nmodel. All metrics are on the English-to-German translation development set, newstest2013. Listed\nperplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to\nper-word perplexities.\ntrain PPL BLEU params\nN d d h d d P \nmodel ff k v drop ls steps (dev) (dev) 106\nbase 6 512 2048 8 64 64 0.1 0.1 100K 4.92 25.8 65\n1 512 512 5.29 24.9\n4 128 128 5.00 25.5\n(A)\n16 32 32 4.91 25.8\n32 16 16 5.01 25.4\n16 5.16 25.1 58\n(B)\n32 5.01 25.4 60\n2 6.11 23.7 36\n4 5.19 25.3 50\n8 4.88 25.5 80\n(C) 256 32 32 5.75 24.5 28\n1024 128 128 4.66 26.0 168\n1024 5.12 25.4 53\n4096 4.75 26.2 90\n0.0 5.77 24.6\n0.2 4.95 25.5\n(D)\n0.0 4.67 25.3\n0.2 5.47 25.7",
+        "LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer\nitself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding\nlayers, produce outputs of dimension d = 512.\nmodel\nDecoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two\nsub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head\nattention over the output of the encoder stack. Similar to the encoder, we employ residual connections\naround each of the sub-layers, followed by layer normalization. We also modify the self-attention\nsub-layer in the decoder stack to prevent positions from attending to subsequent positions. This",
+        "tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the\nbottoms of the encoder and decoder stacks. The positional encodings have the same dimension d\nmodel\nas the embeddings, so that the two can be summed. There are many choices of positional encodings,\nlearned and fixed [9].\nIn this work, we use sine and cosine functions of different frequencies:\nPE = sin(pos/100002i/d model)\n(pos,2i)\nPE = cos(pos/100002i/d model)\n(pos,2i+1)\nwhere pos is the position and i is the dimension. That is, each dimension of the positional encoding\ncorresponds to a sinusoid. The wavelengths form a geometric progression from 2 to 10000 2 . We\nchose this function because we hypothesized it would allow the model to easily learn to attend by"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.9999999999999997,
+        "context_precision": 0.99999999995,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 6,
+      "question": "How many attention heads are used in the multi-head attention of the base model?",
+      "category": "factual",
+      "source_page": 5,
+      "expected_answer": "The base model uses h = 8 parallel attention heads.",
+      "generated_answer": "In this work, h = 8 parallel attention layers, or heads, are employed.",
+      "retrieved_contexts": [
+        "output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q,K,V ) = Concat(head ,...,head )WO\n1 h\nwhere head = Attention(QW Q ,KWK,V WV )\ni i i i\nWhere the projections are parameter matrices W Q Rd model d k, WK Rd model d k, WV Rd model d v\ni i i\nand WO Rhd v d model.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\nd = d = d /h = 64. Due to the reduced dimension of each head, the total computational cost\nk v model\nis similar to that of single-head attention with full dimensionality."
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8016468544613459,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 7,
+      "question": "What is the inner-layer dimensionality of the feed-forward network in the base model?",
+      "category": "factual",
+      "source_page": 5,
+      "expected_answer": "The inner-layer dimensionality of the position-wise feed-forward networks is d_ff = 2048.",
+      "generated_answer": "The inner-layer has dimensionality d = 2048.",
+      "retrieved_contexts": [
+        "In addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully\nconnected feed-forward network, which is applied to each position separately and identically. This\nconsists of two linear transformations with a ReLU activation in between.\nFFN(x) = max(0,xW + b )W + b (2)\n1 1 2 2\nWhile the linear transformations are the same across different positions, they use different parameters\nfrom layer to layer. Another way of describing this is as two convolutions with kernel size 1.\nThe dimensionality of input and output is d = 512, and the inner-layer has dimensionality\nmodel\nd = 2048.\nff\n3.4 Embeddings and Softmax\nSimilarly to other sequence transduction models, we use learned embeddings to convert the input"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.653870911386084,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 8,
+      "question": "What BLEU score did the big Transformer achieve on WMT 2014 English-to-German translation?",
+      "category": "factual",
+      "source_page": 8,
+      "expected_answer": "The big Transformer model achieved 28.4 BLEU on the WMT 2014 English-to-German translation task.",
+      "generated_answer": "The big Transformer model achieved a BLEU score of 28.4 on the WMT 2014 English-to-German translation task. This is stated as \"establishing a new state-of-the-art BLEU score of 28.4\" and is also listed in Table 2 under \"Transformer (big)\" for EN-DE as \"28.4\".",
+      "retrieved_contexts": [
+        "in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0\nBLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is\nlisted in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model\nsurpasses all previously published models and ensembles, at a fraction of the training cost of any of\nthe competitive models.\nOn the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0,\noutperforming all of the previously published single models, at less than 1/4 the training cost of the\nprevious state-of-the-art model. The Transformer (big) model trained for English-to-French used\ndropout rate P = 0.1, instead of 0.3.\ndrop",
+        "convolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-\nto-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establishes a new single-model state-of-the-art BLEU score of 41.8 after",
+        "Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the\nEnglish-to-German and English-to-French newstest2014 tests at a fraction of the training cost.\nBLEU Training Cost (FLOPs)\nModel\nEN-DE EN-FR EN-DE EN-FR\nByteNet [18] 23.75\nDeep-Att + PosUnk [39] 39.2 1.0 1020\nGNMT + RL [38] 24.6 39.92 2.3 1019 1.4 1020\nConvS2S [9] 25.16 40.46 9.6 1018 1.5 1020\nMoE [32] 26.03 40.56 2.0 1019 1.2 1020\nDeep-Att + PosUnk Ensemble [39] 40.4 8.0 1020\nGNMT + RL Ensemble [38] 26.30 41.16 1.8 1020 1.1 1021\nConvS2S Ensemble [9] 26.36 41.29 7.7 1019 1.2 1021\nTransformer (base model) 27.3 38.1 3.3 1018\nTransformer (big) 28.4 41.8 2.3 1019\nResidual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the",
+        "Residual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the\nsub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the\npositional encodings in both the encoder and decoder stacks. For the base model, we use a rate of\nP = 0.1.\ndrop\nLabel Smoothing During training, we employed label smoothing of value = 0.1 [36]. This\nls\nhurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.\n6 Results\n6.1 Machine Translation\nOn the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)\nin Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.9616188500385624,
+        "context_precision": 0.5833333333041666,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 9,
+      "question": "What BLEU score did the big Transformer achieve on WMT 2014 English-to-French translation?",
+      "category": "factual",
+      "source_page": 8,
+      "expected_answer": "The big Transformer model achieved 41.0 BLEU on the WMT 2014 English-to-French translation task.",
+      "generated_answer": "The big Transformer achieved a BLEU score of 41.8 on the WMT 2014 English-to-French translation task. This is stated in Table 2, where \"Transformer (big)\" is listed with \"41.8\" under the \"EN-FR\" column. Additionally, the context mentions, \"On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8\".",
+      "retrieved_contexts": [
+        "in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0\nBLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is\nlisted in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model\nsurpasses all previously published models and ensembles, at a fraction of the training cost of any of\nthe competitive models.\nOn the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0,\noutperforming all of the previously published single models, at less than 1/4 the training cost of the\nprevious state-of-the-art model. The Transformer (big) model trained for English-to-French used\ndropout rate P = 0.1, instead of 0.3.\ndrop",
+        "convolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-\nto-German translation task, improving over the existing best results, including\nensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,\nour model establishes a new single-model state-of-the-art BLEU score of 41.8 after",
+        "Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the\nEnglish-to-German and English-to-French newstest2014 tests at a fraction of the training cost.\nBLEU Training Cost (FLOPs)\nModel\nEN-DE EN-FR EN-DE EN-FR\nByteNet [18] 23.75\nDeep-Att + PosUnk [39] 39.2 1.0 1020\nGNMT + RL [38] 24.6 39.92 2.3 1019 1.4 1020\nConvS2S [9] 25.16 40.46 9.6 1018 1.5 1020\nMoE [32] 26.03 40.56 2.0 1019 1.2 1020\nDeep-Att + PosUnk Ensemble [39] 40.4 8.0 1020\nGNMT + RL Ensemble [38] 26.30 41.16 1.8 1020 1.1 1021\nConvS2S Ensemble [9] 26.36 41.29 7.7 1019 1.2 1021\nTransformer (base model) 27.3 38.1 3.3 1018\nTransformer (big) 28.4 41.8 2.3 1019\nResidual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.9841596418029037,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 10,
+      "question": "What optimizer is used to train the Transformer?",
+      "category": "factual",
+      "source_page": 7,
+      "expected_answer": "The Adam optimizer is used, with beta_1 = 0.9, beta_2 = 0.98, and epsilon = 10^-9.",
+      "generated_answer": "The Adam optimizer is used.",
+      "retrieved_contexts": [
+        "(3.5 days).\n5.3 Optimizer\nWe used the Adam optimizer [20] with = 0.9, = 0.98 and = 10 9. We varied the learning\n1 2\nrate over the course of training, according to the formula:\nlrate = d 0.5 min(step_num 0.5,step_num warmup_steps 1.5) (3)\nmodel\nThis corresponds to increasing the learning rate linearly for the first warmup_steps training steps,\nand decreasing it thereafter proportionally to the inverse square root of the step number. We used\nwarmup_steps = 4000.\n5.4 Regularization\nWe employ three types of regularization during training:\n7"
+      ],
+      "scores": {
+        "faithfulness": 0.0,
+        "answer_relevancy": 0.6449751646820833,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 11,
+      "question": "What is the dropout rate applied to the base Transformer model?",
+      "category": "factual",
+      "source_page": 7,
+      "expected_answer": "A dropout rate of P_drop = 0.1 is applied to the output of each sub-layer before the residual connection and layer normalization, and also to the sums of the embeddings and positional encodings in both encoder and decoder stacks.",
+      "generated_answer": "For the base model, a dropout rate of P = 0.1 is used.",
+      "retrieved_contexts": [
+        "Residual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the\nsub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the\npositional encodings in both the encoder and decoder stacks. For the base model, we use a rate of\nP = 0.1.\ndrop\nLabel Smoothing During training, we employed label smoothing of value = 0.1 [36]. This\nls\nhurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.\n6 Results\n6.1 Machine Translation\nOn the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)\nin Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0",
+        "dropout rate P = 0.1, instead of 0.3.\ndrop\nFor the base models, we used a single model obtained by averaging the last 5 checkpoints, which\nwere written at 10-minute intervals. For the big models, we averaged the last 20 checkpoints. We\nused beam search with a beam size of 4 and length penalty = 0.6 [38]. These hyperparameters\nwere chosen after experimentation on the development set. We set the maximum output length during\ninference to input length + 50, but terminate early when possible [38].\nTable 2 summarizes our results and compares our translation quality and training costs to other model\narchitectures from the literature. We estimate the number of floating point operations used to train a"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8293357093582051,
+        "context_precision": 0.99999999995,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 12,
+      "question": "How many warmup steps are used in the learning rate schedule?",
+      "category": "factual",
+      "source_page": 7,
+      "expected_answer": "The learning rate schedule uses warmup_steps = 4000.",
+      "generated_answer": "We used warmup_steps = 4000.",
+      "retrieved_contexts": [
+        "(3.5 days).\n5.3 Optimizer\nWe used the Adam optimizer [20] with = 0.9, = 0.98 and = 10 9. We varied the learning\n1 2\nrate over the course of training, according to the formula:\nlrate = d 0.5 min(step_num 0.5,step_num warmup_steps 1.5) (3)\nmodel\nThis corresponds to increasing the learning rate linearly for the first warmup_steps training steps,\nand decreasing it thereafter proportionally to the inverse square root of the step number. We used\nwarmup_steps = 4000.\n5.4 Regularization\nWe employ three types of regularization during training:\n7"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.5958444664164306,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 13,
+      "question": "What types of attention are used in the Transformer architecture?",
+      "category": "factual",
+      "source_page": 5,
+      "expected_answer": "The Transformer uses three types of attention: encoder self-attention (each position attends to all positions in the encoder), decoder self-attention (each position attends to all previous positions), and encoder-decoder attention (each decoder position attends to all encoder positions).",
+      "generated_answer": "The Transformer uses multi-head attention in three different ways: \"encoder-decoder attention\" layers, self-attention layers in the encoder, and self-attention in the decoder. Self-attention is sometimes called intra-attention.",
+      "retrieved_contexts": [
+        "k v model\nis similar to that of single-head attention with full dimensionality.\n3.2.3 Applications of Attention in our Model\nThe Transformer uses multi-head attention in three different ways:\n In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer,\nand the memory keys and values come from the output of the encoder. This allows every\nposition in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in sequence-to-sequence models such as\n[38, 2, 9].\n The encoder contains self-attention layers. In a self-attention layer all of the keys, values\nand queries come from the same place, in this case, the output of the previous layer in the",
+        "In this work, we presented the Transformer, the first sequence transduction model based entirely on\nattention, replacing the recurrent layers most commonly used in encoder-decoder architectures with\nmulti-headed self-attention.\nFor translation tasks, the Transformer can be trained significantly faster than architectures based\non recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014\nEnglish-to-French translation tasks, we achieve a new state of the art. In the former task our best\nmodel outperforms even all previously reported ensembles.\nWe are excited about the future of attention-based models and plan to apply them to other tasks. We\nplan to extend the Transformer to problems involving input and output modalities other than text and",
+        "Figure 1: The Transformer - model architecture.\nThe Transformer follows this overall architecture using stacked self-attention and point-wise, fully\nconnected layers for both the encoder and decoder, shown in the left and right halves of Figure 1,\nrespectively.\n3.1 Encoder and Decoder Stacks\nEncoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two\nsub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-\nwise fully connected feed-forward network. We employ a residual connection [11] around each of\nthe two sub-layers, followed by layer normalization [1]. That is, the output of each sub-layer is\nLayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer",
+        "the number of operations required to relate signals from two arbitrary input or output positions grows\nin the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes\nit more difficult to learn dependencies between distant positions [12]. In the Transformer this is\nreduced to a constant number of operations, albeit at the cost of reduced effective resolution due\nto averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as\ndescribed in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been"
+      ],
+      "scores": {
+        "faithfulness": 0.8333333333333334,
+        "answer_relevancy": 0.7749264127909857,
+        "context_precision": 0.0,
+        "context_recall": 0.0
+      }
+    },
+    {
+      "index": 14,
+      "question": "What is label smoothing and what value is used in the paper?",
+      "category": "factual",
+      "source_page": 7,
+      "expected_answer": "Label smoothing is a regularization technique where the model is penalized for being too confident. The paper uses label smoothing of value epsilon_ls = 0.1, which hurts perplexity as the model learns to be more unsure, but improves accuracy and BLEU score.",
+      "generated_answer": "Label smoothing is a technique employed during training that has a value of 0.1. It hurts perplexity because the model learns to be more unsure, but it improves accuracy and BLEU score.",
+      "retrieved_contexts": [
+        "Residual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the\nsub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the\npositional encodings in both the encoder and decoder stacks. For the base model, we use a rate of\nP = 0.1.\ndrop\nLabel Smoothing During training, we employed label smoothing of value = 0.1 [36]. This\nls\nhurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.\n6 Results\n6.1 Machine Translation\nOn the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)\nin Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0",
+        "Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base\nmodel. All metrics are on the English-to-German translation development set, newstest2013. Listed\nperplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to\nper-word perplexities.\ntrain PPL BLEU params\nN d d h d d P \nmodel ff k v drop ls steps (dev) (dev) 106\nbase 6 512 2048 8 64 64 0.1 0.1 100K 4.92 25.8 65\n1 512 512 5.29 24.9\n4 128 128 5.00 25.5\n(A)\n16 32 32 4.91 25.8\n32 16 16 5.01 25.4\n16 5.16 25.1 58\n(B)\n32 5.01 25.4 60\n2 6.11 23.7 36\n4 5.19 25.3 50\n8 4.88 25.5 80\n(C) 256 32 32 5.75 24.5 28\n1024 128 128 4.66 26.0 168\n1024 5.12 25.4 53\n4096 4.75 26.2 90\n0.0 5.77 24.6\n0.2 4.95 25.5\n(D)\n0.0 4.67 25.3\n0.2 5.47 25.7"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.820428300246485,
+        "context_precision": 0.9999999999,
+        "context_recall": 0.5
+      }
+    },
+    {
+      "index": 15,
+      "question": "What dataset is used for English constituency parsing experiments?",
+      "category": "factual",
+      "source_page": 9,
+      "expected_answer": "The Wall Street Journal (WSJ) portion of the Penn Treebank is used for English constituency parsing, containing approximately 40,000 training sentences.",
+      "generated_answer": "The results for English constituency parsing are on Section 23 of WSJ.",
+      "retrieved_contexts": [
+        "Table 4: The Transformer generalizes well to English constituency parsing (Results are on Section 23\nof WSJ)\nParser Training WSJ 23 F1\nVinyals & Kaiser el al. (2014) [37] WSJ only, discriminative 88.3\nPetrov et al. (2006) [29] WSJ only, discriminative 90.4\nZhu et al. (2013) [40] WSJ only, discriminative 90.4\nDyer et al. (2016) [8] WSJ only, discriminative 91.7\nTransformer (4 layers) WSJ only, discriminative 91.3\nZhu et al. (2013) [40] semi-supervised 91.3\nHuang & Harper (2009) [14] semi-supervised 91.3\nMcClosky et al. (2006) [26] semi-supervised 92.1\nVinyals & Kaiser el al. (2014) [37] semi-supervised 92.1\nTransformer (4 layers) semi-supervised 92.7\nLuong et al. (2015) [23] multi-task 93.0\nDyer et al. (2016) [8] generative 93.3",
+        "[25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini. Building a large annotated\ncorpus of english: The penn treebank. Computational linguistics, 19(2):313 330, 1993.\n[26] David McClosky, Eugene Charniak, and Mark Johnson. Effective self-training for parsing. In\nProceedings of the Human Language Technology Conference of the NAACL, Main Conference,\npages 152 159. ACL, June 2006.\n[27] Ankur Parikh, Oscar Ta ckstro m, Dipanjan Das, and Jakob Uszkoreit. A decomposable attention\nmodel. In Empirical Methods in Natural Language Processing, 2016.\n[28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive\nsummarization. arXiv preprint arXiv:1705.04304, 2017."
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.7599545742379277,
+        "context_precision": 0.99999999995,
+        "context_recall": 0.0
+      }
+    },
+    {
+      "index": 16,
+      "question": "Why does the Transformer use positional encodings?",
+      "category": "reasoning",
+      "source_page": 6,
+      "expected_answer": "Since the Transformer contains no recurrence or convolution, it has no built-in sense of token order. Positional encodings are added to the input embeddings to inject information about the relative or absolute position of each token in the sequence, allowing the model to make use of sequence order.",
+      "generated_answer": "I could not find a reliable answer in the document after multiple attempts. Please try rephrasing your question or check that the document has been indexed.",
+      "retrieved_contexts": [
+        "tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the\nbottoms of the encoder and decoder stacks. The positional encodings have the same dimension d\nmodel\nas the embeddings, so that the two can be summed. There are many choices of positional encodings,\nlearned and fixed [9].\nIn this work, we use sine and cosine functions of different frequencies:\nPE = sin(pos/100002i/d model)\n(pos,2i)\nPE = cos(pos/100002i/d model)\n(pos,2i+1)\nwhere pos is the position and i is the dimension. That is, each dimension of the positional encoding\ncorresponds to a sinusoid. The wavelengths form a geometric progression from 2 to 10000 2 . We\nchose this function because we hypothesized it would allow the model to easily learn to attend by"
+      ],
+      "scores": {
+        "faithfulness": 0.0,
+        "answer_relevancy": 0.0,
+        "context_precision": 0.0,
+        "context_recall": 0.5
+      }
+    },
+    {
+      "index": 17,
+      "question": "Why does the paper scale the dot product by 1 over the square root of d_k in scaled dot-product attention?",
+      "category": "reasoning",
+      "source_page": 4,
+      "expected_answer": "For large values of d_k, the dot products grow large in magnitude, pushing the softmax function into regions with extremely small gradients. Dividing by the square root of d_k counteracts this effect, preventing vanishing gradients and stabilizing training.",
+      "generated_answer": "The paper scales the dot products by 1 over the square root of d_k to counteract the effect of dot products growing large in magnitude for larger values of d_k. This large magnitude can push the softmax function into regions where it has extremely small gradients.",
+      "retrieved_contexts": [
+        "matrix multiplication code.\nWhile for small values of d the two mechanisms perform similarly, additive attention outperforms\nk\ndot product attention without scaling for larger values of d [3]. We suspect that for large values of\nk\nd , the dot products grow large in magnitude, pushing the softmax function into regions where it has\nk\nextremely small gradients 4. To counteract this effect, we scale the dot products by 1 .\nd\nk\n3.2.2 Multi-Head Attention\nInstead of performing a single attention function with d -dimensional keys, values and queries,\nmodel\nwe found it beneficial to linearly project the queries, keys and values h times with different, learned\nlinear projections to d , d and d dimensions, respectively. On each of these projected versions of\nk k v",
+        "k k v\nqueries, keys and values we then perform the attention function in parallel, yielding d -dimensional\nv\n4To illustrate why the dot products get large, assume that the components of q and k are independent random\nvariables with mean 0 and variance 1. Then their dot product, q k =\n(cid:80)d\nk q k , has mean 0 and variance d .\ni=1 i i k\n4"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.9414094078633105,
+        "context_precision": 0.99999999995,
+        "context_recall": 0.5
+      }
+    },
+    {
+      "index": 18,
+      "question": "What is the advantage of multi-head attention over single-head attention?",
+      "category": "reasoning",
+      "source_page": 4,
+      "expected_answer": "Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging would inhibit this. By projecting queries, keys, and values h times with different learned linear projections, the model captures richer relational structure.",
+      "generated_answer": "Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. With a single attention head, averaging inhibits this.",
+      "retrieved_contexts": [
+        "output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q,K,V ) = Concat(head ,...,head )WO\n1 h\nwhere head = Attention(QW Q ,KWK,V WV )\ni i i i\nWhere the projections are parameter matrices W Q Rd model d k, WK Rd model d k, WV Rd model d v\ni i i\nand WO Rhd v d model.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\nd = d = d /h = 64. Due to the reduced dimension of each head, the total computational cost\nk v model\nis similar to that of single-head attention with full dimensionality."
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8151053731082909,
+        "context_precision": 0.9999999999,
+        "context_recall": 0.6666666666666666
+      }
+    },
+    {
+      "index": 19,
+      "question": "Why does the Transformer enable more parallelization than RNN-based architectures?",
+      "category": "reasoning",
+      "source_page": 2,
+      "expected_answer": "RNNs must process tokens sequentially because each hidden state depends on the previous one, preventing parallel computation across positions during training. The Transformer's self-attention operates over all positions simultaneously in O(1) sequential operations per layer, enabling full parallelization within each layer.",
+      "generated_answer": "The Transformer allows for significantly more parallelization because it is a model architecture that eschews recurrence and instead relies entirely on an attention mechanism to draw global dependencies between input and output. It computes representations of its input and output without using sequence-aligned recurrence.",
+      "retrieved_contexts": [
+        "are used in conjunction with a recurrent network.\nIn this work we propose the Transformer, a model architecture eschewing recurrence and instead\nrelying entirely on an attention mechanism to draw global dependencies between input and output.\nThe Transformer allows for significantly more parallelization and can reach a new state of the art in\ntranslation quality after being trained for as little as twelve hours on eight P100 GPUs.\n2 Background\nThe goal of reducing sequential computation also forms the foundation of the Extended Neural GPU\n[16], ByteNet [18] and ConvS2S [9], all of which use convolutional neural networks as basic building\nblock, computing hidden representations in parallel for all input and output positions. In these models,",
+        "of a single sequence in order to compute a representation of the sequence. Self-attention has been\nused successfully in a variety of tasks including reading comprehension, abstractive summarization,\ntextual entailment and learning task-independent sentence representations [4, 27, 28, 22].\nEnd-to-end memory networks are based on a recurrent attention mechanism instead of sequence-\naligned recurrence and have been shown to perform well on simple-language question answering and\nlanguage modeling tasks [34].\nTo the best of our knowledge, however, the Transformer is the first transduction model relying\nentirely on self-attention to compute representations of its input and output without using sequence-"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.7626294015280158,
+        "context_precision": 0.49999999995,
+        "context_recall": 0.5
+      }
+    },
+    {
+      "index": 20,
+      "question": "What is the purpose of masking in the decoder's self-attention layer?",
+      "category": "reasoning",
+      "source_page": 3,
+      "expected_answer": "The masking prevents positions in the decoder from attending to subsequent positions. This preserves the auto-regressive property: when generating the output at position i, the model can only attend to known outputs at positions less than i, preventing it from seeing future tokens during training.",
+      "generated_answer": "The purpose of masking in the decoder's self-attention layer is to prevent leftward information flow in the decoder to preserve the auto-regressive property. This is implemented by masking out (setting to ) all values in the input of the softmax which correspond to illegal connections. This modification prevents positions from attending to subsequent positions, ensuring that predictions for position i can depend only on the known outputs at positions less than i.",
+      "retrieved_contexts": [
+        "and queries come from the same place, in this case, the output of the previous layer in the\nencoder. Each position in the encoder can attend to all positions in the previous layer of the\nencoder.\n Similarly, self-attention layers in the decoder allow each position in the decoder to attend to\nall positions in the decoder up to and including that position. We need to prevent leftward\ninformation flow in the decoder to preserve the auto-regressive property. We implement this\ninside of scaled dot-product attention by masking out (setting to ) all values in the input\nof the softmax which correspond to illegal connections. See Figure 2.\n3.3 Position-wise Feed-Forward Networks\nIn addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully",
+        "sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This\nmasking, combined with fact that the output embeddings are offset by one position, ensures that the\npredictions for position i can depend only on the known outputs at positions less than i.\n3.2 Attention\nAn attention function can be described as mapping a query and a set of key-value pairs to an output,\nwhere the query, keys, values, and output are all vectors. The output is computed as a weighted sum\n3",
+        "LayerNorm(x + Sublayer(x)), where Sublayer(x) is the function implemented by the sub-layer\nitself. To facilitate these residual connections, all sub-layers in the model, as well as the embedding\nlayers, produce outputs of dimension d = 512.\nmodel\nDecoder: The decoder is also composed of a stack of N = 6 identical layers. In addition to the two\nsub-layers in each encoder layer, the decoder inserts a third sub-layer, which performs multi-head\nattention over the output of the encoder stack. Similar to the encoder, we employ residual connections\naround each of the sub-layers, followed by layer normalization. We also modify the self-attention\nsub-layer in the decoder stack to prevent positions from attending to subsequent positions. This"
+      ],
+      "scores": {
+        "faithfulness": 0.8333333333333334,
+        "answer_relevancy": 1.0,
+        "context_precision": 0.9999999999666667,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 21,
+      "question": "Why do the authors use sine and cosine functions for positional encodings rather than learned embeddings?",
+      "category": "reasoning",
+      "source_page": 6,
+      "expected_answer": "Sine and cosine functions allow the model to generalize to sequence lengths longer than those seen during training, since each position's encoding is a deterministic function of its index. They also hypothesize that fixed sinusoidal encodings allow the model to easily learn to attend by relative positions.",
+      "generated_answer": "The authors chose the sinusoidal version of positional encodings because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training. They also experimented with learned positional embeddings and found that the two versions produced nearly identical results, as observed in Table 3 row (E).",
+      "retrieved_contexts": [
+        "tokens in the sequence. To this end, we add \"positional encodings\" to the input embeddings at the\nbottoms of the encoder and decoder stacks. The positional encodings have the same dimension d\nmodel\nas the embeddings, so that the two can be summed. There are many choices of positional encodings,\nlearned and fixed [9].\nIn this work, we use sine and cosine functions of different frequencies:\nPE = sin(pos/100002i/d model)\n(pos,2i)\nPE = cos(pos/100002i/d model)\n(pos,2i+1)\nwhere pos is the position and i is the dimension. That is, each dimension of the positional encoding\ncorresponds to a sinusoid. The wavelengths form a geometric progression from 2 to 10000 2 . We\nchose this function because we hypothesized it would allow the model to easily learn to attend by",
+        "chose this function because we hypothesized it would allow the model to easily learn to attend by\nrelative positions, since for any fixed offset k, PE can be represented as a linear function of\npos+k\nPE .\npos\nWe also experimented with using learned positional embeddings [9] instead, and found that the two\nversions produced nearly identical results (see Table 3 row (E)). We chose the sinusoidal version\nbecause it may allow the model to extrapolate to sequence lengths longer than the ones encountered\nduring training.\n4 Why Self-Attention\nIn this section we compare various aspects of self-attention layers to the recurrent and convolu-\ntional layers commonly used for mapping one variable-length sequence of symbol representations",
+        "In Table 3 rows (B), we observe that reducing the attention key size d hurts model quality. This\nk\nsuggests that determining compatibility is not easy and that a more sophisticated compatibility\nfunction than dot product may be beneficial. We further observe in rows (C) and (D) that, as expected,\nbigger models are better, and dropout is very helpful in avoiding over-fitting. In row (E) we replace our\nsinusoidal positional encoding with learned positional embeddings [9], and observe nearly identical\nresults to the base model.\n6.3 English Constituency Parsing\nTo evaluate if the Transformer can generalize to other tasks we performed experiments on English\nconstituency parsing. This task presents specific challenges: the output is subject to strong structural"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.7538105551089143,
+        "context_precision": 0.99999999995,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 22,
+      "question": "Why do the authors claim the Transformer is more efficient for long sequences than convolutional layers?",
+      "category": "reasoning",
+      "source_page": 6,
+      "expected_answer": "A single convolutional layer does not connect all pairs of input and output positions when the kernel width is smaller than the sequence length; multiple layers are needed to capture distant dependencies. Self-attention connects all positions in a single step with constant path length, making long-range dependencies easier to learn.",
+      "generated_answer": "The authors claim that in the Transformer, the number of operations required to relate signals from two arbitrary input or output positions is reduced to a constant number of operations. In contrast, for convolutional layers like ConvS2S, this number grows linearly with the distance between positions, and logarithmically for ByteNet, which makes it more difficult to learn dependencies between distant positions.",
+      "retrieved_contexts": [
+        "length n is smaller than the representation dimensionality d, which is most often the case with\nsentence representations used by state-of-the-art models in machine translations, such as word-piece\n[38] and byte-pair [31] representations. To improve computational performance for tasks involving\nvery long sequences, self-attention could be restricted to considering only a neighborhood of size r in\nthe input sequence centered around the respective output position. This would increase the maximum\npath length to O(n/r). We plan to investigate this approach further in future work.\nA single convolutional layer with kernel width k < n does not connect all pairs of input and output\npositions. Doing so requires a stack of O(n/k) convolutional layers in the case of contiguous kernels,",
+        "the number of operations required to relate signals from two arbitrary input or output positions grows\nin the distance between positions, linearly for ConvS2S and logarithmically for ByteNet. This makes\nit more difficult to learn dependencies between distant positions [12]. In the Transformer this is\nreduced to a constant number of operations, albeit at the cost of reduced effective resolution due\nto averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as\ndescribed in section 3.2.\nSelf-attention, sometimes called intra-attention is an attention mechanism relating different positions\nof a single sequence in order to compute a representation of the sequence. Self-attention has been"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.6646183990381221,
+        "context_precision": 0.99999999995,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 23,
+      "question": "How does residual dropout and label smoothing together improve the model despite making individual metrics worse?",
+      "category": "reasoning",
+      "source_page": 7,
+      "expected_answer": "Residual dropout prevents co-adaptation of features and reduces overfitting. Label smoothing explicitly discourages the model from assigning full probability to any one token, making it more uncertain and improving calibration. While label smoothing increases perplexity (since it penalizes confident predictions), both techniques together improve translation quality as measured by BLEU because they produce better-calibrated, more generalizable models.",
+      "generated_answer": "Label smoothing hurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score. The provided document does not state that residual dropout makes individual metrics worse. The document does not explain how residual dropout and label smoothing together improve the model despite making individual metrics worse.",
+      "retrieved_contexts": [
+        "Residual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the\nsub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the\npositional encodings in both the encoder and decoder stacks. For the base model, we use a rate of\nP = 0.1.\ndrop\nLabel Smoothing During training, we employed label smoothing of value = 0.1 [36]. This\nls\nhurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.\n6 Results\n6.1 Machine Translation\nOn the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)\nin Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0",
+        "Table 3: Variations on the Transformer architecture. Unlisted values are identical to those of the base\nmodel. All metrics are on the English-to-German translation development set, newstest2013. Listed\nperplexities are per-wordpiece, according to our byte-pair encoding, and should not be compared to\nper-word perplexities.\ntrain PPL BLEU params\nN d d h d d P \nmodel ff k v drop ls steps (dev) (dev) 106\nbase 6 512 2048 8 64 64 0.1 0.1 100K 4.92 25.8 65\n1 512 512 5.29 24.9\n4 128 128 5.00 25.5\n(A)\n16 32 32 4.91 25.8\n32 16 16 5.01 25.4\n16 5.16 25.1 58\n(B)\n32 5.01 25.4 60\n2 6.11 23.7 36\n4 5.19 25.3 50\n8 4.88 25.5 80\n(C) 256 32 32 5.75 24.5 28\n1024 128 128 4.66 26.0 168\n1024 5.12 25.4 53\n4096 4.75 26.2 90\n0.0 5.77 24.6\n0.2 4.95 25.5\n(D)\n0.0 4.67 25.3\n0.2 5.47 25.7"
+      ],
+      "scores": {
+        "faithfulness": 0.8571428571428571,
+        "answer_relevancy": 0.0,
+        "context_precision": 0.0,
+        "context_recall": 0.3333333333333333
+      }
+    },
+    {
+      "index": 24,
+      "question": "Given that the base model uses d_model=512 and h=8 attention heads, what is the dimensionality of each head's queries and keys?",
+      "category": "multi_hop",
+      "source_page": 5,
+      "expected_answer": "Each head operates on d_k = d_v = d_model / h = 512 / 8 = 64 dimensions for both keys/queries and values.",
+      "generated_answer": "The dimensionality of each head's queries and keys is d_k = 64, as stated by \"For each of these we use d = d = d /h = 64.\"",
+      "retrieved_contexts": [
+        "output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q,K,V ) = Concat(head ,...,head )WO\n1 h\nwhere head = Attention(QW Q ,KWK,V WV )\ni i i i\nWhere the projections are parameter matrices W Q Rd model d k, WK Rd model d k, WV Rd model d v\ni i i\nand WO Rhd v d model.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\nd = d = d /h = 64. Due to the reduced dimension of each head, the total computational cost\nk v model\nis similar to that of single-head attention with full dimensionality."
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8169825113918759,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 25,
+      "question": "How does the learning rate formula relate warmup steps, model dimension, and training step number?",
+      "category": "multi_hop",
+      "source_page": 7,
+      "expected_answer": "The learning rate formula is: lrate = d_model^(-0.5) * min(step_num^(-0.5), step_num * warmup_steps^(-1.5)). Using d_model=512 and warmup_steps=4000, the rate increases linearly for the first 4000 steps, then decreases proportionally to the inverse square root of the step number.",
+      "generated_answer": "The learning rate formula is given as: lrate = d_model^-0.5 * min(step_num^-0.5, step_num * warmup_steps^-1.5).\n\nThis formula relates them by:\n- The learning rate is proportional to the inverse square root of the model dimension (d_model^-0.5).\n- It is also proportional to the minimum of two terms involving the training step number (step_num) and warmup steps (warmup_steps).\n- Specifically, it increases the learning rate linearly for the first warmup_steps training steps.\n- After the warmup_steps, it decreases the learning rate proportionally to the inverse square root of the step number.",
+      "retrieved_contexts": [
+        "(3.5 days).\n5.3 Optimizer\nWe used the Adam optimizer [20] with = 0.9, = 0.98 and = 10 9. We varied the learning\n1 2\nrate over the course of training, according to the formula:\nlrate = d 0.5 min(step_num 0.5,step_num warmup_steps 1.5) (3)\nmodel\nThis corresponds to increasing the learning rate linearly for the first warmup_steps training steps,\nand decreasing it thereafter proportionally to the inverse square root of the step number. We used\nwarmup_steps = 4000.\n5.4 Regularization\nWe employ three types of regularization during training:\n7"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.9411763776979067,
+        "context_precision": 0.9999999999,
+        "context_recall": 0.5
+      }
+    },
+    {
+      "index": 26,
+      "question": "How does encoder-decoder attention differ from encoder self-attention in terms of where queries, keys, and values come from?",
+      "category": "multi_hop",
+      "source_page": 5,
+      "expected_answer": "In encoder self-attention, queries, keys, and values all come from the output of the previous encoder layer, so each position can attend to every position in the encoder. In encoder-decoder attention, the queries come from the previous decoder layer while the keys and values come from the encoder output, allowing every decoder position to attend over all positions in the input sequence.",
+      "generated_answer": "In encoder-decoder attention layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. In encoder self-attention layers, all of the keys, values, and queries come from the same place, specifically the output of the previous layer in the encoder.",
+      "retrieved_contexts": [
+        "k v model\nis similar to that of single-head attention with full dimensionality.\n3.2.3 Applications of Attention in our Model\nThe Transformer uses multi-head attention in three different ways:\n In \"encoder-decoder attention\" layers, the queries come from the previous decoder layer,\nand the memory keys and values come from the output of the encoder. This allows every\nposition in the decoder to attend over all positions in the input sequence. This mimics the\ntypical encoder-decoder attention mechanisms in sequence-to-sequence models such as\n[38, 2, 9].\n The encoder contains self-attention layers. In a self-attention layer all of the keys, values\nand queries come from the same place, in this case, the output of the previous layer in the",
+        "and queries come from the same place, in this case, the output of the previous layer in the\nencoder. Each position in the encoder can attend to all positions in the previous layer of the\nencoder.\n Similarly, self-attention layers in the decoder allow each position in the decoder to attend to\nall positions in the decoder up to and including that position. We need to prevent leftward\ninformation flow in the decoder to preserve the auto-regressive property. We implement this\ninside of scaled dot-product attention by masking out (setting to ) all values in the input\nof the softmax which correspond to illegal connections. See Figure 2.\n3.3 Position-wise Feed-Forward Networks\nIn addition to attention sub-layers, each of the layers in our encoder and decoder contains a fully"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.8874121948799971,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 27,
+      "question": "How do the training configurations of the base and big Transformer models differ, and how does this relate to their BLEU scores?",
+      "category": "multi_hop",
+      "source_page": 8,
+      "expected_answer": "The base model uses d_model=512, h=8, d_ff=2048, dropout=0.1 and is trained for 100,000 steps with a single machine and 8 GPUs. The big model uses d_model=1024, h=16, d_ff=4096, dropout=0.3 and is trained for 300,000 steps. The big model achieves 28.4 BLEU (English-German) versus the base model's 27.3 BLEU, demonstrating that larger capacity and more training improve performance at the cost of compute.",
+      "generated_answer": "The big Transformer model, which achieved a BLEU score of 28.4 on the WMT 2014 English-to-German translation task, had its training take 3.5 days on 8 P100 GPUs. The configuration of this model is stated to be listed in the bottom line of Table 3, which is not provided. For the WMT 2014 English-to-French translation task, the Transformer (big) model achieved a BLEU score of 41.8 and used a dropout rate P = 0.1, instead of 0.3.\n\nThe base model achieved a BLEU score of 27.3 on the WMT 2014 English-to-German translation task and 38.1 on the WMT 2014 English-to-French translation task. For the base model, a residual dropout rate of P = 0.1 is used.\n\nThe big model outperforms the base model in BLEU scores on both tasks, with 28.4 versus 27.3 for English-to-German, and 41.8 versus 38.1 for English-to-French. The text indicates that the big model's training cost is a fraction of the previous state-of-the-art model, and the base model surpasses all previously published models and ensembles at a fraction of the training cost of competitive models.",
+      "retrieved_contexts": [
+        "in Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0\nBLEU, establishing a new state-of-the-art BLEU score of 28.4. The configuration of this model is\nlisted in the bottom line of Table 3. Training took 3.5 days on 8 P100 GPUs. Even our base model\nsurpasses all previously published models and ensembles, at a fraction of the training cost of any of\nthe competitive models.\nOn the WMT 2014 English-to-French translation task, our big model achieves a BLEU score of 41.0,\noutperforming all of the previously published single models, at less than 1/4 the training cost of the\nprevious state-of-the-art model. The Transformer (big) model trained for English-to-French used\ndropout rate P = 0.1, instead of 0.3.\ndrop",
+        "Residual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the\nsub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the\npositional encodings in both the encoder and decoder stacks. For the base model, we use a rate of\nP = 0.1.\ndrop\nLabel Smoothing During training, we employed label smoothing of value = 0.1 [36]. This\nls\nhurts perplexity, as the model learns to be more unsure, but improves accuracy and BLEU score.\n6 Results\n6.1 Machine Translation\nOn the WMT 2014 English-to-German translation task, the big transformer model (Transformer (big)\nin Table 2) outperforms the best previously reported models (including ensembles) by more than 2.0",
+        "Table 2: The Transformer achieves better BLEU scores than previous state-of-the-art models on the\nEnglish-to-German and English-to-French newstest2014 tests at a fraction of the training cost.\nBLEU Training Cost (FLOPs)\nModel\nEN-DE EN-FR EN-DE EN-FR\nByteNet [18] 23.75\nDeep-Att + PosUnk [39] 39.2 1.0 1020\nGNMT + RL [38] 24.6 39.92 2.3 1019 1.4 1020\nConvS2S [9] 25.16 40.46 9.6 1018 1.5 1020\nMoE [32] 26.03 40.56 2.0 1019 1.2 1020\nDeep-Att + PosUnk Ensemble [39] 40.4 8.0 1020\nGNMT + RL Ensemble [38] 26.30 41.16 1.8 1020 1.1 1021\nConvS2S Ensemble [9] 26.36 41.29 7.7 1019 1.2 1021\nTransformer (base model) 27.3 38.1 3.3 1018\nTransformer (big) 28.4 41.8 2.3 1019\nResidual Dropout We apply dropout [33] to the output of each sub-layer, before it is added to the",
+        "our model establishes a new single-model state-of-the-art BLEU score of 41.8 after\ntraining for 3.5 days on eight GPUs, a small fraction of the training costs of the\nbest models from the literature. We show that the Transformer generalizes well to\nother tasks by applying it successfully to English constituency parsing both with\nlarge and limited training data.\n Equal contribution. Listing order is random. Jakob proposed replacing RNNs with self-attention and started\nthe effort to evaluate this idea. Ashish, with Illia, designed and implemented the first Transformer models and\nhas been crucially involved in every aspect of this work. Noam proposed scaled dot-product attention, multi-head"
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.6202971876134573,
+        "context_precision": 0.0,
+        "context_recall": 0.0
+      }
+    },
+    {
+      "index": 28,
+      "question": "How does the Transformer's use of attention with reduced dimensionality per head compare computationally to a single-head attention with full d_model dimensions?",
+      "category": "multi_hop",
+      "source_page": 4,
+      "expected_answer": "Multi-head attention projects the d_model=512 queries, keys, and values h=8 times into d_k=d_v=64 dimensional subspaces, performs attention in parallel on each, and concatenates. The total computational cost is similar to single-head attention at full dimensionality (since h * d_k = d_model), but gains the expressiveness of multiple independent attention functions attending to different subspaces.",
+      "generated_answer": "Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.",
+      "retrieved_contexts": [
+        "output values. These are concatenated and once again projected, resulting in the final values, as\ndepicted in Figure 2.\nMulti-head attention allows the model to jointly attend to information from different representation\nsubspaces at different positions. With a single attention head, averaging inhibits this.\nMultiHead(Q,K,V ) = Concat(head ,...,head )WO\n1 h\nwhere head = Attention(QW Q ,KWK,V WV )\ni i i i\nWhere the projections are parameter matrices W Q Rd model d k, WK Rd model d k, WV Rd model d v\ni i i\nand WO Rhd v d model.\nIn this work we employ h = 8 parallel attention layers, or heads. For each of these we use\nd = d = d /h = 64. Due to the reduced dimension of each head, the total computational cost\nk v model\nis similar to that of single-head attention with full dimensionality."
+      ],
+      "scores": {
+        "faithfulness": 1.0,
+        "answer_relevancy": 0.7296260285833842,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 29,
+      "question": "What was the annual revenue of Google in 2017?",
+      "category": "out_of_scope",
+      "source_page": -1,
+      "expected_answer": "I do not know based on the provided document. The paper does not contain any information about Google's financial performance.",
+      "generated_answer": "I could not find a reliable answer in the document after multiple attempts. Please try rephrasing your question or check that the document has been indexed.",
+      "retrieved_contexts": [
+        "No context retrieved."
+      ],
+      "scores": {
+        "faithfulness": 0.0,
+        "answer_relevancy": 0.0,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    },
+    {
+      "index": 30,
+      "question": "What is the capital city of France?",
+      "category": "out_of_scope",
+      "source_page": -1,
+      "expected_answer": "I do not know based on the provided document. The paper does not discuss geography or capital cities.",
+      "generated_answer": "I could not find a reliable answer in the document after multiple attempts. Please try rephrasing your question or check that the document has been indexed.",
+      "retrieved_contexts": [
+        "No context retrieved."
+      ],
+      "scores": {
+        "faithfulness": 0.25,
+        "answer_relevancy": 0.0,
+        "context_precision": 0.9999999999,
+        "context_recall": 1.0
+      }
+    }
+  ],
+  "means": {
+    "faithfulness": 0.848,
+    "answer_relevancy": 0.681,
+    "context_precision": 0.8028,
+    "context_recall": 0.75
+  }
+}
\ No newline at end of file
diff --git a/eval/run.py b/eval/run.py
new file mode 100644
index 0000000..e7f66f0
--- /dev/null
+++ b/eval/run.py
@@ -0,0 +1,392 @@
+"""
+DocuMind RAGAS evaluation runner.
+
+Loads eval/golden.jsonl, runs the full DocuMind pipeline for every entry, then
+computes RAGAS metrics using Gemini as the judge model.  Prints a per-question
+table, saves eval/results/<UTC-timestamp>.json and latest.json, and updates README.md.
+
+Usage:
+    python eval/run.py --doc-id <doc_id>
+    python eval/run.py --doc-id <doc_id> --limit 5
+    python eval/run.py --doc-id <doc_id> --category factual
+    DOC_ID=<doc_id> python eval/run.py
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+import warnings
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+warnings.filterwarnings("ignore")
+# Silence ChromaDB telemetry capture() signature mismatch noise
+logging.getLogger("chromadb.telemetry").setLevel(logging.CRITICAL)
+os.environ.setdefault("ANONYMIZED_TELEMETRY", "false")
+
+# Ensure project root is importable when run as a script
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_huggingface import HuggingFaceEmbeddings
+from ragas import EvaluationDataset, SingleTurnSample, evaluate
+from ragas.embeddings import LangchainEmbeddingsWrapper
+from ragas.llms import LangchainLLMWrapper
+from ragas.metrics import AnswerRelevancy, ContextPrecision, ContextRecall, Faithfulness
+
+from rag.agents.graph import run_agent
+
+EVAL_DIR = Path(__file__).parent
+DEFAULT_GOLDEN = EVAL_DIR / "golden.jsonl"  # default for --golden
+RESULTS_DIR = EVAL_DIR / "results"  # default for --results-dir
+# Categories offered as choices for the --category filter.
+VALID_CATEGORIES = {"factual", "reasoning", "multi_hop", "out_of_scope"}
+
+
+# ─────────────────────────────────────────────
+# Helpers
+# ─────────────────────────────────────────────
+
+def _load_golden(path: Path, category: str | None, limit: int | None) -> list[dict]:
+    """Read a JSONL golden dataset, apply the category filter, then the limit.
+
+    Blank lines are skipped; a falsy category or limit disables that filter.
+    """
+    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
+    # cannot be relied on to decode non-ASCII text in the dataset.
+    with open(path, encoding="utf-8") as f:
+        entries: list[dict] = [json.loads(row) for row in map(str.strip, f) if row]
+
+    if category:
+        entries = [e for e in entries if e.get("category") == category]
+    if limit:
+        entries = entries[:limit]
+
+    return entries
+
+
+def _run_pipeline(question: str, doc_id: str) -> tuple[str, list[str]]:
+    """Run the DocuMind agent once; return (generated answer, context texts)."""
+    result = run_agent(question=question, doc_id=doc_id)
+    retrieved = result.get("documents", [])
+    # A placeholder string stands in when no document was retrieved.
+    texts = [d.page_content for d in retrieved] if retrieved else ["No context retrieved."]
+    return result.get("generation", ""), texts
+
+
+def _build_ragas_judge() -> tuple[LangchainLLMWrapper, LangchainEmbeddingsWrapper]:
+    """Build the RAGAS judge LLM and embedding wrappers.
+
+    Raises RuntimeError when GOOGLE_API_KEY is unset or empty.
+    """
+    key = os.getenv("GOOGLE_API_KEY")
+    if not key:
+        raise RuntimeError("GOOGLE_API_KEY is not set — required for RAGAS judge.")
+
+    chat_model = ChatGoogleGenerativeAI(
+        model="gemini-2.5-flash", google_api_key=key, temperature=0
+    )
+    judge_llm = LangchainLLMWrapper(chat_model)
+    embedder = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2",
+        model_kwargs={"device": "cpu"},
+        encode_kwargs={"normalize_embeddings": True},
+    )
+    judge_emb = LangchainEmbeddingsWrapper(embedder)
+    return judge_llm, judge_emb
+
+
+def _compute_ragas(
+    samples: list[SingleTurnSample],
+    llm: LangchainLLMWrapper,
+    emb: LangchainEmbeddingsWrapper,
+) -> tuple[dict[str, list[float | None]], dict[str, float]]:
+    """Score samples with the four RAGAS metrics.
+
+    Returns (per-metric score lists, per-metric means rounded to 4 places).
+    A missing or NaN score is stored as None and left out of the mean; a
+    metric with no usable score gets a mean of 0.0.
+    """
+    metrics = [
+        Faithfulness(llm=llm),
+        AnswerRelevancy(llm=llm, embeddings=emb),
+        ContextPrecision(llm=llm),
+        ContextRecall(llm=llm),
+    ]
+    dataset = EvaluationDataset(samples=samples)
+    result = evaluate(dataset=dataset, metrics=metrics, raise_exceptions=False, show_progress=True)
+
+    per_metric_lists: dict[str, list[float | None]] = {}
+    means: dict[str, float] = {}
+    for m in metrics:
+        raw = result[m.name]
+        # A scalar result is broadcast so every sample still has an entry.
+        values = list(raw) if isinstance(raw, (list, tuple)) else [raw] * len(samples)
+        # ragas reports a failed metric as NaN (raise_exceptions=False); v != v detects it.
+        cleaned = [None if v is None or v != v else float(v) for v in values]
+        per_metric_lists[m.name] = cleaned
+        valid = [v for v in cleaned if v is not None]
+        means[m.name] = round(sum(valid) / len(valid), 4) if valid else 0.0
+    return per_metric_lists, means
+
+
+def _print_per_question_table(
+    entries: list[dict],
+    per_metric: dict[str, list[float | None]],
+    answers: list[str],
+) -> None:
+    """Print index, truncated question and every metric score per entry; ``answers`` is unused."""
+    question_width, score_width = 36, 8
+    short_names = {"faithfulness": "Faith", "answer_relevancy": "AnswRel",
+                   "context_precision": "CtxPrec", "context_recall": "CtxRec"}
+
+    columns = [f"{'#':<4} {'Question':<{question_width}}"]
+    columns += [f"  {short_names.get(name, name[:7]):>{score_width}}" for name in per_metric]
+    header = "".join(columns)
+    rule = "─" * len(header)
+    print("\n" + rule, header, rule, sep="\n")
+
+    for row_no, entry in enumerate(entries, start=1):
+        text = entry["question"]
+        if len(text) > question_width:
+            text = text[:question_width - 1] + "…"
+        cells = [f"{row_no:<4} {text:<{question_width}}"]
+        for scores in per_metric.values():
+            val = scores[row_no - 1]
+            cells.append(f"  {'N/A':>{score_width}}" if val is None else f"  {val:>{score_width}.3f}")
+        print("".join(cells))
+
+    print(rule)
+
+
+def _print_means_table(means: dict[str, float]) -> None:
+    """Print each metric's mean followed by a bar of int(score * 20) block characters."""
+    rule = "  " + "─" * 36
+    print("\n  Aggregate means:", rule, sep="\n")
+    for name, mean in means.items():
+        print(f"  {name:<24}  {mean:.3f}  {'█' * int(mean * 20)}")
+    print(rule)
+
+
+def _save_results(
+    doc_id: str,
+    golden_file: Path,
+    entries: list[dict],
+    answers: list[str],
+    contexts: list[list[str]],
+    per_metric: dict[str, list[float | None]],
+    means: dict[str, float],
+    results_dir: Path = RESULTS_DIR,
+) -> Path:
+    """Write the run to <results_dir>/<UTC-timestamp>.json and latest.json; return the former."""
+    results_dir.mkdir(parents=True, exist_ok=True)
+    # One clock reading, so the file name and the payload timestamp agree.
+    now = datetime.now(timezone.utc)
+    out_path = results_dir / f"{now.strftime('%Y%m%dT%H%M%SZ')}.json"
+
+    per_question: list[dict[str, Any]] = [
+        {
+            "index": i,
+            "question": entry["question"],
+            "category": entry.get("category"),
+            "source_page": entry.get("source_page"),
+            "expected_answer": entry["expected_answer"],
+            "generated_answer": answers[i - 1],
+            "retrieved_contexts": contexts[i - 1],
+            "scores": {name: values[i - 1] for name, values in per_metric.items()},
+        }
+        for i, entry in enumerate(entries, start=1)
+    ]
+    payload = {
+        "timestamp": now.isoformat(),
+        "doc_id": doc_id,
+        "golden_file": str(golden_file),
+        "n_total": len(entries),
+        "n_evaluated": len(entries),
+        "per_question": per_question,
+        "means": means,
+    }
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
+    # ensure_ascii=False emits raw non-ASCII text, so pin UTF-8 instead of the platform default.
+    out_path.write_text(serialized, encoding="utf-8")
+    (results_dir / "latest.json").write_text(serialized, encoding="utf-8")
+    return out_path
+
+
+_README_START = "<!-- EVAL-RESULTS-START -->"  # opening marker of the README score block
+_README_END = "<!-- EVAL-RESULTS-END -->"  # closing marker of the README score block
+_ROOT_README = Path(__file__).parent.parent / "README.md"  # README.md one directory above this script's folder
+
+
+def _latest_result_file(results_dir: Path = RESULTS_DIR) -> Path | None:
+    """Return the *.json path in results_dir that sorts last (by path, not mtime), or None."""
+    # max() over the glob picks the same path as sorting and taking the last item.
+    return max(results_dir.glob("*.json"), default=None)
+
+
+def update_readme_from_results(
+    results_dir: Path = RESULTS_DIR,
+    result_file: Path | None = None,
+) -> bool:
+    """
+    Write scores from a result JSON into the <!-- EVAL-RESULTS-START/END --> block
+    in README.md.  Pass result_file to use a specific file; otherwise the file
+    picked by _latest_result_file(results_dir) is used.
+    Returns True if the README was updated, False otherwise.
+    """
+    latest = result_file or _latest_result_file(results_dir)
+    if latest is None:
+        print(f"No result files found in {results_dir}.", file=sys.stderr)
+        return False
+
+    # The result JSON and README carry non-ASCII text ("█", "·"): pin UTF-8 for all file I/O.
+    data = json.loads(latest.read_text(encoding="utf-8"))
+    means: dict[str, float] = data["means"]
+    n_evaluated: int = data["n_evaluated"]
+    date_str = data["timestamp"][:10]  # first 10 characters of the stored timestamp
+
+    if not _ROOT_README.exists():
+        print(f"README not found at {_ROOT_README}.", file=sys.stderr)
+        return False
+
+    rows = "\n".join(
+        f"| `{metric}` | {score:.3f} | {'█' * int(score * 20)} |"
+        for metric, score in means.items()
+    )
+    block = (
+        f"{_README_START}\n"
+        f"| Metric | Score | |\n"
+        f"|---|---|---|\n"
+        f"{rows}\n\n"
+        f"_Evaluated on {n_evaluated} questions · {date_str} · "
+        f"full results in [`eval/results/latest.json`](eval/results/latest.json)_\n"
+        f"{_README_END}"
+    )
+
+    content = _ROOT_README.read_text(encoding="utf-8")
+    start = content.find(_README_START)
+    # Search for END only after START so a stray earlier END cannot corrupt the splice.
+    end = content.find(_README_END, start) if start != -1 else -1
+    if start == -1 or end == -1:
+        print("Markers <!-- EVAL-RESULTS-START/END --> not found in README.md.", file=sys.stderr)
+        return False
+    _ROOT_README.write_text(content[:start] + block + content[end + len(_README_END):], encoding="utf-8")
+    print(f"README.md updated from {latest.name}")
+    return True
+
+
+# ─────────────────────────────────────────────
+# Main
+# ─────────────────────────────────────────────
+
+def main() -> None:
+    """Run the evaluation from the command line.
+
+    With --update-readme, only refresh README.md from the latest result file;
+    otherwise run the pipeline, score with RAGAS, save results, update README.
+    """
+    parser = argparse.ArgumentParser(description="Run RAGAS evaluation against DocuMind golden dataset")
+    parser.add_argument(
+        "--doc-id",
+        default=os.getenv("DOC_ID"),
+        help="doc_id of the indexed PDF (or set DOC_ID env var)",
+    )
+    parser.add_argument("--golden", default=str(DEFAULT_GOLDEN), help="Path to .jsonl golden dataset")
+    parser.add_argument("--limit", type=int, default=None, help="Evaluate only the first N entries")
+    parser.add_argument(
+        "--category",
+        choices=sorted(VALID_CATEGORIES),  # sorted: a set has no stable order for --help
+        default=None,
+        help="Filter to a single category",
+    )
+    parser.add_argument("--results-dir", default=str(RESULTS_DIR), help="Directory for result JSON files")
+    parser.add_argument(
+        "--update-readme",
+        action="store_true",
+        help="Update README.md from the latest result file and exit (no evaluation run)",
+    )
+    args = parser.parse_args()
+
+    results_dir = Path(args.results_dir)
+    if args.update_readme:
+        sys.exit(0 if update_readme_from_results(results_dir) else 1)
+
+    if not args.doc_id:
+        parser.error("--doc-id is required (or set the DOC_ID environment variable)")
+
+    golden_path = Path(args.golden)
+    if not golden_path.exists():
+        print(f"ERROR: golden dataset not found at {golden_path}", file=sys.stderr)
+        sys.exit(1)
+
+    entries = _load_golden(golden_path, args.category, args.limit)
+    if not entries:
+        print("No entries matched the given filters.", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Loaded {len(entries)} entries from {golden_path}")
+    if args.category:
+        print(f"  Category filter: {args.category}")
+
+    # ── Step 1: build the judge up front — a missing GOOGLE_API_KEY fails before any pipeline run ──
+    print("\nBuilding RAGAS judge (Gemini 2.5 Flash)…")
+    llm, emb = _build_ragas_judge()
+
+    # ── Step 2: run pipeline for every entry ──────────────────────
+    print("\nRunning DocuMind pipeline…")
+    all_answers: list[str] = []
+    all_contexts: list[list[str]] = []
+    ragas_samples: list[SingleTurnSample] = []
+
+    for i, entry in enumerate(entries, start=1):
+        q = entry["question"]
+        print(f"  [{i:>2}/{len(entries)}] {q[:80]}{'…' if len(q) > 80 else ''}")
+        try:
+            answer, contexts = _run_pipeline(q, args.doc_id)
+        except Exception as exc:
+            print(f"         pipeline error: {exc}")
+            answer = ""
+            contexts = ["Pipeline error."]
+
+        all_answers.append(answer)
+        all_contexts.append(contexts)
+        ragas_samples.append(SingleTurnSample(
+            user_input=q, response=answer,
+            retrieved_contexts=contexts, reference=entry["expected_answer"],
+        ))
+
+    # ── Step 3: RAGAS scoring ──────────────────────────────────────
+    print("Computing RAGAS metrics…")
+    per_metric, means = _compute_ragas(ragas_samples, llm, emb)
+
+    # ── Step 4: display results ────────────────────────────────────
+    _print_per_question_table(entries, per_metric, all_answers)
+    _print_means_table(means)
+
+    # ── Step 5: save to disk ───────────────────────────────────────
+    out_path = _save_results(
+        doc_id=args.doc_id,
+        golden_file=golden_path,
+        entries=entries,
+        answers=all_answers,
+        contexts=all_contexts,
+        per_metric=per_metric,
+        means=means,
+        results_dir=results_dir,
+    )
+    print(f"\nResults saved → {out_path}")
+
+    # ── Step 6: update root README from the file we just saved ───
+    update_readme_from_results(result_file=out_path)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/eval/run_eval.py b/eval/run_eval.py
deleted file mode 100644
index bbf8e6b..0000000
--- a/eval/run_eval.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-Run RAGAS evaluation against the live RAG backend.
-
-Usage:
-    python eval/run_eval.py [--doc-id DOC_ID]
-
-    --doc-id   Override the doc_id in test_queries.json for all questions.
-               Required if test_queries.json still has the placeholder value.
-"""
-from __future__ import annotations
-
-import argparse
-import json
-import sys
-from datetime import datetime, timezone
-from pathlib import Path
-
-# Ensure project root is on sys.path when run directly
-sys.path.insert(0, str(Path(__file__).parent.parent))
-
-from eval.ragas_eval import evaluate_rag
-
-EVAL_DIR = Path(__file__).parent
-QUERIES_FILE = EVAL_DIR / "test_queries.json"
-RESULTS_FILE = EVAL_DIR / "results.json"
-
-PASS_THRESHOLD = 0.7
-TRACKED_METRICS = ["faithfulness", "answer_relevancy"]
-
-
-def _load_queries(doc_id_override: str | None) -> list[dict]:
-    with open(QUERIES_FILE) as f:
-        queries = json.load(f)
-
-    if doc_id_override:
-        for q in queries:
-            q["doc_id"] = doc_id_override
-
-    missing = [i + 1 for i, q in enumerate(queries) if q.get("doc_id") == "REPLACE_WITH_YOUR_DOC_ID"]
-    if missing:
-        print(f"ERROR: doc_id not set for queries {missing}.")
-        print("Run with --doc-id <your_doc_id> or edit eval/test_queries.json.")
-        sys.exit(1)
-
-    return queries
-
-
-def _print_table(scores: dict[str, float]) -> None:
-    print("\n" + "─" * 44)
-    print(f"  {'Metric':<26}  {'Score':>6}")
-    print("─" * 44)
-    for metric, score in scores.items():
-        bar = "█" * int(score * 20)
-        print(f"  {metric:<26}  {score:>5.3f}  {bar}")
-    print("─" * 44)
-
-
-def _save_results(scores: dict[str, float], queries: list[dict]) -> None:
-    history: list[dict] = []
-    if RESULTS_FILE.exists():
-        with open(RESULTS_FILE) as f:
-            history = json.load(f)
-
-    history.append({
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-        "n_queries": len(queries),
-        "scores": scores,
-    })
-
-    with open(RESULTS_FILE, "w") as f:
-        json.dump(history, f, indent=2)
-
-    print(f"\nResults saved → {RESULTS_FILE}")
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser(description="Run RAGAS evaluation")
-    parser.add_argument("--doc-id", default=None, help="Override doc_id for all test queries")
-    args = parser.parse_args()
-
-    queries = _load_queries(args.doc_id)
-
-    print(f"Evaluating {len(queries)} queries…\n")
-    scores = evaluate_rag(queries, verbose=True)
-
-    _print_table(scores)
-
-    passed = all(scores.get(m, 0) >= PASS_THRESHOLD for m in TRACKED_METRICS)
-    verdict = "PASS ✓" if passed else "NEEDS IMPROVEMENT ✗"
-    print(f"\nOverall verdict: {verdict}")
-    print(
-        f"  (faithfulness={scores.get('faithfulness', 0):.3f}, "
-        f"answer_relevancy={scores.get('answer_relevancy', 0):.3f}, "
-        f"threshold={PASS_THRESHOLD})"
-    )
-
-    _save_results(scores, queries)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/eval/test_queries.json b/eval/test_queries.json
deleted file mode 100644
index f4352d0..0000000
--- a/eval/test_queries.json
+++ /dev/null
@@ -1,27 +0,0 @@
-[
-  {
-    "question": "What is this document about?",
-    "ground_truth": "The document describes a resume or CV, which is a summary of an individual's education, work experience, skills, and achievements.",
-    "doc_id": "REPLACE_WITH_YOUR_DOC_ID"
-  },
-  {
-    "question": "What technology and skills are described in this document?",
-    "ground_truth": "the skills section lists various technologies and skills, including programming languages (Python C++), machine learning frameworks (TensorFlow, PyTorch, openCV, TensorFlow) and so on.",
-    "doc_id": "REPLACE_WITH_YOUR_DOC_ID"
-  },
-  {
-    "question": "What are the projects mentioned in the document?",
-    "ground_truth": "The document outlines various projects and professional experiences of the individual. These include smart plant monitoring, object detection and tracking, and so on.",
-    "doc_id": "REPLACE_WITH_YOUR_DOC_ID"
-  },
-  {
-    "question": "What technologies are used?",
-    "ground_truth": "The document mentions like machine learning, deep learning, computer vision, CNN, Mediapipe and so on.",
-    "doc_id": "REPLACE_WITH_YOUR_DOC_ID"
-  },
-  {
-    "question": "What is the annual revenue of Apple Inc. in 2099?",
-    "ground_truth": "I could not find any relevant information in the document.",
-    "doc_id": "REPLACE_WITH_YOUR_DOC_ID"
-  }
-]
diff --git a/requirements.txt b/requirements.txt
index 2d67b69..81dfed9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -36,5 +36,5 @@ streamlit
 watchdog
 
 # Evaluation
-ragas==0.0.22
+ragas>=0.2.0
 datasets