Retrieval metrics evaluate how well your embedding model, vector store, and reranker find relevant information for your LLM.

Quick Reference

| Metric | Measures | LLM Required | Best For |
| --- | --- | --- | --- |
| ContextPrecision | Relevant docs / Total retrieved | ✅ Yes | Reducing noise |
| ContextRecall | Found info / All relevant info | ✅ Yes | Avoiding gaps |
| MRR | Position of first relevant result | ⚠️ Optional | Reranker quality |
| NDCG | Overall ranking quality | ⚠️ Optional | Full ranking optimization |

1. ContextPrecision

“How many retrieved documents are actually relevant?”
from rag_opt.eval import RAGEvaluator, ContextPrecision
from rag_opt import init_chat_model

llm = init_chat_model(model="gpt-3.5-turbo",
                      model_provider="openai",
                      api_key=OPENAI_API_KEY)

metric = ContextPrecision(llm=llm, limit_contexts=5)
evaluator = RAGEvaluator(metrics=[metric])
eval_dataset = ...  # evaluation dataset from RAG
results = evaluator.evaluate(eval_dataset, return_tensor=False)
Score Guide: 0.8+ = Excellent | 0.6-0.8 = Good | 0.4-0.6 = Fair | < 0.4 = Poor
Options:
  • limit_contexts: Evaluate only top N contexts (default: all)
  • prompt: Custom relevance judgment prompt
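For intuition, here is a hand computation of the ratio shown in the quick reference, assuming binary relevance labels. It is a minimal sketch of the idea only; rag_opt's LLM-judged scoring may differ in detail.
retrieved = ["doc_a", "doc_b", "doc_c", "doc_d", "doc_e"]  # contexts returned for one query
relevant = {"doc_a", "doc_c", "doc_e"}                     # judged relevant to the query

# precision = relevant retrieved / total retrieved
precision = sum(doc in relevant for doc in retrieved) / len(retrieved)
print(precision)  # 3 of 5 retrieved docs are relevant -> 0.6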

2. ContextRecall

“Did I retrieve ALL the information needed?”
from rag_opt.eval import RAGEvaluator, ContextRecall
from rag_opt import init_chat_model

llm = init_chat_model(model="gpt-3.5-turbo",
                      model_provider="openai",
                      api_key=OPENAI_API_KEY)

metric = ContextRecall(llm=llm)
evaluator = RAGEvaluator(metrics=[metric])
eval_dataset = ...  # rag.get_batch_answers(train_dataset)
results = evaluator.evaluate(eval_dataset, return_tensor=False)
Score Guide: 0.8+ = Excellent | 0.6-0.8 = Good | 0.4-0.6 = Fair | < 0.4 = Poor
⚠️ Warning: Low recall causes hallucinations and incomplete answers.
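The ratio behind recall can be sketched by splitting the ground-truth answer into claims and counting how many are supported by the retrieved contexts. This is an illustration only; rag_opt uses the LLM to make these judgments.
ground_truth_claims = [
    "The Eiffel Tower is in Paris.",
    "It was completed in 1889.",
    "It is about 330 m tall.",
]
claims_supported_by_contexts = 2  # judged attributable to the retrieved contexts

# recall = found info / all relevant info
recall = claims_supported_by_contexts / len(ground_truth_claims)
print(recall)  # 2 of 3 claims covered -> ~0.67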

3. MRR (Mean Reciprocal Rank)

“How quickly do I find the first relevant result?”
from rag_opt.eval import RAGEvaluator, MRR
from rag_opt import init_embeddings

# an embedding model is required for MRR
embeddings = init_embeddings(model="all-MiniLM-L6-v2",
                             model_provider="huggingface",
                             api_key=HUGGINGFACE_API_KEY)

metric = MRR(embedding_model=embeddings)
evaluator = RAGEvaluator(metrics=[metric])
eval_dataset = ...  # rag.get_batch_answers(train_dataset)
results = evaluator.evaluate(eval_dataset, return_tensor=False)
Score Guide: 0.8+ = Top result | 0.5-0.8 = Top 2-3 | 0.3-0.5 = Position 3-5 | < 0.3 = Buried deep
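MRR itself is simple to compute by hand: average 1 / rank of the first relevant result over all queries, counting 0 when nothing relevant is retrieved. A minimal sketch, independent of how rag_opt determines relevance:
first_relevant_ranks = [1, 3, 2, None]  # per query; None = no relevant result retrieved

reciprocal_ranks = [1 / r if r is not None else 0.0 for r in first_relevant_ranks]
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
print(mrr)  # (1 + 1/3 + 1/2 + 0) / 4 ≈ 0.458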

4. NDCG (Normalized Discounted Cumulative Gain)

“How close is my ranking to ideal?”
from rag_opt.eval import RAGEvaluator, NDCG
from rag_opt import init_embeddings

# an embedding model is required for NDCG
embeddings = init_embeddings(model="all-MiniLM-L6-v2",
                             model_provider="huggingface",
                             api_key=HUGGINGFACE_API_KEY)

metric = NDCG(embedding_model=embeddings)
evaluator = RAGEvaluator(metrics=[metric])
eval_dataset = ...  # rag.get_batch_answers(train_dataset)
results = evaluator.evaluate(eval_dataset, return_tensor=False)
Score Guide: 0.8+ = Near ideal | 0.6-0.8 = Good | 0.4-0.6 = Fair | < 0.4 = Poor
MRR vs NDCG:
  • Use MRR when users look at only the first result
  • Use NDCG when overall ranking quality matters
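As a reference point, NDCG divides the DCG of the actual ranking by the DCG of the ideal ordering of the same items, using a log2 position discount. A hand computation with made-up relevance grades (illustration only; rag_opt handles relevance scoring internally):
import math

relevance = [0, 3, 2, 1]  # graded relevance of each retrieved doc, in rank order

def dcg(scores):
    # sum of rel / log2(position + 1), with 1-based positions
    return sum(rel / math.log2(i + 2) for i, rel in enumerate(scores))

ndcg = dcg(relevance) / dcg(sorted(relevance, reverse=True))
print(ndcg)  # ≈ 0.70: the most relevant doc sitting at rank 2 costs ~0.3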

Evaluate All Together

from rag_opt.eval import RAGEvaluator, ContextPrecision, ContextRecall, MRR, NDCG
from rag_opt import init_chat_model, init_embeddings

llm = init_chat_model(model="gpt-3.5-turbo",
                      model_provider="openai",
                      api_key=OPENAI_API_KEY)

embeddings = init_embeddings(model="all-MiniLM-L6-v2",
                             model_provider="huggingface",
                             api_key=HUGGINGFACE_API_KEY)

metrics = [
    ContextPrecision(llm=llm, limit_contexts=5),
    ContextRecall(llm=llm),
    MRR(embedding_model=embeddings),
    NDCG(embedding_model=embeddings)
]

evaluator = RAGEvaluator(metrics=metrics)
eval_dataset = ... # rag.get_batch_answers(train_dataset)
results = evaluator.evaluate(eval_dataset, return_tensor=False)

for _, result in results.items():
    print(f"{result.name}: {result.value:.3f}")
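The snippets above assume OPENAI_API_KEY and HUGGINGFACE_API_KEY are already defined. One common way to supply them (a convention, not a rag_opt requirement) is via environment variables:
import os

# read the keys referenced by the examples above from the environment
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
HUGGINGFACE_API_KEY = os.environ["HUGGINGFACE_API_KEY"]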