The RAGEvaluator measures RAG performance across retrieval, generation, and operational metrics.
Quick Start
from rag_opt import init_chat_model, init_embeddings, init_vectorstore
from rag_opt.rag import RAGWorkflow
from rag_opt.eval.eval import RAGEvaluator
from rag_opt.dataset import TrainDataset
# Initialize the LLM, embeddings, and vector store
llm = init_chat_model(
    model="gpt-3.5-turbo",
    model_provider="openai",
    api_key=OPENAI_API_KEY
)
embeddings = init_embeddings(
    model="all-MiniLM-L6-v2",
    model_provider="huggingface",
    api_key=HUGGINGFACE_API_KEY
)
vector_store = init_vectorstore(
    provider="faiss",
    embeddings=embeddings,
)
rag = RAGWorkflow(
    embeddings=embeddings,
    vector_store=vector_store,
    llm=llm,
    retrieval_config={
        "search_type": "hybrid",
        "k": 3
    },
)
# Generate the evaluation dataset (consumed by the evaluator)
train_dataset = TrainDataset.from_json("./rag_dataset.json")
eval_dataset = rag.get_batch_answers(train_dataset)
# Evaluate
evaluator = RAGEvaluator(evaluator_llm=llm, evaluator_embedding=embeddings)
results = evaluator.evaluate(eval_dataset, return_tensor=False)
print(results)
Understanding Results
Results are returned as MetricResult objects organized by category:
Retrieval: context_precision, context_recall, mrr, ndcg
Generation: response_relevancy, safety, alignment
Operational: cost, latency
{
    "response_relevancy": MetricResult(name="response_relevancy", value=0.89, category="GENERATION"),
    "context_precision": MetricResult(name="context_precision", value=0.76, category="RETRIEVAL"),
    "cost": MetricResult(name="cost", value=0.0234, category="FULL"),
    ...
}
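Since each MetricResult carries its category, you can regroup the flat results dict by category for reporting. A small sketch, assuming the dict-of-MetricResult shape shown above:
# Group scores by category (sketch; assumes results maps names to MetricResult)
from collections import defaultdict

by_category = defaultdict(dict)
for name, result in results.items():
    by_category[result.category][name] = result.value

for category, scores in by_category.items():
    print(category, scores)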
Learn more about each metric in Metrics Overview.
Targeted Evaluation
Evaluate specific categories:
retrieval_results = evaluator.evaluate_retrieval(eval_dataset)
generation_results = evaluator.evaluate_generation(eval_dataset)
full_results = evaluator.evaluate_full(eval_dataset)
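Each call should return only that category's metrics (context_precision, context_recall, mrr, and ndcg for retrieval, per the list above), so you can report them directly:
# Print retrieval-only scores; metric names follow the retrieval list above
for name, result in retrieval_results.items():
    print(f"{name}: {result.value:.3f}")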
Customization
Metric Weights
Control metric importance during optimization:
evaluator = RAGEvaluator(
    evaluator_llm=llm,
    evaluator_embedding=embeddings,
    objective_weights={
        "response_relevancy": 0.5,
        "context_precision": 0.3,
        "cost": 0.2
    }
)
Weights are automatically normalized to sum to 1.0.
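So unnormalized weights behave the same as their scaled counterparts; the arithmetic, sketched outside the library:
# Plain-Python sketch of weight normalization (not RAGOpt's internals)
raw = {"response_relevancy": 5, "context_precision": 3, "cost": 2}
total = sum(raw.values())
normalized = {k: v / total for k, v in raw.items()}
# -> {'response_relevancy': 0.5, 'context_precision': 0.3, 'cost': 0.2}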
Add/Remove Metrics
# Remove unwanted metrics
evaluator.remove_metric("mrr")
# Add custom metric
from rag_opt.eval.metrics import BaseMetric, MetricCategory, MetricResult
from rag_opt.dataset import EvaluationDataset
class CustomMetric(BaseMetric):
    is_llm_based: bool = False  # set True if this metric requires an LLM
    category: MetricCategory = MetricCategory.GENERATION
    name: str = "my_custom_metric"

    def __init__(self):
        super().__init__(
            negate=False,     # True if lower is better
            worst_value=0.0
        )

    def _evaluate(self, dataset: EvaluationDataset, **kwargs) -> list[float]:
        """Your evaluation logic here; return one score per dataset item."""
        return [0.85 for _ in dataset.items]

evaluator.add_metric(CustomMetric(), weight=0.5)
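Once added, the custom metric appears in subsequent evaluations under its name key, matching the dict shape shown in Understanding Results:
results = evaluator.evaluate(eval_dataset, return_tensor=False)
print(results["my_custom_metric"].value)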
Normalization
results = evaluator.evaluate(
    eval_dataset,
    return_tensor=False,
    normalize=True,
    normalization_strategy="sum"  # Options: sum, softmax, min-max, z-score
)
Advanced Features
Batch Evaluation
Evaluate multiple configurations in parallel:
batch_results = evaluator.evaluate_batch(
    [eval_dataset1, eval_dataset2],  # one evaluation dataset per configuration
    return_tensor=False
)
Optimizer Integration
from rag_opt.optimizer import Optimizer
optimizer = Optimizer(
    train_dataset=train_dataset,
    config_path="./rag_config.yaml",
    evaluator_llm=llm,
)
Complete Example
from rag_opt.eval.eval import RAGEvaluator
from rag_opt import init_chat_model
from rag_opt.dataset import TrainDataset
# Setup
llm = init_chat_model(model="gpt-3.5-turbo", model_provider="openai", api_key="sk-***")
train_dataset = TrainDataset.from_json("./rag_dataset.json")

# Generate answers with a RAGWorkflow built as in the Quick Start
eval_dataset = rag.get_batch_answers(train_dataset)

# Create evaluator with custom weights
evaluator = RAGEvaluator(
    evaluator_llm=llm,
    objective_weights={
        "response_relevancy": 0.7,
        "cost": 0.2,
        "latency": 0.1
    }
)

# Remove unnecessary metrics
evaluator.remove_metric("ndcg")
evaluator.remove_metric("mrr")

# Evaluate with normalization
results = evaluator.evaluate(eval_dataset, normalize=True)

# Print results
for metric_name, result in results.items():
    print(f"{metric_name}: {result.value}")

print(f"\nOverall Score: {evaluator.compute_objective_score(results)}")