The RAGEvaluator measures RAG performance across retrieval, generation, and operational metrics.

Quick Start

from rag_opt import init_chat_model, init_embeddings, init_vectorstore
from rag_opt.rag import RAGWorkflow
from rag_opt.eval.eval import RAGEvaluator
from rag_opt.dataset import TrainDataset

# Initialize RAG workflow components
embeddings = init_embeddings(
    model="all-MiniLM-L6-v2",
    model_provider="huggingface",
    api_key=HUGGINGFACE_API_KEY
)
llm = init_chat_model(
    model="gpt-3.5-turbo",
    model_provider="openai",
    api_key=OPENAI_API_KEY
)

vector_store = init_vectorstore(
    provider="faiss",
    embeddings=embeddings,
)

rag = RAGWorkflow(
    embeddings=embeddings,
    vector_store=vector_store,
    llm=llm,
    retrieval_config={
        "search_type": "hybrid",
        "k": 3
    },
)

# Generate the evaluation dataset (consumed by the evaluator)
train_dataset = TrainDataset.from_json("./rag_dataset.json")
eval_dataset = rag.get_batch_answers(train_dataset)


# Evaluate
evaluator = RAGEvaluator(evaluator_llm=llm, evaluator_embedding=embeddings)
results = evaluator.evaluate(eval_dataset, return_tensor=False)
print(results)

Understanding Results

Results are returned as MetricResult objects organized by category:

Retrieval: context_precision, context_recall, mrr, ndcg
Generation: response_relevancy, safety, alignment
Operational: cost, latency

Example output:
{
    "response_relevancy": MetricResult(name="response_relevancy", value=0.89, category="GENERATION"),
    "context_precision": MetricResult(name="context_precision", value=0.76, category="RETRIEVAL"),
    "cost": MetricResult(name="cost", value=0.0234, category="FULL"),
    ...
}
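
Individual metrics can be read straight off this mapping (a minimal sketch, assuming the value and category attributes shown above):
precision = results["context_precision"]
print(precision.value)     # e.g. 0.76
print(precision.category)  # e.g. RETRIEVAL
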
Learn more about each metric in Metrics Overview.

Targeted Evaluation

Evaluate specific categories:
retrieval_results = evaluator.evaluate_retrieval(eval_dataset)
generation_results = evaluator.evaluate_generation(eval_dataset)
full_results = evaluator.evaluate_full(eval_dataset)
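
Each call returns the same kind of MetricResult mapping as evaluate(), restricted to one category (worth confirming against the API reference), so the scores can be inspected the same way:
for name, result in retrieval_results.items():
    print(f"{name}: {result.value:.3f}")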

Customization

Metric Weights

Control metric importance during optimization:
evaluator = RAGEvaluator(
    evaluator_llm=llm,
    evaluator_embedding=embeddings,
    objective_weights={
        "response_relevancy": 0.5,
        "context_precision": 0.3,
        "cost": 0.2
    }
)
Weights are automatically normalized to sum to 1.0.
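
Weights only express relative importance, so unnormalized values work too. A quick illustration of the rescaling (plain sum-normalization, shown for intuition rather than as the library's internal code):
raw_weights = {"response_relevancy": 5, "context_precision": 3, "cost": 2}
total = sum(raw_weights.values())
normalized = {name: w / total for name, w in raw_weights.items()}
print(normalized)  # {'response_relevancy': 0.5, 'context_precision': 0.3, 'cost': 0.2}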

Add/Remove Metrics

# Remove unwanted metrics
evaluator.remove_metric("mrr")

# Add custom metric
from rag_opt.eval.metrics import BaseMetric, MetricCategory, MetricResult
from rag_opt.dataset import EvaluationDataset

class CustomMetric(BaseMetric):
    is_llm_based: bool = False  # Set to True if this metric requires an LLM
    category: MetricCategory = MetricCategory.GENERATION
    name: str = "my_custom_metric"

    def __init__(self):
        super().__init__(
            negate=False,  # True if lower is better
            worst_value=0.0
        )

    def _evaluate(self, dataset: EvaluationDataset, **kwargs) -> list[float]:
        """Your evaluation logic here."""
        return [0.85 for _ in dataset.items]

evaluator.add_metric(CustomMetric(), weight=0.5)
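
Once registered, the custom metric appears in the results under its name attribute. A minimal usage sketch, reusing eval_dataset from the Quick Start:
results = evaluator.evaluate(eval_dataset, return_tensor=False)
print(results["my_custom_metric"].value)  # ~0.85 for the dummy metric above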

Normalization

results = evaluator.evaluate(
    eval_dataset,
    return_tensor=False,
    normalize=True,
    normalization_strategy="sum"  # Options: sum, softmax, min-max, z-score
)
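
The strategy names correspond to the usual score transforms. As a rough sketch of what each typically does (standard textbook definitions, not necessarily the library's exact implementation):
import math

scores = [0.89, 0.76, 0.0234]

# sum: rescale so the scores add up to 1
sum_norm = [s / sum(scores) for s in scores]

# softmax: exponentiate, then rescale to sum to 1
exps = [math.exp(s) for s in scores]
softmax = [e / sum(exps) for e in exps]

# min-max: map the observed range onto [0, 1]
lo, hi = min(scores), max(scores)
min_max = [(s - lo) / (hi - lo) for s in scores]

# z-score: subtract the mean, divide by the standard deviation
mean = sum(scores) / len(scores)
std = (sum((s - mean) ** 2 for s in scores) / len(scores)) ** 0.5
z_scores = [(s - mean) / std for s in scores]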

Advanced Features

Batch Evaluation

Evaluate multiple evaluation datasets (for example, answers produced by different RAG configurations) in parallel by passing them as a list:
batch_results = evaluator.evaluate_batch(
    [eval_dataset1],
    return_tensor=False
)
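
Assuming evaluate_batch returns one result mapping per input dataset, in the same order (an assumption to verify against the API reference), the configurations can be compared with the objective score:
# Compare overall scores across configurations (order assumed to match the input list).
for i, results in enumerate(batch_results):
    print(f"config {i}: {evaluator.compute_objective_score(results)}")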

Optimizer Integration

The same evaluator LLM can be passed straight to the Optimizer:

from rag_opt.optimizer import Optimizer
optimizer = Optimizer(
    train_dataset=train_dataset,
    config_path="./rag_config.yaml",
    evaluator_llm=llm,
)

Complete Example

from rag_opt.eval.eval import RAGEvaluator
from rag_opt import init_chat_model
from rag_opt.dataset import TrainDataset

# Setup (llm, embeddings, vector store, and rag built as in the Quick Start)
llm = init_chat_model(model="gpt-3.5-turbo", model_provider="openai", api_key="sk-***")
train_dataset = TrainDataset.from_json("./rag_dataset.json")
eval_dataset = rag.get_batch_answers(train_dataset)  # rag: the RAGWorkflow from the Quick Start

# Create evaluator with custom weights
evaluator = RAGEvaluator(
    evaluator_llm=llm,
    objective_weights={
        "response_relevancy": 0.7,
        "cost": 0.2,
        "latency": 0.1
    }
)

# Remove unnecessary metrics
evaluator.remove_metric("ndcg")
evaluator.remove_metric("mrr")

# Evaluate with normalization
results = evaluator.evaluate(eval_dataset, normalize=True)

# Print results
for metric_name, result in results.items():
    print(f"{metric_name}: {result.value}")

print(f"\nOverall Score: {evaluator.compute_objective_score(results)}")