Skip to main content

Evaluation API Reference

SynapseKit's evaluation framework measures RAG and agent quality using LLM-as-judge metrics.

EvaluationPipeline

from synapsekit.evaluation import EvaluationPipeline

pipeline = EvaluationPipeline(
metrics: list[BaseMetric],
llm: BaseLLM | None = None,
)
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| metrics | list[BaseMetric] | required | Metrics to compute during evaluation |
| llm | BaseLLM \| None | None | Default judge LLM for metrics that do not define their own |

async evaluate(question, answer, contexts, ground_truth=None)

async def evaluate(
question: str,
answer: str,
contexts: list[str],
ground_truth: str | None = None,
) -> EvaluationResult
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| question | str | required | The user question |
| answer | str | required | The generated answer to evaluate |
| contexts | list[str] | required | Retrieved document chunks used to generate the answer |
| ground_truth | str \| None | None | Reference answer (required for some metrics) |
from synapsekit.evaluation import EvaluationPipeline, FaithfulnessMetric, RelevancyMetric

pipeline = EvaluationPipeline(
metrics=[FaithfulnessMetric(llm=judge_llm), RelevancyMetric(llm=judge_llm)],
)
result = await pipeline.evaluate(
question="What is SynapseKit?",
answer="SynapseKit is an async-first Python library.",
contexts=["SynapseKit is an async-first Python library for building LLM applications."],
)
print(result.scores) # {"faithfulness": 0.94, "relevancy": 0.88}
print(result.overall_score) # 0.91

async evaluate_batch(samples, concurrency=4)

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| samples | list[dict] | required | List of dicts, each with question, answer, contexts, optionally ground_truth |
| concurrency | int | 4 | Number of concurrent evaluation calls |

EvaluationResult

@dataclass
class EvaluationResult:
question: str
answer: str
contexts: list[str]
ground_truth: str | None
scores: dict[str, float] # metric_name -> 0.0 to 1.0
overall_score: float # mean of all metric scores
reasoning: dict[str, str] # metric_name -> LLM explanation
passed: bool # True if overall_score >= threshold
threshold: float # default 0.7

FaithfulnessMetric

Measures whether every claim in the answer is supported by the retrieved contexts. Score = verified_claims / total_claims.

FaithfulnessMetric(llm: BaseLLM, threshold: float = 0.7)
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| llm | BaseLLM | required | Judge LLM for claim verification |
| threshold | float | 0.7 | Minimum passing score |

RelevancyMetric

Measures whether the answer addresses the question. The LLM rates on a 1–5 scale, normalized to 0.0–1.0.

RelevancyMetric(llm: BaseLLM, threshold: float = 0.7)

GroundednessMetric

Compares the answer to a ground-truth reference answer. Requires ground_truth in evaluate().

GroundednessMetric(
llm: BaseLLM,
threshold: float = 0.7,
mode: str = "llm",
)
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| llm | BaseLLM | required | Judge LLM |
| threshold | float | 0.7 | Minimum passing score |
| mode | str | "llm" | "llm" (semantic) or "rouge" (token overlap) |

ContextRecallMetric

Measures whether the retrieved contexts contain enough information to answer the question. Score = attributable_sentences / total_sentences in the ground truth. Requires ground_truth.

ContextRecallMetric(llm: BaseLLM, threshold: float = 0.7)

ContextPrecisionMetric

Measures what fraction of the retrieved contexts were actually useful. Score = relevant_chunks / total_chunks.

ContextPrecisionMetric(llm: BaseLLM, threshold: float = 0.7)

@eval_case decorator

Marks a test function as an evaluation case for sk eval CLI integration.

from synapsekit.evaluation import eval_case, EvalCaseMeta

@eval_case(
meta=EvalCaseMeta(
name="rag_basic_factual",
tags=["rag", "factual"],
threshold=0.8,
)
)
async def test_basic_rag(rag_pipeline):
result = await rag_pipeline.aquery("What is SynapseKit?")
return {
"question": "What is SynapseKit?",
"answer": result,
"contexts": [],
}

EvalCaseMeta

@dataclass
class EvalCaseMeta:
name: str
tags: list[str] = field(default_factory=list)
threshold: float = 0.7
metrics: list[str] | None = None
description: str = ""
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| name | str | required | Unique test case identifier |
| tags | list[str] | [] | Tags for filtering with sk eval --tag |
| threshold | float | 0.7 | Override pipeline threshold for this case |
| metrics | list[str] \| None | None | Subset of metrics to run; None = use pipeline default |
| description | str | "" | Human-readable description |

Running evaluations

import asyncio
from synapsekit.evaluation import EvaluationPipeline, FaithfulnessMetric, RelevancyMetric

async def main():
pipeline = EvaluationPipeline(
metrics=[FaithfulnessMetric(llm=judge_llm), RelevancyMetric(llm=judge_llm)],
)
result = await pipeline.evaluate(
question="What databases does SynapseKit support?",
answer="SynapseKit supports Redis, SQLite, PostgreSQL, DynamoDB, and MongoDB.",
contexts=[
"SynapseKit's memory backends include Redis, SQLite, and PostgreSQL.",
"DynamoDB and MongoDB memory backends are also available.",
],
)
print(f"Overall: {result.overall_score:.2f}, Passed: {result.passed}")

asyncio.run(main())

From CLI:

sk eval --test-dir tests/eval/ --output eval_report.json
sk eval --tag rag --threshold 0.8

See also