Testing

Reliable LLM applications need a layered test strategy: fast unit tests with mocks, integration tests against real APIs, and eval tests that measure answer quality. This guide covers all three layers, plus prompt snapshot tests and CI setup with GitHub Actions.

Prerequisites

# Quote the extra so shells such as zsh do not treat [openai] as a glob pattern.
pip install "synapsekit[openai]" pytest pytest-asyncio

Configure pytest for async tests in pyproject.toml:

[tool.pytest.ini_options]
# Let pytest-asyncio collect and run async tests and fixtures automatically.
asyncio_mode = "auto"
# Register the custom marker used by the integration tests so that
# `-m "not integration"` selects cleanly and pytest emits no unknown-mark warning.
markers = [
    "integration: tests that call real external APIs",
]

1. Unit testing with MockLLM

MockLLM returns preset responses without making any API calls. Use it for all unit tests.

# tests/conftest.py
import pytest
from synapsekit.llms.base import BaseLLM, LLMConfig
from typing import AsyncIterator


class MockLLM(BaseLLM):
    """Deterministic mock LLM — no API calls, no cost, no flakiness.

    Canned responses are served in order and wrap around once the list is
    exhausted. Every prompt passed to ``generate`` (directly or via
    ``stream``) is appended to ``call_log`` so tests can assert on it.
    """

    def __init__(self, responses: list[str] | None = None):
        """Create the mock.

        Args:
            responses: Canned replies to cycle through. ``None`` or an empty
                list falls back to a single ``"Mock response"``.
        """
        super().__init__(config=LLMConfig())
        self.responses = responses or ["Mock response"]
        self._call_count = 0
        self.call_log: list[str] = []  # Record all prompts for assertions

    async def generate(self, prompt: str, **kwargs) -> str:
        """Record ``prompt`` and return the next canned response.

        Raises:
            ValueError: If ``responses`` was reassigned to an empty list.
        """
        self.call_log.append(prompt)
        if not self.responses:
            # Tests may reassign `responses`; fail with a clear message
            # instead of the ZeroDivisionError the modulo below would raise.
            raise ValueError("MockLLM.responses must not be empty")
        response = self.responses[self._call_count % len(self.responses)]
        self._call_count += 1
        return response

    async def stream(self, prompt: str, **kwargs) -> AsyncIterator[str]:
        """Yield the next canned response word by word.

        Delegates to ``generate`` so the prompt is logged and the call is
        counted. Each chunk is a whitespace-separated word followed by a
        single space, so the joined stream ends with a trailing space.
        """
        response = await self.generate(prompt, **kwargs)
        for word in response.split():
            yield word + " "

    def reset(self):
        """Restart the response cycle and forget all recorded prompts."""
        self._call_count = 0
        self.call_log.clear()


@pytest.fixture
def mock_llm():
    """Provide a MockLLM that cycles through three canned RAG-themed answers."""
    canned_answers = [
        "RAG stands for Retrieval-Augmented Generation.",
        "It combines retrieval with language model generation.",
        "RAG reduces hallucinations by grounding answers in retrieved documents.",
    ]
    return MockLLM(responses=canned_answers)


@pytest.fixture
def single_response_llm():
    """Provide a MockLLM that answers every prompt with the same sentence."""
    only_answer = "The answer is 42."
    return MockLLM(responses=[only_answer])

Testing RAG query

# tests/test_rag.py
import pytest
from synapsekit import RAG


@pytest.mark.asyncio
async def test_rag_returns_answer(mock_llm):
    """A query on a one-document index returns a non-empty string from one LLM call."""
    pipeline = RAG(llm=mock_llm)
    documents = ["SynapseKit makes building LLM apps easy."]
    await pipeline.aadd(documents)

    reply = await pipeline.aquery("What is SynapseKit?")

    assert isinstance(reply, str)
    assert reply != ""
    # Exactly one generation call should have reached the mock.
    assert mock_llm._call_count == 1


@pytest.mark.asyncio
async def test_rag_uses_context(mock_llm):
    """The prompt handed to the LLM contains text from the indexed document."""
    pipeline = RAG(llm=mock_llm)
    await pipeline.aadd(["Python was created by Guido van Rossum in 1991."])

    await pipeline.aquery("Who created Python?")

    # The mock records every prompt it receives; inspect the single entry.
    recorded_prompts = mock_llm.call_log
    assert len(recorded_prompts) == 1
    assert "Guido" in recorded_prompts[0]


@pytest.mark.asyncio
async def test_rag_streaming(mock_llm):
    """Streaming a query yields at least one chunk and the joined text mentions RAG."""
    pipeline = RAG(llm=mock_llm)
    await pipeline.aadd(["RAG is a retrieval technique."])

    # Drain the async stream into a list of chunks.
    chunks = [chunk async for chunk in pipeline.astream("What is RAG?")]

    streamed_text = "".join(chunks)
    assert chunks
    assert "RAG" in streamed_text


@pytest.mark.asyncio
async def test_rag_multiple_turns(mock_llm):
    """Two consecutive queries produce two LLM calls and four stored messages."""
    from synapsekit.memory import ConversationMemory

    history = ConversationMemory(max_messages=10)
    pipeline = RAG(llm=mock_llm, memory=history)
    await pipeline.aadd(["Context document."])

    for question in ("First question", "Second question"):
        await pipeline.aquery(question)

    assert mock_llm._call_count == 2
    stored = await history.get_messages()
    # Each turn is expected to store one user and one assistant message.
    assert len(stored) == 4

2. Pytest fixtures for RAG

# tests/conftest.py (additions)
import pytest
from synapsekit import RAG
from synapsekit.memory import ConversationMemory


@pytest.fixture
async def rag_pipeline(mock_llm):
    """Build a RAG pipeline on the mock LLM with three documents already indexed."""
    seed_documents = [
        "SynapseKit is an open-source Python library for LLM applications.",
        "It supports RAG, agents, graph workflows, and multi-modal inputs.",
        "SynapseKit v1.2 introduced streaming, serve, and cost intelligence.",
    ]
    pipeline = RAG(llm=mock_llm)
    await pipeline.aadd(seed_documents)
    return pipeline


@pytest.fixture
async def rag_with_memory(mock_llm):
    """Build a memory-backed RAG pipeline; returns a ``(rag, memory)`` tuple."""
    history = ConversationMemory(max_messages=10)
    pipeline = RAG(llm=mock_llm, memory=history)
    await pipeline.aadd(["SynapseKit supports memory-backed pipelines."])
    # Hand back the memory as well so tests can inspect the stored messages.
    return pipeline, history


# Usage in tests:
@pytest.mark.asyncio
async def test_using_rag_fixture(rag_pipeline):
    """Querying the pre-loaded pipeline fixture returns something other than None."""
    reply = await rag_pipeline.aquery("What is SynapseKit?")
    assert reply is not None


@pytest.mark.asyncio
async def test_memory_fixture(rag_with_memory):
    """After one query, the fixture's memory holds at least one message."""
    pipeline, history = rag_with_memory
    await pipeline.aquery("First turn")
    stored = await history.get_messages()
    assert len(stored) > 0

3. Testing agents with mock tools

# tests/test_agents.py
import pytest
from unittest.mock import AsyncMock, patch
from synapsekit.agents import FunctionCallingAgent
from synapsekit.tools import tool


@tool
def get_weather(city: str) -> str:
    """Get current weather."""
    # Docstring kept verbatim: @tool may expose it as the tool description (TODO confirm).
    # Canned forecast: no I/O, so agent tests stay deterministic.
    forecast = "22°C, sunny"
    return f"{city}: {forecast}"


@tool
async def fetch_price(ticker: str) -> str:
    """Get stock price."""
    # Docstring kept verbatim: @tool may expose it as the tool description (TODO confirm).
    # Fixed quote: the coroutine performs no I/O.
    quote = "$150.00"
    return f"{ticker}: {quote}"


@pytest.mark.asyncio
async def test_agent_calls_tool(mock_llm):
    """Run the agent against a scripted tool-call reply followed by a final answer.

    Only checks that a result comes back and that the LLM was called at
    least once.
    """
    # Script the mock: first a tool-call payload, then the final answer.
    tool_call_reply = '{"tool": "get_weather", "args": {"city": "Paris"}}'
    final_reply = "The weather in Paris is 22°C and sunny."
    mock_llm.responses = [tool_call_reply, final_reply]

    weather_agent = FunctionCallingAgent(llm=mock_llm, tools=[get_weather])
    outcome = await weather_agent.run("What's the weather in Paris?")

    assert outcome is not None
    assert mock_llm._call_count >= 1


@pytest.mark.asyncio
async def test_agent_with_patched_async_tool(mock_llm):
    """Test agent with a stubbed async tool to avoid HTTP calls.

    The tool body is backed by an ``AsyncMock`` instead of patching
    ``__call__`` on the tool object. Python resolves implicit special-method
    calls such as ``tool(...)`` on the object's *type*, so an attribute
    patched onto the instance would not intercept the call.
    """
    mock_llm.responses = ["The stock price is $150.00."]

    # Awaitable stand-in for the real price lookup.
    price_stub = AsyncMock(return_value="AAPL: $150.00")

    # Local tool with the same name, signature and docstring as the
    # module-level `fetch_price`, but delegating to the stub.
    @tool
    async def fetch_price(ticker: str) -> str:
        """Get stock price."""
        return await price_stub(ticker)

    agent = FunctionCallingAgent(llm=mock_llm, tools=[fetch_price])
    result = await agent.run("What's the price of AAPL?")

    assert result is not None


@pytest.mark.asyncio
async def test_agent_handles_tool_error(mock_llm):
    """An agent built with on_tool_error='continue' still returns a result.

    The only registered tool raises ``ValueError`` whenever it is called.
    """
    @tool
    def always_fails(input: str) -> str:
        """A tool that always raises."""
        raise ValueError("Tool error!")

    recovery_reply = (
        "I encountered an error with that tool. Let me try a different approach."
    )
    mock_llm.responses = [recovery_reply]

    resilient_agent = FunctionCallingAgent(
        llm=mock_llm, tools=[always_fails], on_tool_error="continue"
    )
    outcome = await resilient_agent.run("Use the failing tool")
    assert outcome is not None

4. Integration tests (real API)

Integration tests call real APIs, so run them only where valid credentials are available (typically CI). The module below skips itself when OPENAI_API_KEY is not set.

# tests/integration/test_openai_integration.py
import pytest
import os

# Skip entirely if no API key
# Module-wide skip: applies to every test in this file when the key is
# unset or empty.
pytestmark = pytest.mark.skipif(
    os.environ.get("OPENAI_API_KEY") in (None, ""),
    reason="OPENAI_API_KEY not set",
)

from synapsekit import RAG
from synapsekit.llms.openai import OpenAILLM


@pytest.mark.integration
@pytest.mark.asyncio
async def test_real_rag_pipeline():
    """End-to-end RAG test against real OpenAI API.

    Indexes one fact about the Eiffel Tower and checks that the answer
    reports the height given in that document.
    """
    llm = OpenAILLM(model="gpt-4o-mini")
    rag = RAG(llm=llm)

    await rag.aadd([
        "The Eiffel Tower is 330 metres tall and located in Paris.",
    ])

    answer = await rag.aquery("How tall is the Eiffel Tower?")

    # Print before asserting so the answer appears in pytest's captured
    # output when the assertion fails.
    print(f"Real API answer: {answer}")
    # Require the number itself: accepting any answer that merely mentions
    # "metres" would let a wrong height pass.
    assert "330" in answer, f"expected the indexed height in the answer, got: {answer!r}"

Run only unit tests (fast, no API):

pytest tests/ -m "not integration" -q
# Expected output:
# .................. 18 passed in 0.43s

Run all tests including integration:

pytest tests/ -q
# Expected output:
# .................... 20 passed in 3.2s

5. Eval testing with `@eval_case`

Eval tests measure answer quality, not just whether the code runs. Run them with the `synapsekit test` command.

# tests/evals/test_rag_quality.py
from synapsekit.evaluation import eval_case, EvalConfig
from synapsekit import RAG
from synapsekit.llms.openai import OpenAILLM


@eval_case(
    inputs={"question": "What is RAG?"},
    expected="Retrieval-Augmented Generation",
    metric="contains",  # Answer must contain the expected string
)
async def test_rag_definition(question: str) -> str:
    """Answer ``question`` from a one-document RAG pipeline backed by gpt-4o-mini."""
    pipeline = RAG(llm=OpenAILLM(model="gpt-4o-mini"))
    await pipeline.aadd(["RAG is Retrieval-Augmented Generation, a technique that..."])
    reply = await pipeline.aquery(question)
    return reply


@eval_case(
    inputs={"question": "Summarize quantum computing in one sentence"},
    expected_keywords=["quantum", "computing", "qubit"],
    metric="keyword_coverage",
    # Assuming the score is the fraction of keywords present, three keywords
    # can only score 0, 1/3, 2/3 or 1, so 0.8 effectively requires all three.
    threshold=0.8,
)
async def test_summary_keywords(question: str) -> str:
    """Return gpt-4o-mini's direct (non-RAG) completion for ``question``."""
    model = OpenAILLM(model="gpt-4o-mini")
    summary = await model.generate(question)
    return summary


@eval_case(
    inputs={"question": "Is Python a programming language?"},
    expected="yes",
    metric="llm_judge",  # Correctness is scored by a judge LLM
    judge_prompt="Does the response confirm that Python is a programming language?",
    threshold=0.9,
)
async def test_factual_correctness(question: str) -> str:
    """Return gpt-4o-mini's direct completion for ``question`` to be judged."""
    model = OpenAILLM(model="gpt-4o-mini")
    verdict_input = await model.generate(question)
    return verdict_input

Run evals locally:

synapsekit test tests/evals/ --threshold 0.7
# Expected output:
# Running 3 eval cases...
# test_rag_definition         PASS  score=1.00
# test_summary_keywords       PASS  score=1.00
# test_factual_correctness    PASS  score=0.95
# ----------------------------------------
# Passed: 3/3  (threshold: 0.70)

Run with JSON output for CI:

synapsekit test tests/evals/ --threshold 0.7 --format json --output eval_results.json

6. GitHub Actions CI example

# .github/workflows/ci.yml
name: CI

# Run on pushes to main and on pull requests targeting main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  # Fast, mock-only tests: no API key is provided to this job.
  unit-tests:
    name: Unit Tests
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: pip install synapsekit[openai] pytest pytest-asyncio

      - name: Run unit tests
        run: pytest tests/ -m "not integration" -q --tb=short
        # Expected output:
        # .................. 18 passed in 0.43s

  # Quality evals against the real API; results are kept as a build artifact.
  eval-tests:
    name: Eval Tests
    runs-on: ubuntu-latest
    needs: unit-tests  # Only run evals after unit tests pass
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: pip install synapsekit[openai]

      - name: Run eval suite
        # --output writes the JSON report to the file the upload step below
        # expects; without it there is no eval_results.json to upload.
        run: synapsekit test tests/evals/ --threshold 0.7 --format json --output eval_results.json
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Upload eval results
        uses: actions/upload-artifact@v4
        if: always()  # Upload the report even when the eval step fails
        with:
          name: eval-results
          path: eval_results.json

7. Snapshot testing for prompts

Ensure prompt construction does not regress across refactors.

# tests/test_prompt_snapshots.py
import pytest
from synapsekit import RAG
from synapsekit.llms.base import BaseLLM, LLMConfig
from typing import AsyncIterator


class CapturingLLM(BaseLLM):
    """Captures the full prompt for snapshot testing.

    Every prompt passed to ``generate`` or ``stream`` is appended to
    ``captured_prompts``; the reply is always ``"Test response"``.
    """

    def __init__(self):
        super().__init__(config=LLMConfig())
        self.captured_prompts: list[str] = []  # Prompts in call order

    async def generate(self, prompt: str, **kwargs) -> str:
        """Record ``prompt`` and return the fixed reply."""
        self.captured_prompts.append(prompt)
        return "Test response"

    async def stream(self, prompt: str, **kwargs) -> AsyncIterator[str]:
        """Record ``prompt`` and yield the fixed reply as a single chunk."""
        # Route through generate() so prompts from streamed queries are
        # captured too, instead of being silently dropped.
        yield await self.generate(prompt, **kwargs)


@pytest.mark.asyncio
async def test_rag_prompt_includes_context():
    """The single prompt sent to the LLM contains both the document text and the question."""
    question = "What is the capital of France?"
    recorder = CapturingLLM()
    pipeline = RAG(llm=recorder)
    await pipeline.aadd(["Paris is the capital of France."])

    await pipeline.aquery(question)

    assert len(recorder.captured_prompts) == 1
    (sent_prompt,) = recorder.captured_prompts

    # Retrieved context must be present in the prompt.
    assert "Paris" in sent_prompt
    assert "capital" in sent_prompt.lower()
    # The user's question must be present verbatim.
    assert question in sent_prompt

Summary

| Test type | Tool | Speed | Cost |
| --- | --- | --- | --- |
| Unit tests | `MockLLM` + pytest | Fast (< 1s) | $0 |
| Integration tests | Real API + `skipif` | Slow (2-10s) | Minimal |
| Eval tests | `@eval_case` + LLM judge | Slow (5-30s) | Low |
| Snapshot tests | `CapturingLLM` | Fast (< 1s) | $0 |

Run unit tests on every commit, and run integration and eval tests on pull requests and on merges to the main branch.

Prerequisites

1. Unit testing with MockLLM

Testing RAG query

2. Pytest fixtures for RAG

3. Testing agents with mock tools

4. Integration tests (real API)

5. Eval testing with `@eval_case`

6. GitHub Actions CI example

7. Snapshot testing for prompts

Summary