Тестирование AI-агентов: eval-фреймворк

Продакшен-тестирование недетерминированных агентов.

📊 Продвинутый ⏱ 13 мин

# 1. PYTEST FIXTURES ДЛЯ АГЕНТОВ

import pytest
import openai
import time
import json
from dataclasses import dataclass, field

@dataclass
class EvalResult:
    """Outcome of one evaluated test case.

    Only ``test_name`` is required; the remaining fields default to a
    passing result with a perfect score and zero latency/cost.
    """

    test_name: str  # identifier of the evaluated case
    passed: bool = True  # pass/fail flag; counted by EvalMetricsCollector.summary()
    score: float = 1.0  # averaged into "avg_score" by the collector
    latency: float = 0.0  # averaged into "avg_latency"; the report prints it with an "s" suffix
    cost: float = 0.0  # summed into "total_cost"; the report prints it with a "$" prefix
    explanation: str = ""  # free-text note; presumably the judge's reasoning (not read by the collector)

@pytest.fixture
def agent_client():
    """Fixture: build the OpenAI client that tests use to call the agent.

    The client is constructed with no arguments, so configuration is left
    to the library's defaults.
    """
    client = openai.OpenAI()
    return client

@pytest.fixture
def eval_llm():
    """Fixture: build the OpenAI client used for LLM-as-Judge grading.

    Only the client is created here; the fixture itself does not select
    a judge model.
    """
    judge_client = openai.OpenAI()
    return judge_client

@pytest.fixture(scope="session")
def cost_tracker():
    """Fixture: accumulator for cost and token usage across the test session.

    Session scope is what makes the totals cumulative: with pytest's default
    function scope every test would receive a brand-new dict and the running
    totals would reset for each test.

    Returns:
        A dict with ``"total_cost"`` (float) and ``"total_tokens"`` (int),
        both starting at zero.
    """
    return {"total_cost": 0.0, "total_tokens": 0}

# 2. LLM-AS-JUDGE — ОЦЕНКА КАЧЕСТВА ОТВЕТОВ

def llm_judge(eval_client, question, actual, expected_criteria, model="gpt-4o"):
    """Ask an LLM to grade the agent's answer against the given criteria.

    Args:
        eval_client: Client exposing ``chat.completions.create(...)``.
        question: The question that was put to the agent.
        actual: The agent's answer to be graded.
        expected_criteria: Plain-text description of what a good answer is.
        model: Judge model name passed to the API (defaults to "gpt-4o").

    Returns:
        A ``(score, explanation)`` tuple where ``score`` is a float clamped
        to the range 0.0..1.0 and ``explanation`` is a string (empty when
        the judge did not provide one).

    Raises:
        ValueError: If the judge returns empty content, invalid JSON, or a
            score that cannot be converted to a number.
        KeyError: If the judge's JSON has no "score" field.
    """
    prompt = f"""You are an AI evaluator. Grade the answer on a scale 0.0-1.0.

Question: {question}
Agent's Answer: {actual}
Criteria: {expected_criteria}

Output ONLY JSON: {{"score": float, "explanation": "brief reason"}}"""

    resp = eval_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        response_format={"type": "json_object"},
    )

    content = resp.choices[0].message.content
    if not content:
        # json.loads(None) would raise an opaque TypeError; fail clearly instead.
        raise ValueError("Judge returned an empty response")
    result = json.loads(content)

    # The judge may emit the score as a string or outside the requested
    # range; normalise so callers can compare it against a float threshold.
    # Constants come first in max/min so that a NaN score collapses to 0.0.
    score = min(1.0, max(0.0, float(result["score"])))
    explanation = str(result.get("explanation", ""))
    return score, explanation

# 3. ПАРАМЕТРИЗОВАННЫЕ ТЕСТЫ АГЕНТА

@pytest.mark.parametrize("question,expected_criteria,min_score", [
    ("What is 2+2?", "Answer must be exactly 4, numeric", 0.95),
    ("Capital of France?", "Must answer Paris, no extra text needed", 0.95),
    ("Explain quantum computing in 1 sentence",
     "Must be one sentence, mention qubits, under 50 words", 0.80),
    ("Write a Python function to reverse a string",
     "Must contain valid Python code, function named 'reverse_string'", 0.85),
    ("What is the airspeed velocity of an unladen swallow?",
     "May mention Monty Python. Must not hallucinate exact number.", 0.70),
])
def test_agent_accuracy(agent_client, eval_llm, cost_tracker,
                         question, expected_criteria, min_score):
    """Check that the agent's answer is graded at or above ``min_score``.

    For each parametrized case the agent is called once, its token usage is
    added to ``cost_tracker``, the answer is graded by ``llm_judge``, and the
    test fails if the judge's score is below the case's threshold.
    """
    # USD per one million tokens. Presumably gpt-4o list prices -- verify
    # against current pricing before relying on the reported cost.
    input_usd_per_mtok = 2.5
    output_usd_per_mtok = 10

    # perf_counter is monotonic, so the measured latency cannot be skewed
    # by system clock adjustments during the call.
    started = time.perf_counter()
    resp = agent_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "Be concise and accurate."},
                  {"role": "user", "content": question}],
        temperature=0,
    )
    latency = time.perf_counter() - started
    answer = resp.choices[0].message.content

    # Record spend before grading so it is counted even if the judge call
    # raises or the assertion below fails.
    # NOTE: only the agent call is costed; the judge call's usage is not
    # available here.
    usage = resp.usage
    prompt_cost = usage.prompt_tokens * input_usd_per_mtok / 1_000_000
    completion_cost = usage.completion_tokens * output_usd_per_mtok / 1_000_000
    cost = prompt_cost + completion_cost
    cost_tracker["total_cost"] += cost
    cost_tracker["total_tokens"] += usage.prompt_tokens + usage.completion_tokens

    # LLM-as-Judge grading of the agent's answer.
    score, explanation = llm_judge(eval_llm, question, answer, expected_criteria)

    # Quality must not fall below the per-case threshold.
    assert score >= min_score, (
        f"Score {score:.2f} < {min_score}. {explanation}\nAnswer: {answer}"
    )

    print(f"✓ {question[:40]}... | score={score:.2f} | {latency:.2f}s | ${cost:.4f}")

# 4. МЕТРИКИ: СБОР И ОТЧЁТ

class EvalMetricsCollector:
    """Collects evaluation results and reports aggregate statistics."""

    def __init__(self):
        # Recorded results, in insertion order.
        self.results = []

    def record(self, result: "EvalResult"):
        """Append one result to the collection."""
        self.results.append(result)

    def summary(self):
        """Return aggregate metrics over all recorded results.

        Returns:
            An empty dict when nothing has been recorded; otherwise a dict
            with the keys ``total``, ``passed``, ``failed``, ``avg_score``,
            ``avg_latency`` and ``total_cost``.
        """
        if not self.results:
            return {}
        total = len(self.results)
        passed = sum(1 for r in self.results if r.passed)
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "avg_score": sum(r.score for r in self.results) / total,
            "avg_latency": sum(r.latency for r in self.results) / total,
            "total_cost": sum(r.cost for r in self.results),
        }

    def report(self):
        """Print a human-readable summary to stdout."""
        s = self.summary()
        if not s:
            # summary() is empty when nothing was recorded; indexing it
            # below would raise KeyError, so report that explicitly.
            print("=== EVAL REPORT ===\nNo results recorded.")
            return
        print(f"=== EVAL REPORT ===\n"
              f"Tests: {s['total']} | Passed: {s['passed']} | Failed: {s['failed']}\n"
              f"Avg Score: {s['avg_score']:.3f} | Avg Latency: {s['avg_latency']:.2f}s\n"
              f"Total Cost: ${s['total_cost']:.4f}")

# 5. GITHUB ACTIONS CI/CD ИНТЕГРАЦИЯ (файл .github/workflows/agent-eval.yml)

# Raw string: a regular triple-quoted string would treat each trailing
# backslash + newline as a line continuation and drop it, so the shell
# continuations in the "Run agent evals" step would not reach the YAML.
GITHUB_ACTIONS_YAML = r"""
name: Agent Eval Pipeline
on:
  pull_request:
    paths: ['agent/**', 'prompts/**']
  schedule:
    - cron: '0 6 * * 1'  # Every Monday 6 AM

jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        # pytest-json-report provides the --json-report options used below
        run: pip install pytest pytest-json-report openai anthropic

      - name: Run agent evals
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          python -m pytest tests/agent_evals.py \
            -v --tb=short \
            --json-report \
            --json-report-file=eval_results.json

      - name: Check score threshold
        run: |
          python - <<'PY'
          import json
          import sys

          with open('eval_results.json') as fh:
              report = json.load(fh)

          # Scores are only present if the tests attach them to the report
          # (e.g. through the plugin's json_metadata fixture); a top-level
          # 'score' key is accepted as well.
          scores = []
          for t in report.get('tests', []):
              score = t.get('metadata', {}).get('score', t.get('score'))
              if score is not None:
                  scores.append(score)

          avg = sum(scores) / len(scores) if scores else 0
          print(f'Average score: {avg}')
          if avg < 0.75:
              print('Eval score below threshold!')
              sys.exit(1)
          PY

      - name: Upload eval report
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: eval-report
          path: eval_results.json
"""

# Usage hints written to stdout when the module is loaded.
print(
    "Agent eval framework ready.",
    "Run: pytest tests/agent_evals.py -v",
    "CI: add .github/workflows/agent-eval.yml to your repo",
    sep="\n",
)

🔗 Полезные ссылки

📖 pytest · 📖 LangSmith · 🔬 Arize Phoenix