"""Production testing of non-deterministic agents."""
import json
import time
from dataclasses import dataclass, field

import openai
import pytest


@dataclass
class EvalResult:
    """Outcome of a single agent evaluation.

    Only ``test_name`` is required; the remaining fields default to a
    passing result with a perfect score and zero latency/cost.
    """

    test_name: str
    passed: bool = True
    score: float = 1.0
    # NOTE(review): presumably seconds, matching the latency measured in
    # test_agent_accuracy -- confirm with whoever populates this field.
    latency: float = 0.0
    # NOTE(review): presumably US dollars (the report prints it with "$").
    cost: float = 0.0
    explanation: str = ""


@pytest.fixture
def agent_client():
    """Fixture: create the OpenAI client used to call the agent under test."""
    return openai.OpenAI()


@pytest.fixture
def eval_llm():
    """Fixture: create the OpenAI client used for LLM-as-judge scoring."""
    return openai.OpenAI()


# Session scope is required for the totals to be cumulative: with pytest's
# default function scope every test would receive a fresh, zeroed dict.
@pytest.fixture(scope="session")
def cost_tracker():
    """Fixture: track the cumulative cost of all tests in the session.

    Returns:
        A mutable dict with ``"total_cost"`` and ``"total_tokens"`` counters,
        shared by every test in the pytest session.
    """
    return {"total_cost": 0.0, "total_tokens": 0}
def llm_judge(eval_client, question, actual, expected_criteria):
    """Have an LLM grade the agent's answer against the given criteria.

    Args:
        eval_client: Client exposing ``chat.completions.create`` (an OpenAI
            client in this file).
        question: The question that was put to the agent.
        actual: The agent's answer to be graded.
        expected_criteria: Free-text description of what a good answer
            must satisfy.

    Returns:
        A ``(score, explanation)`` tuple where ``score`` is a float in
        ``[0.0, 1.0]`` and ``explanation`` is the judge's reason (an empty
        string when the judge omitted it).

    Raises:
        ValueError: If the judge returned no content, invalid JSON, a
            non-numeric score, or a score outside the 0.0-1.0 scale.
        KeyError: If the judge's JSON has no ``"score"`` key.
    """
    prompt = (
        "You are an AI evaluator. Grade the answer on a scale 0.0-1.0. "
        f"Question: {question} "
        f"Agent's Answer: {actual} "
        f"Criteria: {expected_criteria} "
        'Output ONLY JSON: {"score": float, "explanation": "brief reason"}'
    )
    resp = eval_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        response_format={"type": "json_object"},
    )

    content = resp.choices[0].message.content
    if content is None:
        # json.loads(None) would raise an opaque TypeError; fail clearly.
        raise ValueError("Judge returned no content to parse.")

    result = json.loads(content)

    try:
        score = float(result["score"])
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"Judge returned a non-numeric score: {result['score']!r}"
        ) from err

    # Reject rather than clamp: a judge that answered on a different scale
    # (e.g. 7 out of 10) must not be silently treated as a perfect 1.0.
    # The chained comparison is also False for NaN.
    if not 0.0 <= score <= 1.0:
        raise ValueError(f"Judge score {score!r} is outside the 0.0-1.0 scale.")

    return score, str(result.get("explanation", ""))
@pytest.mark.parametrize("question,expected_criteria,min_score", [
    ("What is 2+2?", "Answer must be exactly 4, numeric", 0.95),
    ("Capital of France?", "Must answer Paris, no extra text needed", 0.95),
    ("Explain quantum computing in 1 sentence",
     "Must be one sentence, mention qubits, under 50 words", 0.80),
    ("Write a Python function to reverse a string",
     "Must contain valid Python code, function named 'reverse_string'", 0.85),
    ("What is the airspeed velocity of an unladen swallow?",
     "May mention Monty Python. Must not hallucinate exact number.", 0.70),
])
def test_agent_accuracy(agent_client, eval_llm, cost_tracker,
                        question, expected_criteria, min_score):
    """Ask the agent a question and assert the judge's score meets the bar.

    For each parametrized case the agent (gpt-4o, temperature 0) answers
    ``question``, ``llm_judge`` grades the answer against
    ``expected_criteria``, and the test fails when the score is below
    ``min_score``. Cost and token usage of the agent call are added to
    ``cost_tracker``; the judge call's own usage is not tracked here.
    """
    # perf_counter is monotonic, so the measured latency cannot go negative
    # or jump if the system clock is adjusted mid-call.
    start = time.perf_counter()

    # Call the agent.
    resp = agent_client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "Be concise and accurate."},
                  {"role": "user", "content": question}],
        temperature=0,
    )
    latency = time.perf_counter() - start
    answer = resp.choices[0].message.content

    # LLM-as-judge grading.
    score, explanation = llm_judge(eval_llm, question, answer, expected_criteria)

    # Track cost. NOTE(review): 2.5 and 10 look like USD per million
    # prompt/completion tokens for gpt-4o -- confirm against current pricing.
    usage = resp.usage
    prompt_cost = usage.prompt_tokens * 2.5 / 1_000_000
    completion_cost = usage.completion_tokens * 10 / 1_000_000
    cost = prompt_cost + completion_cost
    cost_tracker["total_cost"] += cost
    # The tracker exposes a token counter; keep it in step with the cost.
    cost_tracker["total_tokens"] += usage.prompt_tokens + usage.completion_tokens

    # Assert: quality must not fall below the threshold.
    assert score >= min_score, f"Score {score:.2f} < {min_score}. {explanation}\nAnswer: {answer}"
    print(f"✓ {question[:40]}... | score={score:.2f} | {latency:.2f}s | ${cost:.4f}")
class EvalMetricsCollector:
    """Accumulate evaluation results and report aggregate statistics."""

    def __init__(self):
        # Results in the order they were recorded.
        self.results = []

    def record(self, result: EvalResult):
        """Append one evaluation result to the collection."""
        self.results.append(result)

    def summary(self):
        """Return aggregate metrics over all recorded results.

        Returns:
            An empty dict when nothing has been recorded; otherwise a dict
            with ``total``, ``passed``, ``failed``, ``avg_score``,
            ``avg_latency`` and ``total_cost``.
        """
        if not self.results:
            return {}
        total = len(self.results)
        passed = sum(1 for r in self.results if r.passed)
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "avg_score": sum(r.score for r in self.results) / total,
            "avg_latency": sum(r.latency for r in self.results) / total,
            "total_cost": sum(r.cost for r in self.results),
        }

    def report(self):
        """Print a human-readable summary of the recorded results."""
        s = self.summary()
        if not s:
            # summary() is {} with no results; indexing it would KeyError.
            print("=== EVAL REPORT ===\nNo results recorded.")
            return
        print(f"=== EVAL REPORT ===\n"
              f"Tests: {s['total']} | Passed: {s['passed']} | Failed: {s['failed']}\n"
              f"Avg Score: {s['avg_score']:.3f} | Avg Latency: {s['avg_latency']:.2f}s\n"
              f"Total Cost: ${s['total_cost']:.4f}")
# Template for .github/workflows/agent-eval.yml.
#
# Raw string: the shell line continuations (a backslash at end of line) must
# reach the YAML file verbatim instead of being consumed by Python.
#
# Notes on the workflow itself:
# - `--json-report` is provided by the pytest-json-report plugin, so the
#   plugin is installed alongside pytest.
# - The threshold step looks for a per-test score under the report's
#   `metadata` entry (where pytest-json-report stores values set through its
#   `json_metadata` fixture), falling back to a top-level `score` key. When
#   no test recorded a score the step warns and skips instead of failing on
#   a meaningless average of 0; per-test thresholds are still enforced by
#   the pytest step's exit code.
GITHUB_ACTIONS_YAML = r"""
name: Agent Eval Pipeline
on:
  pull_request:
    paths: ['agent/**', 'prompts/**']
  schedule:
    - cron: '0 6 * * 1'  # Every Monday 6 AM
jobs:
  eval:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Install dependencies
        run: pip install pytest pytest-json-report openai anthropic
      - name: Run agent evals
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          python -m pytest tests/agent_evals.py \
            -v --tb=short \
            --json-report \
            --json-report-file=eval_results.json
      - name: Check score threshold
        run: |
          AVG=$(python -c "import json; d=json.load(open('eval_results.json')); vals=[(t.get('metadata') or {}).get('score', t.get('score')) for t in d.get('tests', [])]; scores=[v for v in vals if v is not None]; print(f'{sum(scores)/len(scores):.4f}' if scores else 'none')")
          if [ "$AVG" = "none" ]; then
            echo "::warning::No per-test scores found in eval_results.json; skipping threshold check."
            exit 0
          fi
          echo "Average score: $AVG"
          if (( $(echo "$AVG < 0.75" | bc -l) )); then
            echo "Eval score below threshold!"
            exit 1
          fi
      - name: Upload eval report
        uses: actions/upload-artifact@v4
        if: always()
        with:
          name: eval-report
          path: eval_results.json
"""

print("Agent eval framework ready.")
print("Run: pytest tests/agent_evals.py -v")
print("CI: add .github/workflows/agent-eval.yml to your repo")