AI Agent Development in Practice (7): Testing and Evaluation for More Reliable Agents
1. Opening: An Untested Agent Is a Time Bomb
Hi everyone, I'm Lao Jin.
AI Agents have one big problem: non-determinism.
- The same question can produce different answers
- Sometimes correct, sometimes spouting nonsense
- Edge cases are hard to anticipate
What can we do about it? Testing and evaluation!
Today we'll look at how to test an AI Agent.
2. Testing Challenges
2.1 Why Agents Are Hard to Test
| Challenge | Description |
|---|---|
| Non-determinism | LLM output is not fixed |
| External dependencies | API calls and tool execution |
| Long chains | Multi-step flows have many failure points |
| Subjectivity | Answer quality is hard to quantify |
Non-determinism is the thorniest of these; the sketch below shows one way to keep it in check.
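A minimal sketch of that idea, assuming an OpenAI-compatible client (the `ask` helper and model name are illustrative, not part of this series' codebase): pin the sampling parameters, then assert on properties that survive rephrasing.
```python
# Sketch: make LLM calls as repeatable as the API allows, then assert on
# stable properties (keywords, numbers) rather than exact wording.
from openai import AsyncOpenAI

client = AsyncOpenAI()

async def ask(question: str) -> str:  # hypothetical test helper
    resp = await client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": question}],
        temperature=0,  # reduces, but does not eliminate, output variance
        seed=42,        # best-effort reproducibility on models that support it
    )
    return resp.choices[0].message.content

# In a test: assert "56088" in await ask("What is 123 * 456?")
```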
2.2 Testing Strategy
```text
              The Agent testing pyramid

            ┌─────────────┐
            │  E2E tests  │          ← few, comprehensive
            └─────────────┘
        ┌─────────────────────┐
        │  Integration tests  │      ← moderate, collaboration
        └─────────────────────┘
    ┌─────────────────────────────┐
    │         Unit tests          │  ← many, foundational
    └─────────────────────────────┘
```
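To make the pyramid concrete, it helps to mirror its layers in the test directory. The layout below is an assumption, but it matches the `pytest tests/unit`, `tests/integration`, and `tests/e2e` commands the CI config at the end of this post relies on:
```text
tests/
├── unit/           # many: tools, memory, helpers
├── integration/    # moderate: agents with mocked LLMs
├── e2e/            # few: real LLM calls, marked slow
├── snapshots/      # stored outputs for regression tests
└── mocks.py        # shared mock objects
```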
3. Unit Tests
3.1 Testing Tools
```python
# tests/test_tools.py
import pytest

from src.tools.weather import WeatherTool
from src.tools.calculator import CalculatorTool


@pytest.mark.asyncio
async def test_weather_tool():
    """The weather tool returns city and temperature information."""
    tool = WeatherTool()

    # Happy path
    result = await tool.execute(city="北京")
    assert result.success
    assert "北京" in result.result
    assert "温度" in result.result

    # Parameter validation
    assert tool.validate_parameters(city="上海")
    assert not tool.validate_parameters()  # missing `city`


@pytest.mark.asyncio
async def test_calculator_tool():
    """The calculator tool evaluates expressions and rejects bad input."""
    tool = CalculatorTool()

    # Basic arithmetic
    result = await tool.execute(expression="2+2")
    assert result.success
    assert "4" in result.result

    # Math functions
    result = await tool.execute(expression="sqrt(16)")
    assert result.success
    assert "4" in result.result

    # Invalid expression
    result = await tool.execute(expression="invalid")
    assert not result.success


@pytest.mark.asyncio
async def test_tool_registry():
    """Tools are registered and expose well-formed definitions."""
    from src.tools.registry import tool_registry

    # Registration
    assert tool_registry.get("get_weather") is not None
    assert tool_registry.get("calculate") is not None

    # Definitions
    definitions = tool_registry.get_definitions()
    assert len(definitions) > 0
    assert all("name" in d["function"] for d in definitions)
```
3.2 Testing Memory
```python
# tests/test_memory.py
import pytest

from src.memory.working_memory import WorkingMemory
from src.memory.base import MemoryItem


@pytest.mark.asyncio
async def test_working_memory():
    """Working memory stores, retrieves, and evicts items."""
    memory = WorkingMemory(max_size=5)

    # Add
    item = MemoryItem(content="test content")
    item_id = await memory.add(item)
    assert item_id is not None

    # Get
    retrieved = await memory.get(item_id)
    assert retrieved is not None
    assert retrieved.content == "test content"

    # Capacity limit
    for i in range(10):
        await memory.add(MemoryItem(content=f"item {i}"))
    all_items = memory.get_all()
    assert len(all_items) == 5  # only the 5 most recent should remain


@pytest.mark.asyncio
async def test_memory_search():
    """Memory search returns the most relevant items."""
    memory = WorkingMemory()

    # Add several memories
    await memory.add(MemoryItem(content="Python是一种编程语言"))
    await memory.add(MemoryItem(content="JavaScript也是一种编程语言"))
    await memory.add(MemoryItem(content="今天天气很好"))

    # Search
    results = await memory.search("编程", top_k=2)
    assert len(results) == 2
    assert all("编程" in r.content for r in results)
```
4. Integration Tests
4.1 Agent Integration Tests
```python
# tests/test_agent_integration.py
import pytest
from unittest.mock import AsyncMock, MagicMock

from src.agents.chat_agent import ChatAgent
from src.agents.tool_agent import ToolAgent


@pytest.mark.asyncio
async def test_chat_agent_integration():
    """ChatAgent wires the LLM, state, and message history together."""
    # Mock the LLM
    mock_llm = MagicMock()
    mock_llm.chat = AsyncMock(return_value="这是一个测试回复")
    mock_llm.provider = "openai"

    # Create the agent
    agent = ChatAgent(
        llm_client=mock_llm,
        personality="测试"
    )

    # One conversation turn
    response = await agent.chat("你好")
    assert response == "这是一个测试回复"
    assert len(agent.state.messages) == 3  # system + user + assistant


@pytest.mark.asyncio
async def test_tool_agent_integration():
    """ToolAgent runs to completion when the LLM requests no tools."""
    # Mock the LLM
    mock_llm = MagicMock()
    mock_llm.chat_with_tools = AsyncMock(return_value={
        "content": "让我查询天气",
        "tool_calls": None
    })
    mock_llm.provider = "openai"

    # Create the agent
    agent = ToolAgent(
        llm_client=mock_llm,
        tools=["get_weather"]
    )

    response = await agent.run("今天天气怎么样?")
    assert response is not None
```
4.2 Mocking Strategy
```python
# tests/mocks.py
from typing import Any, Dict, List, Optional
from unittest.mock import MagicMock


class MockLLMClient:
    """A mock LLM client that replays canned responses in order."""

    def __init__(self, responses: Optional[List[str]] = None):
        self.responses = responses or ["默认回复"]
        self.call_count = 0

    async def chat(self, messages: List[Dict[str, str]], **kwargs) -> str:
        """Simulate a chat completion, cycling through the canned responses."""
        response = self.responses[self.call_count % len(self.responses)]
        self.call_count += 1
        return response

    async def chat_with_tools(self, messages, tools, **kwargs) -> Dict[str, Any]:
        """Simulate a response that requests a tool call."""
        # Pitfall: `name` is a reserved MagicMock constructor argument,
        # so the attribute must be assigned after creation.
        function_mock = MagicMock(arguments='{"city": "北京"}')
        function_mock.name = "get_weather"
        return {
            "content": None,
            "tool_calls": [MagicMock(function=function_mock)]
        }


class MockTool:
    """A mock tool that always succeeds."""
    name = "mock_tool"
    description = "Mock tool"
    parameters_schema = {}

    async def execute(self, **kwargs):
        from src.tools.base import ToolResult
        return ToolResult(success=True, result="Mock结果")
```
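Here's a quick sketch of MockLLMClient in an actual test. It assumes, as in earlier installments, that ChatAgent accepts any object exposing a compatible `chat` method:
```python
# tests/test_with_mocks.py
import pytest

from src.agents.chat_agent import ChatAgent
from tests.mocks import MockLLMClient


@pytest.mark.asyncio
async def test_chat_agent_with_scripted_responses():
    # The mock replays responses in order, so multi-turn flows are scriptable
    mock_llm = MockLLMClient(responses=["你好!", "我记得你。"])
    mock_llm.provider = "openai"  # in case the agent inspects this attribute
    agent = ChatAgent(llm_client=mock_llm, personality="测试")

    assert await agent.chat("hi") == "你好!"
    assert await agent.chat("还记得我吗?") == "我记得你。"
    assert mock_llm.call_count == 2
```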
5. End-to-End Tests
5.1 An E2E Test Suite
```python
# tests/e2e/test_e2e.py
import pytest

from src.utils.llm_client import LLMClient
from src.agents.tool_agent import ToolAgent
from src.tools import weather, calculator  # importing registers the tools


class TestE2E:
    """End-to-end tests against a real LLM."""

    @pytest.fixture
    def real_llm(self):
        """A real LLM client."""
        return LLMClient(provider="openai", model="gpt-3.5-turbo")

    @pytest.fixture
    def agent(self, real_llm):
        """A real agent wired to real tools."""
        return ToolAgent(
            llm_client=real_llm,
            tools=["get_weather", "calculate"]
        )

    @pytest.mark.asyncio
    @pytest.mark.slow  # marked as a slow test
    async def test_weather_query(self, agent):
        """Weather query (real LLM)."""
        response = await agent.run("北京今天天气怎么样?")

        # The response should contain weather-related information
        assert response is not None
        assert len(response) > 0

        # Keyword checks are useful, but don't make them too strict
        weather_keywords = ["天气", "温度", "晴", "雨", "云"]
        assert any(kw in response for kw in weather_keywords)

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_calculation(self, agent):
        """Calculation (real LLM)."""
        response = await agent.run("计算123*456等于多少?")
        assert response is not None
        # 123 * 456 = 56088
        assert "56088" in response or "56,088" in response

    @pytest.mark.asyncio
    @pytest.mark.slow
    async def test_multi_turn(self, agent):
        """Multi-turn conversation."""
        # Turn 1
        r1 = await agent.run("我叫张三")
        assert r1 is not None
        # Turn 2
        r2 = await agent.run("我叫什么名字?")
        assert "张三" in r2
```
6. Evaluation Metrics
6.1 An Evaluation Framework
```python
# src/evaluation/metrics.py
from typing import Any, Dict, List, Optional

from pydantic import BaseModel


class EvaluationResult(BaseModel):
    """A single evaluation result."""
    metric_name: str
    score: float
    details: Dict[str, Any] = {}


class AgentEvaluator:
    """LLM-as-judge evaluator for agent responses."""

    def __init__(self, llm_client):
        self.llm = llm_client

    async def evaluate_response(
        self,
        query: str,
        response: str,
        expected: Optional[str] = None,
        criteria: Optional[List[str]] = None
    ) -> List[EvaluationResult]:
        """Evaluate a response on several dimensions."""
        results = []

        # 1. Relevance
        results.append(await self._evaluate_relevance(query, response))

        # 2. Accuracy (only when an expected answer is available)
        if expected:
            results.append(await self._evaluate_accuracy(response, expected))

        # 3. Completeness
        results.append(await self._evaluate_completeness(query, response))

        # 4. Custom criteria
        if criteria:
            for criterion in criteria:
                results.append(await self._evaluate_custom(query, response, criterion))

        return results

    async def _ask_for_score(self, prompt: str) -> float:
        """Send a judging prompt and map the 0-10 score into [0, 1]."""
        score_str = await self.llm.chat([{"role": "user", "content": prompt}])
        try:
            return float(score_str.strip()) / 10
        except ValueError:
            return 0.5  # fall back to a neutral score on unparsable output

    async def _evaluate_relevance(self, query: str, response: str) -> EvaluationResult:
        """Score how relevant the response is to the query."""
        prompt = f"""
Rate how relevant the answer is to the question:

Question: {query}
Answer: {response}

Scoring guide (0-10):
- 10: fully relevant, directly answers the question
- 7: mostly relevant, contains the key information
- 4: partially relevant, some content is off-topic
- 0: completely irrelevant

Output only the score (a number).
"""
        return EvaluationResult(
            metric_name="relevance",
            score=await self._ask_for_score(prompt),
            details={"query": query, "response": response}
        )

    async def _evaluate_accuracy(self, response: str, expected: str) -> EvaluationResult:
        """Score the response against an expected answer."""
        prompt = f"""
Rate the accuracy of the answer:

Expected answer: {expected}
Actual answer: {response}

Scoring guide (0-10):
- 10: fully accurate, core information matches
- 7: mostly accurate, only minor deviations
- 4: partially accurate, clear deviations
- 0: completely wrong

Output only the score (a number).
"""
        return EvaluationResult(
            metric_name="accuracy",
            score=await self._ask_for_score(prompt)
        )

    async def _evaluate_completeness(self, query: str, response: str) -> EvaluationResult:
        """Score how completely the response covers the question."""
        prompt = f"""
Rate the completeness of the answer:

Question: {query}
Answer: {response}

Scoring guide (0-10):
- 10: complete, covers every key point
- 7: mostly complete, missing minor details
- 4: incomplete, missing important information
- 0: does not address the question

Output only the score (a number).
"""
        return EvaluationResult(
            metric_name="completeness",
            score=await self._ask_for_score(prompt)
        )

    async def _evaluate_custom(self, query: str, response: str, criterion: str) -> EvaluationResult:
        """Score the response against a caller-supplied criterion."""
        prompt = f"""
Rate the answer against the following criterion:

Criterion: {criterion}
Question: {query}
Answer: {response}

Score from 0 to 10; output only the score.
"""
        return EvaluationResult(
            metric_name=f"custom_{criterion}",
            score=await self._ask_for_score(prompt)
        )
```
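A minimal sketch of the evaluator in use (the judge client wiring follows the earlier installments; the sample query and response are made up for illustration):
```python
import asyncio

from src.evaluation.metrics import AgentEvaluator
from src.utils.llm_client import LLMClient


async def main():
    judge = LLMClient(provider="openai", model="gpt-3.5-turbo")
    evaluator = AgentEvaluator(judge)
    results = await evaluator.evaluate_response(
        query="北京今天天气怎么样?",
        response="北京今天晴,温度25度。",
        criteria=["politeness"],
    )
    for r in results:
        print(f"{r.metric_name}: {r.score:.2f}")

asyncio.run(main())
```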
6.2 Benchmarking Against a Test Set
```python
# src/evaluation/benchmark.py
from typing import Dict, List, Optional

from pydantic import BaseModel

from .metrics import AgentEvaluator


class TestCase(BaseModel):
    """A benchmark test case."""
    id: str
    query: str
    expected: Optional[str] = None
    criteria: List[str] = []


class Benchmark:
    """Run an agent over a test set and aggregate the scores."""

    def __init__(self, test_cases: List[TestCase]):
        self.test_cases = test_cases

    async def run(self, agent, evaluator: AgentEvaluator) -> Dict:
        """Run the benchmark."""
        results = []
        for case in self.test_cases:
            # Execute
            response = await agent.run(case.query)

            # Evaluate
            eval_results = await evaluator.evaluate_response(
                query=case.query,
                response=response,
                expected=case.expected,
                criteria=case.criteria
            )

            results.append({
                "case_id": case.id,
                "query": case.query,
                "response": response,
                "evaluations": [r.dict() for r in eval_results]
            })

        # Aggregate
        return {
            "total_cases": len(self.test_cases),
            "results": results,
            "summary": self._summarize(results)
        }

    def _summarize(self, results: List[Dict]) -> Dict:
        """Aggregate per-metric statistics."""
        metric_scores: Dict[str, List[float]] = {}
        for result in results:
            for eval_result in result["evaluations"]:
                metric_scores.setdefault(
                    eval_result["metric_name"], []
                ).append(eval_result["score"])

        return {
            metric: {
                "mean": sum(scores) / len(scores),
                "min": min(scores),
                "max": max(scores)
            }
            for metric, scores in metric_scores.items()
        }


# Example test set
test_cases = [
    TestCase(
        id="weather_1",
        query="北京今天天气怎么样?",
        criteria=["accuracy", "completeness of information"]
    ),
    TestCase(
        id="calc_1",
        query="计算123+456",
        expected="579"
    )
]
```
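And a sketch of a benchmark run (assuming `agent` and `evaluator` are constructed as in the previous sections); persisting the full report makes successive runs comparable:
```python
import asyncio
import json


async def run_benchmark(agent, evaluator):
    report = await Benchmark(test_cases).run(agent, evaluator)
    # Persist the full report so runs can be diffed over time
    with open("benchmark_report.json", "w") as f:
        json.dump(report, f, ensure_ascii=False, indent=2, default=str)
    print(report["summary"])

# asyncio.run(run_benchmark(agent, evaluator))
```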
7. Regression Tests
7.1 Snapshot Tests
```python
# tests/test_snapshots.py
import json
import os

import pytest


class SnapshotTester:
    """Compare agent output against stored snapshots."""

    def __init__(self, snapshot_dir: str = "tests/snapshots"):
        self.snapshot_dir = snapshot_dir
        os.makedirs(snapshot_dir, exist_ok=True)

    async def compare_or_create(
        self,
        name: str,
        actual: str,
        update: bool = False
    ) -> bool:
        """Compare against a snapshot, creating it when missing."""
        snapshot_path = os.path.join(self.snapshot_dir, f"{name}.json")

        if update or not os.path.exists(snapshot_path):
            # Create or update the snapshot
            with open(snapshot_path, 'w') as f:
                json.dump({"content": actual}, f, ensure_ascii=False, indent=2)
            return True
        else:
            # Compare against the stored snapshot
            with open(snapshot_path, 'r') as f:
                expected = json.load(f)["content"]
            # Judge similarity with an LLM (not exact string matching)
            # ...
            return True


@pytest.mark.asyncio
async def test_agent_snapshot():
    """Snapshot test."""
    # ...
    pass
```
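The similarity check is elided above; one hedged way to fill it in is to reuse the LLM-as-judge pattern from section 6 (the 0.7 threshold is an arbitrary assumption to tune against your own snapshots):
```python
# One possible implementation of the elided similarity check.
async def semantically_equal(llm, expected: str, actual: str,
                             threshold: float = 0.7) -> bool:
    prompt = f"""
Do these two answers convey the same core information?

Answer A: {expected}
Answer B: {actual}

Output only a score from 0 to 10, where 10 means identical meaning.
"""
    score_str = await llm.chat([{"role": "user", "content": prompt}])
    try:
        return float(score_str.strip()) / 10 >= threshold
    except ValueError:
        return False
```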
8. Continuous Evaluation
8.1 Monitoring Metrics
```python
# src/evaluation/monitoring.py
import statistics
from datetime import datetime
from typing import Any, Dict, List


class AgentMonitor:
    """Collect runtime metrics for an agent."""

    def __init__(self):
        self.metrics: Dict[str, List] = {
            "latency": [],
            "token_usage": [],
            "tool_calls": [],
            "errors": []
        }

    def record(self, metric: str, value: Any):
        """Record one metric sample."""
        if metric not in self.metrics:
            self.metrics[metric] = []
        self.metrics[metric].append({
            "value": value,
            "timestamp": datetime.now()
        })

    def get_stats(self, metric: str) -> Dict:
        """Summary statistics for a numeric metric."""
        if metric not in self.metrics or not self.metrics[metric]:
            return {}
        values = [m["value"] for m in self.metrics[metric]]
        return {
            "count": len(values),
            "mean": statistics.mean(values),
            "median": statistics.median(values),
            "min": min(values),
            "max": max(values),
            "std": statistics.stdev(values) if len(values) > 1 else 0
        }
```
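In practice you would wrap each agent call with the monitor, roughly like this (a sketch; how you count tokens or classify errors depends on your LLMClient):
```python
import time

monitor = AgentMonitor()


async def monitored_run(agent, query: str) -> str:
    """Run the agent while recording latency and error counts."""
    start = time.perf_counter()
    try:
        return await agent.run(query)
    except Exception:
        monitor.record("errors", 1)  # count failures; details could go elsewhere
        raise
    finally:
        monitor.record("latency", time.perf_counter() - start)

# Later, e.g. for a dashboard: monitor.get_stats("latency")
```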
9. Best Practices
9.1 A Testing Checklist
| Test type | Coverage | Frequency |
|---|---|---|
| Unit tests | Tools, memory, utility functions | Every commit |
| Integration tests | Agent collaboration, tool calls | Every PR |
| E2E tests | Complete workflows | Daily |
| Regression tests | Snapshot comparison | Weekly |
| Evaluation tests | Quality metrics | Weekly |
9.2 CI/CD Integration
```yaml
# .github/workflows/test.yml
name: Agent Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest pytest-asyncio
      - name: Run unit tests
        run: pytest tests/unit -v
      - name: Run integration tests
        run: pytest tests/integration -v
      - name: Run E2E tests
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: pytest tests/e2e -v -m slow
```
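For `-m slow` to work without warnings, the `slow` marker has to be registered. A minimal pytest.ini along these lines (the exact options are a suggestion, not from the series):
```ini
# pytest.ini
[pytest]
testpaths = tests
markers =
    slow: slow tests that hit a real LLM (deselect with -m "not slow")
```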
10. Wrapping Up
Key testing takeaways:
- Unit tests: test components in isolation
- Integration tests: test how components work together
- E2E tests: test complete workflows
- Evaluation metrics: quantify answer quality
- Continuous monitoring: track performance over time
Next up
Next installment: deploying your Agent, the complete path from development to production!
— End —