Skills 单元测试
写完 Skill 后,如何确认它真的按预期工作?靠手动测试既费时又容易遗漏。
本篇介绍如何为 Skill 建立系统性的测试用例,从脚本单元测试到端到端触发测试。
Skill 测试的两个层次
| 层次 | 测试对象 | 方法 |
|---|---|---|
| 脚本单元测试 | scripts/ 下的 Python/JS 脚本 | pytest / 标准 unittest |
| 触发端到端测试 | Skill 是否被正确触发、输出是否符合预期 | skill-creator 的 eval 框架 |
层次一:脚本单元测试
脚本是 Skill 中确定性最强的部分,适合用标准单元测试框架覆盖。
实例
# 文件路径:scripts/tests/test_clean_data.py
import pytest
import os
import tempfile
import csv
# 被测试的模块
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from clean_data import remove_duplicates, fill_nulls, strip_whitespace
# ── 测试:去重 ──────────────────────────────────────────
def test_remove_duplicates_basic():
    """Check that an exact duplicate row is dropped from the input.

    Three rows go in, two of which are identical; two rows are expected back.
    """
    sample_rows = [
        {"name": "runoob", "score": "90"},
        {"name": "runoob", "score": "90"},  # exact copy of the first row
        {"name": "RUNOOB", "score": "80"},  # differs in both case and score
    ]
    deduplicated = remove_duplicates(sample_rows)
    remaining = len(deduplicated)
    assert remaining == 2, "应删除 1 条重复行"
def test_remove_duplicates_empty():
    """Check that an empty list is accepted and comes back as an empty list."""
    assert remove_duplicates([]) == []
# ── 测试:空值填充 ──────────────────────────────────────
def test_fill_nulls_numeric():
    """Check that an empty cell in the target column is replaced by "0"."""
    sample_rows = [{"value": cell} for cell in ("10", "", "20")]
    filled = fill_nulls(sample_rows, col="value", fill_with="0")
    blank_row = filled[1]  # the row whose value started out empty
    assert blank_row["value"] == "0"
# ── 测试:去空格 ──────────────────────────────────────
def test_strip_whitespace():
    """Check that surrounding spaces are removed and clean values are kept."""
    sample_rows = [{"name": " runoob "}, {"name": "RUNOOB"}]
    stripped = strip_whitespace(sample_rows, col="name")
    padded_row = stripped[0]  # started with leading and trailing spaces
    assert padded_row["name"] == "runoob"
    clean_row = stripped[1]  # had no surrounding spaces to begin with
    assert clean_row["name"] == "RUNOOB"
# ── 集成测试:读取真实文件 ──────────────────────────────
def test_process_real_file():
    """Run process_file against a temporary CSV and check its summary counts.

    The fixture holds one duplicated row and one empty ``score`` cell; the
    test expects ``removed_rows`` and ``fixed_values`` to both equal 1.
    """
    fixture_rows = [
        {"name": " runoob ", "score": "90"},
        {"name": " runoob ", "score": "90"},  # duplicate of the row above
        {"name": "RUNOOB", "score": ""},  # empty score cell
    ]
    # delete=False keeps the file on disk once the handle is closed so that
    # process_file can be given its path; the finally clause removes it.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline=""
    ) as handle:
        csv_path = handle.name
        sheet = csv.DictWriter(handle, fieldnames=["name", "score"])
        sheet.writeheader()
        sheet.writerows(fixture_rows)

    try:
        from clean_data import process_file

        summary = process_file(csv_path)
        assert summary["removed_rows"] == 1
        assert summary["fixed_values"] == 1
    finally:
        os.unlink(csv_path)
import pytest
import os
import tempfile
import csv
# 被测试的模块
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from clean_data import remove_duplicates, fill_nulls, strip_whitespace
# ── 测试:去重 ──────────────────────────────────────────
def test_remove_duplicates_basic():
    """Check that an exact duplicate row is dropped from the input.

    Three rows go in, two of which are identical; two rows are expected back.
    """
    sample_rows = [
        {"name": "runoob", "score": "90"},
        {"name": "runoob", "score": "90"},  # exact copy of the first row
        {"name": "RUNOOB", "score": "80"},  # differs in both case and score
    ]
    deduplicated = remove_duplicates(sample_rows)
    remaining = len(deduplicated)
    assert remaining == 2, "应删除 1 条重复行"
def test_remove_duplicates_empty():
    """Check that an empty list is accepted and comes back as an empty list."""
    assert remove_duplicates([]) == []
# ── 测试:空值填充 ──────────────────────────────────────
def test_fill_nulls_numeric():
    """Check that an empty cell in the target column is replaced by "0"."""
    sample_rows = [{"value": cell} for cell in ("10", "", "20")]
    filled = fill_nulls(sample_rows, col="value", fill_with="0")
    blank_row = filled[1]  # the row whose value started out empty
    assert blank_row["value"] == "0"
# ── 测试:去空格 ──────────────────────────────────────
def test_strip_whitespace():
    """Check that surrounding spaces are removed and clean values are kept."""
    sample_rows = [{"name": " runoob "}, {"name": "RUNOOB"}]
    stripped = strip_whitespace(sample_rows, col="name")
    padded_row = stripped[0]  # started with leading and trailing spaces
    assert padded_row["name"] == "runoob"
    clean_row = stripped[1]  # had no surrounding spaces to begin with
    assert clean_row["name"] == "RUNOOB"
# ── 集成测试:读取真实文件 ──────────────────────────────
def test_process_real_file():
    """Run process_file against a temporary CSV and check its summary counts.

    The fixture holds one duplicated row and one empty ``score`` cell; the
    test expects ``removed_rows`` and ``fixed_values`` to both equal 1.
    """
    fixture_rows = [
        {"name": " runoob ", "score": "90"},
        {"name": " runoob ", "score": "90"},  # duplicate of the row above
        {"name": "RUNOOB", "score": ""},  # empty score cell
    ]
    # delete=False keeps the file on disk once the handle is closed so that
    # process_file can be given its path; the finally clause removes it.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline=""
    ) as handle:
        csv_path = handle.name
        sheet = csv.DictWriter(handle, fieldnames=["name", "score"])
        sheet.writeheader()
        sheet.writerows(fixture_rows)

    try:
        from clean_data import process_file

        summary = process_file(csv_path)
        assert summary["removed_rows"] == 1
        assert summary["fixed_values"] == 1
    finally:
        os.unlink(csv_path)
运行测试:
# 进入 Skill 目录后运行所有测试
cd my-skill/
pytest scripts/tests/ -v
# 生成覆盖率报告
pytest scripts/tests/ --cov=scripts --cov-report=term-missing
输出:
scripts/tests/test_clean_data.py::test_remove_duplicates_basic PASSED
scripts/tests/test_clean_data.py::test_remove_duplicates_empty PASSED
scripts/tests/test_clean_data.py::test_fill_nulls_numeric PASSED
scripts/tests/test_clean_data.py::test_strip_whitespace PASSED
scripts/tests/test_clean_data.py::test_process_real_file PASSED
5 passed in 0.12s
层次二:触发端到端测试
触发测试验证的是:当用户发出某个请求时,Claude 是否会使用这个 Skill。
测试用例以 JSON 格式定义,包含用户提示词和期望的行为断言。
实例
[
{
"id": "trigger_basic",
"prompt": "帮我分析这份销售数据 CSV,找出月度趋势并生成统计摘要",
"assertions": [
{
"type": "skill_triggered",
"skill": "csv-analyzer",
"description": "复杂分析请求应触发 csv-analyzer"
}
]
},
{
"id": "trigger_with_file",
"prompt": "我上传了 runoob_sales.csv,请分析各列的数据分布",
"assertions": [
{
"type": "skill_triggered",
"skill": "csv-analyzer"
},
{
"type": "output_contains",
"keyword": "统计",
"description": "输出中应包含统计信息"
}
]
},
{
"id": "no_trigger_simple",
"prompt": "CSV 是什么格式?",
"assertions": [
{
"type": "skill_not_triggered",
"skill": "csv-analyzer",
"description": "简单知识问题不应触发 Skill"
}
]
}
]
[
{
"id": "trigger_basic",
"prompt": "帮我分析这份销售数据 CSV,找出月度趋势并生成统计摘要",
"assertions": [
{
"type": "skill_triggered",
"skill": "csv-analyzer",
"description": "复杂分析请求应触发 csv-analyzer"
}
]
},
{
"id": "trigger_with_file",
"prompt": "我上传了 runoob_sales.csv,请分析各列的数据分布",
"assertions": [
{
"type": "skill_triggered",
"skill": "csv-analyzer"
},
{
"type": "output_contains",
"keyword": "统计",
"description": "输出中应包含统计信息"
}
]
},
{
"id": "no_trigger_simple",
"prompt": "CSV 是什么格式?",
"assertions": [
{
"type": "skill_not_triggered",
"skill": "csv-analyzer",
"description": "简单知识问题不应触发 Skill"
}
]
}
]
运行触发测试(需要 skill-creator 提供的 eval 工具):
实例
# Run the trigger eval set (evals/trigger-eval.json) against the
# csv-analyzer skill using the specified model
python -m scripts.run_eval \
--eval-set evals/trigger-eval.json \
--skill-path csv-analyzer/ \
--model claude-sonnet-4-20250514
# Generate an HTML report from evals/results/ for manual review
python eval-viewer/generate_review.py \
--results evals/results/ \
--output evals/review.html
# Run the trigger eval set (evals/trigger-eval.json) against the
# csv-analyzer skill using the specified model
python -m scripts.run_eval \
--eval-set evals/trigger-eval.json \
--skill-path csv-analyzer/ \
--model claude-sonnet-4-20250514
# Generate an HTML report from evals/results/ for manual review
python eval-viewer/generate_review.py \
--results evals/results/ \
--output evals/review.html
测试用例的提示词要足够复杂,简单的"读取 CSV"类请求即使触发失败也不代表 Skill 有问题,因为 Claude 本身就能处理。好的测试用例应当是用户真实会提的多步骤、有明确输出要求的请求。
测试用例的覆盖范围
| 类型 | 要覆盖的场景 |
|---|---|
| 正向触发 | 复杂任务、包含关键词、明确输出格式要求 |
| 负向触发 | 简单问答、只是提及相关词但不需要 Skill |
| 边界输入 | 空文件、超大文件、格式不支持的文件 |
| 错误恢复 | 文件路径错误时的提示是否清晰 |
