Skills 监控与可观测性
Skill 发布后,如何知道它工作得好不好?是否频繁出错?哪些步骤最慢?
本篇介绍如何为 Skill 建立可观测性,通过日志和指标了解运行状态。
可观测性的三个维度
| 维度 | 回答的问题 | 实现方式 |
|---|---|---|
| 日志(Logs) | 发生了什么?出了什么错? | 结构化 JSON 日志文件 |
| 指标(Metrics) | 执行了多少次?耗时多少?成功率多少? | 从结构化日志文件聚合统计 |
| 执行记录(Trace) | 每次执行经过了哪些步骤?每步耗时? | 步骤级别的时间戳记录 |
结构化日志的输出规范
Skill 脚本应将运行日志写入固定路径的文件,便于事后查阅。
实例
# 文件路径:scripts/skill_logger.py
import json
import os
import sys
from datetime import datetime, timezone
LOG_FILE = "/home/claude/skill_run.log"
def log(level: str, event: str, skill: str = "", **context):
    """Write one structured (JSON) log entry to stderr and to ``LOG_FILE``.

    Args:
        level: Log level (info / warning / error).
        event: Event name, in snake_case.
        skill: Name of the Skill emitting the entry.
        **context: Arbitrary extra fields merged into the entry. They are
            merged last, so a ``ts`` key here replaces the generated
            timestamp. Values JSON cannot encode natively are stored as
            ``str(value)``.

    The entry is always printed to stderr. Writing the file is best-effort:
    a missing parent directory is created, and any remaining ``OSError`` is
    reported on stderr instead of being raised, so that a logging problem
    cannot abort the Skill being observed.
    """
    entry = {
        "ts": datetime.now(timezone.utc).isoformat(),
        "level": level,
        "event": event,
        "skill": skill,
        **context,
    }
    # default=str: a Path/datetime/exception in context must not raise
    # TypeError from inside the logger.
    line = json.dumps(entry, ensure_ascii=False, default=str)

    # stderr gives live visibility; the file gives persistence.
    print(line, file=sys.stderr)

    try:
        log_dir = os.path.dirname(LOG_FILE)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError as exc:
        print(f"skill_logger: cannot write {LOG_FILE}: {exc}", file=sys.stderr)
# 使用示例
if __name__ == "__main__":
    # Demo: emit the typical lifecycle events of one run, then a failure.
    skill_name = "data-analyzer"
    sample_events = [
        ("info", "skill_start",
         {"file": "/mnt/user-data/uploads/runoob.csv"}),
        ("info", "step_complete",
         {"step": "clean", "rows_removed": 3, "elapsed_ms": 240}),
        ("info", "skill_complete",
         {"output": "/mnt/user-data/outputs/report.xlsx", "elapsed_ms": 1830}),
        ("error", "step_failed",
         {"step": "generate_chart", "error": "openpyxl not found"}),
    ]
    for level, event, fields in sample_events:
        log(level, event, skill=skill_name, **fields)
import json
import os
import sys
from datetime import datetime, timezone
LOG_FILE = "/home/claude/skill_run.log"
def log(level: str, event: str, skill: str = "", **context):
    """Write one structured (JSON) log entry to stderr and to ``LOG_FILE``.

    Args:
        level: Log level (info / warning / error).
        event: Event name, in snake_case.
        skill: Name of the Skill emitting the entry.
        **context: Arbitrary extra fields merged into the entry. They are
            merged last, so a ``ts`` key here replaces the generated
            timestamp. Values JSON cannot encode natively are stored as
            ``str(value)``.

    The entry is always printed to stderr. Writing the file is best-effort:
    a missing parent directory is created, and any remaining ``OSError`` is
    reported on stderr instead of being raised, so that a logging problem
    cannot abort the Skill being observed.
    """
    entry = {
        "ts": datetime.now(timezone.utc).isoformat(),
        "level": level,
        "event": event,
        "skill": skill,
        **context,
    }
    # default=str: a Path/datetime/exception in context must not raise
    # TypeError from inside the logger.
    line = json.dumps(entry, ensure_ascii=False, default=str)

    # stderr gives live visibility; the file gives persistence.
    print(line, file=sys.stderr)

    try:
        log_dir = os.path.dirname(LOG_FILE)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError as exc:
        print(f"skill_logger: cannot write {LOG_FILE}: {exc}", file=sys.stderr)
# 使用示例
if __name__ == "__main__":
    # Demo: emit the typical lifecycle events of one run, then a failure.
    skill_name = "data-analyzer"
    sample_events = [
        ("info", "skill_start",
         {"file": "/mnt/user-data/uploads/runoob.csv"}),
        ("info", "step_complete",
         {"step": "clean", "rows_removed": 3, "elapsed_ms": 240}),
        ("info", "skill_complete",
         {"output": "/mnt/user-data/outputs/report.xlsx", "elapsed_ms": 1830}),
        ("error", "step_failed",
         {"step": "generate_chart", "error": "openpyxl not found"}),
    ]
    for level, event, fields in sample_events:
        log(level, event, skill=skill_name, **fields)
{"ts": "2026-05-18T10:23:05+00:00", "level": "info", "event": "skill_start", "skill": "data-analyzer", "file": "/mnt/user-data/uploads/runoob.csv"}
{"ts": "2026-05-18T10:23:05+00:00", "level": "info", "event": "step_complete", "skill": "data-analyzer", "step": "clean", "rows_removed": 3, "elapsed_ms": 240}
{"ts": "2026-05-18T10:23:07+00:00", "level": "info", "event": "skill_complete", "skill": "data-analyzer", "output": "/mnt/user-data/outputs/report.xlsx", "elapsed_ms": 1830}
{"ts": "2026-05-18T10:23:07+00:00", "level": "error", "event": "step_failed", "skill": "data-analyzer", "step": "generate_chart", "error": "openpyxl not found"}
指标聚合:统计执行次数与成功率
从日志文件中提取关键指标,了解 Skill 的整体运行状况。
实例
# 文件路径:scripts/metrics_report.py
# 从日志文件统计 Skill 运行指标
import json
import os
from collections import defaultdict
LOG_FILE = "/home/claude/skill_run.log"
def generate_metrics_report(log_file: str) -> dict:
    """Summarise a JSON-lines Skill log into run, error and latency metrics.

    Args:
        log_file: Path to a log file holding one JSON object per line.

    Returns:
        ``{"error": ...}`` when ``log_file`` does not exist. Otherwise a dict
        with:

        * ``total_runs``: number of ``skill_start`` events.
        * ``total_errors``: number of entries whose level is ``error``.
        * ``success_rate``: percentage string; error entries are used as a
          stand-in for failed runs, and the value never drops below 0.
        * ``avg_elapsed_ms``: rounded mean of the numeric ``elapsed_ms``
          values on ``skill_complete`` events, or 0 when there are none.
        * ``recent_errors``: the last five error entries.
    """
    if not os.path.exists(log_file):
        return {"error": f"日志文件不存在:{log_file}"}

    counts = defaultdict(int)  # occurrences per event name
    errors = []                # error-level entries, in file order
    elapsed = []               # whole-run durations in ms

    with open(log_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line can be valid JSON without being an object (e.g. "[1]"
            # or "42"); such lines are not log entries and have no .get().
            if not isinstance(entry, dict):
                continue

            event = entry.get("event", "")
            counts[event] += 1

            if entry.get("level") == "error":
                errors.append({
                    "ts": entry.get("ts"),
                    "event": event,
                    "error": entry.get("error", ""),
                    "step": entry.get("step", ""),
                })

            if event == "skill_complete":
                duration = entry.get("elapsed_ms")
                # Only real numbers can be averaged; bool is excluded
                # because it is a subclass of int.
                if isinstance(duration, (int, float)) and not isinstance(duration, bool):
                    elapsed.append(duration)

    total_runs = counts.get("skill_start", 0)
    total_errors = len(errors)
    if total_runs > 0:
        # One run can log several errors, so the difference may go negative;
        # clamp it so the rate stays within 0-100%.
        succeeded = max(total_runs - total_errors, 0)
        success_rate = round(succeeded / total_runs * 100, 1)
    else:
        success_rate = 0

    return {
        "total_runs": total_runs,
        "total_errors": total_errors,
        "success_rate": f"{success_rate}%",
        "avg_elapsed_ms": round(sum(elapsed) / len(elapsed)) if elapsed else 0,
        "recent_errors": errors[-5:],  # five most recent errors
    }
if __name__ == "__main__":
    # Print the metrics summary for the default log file as indented JSON.
    summary = generate_metrics_report(LOG_FILE)
    rendered = json.dumps(summary, ensure_ascii=False, indent=2)
    print(rendered)
# 从日志文件统计 Skill 运行指标
import json
import os
from collections import defaultdict
LOG_FILE = "/home/claude/skill_run.log"
def generate_metrics_report(log_file: str) -> dict:
    """Summarise a JSON-lines Skill log into run, error and latency metrics.

    Args:
        log_file: Path to a log file holding one JSON object per line.

    Returns:
        ``{"error": ...}`` when ``log_file`` does not exist. Otherwise a dict
        with:

        * ``total_runs``: number of ``skill_start`` events.
        * ``total_errors``: number of entries whose level is ``error``.
        * ``success_rate``: percentage string; error entries are used as a
          stand-in for failed runs, and the value never drops below 0.
        * ``avg_elapsed_ms``: rounded mean of the numeric ``elapsed_ms``
          values on ``skill_complete`` events, or 0 when there are none.
        * ``recent_errors``: the last five error entries.
    """
    if not os.path.exists(log_file):
        return {"error": f"日志文件不存在:{log_file}"}

    counts = defaultdict(int)  # occurrences per event name
    errors = []                # error-level entries, in file order
    elapsed = []               # whole-run durations in ms

    with open(log_file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A line can be valid JSON without being an object (e.g. "[1]"
            # or "42"); such lines are not log entries and have no .get().
            if not isinstance(entry, dict):
                continue

            event = entry.get("event", "")
            counts[event] += 1

            if entry.get("level") == "error":
                errors.append({
                    "ts": entry.get("ts"),
                    "event": event,
                    "error": entry.get("error", ""),
                    "step": entry.get("step", ""),
                })

            if event == "skill_complete":
                duration = entry.get("elapsed_ms")
                # Only real numbers can be averaged; bool is excluded
                # because it is a subclass of int.
                if isinstance(duration, (int, float)) and not isinstance(duration, bool):
                    elapsed.append(duration)

    total_runs = counts.get("skill_start", 0)
    total_errors = len(errors)
    if total_runs > 0:
        # One run can log several errors, so the difference may go negative;
        # clamp it so the rate stays within 0-100%.
        succeeded = max(total_runs - total_errors, 0)
        success_rate = round(succeeded / total_runs * 100, 1)
    else:
        success_rate = 0

    return {
        "total_runs": total_runs,
        "total_errors": total_errors,
        "success_rate": f"{success_rate}%",
        "avg_elapsed_ms": round(sum(elapsed) / len(elapsed)) if elapsed else 0,
        "recent_errors": errors[-5:],  # five most recent errors
    }
if __name__ == "__main__":
    # Print the metrics summary for the default log file as indented JSON.
    summary = generate_metrics_report(LOG_FILE)
    rendered = json.dumps(summary, ensure_ascii=False, indent=2)
    print(rendered)
{
"total_runs": 42,
"total_errors": 3,
"success_rate": "92.9%",
"avg_elapsed_ms": 1650,
"recent_errors": [
{"ts": "2026-05-18T09:12:00+00:00", "event": "step_failed", "step": "generate_chart", "error": "openpyxl not found"}
]
}
执行追踪:记录每步耗时
当某次执行特别慢时,通过步骤级别的耗时记录可以快速定位瓶颈。
实例
# 文件路径:scripts/tracer.py
import time
import json
import sys
from skill_logger import log
class ExecutionTracer:
    """Record how long each step of a single Skill execution takes.

    Call ``step(name)`` at the start of every step and ``finish()`` once at
    the end. Each entry in ``steps`` ends up as
    ``{"name": ..., "elapsed_ms": ...}``; while a step is still running its
    entry carries a private ``"_start"`` timestamp instead of ``elapsed_ms``.
    """

    def __init__(self, skill_name: str):
        """Start the overall timer for the Skill called ``skill_name``."""
        self.skill_name = skill_name
        self.steps = []
        self.start_time = time.perf_counter()

    def _close_open_step(self, now: float) -> None:
        """Stamp ``elapsed_ms`` on the last step if it is still running.

        Does nothing when there are no steps or the last one was already
        closed, so ``finish()`` can be called twice and ``step()`` can be
        called after ``finish()`` without raising ``KeyError``.
        """
        if not self.steps:
            return
        last = self.steps[-1]
        started = last.pop("_start", None)
        if started is not None:
            last["elapsed_ms"] = int((now - started) * 1000)

    def step(self, step_name: str):
        """Begin a new step; the previous step's duration is recorded."""
        now = time.perf_counter()
        self._close_open_step(now)
        self.steps.append({"name": step_name, "_start": now})
        log("info", "step_start", skill=self.skill_name, step=step_name)

    def finish(self):
        """Close the last step, log the full trace and return it.

        Returns:
            ``{"skill": ..., "total_ms": ..., "steps": [...]}`` where
            ``steps`` is this tracer's own ``steps`` list (not a copy).
        """
        now = time.perf_counter()
        self._close_open_step(now)
        total_ms = int((now - self.start_time) * 1000)
        report = {"skill": self.skill_name,
                  "total_ms": total_ms, "steps": self.steps}
        log("info", "trace_complete", skill=self.skill_name,
            total_ms=total_ms, steps=self.steps)
        return report
# 使用示例
if __name__ == "__main__":
    # Demo: three steps with simulated work, then print the trace report.
    demo = ExecutionTracer("data-analyzer")
    simulated_steps = (("读取文件", 0.3), ("数据清洗", 0.1), ("生成报告", 0.8))
    for label, pause_s in simulated_steps:
        demo.step(label)
        time.sleep(pause_s)  # stand-in for the real work of this step
    trace = demo.finish()
    print(json.dumps(trace, ensure_ascii=False, indent=2))
import time
import json
import sys
from skill_logger import log
class ExecutionTracer:
    """Record how long each step of a single Skill execution takes.

    Call ``step(name)`` at the start of every step and ``finish()`` once at
    the end. Each entry in ``steps`` ends up as
    ``{"name": ..., "elapsed_ms": ...}``; while a step is still running its
    entry carries a private ``"_start"`` timestamp instead of ``elapsed_ms``.
    """

    def __init__(self, skill_name: str):
        """Start the overall timer for the Skill called ``skill_name``."""
        self.skill_name = skill_name
        self.steps = []
        self.start_time = time.perf_counter()

    def _close_open_step(self, now: float) -> None:
        """Stamp ``elapsed_ms`` on the last step if it is still running.

        Does nothing when there are no steps or the last one was already
        closed, so ``finish()`` can be called twice and ``step()`` can be
        called after ``finish()`` without raising ``KeyError``.
        """
        if not self.steps:
            return
        last = self.steps[-1]
        started = last.pop("_start", None)
        if started is not None:
            last["elapsed_ms"] = int((now - started) * 1000)

    def step(self, step_name: str):
        """Begin a new step; the previous step's duration is recorded."""
        now = time.perf_counter()
        self._close_open_step(now)
        self.steps.append({"name": step_name, "_start": now})
        log("info", "step_start", skill=self.skill_name, step=step_name)

    def finish(self):
        """Close the last step, log the full trace and return it.

        Returns:
            ``{"skill": ..., "total_ms": ..., "steps": [...]}`` where
            ``steps`` is this tracer's own ``steps`` list (not a copy).
        """
        now = time.perf_counter()
        self._close_open_step(now)
        total_ms = int((now - self.start_time) * 1000)
        report = {"skill": self.skill_name,
                  "total_ms": total_ms, "steps": self.steps}
        log("info", "trace_complete", skill=self.skill_name,
            total_ms=total_ms, steps=self.steps)
        return report
# 使用示例
if __name__ == "__main__":
    # Demo: three steps with simulated work, then print the trace report.
    demo = ExecutionTracer("data-analyzer")
    simulated_steps = (("读取文件", 0.3), ("数据清洗", 0.1), ("生成报告", 0.8))
    for label, pause_s in simulated_steps:
        demo.step(label)
        time.sleep(pause_s)  # stand-in for the real work of this step
    trace = demo.finish()
    print(json.dumps(trace, ensure_ascii=False, indent=2))
{
"skill": "data-analyzer",
"total_ms": 1203,
"steps": [
{"name": "读取文件", "elapsed_ms": 312},
{"name": "数据清洗", "elapsed_ms": 108},
{"name": "生成报告", "elapsed_ms": 783}
]
}
可观测性不是只有大规模系统才需要的。即使是个人使用的 Skill,在出现问题时,一份清晰的日志文件往往能将排查时间从几小时缩短到几分钟。
