现在位置: 首页 > Skills 教程 > 正文

Skills 监控与可观测性

Skill 发布后,如何知道它工作得好不好?是否频繁出错?哪些步骤最慢?

本篇介绍如何为 Skill 建立可观测性,通过日志和指标了解运行状态。


可观测性的三个维度

维度 | 回答的问题 | 实现方式
日志(Logs) | 发生了什么?出了什么错? | 结构化 JSON 日志文件
指标(Metrics) | 执行了多少次?耗时多少?成功率多少? | 追加写入指标文件
执行记录(Trace) | 每次执行经过了哪些步骤?每步耗时? | 步骤级别的时间戳记录

结构化日志的输出规范

Skill 脚本应将运行日志写入固定路径的文件,便于事后查阅。

实例

# 文件路径:scripts/skill_logger.py
import json
import os
import sys
from datetime import datetime, timezone

LOG_FILE = "/home/claude/skill_run.log"

def log(level: str, event: str, skill: str = "", **context):
    """
    Write one structured log entry as a single JSON line.

    The line is echoed to stderr (visible while the skill runs) and appended
    to ``LOG_FILE`` (kept for later inspection).

    Args:
        level:   Log level (info / warning / error).
        event:   Event name (snake_case).
        skill:   Skill name.
        context: Arbitrary extra fields merged into the entry after the
                 base fields.

    Raises:
        OSError: If the log directory cannot be created or the file cannot
                 be opened for appending.
    """
    entry = {
        "ts":    datetime.now(timezone.utc).isoformat(),
        "level": level,
        "event": event,
        "skill": skill,
        **context
    }
    # default=str is deliberate: a logger must not abort the skill because a
    # caller passed a Path, an exception object or a datetime as context.
    # Such values are recorded as their str() form instead of raising TypeError.
    line = json.dumps(entry, ensure_ascii=False, default=str)

    print(line, file=sys.stderr)

    # Create the log directory on first use so a fresh environment does not
    # fail with FileNotFoundError on the very first entry.
    log_dir = os.path.dirname(LOG_FILE)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")

# Usage example: emit one entry for each event of a sample run.
if __name__ == "__main__":
    demo_skill = "data-analyzer"
    # (level, event, extra context fields) in the order they are logged.
    demo_entries = [
        ("info", "skill_start",
         {"file": "/mnt/user-data/uploads/runoob.csv"}),
        ("info", "step_complete",
         {"step": "clean", "rows_removed": 3, "elapsed_ms": 240}),
        ("info", "skill_complete",
         {"output": "/mnt/user-data/outputs/report.xlsx", "elapsed_ms": 1830}),
        ("error", "step_failed",
         {"step": "generate_chart", "error": "openpyxl not found"}),
    ]
    for lvl, evt, fields in demo_entries:
        log(lvl, evt, skill=demo_skill, **fields)
运行后输出类似以下内容(每行一条 JSON 记录):

{"ts": "2026-05-18T10:23:05+00:00", "level": "info",  "event": "skill_start",    "skill": "data-analyzer", "file": "/mnt/user-data/uploads/runoob.csv"}
{"ts": "2026-05-18T10:23:05+00:00", "level": "info",  "event": "step_complete",  "skill": "data-analyzer", "step": "clean", "rows_removed": 3, "elapsed_ms": 240}
{"ts": "2026-05-18T10:23:07+00:00", "level": "info",  "event": "skill_complete", "skill": "data-analyzer", "output": "/mnt/user-data/outputs/report.xlsx", "elapsed_ms": 1830}
{"ts": "2026-05-18T10:23:07+00:00", "level": "error", "event": "step_failed",    "skill": "data-analyzer", "step": "generate_chart", "error": "openpyxl not found"}

指标聚合:统计执行次数与成功率

从日志文件中提取关键指标,了解 Skill 的整体运行状况。

实例

# 文件路径:scripts/metrics_report.py
# 从日志文件统计 Skill 运行指标

import json
import os
from collections import defaultdict

# Log file that is summarised when this module is run as a script.
LOG_FILE = "/home/claude/skill_run.log"

def generate_metrics_report(log_file: str) -> dict:
    """Read a JSON-lines log file and summarise run metrics.

    Blank lines, lines that are not valid JSON and JSON values that are not
    objects are skipped, so a partially corrupt log still yields a report.

    Args:
        log_file: Path of the log file, one JSON object per line.

    Returns:
        ``{"error": ...}`` when the file does not exist. Otherwise a dict with:
        ``total_runs`` (count of ``skill_start`` events), ``total_errors``
        (count of entries whose level is ``error``), ``success_rate``
        (percentage string), ``avg_elapsed_ms`` (rounded mean of numeric
        ``elapsed_ms`` values on ``skill_complete`` events, 0 if none) and
        ``recent_errors`` (the last five error records).
    """
    # EAFP: open directly rather than exists()-then-open, which can race
    # with the file being removed in between.
    try:
        handle = open(log_file, encoding="utf-8")
    except FileNotFoundError:
        return {"error": f"日志文件不存在:{log_file}"}

    counts  = defaultdict(int)        # occurrences per event name
    errors  = []                       # error-level records, in file order
    elapsed = []                       # skill_complete durations (ms)

    with handle:
        for raw in handle:
            raw = raw.strip()
            if not raw:
                continue
            try:
                entry = json.loads(raw)
            except json.JSONDecodeError:
                continue
            # A line such as `123` or `[1, 2]` is valid JSON but not a log
            # record; calling .get() on it would raise AttributeError.
            if not isinstance(entry, dict):
                continue

            event = entry.get("event", "")
            counts[event] += 1

            if entry.get("level") == "error":
                errors.append({
                    "ts":    entry.get("ts"),
                    "event": event,
                    "error": entry.get("error", ""),
                    "step":  entry.get("step", "")
                })

            if event == "skill_complete":
                duration = entry.get("elapsed_ms")
                # Only real numbers can be averaged; bool is excluded because
                # it is an int subclass.
                if isinstance(duration, (int, float)) and not isinstance(duration, bool):
                    elapsed.append(duration)

    total_runs   = counts.get("skill_start", 0)
    total_errors = len(errors)
    if total_runs > 0:
        # One run can log several errors, so cap the failures at the number
        # of runs; otherwise the rate would drop below 0%.
        failed_runs  = min(total_errors, total_runs)
        success_rate = round((total_runs - failed_runs) / total_runs * 100, 1)
    else:
        success_rate = 0

    return {
        "total_runs":    total_runs,
        "total_errors":  total_errors,
        "success_rate":  f"{success_rate}%",
        "avg_elapsed_ms": round(sum(elapsed) / len(elapsed)) if elapsed else 0,
        "recent_errors": errors[-5:]   # five most recent errors
    }

if __name__ == "__main__":
    # Summarise the default log file and pretty-print the result as JSON.
    print(json.dumps(generate_metrics_report(LOG_FILE),
                     ensure_ascii=False, indent=2))
输出结果示例:

{
  "total_runs":     42,
  "total_errors":   3,
  "success_rate":   "92.9%",
  "avg_elapsed_ms": 1650,
  "recent_errors": [
    {"ts": "2026-05-18T09:12:00+00:00", "event": "step_failed", "step": "generate_chart", "error": "openpyxl not found"}
  ]
}

执行追踪:记录每步耗时

当某次执行特别慢时,通过步骤级别的耗时记录可以快速定位瓶颈。

实例

# 文件路径:scripts/tracer.py
import time
import json
import sys
from skill_logger import log

class ExecutionTracer:
    """Record how long each step of a single Skill execution takes.

    Call ``step(name)`` at the start of every step and ``finish()`` once at
    the end. Durations are measured with ``time.perf_counter`` and truncated
    to whole milliseconds.
    """

    def __init__(self, skill_name: str):
        """Start the overall timer for one execution of ``skill_name``."""
        self.skill_name = skill_name
        self.steps      = []
        self.start_time = time.perf_counter()

    def _close_open_step(self, now: float) -> None:
        """Stamp ``elapsed_ms`` on the latest step if it is still running."""
        if not self.steps:
            return
        current = self.steps[-1]
        # pop() with a default makes this a no-op for an already closed step,
        # so finish() twice or step() after finish() no longer raises
        # KeyError on the missing "_start" key.
        started = current.pop("_start", None)
        if started is not None:
            current["elapsed_ms"] = int((now - started) * 1000)

    def step(self, step_name: str):
        """Mark the start of a step, closing the previous one if still open."""
        now = time.perf_counter()
        self._close_open_step(now)

        self.steps.append({"name": step_name, "_start": now})
        log("info", "step_start", skill=self.skill_name, step=step_name)

    def finish(self):
        """Close the last step, log the trace and return the report.

        Returns:
            A dict with ``skill``, ``total_ms`` (time since construction) and
            ``steps`` (a list of ``{"name", "elapsed_ms"}`` dicts).
        """
        now = time.perf_counter()
        self._close_open_step(now)

        total_ms = int((now - self.start_time) * 1000)
        # Hand out copies so later step() calls cannot change a report that
        # has already been returned or logged.
        steps_snapshot = [dict(item) for item in self.steps]
        report = {"skill": self.skill_name,
                  "total_ms": total_ms, "steps": steps_snapshot}

        log("info", "trace_complete", skill=self.skill_name,
            total_ms=total_ms, steps=steps_snapshot)
        return report

# Usage example: trace three simulated steps and print the report.
if __name__ == "__main__":
    tracer = ExecutionTracer("data-analyzer")

    # (step name, simulated work duration in seconds)
    simulated_steps = (("读取文件", 0.3), ("数据清洗", 0.1), ("生成报告", 0.8))
    for label, seconds in simulated_steps:
        tracer.step(label)
        time.sleep(seconds)

    report = tracer.finish()
    print(json.dumps(report, ensure_ascii=False, indent=2))
输出结果示例:

{
  "skill": "data-analyzer",
  "total_ms": 1203,
  "steps": [
    {"name": "读取文件",  "elapsed_ms": 312},
    {"name": "数据清洗",  "elapsed_ms": 108},
    {"name": "生成报告",  "elapsed_ms": 783}
  ]
}

可观测性不是只有大规模系统才需要的。即使是个人使用的 Skill,在出现问题时,一份清晰的日志文件往往能将排查时间从几小时缩短到几分钟。