Clarify cleaner usage evidence boundaries

The cleanup helper now streams local logs, reports which evidence sources were merged, and keeps README table coverage tied to the central skill-name fixture so the documented cleanup signal stays trustworthy for large local histories and mixed imported counts.

Constraint: Follow-up addresses PR #178 review comments without changing the non-destructive recommendation model.
Rejected: Filtering imported usage JSON by --days inside the helper | imported counts are already aggregated and lack per-record timestamps.
Confidence: high
Scope-risk: narrow
Directive: Keep --usage-json documented as pre-windowed unless the input schema gains timestamped per-record events.
Tested: PYTHONPATH=scripts python3 -m unittest scripts.test_k_skill_cleaner
Tested: node --test scripts/skill-docs.test.js
Tested: npm run lint
Tested: npm run typecheck && npm test
Tested: npm run ci
2026-06-24 02:04:11 +00:00 · 2026-04-28 18:08:17 +09:00 · 2026-04-28 18:08:17 +09:00 · 0b280839d6
commit 0b280839d6
parent 1935e641a6
5 changed files with 87 additions and 12 deletions
--- a/docs/features/k-skill-cleaner.md
+++ b/docs/features/k-skill-cleaner.md
@@ -31,4 +31,4 @@ python3 scripts/k_skill_cleaner.py \
  --keep k-skill-setup,k-skill-cleaner
 ```

-`--days 90`은 최근 90일 window만 카운트한다. timestamp가 없는 로그 줄은 파일 mtime으로 포함/제외를 결정한다. 출력은 파일 삭제를 하지 않는 JSON 리포트다. `zero_triggers`나 `low_usage`만 있는 항목은 바로 삭제하지 말고 검토 후보로 남긴다. `interview_never_use`가 포함된 항목은 사용자의 의도가 확인된 삭제 후보로 보고한다.
+`--days 90`은 최근 90일 window만 카운트한다. timestamp가 없는 로그 줄은 파일 mtime으로 포함/제외를 결정한다. 단, `--usage-json`으로 넣은 값은 이미 집계된 count로 간주하므로 `--days`/`--since`로 다시 필터링하지 않는다. 같은 기간의 통계를 export하거나 직접 전처리한 JSON을 넣어야 한다. 출력은 `usage_json`과 `scanned_logs` provenance를 포함하고, 파일 삭제를 하지 않는 JSON 리포트다. `zero_triggers`나 `low_usage`만 있는 항목은 바로 삭제하지 말고 검토 후보로 남긴다. `interview_never_use`가 포함된 항목은 사용자의 의도가 확인된 삭제 후보로 보고한다.
--- a/k-skill-cleaner/SKILL.md
+++ b/k-skill-cleaner/SKILL.md
@@ -53,12 +53,16 @@ For agent exports or hand-curated counts, pass a JSON object mapping skill name
 python3 scripts/k_skill_cleaner.py --skills-root . --usage-json usage-counts.json --days 90
 ```

+`--days` and `--since` filter scanned log records only. `--usage-json` values are already-aggregated counts, so prepare/export that JSON for the same time window before passing it to the helper.
+
 The helper prints JSON with:

 - `skill_count`: number of root-level skills discovered.
 - `candidates`: ranked `remove` or `review` candidates with `trigger_count` and `reasons`.
 - `agent_usage_sources`: the agent-specific paths and caveats above.
 - `time_window`: the effective `--since`/`--days` cutoff and mtime fallback caveat.
+- `usage_json`: whether imported counts were merged and the pre-windowing caveat.
+- `scanned_logs`: how many readable log files were scanned and which paths contributed best-effort evidence.
 - `safety`: reminder that no files were deleted.

 ## Recommendation policy
--- a/k-skill-cleaner/scripts/k_skill_cleaner.py
+++ b/k-skill-cleaner/scripts/k_skill_cleaner.py
@@ -228,17 +228,21 @@ def collect_skill_usage(
        path = Path(raw_path).expanduser()
        if not path.is_file():
            continue
-        for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
-            parsed: Any | None = None
-            try:
-                parsed = json.loads(line)
-            except json.JSONDecodeError:
-                parsed = None
-            if not _line_is_in_window(path, line, parsed, since_dt):
-                continue
-            for skill in skills:
-                if (parsed is not None and _json_mentions_skill(parsed, skill)) or _line_mentions_skill(line, skill):
-                    counts[skill] += 1
+        try:
+            with path.open(encoding="utf-8", errors="replace") as handle:
+                for line in handle:
+                    parsed: Any | None = None
+                    try:
+                        parsed = json.loads(line)
+                    except json.JSONDecodeError:
+                        parsed = None
+                    if not _line_is_in_window(path, line, parsed, since_dt):
+                        continue
+                    for skill in skills:
+                        if (parsed is not None and _json_mentions_skill(parsed, skill)) or _line_mentions_skill(line, skill):
+                            counts[skill] += 1
+        except OSError:
+            continue
    return counts


@@ -365,6 +369,7 @@ def main(argv: list[str] | None = None) -> int:
    if args.scan_default_logs:
        log_paths.extend(expand_default_log_paths())
    since = _resolve_since(args.days, args.since)
+    scanned_log_paths = sorted({str(path.expanduser()) for path in log_paths if path.expanduser().is_file()})
    log_counts = collect_skill_usage(log_paths, skill_names, since=since)
    for skill, count in log_counts.items():
        usage_counts[skill] = usage_counts.get(skill, 0) + count
@@ -382,8 +387,19 @@ def main(argv: list[str] | None = None) -> int:
        "time_window": {
            "since": since.isoformat() if since is not None else None,
            "days": args.days if args.since is None else None,
+            "scope": "Applies to scanned logs only; usage JSON counts are merged as already aggregated/pre-windowed input.",
            "fallback": "Untimestamped log lines are included or skipped by log file mtime.",
        },
+        "usage_json": {
+            "applied": args.usage_json is not None,
+            "path": args.usage_json,
+            "caveat": "Usage JSON counts are treated as already aggregated/pre-windowed and are not filtered by --days or --since.",
+        },
+        "scanned_logs": {
+            "count": len(scanned_log_paths),
+            "paths": scanned_log_paths,
+            "caveat": "Unreadable log files are skipped; trigger detection is best-effort.",
+        },
        "safety": "No files were deleted. Review candidates and remove skills in a separate explicit edit.",
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
--- a/scripts/skill-docs.test.js
+++ b/scripts/skill-docs.test.js
@@ -3281,6 +3281,7 @@ const README_SKILL_NAME_COLUMN_MAPPING = [
  ["네이버 뉴스 검색", "naver-news-search"],
  ["한국어 글자 수 세기", "korean-character-count"],
  ["한국어 유행어 글쓰기", "korean-slang-writing"],
+  ["K-스킬 클리너", "k-skill-cleaner"],
 ];

 test("README skill table header advertises the new 스킬 이름 column (issue #165)", () => {
@@ -3296,6 +3297,11 @@ test("README skill table header advertises the new 스킬 이름 column (issue #
 test("README skill table includes inline-code skill names for every documented row (issue #165)", () => {
  const readme = read("README.md");

+  assert.ok(
+    README_SKILL_NAME_COLUMN_MAPPING.some(([, skillName]) => skillName === "k-skill-cleaner"),
+    "expected k-skill-cleaner to be covered by the central README skill-name mapping fixture",
+  );
+
  for (const [label, skillName] of README_SKILL_NAME_COLUMN_MAPPING) {
    const escapedLabel = escapeRegex(label);
    const escapedName = escapeRegex(skillName);
--- a/scripts/test_k_skill_cleaner.py
+++ b/scripts/test_k_skill_cleaner.py
@@ -5,6 +5,7 @@ import sys
 import tempfile
 import unittest
 from pathlib import Path
+from unittest.mock import patch

 from k_skill_cleaner import (
    AGENT_USAGE_SOURCES,
@@ -85,6 +86,54 @@ class KSkillCleanerTest(unittest.TestCase):
            self.assertEqual(counts["fallback-skill"], 1)
            self.assertEqual(counts["old-fallback"], 0)

+    def test_collect_skill_usage_streams_log_files_without_reading_whole_file(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            log_path = Path(tmp) / "codex.jsonl"
+            log_path.write_text(json.dumps({"skill": "kbo-results"}) + "\n", encoding="utf-8")
+
+            with patch.object(Path, "read_text", side_effect=AssertionError("collect_skill_usage must stream logs")):
+                counts = collect_skill_usage([log_path], ["kbo-results", "unused"])
+
+            self.assertEqual(counts["kbo-results"], 1)
+            self.assertEqual(counts["unused"], 0)
+
+    def test_cli_reports_usage_json_provenance_and_window_caveat(self):
+        with tempfile.TemporaryDirectory() as tmp:
+            root = Path(tmp)
+            skill_dir = root / "kbo-results"
+            skill_dir.mkdir()
+            (skill_dir / "SKILL.md").write_text("---\nname: kbo-results\n", encoding="utf-8")
+            usage_json = root / "usage.json"
+            usage_json.write_text(json.dumps({"kbo-results": 3}), encoding="utf-8")
+
+            result = subprocess.run(
+                [
+                    sys.executable,
+                    "-c",
+                    (
+                        "import sys; "
+                        "from k_skill_cleaner import main; "
+                        "sys.exit(main(sys.argv[1:]))"
+                    ),
+                    "--skills-root",
+                    str(root),
+                    "--usage-json",
+                    str(usage_json),
+                    "--days",
+                    "90",
+                ],
+                check=True,
+                text=True,
+                capture_output=True,
+            )
+            report = json.loads(result.stdout)
+
+            self.assertTrue(report["usage_json"]["applied"])
+            self.assertEqual(report["usage_json"]["path"], str(usage_json))
+            self.assertIn("pre-windowed", report["usage_json"]["caveat"])
+            self.assertEqual(report["scanned_logs"]["count"], 0)
+            self.assertIn("usage JSON", report["time_window"]["scope"])
+
    def test_ranks_deletion_candidates_with_interview_and_usage_reasons(self):
        candidates = rank_cleanup_candidates(
            skill_names=["unused", "rare", "protected", "active"],