Clarify cleaner usage evidence boundaries

The cleanup helper now streams local logs, reports which evidence sources were merged, and keeps README table coverage tied to the central skill-name fixture so the documented cleanup signal stays trustworthy for large local histories and mixed imported counts.

Constraint: Follow-up addresses PR #178 review comments without changing the non-destructive recommendation model.

Rejected: Filtering imported usage JSON by --days inside the helper | imported counts are already aggregated and lack per-record timestamps.

Confidence: high

Scope-risk: narrow

Directive: Keep --usage-json documented as pre-windowed unless the input schema gains timestamped per-record events.

Tested: PYTHONPATH=scripts python3 -m unittest scripts.test_k_skill_cleaner

Tested: node --test scripts/skill-docs.test.js

Tested: npm run lint

Tested: npm run typecheck && npm test

Tested: npm run ci
This commit is contained in:
Jeffrey (Dongkyu) Kim 2026-04-28 18:08:17 +09:00
commit 0b280839d6
5 changed files with 87 additions and 12 deletions

View file

@ -31,4 +31,4 @@ python3 scripts/k_skill_cleaner.py \
--keep k-skill-setup,k-skill-cleaner
```
`--days 90`은 최근 90일 window만 카운트한다. timestamp가 없는 로그 줄은 파일 mtime으로 포함/제외를 결정한다. 출력은 파일 삭제를 하지 않는 JSON 리포트다. `zero_triggers` 또는 `low_usage`만 있는 항목은 바로 삭제하지 말고 검토 후보로 남긴다. `interview_never_use`가 포함된 항목은 사용자의 의도가 확인된 삭제 후보로 보고한다.
`--days 90`은 최근 90일 window만 카운트한다. timestamp가 없는 로그 줄은 파일 mtime으로 포함/제외를 결정한다. 단, `--usage-json`으로 넣은 값은 이미 집계된 count로 간주하므로 `--days`/`--since`로 다시 필터링하지 않는다. 같은 기간의 통계를 export하거나 직접 전처리한 JSON을 넣어야 한다. 출력은 `usage_json`, `scanned_logs` provenance를 포함하고, 파일 삭제를 하지 않는 JSON 리포트다. `zero_triggers` 또는 `low_usage`만 있는 항목은 바로 삭제하지 말고 검토 후보로 남긴다. `interview_never_use`가 포함된 항목은 사용자의 의도가 확인된 삭제 후보로 보고한다.

View file

@ -53,12 +53,16 @@ For agent exports or hand-curated counts, pass a JSON object mapping skill name
python3 scripts/k_skill_cleaner.py --skills-root . --usage-json usage-counts.json --days 90
```
`--days` and `--since` filter scanned log records only. `--usage-json` values are already-aggregated counts, so prepare/export that JSON for the same time window before passing it to the helper.
The helper prints JSON with:
- `skill_count`: number of root-level skills discovered.
- `candidates`: ranked `remove` or `review` candidates with `trigger_count` and `reasons`.
- `agent_usage_sources`: the agent-specific paths and caveats above.
- `time_window`: the effective `--since`/`--days` cutoff and mtime fallback caveat.
- `usage_json`: whether imported counts were merged and the pre-windowing caveat.
- `scanned_logs`: how many readable log files were scanned and which paths contributed best-effort evidence.
- `safety`: reminder that no files were deleted.
## Recommendation policy

View file

@ -228,17 +228,21 @@ def collect_skill_usage(
path = Path(raw_path).expanduser()
if not path.is_file():
continue
for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
parsed: Any | None = None
try:
parsed = json.loads(line)
except json.JSONDecodeError:
parsed = None
if not _line_is_in_window(path, line, parsed, since_dt):
continue
for skill in skills:
if (parsed is not None and _json_mentions_skill(parsed, skill)) or _line_mentions_skill(line, skill):
counts[skill] += 1
try:
with path.open(encoding="utf-8", errors="replace") as handle:
for line in handle:
parsed: Any | None = None
try:
parsed = json.loads(line)
except json.JSONDecodeError:
parsed = None
if not _line_is_in_window(path, line, parsed, since_dt):
continue
for skill in skills:
if (parsed is not None and _json_mentions_skill(parsed, skill)) or _line_mentions_skill(line, skill):
counts[skill] += 1
except OSError:
continue
return counts
@ -365,6 +369,7 @@ def main(argv: list[str] | None = None) -> int:
if args.scan_default_logs:
log_paths.extend(expand_default_log_paths())
since = _resolve_since(args.days, args.since)
scanned_log_paths = sorted({str(path.expanduser()) for path in log_paths if path.expanduser().is_file()})
log_counts = collect_skill_usage(log_paths, skill_names, since=since)
for skill, count in log_counts.items():
usage_counts[skill] = usage_counts.get(skill, 0) + count
@ -382,8 +387,19 @@ def main(argv: list[str] | None = None) -> int:
"time_window": {
"since": since.isoformat() if since is not None else None,
"days": args.days if args.since is None else None,
"scope": "Applies to scanned logs only; usage JSON counts are merged as already aggregated/pre-windowed input.",
"fallback": "Untimestamped log lines are included or skipped by log file mtime.",
},
"usage_json": {
"applied": args.usage_json is not None,
"path": args.usage_json,
"caveat": "Usage JSON counts are treated as already aggregated/pre-windowed and are not filtered by --days or --since.",
},
"scanned_logs": {
"count": len(scanned_log_paths),
"paths": scanned_log_paths,
"caveat": "Unreadable log files are skipped; trigger detection is best-effort.",
},
"safety": "No files were deleted. Review candidates and remove skills in a separate explicit edit.",
}
print(json.dumps(report, ensure_ascii=False, indent=2))

View file

@ -3281,6 +3281,7 @@ const README_SKILL_NAME_COLUMN_MAPPING = [
["네이버 뉴스 검색", "naver-news-search"],
["한국어 글자 수 세기", "korean-character-count"],
["한국어 유행어 글쓰기", "korean-slang-writing"],
["K-스킬 클리너", "k-skill-cleaner"],
];
test("README skill table header advertises the new 스킬 이름 column (issue #165)", () => {
@ -3296,6 +3297,11 @@ test("README skill table header advertises the new 스킬 이름 column (issue #
test("README skill table includes inline-code skill names for every documented row (issue #165)", () => {
const readme = read("README.md");
assert.ok(
README_SKILL_NAME_COLUMN_MAPPING.some(([, skillName]) => skillName === "k-skill-cleaner"),
"expected k-skill-cleaner to be covered by the central README skill-name mapping fixture",
);
for (const [label, skillName] of README_SKILL_NAME_COLUMN_MAPPING) {
const escapedLabel = escapeRegex(label);
const escapedName = escapeRegex(skillName);

View file

@ -5,6 +5,7 @@ import sys
import tempfile
import unittest
from pathlib import Path
from unittest.mock import patch
from k_skill_cleaner import (
AGENT_USAGE_SOURCES,
@ -85,6 +86,54 @@ class KSkillCleanerTest(unittest.TestCase):
self.assertEqual(counts["fallback-skill"], 1)
self.assertEqual(counts["old-fallback"], 0)
def test_collect_skill_usage_streams_log_files_without_reading_whole_file(self):
# Regression guard: collect_skill_usage must not go through Path.read_text.
# A one-line JSONL log mentioning "kbo-results" is written to a temp dir.
with tempfile.TemporaryDirectory() as tmp:
log_path = Path(tmp) / "codex.jsonl"
log_path.write_text(json.dumps({"skill": "kbo-results"}) + "\n", encoding="utf-8")
# Path.read_text is patched to raise AssertionError, so the call below can
# only succeed if the log is read some other way.
with patch.object(Path, "read_text", side_effect=AssertionError("collect_skill_usage must stream logs")):
counts = collect_skill_usage([log_path], ["kbo-results", "unused"])
# The mentioned skill is counted once; the skill absent from the log stays at 0.
self.assertEqual(counts["kbo-results"], 1)
self.assertEqual(counts["unused"], 0)
def test_cli_reports_usage_json_provenance_and_window_caveat(self):
# End-to-end check of the CLI report: with --usage-json and --days together,
# the JSON printed on stdout must expose usage-JSON provenance and the
# pre-windowing caveat.
with tempfile.TemporaryDirectory() as tmp:
root = Path(tmp)
# Minimal skills root: one skill directory containing a SKILL.md.
skill_dir = root / "kbo-results"
skill_dir.mkdir()
(skill_dir / "SKILL.md").write_text("---\nname: kbo-results\n", encoding="utf-8")
# Imported, already-aggregated counts for that skill.
usage_json = root / "usage.json"
usage_json.write_text(json.dumps({"kbo-results": 3}), encoding="utf-8")
# Run main() in a child interpreter so stdout can be captured as the report.
# NOTE(review): presumably relies on the inherited environment (e.g. PYTHONPATH)
# for the child to import k_skill_cleaner; confirm against the test runner setup.
result = subprocess.run(
[
sys.executable,
"-c",
(
"import sys; "
"from k_skill_cleaner import main; "
"sys.exit(main(sys.argv[1:]))"
),
"--skills-root",
str(root),
"--usage-json",
str(usage_json),
"--days",
"90",
],
check=True,
text=True,
capture_output=True,
)
report = json.loads(result.stdout)
# usage_json section: flagged as applied, echoes the path given, carries the caveat.
self.assertTrue(report["usage_json"]["applied"])
self.assertEqual(report["usage_json"]["path"], str(usage_json))
self.assertIn("pre-windowed", report["usage_json"]["caveat"])
# The command line above passes no log-path options; the test expects zero scanned logs.
self.assertEqual(report["scanned_logs"]["count"], 0)
# time_window.scope must mention that usage JSON is outside the --days/--since filter.
self.assertIn("usage JSON", report["time_window"]["scope"])
def test_ranks_deletion_candidates_with_interview_and_usage_reasons(self):
candidates = rank_cleanup_candidates(
skill_names=["unused", "rare", "protected", "active"],