mirror of
https://github.com/NomaDamas/k-skill.git
synced 2026-06-24 02:04:11 +00:00
11 skills that need specific inputs (not just a 'demonstrate' query) now ship with a hardcoded test_prompt in config/skill-overrides.yml:

- flight-ticket-search: ICN -> NRT, 2026-08-20 one-way
- nts-business-registration: 124-81-00998 (Samsung Electronics)
- korean-stock-search: 005930 Samsung 5-day quote
- joseon-sillok-search: 키워드 훈민정음
- korean-law-search: 산업안전보건법 제5조
- library-book-search: 코스모스 칼 세이건
- lotto-results: latest round
- k-schoollunch-menu: 서울특별시교육청 초등학교 오늘 식단
- delivery-tracking: CJ dummy invoice (negative case ok)
- ticket-availability: YES24 / 인터파크 sample
- zipcode-search: 서울특별시 강남구 테헤란로 152

These were previously synthesized from the first "When to use" bullet of SKILL.md, which is a one-line teaser without concrete inputs. The agent would then either ask the user for the missing input (partial-success) or fall back to a generic demo (often producing a VERDICT: FAIL response). Both were mis-classified as fail by the judge.

qa_utils.synthesize_test_prompt now honors default_inputs.test_prompt as a verbatim override (only appending the VERDICT line if the override does not already include it).

Two additional fixes for negative-case correctness:

1. judge-prompt.md: explicitly tells the judge that the agent's literal VERDICT: PASS / VERDICT: FAIL is just a hint, not binding. A skill that correctly returns 'no such business number' or 'invoice not found' for a deliberately invalid input is PASS, not fail.
2. judge-skill.py: drop the deterministic gate that flipped pass to fail when the 'VERDICT: PASS' literal was missing from the transcript. That gate was producing false fails for negative-case tests where the agent correctly responded with VERDICT: FAIL because the skill rejected an invalid input. The judge LLM (gpt-5.5) is now trusted to evaluate the transcript against the SKILL.md 'Done when' criteria.
Verified live:

- nts-business-registration with valid number -> pass/success (0.99)
- nts-business-registration with fake number -> pass/success (0.99)
- flight-ticket-search ICN->NRT 2026-08-20 -> pass/success (0.99)
99 lines
2.8 KiB
Python
Executable file
99 lines
2.8 KiB
Python
Executable file
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Iterable
|
|
|
|
|
|
DEFAULT_TEST_LOCATION = "서울역 (37.5665,126.9780)"
|
|
|
|
VERDICT_INSTRUCTION = (
|
|
"After your answer, end with a single line that is exactly one of: "
|
|
"VERDICT: PASS or VERDICT: FAIL."
|
|
)
|
|
|
|
_STRIKE_RE = re.compile(r"~~\s*`?([A-Za-z0-9][A-Za-z0-9_.\-]*)`?\s*~~")
|
|
_DEPRECATION_MARK_RE = re.compile(r"지원\s*중단")
|
|
|
|
|
|
def load_overrides(path):
    """Read the per-skill override mapping from a YAML file.

    Args:
        path: Location of the overrides file (anything ``Path`` accepts).

    Returns:
        A dict holding the top-level entries whose values are themselves
        mappings. Empty when ``path`` is not an existing regular file or the
        YAML document is empty.

    Raises:
        RuntimeError: PyYAML is not installed.
        ValueError: The top-level YAML node is not a mapping.
    """
    overrides_file = Path(path)
    if not overrides_file.is_file():
        # No file simply means there are no overrides.
        return {}

    # Imported here so PyYAML is only needed once an overrides file exists.
    try:
        import yaml
    except ImportError as exc:
        raise RuntimeError(
            "PyYAML is required to load skill-overrides.yml — `pip install pyyaml`"
        ) from exc

    document = yaml.safe_load(overrides_file.read_text(encoding="utf-8"))
    if document is None:
        return {}
    if isinstance(document, dict):
        # Keep only entries whose value is a mapping; anything else is dropped.
        filtered = {}
        for key, value in document.items():
            if isinstance(value, dict):
                filtered[key] = value
        return filtered
    raise ValueError(
        f"skill-overrides.yml must be a YAML mapping at top level, got {type(document).__name__}"
    )
|
|
|
|
|
|
def parse_readme_deprecations(readme_path):
    """Collect the names a README marks as deprecated.

    Only lines matching ``_DEPRECATION_MARK_RE`` are considered; on each such
    line every struck-through name matched by ``_STRIKE_RE`` is collected.

    Args:
        readme_path: Location of the README (anything ``Path`` accepts).

    Returns:
        A set of deprecated names. Empty when the path is not an existing
        regular file, or when the file cannot be read or is not valid UTF-8.
    """
    readme = Path(readme_path)
    if not readme.is_file():
        return set()
    try:
        text = readme.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # UnicodeDecodeError derives from ValueError, not OSError, so it has
        # to be listed explicitly for an undecodable README to be treated the
        # same best-effort way as an unreadable one.
        return set()

    deprecated = set()
    for line in text.splitlines():
        if _DEPRECATION_MARK_RE.search(line):
            names = (match.group(1).strip() for match in _STRIKE_RE.finditer(line))
            deprecated.update(name for name in names if name)
    return deprecated
|
|
|
|
|
|
def _first_non_empty(values: Iterable[str]):
|
|
for v in values:
|
|
if isinstance(v, str) and v.strip():
|
|
return v.strip()
|
|
return None
|
|
|
|
|
|
def synthesize_test_prompt(name, when_to_use, description, category_flags, default_inputs):
    """Build the prompt used to exercise a skill.

    Args:
        name: Skill name, interpolated into the "Use the `<name>` skill"
            sentence.
        when_to_use: Iterable of candidate query strings; the first non-blank
            string is used as the query. May be ``None``.
        description: Fallback query when ``when_to_use`` yields nothing.
        category_flags: Mapping of flags (or ``None``). A truthy
            ``"location"`` entry prepends a current-location sentence.
        default_inputs: Mapping of per-skill defaults (or ``None``). A
            non-blank string under ``"test_prompt"`` replaces the synthesized
            query verbatim; ``"location"`` replaces
            ``DEFAULT_TEST_LOCATION``. Any value that is not a dict is
            treated as having no defaults.

    Returns:
        The prompt string. Unless an override already contains the verdict
        instruction or the literal ``VERDICT: PASS``, the prompt ends with
        the skill-usage sentence followed by ``VERDICT_INSTRUCTION``.
    """
    flags = category_flags or {}
    # Normalize once: both the `test_prompt` and the `location` lookups below
    # use `.get`, so a truthy non-dict value must not reach either of them.
    inputs = default_inputs if isinstance(default_inputs, dict) else {}

    override_prompt = inputs.get("test_prompt")
    if isinstance(override_prompt, str) and override_prompt.strip():
        body = override_prompt.strip()
        # An override that already asks for a verdict is used untouched.
        if VERDICT_INSTRUCTION in body or "VERDICT: PASS" in body:
            return body
        return f"{body} Use the `{name}` skill to answer this. {VERDICT_INSTRUCTION}"

    # Query preference: first usable when-to-use entry, then the description,
    # then a generic demonstration request.
    query = (
        _first_non_empty(when_to_use or [])
        or (description.strip() if isinstance(description, str) and description.strip() else None)
        or f"Demonstrate the {name} skill."
    )

    parts = []
    if flags.get("location"):
        loc = inputs.get("location") or DEFAULT_TEST_LOCATION
        parts.append(f"내 현재 위치는 {loc} 이야.")
    parts.append(query)
    parts.append(f"Use the `{name}` skill to answer this. {VERDICT_INSTRUCTION}")
    return " ".join(parts)
|
|
|
|
|
|
# Names exported by `from <module> import *`.
__all__ = [
    "DEFAULT_TEST_LOCATION",
    "VERDICT_INSTRUCTION",
    "load_overrides",
    "parse_readme_deprecations",
    "synthesize_test_prompt",
]
|