k-skill/scripts/geeknews_search.py
Jeffrey (Dongkyu) Kim 43e8625986 Add a repeatable GeekNews lookup path without unofficial APIs
Issue #108 needs a dedicated read-only skill that can browse,
search, and inspect GeekNews posts using the public feed alone.
The implementation adds a fixture-backed Atom helper, skill/docs
surfaces, and install/README wiring, then verifies the helper
against the live GeekNews feed.

Constraint: Must stay RSS-first and avoid new dependencies or unofficial APIs
Constraint: Skill development requires syncing the skill into ~/.claude/skills and ~/.agents/skills during verification
Rejected: Fetch article pages directly for v1 | expands scope beyond the approved RSS-driven workflow
Rejected: Use XML parser modules | current python3 environment has expat issues, so regex + HTML parsing is safer here
Confidence: high
Scope-risk: narrow
Reversibility: clean
Directive: Keep the root helper and geeknews-search/scripts copy behaviorally identical because the installed skill must remain self-contained
Tested: PYTHONPATH=.:scripts python3 -m unittest scripts.test_geeknews_search; node --test scripts/skill-docs.test.js; python3 scripts/geeknews_search.py list --limit 3; python3 scripts/geeknews_search.py search --query Claude --limit 3; python3 scripts/geeknews_search.py detail --id 28439; npm run ci
Not-tested: Non-default feed mirrors or future Atom schema changes beyond the current public GeekNews feed shape
Related: Issue #108
2026-04-13 00:16:35 +09:00

296 lines
9.4 KiB
Python
Executable file

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import re
import urllib.request
from dataclasses import asdict, dataclass
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
# Feed location used when no other URL is supplied: it is the default for
# fetch_feed() and for the --feed-url command-line option.
GEEKNEWS_FEED_URL = "https://feeds.feedburner.com/geeknews-feed"
class _TextExtractor(HTMLParser):
def __init__(self) -> None:
super().__init__()
self.parts: list[str] = []
def handle_data(self, data: str) -> None:
self.parts.append(data)
def text(self) -> str:
return " ".join(part.strip() for part in self.parts if part.strip())
@dataclass(frozen=True)
class GeekNewsItem:
    """A single feed entry; instances are immutable (frozen dataclass)."""

    # Identity and location of the entry.
    id: str
    title: str
    link: str
    # Timestamps are stored as text; None when absent.
    published: str | None
    updated: str | None
    # Author details; None when absent.
    author_name: str | None
    author_url: str | None
    # Plain-text summary and the HTML body it accompanies.
    summary: str
    content_html: str

    def to_dict(self) -> dict[str, object]:
        """Return all fields as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass(frozen=True)
class GeekNewsFeed:
    """Feed-level metadata together with the parsed entries."""

    title: str
    source_id: str | None
    updated: str | None
    home_url: str | None
    feed_url: str | None
    category: str | None
    items: list[GeekNewsItem]

    def source_dict(self) -> dict[str, object]:
        """Return the feed metadata (everything except ``items``) as a dict.

        ``source_id`` is exposed under the key ``"id"``; the other keys match
        the attribute names.
        """
        return dict(
            title=self.title,
            id=self.source_id,
            updated=self.updated,
            home_url=self.home_url,
            feed_url=self.feed_url,
            category=self.category,
        )
def _strip_cdata(value: str | None) -> str:
if not value:
return ""
stripped = value.strip()
if stripped.startswith("<![CDATA[") and stripped.endswith("]]>"):
return stripped[9:-3]
return stripped
def _collapse_whitespace(value: str) -> str:
return re.sub(r"\s+", " ", value).strip()
def _clean_xml_text(value: str | None) -> str:
    """Normalize the text of an XML element.

    Steps, in order: drop a CDATA wrapper, decode character references, then
    collapse whitespace. ``None`` produces an empty string.
    """
    without_cdata = _strip_cdata(value)
    decoded = unescape(without_cdata)
    return _collapse_whitespace(decoded)
def _html_to_text(html: str) -> str:
    """Return the visible text of an HTML fragment as one normalized line."""
    extractor = _TextExtractor()
    extractor.feed(html)
    extractor.close()
    # NOTE(review): HTMLParser converts character references by default, so
    # this unescape decodes a second layer of escaping - confirm that is
    # intended for the feed's content encoding.
    plain = unescape(extractor.text())
    return _collapse_whitespace(plain)
def _first_tag(block: str, tag: str) -> str | None:
    """Return the cleaned text of the first ``<tag>...</tag>`` in *block*.

    The inner text is passed through ``_clean_xml_text``. Returns ``None``
    when the element does not occur.
    """
    element = re.compile(rf"<{tag}\b[^>]*>(.*?)</{tag}>", re.DOTALL)
    found = element.search(block)
    return _clean_xml_text(found.group(1)) if found else None
def _first_raw_tag(block: str, tag: str) -> str | None:
    """Return the inner markup of the first ``<tag>...</tag>`` in *block*.

    Only a CDATA wrapper and surrounding whitespace are removed; character
    references are left untouched. Returns ``None`` when the element does
    not occur.
    """
    element = re.compile(rf"<{tag}\b[^>]*>(.*?)</{tag}>", re.DOTALL)
    found = element.search(block)
    if found is None:
        return None
    inner = _strip_cdata(found.group(1))
    return inner.strip()
def _first_link_href(block: str) -> str | None:
patterns = (
r"<link\b[^>]*rel=['\"]alternate['\"][^>]*href=['\"]([^'\"]+)['\"]",
r"<link\b[^>]*href=['\"]([^'\"]+)['\"]",
)
for pattern in patterns:
match = re.search(pattern, block)
if match:
return unescape(match.group(1).strip())
return None
def _link_href(block: str, *, rel: str | None = None) -> str | None:
if rel:
match = re.search(
rf"<link\b[^>]*(?:rel|ref)=['\"]{re.escape(rel)}['\"][^>]*href=['\"]([^'\"]+)['\"]",
block,
)
if match:
return unescape(match.group(1).strip())
return _first_link_href(block)
def _feed_prefix(xml_text: str) -> str:
if "<entry" not in xml_text:
return xml_text
return xml_text.split("<entry", 1)[0]
def _entry_blocks(xml_text: str) -> list[str]:
return re.findall(r"<entry\b[^>]*>(.*?)</entry>", xml_text, re.DOTALL)
def _validate_limit(limit: int) -> int:
if limit <= 0:
raise ValueError("limit must be positive")
return limit
def load_feed(xml_text: str) -> GeekNewsFeed:
    """Parse Atom XML text into a ``GeekNewsFeed``.

    Feed-level metadata is read from the text that precedes the first entry;
    every ``<entry>`` element becomes one ``GeekNewsItem``. Parsing is
    regex-based, and missing elements turn into ``""`` or ``None`` rather
    than raising.
    """

    def _to_item(entry: str) -> GeekNewsItem:
        """Build one item from the inner XML of an ``<entry>`` element."""
        author = re.search(r"<author\b[^>]*>(.*?)</author>", entry, re.DOTALL)
        author_xml = author.group(1) if author else ""
        entry_id = _first_tag(entry, "id") or ""
        updated = _first_tag(entry, "updated")
        body_html = (_first_raw_tag(entry, "content") or "").strip()
        return GeekNewsItem(
            id=entry_id,
            title=_first_tag(entry, "title") or "",
            # The entry id stands in when the entry has no usable link.
            link=_first_link_href(entry) or entry_id,
            # An entry without <published> reuses its <updated> value.
            published=_first_tag(entry, "published") or updated,
            updated=updated,
            author_name=_first_tag(author_xml, "name"),
            author_url=_first_tag(author_xml, "uri"),
            summary=_html_to_text(body_html),
            content_html=body_html,
        )

    header = _feed_prefix(xml_text)
    parsed_items = [_to_item(entry) for entry in _entry_blocks(xml_text)]
    category = re.search(r"<category\b[^>]*term=['\"]([^'\"]+)['\"]", header)
    return GeekNewsFeed(
        title=_first_tag(header, "title") or "GeekNews",
        source_id=_first_tag(header, "id"),
        updated=_first_tag(header, "updated"),
        home_url=_link_href(header, rel="alternate"),
        feed_url=_link_href(header, rel="self") or _first_tag(header, "id"),
        category=category.group(1) if category else None,
        items=parsed_items,
    )
def list_items(feed: GeekNewsFeed, limit: int = 10) -> list[GeekNewsItem]:
    """Return at most *limit* items from the start of ``feed.items``.

    Raises:
        ValueError: If *limit* is not positive.
    """
    count = _validate_limit(limit)
    return feed.items[:count]
def search_items(feed: GeekNewsFeed, query: str, limit: int = 10) -> list[GeekNewsItem]:
    """Return up to *limit* items containing *query*, ignoring case.

    The query is trimmed before matching, so surrounding whitespace does not
    prevent a hit. An item matches when the trimmed query is a substring of
    its title, summary, author name, author URL, id or link.

    Args:
        feed: Feed whose items are searched in order.
        query: Text to look for; must contain a non-whitespace character.
        limit: Maximum number of matches to return; must be positive.

    Raises:
        ValueError: If *query* is blank or *limit* is not positive.
    """
    trimmed = query.strip()
    if not trimmed:
        raise ValueError("query is required")
    limit = _validate_limit(limit)
    needle = trimmed.casefold()
    matches = []
    for item in feed.items:
        fields = (
            item.title,
            item.summary,
            item.author_name or "",
            item.author_url or "",
            item.id,
            item.link,
        )
        # Fields are joined with newlines so a match cannot straddle two
        # neighbouring fields.
        haystack = "\n".join(part for part in fields if part).casefold()
        if needle in haystack:
            matches.append(item)
            if len(matches) >= limit:
                break
    return matches
def get_item_detail(feed: GeekNewsFeed, lookup: str) -> GeekNewsItem:
    """Return the item that best matches *lookup*.

    Matching ignores case and surrounding whitespace and considers each
    item's id, link and title. An item whose id, link or title equals the
    lookup wins over items that merely contain it, wherever it sits in the
    feed. Without an exact match, the first item containing the lookup is
    returned.

    Args:
        feed: Feed whose items are examined in order.
        lookup: Full or partial id, link or title.

    Raises:
        ValueError: If *lookup* is blank.
        LookupError: If no item matches.
    """
    needle = lookup.strip().casefold()
    if not needle:
        raise ValueError("lookup is required")
    first_partial = None
    for item in feed.items:
        candidates = [
            value.casefold() for value in (item.id, item.link, item.title) if value
        ]
        if needle in candidates:
            return item
        # Keep only the earliest substring hit and continue scanning, since
        # an exact match may still follow.
        if first_partial is None and any(needle in value for value in candidates):
            first_partial = item
    if first_partial is not None:
        return first_partial
    raise LookupError(f"No GeekNews entry matched: {lookup}")
def _serialize_items(items: list[GeekNewsItem]) -> list[dict[str, object]]:
return [item.to_dict() for item in items]
def build_list_payload(feed: GeekNewsFeed, limit: int = 10) -> dict[str, object]:
    """Build the payload for a listing.

    The result has the feed metadata under ``"source"``, the number of
    returned items under ``"count"`` and the serialized items under
    ``"items"``.

    Raises:
        ValueError: If *limit* is not positive.
    """
    selected = list_items(feed, limit=limit)
    return dict(
        source=feed.source_dict(),
        count=len(selected),
        items=_serialize_items(selected),
    )
def build_search_payload(feed: GeekNewsFeed, query: str, limit: int = 10) -> dict[str, object]:
    """Build the payload for a search.

    The result has the feed metadata under ``"source"``, the query exactly as
    given under ``"query"``, the number of matches under ``"count"`` and the
    serialized matches under ``"items"``.

    Raises:
        ValueError: If *query* is blank or *limit* is not positive.
    """
    found = search_items(feed, query=query, limit=limit)
    return dict(
        source=feed.source_dict(),
        query=query,
        count=len(found),
        items=_serialize_items(found),
    )
def build_detail_payload(feed: GeekNewsFeed, lookup: str) -> dict[str, object]:
    """Build the payload for a single item.

    The result has the feed metadata under ``"source"`` and the serialized
    item under ``"item"``.

    Raises:
        ValueError: If *lookup* is blank.
        LookupError: If no item matches *lookup*.
    """
    matched = get_item_detail(feed, lookup)
    return dict(source=feed.source_dict(), item=matched.to_dict())
def fetch_feed(url: str = GEEKNEWS_FEED_URL, timeout: int = 20) -> str:
    """Download the feed at *url* and return its body as text.

    The request carries a ``User-Agent`` header. The body is decoded with the
    charset announced in the response headers, or UTF-8 when none is given;
    undecodable bytes are replaced instead of raising.

    Args:
        url: Address of the feed to download.
        timeout: Value passed to ``urlopen`` as its timeout.
    """
    headers = {"User-Agent": "k-skill-geeknews/1.0"}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        announced = response.headers.get_content_charset()
        encoding = announced if announced else "utf-8"
        raw_body = response.read()
        return raw_body.decode(encoding, errors="replace")
def _add_feed_source_args(parser: argparse.ArgumentParser) -> None:
    """Register the two feed-source options on *parser*.

    ``--feed-url`` defaults to ``GEEKNEWS_FEED_URL``; ``--feed-file`` names a
    local Atom XML file and has no default.
    """
    add_option = parser.add_argument
    add_option(
        "--feed-url",
        default=GEEKNEWS_FEED_URL,
        help="기본값: GeekNews public feed URL",
    )
    add_option(
        "--feed-file",
        help="테스트/오프라인 검증용 로컬 Atom XML 파일",
    )
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the ``list``/``search``/``detail`` CLI.

    Args:
        argv: Argument list handed to argparse; ``None`` lets argparse read
            the process arguments.

    Returns:
        Namespace whose ``command`` attribute names the chosen subcommand.
        A subcommand is required.
    """
    parser = argparse.ArgumentParser(description="Read GeekNews entries from the public RSS/Atom feed.")
    commands = parser.add_subparsers(dest="command", required=True)

    listing = commands.add_parser("list", help="최신 GeekNews 항목 목록")
    searching = commands.add_parser("search", help="제목/요약/작성자 기준 검색")
    detail = commands.add_parser("detail", help="항목 상세 확인")

    # Every subcommand accepts the same feed-source options first.
    for subcommand in (listing, searching, detail):
        _add_feed_source_args(subcommand)

    searching.add_argument("--query", required=True)
    for subcommand in (listing, searching):
        subcommand.add_argument("--limit", type=int, default=10)
    detail.add_argument("--id", required=True, help="entry id/link/topic id 일부")

    return parser.parse_args(argv)
def _load_feed_text(args: argparse.Namespace) -> str:
    """Return the feed XML selected by the parsed arguments.

    A non-empty ``args.feed_file`` is read from disk as UTF-8; otherwise the
    feed is downloaded from ``args.feed_url``.
    """
    local_path = args.feed_file
    if not local_path:
        return fetch_feed(url=args.feed_url)
    return Path(local_path).read_text(encoding="utf-8")
def main(argv: list[str] | None = None) -> None:
    """Run the CLI: load the feed, build the requested payload, print JSON.

    The payload is printed with two-space indentation and non-ASCII
    characters left unescaped. Exceptions raised while loading or querying
    the feed are not caught here.
    """
    args = parse_args(argv)
    feed = load_feed(_load_feed_text(args))
    command = args.command
    if command == "search":
        payload = build_search_payload(feed, query=args.query, limit=args.limit)
    elif command == "list":
        payload = build_list_payload(feed, limit=args.limit)
    else:
        # Any command other than "list" and "search" is handled as "detail".
        payload = build_detail_payload(feed, lookup=args.id)
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    print(rendered)


if __name__ == "__main__":
    main()