mirror of
https://github.com/NomaDamas/k-skill.git
synced 2026-06-24 02:04:11 +00:00
korean-slang-writing (#133): extract summaries via h2 section anchor + og:description fallback
Namu Wiki's current HTML layout uses build-time-obfuscated CSS class
names (e.g. _36R8DWTn, OZVChh+l) and has no <article>/<main>/<section>
tags, so all six MAIN_CONTENT_CLASSES anchors fail to match and
extract_summary() returns an empty string with a 'Main content region
not detected' warning on every live page.
Replace the single class-based strategy with a three-tier fallback
chain that pins to progressively weaker but more structurally stable
anchors:
1. First h2 section boundary. Namu Wiki articles consistently open
with '<h2>1. 개요[편집]</h2>' and mark subsequent sections with
numbered h2 headings. Extracting text between the first and
second h2 reliably captures the overview section on every page
sampled (중꺾마, 갓생, 럭키비키, 어쩔티비).
2. MAIN_CONTENT_CLASSES / <article> - kept as a legacy fallback
for older Namu Wiki layouts and for third-party fixtures.
3. og:description meta tag - final safety net before returning an
empty summary; it gives the agent at least a ~64-char preview when
the article has an unusual structure.
Strip '[편집]' edit-affordance markers and numbered section prefixes
(e.g. '1.2.') from the extracted text so headings don't leak through
as noise.
Live verification (text format):
slang_lookup.py 중꺾마 -> Title + 286-char summary
slang_lookup.py 갓생 -> Title + 96-char summary
slang_lookup.py 럭키비키 -> Title + 59-char summary
slang_lookup.py 어쩔티비 -> Title + 20-char summary
All four lookups previously returned an empty summary. Not-found /
blocked / upstream-error paths and exit codes are unchanged.
This commit is contained in:
parent
541967e96c
commit
4f31dae11f
2 changed files with 158 additions and 8 deletions
|
|
@ -32,6 +32,17 @@ BLOCK_END_RE = re.compile(r"</(p|div|li|h[1-6])>", re.IGNORECASE)
|
|||
# <br>, <br/> and <br /> line breaks, in any letter case.
BR_RE = re.compile(r"<br\s*/?>", flags=re.IGNORECASE)
# Runs of spaces/tabs (newlines are deliberately not included).
WHITESPACE_RE = re.compile(r"[ \t]+")
# Three or more consecutive newlines.
BLANK_LINES_RE = re.compile(r"\n{3,}")
# One complete <h2 ...>...</h2> element; DOTALL lets the heading span lines.
H2_TAG_RE = re.compile(r"<h2\b[^>]*>.*?</h2>", flags=re.IGNORECASE | re.DOTALL)
# Section numbering such as "1." or "1.2." at the start of any line.
SECTION_NUMBER_PREFIX_RE = re.compile(
    r"^\s*\d+(?:\.\d+)*\.\s+",
    flags=re.MULTILINE,
)
# The "[편집]" marker, allowing whitespace just inside the brackets.
EDIT_AFFORDANCE_RE = re.compile(r"\[\s*편집\s*\]")
# og:description <meta> tag written as property=... then content=...;
# group 1 is the double-quoted content value.
OG_DESCRIPTION_RE = re.compile(
    r'<meta\s+[^>]*property\s*=\s*"og:description"'
    r'\s+[^>]*content\s*=\s*"([^"]*)"',
    flags=re.IGNORECASE,
)
# Same tag with the attributes in the opposite order (content=... first);
# group 1 is again the content value.
OG_DESCRIPTION_REVERSED_RE = re.compile(
    r'<meta\s+[^>]*content\s*=\s*"([^"]*)"'
    r'\s+[^>]*property\s*=\s*"og:description"',
    flags=re.IGNORECASE,
)
|
||||
|
||||
MAIN_CONTENT_CLASSES = (
|
||||
"wiki-paragraph",
|
||||
|
|
@ -73,11 +84,29 @@ def _find_main_content(cleaned_html: str) -> str:
|
|||
return ""
|
||||
|
||||
|
||||
def _extract_first_section_between_h2(cleaned_html: str) -> str:
    """Return the raw HTML that follows the first ``<h2>`` element.

    The slice ends where the next ``<h2>`` element starts, or at the end
    of *cleaned_html* when there is no second heading.  An empty string
    is returned when the document contains no ``<h2>`` element at all.
    """
    opening = H2_TAG_RE.search(cleaned_html)
    if opening is None:
        return ""

    section_start = opening.end()
    # Resume scanning right after the first heading to find the boundary.
    closing = H2_TAG_RE.search(cleaned_html, section_start)
    if closing is None:
        return cleaned_html[section_start:]
    return cleaned_html[section_start:closing.start()]
|
||||
|
||||
|
||||
def _extract_og_description(html: str) -> str:
    """Return the ``og:description`` meta content, or ``""`` when absent.

    The property-first pattern is tried before the content-first one.
    The captured value is HTML-unescaped and stripped of surrounding
    whitespace before being returned.
    """
    for pattern in (OG_DESCRIPTION_RE, OG_DESCRIPTION_REVERSED_RE):
        found = pattern.search(html)
        if found is not None:
            return unescape(found.group(1)).strip()
    return ""
|
||||
|
||||
|
||||
def _html_fragment_to_text(fragment: str) -> str:
|
||||
text = BR_RE.sub("\n", fragment)
|
||||
text = BLOCK_END_RE.sub("\n", text)
|
||||
text = TAG_RE.sub("", text)
|
||||
text = unescape(text)
|
||||
text = EDIT_AFFORDANCE_RE.sub("", text)
|
||||
text = SECTION_NUMBER_PREFIX_RE.sub("", text)
|
||||
lines: list[str] = []
|
||||
for line in text.split("\n"):
|
||||
stripped = WHITESPACE_RE.sub(" ", line).strip()
|
||||
|
|
@ -87,19 +116,34 @@ def _html_fragment_to_text(fragment: str) -> str:
|
|||
return BLANK_LINES_RE.sub("\n\n", joined).strip()
|
||||
|
||||
|
||||
def extract_summary(html: str, *, max_length: int = DEFAULT_MAX_LENGTH) -> str:
|
||||
cleaned = SCRIPT_STYLE_RE.sub("", html)
|
||||
region = _find_main_content(cleaned)
|
||||
if not region:
|
||||
return ""
|
||||
text = _html_fragment_to_text(region)
|
||||
if not text:
|
||||
return ""
|
||||
def _truncate(text: str, max_length: int) -> str:
|
||||
if max_length > 0 and len(text) > max_length:
|
||||
return text[:max_length] + "..."
|
||||
return text
|
||||
|
||||
|
||||
def extract_summary(html: str, *, max_length: int = DEFAULT_MAX_LENGTH) -> str:
    """Return a plain-text summary extracted from *html*, or ``""``.

    Strategies are tried in order and the first one that yields non-empty
    text wins:

    1. the HTML between the first and second ``<h2>`` elements,
    2. the region returned by ``_find_main_content``,
    3. the ``og:description`` meta content of the unmodified *html*.

    The first two operate on *html* with ``SCRIPT_STYLE_RE`` matches
    removed and convert their fragment with ``_html_fragment_to_text``.
    The winning text is passed through ``_truncate`` with *max_length*.
    """
    cleaned = SCRIPT_STYLE_RE.sub("", html)

    # Locators are called lazily, so the class-based lookup only runs when
    # the h2-based one produced no usable text.
    for locate in (_extract_first_section_between_h2, _find_main_content):
        fragment = locate(cleaned)
        if not fragment:
            continue
        body = _html_fragment_to_text(fragment)
        if body:
            return _truncate(body, max_length)

    fallback = _extract_og_description(html)
    return _truncate(fallback, max_length) if fallback else ""
|
||||
|
||||
|
||||
def _is_url(value: str) -> bool:
|
||||
return value.startswith("http://") or value.startswith("https://")
|
||||
|
||||
|
|
|
|||
|
|
@ -409,6 +409,36 @@ class LookupParsingTest(unittest.TestCase):
|
|||
</html>
|
||||
"""
|
||||
|
||||
HTML_CURRENT_NAMUWIKI = """
|
||||
<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>중요한 것은 꺾이지 않는 마음 - 나무위키</title>
|
||||
<meta property="og:description" content="RGE전 패배는 괜찮다. 중요한 것은 꺾이지 않는 마음">
|
||||
</head>
|
||||
<body>
|
||||
<div class="_36R8DWTn">
|
||||
<h1 class="_2HZC0kyI"><a href="/w/test" class="kPIqc4b-"><span>중요한 것은 꺾이지 않는 마음</span></a></h1>
|
||||
<div class="RW63SZFE">최근 수정 시각: 2026-03-29 13:14:18</div>
|
||||
<div class="W6XTddIf">
|
||||
<span><a href="/star">별표</a></span>
|
||||
<span><a href="/edit">편집 요청</a></span>
|
||||
</div>
|
||||
<h2 class="_sectionHeading"><span>1. 개요</span><a class="edit-link">[편집]</a></h2>
|
||||
<div class="_sectionBody">
|
||||
<p>'중요한 것은 꺾이지 않는 마음'은 리그 오브 레전드 2022 월드 챔피언십에 참가한 프로게임단
|
||||
DRX 소속 프로게이머 김혁규(Deft) 선수의 인터뷰를 담은 영상의 제목에서 유래된 유행어다.</p>
|
||||
<p>포기하지 않는 불굴의 의지를 의미한다.</p>
|
||||
</div>
|
||||
<h2 class="_sectionHeading"><span>2. 발생 양상</span><a class="edit-link">[편집]</a></h2>
|
||||
<div class="_sectionBody">
|
||||
<p>2022년 LoL 월드 챔피언십에서 DRX가 디펜딩 챔피언 T1을 꺾고 우승하며 회자되었다.</p>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
def test_extract_title_strips_namuwiki_suffix(self) -> None:
|
||||
title = slang_lookup.extract_title(self.HTML_SAMPLE)
|
||||
self.assertEqual(title, "중꺾마")
|
||||
|
|
@ -433,6 +463,82 @@ class LookupParsingTest(unittest.TestCase):
|
|||
summary = slang_lookup.extract_summary("<html><body></body></html>", max_length=1500)
|
||||
self.assertEqual(summary, "")
|
||||
|
||||
def test_extract_summary_uses_h2_section_boundaries_on_current_namuwiki_layout(
    self,
) -> None:
    """Only the text between the first two h2 headings may be summarised.

    Uses the fixture whose class names are obfuscated: overview text must
    be present, while page chrome and the second section must be absent.
    """
    summary = slang_lookup.extract_summary(
        self.HTML_CURRENT_NAMUWIKI, max_length=2000
    )

    overview_fragments = (
        "중요한 것은 꺾이지 않는 마음",
        "DRX",
        "포기하지 않는 불굴의 의지",
    )
    for fragment in overview_fragments:
        self.assertIn(fragment, summary)

    outside_fragments = (
        "T1을 꺾고 우승",
        "최근 수정 시각",
        "편집 요청",
        "별표",
    )
    for fragment in outside_fragments:
        self.assertNotIn(fragment, summary)
|
||||
|
||||
def test_extract_summary_strips_section_heading_edit_affordances(self) -> None:
    """Edit markers and "N." heading numbering must not appear in the summary."""
    summary = slang_lookup.extract_summary(
        self.HTML_CURRENT_NAMUWIKI, max_length=2000
    )
    for leaked in ("[편집]", "1. 개요"):
        self.assertNotIn(leaked, summary)
|
||||
|
||||
def test_extract_summary_falls_back_to_og_description_when_no_h2_or_classes(
    self,
) -> None:
    """With no h2 and no known content class, og:description is the summary."""
    page = """
    <html>
    <head>
    <title>럭키비키 - 나무위키</title>
    <meta property="og:description" content="완전 럭키비키잖아~! 장원영 IVE 의 멤버 장원영 의 발언에서 유래한 초긍정적 마인드를 표현하는 인터넷 밈.">
    </head>
    <body>
    <div class="obfuscated-x1y2z3">navigation chrome only, no real body.</div>
    </body>
    </html>
    """
    summary = slang_lookup.extract_summary(page, max_length=500)

    for expected in ("럭키비키", "장원영"):
        self.assertIn(expected, summary)
    # Neither entity residue nor markup may survive in the fallback text.
    for forbidden in ("&", "<"):
        self.assertNotIn(forbidden, summary)
|
||||
|
||||
def test_extract_summary_handles_single_h2_page(self) -> None:
    """A page with a single h2 must still yield every paragraph after it."""
    page = """
    <html><head><title>짧은유행어 - 나무위키</title></head>
    <body>
    <h1>짧은유행어</h1>
    <h2>1. 개요[편집]</h2>
    <p>이 유행어는 짧은 설명을 가진 유행어이다.</p>
    <p>두 번째 문단도 포함되어야 한다.</p>
    </body></html>
    """
    summary = slang_lookup.extract_summary(page, max_length=2000)

    for expected in ("짧은 설명", "두 번째 문단"):
        self.assertIn(expected, summary)
|
||||
|
||||
def test_extract_summary_prefers_h2_strategy_over_class_strategy(self) -> None:
    """When both anchors exist, the h2-bounded section wins over the class match."""
    page = """
    <html><head><title>test - 나무위키</title></head>
    <body>
    <div class="wiki-paragraph">navigation sidebar noise goes here.</div>
    <h2>1. 개요[편집]</h2>
    <p>정확한 개요 본문입니다.</p>
    <h2>2. 상세[편집]</h2>
    <p>상세 섹션은 제외되어야 합니다.</p>
    </body></html>
    """
    summary = slang_lookup.extract_summary(page, max_length=2000)

    self.assertIn("정확한 개요 본문", summary)
    for excluded in ("navigation sidebar noise", "상세 섹션은 제외되어야 합니다"):
        self.assertNotIn(excluded, summary)
|
||||
|
||||
|
||||
class LookupNetworkTest(unittest.TestCase):
|
||||
def test_lookup_returns_structured_result_on_success(self) -> None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue