korean-slang-writing (#133): harden extractor with numbered-h2 gate + category-nav strip

Addresses the three non-blocking observations from the PR #161 round-3 review: 1. Numbered-h2 gate (reviewer-flagged fragility): Refactored _extract_first_section_between_h2 to extract each h2's inner text (stripping nested tags) and filter it by '^\s*\d+(?:\.\d+)*\.\s+\S'. Sidebar widgets like <h2>관련 문서</h2> or <h2>외부 링크</h2> can no longer anchor the extractor — only numbered section headers (1., 1.2., 2.3.4.) do. This handles the live Namu Wiki structure where the number sits inside an <a> tag (<a>1.</a> <span>개요</span>), which the regex-only gate suggested in round 3 missed. All 29 seed pages continue to produce valid summaries on live fetches. 2. Category-nav template strip (reviewer-flagged long-page noise): a. CATEGORY_NAV_RE strips the inline '[펼치기 · 접기]' marker plus its same-line aftermath (the category list items on the same line). b. DETAILS_PELCHIGI_RE strips the entire <details> block whose <summary> contains 펼치기. Namu Wiki today wraps category nav in exactly this structure, so the strip removes the full noise block (not just the marker line). The 꿀잼 summary drops from 3482 chars of category dump to 562 chars, starting with the real definition '무언가가 매우 재미있다는 의미의 인터넷 유행어'. Non-category <details> blocks (spoilers, footnotes) are preserved. 3. TDD + mutation coverage: 6 new tests in total — 2 numbered-h2 gate tests, 2 inline category-nav tests, 1 <details>-block strip test, 1 <details>-keep test (negative case). All 6 were written first and confirmed RED against the round-2 baseline, then made GREEN after the implementation landed. Each fix path was also mutation-tested (reverting the regex, removing the .sub line) to confirm the tests genuinely catch the target bug class. The suite grows from 45 to 51 tests, all passing; npm run ci exits 0.
2026-06-24 02:04:11 +00:00 · 2026-04-22 14:18:42 +09:00 · 2026-04-22 14:18:42 +09:00 · cc91e55682
commit cc91e55682
parent 4f31dae11f
2 changed files with 183 additions and 4 deletions
--- a/korean-slang-writing/scripts/slang_lookup.py
+++ b/korean-slang-writing/scripts/slang_lookup.py
@ -33,8 +33,17 @@ BR_RE = re.compile(r"<br\s*/?>", re.IGNORECASE)
 WHITESPACE_RE = re.compile(r"[ \t]+")
 BLANK_LINES_RE = re.compile(r"\n{3,}")
 H2_TAG_RE = re.compile(r"<h2\b[^>]*>.*?</h2>", re.DOTALL | re.IGNORECASE)
+NUMBERED_H2_INNER_TEXT_RE = re.compile(r"^\s*\d+(?:\.\d+)*\.\s+\S")
 SECTION_NUMBER_PREFIX_RE = re.compile(r"^\s*\d+(?:\.\d+)*\.\s+", re.MULTILINE)
 EDIT_AFFORDANCE_RE = re.compile(r"\[\s*편집\s*\]")
+CATEGORY_NAV_RE = re.compile(r"\[\s*펼치기\s*[·・•]\s*접기\s*\][^\n]*")
+DETAILS_PELCHIGI_RE = re.compile(
+    r"<details\b[^>]*>"
+    r"\s*<summary\b[^>]*>[^<]*펼치기[^<]*</summary>"
+    r".*?"
+    r"</details>",
+    re.DOTALL | re.IGNORECASE,
+)
 OG_DESCRIPTION_RE = re.compile(
    r'<meta\s+[^>]*property\s*=\s*"og:description"\s+[^>]*content\s*=\s*"([^"]*)"',
    re.IGNORECASE,
@ -84,12 +93,24 @@ def _find_main_content(cleaned_html: str) -> str:
    return ""


+def _h2_inner_text(h2_tag_html: str) -> str:
+    opening_end = h2_tag_html.index(">") + 1
+    closing_start = h2_tag_html.rindex("<")
+    inner = h2_tag_html[opening_end:closing_start]
+    return unescape(TAG_RE.sub("", inner)).strip()
+
+
+def _is_numbered_section_h2(h2_tag_html: str) -> bool:
+    return bool(NUMBERED_H2_INNER_TEXT_RE.match(_h2_inner_text(h2_tag_html)))
+
+
 def _extract_first_section_between_h2(cleaned_html: str) -> str:
-    matches = list(H2_TAG_RE.finditer(cleaned_html))
-    if not matches:
+    all_matches = list(H2_TAG_RE.finditer(cleaned_html))
+    numbered = [m for m in all_matches if _is_numbered_section_h2(m.group(0))]
+    if not numbered:
        return ""
-    start = matches[0].end()
-    end = matches[1].start() if len(matches) > 1 else len(cleaned_html)
+    start = numbered[0].end()
+    end = numbered[1].start() if len(numbered) > 1 else len(cleaned_html)
    return cleaned_html[start:end]


@ -106,6 +127,7 @@ def _html_fragment_to_text(fragment: str) -> str:
    text = TAG_RE.sub("", text)
    text = unescape(text)
    text = EDIT_AFFORDANCE_RE.sub("", text)
+    text = CATEGORY_NAV_RE.sub("", text)
    text = SECTION_NUMBER_PREFIX_RE.sub("", text)
    lines: list[str] = []
    for line in text.split("\n"):
@ -124,6 +146,7 @@ def _truncate(text: str, max_length: int) -> str:

 def extract_summary(html: str, *, max_length: int = DEFAULT_MAX_LENGTH) -> str:
    cleaned = SCRIPT_STYLE_RE.sub("", html)
+    cleaned = DETAILS_PELCHIGI_RE.sub("", cleaned)

    h2_section = _extract_first_section_between_h2(cleaned)
    if h2_section:
--- a/scripts/test_korean_slang_writing.py
+++ b/scripts/test_korean_slang_writing.py
@ -539,6 +539,162 @@ class LookupParsingTest(unittest.TestCase):
        self.assertNotIn("navigation sidebar noise", summary)
        self.assertNotIn("상세 섹션은 제외되어야 합니다", summary)

+    def test_extract_summary_ignores_h2_without_numbered_section_prefix(
+        self,
+    ) -> None:
+        """Sidebar/nav ``<h2>`` widgets without a numbered section prefix
+        (``<h2>관련 문서</h2>``, ``<h2>외부 링크</h2>`` etc.) MUST NOT be treated
+        as section boundaries. When no numbered h2 is present, the extractor
+        falls through to the class-based tier.
+        """
+        html = """
+        <html><head><title>test - 나무위키</title></head>
+        <body>
+          <h2>관련 문서</h2>
+          <div class="navigation-sidebar-chrome">unrelated sidebar body</div>
+          <h2>바로가기</h2>
+          <div class="wiki-paragraph">
+            <p>실제 본문은 여기에 있습니다.</p>
+          </div>
+        </body></html>
+        """
+        summary = slang_lookup.extract_summary(html, max_length=2000)
+        self.assertIn("실제 본문", summary)
+        self.assertNotIn("unrelated sidebar body", summary)
+        self.assertNotIn("관련 문서", summary)
+        self.assertNotIn("바로가기", summary)
+
+    def test_extract_summary_numbered_h2_gate_skips_sidebar_h2_before_section_one(
+        self,
+    ) -> None:
+        """Regression for the reviewer-flagged edge case: a sidebar-style
+        ``<h2>관련 문서</h2>`` placed BEFORE the section ``<h2>1. 개요</h2>``
+        must not anchor the extractor. Only numbered section headers
+        (``\\d+(?:\\.\\d+)*\\.\\s``) can act as section boundaries.
+        """
+        html = """
+        <html><head><title>test - 나무위키</title></head>
+        <body>
+          <h2>관련 문서</h2>
+          <ul><li>link1</li><li>link2</li></ul>
+          <h2>1. 개요[편집]</h2>
+          <p>진짜 개요 본문입니다.</p>
+          <h2>2. 상세[편집]</h2>
+          <p>상세 섹션은 제외됩니다.</p>
+        </body></html>
+        """
+        summary = slang_lookup.extract_summary(html, max_length=2000)
+        self.assertIn("진짜 개요 본문", summary)
+        self.assertNotIn("link1", summary)
+        self.assertNotIn("link2", summary)
+        self.assertNotIn("상세 섹션은 제외됩니다", summary)
+
+    def test_extract_summary_strips_category_nav_template_markers(self) -> None:
+        """Namu Wiki inline category nav templates render as
+        ``[펼치기 · 접기] item · item · item`` inline on one line. The marker
+        itself AND the trailing category items on the same line (its "aftermath")
+        must both be stripped so the agent sees the real prose.
+        """
+        html = """
+        <html><head><title>꿀잼 - 나무위키</title></head>
+        <body>
+          <h2>1. 개요[편집]</h2>
+          <p>문화 및 유행어 [펼치기 · 접기] 밈 모음 (ㄱ항목 · ㄴ항목 · 꿀잼 · ㄹ항목)</p>
+          <p>꿀잼은 '꿀'과 '재미'의 합성어로, 정말 재미있을 때 사용하는 유행어이다.</p>
+          <h2>2. 상세[편집]</h2>
+        </body></html>
+        """
+        summary = slang_lookup.extract_summary(html, max_length=2000)
+        self.assertNotIn("[펼치기 · 접기]", summary)
+        self.assertNotIn("ㄱ항목", summary)
+        self.assertNotIn("ㄹ항목", summary)
+        self.assertNotIn("밈 모음", summary)
+        self.assertIn("꿀잼은", summary)
+        self.assertIn("재미있을 때", summary)
+
+    def test_extract_summary_category_nav_strip_preserves_surrounding_content(
+        self,
+    ) -> None:
+        """Category-nav stripping must only affect the marker-containing line.
+        Content on *other* lines (both before and after) must be preserved.
+        """
+        html = """
+        <html><head><title>test - 나무위키</title></head>
+        <body>
+          <h2>1. 개요[편집]</h2>
+          <p>도입문입니다. 중요한 소개 문장.</p>
+          <p>분류 [펼치기 · 접기] 카테고리A · 카테고리B · 카테고리C</p>
+          <p>이 문단은 반드시 보존되어야 합니다.</p>
+          <h2>2. 상세[편집]</h2>
+        </body></html>
+        """
+        summary = slang_lookup.extract_summary(html, max_length=2000)
+        self.assertIn("도입문입니다", summary)
+        self.assertIn("중요한 소개 문장", summary)
+        self.assertIn("이 문단은 반드시 보존", summary)
+        self.assertNotIn("[펼치기 · 접기]", summary)
+        self.assertNotIn("카테고리A", summary)
+        self.assertNotIn("카테고리C", summary)
+
+    def test_extract_summary_strips_details_block_wrapping_pelchigi_summary(
+        self,
+    ) -> None:
+        """Live Namu Wiki wraps category-nav templates in a ``<details>`` block
+        whose ``<summary>`` label is ``[펼치기 · 접기]``. The entire ``<details>``
+        block (summary + all its body rows/cells) must be stripped, not just
+        the marker line, so multi-line category dumps don't survive into the
+        agent-visible summary.
+        """
+        html = """
+        <html><head><title>꿀잼 - 나무위키</title></head>
+        <body>
+          <h2>1. 개요[편집]</h2>
+          <div class="nav-wrapper">
+            <details class="cat-nav">
+              <summary>[펼치기 · 접기]</summary>
+              <div>문화 및 유행어</div>
+              <div>기타</div>
+              <div>item1 · item2 · item3</div>
+              <div>ㄱ</div>
+              <div>가놈 · 가성비 댓글</div>
+            </details>
+          </div>
+          <p>무언가가 매우 재미있다는 의미의 인터넷 유행어이다.</p>
+          <h2>2. 상세[편집]</h2>
+        </body></html>
+        """
+        summary = slang_lookup.extract_summary(html, max_length=2000)
+        self.assertNotIn("[펼치기 · 접기]", summary)
+        self.assertNotIn("문화 및 유행어", summary)
+        self.assertNotIn("item1", summary)
+        self.assertNotIn("가놈", summary)
+        self.assertNotIn("가성비 댓글", summary)
+        self.assertIn("매우 재미있다는 의미", summary)
+        self.assertIn("인터넷 유행어", summary)
+
+    def test_extract_summary_keeps_details_block_without_pelchigi_summary(
+        self,
+    ) -> None:
+        """``<details>`` blocks whose ``<summary>`` does NOT contain ``펼치기``
+        (e.g. spoilers, footnotes) must be preserved — only the specific
+        category-nav pattern is stripped.
+        """
+        html = """
+        <html><head><title>test - 나무위키</title></head>
+        <body>
+          <h2>1. 개요[편집]</h2>
+          <details>
+            <summary>스포일러 주의</summary>
+            <p>이 내용은 스포일러 정보를 포함합니다.</p>
+          </details>
+          <p>일반 본문도 있습니다.</p>
+          <h2>2. 상세[편집]</h2>
+        </body></html>
+        """
+        summary = slang_lookup.extract_summary(html, max_length=2000)
+        self.assertIn("스포일러", summary)
+        self.assertIn("일반 본문", summary)
+

 class LookupNetworkTest(unittest.TestCase):
    def test_lookup_returns_structured_result_on_success(self) -> None: