Fix JobKorea fallback row parsing

This commit is contained in:
wbjung 2026-06-18 00:04:07 +09:00
commit caa1f0fd0d
2 changed files with 160 additions and 11 deletions

View file

@ -177,6 +177,63 @@ def clean_text(value: str) -> str:
return value.strip()
# Matches a label that consists of nothing but a UI action control (scrap
# counter, save/close, position offer, memo, profile/resume view, expand/
# collapse, previous/next). The scrap counter's digits are optional here, so a
# bare "스크랩" also matches.
ACTION_CONTROL_RE = re.compile(
r"^(?:스크랩\s*\d*|저장하기|닫기|포지션\s*제안|메모하기|프로필\s*확인|이력서\s*확인|펼쳐보기|접기|이전|다음)$"
)
# Unanchored variant used to cut control labels out of longer text. Unlike
# ACTION_CONTROL_RE, "스크랩" must be followed by at least one digit, so the
# bare word inside ordinary text is left alone.
# NOTE(review): there are no word boundaries, so short labels such as
# "이전"/"다음"/"닫기" also match inside longer words -- confirm this is
# acceptable for the summaries this is applied to.
ACTION_CONTROL_INLINE_RE = re.compile(
r"(?:스크랩\s*\d+|저장하기|닫기|포지션\s*제안|메모하기|프로필\s*확인|이력서\s*확인|펼쳐보기|접기|이전|다음)"
)
def is_action_control_label(value: str) -> bool:
    """Report whether *value* is nothing but an action-control label.

    HTML entities are unescaped and every whitespace run is collapsed to a
    single space before the text is tested against ``ACTION_CONTROL_RE``.
    Empty or whitespace-only input is never a control label.
    """
    normalized = re.sub(r"\s+", " ", html.unescape(value)).strip()
    if not normalized:
        return False
    return ACTION_CONTROL_RE.match(normalized) is not None
def filter_action_control_text(value: str) -> str:
    """Strip action-control labels from multi-line text.

    Each line is handled independently: blank lines and lines that are purely
    a control label are dropped; in the remaining lines every inline control
    label is replaced by a space and whitespace is then collapsed. Lines left
    empty by that removal are dropped as well.

    Returns the surviving lines joined with newlines.
    """
    kept: list[str] = []
    for raw_line in value.splitlines():
        stripped = raw_line.strip()
        if stripped and not is_action_control_label(stripped):
            without_controls = ACTION_CONTROL_INLINE_RE.sub(" ", stripped)
            collapsed = re.sub(r"\s+", " ", without_controls).strip()
            if collapsed:
                kept.append(collapsed)
    return "\n".join(kept).strip()
def row_contains_other_resume(candidate_markup: str, rno: str) -> bool:
    """Return True if *candidate_markup* references a resume other than *rno*.

    Resume references are collected from two places:

    * ``rNo=<digits>`` query parameters, and
    * ``data-rno`` attributes with a single- or double-quoted numeric value.

    Args:
        candidate_markup: HTML fragment believed to describe one candidate.
        rno: Resume number the fragment is expected to belong to.

    Returns:
        True when at least one collected reference differs from *rno*;
        False when every reference equals *rno* or none is found.
    """
    # The lookbehind keeps "rNo=" from matching the tail of a longer parameter
    # name (e.g. "memberNo=", "userNo="), which would otherwise be mistaken
    # for a reference to a different resume.
    found = re.findall(
        r"(?<![A-Za-z0-9_])rNo=(\d+)|data-rno=[\"'](\d+)[\"']",
        candidate_markup,
    )
    # Exactly one of the two groups is non-empty for each match.
    return any((href_rno or data_rno) != rno for href_rno, data_rno in found)
def extract_regex_candidate_markup(markup: str, match: re.Match[str], rno: str) -> str:
    """Return the slice of *markup* most likely to describe only resume *rno*.

    Three strategies are tried in order:

    1. The ``<tr>`` that opens before the match, if its opening tag carries a
       ``data-rno`` attribute equal to *rno* (single or double quoted) and a
       closing ``</tr>`` follows the match.
    2. The ``<div class="booth"`` block that opens before the match, ending at
       the next booth or ``</section>`` (or 2500 characters past the match
       when neither exists), provided it references no other resume.
    3. A fixed window from 300 characters before the match to 1200 after it.

    Args:
        markup: Full page markup.
        match: Regex match of the resume link inside *markup*.
        rno: Resume number captured by *match*.

    Returns:
        The selected markup fragment (never longer than *markup*).
    """
    row_start = markup.rfind("<tr", 0, match.start())
    if row_start >= 0:
        row_open_end = markup.find(">", row_start, match.start())
        row_end = markup.find("</tr>", match.end())
        row_open = markup[row_start : row_open_end + 1] if row_open_end >= 0 else ""
        # Accept either quote style, matching row_contains_other_resume; rno is
        # escaped so it is always compared literally.
        owns_row = re.search(rf"data-rno=[\"']{re.escape(rno)}[\"']", row_open)
        if row_end >= 0 and owns_row:
            return markup[row_start : row_end + len("</tr>")]

    booth_start = markup.rfind('<div class="booth"', 0, match.start())
    if booth_start >= 0:
        next_booth = markup.find('<div class="booth"', match.end())
        section_end = markup.find("</section>", match.end())
        end_candidates = [pos for pos in (next_booth, section_end) if pos >= 0]
        if end_candidates:
            booth_end = min(end_candidates)
        else:
            booth_end = min(len(markup), match.end() + 2500)
        booth = markup[booth_start:booth_end]
        # A booth that mentions another resume is too broad to trust.
        if not row_contains_other_resume(booth, rno):
            return booth

    # Last resort: a bounded window around the link itself.
    start = max(0, match.start() - 300)
    end = min(len(markup), match.end() + 1200)
    return markup[start:end]
def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
try:
from bs4 import BeautifulSoup # type: ignore
@ -197,14 +254,32 @@ def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
continue
seen.add(rno)
container = link.find_parent(class_=re.compile(r"booth|list|row|person", re.I)) or link.parent
container = (
link.find_parent("tr", attrs={"data-rno": rno})
or link.find_parent(class_=re.compile(r"(^|\s)booth(\s|$)", re.I))
or link.parent
)
if container and row_contains_other_resume(str(container), rno):
# Broad ancestors such as tblSearchList/personList can contain several resumes.
# Falling back to the link itself is safer than mixing candidate evidence.
container = link.parent
raw = clean_text(str(container)) if container else clean_text(str(link))
texts = [x.get_text(" ", strip=True) for x in (container.find_all(["dt", "dd", "p", "span", "button"]) if container else [])]
text_join = " | ".join(t for t in texts if t)
texts = []
for node in container.find_all(["dt", "dd", "p", "span", "li"]) if container else []:
label = node.get_text(" ", strip=True)
if label and not is_action_control_label(label):
texts.append(label)
for btn in container.select(".keywordSkill button, .keywordBox button") if container else []:
label = btn.get_text(" ", strip=True)
if label and not is_action_control_label(label):
texts.append(label)
text_join = " | ".join(dict.fromkeys(texts))
name = ""
meta = ""
dt = container.find("dt") if container else None
name_scope = container.select_one(".nameAge") if container else None
dt = (name_scope or container).find("dt") if container else None
if dt:
name = dt.get_text(" ", strip=True)
dd = dt.find_next("dd")
@ -217,9 +292,9 @@ def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
meta = "(" + m_name.group(2) + ")"
skills = []
for btn in container.select("button") if container else []:
for btn in container.select(".keywordSkill button, .keywordBox button") if container else []:
label = btn.get_text(" ", strip=True)
if label and label not in {"스크랩", "포지션 제안", "메모하기", "프로필 확인", "이력서 확인"}:
if label and not is_action_control_label(label):
skills.append(label)
candidates.append(
@ -230,7 +305,7 @@ def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
meta=meta,
career=(container.select_one(".career").get_text(" ", strip=True) if container and container.select_one(".career") else ""),
skills=", ".join(skills[:25]),
raw_summary=text_join[:1000] or raw[:1000],
raw_summary=filter_action_control_text(text_join[:1000] or raw[:1000]),
)
)
if len(candidates) >= limit:
@ -246,9 +321,8 @@ def parse_with_regex(markup: str, limit: int) -> list[Candidate]:
if rno in seen:
continue
seen.add(rno)
start = max(0, m.start() - 1000)
end = min(len(markup), m.end() + 2500)
raw = clean_text(markup[start:end])
raw_markup = extract_regex_candidate_markup(markup, m, rno)
raw = clean_text(raw_markup)
name = ""
meta = ""
nm = re.search(r"([가-힣A-Za-z]OO)\s*\(([^)]*)\)", raw)
@ -261,7 +335,7 @@ def parse_with_regex(markup: str, limit: int) -> list[Candidate]:
url=urllib.parse.urljoin(BASE_URL, m.group("href")),
name=name,
meta=meta,
raw_summary=raw[:1000],
raw_summary=filter_action_control_text(raw[:1000]),
)
)
if len(candidates) >= limit:

View file

@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""Fixture tests for JobKorea public fallback parsing."""
from __future__ import annotations
import importlib.util
import sys
import unittest
from pathlib import Path
# Load the script under test by file path: it is expected to sit next to this
# test module under the name "jobkorea_talent_search.py".
SCRIPT = Path(__file__).with_name("jobkorea_talent_search.py")
spec = importlib.util.spec_from_file_location("jobkorea_talent_search", SCRIPT)
assert spec is not None
helper = importlib.util.module_from_spec(spec)
# Register the module in sys.modules before executing it, as in the importlib
# "importing a source file directly" recipe.
sys.modules["jobkorea_talent_search"] = helper
assert spec.loader is not None
spec.loader.exec_module(helper)
# Search-result table with two resume rows: resume 111 (including a tdAction
# cell full of action buttons) followed by resume 222. The test below uses it
# to check that nothing from row 222, and no action-button label, ends up in
# the first candidate's summary.
FALLBACK_FIXTURE = """
<section class="searchList">
<table class="tblSearchList">
<tbody>
<tr class="dvResumeTr" data-rno="111">
<td class="tdProfile">
<dl class="nameAge"><dt><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=111" data-rno="111">김OO</a></dt><dd>(, 29)</dd></dl>
<ul class="bullList"><li>25분전 공고 스크랩</li></ul>
</td>
<td class="tdSummary">
<div class="userInfoBox">
<span class="career">경력 4</span>
<p class="title"><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=111" data-rno="111">퍼포먼스 마케터</a></p>
<div class="keywordSkill keywordBox">
<button type="button" class="js-kwrdSearch">Google Analytics</button>
<button type="button" class="js-kwrdSearch">GA4</button>
</div>
</div>
</td>
<td class="tdAction">
<button>스크랩 1</button><button>이력서 확인</button><button>포지션 제안</button><button>메모하기</button><button>저장하기</button><button>닫기</button>
</td>
</tr>
<tr class="dvResumeTr" data-rno="222">
<td class="tdProfile">
<dl class="nameAge"><dt><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=222" data-rno="222">박OO</a></dt><dd>(, 31)</dd></dl>
</td>
<td class="tdSummary">
<span class="career">경력 6</span>
<p class="title"><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=222" data-rno="222">브랜드 마케터</a></p>
<div class="keywordSkill keywordBox"><button type="button" class="js-kwrdSearch">브랜딩</button></div>
</td>
</tr>
</tbody>
</table>
</section>
"""
class JobKoreaFallbackParserTest(unittest.TestCase):
    """Row-isolation checks for the parser, driven by ``FALLBACK_FIXTURE``."""

    def test_parser_keeps_each_candidate_inside_its_own_row(self) -> None:
        """The first candidate's data must come from its own table row only."""
        parsed = helper.parse_candidates(FALLBACK_FIXTURE, 10)
        self.assertEqual([item.rno for item in parsed], ["111", "222"])

        first = parsed[0]
        self.assertEqual(first.name, "김OO")

        # Skills from the candidate's own row must be present.
        for expected in ("Google Analytics", "GA4"):
            self.assertIn(expected, first.raw_summary)

        # Neither the neighbouring row's data nor action-button labels may leak in.
        for leaked in ("박OO", "브랜딩", "저장하기", "닫기", "포지션 제안", "이력서 확인"):
            self.assertNotIn(leaked, first.raw_summary)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()