mirror of
https://github.com/NomaDamas/k-skill.git
synced 2026-06-24 02:04:11 +00:00
Fix JobKorea fallback row parsing
This commit is contained in:
parent
c619d3b7c7
commit
caa1f0fd0d
2 changed files with 160 additions and 11 deletions
|
|
@ -177,6 +177,63 @@ def clean_text(value: str) -> str:
|
|||
return value.strip()
|
||||
|
||||
|
||||
# Alternation shared by both patterns below. Keeping it in one place means the
# two label lists cannot drift apart when a control label is added or removed.
_ACTION_CONTROL_LABELS = (
    r"저장하기|닫기|포지션\s*제안|메모하기|프로필\s*확인|이력서\s*확인|펼쳐보기|접기|이전|다음"
)

# Matches text that consists solely of one action-control label.
# The scrap label may be bare or followed by a count (``\d*``).
ACTION_CONTROL_RE = re.compile(r"^(?:스크랩\s*\d*|" + _ACTION_CONTROL_LABELS + r")$")

# Matches an action-control label anywhere inside longer text.
# Here the scrap label needs at least one digit (``\d+``), so a bare "스크랩"
# embedded in other text is left alone.
ACTION_CONTROL_INLINE_RE = re.compile(r"(?:스크랩\s*\d+|" + _ACTION_CONTROL_LABELS + r")")
|
||||
|
||||
|
||||
def is_action_control_label(value: str) -> bool:
    """Return True when *value* is nothing but a single action-control label.

    HTML entities are decoded and whitespace runs are collapsed to one space
    before the text is tested against ``ACTION_CONTROL_RE``. Text that is
    empty after normalisation is never a label.
    """
    normalized = re.sub(r"\s+", " ", html.unescape(value)).strip()
    if not normalized:
        return False
    return ACTION_CONTROL_RE.match(normalized) is not None
|
||||
|
||||
|
||||
def filter_action_control_text(value: str) -> str:
    """Remove action-control labels from *value*, line by line.

    A line is dropped when it is blank or consists solely of an action-control
    label. In every other line, inline labels matched by
    ``ACTION_CONTROL_INLINE_RE`` are replaced with a space and whitespace is
    collapsed; a line left empty by that is dropped too. The surviving lines
    are joined with newlines.
    """
    kept: list[str] = []
    for raw_line in value.splitlines():
        stripped = raw_line.strip()
        if stripped and not is_action_control_label(stripped):
            without_controls = ACTION_CONTROL_INLINE_RE.sub(" ", stripped)
            collapsed = re.sub(r"\s+", " ", without_controls).strip()
            if collapsed:
                kept.append(collapsed)
    return "\n".join(kept).strip()
|
||||
|
||||
|
||||
def row_contains_other_resume(candidate_markup: str, rno: str) -> bool:
    """Return True when *candidate_markup* references a resume other than *rno*.

    Resume numbers are read from ``rNo=<digits>`` occurrences and from quoted
    ``data-rno`` attributes (single or double quotes).
    """
    for found in re.finditer(r"rNo=(\d+)|data-rno=[\"'](\d+)[\"']", candidate_markup):
        # Exactly one of the two groups participates in any given match.
        referenced = found.group(1) or found.group(2)
        if referenced != rno:
            return True
    return False
|
||||
|
||||
|
||||
def extract_regex_candidate_markup(markup: str, match: re.Match[str], rno: str) -> str:
    """Return the slice of *markup* that belongs to the resume link at *match*.

    Three strategies are tried, from most to least precise:

    1. The enclosing ``<tr>...</tr>`` when its opening tag carries a
       ``data-rno`` attribute equal to *rno* (single or double quoted).
    2. The enclosing ``<div class="booth"`` block, cut at the next booth or
       ``</section>``, provided it references no other resume number.
    3. A fixed window of 300 characters before and 1200 after the match. This
       window is not checked for references to other resumes.

    Args:
        markup: Full HTML of the search result page.
        match: Regex match locating the resume link inside *markup*.
        rno: Resume number the link points at.

    Returns:
        The substring of *markup* selected by the first strategy that applies.
    """
    row_start = markup.rfind("<tr", 0, match.start())
    if row_start >= 0:
        row_open_end = markup.find(">", row_start, match.start())
        row_end = markup.find("</tr>", match.end())
        row_open = markup[row_start : row_open_end + 1] if row_open_end >= 0 else ""
        # Accept either quote style, matching row_contains_other_resume; a
        # double-quote-only check sent single-quoted rows to the loose window.
        owner_attr = r"data-rno=[\"']" + re.escape(rno) + r"[\"']"
        if row_end >= 0 and re.search(owner_attr, row_open):
            return markup[row_start : row_end + len("</tr>")]

    booth_start = markup.rfind('<div class="booth"', 0, match.start())
    if booth_start >= 0:
        next_booth = markup.find('<div class="booth"', match.end())
        section_end = markup.find('</section>', match.end())
        end_candidates = [pos for pos in (next_booth, section_end) if pos >= 0]
        # Without a closing landmark, cap the booth at 2500 characters past the link.
        booth_end = min(end_candidates) if end_candidates else min(len(markup), match.end() + 2500)
        booth = markup[booth_start:booth_end]
        if not row_contains_other_resume(booth, rno):
            return booth

    # Last resort: a fixed window around the link.
    start = max(0, match.start() - 300)
    end = min(len(markup), match.end() + 1200)
    return markup[start:end]
|
||||
|
||||
|
||||
def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
|
||||
try:
|
||||
from bs4 import BeautifulSoup # type: ignore
|
||||
|
|
@ -197,14 +254,32 @@ def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
|
|||
continue
|
||||
seen.add(rno)
|
||||
|
||||
container = link.find_parent(class_=re.compile(r"booth|list|row|person", re.I)) or link.parent
|
||||
container = (
|
||||
link.find_parent("tr", attrs={"data-rno": rno})
|
||||
or link.find_parent(class_=re.compile(r"(^|\s)booth(\s|$)", re.I))
|
||||
or link.parent
|
||||
)
|
||||
if container and row_contains_other_resume(str(container), rno):
|
||||
# Broad ancestors such as tblSearchList/personList can contain several resumes.
|
||||
# Falling back to the link itself is safer than mixing candidate evidence.
|
||||
container = link.parent
|
||||
|
||||
raw = clean_text(str(container)) if container else clean_text(str(link))
|
||||
texts = [x.get_text(" ", strip=True) for x in (container.find_all(["dt", "dd", "p", "span", "button"]) if container else [])]
|
||||
text_join = " | ".join(t for t in texts if t)
|
||||
texts = []
|
||||
for node in container.find_all(["dt", "dd", "p", "span", "li"]) if container else []:
|
||||
label = node.get_text(" ", strip=True)
|
||||
if label and not is_action_control_label(label):
|
||||
texts.append(label)
|
||||
for btn in container.select(".keywordSkill button, .keywordBox button") if container else []:
|
||||
label = btn.get_text(" ", strip=True)
|
||||
if label and not is_action_control_label(label):
|
||||
texts.append(label)
|
||||
text_join = " | ".join(dict.fromkeys(texts))
|
||||
|
||||
name = ""
|
||||
meta = ""
|
||||
dt = container.find("dt") if container else None
|
||||
name_scope = container.select_one(".nameAge") if container else None
|
||||
dt = (name_scope or container).find("dt") if container else None
|
||||
if dt:
|
||||
name = dt.get_text(" ", strip=True)
|
||||
dd = dt.find_next("dd")
|
||||
|
|
@ -217,9 +292,9 @@ def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
|
|||
meta = "(" + m_name.group(2) + ")"
|
||||
|
||||
skills = []
|
||||
for btn in container.select("button") if container else []:
|
||||
for btn in container.select(".keywordSkill button, .keywordBox button") if container else []:
|
||||
label = btn.get_text(" ", strip=True)
|
||||
if label and label not in {"스크랩", "포지션 제안", "메모하기", "프로필 확인", "이력서 확인"}:
|
||||
if label and not is_action_control_label(label):
|
||||
skills.append(label)
|
||||
|
||||
candidates.append(
|
||||
|
|
@ -230,7 +305,7 @@ def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
|
|||
meta=meta,
|
||||
career=(container.select_one(".career").get_text(" ", strip=True) if container and container.select_one(".career") else ""),
|
||||
skills=", ".join(skills[:25]),
|
||||
raw_summary=text_join[:1000] or raw[:1000],
|
||||
raw_summary=filter_action_control_text(text_join[:1000] or raw[:1000]),
|
||||
)
|
||||
)
|
||||
if len(candidates) >= limit:
|
||||
|
|
@ -246,9 +321,8 @@ def parse_with_regex(markup: str, limit: int) -> list[Candidate]:
|
|||
if rno in seen:
|
||||
continue
|
||||
seen.add(rno)
|
||||
start = max(0, m.start() - 1000)
|
||||
end = min(len(markup), m.end() + 2500)
|
||||
raw = clean_text(markup[start:end])
|
||||
raw_markup = extract_regex_candidate_markup(markup, m, rno)
|
||||
raw = clean_text(raw_markup)
|
||||
name = ""
|
||||
meta = ""
|
||||
nm = re.search(r"([가-힣A-Za-z]OO)\s*\(([^)]*)\)", raw)
|
||||
|
|
@ -261,7 +335,7 @@ def parse_with_regex(markup: str, limit: int) -> list[Candidate]:
|
|||
url=urllib.parse.urljoin(BASE_URL, m.group("href")),
|
||||
name=name,
|
||||
meta=meta,
|
||||
raw_summary=raw[:1000],
|
||||
raw_summary=filter_action_control_text(raw[:1000]),
|
||||
)
|
||||
)
|
||||
if len(candidates) >= limit:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,75 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Fixture tests for JobKorea public fallback parsing."""
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib.util
|
||||
import sys
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
# Load the script that sits next to this test file directly from its path.
SCRIPT = Path(__file__).parent / "jobkorea_talent_search.py"
spec = importlib.util.spec_from_file_location("jobkorea_talent_search", SCRIPT)
assert spec is not None
helper = importlib.util.module_from_spec(spec)
# Register the module before executing it — presumably so code in the script
# that looks its own module up in sys.modules during import can find it.
sys.modules["jobkorea_talent_search"] = helper
assert spec.loader is not None
spec.loader.exec_module(helper)
|
||||
|
||||
|
||||
FALLBACK_FIXTURE = """
|
||||
<section class="searchList">
|
||||
<table class="tblSearchList">
|
||||
<tbody>
|
||||
<tr class="dvResumeTr" data-rno="111">
|
||||
<td class="tdProfile">
|
||||
<dl class="nameAge"><dt><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=111" data-rno="111">김OO</a></dt><dd>(여, 만 29세)</dd></dl>
|
||||
<ul class="bullList"><li>25분전 공고 스크랩</li></ul>
|
||||
</td>
|
||||
<td class="tdSummary">
|
||||
<div class="userInfoBox">
|
||||
<span class="career">경력 4년</span>
|
||||
<p class="title"><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=111" data-rno="111">퍼포먼스 마케터</a></p>
|
||||
<div class="keywordSkill keywordBox">
|
||||
<button type="button" class="js-kwrdSearch">Google Analytics</button>
|
||||
<button type="button" class="js-kwrdSearch">GA4</button>
|
||||
</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="tdAction">
|
||||
<button>스크랩 1</button><button>이력서 확인</button><button>포지션 제안</button><button>메모하기</button><button>저장하기</button><button>닫기</button>
|
||||
</td>
|
||||
</tr>
|
||||
<tr class="dvResumeTr" data-rno="222">
|
||||
<td class="tdProfile">
|
||||
<dl class="nameAge"><dt><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=222" data-rno="222">박OO</a></dt><dd>(남, 만 31세)</dd></dl>
|
||||
</td>
|
||||
<td class="tdSummary">
|
||||
<span class="career">경력 6년</span>
|
||||
<p class="title"><a class="dvResumeLink" href="/corp/person/find/resume/view?rNo=222" data-rno="222">브랜드 마케터</a></p>
|
||||
<div class="keywordSkill keywordBox"><button type="button" class="js-kwrdSearch">브랜딩</button></div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</section>
|
||||
"""
|
||||
|
||||
|
||||
class JobKoreaFallbackParserTest(unittest.TestCase):
    """Fixture-driven checks for ``parse_candidates`` on the fallback markup."""

    def test_parser_keeps_each_candidate_inside_its_own_row(self) -> None:
        """The first candidate's summary holds only its own row, minus action buttons."""
        parsed = helper.parse_candidates(FALLBACK_FIXTURE, 10)

        self.assertEqual([entry.rno for entry in parsed], ["111", "222"])

        first = parsed[0]
        self.assertEqual(first.name, "김OO")

        # Skill keywords from the first row must survive.
        for expected in ("Google Analytics", "GA4"):
            self.assertIn(expected, first.raw_summary)

        # Second-row content, then action-button labels, must be absent.
        for unexpected in ("박OO", "브랜딩", "저장하기", "닫기", "포지션 제안", "이력서 확인"):
            self.assertNotIn(unexpected, first.raw_summary)


if __name__ == "__main__":
    unittest.main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue