mirror of
https://github.com/NomaDamas/k-skill.git
synced 2026-06-24 02:04:11 +00:00
186 lines
7 KiB
Python
186 lines
7 KiB
Python
from __future__ import annotations
|
|
|
|
import html
|
|
import re
|
|
import urllib.parse
|
|
|
|
from jobkorea_talent_models import BASE_URL, Candidate
|
|
|
|
ACTION_CONTROL_RE = re.compile(
|
|
r"^(?:스크랩\s*\d*|저장하기|닫기|포지션\s*제안|메모하기|프로필\s*확인|이력서\s*확인|펼쳐보기|접기|이전|다음)$"
|
|
)
|
|
ACTION_CONTROL_INLINE_RE = re.compile(
|
|
r"(?:스크랩\s*\d+|저장하기|닫기|포지션\s*제안|메모하기|프로필\s*확인|이력서\s*확인|펼쳐보기|접기|이전|다음)"
|
|
)
|
|
RESUME_LINK_RE = re.compile(r'href="(?P<href>/corp/person/find/resume/view\?rNo=(?P<rno>\d+))"')
|
|
|
|
|
|
def clean_text(value: str) -> str:
|
|
value = html.unescape(value)
|
|
value = re.sub(r"<script[\s\S]*?</script>", " ", value, flags=re.I)
|
|
value = re.sub(r"<style[\s\S]*?</style>", " ", value, flags=re.I)
|
|
value = re.sub(r"<[^>]+>", " ", value)
|
|
value = re.sub(r"[ \t\r\f\v]+", " ", value)
|
|
value = re.sub(r"\n\s*\n+", "\n", value)
|
|
return value.strip()
|
|
|
|
|
|
def is_action_control_label(value: str) -> bool:
|
|
label = re.sub(r"\s+", " ", html.unescape(value)).strip()
|
|
return bool(label and ACTION_CONTROL_RE.match(label))
|
|
|
|
|
|
def filter_action_control_text(value: str) -> str:
|
|
lines = []
|
|
for line in value.splitlines():
|
|
label = line.strip()
|
|
if not label or is_action_control_label(label):
|
|
continue
|
|
label = ACTION_CONTROL_INLINE_RE.sub(" ", label)
|
|
label = re.sub(r"\s+", " ", label).strip()
|
|
if label:
|
|
lines.append(label)
|
|
return "\n".join(lines).strip()
|
|
|
|
|
|
def row_contains_other_resume(candidate_markup: str, rno: str) -> bool:
|
|
refs: list[str] = []
|
|
for href_rno, data_rno in re.findall(r"rNo=(\d+)|data-rno=[\"'](\d+)[\"']", candidate_markup):
|
|
refs.append(href_rno or data_rno)
|
|
return any(ref != rno for ref in refs)
|
|
|
|
|
|
def extract_regex_candidate_markup(markup: str, match: re.Match[str], rno: str) -> str:
|
|
row_start = markup.rfind("<tr", 0, match.start())
|
|
if row_start >= 0:
|
|
row_open_end = markup.find(">", row_start, match.start())
|
|
row_end = markup.find("</tr>", match.end())
|
|
row_open = markup[row_start : row_open_end + 1] if row_open_end >= 0 else ""
|
|
if row_end >= 0 and f'data-rno="{rno}"' in row_open:
|
|
return markup[row_start : row_end + len("</tr>")]
|
|
|
|
booth_start = markup.rfind('<div class="booth"', 0, match.start())
|
|
if booth_start >= 0:
|
|
next_booth = markup.find('<div class="booth"', match.end())
|
|
section_end = markup.find("</section>", match.end())
|
|
end_candidates = [pos for pos in (next_booth, section_end) if pos >= 0]
|
|
booth_end = min(end_candidates) if end_candidates else min(len(markup), match.end() + 2500)
|
|
booth = markup[booth_start:booth_end]
|
|
if not row_contains_other_resume(booth, rno):
|
|
return booth
|
|
|
|
start = max(0, match.start() - 300)
|
|
end = min(len(markup), match.end() + 1200)
|
|
return markup[start:end]
|
|
|
|
|
|
def parse_with_bs4(markup: str, limit: int) -> list[Candidate] | None:
|
|
try:
|
|
from bs4 import BeautifulSoup
|
|
except ImportError:
|
|
return None
|
|
|
|
soup = BeautifulSoup(markup, "html.parser")
|
|
candidates: list[Candidate] = []
|
|
seen: set[str] = set()
|
|
|
|
for link in soup.select('a[href*="/corp/person/find/resume/view?rNo="]'):
|
|
raw_href = link.get("href", "")
|
|
href = raw_href if isinstance(raw_href, str) else ""
|
|
matched_rno = re.search(r"rNo=(\d+)", href)
|
|
if not matched_rno:
|
|
continue
|
|
rno = matched_rno.group(1)
|
|
if rno in seen:
|
|
continue
|
|
seen.add(rno)
|
|
|
|
container = (
|
|
link.find_parent("tr", attrs={"data-rno": rno})
|
|
or link.find_parent(class_=re.compile(r"(^|\s)booth(\s|$)", re.I))
|
|
or link.parent
|
|
)
|
|
if container and row_contains_other_resume(str(container), rno):
|
|
container = link.parent
|
|
|
|
raw = clean_text(str(container)) if container else clean_text(str(link))
|
|
texts = []
|
|
for node in container.find_all(["dt", "dd", "p", "span", "li"]) if container else []:
|
|
label = node.get_text(" ", strip=True)
|
|
if label and not is_action_control_label(label):
|
|
texts.append(label)
|
|
for btn in container.select(".keywordSkill button, .keywordBox button") if container else []:
|
|
label = btn.get_text(" ", strip=True)
|
|
if label and not is_action_control_label(label):
|
|
texts.append(label)
|
|
text_join = " | ".join(dict.fromkeys(texts))
|
|
|
|
name_scope = container.select_one(".nameAge") if container else None
|
|
dt = (name_scope or container).find("dt") if container else None
|
|
name = dt.get_text(" ", strip=True) if dt else ""
|
|
dd = dt.find_next("dd") if dt else None
|
|
meta = dd.get_text(" ", strip=True) if dd else ""
|
|
if not name:
|
|
m_name = re.search(r"([가-힣A-Za-z]OO)\s*\(([^)]*)\)", raw)
|
|
if m_name:
|
|
name = m_name.group(1)
|
|
meta = "(" + m_name.group(2) + ")"
|
|
|
|
skills = []
|
|
for btn in container.select(".keywordSkill button, .keywordBox button") if container else []:
|
|
label = btn.get_text(" ", strip=True)
|
|
if label and not is_action_control_label(label):
|
|
skills.append(label)
|
|
|
|
career_node = container.select_one(".career") if container else None
|
|
candidates.append(
|
|
Candidate(
|
|
rno=rno,
|
|
url=urllib.parse.urljoin(BASE_URL, href),
|
|
name=name,
|
|
meta=meta,
|
|
career=career_node.get_text(" ", strip=True) if career_node else "",
|
|
skills=", ".join(skills[:25]),
|
|
raw_summary=filter_action_control_text(text_join[:1000] or raw[:1000]),
|
|
)
|
|
)
|
|
if len(candidates) >= limit:
|
|
break
|
|
return candidates
|
|
|
|
|
|
def parse_with_regex(markup: str, limit: int) -> list[Candidate]:
|
|
candidates: list[Candidate] = []
|
|
seen: set[str] = set()
|
|
for match in RESUME_LINK_RE.finditer(markup):
|
|
rno = match.group("rno")
|
|
if rno in seen:
|
|
continue
|
|
seen.add(rno)
|
|
raw_markup = extract_regex_candidate_markup(markup, match, rno)
|
|
raw = clean_text(raw_markup)
|
|
name = ""
|
|
meta = ""
|
|
name_match = re.search(r"([가-힣A-Za-z]OO)\s*\(([^)]*)\)", raw)
|
|
if name_match:
|
|
name = name_match.group(1)
|
|
meta = "(" + name_match.group(2) + ")"
|
|
candidates.append(
|
|
Candidate(
|
|
rno=rno,
|
|
url=urllib.parse.urljoin(BASE_URL, match.group("href")),
|
|
name=name,
|
|
meta=meta,
|
|
raw_summary=filter_action_control_text(raw[:1000]),
|
|
)
|
|
)
|
|
if len(candidates) >= limit:
|
|
break
|
|
return candidates
|
|
|
|
|
|
def parse_candidates(markup: str, limit: int) -> list[Candidate]:
|
|
parsed = parse_with_bs4(markup, limit)
|
|
if parsed is not None:
|
|
return parsed
|
|
return parse_with_regex(markup, limit)
|