mirror of
https://github.com/NomaDamas/k-skill.git
synced 2026-06-24 02:04:11 +00:00
Keep Daangn jobs detail resilient
Add an HTML metadata fallback because the public jobs detail `_data` route currently returns an empty response, while the redirected public page still exposes read-only title/meta data.

Constraint: PR 237 must remain read-only and avoid proxy/auth additions for public Daangn surfaces
Rejected: Treating the 204 `_data` response as acceptable | it breaks the documented detail command
Confidence: high
Scope-risk: narrow
Directive: Keep Daangn jobs detail on public HTML/meta fallback unless a stable JSON detail surface is verified
Tested: npm run ci; live daangn_jobs.py search/detail smoke
Not-tested: authenticated or interactive Daangn actions, intentionally out of scope
This commit is contained in:
parent
95d5e9d05b
commit
3109b6684a
3 changed files with 36 additions and 6 deletions
|
|
@ -38,7 +38,7 @@ metadata:
|
|||
|
||||
- Region resolver: `https://www.daangn.com/kr/api/v1/regions/keyword?keyword=<지역명>`
|
||||
- Search `_data`: `/kr/jobs/?in=<지역명>-<id>&search=<keyword>&_data=routes/kr.jobs._index`
|
||||
- Detail `_data`: `<job-url>?_data=routes%2Fkr.jobs.%24job_post_id`
|
||||
- Detail fallback: `<job-url>` redirects to `jobs.daangn.com/job-posts/<id>` and exposes public HTML title/meta/JSON-LD. The helper first tries the legacy `_data` route and falls back to HTML meta when that route returns an empty response.
|
||||
|
||||
## Workflow
|
||||
|
||||
|
|
@ -58,7 +58,7 @@ python3 daangn-jobs-search/scripts/daangn_jobs.py detail "https://www.daangn.com
|
|||
## Output fields
|
||||
|
||||
- title, company, region, address, salary, salaryType, workDays, workTimeStart, workTimeEnd, closed, url
|
||||
- detail: jobPost 원문
|
||||
- detail: `jobPost` 원문 if the `_data` route is available; otherwise public page `title`, `meta`, and `json_ld`
|
||||
|
||||
## Region handling
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,10 @@ HEADERS = {"User-Agent":"Mozilla/5.0", "Accept":"application/json,text/html;q=0.
|
|||
def fetch_json(url):
    """Fetch *url* and return its body parsed as JSON.

    The request is sent with the module-level ``HEADERS`` and a 25 second
    timeout.

    Raises:
        ValueError: if the response body is empty or whitespace-only (for
            example a ``204 No Content`` reply), or if it is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        urllib.error.URLError: on network or HTTP-level failures.
    """
    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req, timeout=25) as resp:
        body = resp.read()
    # Report an empty payload explicitly, with the URL, instead of letting the
    # JSON parser fail with a generic "Expecting value" message.
    if not body.strip():
        raise ValueError(f'빈 JSON 응답: {url}')
    return json.loads(body)
|
||||
|
||||
def fetch_text(url):
|
||||
req = urllib.request.Request(url, headers={"User-Agent":"Mozilla/5.0", "Accept":"text/html"})
|
||||
|
|
@ -42,6 +45,27 @@ def absolute(href):
|
|||
def print_json(obj):
    """Print *obj* to stdout as JSON indented by two spaces.

    Non-ASCII characters (such as Korean text) are written as-is rather than
    as ``\\uXXXX`` escapes.
    """
    print(json.dumps(obj, ensure_ascii=False, indent=2))
|
||||
|
||||
def parse_html_detail(url):
    """Fetch the public page at *url* and extract read-only detail metadata.

    Returns a dict with:
        source:  the URL that was requested.
        title:   the unescaped, stripped ``<title>`` text, or the ``og:title``
                 meta value when the page has no ``<title>`` element.
        meta:    the ``description``, ``og:title``, ``og:description`` and
                 ``og:image`` meta values found on the page; a later
                 duplicate overrides an earlier one.
        json_ld: up to three parsed ``application/ld+json`` blocks.

    Whatever ``fetch_text`` raises is propagated to the caller.
    """
    html = fetch_text(url)

    # Accept attributes on the tag (e.g. <title data-rh="true">) and any case.
    title = re.search(r'<title\b[^>]*>(.*?)</title>', html, re.S | re.I)

    wanted = ('description', 'og:title', 'og:description', 'og:image')
    # Attributes are matched independently inside each <meta> tag so that
    # their order does not matter. The back-reference to the opening quote
    # lets a value contain the other quote character (content="It's ...").
    key_re = re.compile(r'(?<![\w-])(?:property|name)=(["\'])(.*?)\1', re.I | re.S)
    content_re = re.compile(r'(?<![\w-])content=(["\'])(.*?)\1', re.I | re.S)
    meta = {}
    for tag in re.finditer(r'<meta\b[^>]*>', html, re.I):
        key = key_re.search(tag.group(0))
        content = content_re.search(tag.group(0))
        if key and content and key.group(2) in wanted:
            meta[key.group(2)] = unescape(content.group(2)).strip()

    json_ld = []
    for m in re.finditer(r'<script[^>]*type=["\']application/ld\+json["\'][^>]*>(.*?)</script>', html, re.S):
        raw = m.group(1)
        # Try the entity-unescaped text first, then the raw text: unescaping
        # can break otherwise valid JSON, e.g. when a string value contains
        # &quot;. Blocks that parse in neither form are skipped on purpose.
        for candidate in (unescape(raw), raw):
            try:
                json_ld.append(json.loads(candidate))
            except ValueError:
                continue
            break

    return {
        'source': url,
        'title': unescape(title.group(1)).strip() if title else meta.get('og:title'),
        'meta': meta,
        'json_ld': json_ld[:3],
    }
|
||||
|
||||
|
||||
def cmd_search(args):
|
||||
sel=resolve_region(args.region) if args.region else None
|
||||
|
|
@ -59,7 +83,13 @@ def cmd_search(args):
|
|||
|
||||
def cmd_detail(args):
    """Print the detail of the job posting at ``args.url`` as JSON.

    The legacy ``_data`` route is tried first. If it fails for any reason
    (empty response, HTTP error, invalid or unexpectedly shaped JSON), the
    public HTML page is parsed instead, and the ``_data`` URL that was
    attempted is recorded under ``data_source_attempted``.
    """
    u = args.url.rstrip('/') + '/?_data=routes%2Fkr.jobs.%24job_post_id'
    try:
        data = fetch_json(u)
        payload = {'source': u, 'jobPost': data.get('jobPost') or data}
    except Exception:
        # Deliberate best-effort: any failure of the JSON route falls back to
        # the public HTML metadata rather than failing the command.
        payload = parse_html_detail(args.url)
        payload['data_source_attempted'] = u
    # Printing happens outside the try so that an output error cannot trigger
    # a second fetch and a second print.
    print_json(payload)
|
||||
|
||||
p=argparse.ArgumentParser(description='Daangn jobs read-only search/detail')
|
||||
sub=p.add_subparsers(dest='cmd', required=True)
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
1. 지역 해석: `https://www.daangn.com/kr/api/v1/regions/keyword?keyword=<지역명>`
|
||||
2. 검색: `https://www.daangn.com/kr/jobs/?in=<지역명>-<id>&search=<키워드>&_data=routes/kr.jobs._index`
|
||||
3. 상세: `<공고 URL>?_data=routes%2Fkr.jobs.%24job_post_id`
|
||||
3. 상세: `<공고 URL>` → `jobs.daangn.com/job-posts/<id>` 공개 HTML의 title/meta/JSON-LD(헬퍼는 legacy `_data`를 먼저 시도 후 빈 응답이면 HTML 메타로 fallback)
|
||||
|
||||
## 로컬 실행
|
||||
|
||||
|
|
@ -33,7 +33,7 @@ python3 daangn-jobs-search/scripts/daangn_jobs.py detail "https://www.daangn.com
|
|||
|
||||
## 출력 해석
|
||||
|
||||
검색 결과는 `title`, `company`, `region`, `address`, `salary`, `salaryType`, `workDays`, `workTimeStart`, `workTimeEnd`, `closed`, `url`을 우선 확인합니다. 지원 가능 여부나 근무 조건은 상세 조회의 `jobPost` 원문까지 본 뒤 정리합니다.
|
||||
검색 결과는 `title`, `company`, `region`, `address`, `salary`, `salaryType`, `workDays`, `workTimeStart`, `workTimeEnd`, `closed`, `url`을 우선 확인합니다. 상세 조회는 가능하면 `jobPost` 원문을 사용하고, 공개 `_data`가 빈 응답이면 HTML title/meta/JSON-LD를 근거로 정리합니다.
|
||||
|
||||
## 제한사항
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue