mirror of
https://github.com/NomaDamas/k-skill.git
synced 2026-06-24 02:04:11 +00:00
Prevent filtered NEC lookup false negatives
Fix the candidate parser so documented education-superintendent and filtered local-election lookups return bounded, evidence-backed results instead of silently dropping valid rows.

Constraint: PR #266 round-3 review required TDD, Ralph verification, and branch update for issue #256.
Rejected: Full NEC pagination in this follow-up | broader than the approved change; bounded 100-row fetch now avoids user-limit false negatives and warns when capped.
Confidence: high
Scope-risk: narrow
Directive: Preserve exact-name fail-closed parsing and count raw parsed upstream rows before cap-warning decisions.
Tested: git diff --check; node --test packages/local-election-candidate-search/test/index.test.js; npm run lint --workspace local-election-candidate-search; npm run test --workspace local-election-candidate-search; npm pack --workspace local-election-candidate-search --dry-run; live CLI smokes for 오세훈, 조희연, 김동연; CLI help/no-args checks; architect verification CLEAR.
Not-tested: Full npm run ci remains blocked by pre-existing repo-wide missing SKILL.md: ohou-today-deal.
This commit is contained in:
parent
bdba986e3e
commit
19de41c166
4 changed files with 93 additions and 5 deletions
|
|
@@ -70,10 +70,13 @@ Return concise JSON. Each `items[]` row may include:
|
|||
- `job`, `education`, `career[]`
|
||||
- upstream code fields such as `city_code`, `sgg_city_code`, `town_code`
|
||||
|
||||
`summary.upstream_result_limit` shows the NEC row count requested before local client-side filters. Filtered searches request up to 100 upstream rows first, then apply exact-name matching, local/election/date/region filters, deduplication, and the final `--limit`.
|
||||
|
||||
## Failure modes
|
||||
|
||||
- `no candidate results`: NEC returned no matching card or filters removed all matches.
|
||||
- `unexpected NEC search HTML`: upstream may be in maintenance, NetFunnel queue, login/blocked state, or markup changed.
|
||||
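- `NEC search page was capped`: the filtered results come from a single upstream page that reached the fetched row cap (100 rows by default when filters are active), so additional matches may exist; exhaustive coverage would require upstream pagination.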
|
||||
- Homonyms: the same name can appear across many elections; always show election date/type/district and apply user-provided filters.
|
||||
- Future elections: candidate registration data may be incomplete until NEC publishes it.
|
||||
|
||||
|
|
|
|||
|
|
@@ -37,8 +37,11 @@ Each item includes parsed candidate/profile and election fields when present: `n
|
|||
|
||||
By default, the client filters to local-election-related NEC election codes: 시·도지사(3), 구·시·군의 장(4), 시·도의회의원(5), 구·시·군의회의원(6), 광역비례(8), 기초비례(9), 교육감(11). Use `--all` / `localOnly:false` to include non-local races from NEC integrated search.
|
||||
|
||||
`summary.upstream_result_limit` records how many NEC rows were requested before local client-side filters were applied. When election/date/region/local filters are active, the client fetches up to 100 upstream rows first and then applies the user-facing `limit` after exact-name matching, filtering, and deduplication.
|
||||
|
||||
## Boundaries and failure modes
|
||||
|
||||
- NEC integrated search works best with exact Korean candidate names and may return homonyms; use `--election`, `--date`, and `--region` to narrow results.
|
||||
- The upstream is HTML, so parser warnings are returned for empty results, maintenance pages, NetFunnel queues, login prompts, or unexpected markup changes.
|
||||
- If the fetched upstream page reaches the 100-row cap while client-side filters are active, the result includes a warning that additional matches may require pagination.
|
||||
- This package does not automate NEC detail popups, file downloads, account login, CAPTCHA, political filing, or any privileged workflow.
|
||||
|
|
|
|||
|
|
@@ -100,7 +100,7 @@ function normalizeSearchOptions(options = {}) {
|
|||
const name = cleanText(options.name ?? options.keyword ?? options.q ?? options.query ?? options.searchKeyword)
|
||||
if (!name) throw new Error("Provide a candidate name to search.")
|
||||
if (name.length > 30) throw new Error("Candidate name must be 30 characters or fewer.")
|
||||
return {
|
||||
const normalized = {
|
||||
name,
|
||||
localOnly: normalizeBoolean(options.localOnly ?? options.local ?? options.onlyLocal, true),
|
||||
electionCode: normalizeElectionCode(options.electionCode ?? options.election ?? options.electionType ?? options.type),
|
||||
|
|
@@ -109,6 +109,17 @@ function normalizeSearchOptions(options = {}) {
|
|||
limit: parsePositiveInteger(options.limit ?? options.pageSize, { defaultValue: DEFAULT_LIMIT, min: 1, max: MAX_LIMIT, label: "limit" }),
|
||||
includeHtml: Boolean(options.includeHtml)
|
||||
}
|
||||
normalized.upstreamLimit = parsePositiveInteger(options.upstreamLimit ?? options.recordCountPerPage, {
|
||||
defaultValue: hasClientSideFilters(normalized) ? MAX_LIMIT : normalized.limit,
|
||||
min: normalized.limit,
|
||||
max: MAX_LIMIT,
|
||||
label: "upstream limit"
|
||||
})
|
||||
return normalized
|
||||
}
|
||||
|
||||
function hasClientSideFilters(options) {
|
||||
return Boolean(options.localOnly || options.electionCode || options.electionDate || options.region)
|
||||
}
|
||||
|
||||
function buildSearchRequest(options = {}) {
|
||||
|
|
@@ -117,7 +128,7 @@ function buildSearchRequest(options = {}) {
|
|||
searchKeyword: normalized.name,
|
||||
pageIndex: "1",
|
||||
firstIndex: "0",
|
||||
recordCountPerPage: String(normalized.limit)
|
||||
recordCountPerPage: String(normalized.upstreamLimit)
|
||||
}).toString()
|
||||
return {
|
||||
url: NEC_SEARCH_URL,
|
||||
|
|
@@ -183,7 +194,7 @@ function parseTitle(titleHtml) {
|
|||
let voteShare = null
|
||||
let elected = /당선/.test(afterMark)
|
||||
|
||||
if (segments.length === 2 && /선거$/.test(segments[0])) {
|
||||
if (segments[0] && /선거$/.test(segments[0])) {
|
||||
party = null
|
||||
electionType = segments[0]
|
||||
district = segments[1]
|
||||
|
|
@@ -259,10 +270,14 @@ function parseSearchHtml(html, options = {}) {
|
|||
|
||||
const resultRegex = /<div\b([^>]*)class=(['"])[^'"]*\bresult\b[^'"]*\2([^>]*)>([\s\S]*?)(?=<div\b[^>]*class=(['"])[^'"]*\bresult\b|<div\b[^>]*class=(['"])[^'"]*\bpage\b|<\/body>|$)/gi
|
||||
let parsedResultCards = 0
|
||||
let parsedElectionEntries = 0
|
||||
for (const resultMatch of html.matchAll(resultRegex)) {
|
||||
parsedResultCards += 1
|
||||
const resultAttrs = `${resultMatch[1] || ""} ${resultMatch[3] || ""}`
|
||||
const resultHtml = resultMatch[4]
|
||||
const listRegex = /<div\b([^>]*)class=(['"])[^'"]*\blist\b[^'"]*\2([^>]*)>([\s\S]*?)(?=<div\b[^>]*class=(['"])[^'"]*\blist\b|<\/div>\s*<\/div>\s*(?:<div\b[^>]*class=(['"])[^'"]*\bresult\b|<\/div>|$))/gi
|
||||
const listMatches = [...resultHtml.matchAll(listRegex)]
|
||||
parsedElectionEntries += listMatches.length
|
||||
const nameMatch = resultHtml.match(/<p\b[^>]*class=(['"])[^'"]*\bname\b[^'"]*\1[^>]*>([\s\S]*?)<\/p>/i)
|
||||
const nameHtml = nameMatch ? nameMatch[2] : ""
|
||||
const strongMatch = nameHtml.match(/<strong[^>]*>([\s\S]*?)<\/strong>/i)
|
||||
|
|
@@ -280,8 +295,7 @@ function parseSearchHtml(html, options = {}) {
|
|||
const hanja = hanjaMatch ? stripTags(hanjaMatch[2]) : null
|
||||
const { birthDate, gender } = parseBirthDateAndGender(dateMatch ? stripTags(dateMatch[2]) : stripTags(nameHtml), resultAttrs)
|
||||
|
||||
const listRegex = /<div\b([^>]*)class=(['"])[^'"]*\blist\b[^'"]*\2([^>]*)>([\s\S]*?)(?=<div\b[^>]*class=(['"])[^'"]*\blist\b|<\/div>\s*<\/div>\s*(?:<div\b[^>]*class=(['"])[^'"]*\bresult\b|<\/div>|$))/gi
|
||||
for (const listMatch of resultHtml.matchAll(listRegex)) {
|
||||
for (const listMatch of listMatches) {
|
||||
const listAttrs = `${listMatch[1] || ""} ${listMatch[3] || ""}`
|
||||
const listHtml = listMatch[4]
|
||||
const titleMatch = listHtml.match(/<div\b[^>]*class=(['"])[^'"]*\bt\b[^'"]*\1[^>]*>([\s\S]*?)(?:<button\b[^>]*class=(['"])[^'"]*\bmore\b|<div\b[^>]*class=(['"])[^'"]*\bbox\b|$)/i)
|
||||
|
|
@@ -323,6 +337,9 @@ function parseSearchHtml(html, options = {}) {
|
|||
if (parsedResultCards === 0 && hasUnparsedCandidateResults(html)) {
|
||||
warnings.push("parser drift suspected: NEC search result markers were present but no supported result cards could be parsed")
|
||||
}
|
||||
if (hasClientSideFilters(normalized) && parsedElectionEntries >= normalized.upstreamLimit) {
|
||||
warnings.push(`NEC search page was capped at ${normalized.upstreamLimit} upstream rows before client-side filters; additional matches may require pagination`)
|
||||
}
|
||||
|
||||
const limitedItems = items.slice(0, normalized.limit)
|
||||
if (limitedItems.length === 0 && warnings.length === 0) warnings.push("no candidate results matched the provided name/filters on the NEC search page")
|
||||
|
|
@@ -338,6 +355,7 @@ function parseSearchHtml(html, options = {}) {
|
|||
summary: {
|
||||
returned_count: limitedItems.length,
|
||||
matched_before_limit: items.length,
|
||||
upstream_result_limit: normalized.upstreamLimit,
|
||||
local_only: normalized.localOnly
|
||||
},
|
||||
items: limitedItems,
|
||||
|
|
|
|||
|
|
@@ -61,6 +61,19 @@ const SEARCH_HTML = `<!doctype html><html><body>
|
|||
|
||||
const EMPTY_HTML = `<!doctype html><html><body><article class="content"><div class="resultDiv"></div><script>fn_firstView();</script></article></body></html>`
|
||||
const BLOCKED_HTML = `<!doctype html><html><body><h1>서비스 점검 안내</h1><p>NetFunnel 대기열 또는 로그인 확인 후 다시 이용해 주세요.</p></body></html>`
|
||||
const SUPERINTENDENT_HTML = `<!doctype html><html><body>
|
||||
<div class="resultDiv">
|
||||
<div class="result" data-birthday="19561006">
|
||||
<p class="name"><strong>조희연</strong><span class="hanja">(曺喜昖)</span> <span class="date">1956년 10월 06일(남)</span></p>
|
||||
<div class="list" data-election-code="11" data-election-name="20140604" data-city-code="1100">
|
||||
<div class="t">
|
||||
<button type="button"><mark>[2014.06.04] 제6회 전국동시지방선거</mark></button>
|
||||
교육감선거<span class="slash"> /</span> 서울특별시<span class="slash"> /</span> 1,614,564표 (38.10%)
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>`
|
||||
|
||||
test("normalizeSearchOptions requires an exact candidate name and defaults to local elections", () => {
|
||||
const options = normalizeSearchOptions({ q: " 오세훈 ", limit: "200" })
|
||||
|
|
@@ -93,6 +106,15 @@ test("buildSearchRequest posts to the official NEC integrated candidate search",
|
|||
assert.equal(new URLSearchParams(request.body).get("searchKeyword"), "오세훈")
|
||||
})
|
||||
|
||||
test("buildSearchRequest fetches a full upstream page before client-side filters and output limit", () => {
|
||||
const request = buildSearchRequest({ name: "조희연", election: "교육감", region: "서울", limit: 1 })
|
||||
const body = new URLSearchParams(request.body)
|
||||
|
||||
assert.equal(body.get("recordCountPerPage"), "100")
|
||||
assert.equal(request.options.limit, 1)
|
||||
assert.equal(request.options.upstreamLimit, 100)
|
||||
})
|
||||
|
||||
test("parseSearchHtml returns local election candidate entries with profile fields", () => {
|
||||
const result = parseSearchHtml(SEARCH_HTML, { name: "오세훈" })
|
||||
|
||||
|
|
@@ -153,6 +175,48 @@ test("parseSearchHtml supports election/date/region filters", () => {
|
|||
assert.equal(result.items[0].district, "서울특별시(동작구가선거구)")
|
||||
})
|
||||
|
||||
test("parseSearchHtml parses no-party education superintendent vote rows for region filters", () => {
|
||||
const result = parseSearchHtml(SUPERINTENDENT_HTML, { name: "조희연", election: "교육감", region: "서울", limit: 5 })
|
||||
|
||||
assert.equal(result.summary.returned_count, 1)
|
||||
assert.equal(result.items[0].party, undefined)
|
||||
assert.equal(result.items[0].election_type, "교육감선거")
|
||||
assert.equal(result.items[0].district, "서울특별시")
|
||||
assert.equal(result.items[0].votes, 1614564)
|
||||
assert.equal(result.items[0].vote_share, "38.10%")
|
||||
assert.equal(result.warnings.join("\n"), "")
|
||||
})
|
||||
|
||||
test("searchCandidates applies output limit after fetching enough upstream rows for filters", async () => {
|
||||
const calls = []
|
||||
const result = await searchCandidates({ name: "조희연", election: "교육감", region: "서울", limit: 1 }, {
|
||||
fetchImpl: async (url, init) => {
|
||||
calls.push({ url, init })
|
||||
return { ok: true, status: 200, text: async () => SUPERINTENDENT_HTML }
|
||||
}
|
||||
})
|
||||
|
||||
assert.equal(new URLSearchParams(calls[0].init.body).get("recordCountPerPage"), "100")
|
||||
assert.equal(result.summary.returned_count, 1)
|
||||
assert.equal(result.summary.matched_before_limit, 1)
|
||||
assert.equal(result.summary.upstream_result_limit, 100)
|
||||
assert.equal(result.items[0].name, "조희연")
|
||||
})
|
||||
|
||||
test("parseSearchHtml warns when a filtered upstream page reaches the fetched row cap", () => {
|
||||
const cappedHtml = SEARCH_HTML.replace("오세훈", "다른후보").replace("김동연", "다른사람")
|
||||
const result = parseSearchHtml(cappedHtml, {
|
||||
name: "오세훈",
|
||||
election: "시도지사",
|
||||
region: "서울",
|
||||
limit: 1,
|
||||
upstreamLimit: 2
|
||||
})
|
||||
|
||||
assert.equal(result.items.length, 0)
|
||||
assert.match(result.warnings.join("\n"), /capped at 2 upstream rows/i)
|
||||
})
|
||||
|
||||
test("parseSearchHtml deduplicates repeated candidate election entries before applying limit", () => {
|
||||
const duplicateList = SEARCH_HTML.match(/<div class="list" data-election-type="4"[\s\S]*?<\/div>\s*<\/div>\s*<div class="list" data-election-code="2"/)[0]
|
||||
.replace(/\s*<div class="list" data-election-code="2"$/, "")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue