Prevent filtered NEC lookup false negatives

Fix the candidate parser so documented education-superintendent and filtered local-election lookups return bounded, evidence-backed results instead of silently dropping valid rows.

Constraint: PR #266 round-3 review required TDD, Ralph verification, and branch update for issue #256.

Rejected: Full NEC pagination in this follow-up | broader than the approved change; bounded 100-row fetch now avoids user-limit false negatives and warns when capped.

Confidence: high

Scope-risk: narrow

Directive: Preserve exact-name fail-closed parsing and count raw parsed upstream rows before cap-warning decisions.

Tested: git diff --check; node --test packages/local-election-candidate-search/test/index.test.js; npm run lint --workspace local-election-candidate-search; npm run test --workspace local-election-candidate-search; npm pack --workspace local-election-candidate-search --dry-run; live CLI smokes for 오세훈, 조희연, 김동연; CLI help/no-args checks; architect verification CLEAR.

Not-tested: Full npm run ci remains blocked by pre-existing repo-wide missing SKILL.md: ohou-today-deal.
This commit is contained in:
Jeffrey (Dongkyu) Kim 2026-05-18 16:54:50 +09:00
commit 19de41c166
4 changed files with 93 additions and 5 deletions

View file

@ -70,10 +70,13 @@ Return concise JSON. Each `items[]` row may include:
- `job`, `education`, `career[]`
- upstream code fields such as `city_code`, `sgg_city_code`, `town_code`
`summary.upstream_result_limit` shows how many NEC rows were requested before client-side filters were applied. Filtered searches (including the default local-only filter) request up to 100 upstream rows first, then apply exact-name matching, local/election/date/region filters, deduplication, and the final `--limit`.
## Failure modes
- `no candidate results`: NEC returned no matching card or filters removed all matches.
- `unexpected NEC search HTML`: upstream may be in maintenance, NetFunnel queue, login/blocked state, or markup changed.
- `NEC search page was capped`: the fetched upstream page reached its row cap, so filtered results cover only that page; exhaustive coverage would require upstream pagination, which this client does not perform.
- Homonyms: the same name can appear across many elections; always show election date/type/district and apply user-provided filters.
- Future elections: candidate registration data may be incomplete until NEC publishes it.

View file

@ -37,8 +37,11 @@ Each item includes parsed candidate/profile and election fields when present: `n
By default, the client filters to local-election-related NEC election codes: 시·도지사(3), 구·시·군의 장(4), 시·도의회의원(5), 구·시·군의회의원(6), 광역비례(8), 기초비례(9), 교육감(11). Use `--all` / `localOnly:false` to include non-local races from NEC integrated search.
`summary.upstream_result_limit` records how many NEC rows were requested before local client-side filters were applied. When election/date/region/local filters are active, the client fetches up to 100 upstream rows first and then applies the user-facing `limit` after exact-name matching, filtering, and deduplication.
## Boundaries and failure modes
- NEC integrated search works best with exact Korean candidate names and may return homonyms; use `--election`, `--date`, and `--region` to narrow results.
- The upstream is HTML, so parser warnings are returned for empty results, maintenance pages, NetFunnel queues, login prompts, or unexpected markup changes.
- If the fetched upstream page reaches the 100-row cap while client-side filters are active, the result includes a warning that additional matches may require pagination.
- This package does not automate NEC detail popups, file downloads, account login, CAPTCHA, political filing, or any privileged workflow.

View file

@ -100,7 +100,7 @@ function normalizeSearchOptions(options = {}) {
const name = cleanText(options.name ?? options.keyword ?? options.q ?? options.query ?? options.searchKeyword)
if (!name) throw new Error("Provide a candidate name to search.")
if (name.length > 30) throw new Error("Candidate name must be 30 characters or fewer.")
return {
const normalized = {
name,
localOnly: normalizeBoolean(options.localOnly ?? options.local ?? options.onlyLocal, true),
electionCode: normalizeElectionCode(options.electionCode ?? options.election ?? options.electionType ?? options.type),
@ -109,6 +109,17 @@ function normalizeSearchOptions(options = {}) {
limit: parsePositiveInteger(options.limit ?? options.pageSize, { defaultValue: DEFAULT_LIMIT, min: 1, max: MAX_LIMIT, label: "limit" }),
includeHtml: Boolean(options.includeHtml)
}
normalized.upstreamLimit = parsePositiveInteger(options.upstreamLimit ?? options.recordCountPerPage, {
defaultValue: hasClientSideFilters(normalized) ? MAX_LIMIT : normalized.limit,
min: normalized.limit,
max: MAX_LIMIT,
label: "upstream limit"
})
return normalized
}
/**
 * Reports whether any client-side filter is active for a set of search options.
 *
 * @param {object} options - Search options; `localOnly`, `electionCode`,
 *   `electionDate`, and `region` are checked for truthiness, in that order.
 * @returns {boolean} `true` when at least one of those fields is truthy.
 */
function hasClientSideFilters(options) {
  const filterKeys = ["localOnly", "electionCode", "electionDate", "region"]
  return filterKeys.some((key) => Boolean(options[key]))
}
function buildSearchRequest(options = {}) {
@ -117,7 +128,7 @@ function buildSearchRequest(options = {}) {
searchKeyword: normalized.name,
pageIndex: "1",
firstIndex: "0",
recordCountPerPage: String(normalized.limit)
recordCountPerPage: String(normalized.upstreamLimit)
}).toString()
return {
url: NEC_SEARCH_URL,
@ -183,7 +194,7 @@ function parseTitle(titleHtml) {
let voteShare = null
let elected = /당선/.test(afterMark)
if (segments.length === 2 && /선거$/.test(segments[0])) {
if (segments[0] && /선거$/.test(segments[0])) {
party = null
electionType = segments[0]
district = segments[1]
@ -259,10 +270,14 @@ function parseSearchHtml(html, options = {}) {
const resultRegex = /<div\b([^>]*)class=(['"])[^'"]*\bresult\b[^'"]*\2([^>]*)>([\s\S]*?)(?=<div\b[^>]*class=(['"])[^'"]*\bresult\b|<div\b[^>]*class=(['"])[^'"]*\bpage\b|<\/body>|$)/gi
let parsedResultCards = 0
let parsedElectionEntries = 0
for (const resultMatch of html.matchAll(resultRegex)) {
parsedResultCards += 1
const resultAttrs = `${resultMatch[1] || ""} ${resultMatch[3] || ""}`
const resultHtml = resultMatch[4]
const listRegex = /<div\b([^>]*)class=(['"])[^'"]*\blist\b[^'"]*\2([^>]*)>([\s\S]*?)(?=<div\b[^>]*class=(['"])[^'"]*\blist\b|<\/div>\s*<\/div>\s*(?:<div\b[^>]*class=(['"])[^'"]*\bresult\b|<\/div>|$))/gi
const listMatches = [...resultHtml.matchAll(listRegex)]
parsedElectionEntries += listMatches.length
const nameMatch = resultHtml.match(/<p\b[^>]*class=(['"])[^'"]*\bname\b[^'"]*\1[^>]*>([\s\S]*?)<\/p>/i)
const nameHtml = nameMatch ? nameMatch[2] : ""
const strongMatch = nameHtml.match(/<strong[^>]*>([\s\S]*?)<\/strong>/i)
@ -280,8 +295,7 @@ function parseSearchHtml(html, options = {}) {
const hanja = hanjaMatch ? stripTags(hanjaMatch[2]) : null
const { birthDate, gender } = parseBirthDateAndGender(dateMatch ? stripTags(dateMatch[2]) : stripTags(nameHtml), resultAttrs)
const listRegex = /<div\b([^>]*)class=(['"])[^'"]*\blist\b[^'"]*\2([^>]*)>([\s\S]*?)(?=<div\b[^>]*class=(['"])[^'"]*\blist\b|<\/div>\s*<\/div>\s*(?:<div\b[^>]*class=(['"])[^'"]*\bresult\b|<\/div>|$))/gi
for (const listMatch of resultHtml.matchAll(listRegex)) {
for (const listMatch of listMatches) {
const listAttrs = `${listMatch[1] || ""} ${listMatch[3] || ""}`
const listHtml = listMatch[4]
const titleMatch = listHtml.match(/<div\b[^>]*class=(['"])[^'"]*\bt\b[^'"]*\1[^>]*>([\s\S]*?)(?:<button\b[^>]*class=(['"])[^'"]*\bmore\b|<div\b[^>]*class=(['"])[^'"]*\bbox\b|$)/i)
@ -323,6 +337,9 @@ function parseSearchHtml(html, options = {}) {
if (parsedResultCards === 0 && hasUnparsedCandidateResults(html)) {
warnings.push("parser drift suspected: NEC search result markers were present but no supported result cards could be parsed")
}
if (hasClientSideFilters(normalized) && parsedElectionEntries >= normalized.upstreamLimit) {
warnings.push(`NEC search page was capped at ${normalized.upstreamLimit} upstream rows before client-side filters; additional matches may require pagination`)
}
const limitedItems = items.slice(0, normalized.limit)
if (limitedItems.length === 0 && warnings.length === 0) warnings.push("no candidate results matched the provided name/filters on the NEC search page")
@ -338,6 +355,7 @@ function parseSearchHtml(html, options = {}) {
summary: {
returned_count: limitedItems.length,
matched_before_limit: items.length,
upstream_result_limit: normalized.upstreamLimit,
local_only: normalized.localOnly
},
items: limitedItems,

View file

@ -61,6 +61,19 @@ const SEARCH_HTML = `<!doctype html><html><body>
const EMPTY_HTML = `<!doctype html><html><body><article class="content"><div class="resultDiv"></div><script>fn_firstView();</script></article></body></html>`
const BLOCKED_HTML = `<!doctype html><html><body><h1>서비스 점검 안내</h1><p>NetFunnel 대기열 또는 로그인 확인 후 다시 이용해 주세요.</p></body></html>`
// Fixture: one NEC result card containing a single election entry with
// data-election-code="11". The title line carries only the election type and
// the vote count/share; the segments between the slash separators are empty.
// Used by the education-superintendent parsing and upstream-limit tests.
const SUPERINTENDENT_HTML = `<!doctype html><html><body>
<div class="resultDiv">
<div class="result" data-birthday="19561006">
<p class="name"><strong>조희연</strong><span class="hanja">()</span> <span class="date">1956 10 06()</span></p>
<div class="list" data-election-code="11" data-election-name="20140604" data-city-code="1100">
<div class="t">
<button type="button"><mark>[2014.06.04] 제6회 전국동시지방선거</mark></button>
교육감선거<span class="slash"> /</span> <span class="slash"> /</span> 1,614,564 (38.10%)
</div>
</div>
</div>
</div>
</body></html>`
test("normalizeSearchOptions requires an exact candidate name and defaults to local elections", () => {
const options = normalizeSearchOptions({ q: " 오세훈 ", limit: "200" })
@ -93,6 +106,15 @@ test("buildSearchRequest posts to the official NEC integrated candidate search",
assert.equal(new URLSearchParams(request.body).get("searchKeyword"), "오세훈")
})
// With election/region filters active, the request must ask NEC for the full
// 100-row page while the caller's output limit of 1 is kept separately.
test("buildSearchRequest fetches a full upstream page before client-side filters and output limit", () => {
  const criteria = { name: "조희연", election: "교육감", region: "서울", limit: 1 }
  const request = buildSearchRequest(criteria)
  const postedForm = new URLSearchParams(request.body)

  assert.equal(postedForm.get("recordCountPerPage"), "100")

  const normalizedOptions = request.options
  assert.equal(normalizedOptions.limit, 1)
  assert.equal(normalizedOptions.upstreamLimit, 100)
})
test("parseSearchHtml returns local election candidate entries with profile fields", () => {
const result = parseSearchHtml(SEARCH_HTML, { name: "오세훈" })
@ -153,6 +175,48 @@ test("parseSearchHtml supports election/date/region filters", () => {
assert.equal(result.items[0].district, "서울특별시(동작구가선거구)")
})
// The superintendent fixture row has no party segment; it must still be parsed
// and survive the election/region filters without producing any warning.
test("parseSearchHtml parses no-party education superintendent vote rows for region filters", () => {
  const criteria = { name: "조희연", election: "교육감", region: "서울", limit: 5 }
  const parsed = parseSearchHtml(SUPERINTENDENT_HTML, criteria)

  assert.equal(parsed.summary.returned_count, 1)

  const row = parsed.items[0]
  assert.equal(row.party, undefined)
  assert.equal(row.election_type, "교육감선거")
  assert.equal(row.district, "서울특별시")
  assert.equal(row.votes, 1614564)
  assert.equal(row.vote_share, "38.10%")

  const warningText = parsed.warnings.join("\n")
  assert.equal(warningText, "")
})
// End-to-end check with a stubbed fetch: the POST body asks for 100 upstream
// rows, while the summary reports the single row left after filters and limit.
test("searchCandidates applies output limit after fetching enough upstream rows for filters", async () => {
  const recordedCalls = []
  const fetchImpl = async (url, init) => {
    recordedCalls.push({ url, init })
    return { ok: true, status: 200, text: async () => SUPERINTENDENT_HTML }
  }

  const outcome = await searchCandidates(
    { name: "조희연", election: "교육감", region: "서울", limit: 1 },
    { fetchImpl }
  )

  const postedForm = new URLSearchParams(recordedCalls[0].init.body)
  assert.equal(postedForm.get("recordCountPerPage"), "100")

  const { summary } = outcome
  assert.equal(summary.returned_count, 1)
  assert.equal(summary.matched_before_limit, 1)
  assert.equal(summary.upstream_result_limit, 100)
  assert.equal(outcome.items[0].name, "조희연")
})
// Renames the first occurrence of each candidate name so nothing matches the
// exact-name search, then checks that the cap warning is still emitted when
// upstreamLimit is 2.
// NOTE(review): presumably SEARCH_HTML holds at least two election entries — confirm against the fixture.
test("parseSearchHtml warns when a filtered upstream page reaches the fetched row cap", () => {
  const renames = [
    ["오세훈", "다른후보"],
    ["김동연", "다른사람"]
  ]
  const cappedHtml = renames.reduce((html, [from, to]) => html.replace(from, to), SEARCH_HTML)
  const criteria = {
    name: "오세훈",
    election: "시도지사",
    region: "서울",
    limit: 1,
    upstreamLimit: 2
  }

  const parsed = parseSearchHtml(cappedHtml, criteria)

  assert.equal(parsed.items.length, 0)
  const warningText = parsed.warnings.join("\n")
  assert.match(warningText, /capped at 2 upstream rows/i)
})
test("parseSearchHtml deduplicates repeated candidate election entries before applying limit", () => {
const duplicateList = SEARCH_HTML.match(/<div class="list" data-election-type="4"[\s\S]*?<\/div>\s*<\/div>\s*<div class="list" data-election-code="2"/)[0]
.replace(/\s*<div class="list" data-election-code="2"$/, "")