mirror of
https://github.com/bab2min/Kiwi.git
synced 2026-06-17 01:54:27 +00:00
Implement #195
This commit is contained in:
parent
e371e0d1a3
commit
ec5d992ebe
11 changed files with 174 additions and 61 deletions
|
|
@ -109,6 +109,7 @@ namespace kiwi
|
|||
/**< 선행형태소의 모음조화 조건 */
|
||||
CondPolarity polar() const { return static_cast<CondPolarity>((vpPack >> 4) & 0x7); }
|
||||
|
||||
/**< 추가 분석이 가능한 형태소인지(파생어나 사이시옷이 포함된 합성명사 등) */
|
||||
bool complex() const { return !!(vpPack & 0x80); }
|
||||
|
||||
void setVowel(CondVowel v)
|
||||
|
|
@ -141,8 +142,9 @@ namespace kiwi
|
|||
const KString* kform = nullptr;
|
||||
POSTag tag = POSTag::unknown;
|
||||
CondVowel vowel : 4;
|
||||
CondPolarity polar : 3;
|
||||
CondPolarity polar : 2;
|
||||
bool complex : 1;
|
||||
bool saisiot : 1;
|
||||
uint8_t senseId = 0;
|
||||
uint8_t combineSocket = 0;
|
||||
int32_t combined = 0;
|
||||
|
|
@ -205,7 +207,8 @@ namespace kiwi
|
|||
CondVowel vowel = CondVowel::none;
|
||||
CondPolarity polar = CondPolarity::none;
|
||||
uint8_t formHash = 0;
|
||||
uint8_t zCodaAppendable = 0;
|
||||
uint8_t zCodaAppendable : 1;
|
||||
uint8_t zSiotAppendable : 1;
|
||||
|
||||
Form();
|
||||
~Form();
|
||||
|
|
@ -251,7 +254,7 @@ namespace kiwi
|
|||
* @param morphBase 형태소 배열의 시작 위치
|
||||
* @return 최적화된 형태 정보
|
||||
*/
|
||||
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands = {});
|
||||
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, bool zSiotAppendable, const Vector<uint32_t>& additionalCands = {});
|
||||
|
||||
/**
|
||||
* @brief 변경 가능한 형태소 정보를 bake하여 최적화한다.
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@ namespace kiwi
|
|||
splitComplex = 1 << 22, /**< 더 작은 단위로 분할될 수 있는 형태소는 더 분할하여 매칭한다 */
|
||||
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
|
||||
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
|
||||
splitSaisiot = 1 << 25, /**< 사이시옷이 포함된 합성명사를 분리하여 매칭한다. */
|
||||
mergeSaisiot = 1 << 26, /**< 사이시옷이 포함된 것으로 추정되는 명사를 결합하여 매칭한다. */
|
||||
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
|
||||
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
|
||||
all = url | email | hashtag | mention | serial | emoji | zCoda,
|
||||
|
|
|
|||
|
|
@ -31,7 +31,12 @@ namespace kiwi
|
|||
{
|
||||
return POSTag::jks <= tag && tag <= POSTag::jc;
|
||||
}
|
||||
|
||||
|
||||
/**
 * @brief Tests whether a tag belongs to the noun (NN*) tag class.
 *
 * @param tag part-of-speech tag to test
 * @return true when `tag` lies in the inclusive enumerator range
 *         [POSTag::nng, POSTag::nnb]; false otherwise.
 *
 * The check relies on the declaration order of POSTag, so every noun tag
 * is assumed to be declared contiguously between `nng` and `nnb`
 * (NOTE(review): the enum body is not visible here - confirm).
 */
inline bool isNNClass(POSTag tag)
|
||||
{
|
||||
return POSTag::nng <= tag && tag <= POSTag::nnb;
|
||||
}
|
||||
|
||||
inline bool isSuffix(POSTag tag)
|
||||
{
|
||||
tag = clearIrregular(tag);
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
/**
|
||||
/**
|
||||
* @file Types.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
|
||||
|
|
@ -211,7 +211,7 @@ namespace kiwi
|
|||
w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji,
|
||||
jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc,
|
||||
ep, ef, ec, etn, etm,
|
||||
z_coda,
|
||||
z_coda, z_siot,
|
||||
user0, user1, user2, user3, user4,
|
||||
p, /**< 분할된 동사/형용사를 나타내는데 사용됨 */
|
||||
max, /**< POSTag의 총 개수를 나타내는 용도 */
|
||||
|
|
@ -275,7 +275,7 @@ namespace kiwi
|
|||
* @brief 선행 형태소의 양/음성 조건(모음 조화)과 관련된 열거형
|
||||
*
|
||||
*/
|
||||
enum class CondPolarity : char
|
||||
enum class CondPolarity : uint8_t
|
||||
{
|
||||
none, /**< 조건이 설정되지 않음 */
|
||||
positive, /**< 선행 형태소가 양성(ㅏ,ㅑ,ㅗ)인 경우만 등장 가능 */
|
||||
|
|
|
|||
18
src/Form.cpp
18
src/Form.cpp
|
|
@ -1,4 +1,4 @@
|
|||
#include <cassert>
|
||||
#include <cassert>
|
||||
#include <algorithm>
|
||||
#include <kiwi/Utils.h>
|
||||
#include <kiwi/Form.h>
|
||||
|
|
@ -70,7 +70,10 @@ namespace kiwi
|
|||
|
||||
DEFINE_SERIALIZER_OUTSIDE(FormRaw, form, candidate);
|
||||
|
||||
Form::Form() = default;
|
||||
// Default constructor. Explicitly clears both one-bit flags
// (zCodaAppendable, zSiotAppendable) in the initializer list.
// NOTE(review): presumably written this way because the two members are
// declared as bit-fields, which cannot carry in-class default initializers
// before C++20 - confirm against the project's language standard.
Form::Form()
|
||||
: zCodaAppendable(0), zSiotAppendable(0)
|
||||
{
|
||||
}
|
||||
|
||||
Form::~Form() = default;
|
||||
|
||||
|
|
@ -87,7 +90,7 @@ namespace kiwi
|
|||
return ComparatorIgnoringSpace::less(form, o.form);
|
||||
}
|
||||
|
||||
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands)
|
||||
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, bool zSiotAppendable, const Vector<uint32_t>& additionalCands)
|
||||
{
|
||||
Form ret;
|
||||
ret.numSpaces = count(o.form.begin(), o.form.end(), u' ');
|
||||
|
|
@ -102,6 +105,7 @@ namespace kiwi
|
|||
ret.candidate[i + o.candidate.size()] = morphBase + additionalCands[i];
|
||||
}
|
||||
ret.zCodaAppendable = zCodaAppendable ? 1 : 0;
|
||||
ret.zSiotAppendable = zSiotAppendable ? 1 : 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -112,7 +116,6 @@ namespace kiwi
|
|||
ret.tag = o.tag;
|
||||
ret.vowel = o.vowel();
|
||||
ret.polar = o.polar();
|
||||
ret.complex = o.complex();
|
||||
ret.combineSocket = o.combineSocket;
|
||||
ret.combined = o.combined;
|
||||
ret.userScore = o.userScore;
|
||||
|
|
@ -120,11 +123,18 @@ namespace kiwi
|
|||
ret.origMorphemeId = o.origMorphemeId;
|
||||
ret.senseId = o.senseId;
|
||||
ret.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ o.chunks.size() };
|
||||
|
||||
bool hasSaisiot = false;
|
||||
for (size_t i = 0; i < o.chunks.size(); ++i)
|
||||
{
|
||||
ret.chunks[i] = morphBase + o.chunks[i];
|
||||
ret.chunks.getSecond(i) = o.chunkPositions[i];
|
||||
hasSaisiot = hasSaisiot || (morphBase[o.chunks[i]].tag == POSTag::z_siot);
|
||||
}
|
||||
// 사이시옷이 포함된 경우는 saisiot을 true로, 그 외에는 complex를 true로 설정
|
||||
ret.complex = o.complex() && !hasSaisiot;
|
||||
ret.saisiot = o.complex() && hasSaisiot;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -158,19 +158,16 @@ namespace kiwi
|
|||
};
|
||||
|
||||
template<bool typoTolerant>
|
||||
bool getZCodaAppendable(
|
||||
const Form* foundCand,
|
||||
const Form* formBase
|
||||
)
|
||||
const Form& getForm(const Form* foundCand, const Form* formBase)
|
||||
{
|
||||
if (typoTolerant)
|
||||
{
|
||||
auto tCand = reinterpret_cast<const TypoForm*>(foundCand);
|
||||
return tCand->form(formBase).zCodaAppendable;
|
||||
return tCand->form(formBase);
|
||||
}
|
||||
else
|
||||
{
|
||||
return foundCand->zCodaAppendable;
|
||||
return *foundCand;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -229,23 +226,6 @@ namespace kiwi
|
|||
return true;
|
||||
}
|
||||
|
||||
template<bool typoTolerant>
|
||||
size_t getFormLength(
|
||||
const Form* form,
|
||||
const Form* formBase
|
||||
)
|
||||
{
|
||||
if (typoTolerant)
|
||||
{
|
||||
auto tCand = reinterpret_cast<const TypoForm*>(form);
|
||||
return tCand->form(formBase).form.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
return form->form.size();
|
||||
}
|
||||
}
|
||||
|
||||
inline void removeUnconnected(Vector<KGraphNode>& ret, const Vector<KGraphNode>& graph, const Vector<std::pair<uint32_t, uint32_t>>& endPosMap)
|
||||
{
|
||||
thread_local Vector<uint8_t> connectedList;
|
||||
|
|
@ -549,7 +529,7 @@ namespace kiwi
|
|||
if (!cand) break;
|
||||
else if (!trie.hasSubmatch(cand))
|
||||
{
|
||||
if (getFormLength<typoTolerant>(cand, formBase) <= 1) break;
|
||||
if (getForm<typoTolerant>(cand, formBase).form.size() <= 1) break;
|
||||
inserted = true;
|
||||
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, boundary, continualTypoCost / 2)) break;
|
||||
}
|
||||
|
|
@ -753,7 +733,7 @@ size_t kiwi::splitByTrie(
|
|||
}
|
||||
};
|
||||
|
||||
bool zCodaFollowable = false;
|
||||
bool zCodaFollowable = false, zSiotFollowable = false;
|
||||
const Form* const fallbackFormBegin = trie.value((size_t)POSTag::nng);
|
||||
const Form* const fallbackFormEnd = trie.value((size_t)POSTag::max);
|
||||
for (; n < str.size(); ++n)
|
||||
|
|
@ -1006,7 +986,12 @@ size_t kiwi::splitByTrie(
|
|||
{
|
||||
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
|
||||
}
|
||||
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
|
||||
{
|
||||
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
|
||||
}
|
||||
zCodaFollowable = false;
|
||||
zSiotFollowable = false;
|
||||
|
||||
// invalidate typo nodes
|
||||
if (continualTypoTolerant)
|
||||
|
|
@ -1107,7 +1092,12 @@ size_t kiwi::splitByTrie(
|
|||
{
|
||||
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
|
||||
}
|
||||
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
|
||||
{
|
||||
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
|
||||
}
|
||||
zCodaFollowable = false;
|
||||
zSiotFollowable = false;
|
||||
|
||||
if (continualTypoTolerant && lastChrType == POSTag::max)
|
||||
{
|
||||
|
|
@ -1128,7 +1118,8 @@ size_t kiwi::splitByTrie(
|
|||
if (!cand) break;
|
||||
else if (!trie.hasSubmatch(cand))
|
||||
{
|
||||
zCodaFollowable = zCodaFollowable || getZCodaAppendable<typoTolerant>(cand, formBase);
|
||||
zCodaFollowable = zCodaFollowable || getForm<typoTolerant>(cand, formBase).zCodaAppendable;
|
||||
zSiotFollowable = zSiotFollowable || getForm<typoTolerant>(cand, formBase).zSiotAppendable;
|
||||
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
30
src/Kiwi.cpp
30
src/Kiwi.cpp
|
|
@ -541,7 +541,12 @@ namespace kiwi
|
|||
template<class TokenInfoIt>
|
||||
TokenInfoIt joinAffixTokens(TokenInfoIt first, TokenInfoIt last, Match matchOptions)
|
||||
{
|
||||
if (!(matchOptions & (Match::joinNounPrefix | Match::joinNounSuffix | Match::joinVerbSuffix | Match::joinAdjSuffix | Match::joinAdvSuffix))) return last;
|
||||
if (!(matchOptions & (Match::joinNounPrefix
|
||||
| Match::joinNounSuffix
|
||||
| Match::joinVerbSuffix
|
||||
| Match::joinAdjSuffix
|
||||
| Match::joinAdvSuffix
|
||||
| Match::mergeSaisiot))) return last;
|
||||
if (std::distance(first, last) < 2) return last;
|
||||
|
||||
auto next = first;
|
||||
|
|
@ -554,7 +559,7 @@ namespace kiwi
|
|||
// XPN + (NN. | SN) => (NN. | SN)
|
||||
if (!!(matchOptions & Match::joinNounPrefix)
|
||||
&& current.tag == POSTag::xpn
|
||||
&& ((POSTag::nng <= nextToken.tag && nextToken.tag <= POSTag::nnb) || nextToken.tag == POSTag::sn)
|
||||
&& (isNNClass(nextToken.tag) || nextToken.tag == POSTag::sn)
|
||||
)
|
||||
{
|
||||
concatTokens(current, nextToken, nextToken.tag);
|
||||
|
|
@ -563,7 +568,7 @@ namespace kiwi
|
|||
// (NN. | SN) + XSN => (NN. | SN)
|
||||
else if (!!(matchOptions & Match::joinNounSuffix)
|
||||
&& nextToken.tag == POSTag::xsn
|
||||
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::sn)
|
||||
&& (isNNClass(current.tag) || current.tag == POSTag::sn)
|
||||
)
|
||||
{
|
||||
concatTokens(current, nextToken, current.tag);
|
||||
|
|
@ -572,7 +577,7 @@ namespace kiwi
|
|||
// (NN. | XR) + XSV => VV
|
||||
else if (!!(matchOptions & Match::joinVerbSuffix)
|
||||
&& clearIrregular(nextToken.tag) == POSTag::xsv
|
||||
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
|
||||
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
|
||||
)
|
||||
{
|
||||
concatTokens(current, nextToken, setIrregular(POSTag::vv, isIrregular(nextToken.tag)));
|
||||
|
|
@ -581,7 +586,7 @@ namespace kiwi
|
|||
// (NN. | XR) + XSA => VA
|
||||
else if (!!(matchOptions & Match::joinAdjSuffix)
|
||||
&& clearIrregular(nextToken.tag) == POSTag::xsa
|
||||
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
|
||||
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
|
||||
)
|
||||
{
|
||||
concatTokens(current, nextToken, setIrregular(POSTag::va, isIrregular(nextToken.tag)));
|
||||
|
|
@ -590,12 +595,24 @@ namespace kiwi
|
|||
// (NN. | XR) + XSM => MAG
|
||||
else if (!!(matchOptions & Match::joinAdvSuffix)
|
||||
&& nextToken.tag == POSTag::xsm
|
||||
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
|
||||
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
|
||||
)
|
||||
{
|
||||
concatTokens(current, nextToken, POSTag::mag);
|
||||
++next;
|
||||
}
|
||||
// NN. + Z_SIOT + NN. => NN
|
||||
else if (!!(matchOptions & Match::mergeSaisiot)
|
||||
&& nextToken.tag == POSTag::z_siot
|
||||
&& isNNClass(current.tag)
|
||||
&& next + 1 != last
|
||||
&& isNNClass((next + 1)->tag))
|
||||
{
|
||||
current.str.back() += (0x11BA - 0x11A7);
|
||||
concatTokens(current, *(next + 1), POSTag::nng);
|
||||
++next;
|
||||
++next;
|
||||
}
|
||||
else
|
||||
{
|
||||
++first;
|
||||
|
|
@ -1047,6 +1064,7 @@ namespace kiwi
|
|||
topN,
|
||||
false,
|
||||
!!(matchOptions & Match::splitComplex),
|
||||
!!(matchOptions & Match::splitSaisiot),
|
||||
blocklist
|
||||
);
|
||||
insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
|
||||
|
|
|
|||
|
|
@ -87,6 +87,10 @@ auto KiwiBuilder::loadMorphemesFromTxt(std::istream& is, Fn&& filter) -> Morphem
|
|||
UnorderedMap<pair<KString, POSTag>, Vector<uint8_t>> morphSenseMap;
|
||||
UnorderedMap<pair<KString, POSTag>, size_t> groupMap;
|
||||
|
||||
// add Z_SIOT to morphMap
|
||||
morphMap.emplace(make_tuple(u"\x11BA", undefSenseId, POSTag::z_siot), make_pair(defaultTagSize + 28, defaultTagSize + 28));
|
||||
morphSenseMap.emplace(make_pair(u"\x11BA", POSTag::z_siot), Vector<uint8_t>{ undefSenseId });
|
||||
|
||||
const auto& insertMorph = [&](KString&& form, float score, POSTag tag, CondVowel cvowel, CondPolarity cpolar, bool complex, uint8_t senseId, size_t origMorphemeId = 0, size_t groupId = 0)
|
||||
{
|
||||
auto& fm = addForm(form);
|
||||
|
|
@ -540,7 +544,7 @@ auto KiwiBuilder::restoreMorphemeMap(bool separateDefaultMorpheme) const -> Morp
|
|||
{
|
||||
id = i;
|
||||
}
|
||||
else if (separateDefaultMorpheme && id < defaultFormSize + 2)
|
||||
else if (separateDefaultMorpheme && id < defaultFormSize + 3)
|
||||
{
|
||||
id = i;
|
||||
}
|
||||
|
|
@ -626,7 +630,7 @@ void KiwiBuilder::_addCorpusTo(
|
|||
if ((morph.chunks.empty() || morph.complex()) && !morph.combineSocket)
|
||||
{
|
||||
if (it->second.first != it->second.second
|
||||
&& it->second.first < defaultFormSize + 2
|
||||
&& it->second.first < defaultFormSize + 3
|
||||
&& morphemes[it->second.second].complex()
|
||||
)
|
||||
{
|
||||
|
|
@ -788,7 +792,7 @@ KiwiBuilder::KiwiBuilder(const string& modelPath, size_t _numThreads, BuildOptio
|
|||
void KiwiBuilder::initMorphemes()
|
||||
{
|
||||
forms.resize(defaultFormSize);
|
||||
morphemes.resize(defaultFormSize + 2); // additional places for <s>, </s>
|
||||
morphemes.resize(defaultFormSize + 3); // additional places for <s>, </s>, 사이시옷
|
||||
for (size_t i = 1; i < defaultTagSize; ++i)
|
||||
{
|
||||
forms[i - 1].candidate.emplace_back(i + 1);
|
||||
|
|
@ -803,7 +807,12 @@ void KiwiBuilder::initMorphemes()
|
|||
morphemes[i + defaultTagSize + 1].kform = i + defaultTagSize - 1;
|
||||
morphemes[i + defaultTagSize + 1].userScore = -1.5f;
|
||||
}
|
||||
|
||||
// set value for 사이시옷
|
||||
static constexpr size_t siot = (0x11BA - 0x11A8);
|
||||
forms[defaultTagSize + siot - 1].candidate.emplace_back(defaultTagSize + 28);
|
||||
morphemes[defaultTagSize + 28].tag = POSTag::z_siot;
|
||||
morphemes[defaultTagSize + 28].kform = defaultTagSize + siot - 1;
|
||||
morphemes[defaultTagSize + 28].userScore = -1.5f;
|
||||
}
|
||||
|
||||
KiwiBuilder::KiwiBuilder(const ModelBuildArgs& args)
|
||||
|
|
@ -1017,8 +1026,8 @@ KiwiBuilder::KiwiBuilder(const string& modelPath, const ModelBuildArgs& args)
|
|||
|
||||
auto sbgPairFilter = [&](size_t a, size_t b)
|
||||
{
|
||||
if (a <= (int)POSTag::vcn + 1 || ((int)POSTag::w_serial + 1 < a && a < defaultFormSize + 2)) return false;
|
||||
if ((1 < b && b < (int)POSTag::vcn + 1) || ((int)POSTag::w_serial + 1 < b && b < defaultFormSize + 2)) return false;
|
||||
if (a <= (int)POSTag::vcn + 1 || ((int)POSTag::w_serial + 1 < a && a < defaultFormSize + 3)) return false;
|
||||
if ((1 < b && b < (int)POSTag::vcn + 1) || ((int)POSTag::w_serial + 1 < b && b < defaultFormSize + 3)) return false;
|
||||
return true;
|
||||
};
|
||||
|
||||
|
|
@ -1364,13 +1373,15 @@ void KiwiBuilder::addCombinedMorphemes(
|
|||
else return newForms[id - forms.size()];
|
||||
};
|
||||
|
||||
auto& leftForm = getForm(getMorph(leftId).kform).form;
|
||||
auto& rightForm = getForm(getMorph(rightId).kform).form;
|
||||
const auto& leftMorph = getMorph(leftId);
|
||||
const auto& leftForm = getForm(leftMorph.kform).form;
|
||||
const auto& rightMorph = getMorph(rightId);
|
||||
const auto& rightForm = getForm(rightMorph.kform).form;
|
||||
|
||||
auto res = combiningRule->combine(leftForm, rightForm, ruleId);
|
||||
for (auto& r : res)
|
||||
{
|
||||
if (!r.ignoreRCond && !FeatureTestor::isMatched(&leftForm, getMorph(rightId).vowel()))
|
||||
if (!r.ignoreRCond && !FeatureTestor::isMatched(&leftForm, rightMorph.vowel()))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
|
@ -1399,7 +1410,7 @@ void KiwiBuilder::buildCombinedMorphemes(
|
|||
|
||||
Vector<Vector<size_t>> combiningLeftCands, combiningRightCands;
|
||||
UnorderedMap<std::tuple<KString, POSTag, CondPolarity>, size_t> combiningSuffices;
|
||||
size_t combiningUpdateIdx = defaultFormSize + 2;
|
||||
size_t combiningUpdateIdx = defaultFormSize + 3;
|
||||
|
||||
auto ruleLeftIds = combiningRule->getRuleIdsByLeftTag();
|
||||
auto ruleRightIds = combiningRule->getRuleIdsByRightTag();
|
||||
|
|
@ -1831,6 +1842,37 @@ namespace kiwi
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * @brief Decides whether a Z_SIOT (saisiot) marker may be appended after a form.
 *
 * @param form              surface form being examined
 * @param candidate         morpheme ids attached to the form; an id below
 *                          `morphemes.size()` indexes `morphemes`, any other id
 *                          indexes `combinedMorphemes` after subtracting that size
 * @param morphemes         base morpheme table
 * @param combinedMorphemes morpheme table addressed by the ids past `morphemes`
 * @return true when the form passes the last-character guard and at least one
 *         candidate is a noun-class morpheme whose `lmMorphemeId` differs from
 *         `getDefaultMorphemeId(tag)`; false otherwise.
 */
inline bool isZSiotAppendable(
|
||||
const KString& form,
|
||||
const Vector<uint32_t>& candidate,
|
||||
const Vector<MorphemeRaw>& morphemes,
|
||||
const Vector<MorphemeRaw>& combinedMorphemes)
|
||||
{
|
||||
// Resolve a morpheme id against the two tables as if they were concatenated.
const auto getMorph = [&](size_t i) -> const MorphemeRaw&
|
||||
{
|
||||
return i < morphemes.size() ? morphemes[i] : combinedMorphemes[i - morphemes.size()];
|
||||
};
|
||||
|
||||
// Reject empty forms and forms whose last code unit is not a Hangul syllable
// or is a Hangul coda. NOTE(review): presumably this restricts the flag to
// forms ending in an open (coda-less) syllable - confirm against the helpers.
if (form.empty() || !isHangulSyllable(form.back()) || isHangulCoda(form.back())) return false;
|
||||
|
||||
for (auto i : candidate)
|
||||
{
|
||||
const auto& m = getMorph(i);
|
||||
const auto tag = m.tag;
|
||||
|
||||
// Only noun-class candidates can make the form saisiot-appendable.
if (!isNNClass(tag))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// One noun whose LM id is not the tag's default id is sufficient.
// NOTE(review): presumably this means the morpheme has its own
// language-model entry rather than the per-tag fallback - confirm.
if (m.lmMorphemeId != getDefaultMorphemeId(tag))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool testSpeicalChr(const u16string& form)
|
||||
{
|
||||
POSTag pos;
|
||||
|
|
@ -1883,14 +1925,16 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
|
|||
{
|
||||
auto it = newFormCands.find(ret.forms.size());
|
||||
bool zCodaAppendable = isZCodaAppendable(f.form, f.candidate, morphemes, combinedMorphemes);
|
||||
bool zSiotAppendable = isZSiotAppendable(f.form, f.candidate, morphemes, combinedMorphemes);
|
||||
if (it == newFormCands.end())
|
||||
{
|
||||
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable));
|
||||
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, zSiotAppendable));
|
||||
}
|
||||
else
|
||||
{
|
||||
zCodaAppendable = zCodaAppendable || isZCodaAppendable(f.form, it->second, morphemes, combinedMorphemes);
|
||||
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, it->second));
|
||||
zSiotAppendable = zSiotAppendable || isZSiotAppendable(f.form, it->second, morphemes, combinedMorphemes);
|
||||
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, zSiotAppendable, it->second));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1898,7 +1942,9 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
|
|||
{
|
||||
bool zCodaAppendable = isZCodaAppendable(f.form, f.candidate, morphemes, combinedMorphemes)
|
||||
|| isZCodaAppendable(f.form, newFormCands[ret.forms.size()], morphemes, combinedMorphemes);
|
||||
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, newFormCands[ret.forms.size()]));
|
||||
bool zSiotAppendable = isZSiotAppendable(f.form, f.candidate, morphemes, combinedMorphemes)
|
||||
|| isZSiotAppendable(f.form, newFormCands[ret.forms.size()], morphemes, combinedMorphemes);
|
||||
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, zSiotAppendable, newFormCands[ret.forms.size()]));
|
||||
}
|
||||
|
||||
Vector<size_t> newFormIdMapper(ret.forms.size());
|
||||
|
|
|
|||
|
|
@ -118,6 +118,7 @@ namespace kiwi
|
|||
const size_t topN,
|
||||
bool openEnd,
|
||||
bool splitComplex = false,
|
||||
bool splitSaisiot = false,
|
||||
const std::unordered_set<const Morpheme*>* blocklist = nullptr
|
||||
);
|
||||
|
||||
|
|
@ -134,6 +135,7 @@ namespace kiwi
|
|||
bool unknownForm,
|
||||
const Vector<SpecialState>& prevSpStates,
|
||||
bool splitComplex = false,
|
||||
bool splitSaisiot = false,
|
||||
const std::unordered_set<const Morpheme*>* blocklist = nullptr
|
||||
);
|
||||
|
||||
|
|
@ -689,6 +691,12 @@ namespace kiwi
|
|||
{
|
||||
for (auto& prevPath : cache[prev - startNode])
|
||||
{
|
||||
// 사이시옷 뒤에 명사가 아닌 태그가 오는 경우 제외
|
||||
if (prevPath.morpheme->tag == POSTag::z_siot && !isNNClass(curMorph->tag))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
float candScore = prevPath.accScore + additionalScore;
|
||||
if (prevPath.combineSocket)
|
||||
{
|
||||
|
|
@ -824,6 +832,7 @@ namespace kiwi
|
|||
bool unknownForm,
|
||||
const Vector<SpecialState>& prevSpStates,
|
||||
bool splitComplex,
|
||||
bool splitSaisiot,
|
||||
const std::unordered_set<const Morpheme*>* blocklist
|
||||
)
|
||||
{
|
||||
|
|
@ -858,6 +867,7 @@ namespace kiwi
|
|||
for (auto& curMorph : cands)
|
||||
{
|
||||
if (splitComplex && curMorph->getCombined()->complex) continue;
|
||||
if (splitSaisiot && curMorph->getCombined()->saisiot) continue;
|
||||
if (blocklist && blocklist->count(curMorph->getCombined())) continue;
|
||||
|
||||
// 덧붙은 받침(zCoda)을 위한 지름길
|
||||
|
|
@ -880,6 +890,26 @@ namespace kiwi
|
|||
}
|
||||
continue;
|
||||
}
|
||||
// 사이시옷(zSiot)을 위한 지름길
|
||||
if (curMorph->tag == POSTag::z_siot)
|
||||
{
|
||||
for (auto* prev = node->getPrev(); prev; prev = prev->getSibling())
|
||||
{
|
||||
for (auto& p : cache[prev - startNode])
|
||||
{
|
||||
auto lastTag = kw->morphemes[p.wid].tag;
|
||||
if (!isNNClass(lastTag)) continue;
|
||||
nCache.emplace_back(p);
|
||||
auto& newPath = nCache.back();
|
||||
newPath.accScore += curMorph->userScore * kw->typoCostWeight;
|
||||
newPath.accTypoCost -= curMorph->userScore;
|
||||
newPath.parent = &p;
|
||||
newPath.morpheme = &kw->morphemes[curMorph->lmMorphemeId];
|
||||
newPath.wid = curMorph->lmMorphemeId;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// if the morpheme has chunk set
|
||||
if (!(curMorph->chunks.empty()|| curMorph->complex))
|
||||
|
|
@ -1062,6 +1092,7 @@ namespace kiwi
|
|||
const size_t topN,
|
||||
bool openEnd,
|
||||
bool splitComplex,
|
||||
bool splitSaisiot,
|
||||
const std::unordered_set<const Morpheme*>* blocklist
|
||||
)
|
||||
{
|
||||
|
|
@ -1115,7 +1146,9 @@ namespace kiwi
|
|||
|
||||
if (node->form)
|
||||
{
|
||||
evalPath<LmState>(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, node->form->candidate, false, uniqStates, splitComplex, blocklist);
|
||||
evalPath<LmState>(kw, startNode, node, topN, cache,
|
||||
ownFormList, i, ownFormId, node->form->candidate,
|
||||
false, uniqStates, splitComplex, splitSaisiot, blocklist);
|
||||
if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m)
|
||||
{
|
||||
return m->combineSocket || (!m->chunks.empty() && !m->complex);
|
||||
|
|
@ -1123,12 +1156,16 @@ namespace kiwi
|
|||
{
|
||||
ownFormList.emplace_back(node->form->form);
|
||||
ownFormId = ownFormList.size();
|
||||
evalPath<LmState>(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeLCands, true, uniqStates, splitComplex, blocklist);
|
||||
evalPath<LmState>(kw, startNode, node, topN, cache,
|
||||
ownFormList, i, ownFormId, unknownNodeLCands,
|
||||
true, uniqStates, splitComplex, splitSaisiot, blocklist);
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
evalPath<LmState>(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeCands, true, uniqStates, splitComplex, blocklist);
|
||||
evalPath<LmState>(kw, startNode, node, topN, cache,
|
||||
ownFormList, i, ownFormId, unknownNodeCands,
|
||||
true, uniqStates, splitComplex, splitSaisiot, blocklist);
|
||||
}
|
||||
|
||||
#ifdef DEBUG_PRINT
|
||||
|
|
|
|||
|
|
@ -622,6 +622,7 @@ namespace kiwi
|
|||
if (tagStr == u"SH") return POSTag::sh;
|
||||
if (tagStr == u"SN") return POSTag::sn;
|
||||
if (tagStr == u"Z_CODA") return POSTag::z_coda;
|
||||
if (tagStr == u"Z_SIOT") return POSTag::z_siot;
|
||||
if (tagStr == u"V") return POSTag::p;
|
||||
if (tagStr == u"A") return POSTag::p;
|
||||
if (tagStr == u"^") return POSTag::unknown;
|
||||
|
|
|
|||
|
|
@ -314,7 +314,7 @@ namespace kiwi
|
|||
"W_URL", "W_EMAIL", "W_MENTION", "W_HASHTAG", "W_SERIAL", "W_EMOJI",
|
||||
"JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
|
||||
"EP", "EF", "EC", "ETN", "ETM",
|
||||
"Z_CODA",
|
||||
"Z_CODA", "Z_SIOT",
|
||||
"USER0", "USER1", "USER2", "USER3", "USER4",
|
||||
"P",
|
||||
"@"
|
||||
|
|
@ -356,7 +356,7 @@ namespace kiwi
|
|||
u"W_URL", u"W_EMAIL", u"W_MENTION", u"W_HASHTAG", u"W_SERIAL", u"W_EMOJI",
|
||||
u"JKS", u"JKC", u"JKG", u"JKO", u"JKB", u"JKV", u"JKQ", u"JX", u"JC",
|
||||
u"EP", u"EF", u"EC", u"ETN", u"ETM",
|
||||
u"Z_CODA",
|
||||
u"Z_CODA", u"Z_SIOT",
|
||||
u"USER0", u"USER1", u"USER2", u"USER3", u"USER4",
|
||||
u"P",
|
||||
u"@"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue