This commit is contained in:
bab2min 2024-10-27 16:35:44 +09:00
commit ec5d992ebe
11 changed files with 174 additions and 61 deletions

View file

@ -109,6 +109,7 @@ namespace kiwi
/**< 선행형태소의 모음조화 조건 */
CondPolarity polar() const { return static_cast<CondPolarity>((vpPack >> 4) & 0x7); }
/**< 추가 분석이 가능한 형태소인지(파생어나 사이시옷이 포함된 합성명사 등) */
bool complex() const { return !!(vpPack & 0x80); }
void setVowel(CondVowel v)
@ -141,8 +142,9 @@ namespace kiwi
const KString* kform = nullptr;
POSTag tag = POSTag::unknown;
CondVowel vowel : 4;
CondPolarity polar : 3;
CondPolarity polar : 2;
bool complex : 1;
bool saisiot : 1;
uint8_t senseId = 0;
uint8_t combineSocket = 0;
int32_t combined = 0;
@ -205,7 +207,8 @@ namespace kiwi
CondVowel vowel = CondVowel::none;
CondPolarity polar = CondPolarity::none;
uint8_t formHash = 0;
uint8_t zCodaAppendable = 0;
uint8_t zCodaAppendable : 1;
uint8_t zSiotAppendable : 1;
Form();
~Form();
@ -251,7 +254,7 @@ namespace kiwi
* @param morphBase
* @return
*/
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands = {});
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, bool zSiotAppendable, const Vector<uint32_t>& additionalCands = {});
/**
* @brief bake하여 .

View file

@ -25,6 +25,8 @@ namespace kiwi
splitComplex = 1 << 22, /**< 더 작은 단위로 분할될 수 있는 형태소는 더 분할하여 매칭한다 */
zCoda = 1 << 23, /**< 어미 및 조사에 덧붙은 받침이 있는 경우 이를 분리하여 z_coda 태그로 매칭한다 */
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
splitSaisiot = 1 << 25, /**< 사이시옷이 포함된 합성명사를 분리하여 매칭한다. */
mergeSaisiot = 1 << 26, /**< 사이시옷이 포함된 것으로 추정되는 명사를 결합하여 매칭한다. */
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | emoji | zCoda,

View file

@ -31,7 +31,12 @@ namespace kiwi
{
return POSTag::jks <= tag && tag <= POSTag::jc;
}
/**
 * @brief Tests whether a tag falls inside the enumerator range
 *        POSTag::nng .. POSTag::nnb (both ends inclusive).
 * @param tag the part-of-speech tag to classify
 * @return true if `tag` is within that range, false otherwise
 */
inline bool isNNClass(POSTag tag)
{
	const bool belowRange = tag < POSTag::nng;
	const bool aboveRange = POSTag::nnb < tag;
	return !(belowRange || aboveRange);
}
inline bool isSuffix(POSTag tag)
{
tag = clearIrregular(tag);

View file

@ -1,4 +1,4 @@
/**
/**
* @file Types.h
* @author bab2min (bab2min@gmail.com)
* @brief Header file collecting the main types used by the Kiwi C++ API
@ -211,7 +211,7 @@ namespace kiwi
w_url, w_email, w_mention, w_hashtag, w_serial, w_emoji,
jks, jkc, jkg, jko, jkb, jkv, jkq, jx, jc,
ep, ef, ec, etn, etm,
z_coda,
z_coda, z_siot,
user0, user1, user2, user3, user4,
p, /**< 분할된 동사/형용사를 나타내는데 사용됨 */
max, /**< POSTag의 총 개수를 나타내는 용도 */
@ -275,7 +275,7 @@ namespace kiwi
* @brief Positive/negative (vowel harmony) condition imposed on the preceding morpheme
*
*/
enum class CondPolarity : char
enum class CondPolarity : uint8_t
{
none, /**< 조건이 설정되지 않음 */
positive, /**< 선행 형태소가 양성(ㅏ,ㅑ,ㅗ)인 경우만 등장 가능 */

View file

@ -1,4 +1,4 @@
#include <cassert>
#include <cassert>
#include <algorithm>
#include <kiwi/Utils.h>
#include <kiwi/Form.h>
@ -70,7 +70,10 @@ namespace kiwi
DEFINE_SERIALIZER_OUTSIDE(FormRaw, form, candidate);
Form::Form() = default;
Form::Form()
: zCodaAppendable(0), zSiotAppendable(0)
{
}
Form::~Form() = default;
@ -87,7 +90,7 @@ namespace kiwi
return ComparatorIgnoringSpace::less(form, o.form);
}
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, const Vector<uint32_t>& additionalCands)
Form bake(const FormRaw& o, const Morpheme* morphBase, bool zCodaAppendable, bool zSiotAppendable, const Vector<uint32_t>& additionalCands)
{
Form ret;
ret.numSpaces = count(o.form.begin(), o.form.end(), u' ');
@ -102,6 +105,7 @@ namespace kiwi
ret.candidate[i + o.candidate.size()] = morphBase + additionalCands[i];
}
ret.zCodaAppendable = zCodaAppendable ? 1 : 0;
ret.zSiotAppendable = zSiotAppendable ? 1 : 0;
return ret;
}
@ -112,7 +116,6 @@ namespace kiwi
ret.tag = o.tag;
ret.vowel = o.vowel();
ret.polar = o.polar();
ret.complex = o.complex();
ret.combineSocket = o.combineSocket;
ret.combined = o.combined;
ret.userScore = o.userScore;
@ -120,11 +123,18 @@ namespace kiwi
ret.origMorphemeId = o.origMorphemeId;
ret.senseId = o.senseId;
ret.chunks = FixedPairVector<const Morpheme*, std::pair<uint8_t, uint8_t>>{ o.chunks.size() };
bool hasSaisiot = false;
for (size_t i = 0; i < o.chunks.size(); ++i)
{
ret.chunks[i] = morphBase + o.chunks[i];
ret.chunks.getSecond(i) = o.chunkPositions[i];
hasSaisiot = hasSaisiot || (morphBase[o.chunks[i]].tag == POSTag::z_siot);
}
// 사이시옷이 포함된 경우는 saisiot을 true로, 그 외에는 complex를 true로 설정
ret.complex = o.complex() && !hasSaisiot;
ret.saisiot = o.complex() && hasSaisiot;
return ret;
}

View file

@ -158,19 +158,16 @@ namespace kiwi
};
template<bool typoTolerant>
bool getZCodaAppendable(
const Form* foundCand,
const Form* formBase
)
const Form& getForm(const Form* foundCand, const Form* formBase)
{
if (typoTolerant)
{
auto tCand = reinterpret_cast<const TypoForm*>(foundCand);
return tCand->form(formBase).zCodaAppendable;
return tCand->form(formBase);
}
else
{
return foundCand->zCodaAppendable;
return *foundCand;
}
}
@ -229,23 +226,6 @@ namespace kiwi
return true;
}
/**
 * @brief Returns the length of the surface string of a matched form.
 *
 * @tparam typoTolerant when true, `form` actually points at a TypoForm and
 *         the real Form is looked up through `formBase`; when false, `form`
 *         is used directly.
 * @param form     pointer to the matched entry (Form or TypoForm, see above)
 * @param formBase base pointer handed to TypoForm::form() for the lookup
 * @return `form.size()` of the resolved Form
 */
template<bool typoTolerant>
size_t getFormLength(
	const Form* form,
	const Form* formBase
)
{
	if (!typoTolerant)
	{
		return form->form.size();
	}

	// In typo-tolerant mode the pointer is a disguised TypoForm.
	const auto* typoEntry = reinterpret_cast<const TypoForm*>(form);
	return typoEntry->form(formBase).form.size();
}
inline void removeUnconnected(Vector<KGraphNode>& ret, const Vector<KGraphNode>& graph, const Vector<std::pair<uint32_t, uint32_t>>& endPosMap)
{
thread_local Vector<uint8_t> connectedList;
@ -549,7 +529,7 @@ namespace kiwi
if (!cand) break;
else if (!trie.hasSubmatch(cand))
{
if (getFormLength<typoTolerant>(cand, formBase) <= 1) break;
if (getForm<typoTolerant>(cand, formBase).form.size() <= 1) break;
inserted = true;
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, boundary, continualTypoCost / 2)) break;
}
@ -753,7 +733,7 @@ size_t kiwi::splitByTrie(
}
};
bool zCodaFollowable = false;
bool zCodaFollowable = false, zSiotFollowable = false;
const Form* const fallbackFormBegin = trie.value((size_t)POSTag::nng);
const Form* const fallbackFormEnd = trie.value((size_t)POSTag::max);
for (; n < str.size(); ++n)
@ -1006,7 +986,12 @@ size_t kiwi::splitByTrie(
{
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
}
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
{
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
}
zCodaFollowable = false;
zSiotFollowable = false;
// invalidate typo nodes
if (continualTypoTolerant)
@ -1107,7 +1092,12 @@ size_t kiwi::splitByTrie(
{
candidates.emplace_back(formBase + defaultTagSize + (c - 0x11A8) - 1, 0, nonSpaces.size() - 1);
}
else if (!!(matchOptions & (Match::splitSaisiot | Match::mergeSaisiot)) && zSiotFollowable && c == 0x11BA && n + 1 < str.size() && isHangulSyllable(str[n + 1]))
{
candidates.emplace_back(formBase + defaultTagSize + (0x11BA - 0x11A8) - 1, 0, nonSpaces.size() - 1);
}
zCodaFollowable = false;
zSiotFollowable = false;
if (continualTypoTolerant && lastChrType == POSTag::max)
{
@ -1128,7 +1118,8 @@ size_t kiwi::splitByTrie(
if (!cand) break;
else if (!trie.hasSubmatch(cand))
{
zCodaFollowable = zCodaFollowable || getZCodaAppendable<typoTolerant>(cand, formBase);
zCodaFollowable = zCodaFollowable || getForm<typoTolerant>(cand, formBase).zCodaAppendable;
zSiotFollowable = zSiotFollowable || getForm<typoTolerant>(cand, formBase).zSiotAppendable;
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break;
}
}

View file

@ -541,7 +541,12 @@ namespace kiwi
template<class TokenInfoIt>
TokenInfoIt joinAffixTokens(TokenInfoIt first, TokenInfoIt last, Match matchOptions)
{
if (!(matchOptions & (Match::joinNounPrefix | Match::joinNounSuffix | Match::joinVerbSuffix | Match::joinAdjSuffix | Match::joinAdvSuffix))) return last;
if (!(matchOptions & (Match::joinNounPrefix
| Match::joinNounSuffix
| Match::joinVerbSuffix
| Match::joinAdjSuffix
| Match::joinAdvSuffix
| Match::mergeSaisiot))) return last;
if (std::distance(first, last) < 2) return last;
auto next = first;
@ -554,7 +559,7 @@ namespace kiwi
// XPN + (NN. | SN) => (NN. | SN)
if (!!(matchOptions & Match::joinNounPrefix)
&& current.tag == POSTag::xpn
&& ((POSTag::nng <= nextToken.tag && nextToken.tag <= POSTag::nnb) || nextToken.tag == POSTag::sn)
&& (isNNClass(nextToken.tag) || nextToken.tag == POSTag::sn)
)
{
concatTokens(current, nextToken, nextToken.tag);
@ -563,7 +568,7 @@ namespace kiwi
// (NN. | SN) + XSN => (NN. | SN)
else if (!!(matchOptions & Match::joinNounSuffix)
&& nextToken.tag == POSTag::xsn
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::sn)
&& (isNNClass(current.tag) || current.tag == POSTag::sn)
)
{
concatTokens(current, nextToken, current.tag);
@ -572,7 +577,7 @@ namespace kiwi
// (NN. | XR) + XSV => VV
else if (!!(matchOptions & Match::joinVerbSuffix)
&& clearIrregular(nextToken.tag) == POSTag::xsv
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
)
{
concatTokens(current, nextToken, setIrregular(POSTag::vv, isIrregular(nextToken.tag)));
@ -581,7 +586,7 @@ namespace kiwi
// (NN. | XR) + XSA => VA
else if (!!(matchOptions & Match::joinAdjSuffix)
&& clearIrregular(nextToken.tag) == POSTag::xsa
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
)
{
concatTokens(current, nextToken, setIrregular(POSTag::va, isIrregular(nextToken.tag)));
@ -590,12 +595,24 @@ namespace kiwi
// (NN. | XR) + XSM => MAG
else if (!!(matchOptions & Match::joinAdvSuffix)
&& nextToken.tag == POSTag::xsm
&& ((POSTag::nng <= current.tag && current.tag <= POSTag::nnb) || current.tag == POSTag::xr)
&& (isNNClass(current.tag) || current.tag == POSTag::xr)
)
{
concatTokens(current, nextToken, POSTag::mag);
++next;
}
// NN. + Z_SIOT + NN. => NN
else if (!!(matchOptions & Match::mergeSaisiot)
&& nextToken.tag == POSTag::z_siot
&& isNNClass(current.tag)
&& next + 1 != last
&& isNNClass((next + 1)->tag))
{
current.str.back() += (0x11BA - 0x11A7);
concatTokens(current, *(next + 1), POSTag::nng);
++next;
++next;
}
else
{
++first;
@ -1047,6 +1064,7 @@ namespace kiwi
topN,
false,
!!(matchOptions & Match::splitComplex),
!!(matchOptions & Match::splitSaisiot),
blocklist
);
insertPathIntoResults(ret, spStatesByRet, res, topN, matchOptions, integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);

View file

@ -87,6 +87,10 @@ auto KiwiBuilder::loadMorphemesFromTxt(std::istream& is, Fn&& filter) -> Morphem
UnorderedMap<pair<KString, POSTag>, Vector<uint8_t>> morphSenseMap;
UnorderedMap<pair<KString, POSTag>, size_t> groupMap;
// add Z_SIOT to morphMap
morphMap.emplace(make_tuple(u"\x11BA", undefSenseId, POSTag::z_siot), make_pair(defaultTagSize + 28, defaultTagSize + 28));
morphSenseMap.emplace(make_pair(u"\x11BA", POSTag::z_siot), Vector<uint8_t>{ undefSenseId });
const auto& insertMorph = [&](KString&& form, float score, POSTag tag, CondVowel cvowel, CondPolarity cpolar, bool complex, uint8_t senseId, size_t origMorphemeId = 0, size_t groupId = 0)
{
auto& fm = addForm(form);
@ -540,7 +544,7 @@ auto KiwiBuilder::restoreMorphemeMap(bool separateDefaultMorpheme) const -> Morp
{
id = i;
}
else if (separateDefaultMorpheme && id < defaultFormSize + 2)
else if (separateDefaultMorpheme && id < defaultFormSize + 3)
{
id = i;
}
@ -626,7 +630,7 @@ void KiwiBuilder::_addCorpusTo(
if ((morph.chunks.empty() || morph.complex()) && !morph.combineSocket)
{
if (it->second.first != it->second.second
&& it->second.first < defaultFormSize + 2
&& it->second.first < defaultFormSize + 3
&& morphemes[it->second.second].complex()
)
{
@ -788,7 +792,7 @@ KiwiBuilder::KiwiBuilder(const string& modelPath, size_t _numThreads, BuildOptio
void KiwiBuilder::initMorphemes()
{
forms.resize(defaultFormSize);
morphemes.resize(defaultFormSize + 2); // additional places for <s>, </s>
morphemes.resize(defaultFormSize + 3); // additional places for <s>, </s>, 사이시옷
for (size_t i = 1; i < defaultTagSize; ++i)
{
forms[i - 1].candidate.emplace_back(i + 1);
@ -803,7 +807,12 @@ void KiwiBuilder::initMorphemes()
morphemes[i + defaultTagSize + 1].kform = i + defaultTagSize - 1;
morphemes[i + defaultTagSize + 1].userScore = -1.5f;
}
// set value for 사이시옷
static constexpr size_t siot = (0x11BA - 0x11A8);
forms[defaultTagSize + siot - 1].candidate.emplace_back(defaultTagSize + 28);
morphemes[defaultTagSize + 28].tag = POSTag::z_siot;
morphemes[defaultTagSize + 28].kform = defaultTagSize + siot - 1;
morphemes[defaultTagSize + 28].userScore = -1.5f;
}
KiwiBuilder::KiwiBuilder(const ModelBuildArgs& args)
@ -1017,8 +1026,8 @@ KiwiBuilder::KiwiBuilder(const string& modelPath, const ModelBuildArgs& args)
auto sbgPairFilter = [&](size_t a, size_t b)
{
if (a <= (int)POSTag::vcn + 1 || ((int)POSTag::w_serial + 1 < a && a < defaultFormSize + 2)) return false;
if ((1 < b && b < (int)POSTag::vcn + 1) || ((int)POSTag::w_serial + 1 < b && b < defaultFormSize + 2)) return false;
if (a <= (int)POSTag::vcn + 1 || ((int)POSTag::w_serial + 1 < a && a < defaultFormSize + 3)) return false;
if ((1 < b && b < (int)POSTag::vcn + 1) || ((int)POSTag::w_serial + 1 < b && b < defaultFormSize + 3)) return false;
return true;
};
@ -1364,13 +1373,15 @@ void KiwiBuilder::addCombinedMorphemes(
else return newForms[id - forms.size()];
};
auto& leftForm = getForm(getMorph(leftId).kform).form;
auto& rightForm = getForm(getMorph(rightId).kform).form;
const auto& leftMorph = getMorph(leftId);
const auto& leftForm = getForm(leftMorph.kform).form;
const auto& rightMorph = getMorph(rightId);
const auto& rightForm = getForm(rightMorph.kform).form;
auto res = combiningRule->combine(leftForm, rightForm, ruleId);
for (auto& r : res)
{
if (!r.ignoreRCond && !FeatureTestor::isMatched(&leftForm, getMorph(rightId).vowel()))
if (!r.ignoreRCond && !FeatureTestor::isMatched(&leftForm, rightMorph.vowel()))
{
continue;
}
@ -1399,7 +1410,7 @@ void KiwiBuilder::buildCombinedMorphemes(
Vector<Vector<size_t>> combiningLeftCands, combiningRightCands;
UnorderedMap<std::tuple<KString, POSTag, CondPolarity>, size_t> combiningSuffices;
size_t combiningUpdateIdx = defaultFormSize + 2;
size_t combiningUpdateIdx = defaultFormSize + 3;
auto ruleLeftIds = combiningRule->getRuleIdsByLeftTag();
auto ruleRightIds = combiningRule->getRuleIdsByRightTag();
@ -1831,6 +1842,37 @@ namespace kiwi
return false;
}
inline bool isZSiotAppendable(
const KString& form,
const Vector<uint32_t>& candidate,
const Vector<MorphemeRaw>& morphemes,
const Vector<MorphemeRaw>& combinedMorphemes)
{
const auto getMorph = [&](size_t i) -> const MorphemeRaw&
{
return i < morphemes.size() ? morphemes[i] : combinedMorphemes[i - morphemes.size()];
};
if (form.empty() || !isHangulSyllable(form.back()) || isHangulCoda(form.back())) return false;
for (auto i : candidate)
{
const auto& m = getMorph(i);
const auto tag = m.tag;
if (!isNNClass(tag))
{
continue;
}
if (m.lmMorphemeId != getDefaultMorphemeId(tag))
{
return true;
}
}
return false;
}
inline bool testSpeicalChr(const u16string& form)
{
POSTag pos;
@ -1883,14 +1925,16 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
{
auto it = newFormCands.find(ret.forms.size());
bool zCodaAppendable = isZCodaAppendable(f.form, f.candidate, morphemes, combinedMorphemes);
bool zSiotAppendable = isZSiotAppendable(f.form, f.candidate, morphemes, combinedMorphemes);
if (it == newFormCands.end())
{
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable));
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, zSiotAppendable));
}
else
{
zCodaAppendable = zCodaAppendable || isZCodaAppendable(f.form, it->second, morphemes, combinedMorphemes);
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, it->second));
zSiotAppendable = zSiotAppendable || isZSiotAppendable(f.form, it->second, morphemes, combinedMorphemes);
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, zSiotAppendable, it->second));
}
}
@ -1898,7 +1942,9 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
{
bool zCodaAppendable = isZCodaAppendable(f.form, f.candidate, morphemes, combinedMorphemes)
|| isZCodaAppendable(f.form, newFormCands[ret.forms.size()], morphemes, combinedMorphemes);
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, newFormCands[ret.forms.size()]));
bool zSiotAppendable = isZSiotAppendable(f.form, f.candidate, morphemes, combinedMorphemes)
|| isZSiotAppendable(f.form, newFormCands[ret.forms.size()], morphemes, combinedMorphemes);
ret.forms.emplace_back(bake(f, ret.morphemes.data(), zCodaAppendable, zSiotAppendable, newFormCands[ret.forms.size()]));
}
Vector<size_t> newFormIdMapper(ret.forms.size());

View file

@ -118,6 +118,7 @@ namespace kiwi
const size_t topN,
bool openEnd,
bool splitComplex = false,
bool splitSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr
);
@ -134,6 +135,7 @@ namespace kiwi
bool unknownForm,
const Vector<SpecialState>& prevSpStates,
bool splitComplex = false,
bool splitSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr
);
@ -689,6 +691,12 @@ namespace kiwi
{
for (auto& prevPath : cache[prev - startNode])
{
// 사이시옷 뒤에 명사가 아닌 태그가 오는 경우 제외
if (prevPath.morpheme->tag == POSTag::z_siot && !isNNClass(curMorph->tag))
{
continue;
}
float candScore = prevPath.accScore + additionalScore;
if (prevPath.combineSocket)
{
@ -824,6 +832,7 @@ namespace kiwi
bool unknownForm,
const Vector<SpecialState>& prevSpStates,
bool splitComplex,
bool splitSaisiot,
const std::unordered_set<const Morpheme*>* blocklist
)
{
@ -858,6 +867,7 @@ namespace kiwi
for (auto& curMorph : cands)
{
if (splitComplex && curMorph->getCombined()->complex) continue;
if (splitSaisiot && curMorph->getCombined()->saisiot) continue;
if (blocklist && blocklist->count(curMorph->getCombined())) continue;
// 덧붙은 받침(zCoda)을 위한 지름길
@ -880,6 +890,26 @@ namespace kiwi
}
continue;
}
// 사이시옷(zSiot)을 위한 지름길
if (curMorph->tag == POSTag::z_siot)
{
for (auto* prev = node->getPrev(); prev; prev = prev->getSibling())
{
for (auto& p : cache[prev - startNode])
{
auto lastTag = kw->morphemes[p.wid].tag;
if (!isNNClass(lastTag)) continue;
nCache.emplace_back(p);
auto& newPath = nCache.back();
newPath.accScore += curMorph->userScore * kw->typoCostWeight;
newPath.accTypoCost -= curMorph->userScore;
newPath.parent = &p;
newPath.morpheme = &kw->morphemes[curMorph->lmMorphemeId];
newPath.wid = curMorph->lmMorphemeId;
}
}
continue;
}
// if the morpheme has chunk set
if (!(curMorph->chunks.empty()|| curMorph->complex))
@ -1062,6 +1092,7 @@ namespace kiwi
const size_t topN,
bool openEnd,
bool splitComplex,
bool splitSaisiot,
const std::unordered_set<const Morpheme*>* blocklist
)
{
@ -1115,7 +1146,9 @@ namespace kiwi
if (node->form)
{
evalPath<LmState>(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, node->form->candidate, false, uniqStates, splitComplex, blocklist);
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, node->form->candidate,
false, uniqStates, splitComplex, splitSaisiot, blocklist);
if (all_of(node->form->candidate.begin(), node->form->candidate.end(), [](const Morpheme* m)
{
return m->combineSocket || (!m->chunks.empty() && !m->complex);
@ -1123,12 +1156,16 @@ namespace kiwi
{
ownFormList.emplace_back(node->form->form);
ownFormId = ownFormList.size();
evalPath<LmState>(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeLCands, true, uniqStates, splitComplex, blocklist);
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, unknownNodeLCands,
true, uniqStates, splitComplex, splitSaisiot, blocklist);
};
}
else
{
evalPath<LmState>(kw, startNode, node, topN, cache, ownFormList, i, ownFormId, unknownNodeCands, true, uniqStates, splitComplex, blocklist);
evalPath<LmState>(kw, startNode, node, topN, cache,
ownFormList, i, ownFormId, unknownNodeCands,
true, uniqStates, splitComplex, splitSaisiot, blocklist);
}
#ifdef DEBUG_PRINT

View file

@ -622,6 +622,7 @@ namespace kiwi
if (tagStr == u"SH") return POSTag::sh;
if (tagStr == u"SN") return POSTag::sn;
if (tagStr == u"Z_CODA") return POSTag::z_coda;
if (tagStr == u"Z_SIOT") return POSTag::z_siot;
if (tagStr == u"V") return POSTag::p;
if (tagStr == u"A") return POSTag::p;
if (tagStr == u"^") return POSTag::unknown;

View file

@ -314,7 +314,7 @@ namespace kiwi
"W_URL", "W_EMAIL", "W_MENTION", "W_HASHTAG", "W_SERIAL", "W_EMOJI",
"JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC",
"EP", "EF", "EC", "ETN", "ETM",
"Z_CODA",
"Z_CODA", "Z_SIOT",
"USER0", "USER1", "USER2", "USER3", "USER4",
"P",
"@"
@ -356,7 +356,7 @@ namespace kiwi
u"W_URL", u"W_EMAIL", u"W_MENTION", u"W_HASHTAG", u"W_SERIAL", u"W_EMOJI",
u"JKS", u"JKC", u"JKG", u"JKO", u"JKB", u"JKV", u"JKQ", u"JX", u"JC",
u"EP", u"EF", u"EC", u"ETN", u"ETM",
u"Z_CODA",
u"Z_CODA", u"Z_SIOT",
u"USER0", u"USER1", u"USER2", u"USER3", u"USER4",
u"P",
u"@"