Compare commits

...

2 commits

Author SHA1 Message Date
bab2min
ccc27d080a Add multi morpheme token support to UnigramSwTrainer 2024-07-01 01:09:23 +09:00
bab2min
7b785719b0 Add support for multimorph tokens type 2024-06-14 01:17:20 +09:00
3 changed files with 1187 additions and 303 deletions

View file

@ -2,7 +2,7 @@
* @file SwTokenizer.h
* @author bab2min (bab2min@gmail.com)
* @brief Subword Tokenizer
* @version 0.16.1
* @version 0.18.0
* @date 2022-07-28
*
*
@ -31,6 +31,7 @@ namespace kiwi
glue = 2,
subword = 3,
byte = 4,
multimorph = 5,
punct,
chinese,
};
@ -39,15 +40,22 @@ namespace kiwi
{
const char16_t* form = nullptr;
uint32_t length = 0;
uint32_t internalFormOffset = 0;
uint32_t internalLength = 0;
POSTag pos = POSTag::unknown;
SwTokenFlag flags = SwTokenFlag::none;
uint8_t byte = 0;
SwToken(const char16_t* _form = nullptr, size_t _length = 0,
POSTag _pos = POSTag::unknown, SwTokenFlag _flags = SwTokenFlag::none, uint8_t _byte = 0)
: form{ _form }, length{ (uint32_t)_length }, pos{ _pos }, flags{ _flags }, byte{ _byte }
SwToken(const char16_t* _form = nullptr,
size_t _length = 0,
POSTag _pos = POSTag::unknown,
SwTokenFlag _flags = SwTokenFlag::none,
uint8_t _byte = 0)
: form{ _form }, length{ (uint32_t)_length }, internalLength{ (uint32_t)_length }, pos { _pos }, flags{ _flags }, byte{ _byte }
{
}
const char16_t* internalForm() const { return form + internalFormOffset; }
};
struct SwTokenizerConfig
@ -57,6 +65,7 @@ namespace kiwi
unk, cls, sep, pad, mask, bos, eos, glue
};
std::array<std::string, eos + 1> specialTokens;
std::string additionalJson;
bool doLowercase = false;
bool splitChinese = true;
bool wholeWordUnk = false;
@ -70,7 +79,7 @@ namespace kiwi
bool strict = false; // not implemented yet
bool fallbackHangul = true;
bool fallbackByte = false;
std::string additionalJson;
SwTokenizerConfig()
{
@ -97,6 +106,7 @@ namespace kiwi
bool reduceStrict = false;
bool removeRepetitive = true;
bool preventMixedDigitTokens = true;
size_t maxMultiMorphSize = 0;
};
class SwTokenizer;
@ -163,6 +173,7 @@ namespace kiwi
void* dfTokenizeSubword = nullptr;
void* dfTokenizeSubwordWithOffset = nullptr;
const Kiwi* kiwi = nullptr;
bool multiMorphMode = false;
SwTokenizerConfig config;
Vocab vocab;
utils::FrozenTrie<kchar_t, uint32_t> trie;
@ -170,6 +181,8 @@ namespace kiwi
Vector<float> tokenLProbs;
Vector<uint32_t> morphToSw;
Vector<uint32_t> swToMorph;
Vector<uint32_t> codeToMorph;
UnorderedMap<uint32_t, uint32_t> morphToCode;
Vector<uint32_t> hangulFallbackChrs;
Vector<uint32_t> byteFallbackChrs;
std::array<size_t, SwTokenizerConfig::glue + 1> specialTokenIds = { { 0, } };
@ -186,6 +199,12 @@ namespace kiwi
template<class TokenIt>
void encode(std::vector<uint32_t>& out, TokenIt first, TokenIt last, std::vector<std::pair<uint32_t, uint32_t>>* offset = nullptr) const;
template<class TokenIt>
void encodeWithoutMultiMorph(std::vector<uint32_t>& out, TokenIt first, TokenIt last, std::vector<std::pair<uint32_t, uint32_t>>* offset = nullptr) const;
template<class TokenIt>
void encodeWithMultiMorph(std::vector<uint32_t>& out, TokenIt first, TokenIt last, std::vector<std::pair<uint32_t, uint32_t>>* offset = nullptr) const;
template<class It>
std::string decode(It first, It last, bool ignoreErrors = true) const;
@ -263,11 +282,13 @@ namespace kiwi
UnorderedMap<std::u16string, size_t> wordMap;
Vector<std::pair<const std::u16string, size_t>*> invWordMap;
Vector<size_t> wordCnts;
Vector<size_t> wordCnts, morphCnts;
UnorderedMap<size_t, WordCand> wordSuffix;
UnorderedMap<std::pair<KString, POSTag>, const Morpheme*> reprMorphMap;
HiddenMember<RaggedVector<int32_t>, sizeof(Vector<size_t>) * 2> sents;
UnorderedMap<std::pair<KString, POSTag>, size_t> morphCodeMap;
Vector<const Morpheme*> invMorphCodeMap;
Vector<size_t> tokenFreqs;
UnorderedMap<char32_t, std::u16string> altReprForChrMorphCode;
Vector<std::u16string> chrPrefix;
utils::FrozenTrie<char16_t, size_t> chrTrie;
@ -278,10 +299,28 @@ namespace kiwi
Vector<PrefixAvailability> prefixAvailable;
void addWord(const std::u16string& s, const Vector<const Morpheme*>& morphs, const Vector<size_t>& boundaries, bool spacePrefix);
void addWord(const TokenInfo* first, const TokenInfo* last, bool spacePrefix);
void addKiwiResult(std::vector<TokenInfo>& tokens,
const Vector<const Morpheme*>& verbalSuffices,
const Vector<const Morpheme*>& eomiSuffices,
const UnorderedMap<Vector<const Morpheme*>, const Morpheme*>& complexMorphemes);
void addKiwiResultWithoutMultiMorph(std::vector<TokenInfo>& tokens,
const Vector<const Morpheme*>& verbalSuffices,
const Vector<const Morpheme*>& eomiSuffices,
const UnorderedMap<Vector<const Morpheme*>, const Morpheme*>& complexMorphemes);
void addKiwiResultWithMultiMorph(std::vector<TokenInfo>& tokens,
const Vector<const Morpheme*>& verbalSuffices,
const Vector<const Morpheme*>& eomiSuffices,
const UnorderedMap<Vector<const Morpheme*>, const Morpheme*>& complexMorphemes);
template<class Feeder>
size_t _addSentences(Feeder&& feeder);
void initAltReprForChrMorphCode();
Vector<uint32_t> tokenizeShort(U16StringView s, bool spacePrefix = false) const;
Vector<uint32_t> tokenizeShort(U16StringView s, const Vector<int32_t>& boundaries) const;
std::pair<Vector<uint32_t>, float> tokenizeBest(U16StringView s, bool spacePrefix = false, const Vector<int32_t>* boundaries = nullptr) const;
@ -289,6 +328,8 @@ namespace kiwi
const Morpheme* toReprMorph(const Morpheme* m);
char32_t morphToChrMorphCode(const Morpheme* m);
public:
UnigramSwTrainer(const Kiwi& kiwi, const SwTokenizerConfig& config, const UnigramSwTrainerConfig& trainConfig);
UnigramSwTrainer(const UnigramSwTrainer&);

View file

@ -161,6 +161,38 @@ namespace kiwi
return replace(s, nonstd::basic_string_view<BaseChr, Trait>{ from, fromSize - 1 }, nonstd::basic_string_view<BaseChr, Trait>{ to, toSize - 1 });
}
/**
 * @brief Encodes one code point as UTF-8 and writes the resulting bytes through an output iterator.
 *
 * Values below 0x80 produce one byte, below 0x800 two bytes, below 0x10000 three bytes
 * and below 0x110000 four bytes. Values in the surrogate range are not treated specially;
 * they go through the three-byte form like any other value below 0x10000.
 *
 * @tparam OutTy output iterator type that accepts `char` assignments (e.g. `std::back_inserter` of a string)
 * @param c code point to encode
 * @param out iterator to write the encoded bytes to
 * @return the iterator advanced past the last byte written
 * @throws UnicodeException if `c` is 0x110000 or greater; nothing is written in that case
 */
template<class OutTy>
inline OutTy char32To8(char32_t c, OutTy out)
{
	// Reject out-of-range values up front, before any byte is emitted.
	if (c >= 0x110000)
	{
		throw UnicodeException{ "unicode error" };
	}

	// Single-byte (ASCII) form needs no lead/continuation split.
	if (c < 0x80)
	{
		*out++ = static_cast<char>(c);
		return out;
	}

	// Number of continuation bytes that follow the lead byte.
	const int numTrailing = (c < 0x800) ? 1 : ((c < 0x10000) ? 2 : 3);

	// Lead byte marker: 110xxxxx, 1110xxxx or 11110xxx depending on the length.
	const char32_t leadMark = (numTrailing == 1) ? 0xC0 : ((numTrailing == 2) ? 0xE0 : 0xF0);
	*out++ = static_cast<char>(leadMark | (c >> (6 * numTrailing)));

	// Continuation bytes carry 6 payload bits each, most significant group first.
	for (int shift = 6 * (numTrailing - 1); shift >= 0; shift -= 6)
	{
		*out++ = static_cast<char>(0x80 | ((c >> shift) & 0x3F));
	}
	return out;
}
inline void utf8To16(nonstd::string_view str, std::u16string& ret)
{
@ -418,32 +450,7 @@ namespace kiwi
code = mergeSurrogate(code, code2);
}
if (code <= 0x7F)
{
ret.push_back((char)code);
}
else if (code <= 0x7FF)
{
ret.push_back((char)(0xC0 | (code >> 6)));
ret.push_back((char)(0x80 | (code & 0x3F)));
}
else if (code <= 0xFFFF)
{
ret.push_back((char)(0xE0 | (code >> 12)));
ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
ret.push_back((char)(0x80 | (code & 0x3F)));
}
else if (code <= 0x10FFFF)
{
ret.push_back((char)(0xF0 | (code >> 18)));
ret.push_back((char)(0x80 | ((code >> 12) & 0x3F)));
ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
ret.push_back((char)(0x80 | (code & 0x3F)));
}
else
{
throw UnicodeException{ "unicode error" };
}
char32To8(code, std::back_inserter(ret));
}
return ret;
@ -467,32 +474,7 @@ namespace kiwi
code = mergeSurrogate(code, code2);
}
if (code <= 0x7F)
{
ret.push_back((char)code);
}
else if (code <= 0x7FF)
{
ret.push_back((char)(0xC0 | (code >> 6)));
ret.push_back((char)(0x80 | (code & 0x3F)));
}
else if (code <= 0xFFFF)
{
ret.push_back((char)(0xE0 | (code >> 12)));
ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
ret.push_back((char)(0x80 | (code & 0x3F)));
}
else if (code <= 0x10FFFF)
{
ret.push_back((char)(0xF0 | (code >> 18)));
ret.push_back((char)(0x80 | ((code >> 12) & 0x3F)));
ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
ret.push_back((char)(0x80 | (code & 0x3F)));
}
else
{
throw UnicodeException{ "unicode error" };
}
char32To8(code, std::back_inserter(ret));
}
positions.emplace_back(ret.size());

File diff suppressed because it is too large Load diff