Add multi morpheme token support to UnigramSwTrainer

Add support for the multimorph token type
2026-06-17 01:54:27 +00:00 · 2024-07-01 01:09:23 +09:00 · 2024-06-14 01:17:20 +09:00
3 changed files with 1187 additions and 303 deletions
--- a/include/kiwi/SwTokenizer.h
+++ b/include/kiwi/SwTokenizer.h
@ -2,7 +2,7 @@
 * @file SwTokenizer.h
 * @author bab2min (bab2min@gmail.com)
 * @brief Subword Tokenizer
- * @version 0.16.1
+ * @version 0.18.0
 * @date 2022-07-28
 *
 *
@ -31,6 +31,7 @@ namespace kiwi
 		glue = 2,
 		subword = 3,
 		byte = 4,
+		multimorph = 5,
 		punct,
 		chinese,
 	};
@ -39,15 +40,22 @@ namespace kiwi
 	{
 		const char16_t* form = nullptr;
 		uint32_t length = 0;
+		uint32_t internalFormOffset = 0;
+		uint32_t internalLength = 0;
 		POSTag pos = POSTag::unknown;
 		SwTokenFlag flags = SwTokenFlag::none;
 		uint8_t byte = 0;

-		SwToken(const char16_t* _form = nullptr, size_t _length = 0,
-			POSTag _pos = POSTag::unknown, SwTokenFlag _flags = SwTokenFlag::none, uint8_t _byte = 0)
-			: form{ _form }, length{ (uint32_t)_length }, pos{ _pos }, flags{ _flags }, byte{ _byte }
+		SwToken(const char16_t* _form = nullptr, // Builds a token from a form pointer and its length; every argument is optional.
+			size_t _length = 0, // stored into the 32-bit `length` member via an explicit cast
+			POSTag _pos = POSTag::unknown, 
+			SwTokenFlag _flags = SwTokenFlag::none, 
+			uint8_t _byte = 0)
+			: form{ _form }, length{ (uint32_t)_length }, internalLength{ (uint32_t)_length }, pos { _pos }, flags{ _flags }, byte{ _byte } // internalLength starts out equal to length; internalFormOffset is not listed here, so it keeps its in-class default
 		{
 		}
+
+		const char16_t* internalForm() const { return form + internalFormOffset; } // Returns `form` advanced by `internalFormOffset` char16_t units.
 	};

 	struct SwTokenizerConfig
@ -57,6 +65,7 @@ namespace kiwi
 			unk, cls, sep, pad, mask, bos, eos, glue
 		};
 		std::array<std::string, eos + 1> specialTokens;
+		std::string additionalJson;
 		bool doLowercase = false;
 		bool splitChinese = true;
 		bool wholeWordUnk = false;
@ -70,7 +79,7 @@ namespace kiwi
 		bool strict = false; // not implemented yet
 		bool fallbackHangul = true;
 		bool fallbackByte = false;
-		std::string additionalJson;
+		

 		SwTokenizerConfig()
 		{
@ -97,6 +106,7 @@ namespace kiwi
 		bool reduceStrict = false;
 		bool removeRepetitive = true;
 		bool preventMixedDigitTokens = true;
+		size_t maxMultiMorphSize = 0;
 	};

 	class SwTokenizer;
@ -163,6 +173,7 @@ namespace kiwi
 		void* dfTokenizeSubword = nullptr;
 		void* dfTokenizeSubwordWithOffset = nullptr;
 		const Kiwi* kiwi = nullptr;
+		bool multiMorphMode = false;
 		SwTokenizerConfig config;
 		Vocab vocab;
 		utils::FrozenTrie<kchar_t, uint32_t> trie;
@ -170,6 +181,8 @@ namespace kiwi
 		Vector<float> tokenLProbs;
 		Vector<uint32_t> morphToSw;
 		Vector<uint32_t> swToMorph;
+		Vector<uint32_t> codeToMorph;
+		UnorderedMap<uint32_t, uint32_t> morphToCode;
 		Vector<uint32_t> hangulFallbackChrs;
 		Vector<uint32_t> byteFallbackChrs;
 		std::array<size_t, SwTokenizerConfig::glue + 1> specialTokenIds = { { 0, } };
@ -186,6 +199,12 @@ namespace kiwi
 		template<class TokenIt>
 		void encode(std::vector<uint32_t>& out, TokenIt first, TokenIt last, std::vector<std::pair<uint32_t, uint32_t>>* offset = nullptr) const;

+		template<class TokenIt>
+		void encodeWithoutMultiMorph(std::vector<uint32_t>& out, TokenIt first, TokenIt last, std::vector<std::pair<uint32_t, uint32_t>>* offset = nullptr) const;
+
+		template<class TokenIt>
+		void encodeWithMultiMorph(std::vector<uint32_t>& out, TokenIt first, TokenIt last, std::vector<std::pair<uint32_t, uint32_t>>* offset = nullptr) const;
+
 		template<class It>
 		std::string decode(It first, It last, bool ignoreErrors = true) const;

@ -263,11 +282,13 @@ namespace kiwi

 		UnorderedMap<std::u16string, size_t> wordMap;
 		Vector<std::pair<const std::u16string, size_t>*> invWordMap;
-		Vector<size_t> wordCnts;
+		Vector<size_t> wordCnts, morphCnts;
 		UnorderedMap<size_t, WordCand> wordSuffix;
 		UnorderedMap<std::pair<KString, POSTag>, const Morpheme*> reprMorphMap;
-		HiddenMember<RaggedVector<int32_t>, sizeof(Vector<size_t>) * 2> sents;
+		UnorderedMap<std::pair<KString, POSTag>, size_t> morphCodeMap;
+		Vector<const Morpheme*> invMorphCodeMap;
 		Vector<size_t> tokenFreqs;
+		UnorderedMap<char32_t, std::u16string> altReprForChrMorphCode;

 		Vector<std::u16string> chrPrefix;
 		utils::FrozenTrie<char16_t, size_t> chrTrie;
@ -278,10 +299,28 @@ namespace kiwi
 		Vector<PrefixAvailability> prefixAvailable;

 		void addWord(const std::u16string& s, const Vector<const Morpheme*>& morphs, const Vector<size_t>& boundaries, bool spacePrefix);
+		void addWord(const TokenInfo* first, const TokenInfo* last, bool spacePrefix);
+
+		void addKiwiResult(std::vector<TokenInfo>& tokens, 
+			const Vector<const Morpheme*>& verbalSuffices, 
+			const Vector<const Morpheme*>& eomiSuffices, 
+			const UnorderedMap<Vector<const Morpheme*>, const Morpheme*>& complexMorphemes);
+
+		void addKiwiResultWithoutMultiMorph(std::vector<TokenInfo>& tokens,
+			const Vector<const Morpheme*>& verbalSuffices,
+			const Vector<const Morpheme*>& eomiSuffices,
+			const UnorderedMap<Vector<const Morpheme*>, const Morpheme*>& complexMorphemes);
+
+		void addKiwiResultWithMultiMorph(std::vector<TokenInfo>& tokens,
+			const Vector<const Morpheme*>& verbalSuffices,
+			const Vector<const Morpheme*>& eomiSuffices,
+			const UnorderedMap<Vector<const Morpheme*>, const Morpheme*>& complexMorphemes);

 		template<class Feeder>
 		size_t _addSentences(Feeder&& feeder);

+		void initAltReprForChrMorphCode();
+
 		Vector<uint32_t> tokenizeShort(U16StringView s, bool spacePrefix = false) const;
 		Vector<uint32_t> tokenizeShort(U16StringView s, const Vector<int32_t>& boundaries) const;
 		std::pair<Vector<uint32_t>, float> tokenizeBest(U16StringView s, bool spacePrefix = false, const Vector<int32_t>* boundaries = nullptr) const;
@ -289,6 +328,8 @@ namespace kiwi

 		const Morpheme* toReprMorph(const Morpheme* m);

+		char32_t morphToChrMorphCode(const Morpheme* m);
+
 	public:
 		UnigramSwTrainer(const Kiwi& kiwi, const SwTokenizerConfig& config, const UnigramSwTrainerConfig& trainConfig);
 		UnigramSwTrainer(const UnigramSwTrainer&);
--- a/src/StrUtils.h
+++ b/src/StrUtils.h
@ -161,6 +161,38 @@ namespace kiwi
 		return replace(s, nonstd::basic_string_view<BaseChr, Trait>{ from, fromSize - 1 }, nonstd::basic_string_view<BaseChr, Trait>{ to, toSize - 1 });
 	}
 	
+	template<class OutTy> // OutTy: written through with `*out++ = (char)...`, one byte per assignment
+	inline OutTy char32To8(char32_t c, OutTy out) // Encodes the single code point `c` as UTF-8 into `out` and returns the advanced `out`; throws UnicodeException when c >= 0x110000.
+	{
+		if (c < 0x80) // 1-byte form: the value is written as-is
+		{
+			*out++ = (char)c;
+		}
+		else if (c < 0x800) // 2-byte form: 0xC0 lead byte + one 0x80 continuation byte
+		{
+			*out++ = (char)(0xC0 | (c >> 6));
+			*out++ = (char)(0x80 | (c & 0x3F));
+		}
+		else if (c < 0x10000) // 3-byte form: 0xE0 lead byte + two continuation bytes. NOTE(review): values in 0xD800..0xDFFF are not rejected here - confirm callers never pass lone surrogates
+		{
+			*out++ = (char)(0xE0 | (c >> 12));
+			*out++ = (char)(0x80 | ((c >> 6) & 0x3F));
+			*out++ = (char)(0x80 | (c & 0x3F));
+		}
+		else if (c < 0x110000) // 4-byte form: 0xF0 lead byte + three continuation bytes
+		{
+			*out++ = (char)(0xF0 | (c >> 18));
+			*out++ = (char)(0x80 | ((c >> 12) & 0x3F));
+			*out++ = (char)(0x80 | ((c >> 6) & 0x3F));
+			*out++ = (char)(0x80 | (c & 0x3F));
+		}
+		else // 0x110000 and above: nothing is written
+		{
+			throw UnicodeException{ "unicode error" };
+		}
+		return out; // iterator after the last byte written
+	}
+	

 	inline void utf8To16(nonstd::string_view str, std::u16string& ret)
 	{
@ -418,32 +450,7 @@ namespace kiwi
 				code = mergeSurrogate(code, code2);
 			}

-			if (code <= 0x7F)
-			{
-				ret.push_back((char)code);
-			}
-			else if (code <= 0x7FF)
-			{
-				ret.push_back((char)(0xC0 | (code >> 6)));
-				ret.push_back((char)(0x80 | (code & 0x3F)));
-			}
-			else if (code <= 0xFFFF)
-			{
-				ret.push_back((char)(0xE0 | (code >> 12)));
-				ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
-				ret.push_back((char)(0x80 | (code & 0x3F)));
-			}
-			else if (code <= 0x10FFFF)
-			{
-				ret.push_back((char)(0xF0 | (code >> 18)));
-				ret.push_back((char)(0x80 | ((code >> 12) & 0x3F)));
-				ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
-				ret.push_back((char)(0x80 | (code & 0x3F)));
-			}
-			else
-			{
-				throw UnicodeException{ "unicode error" };
-			}
+			char32To8(code, std::back_inserter(ret));
 		}

 		return ret;
@ -467,32 +474,7 @@ namespace kiwi
 				code = mergeSurrogate(code, code2);
 			}

-			if (code <= 0x7F)
-			{
-				ret.push_back((char)code);
-			}
-			else if (code <= 0x7FF)
-			{
-				ret.push_back((char)(0xC0 | (code >> 6)));
-				ret.push_back((char)(0x80 | (code & 0x3F)));
-			}
-			else if (code <= 0xFFFF)
-			{
-				ret.push_back((char)(0xE0 | (code >> 12)));
-				ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
-				ret.push_back((char)(0x80 | (code & 0x3F)));
-			}
-			else if (code <= 0x10FFFF)
-			{
-				ret.push_back((char)(0xF0 | (code >> 18)));
-				ret.push_back((char)(0x80 | ((code >> 12) & 0x3F)));
-				ret.push_back((char)(0x80 | ((code >> 6) & 0x3F)));
-				ret.push_back((char)(0x80 | (code & 0x3F)));
-			}
-			else
-			{
-				throw UnicodeException{ "unicode error" };
-			}
+			char32To8(code, std::back_inserter(ret));
 		}
 		positions.emplace_back(ret.size());

--- a/src/SwTokenizer.cpp
+++ b/src/SwTokenizer.cpp
Author	SHA1	Message	Date
bab2min	ccc27d080a	Add multi morpheme token support to `UnigramSwTrainer`	2024-07-01 01:09:23 +09:00
bab2min	7b785719b0	Add support for multimorph tokens type	2024-06-14 01:17:20 +09:00