Compare commits

...

5 commits

Author SHA1 Message Date
copilot-swe-agent[bot]
4961562a30 Fix spacing/formatting issues in documentation comments
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-15 06:14:29 +00:00
copilot-swe-agent[bot]
63c346e69a Add Doxygen documentation to PatternMatcher, TagUtils, and Mmap
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-15 06:10:54 +00:00
copilot-swe-agent[bot]
6c80208796 Add Doxygen documentation to SkipBigramModel, CoNgramModel, ThreadPool, and Utils
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-15 06:08:58 +00:00
copilot-swe-agent[bot]
3b4bc20d54 Add Doxygen documentation to LangModel, Joiner, Knlm, WordDetector, and FrozenTrie
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-15 06:05:47 +00:00
copilot-swe-agent[bot]
062ba419c6 Initial plan 2026-01-15 05:57:54 +00:00
17 changed files with 857 additions and 51 deletions

View file

@ -1,3 +1,14 @@
/**
* @file CoNgramModel.h
* @author bab2min (bab2min@gmail.com)
* @brief N-gram (Contextual N-gram Model)
* @version 0.22.1
* @date 2025-11-21
*
* .
* N-gram .
*/
#pragma once
#include <array>
@ -15,25 +26,49 @@ namespace kiwi
{
namespace lm
{
/**
* @brief N-gram
*/
struct CoNgramModelHeader
{
uint64_t vocabSize, contextSize;
uint16_t dim;
uint8_t contextType, outputType;
uint8_t keySize, windowSize, qbit, qgroup;
uint64_t numNodes;
uint64_t nodeOffset, keyOffset, valueOffset, embOffset;
uint64_t vocabSize; /**< 어휘 크기 */
uint64_t contextSize; /**< 문맥 크기 */
uint16_t dim; /**< 임베딩 차원 */
uint8_t contextType; /**< 문맥 타입 */
uint8_t outputType; /**< 출력 타입 */
uint8_t keySize; /**< 키 크기 */
uint8_t windowSize; /**< 윈도우 크기 */
uint8_t qbit; /**< 양자화 비트 수 */
uint8_t qgroup; /**< 양자화 그룹 크기 */
uint64_t numNodes; /**< 노드 개수 */
uint64_t nodeOffset; /**< 노드 데이터 오프셋 */
uint64_t keyOffset; /**< 키 데이터 오프셋 */
uint64_t valueOffset; /**< 값 데이터 오프셋 */
uint64_t embOffset; /**< 임베딩 데이터 오프셋 */
};
/**
* @brief N-gram
*
* @tparam KeyType
* @tparam ValueType
* @tparam DiffType diff
*/
template<class KeyType, class ValueType, class DiffType = int32_t>
struct Node
{
KeyType numNexts = 0;
ValueType value = 0;
DiffType lower = 0;
uint32_t nextOffset = 0;
KeyType numNexts = 0; /**< 다음 노드의 개수 */
ValueType value = 0; /**< 노드 값 */
DiffType lower = 0; /**< 하위 노드로의 오프셋 */
uint32_t nextOffset = 0; /**< 다음 노드들의 시작 오프셋 */
};
/**
* @brief N-gram
*
* .
* .
*/
class CoNgramModelBase : public ILangModel
{
protected:
@ -49,20 +84,84 @@ namespace kiwi
size_t vocabSize() const override { return header.vocabSize; }
size_t getMemorySize() const override { return memorySize; }
/**
* @brief .
* @return CoNgramModelHeader에 const
*/
const CoNgramModelHeader& getHeader() const { return header; }
/**
* @brief .
* @param vocabId ID
* @param topN N개
* @param output ( ID, )
* @return
*/
virtual size_t mostSimilarWords(uint32_t vocabId, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief .
* @param vocabId1 ID
* @param vocabId2 ID
* @return
*/
virtual float wordSimilarity(uint32_t vocabId1, uint32_t vocabId2) const = 0;
/**
* @brief .
* @param contextId ID
* @param topN N개
* @param output
* @return
*/
virtual size_t mostSimilarContexts(uint32_t contextId, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief .
* @param contextId1 ID
* @param contextId2 ID
* @return
*/
virtual float contextSimilarity(uint32_t contextId1, uint32_t contextId2) const = 0;
/**
* @brief .
* @param contextId ID
* @param topN N개
* @param output
* @return
*/
virtual size_t predictWordsFromContext(uint32_t contextId, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief .
* @param contextId ID
* @param bgContextId ID
* @param weight
* @param topN N개
* @param output
* @return
*/
virtual size_t predictWordsFromContextDiff(uint32_t contextId, uint32_t bgContextId, float weight, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief ID 퀀 ID로 .
* @param vocabIds ID
* @param size
* @return ID
*/
virtual uint32_t toContextId(const uint32_t* vocabIds, size_t size) const = 0;
/**
* @brief .
* @return -
*/
virtual std::vector<std::vector<uint32_t>> getContextWordMap() const = 0;
/**
* @brief - .
* @return - const
*/
const std::vector<std::vector<uint32_t>>& getContextWordMapCached() const
{
if (contextWordMapCache.empty())
@ -72,6 +171,16 @@ namespace kiwi
return contextWordMapCache;
}
/**
* @brief .
* @param contextDefinition
* @param embedding
* @param maxContextLength
* @param useVLE VLE(Variable Length Encoding)
* @param reorderContextIdx
* @param selectedEmbIdx
* @return
*/
static utils::MemoryObject build(const std::string& contextDefinition, const std::string& embedding,
size_t maxContextLength = -1,
bool useVLE = true,

View file

@ -1,4 +1,16 @@
#pragma once
/**
* @file FrozenTrie.h
* @author bab2min (bab2min@gmail.com)
* @brief (immutable) Trie
* @version 0.22.1
* @date 2025-11-21
*
* Trie , .
* Aho-Corasick (fail link) .
* .
*/
#pragma once
#include <array>
#include <vector>
@ -15,6 +27,10 @@ namespace kiwi
{
namespace detail
{
/**
* @brief
* @tparam Value
*/
template<class Value, class = void>
struct HasSubmatch {};
@ -66,6 +82,18 @@ namespace kiwi
};
}
/**
* @brief (frozen) Trie
*
* Trie로, .
* Aho-Corasick (fail function)
* .
*
* @tparam _Key ()
* @tparam _Value
* @tparam _Diff diff
* @tparam _HasSubmatch
*/
template<class _Key, class _Value, class _Diff = int32_t, class _HasSubmatch = detail::HasSubmatch<_Value>>
class FrozenTrie : public _HasSubmatch
{
@ -74,19 +102,46 @@ namespace kiwi
using Value = _Value;
using Diff = _Diff;
/**
* @brief Trie의
*/
struct Node
{
Key numNexts = 0;
Diff lower = 0;
uint32_t nextOffset = 0;
Key numNexts = 0; /**< 자식 노드의 개수 */
Diff lower = 0; /**< 하위 노드로의 오프셋 */
uint32_t nextOffset = 0; /**< 다음 노드들의 시작 오프셋 */
/**
* @brief .
* @tparam arch ( )
* @param ft FrozenTrie
* @param c
* @return , nullptr
*/
template<ArchType arch>
const Node* nextOpt(const FrozenTrie& ft, Key c) const;
/**
* @brief .
* @tparam arch
* @param ft FrozenTrie
* @param c
* @return
*/
template<ArchType arch>
const Node* findFail(const FrozenTrie& ft, Key c) const;
/**
* @brief .
* @return
*/
const Node* fail() const;
/**
* @brief .
* @param ft FrozenTrie
* @return const
*/
const Value& val(const FrozenTrie& ft) const;
};
private:

View file

@ -1,3 +1,14 @@
/**
* @file Joiner.h
* @author bab2min (bab2min@gmail.com)
* @brief Joiner
* @version 0.22.1
* @date 2025-11-21
*
* , .
* .
*/
#pragma once
#include "Types.h"
#include "ArchUtils.h"
@ -13,13 +24,23 @@ namespace kiwi
class CompiledRule;
class AutoJoiner;
/**
* @brief
*/
enum class Space
{
none = 0,
no_space = 1,
insert_space = 2,
none = 0, /**< 공백 처리 없음 */
no_space = 1, /**< 공백을 삽입하지 않음 */
insert_space = 2, /**< 공백을 삽입함 */
};
/**
* @brief
*
*
* .
* CompiledRule을 .
*/
class Joiner
{
friend class CompiledRule;
@ -42,19 +63,51 @@ namespace kiwi
Joiner& operator=(const Joiner&);
Joiner& operator=(Joiner&&);
/**
* @brief .
* @param form
* @param tag
* @param space
*/
void add(const std::u16string& form, POSTag tag, Space space = Space::none);
/**
* @brief .
* @param form (C )
* @param tag
* @param space
*/
void add(const char16_t* form, POSTag tag, Space space = Space::none);
/**
* @brief UTF-16 .
* @param rangesOut ( )
* @return UTF-16
*/
std::u16string getU16(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
/**
* @brief UTF-8 .
* @param rangesOut ( )
* @return UTF-8
*/
std::string getU8(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
};
/**
* @brief
*
*
* .
*
* @tparam LmState
*/
template<class LmState>
struct Candidate
{
Joiner joiner;
LmState lmState;
float score = 0;
Joiner joiner; /**< 형태소 결합기 */
LmState lmState; /**< 언어 모델 상태 */
float score = 0; /**< 현재까지의 누적 점수 */
Candidate(const CompiledRule& _cr, const lm::ILangModel* lm)
: joiner{ _cr }, lmState{ lm }
@ -62,10 +115,17 @@ namespace kiwi
}
};
/**
* @brief VoidLangModel을 Candidate
*
* .
*
* @tparam arch
*/
template<ArchType arch>
struct Candidate<lm::VoidState<arch>>
{
Joiner joiner;
Joiner joiner; /**< 형태소 결합기 */
Candidate(const CompiledRule& _cr, const lm::ILangModel* lm)
: joiner{ _cr }
@ -73,6 +133,12 @@ namespace kiwi
}
};
/**
* @brief
*
* 릿 (type erasure) .
* Candidate를 .
*/
class ErasedVector
{
using FnDestruct = void(*)(ErasedVector*);
@ -161,6 +227,13 @@ namespace kiwi
}
};
/**
* @brief
*
*
* .
* .
*/
class AutoJoiner
{
friend class kiwi::Kiwi;
@ -201,12 +274,51 @@ namespace kiwi
AutoJoiner& operator=(const AutoJoiner&);
AutoJoiner& operator=(AutoJoiner&&);
/**
* @brief ID로 .
* @param morphemeId
* @param space
*/
void add(size_t morphemeId, Space space = Space::none);
/**
* @brief (StringView).
* @param form
* @param tag
* @param space
*/
void add(U16StringView form, POSTag tag, Space space = Space::none);
/**
* @brief (u16string).
* @param form
* @param tag
* @param inferRegularity
* @param space
*/
void add(const std::u16string& form, POSTag tag, bool inferRegularity = true, Space space = Space::none);
/**
* @brief (C ).
* @param form
* @param tag
* @param inferRegularity
* @param space
*/
void add(const char16_t* form, POSTag tag, bool inferRegularity = true, Space space = Space::none);
/**
* @brief UTF-16 .
* @param rangesOut ( )
* @return UTF-16
*/
std::u16string getU16(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
/**
* @brief UTF-8 .
* @param rangesOut ( )
* @return UTF-8
*/
std::string getU8(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
};
}

View file

@ -1,4 +1,16 @@
#pragma once
/**
* @file Knlm.h
* @author bab2min (bab2min@gmail.com)
* @brief Kneser-Ney
* @version 0.22.1
* @date 2025-11-21
*
* Kneser-Ney N-gram .
* 퀀 .
* .
*/
#pragma once
#include "LangModel.h"
@ -6,23 +18,55 @@ namespace kiwi
{
namespace lm
{
/**
* @brief Kneser-Ney
*
* .
*/
struct KnLangModelHeader
{
uint64_t num_nodes, node_offset, key_offset, ll_offset, gamma_offset, qtable_offset, htx_offset;
uint64_t unk_id, bos_id, eos_id, vocab_size;
uint8_t order, key_size, diff_size, quantized;
uint32_t extra_buf_size;
uint64_t num_nodes; /**< 노드의 총 개수 */
uint64_t node_offset; /**< 노드 데이터의 시작 오프셋 */
uint64_t key_offset; /**< 키 데이터의 시작 오프셋 */
uint64_t ll_offset; /**< 로그 우도(log-likelihood) 데이터의 시작 오프셋 */
uint64_t gamma_offset; /**< 감마(백오프 가중치) 데이터의 시작 오프셋 */
uint64_t qtable_offset; /**< 양자화 테이블의 시작 오프셋 */
uint64_t htx_offset; /**< 히스토리 변환 데이터의 시작 오프셋 */
uint64_t unk_id; /**< 미등록어(unknown) ID */
uint64_t bos_id; /**< 문장 시작(beginning of sentence) ID */
uint64_t eos_id; /**< 문장 종료(end of sentence) ID */
uint64_t vocab_size; /**< 어휘 크기 */
uint8_t order; /**< N-gram 차수 */
uint8_t key_size; /**< 키의 크기 (바이트) */
uint8_t diff_size; /**< diff 값의 크기 (바이트) */
uint8_t quantized; /**< 양자화 여부 */
uint32_t extra_buf_size; /**< 추가 버퍼 크기 */
};
/**
* @brief Kneser-Ney
*
* N-gram을 .
*
* @tparam KeyType ( )
* @tparam DiffType diff
*/
template<class KeyType, class DiffType = int32_t>
struct KnLangModelNode
{
KeyType num_nexts = 0;
DiffType lower = 0;
uint32_t next_offset = 0;
float ll = 0, gamma = 0;
KeyType num_nexts = 0; /**< 다음 노드의 개수 */
DiffType lower = 0; /**< 하위 노드로의 오프셋 */
uint32_t next_offset = 0; /**< 다음 노드들의 시작 오프셋 */
float ll = 0; /**< 로그 우도 */
float gamma = 0; /**< 백오프 가중치 */
};
/**
* @brief Kneser-Ney
*
* Kneser-Ney .
* N-gram .
*/
class KnLangModelBase : public ILangModel
{
protected:
@ -44,13 +88,34 @@ namespace kiwi
size_t vocabSize() const override { return getHeader().vocab_size; }
size_t getMemorySize() const override { return base.size(); }
/**
* @brief .
* @return KnLangModelHeader에 const
*/
const KnLangModelHeader& getHeader() const { return *reinterpret_cast<const KnLangModelHeader*>(base.get()); }
/**
* @brief .
* @param node_idx
* @return
*/
virtual ptrdiff_t getLowerNode(ptrdiff_t node_idx) const = 0;
virtual size_t nonLeafNodeSize() const = 0;
/**
* @brief .
* @return
*/
virtual const void* getExtraBuf() const = 0;
/**
* @brief Kneser-Ney .
* @param mem
* @param archType ( )
* @param transposed
* @return unique_ptr
*/
static std::unique_ptr<KnLangModelBase> create(utils::MemoryObject&& mem, ArchType archType = ArchType::none, bool transposed = false);
template<class VocabTy, class Trie, class HistoryTx = std::vector<VocabTy>>
@ -64,14 +129,30 @@ namespace kiwi
size_t extra_buf_size = 0
);
/**
* @brief .
* @return const
*/
const utils::MemoryObject& getMemory() const { return base; }
/**
 * @brief Advances the model by one token through the internal _progress() routine.
 * @tparam Ty type of the token passed on to _progress()
 * @param node_idx node index, taken by non-const reference and handed to
 *        _progress(); presumably updated in place to the next state (confirm
 *        against the _progress() implementation)
 * @param next token to advance with
 * @return whatever _progress() yields, converted to float
 */
template<class Ty>
float progress(ptrdiff_t& node_idx, Ty next) const
{
	const float result = _progress(node_idx, next);
	return result;
}
/**
* @brief 퀀 .
* @param in_first 퀀
* @param in_last 퀀
* @param out_first
*/
template<class InTy, class OutTy>
void evaluate(InTy in_first, InTy in_last, OutTy out_first) const
{
@ -96,6 +177,13 @@ namespace kiwi
}
}
/**
* @brief 퀀 .
* @param in_first 퀀
* @param in_last 퀀
* @param min_score
* @return
*/
template<class InTy>
float sum(InTy in_first, InTy in_last, float min_score = -100) const
{
@ -108,6 +196,12 @@ namespace kiwi
return ret;
}
/**
* @brief .
* @param in_first 퀀
* @param in_last 퀀
* @return
*/
template<class InTy>
std::vector<float> getNextLL(InTy in_first, InTy in_last) const
{

View file

@ -1,3 +1,14 @@
/**
* @file LangModel.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* .
* 퀀 .
*/
#pragma once
#include <array>
@ -16,21 +27,61 @@ namespace kiwi
{
namespace lm
{
/**
* @brief
*
* .
* 퀀 .
*/
class ILangModel
{
public:
virtual ~ILangModel() = default;
/**
* @brief .
* @return (none, knlm, skipbigram )
*/
virtual ModelType getType() const = 0;
/**
* @brief .
* @return (vocabulary)
*/
virtual size_t vocabSize() const = 0;
/**
* @brief .
* @return ( )
*/
virtual size_t getMemorySize() const = 0;
/**
* @brief .
* @return
*/
virtual void* getFindBestPathFn() const = 0;
/**
* @brief Joiner .
* @return Joiner
*/
virtual void* getNewJoinerFn() const = 0;
};
/**
* @brief 릿
*
* CRTP(Curiously Recurring Template Pattern)
* .
*
* @tparam DerivedLM
*/
template<class DerivedLM>
struct LmStateBase
{
/**
* @brief .
* @param langMdl
* @param nextToken
* @return
*/
float next(const ILangModel* langMdl, typename DerivedLM::VocabType nextToken)
{
using LmStateType = typename DerivedLM::LmStateType;
@ -41,6 +92,14 @@ namespace kiwi
template<ArchType arch>
class VoidLangModel;
/**
* @brief VoidLangModel의
*
* .
* 0 .
*
* @tparam arch
*/
template<ArchType arch>
struct VoidState : public LmStateBase<VoidLangModel<arch>>
{
@ -55,6 +114,14 @@ namespace kiwi
}
};
/**
* @brief
*
* .
* 0 .
*
* @tparam arch
*/
template<ArchType arch>
class VoidLangModel : public ILangModel
{

View file

@ -1,4 +1,15 @@
#pragma once
/**
* @file Mmap.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* .
* , .
*/
#pragma once
#include <string>
#include <iostream>
#include <fstream>
@ -12,6 +23,9 @@ namespace kiwi
{
namespace detail
{
/**
* @brief Windows RAII
*/
class HandleGuard
{
HANDLE handle = nullptr;

View file

@ -1,3 +1,14 @@
/**
* @file PatternMatcher.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* URL, , ,
* .
*/
#pragma once
#include <vector>
@ -7,6 +18,11 @@
namespace kiwi
{
/**
* @brief
*
* .
*/
enum class Match : size_t
{
none = 0,
@ -27,12 +43,20 @@ namespace kiwi
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
splitSaisiot = 1 << 25, /**< 사이시옷이 포함된 합성명사를 분리하여 매칭한다. */
mergeSaisiot = 1 << 26, /**< 사이시옷이 포함된 것으로 추정되는 명사를 결합하여 매칭한다. */
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | emoji | zCoda,
allWithNormalizing = all | normalizeCoda,
joinVSuffix = joinVerbSuffix | joinAdjSuffix, /**< 용언 파생접미사 결합 */
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix, /**< 모든 접사 결합 */
all = url | email | hashtag | mention | serial | emoji | zCoda, /**< 모든 웹 패턴 매칭 */
allWithNormalizing = all | normalizeCoda, /**< 모든 패턴과 정규화 */
};
/**
* @brief .
* @param left ()
* @param first
* @param last
* @param matchOptions
* @return ( , )
*/
std::pair<size_t, kiwi::POSTag> matchPattern(char16_t left, const char16_t* first, const char16_t* last, Match matchOptions);
}

View file

@ -1,3 +1,15 @@
/**
* @file SkipBigramModel.h
* @author bab2min (bab2min@gmail.com)
* @brief Skip-bigram
* @version 0.22.1
* @date 2025-11-21
*
* Skip-bigram은 .
* bigram이 ,
* .
*/
#pragma once
#include "Knlm.h"
@ -6,12 +18,25 @@ namespace kiwi
{
namespace lm
{
/**
* @brief Skip-bigram
*/
struct SkipBigramModelHeader
{
uint64_t vocabSize;
uint8_t keySize, windowSize, compressed, quantize, _rsv[4];
uint64_t vocabSize; /**< 어휘 크기 */
uint8_t keySize; /**< 키의 크기 */
uint8_t windowSize; /**< 윈도우 크기 (skip 거리) */
uint8_t compressed; /**< 압축 여부 */
uint8_t quantize; /**< 양자화 비트 수 */
uint8_t _rsv[4]; /**< 예약 필드 */
};
/**
* @brief Skip-bigram
*
* .
* .
*/
class SkipBigramModelBase : public ILangModel
{
protected:
@ -25,8 +50,19 @@ namespace kiwi
size_t vocabSize() const override { return getHeader().vocabSize; }
ModelType getType() const override { return ModelType::sbg; }
/**
* @brief .
* @return SkipBigramModelHeader에 const
*/
const SkipBigramModelHeader& getHeader() const { return *reinterpret_cast<const SkipBigramModelHeader*>(base.get()); }
/**
* @brief Skip-bigram .
* @param knlmMem Kneser-Ney
* @param sbgMem Skip-bigram
* @param archType ( )
* @return Skip-bigram unique_ptr
*/
static std::unique_ptr<SkipBigramModelBase> create(utils::MemoryObject&& knlmMem, utils::MemoryObject&& sbgMem, ArchType archType = ArchType::none);
};
}

View file

@ -1,3 +1,13 @@
/**
* @file TagUtils.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* , , .
*/
#pragma once
#include <algorithm>
@ -5,49 +15,107 @@
namespace kiwi
{
/**
* @brief 퀀
*
*
* 퀀 .
* .
*/
class TagSequenceScorer
{
float leftBoundaryScores[2][(size_t)POSTag::max] = { { 0, }, };
public:
float weight;
float weight; /**< 점수 가중치 */
/**
* @brief TagSequenceScorer
* @param _weight (: 5)
*/
TagSequenceScorer(float _weight = 5);
/**
 * @brief Looks up the left-boundary score for a tag and scales it by `weight`.
 * @param hasLeftBoundary selects row 1 of the score table when true, row 0 otherwise
 * @param right tag to score; its irregular marking is cleared with
 *        clearIrregular() before it is used as the column index
 * @return leftBoundaryScores[row][column] multiplied by `weight`
 */
float evalLeftBoundary(bool hasLeftBoundary, POSTag right) const
{
	const size_t row = hasLeftBoundary ? 1 : 0;
	const size_t column = (size_t)clearIrregular(right);
	return leftBoundaryScores[row][column] * weight;
}
};
/**
* @brief .
* @param tag
* @return (, , ) true
*/
bool isNounClass(POSTag tag);
/**
* @brief .
* @param tag
* @return (, ) true
*/
bool isVerbClass(POSTag tag);
/**
 * @brief Checks whether a tag lies in the POSTag range [ep, etm].
 * @param tag part-of-speech tag to test
 * @return true if POSTag::ep <= tag <= POSTag::etm (both bounds inclusive)
 * @note Presumably this covers the ending ("E") tags; the result depends on
 *       the declaration order of POSTag, which is not visible here.
 */
inline bool isEClass(POSTag tag)
{
	const bool notBeforeFirst = tag >= POSTag::ep;
	const bool notAfterLast = tag <= POSTag::etm;
	return notBeforeFirst && notAfterLast;
}
/**
 * @brief Checks whether a tag lies in the POSTag range [jks, jc].
 * @param tag part-of-speech tag to test
 * @return true if POSTag::jks <= tag <= POSTag::jc (both bounds inclusive)
 * @note Presumably this covers the particle ("J") tags; the result depends on
 *       the declaration order of POSTag, which is not visible here.
 */
inline bool isJClass(POSTag tag)
{
	if (tag < POSTag::jks)
	{
		return false;
	}
	return tag <= POSTag::jc;
}
/**
 * @brief Checks whether a tag lies in the POSTag range [nng, nnb].
 * @param tag part-of-speech tag to test
 * @return true if POSTag::nng <= tag <= POSTag::nnb (both bounds inclusive)
 * @note Presumably this covers the "NN*" noun tags; the result depends on
 *       the declaration order of POSTag, which is not visible here.
 */
inline bool isNNClass(POSTag tag)
{
	const bool notBeforeFirst = tag >= POSTag::nng;
	const bool notAfterLast = tag <= POSTag::nnb;
	return notBeforeFirst && notAfterLast;
}
/**
 * @brief Checks whether a tag, ignoring its irregular marking, lies in the
 *        POSTag range [xsn, xsm].
 * @param tag part-of-speech tag to test; it is passed through clearIrregular()
 *        before the comparison
 * @return true if POSTag::xsn <= clearIrregular(tag) <= POSTag::xsm
 * @note Presumably this covers the derivational-suffix ("XS*") tags; the
 *       result depends on the declaration order of POSTag, which is not
 *       visible here.
 */
inline bool isSuffix(POSTag tag)
{
	const POSTag regularTag = clearIrregular(tag);
	return regularTag >= POSTag::xsn && regularTag <= POSTag::xsm;
}
/**
 * @brief Checks whether a tag lies in the POSTag range [sf, sn].
 * @param tag part-of-speech tag to test
 * @return true if POSTag::sf <= tag <= POSTag::sn (both bounds inclusive)
 * @note Presumably this covers the symbol/special ("S*") tags; the result
 *       depends on the declaration order of POSTag, which is not visible here.
 */
inline bool isSpecialClass(POSTag tag)
{
	if (tag < POSTag::sf)
	{
		return false;
	}
	return tag <= POSTag::sn;
}
/**
* @brief .
* @param tag
* @return true
*/
inline bool isUserClass(POSTag tag)
{
return POSTag::user0 <= tag && tag <= POSTag::user4;

View file

@ -1,4 +1,18 @@
#pragma once
/**
* @file ThreadPool.h
* @author bab2min (bab2min@gmail.com)
* @brief C++11 Thread Pool
* @version 0.22.1
* @date 2025-11-21
*
* A simple C++11 Thread Pool implementation(https://github.com/progschj/ThreadPool)
* modified by bab2min to have additional parameter threadId
*
* .
* , .
*/
#pragma once
/*
A simple C++11 Thread Pool implementation(https://github.com/progschj/ThreadPool)
@ -19,18 +33,53 @@ namespace kiwi
{
namespace utils
{
/**
* @brief
*
* ,
* .
*/
class ThreadPool
{
public:
/**
* @brief ThreadPool
* @param threads (0 )
* @param maxQueued (0 )
*/
ThreadPool(size_t threads = 0, size_t maxQueued = 0);
~ThreadPool();
/**
* @brief .
*
* ID를 .
*
* @tparam F
* @tparam Args
* @param f
* @param args
* @return future
*/
template<class F, class... Args>
auto enqueue(F&& f, Args&&... args)
->std::future<typename std::invoke_result<F, size_t, Args...>::type>;
/**
* @brief .
* @return
*/
size_t size() const { return workers.size(); }
/**
* @brief .
* @return
*/
size_t numEnqueued() const { return tasks.size(); }
/**
* @brief .
*/
void joinAll();
private:
std::vector<std::thread> workers;

View file

@ -1,4 +1,15 @@
#pragma once
/**
* @file Utils.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* UTF-8/UTF-16 , ,
* .
*/
#pragma once
#include <iostream>
#include <string>
#include <memory>
@ -30,25 +41,88 @@ namespace kiwi
return std::unique_ptr<T>(new typename std::remove_extent<T>::type[size]);
}
/**
* @brief Converts a UTF-8 string to UTF-16.
* @param str UTF-8 encoded input string
* @return the UTF-16 encoded string
*/
std::u16string utf8To16(const std::string& str);
/**
* @brief UTF-8 UTF-16 .
* @param str UTF-8
* @param bytePositions UTF-8
* @return UTF-16
*/
std::u16string utf8To16(const std::string& str, std::vector<size_t>& bytePositions);
/**
* @brief UTF-8 .
* @param code
* @return UTF-8
*/
std::string utf8FromCode(char32_t code);
size_t utf8FromCode(std::string& ret, char32_t code);
/**
* @brief Converts a UTF-16 string to UTF-8.
* @param str UTF-16 encoded input string
* @return the UTF-8 encoded string
*/
std::string utf16To8(const std::u16string& str);
/**
* @brief .
* @param hangul
* @return
*/
KString normalizeHangul(const std::u16string& hangul);
/**
 * @brief Checks whether a tag lies in the POSTag range [w_url, w_emoji].
 * @param t part-of-speech tag to test
 * @return true if POSTag::w_url <= t <= POSTag::w_emoji (both bounds inclusive)
 * @note Presumably this covers the web-pattern ("W_*") tags; the result
 *       depends on the declaration order of POSTag, which is not visible here.
 */
inline bool isWebTag(POSTag t)
{
	const bool notBeforeFirst = t >= POSTag::w_url;
	const bool notAfterLast = t <= POSTag::w_emoji;
	return notBeforeFirst && notAfterLast;
}
/**
* @brief .
* @param tagStr
* @return
*/
POSTag toPOSTag(const std::u16string& tagStr);
/**
* @brief .
* @param t
* @return
*/
const char* tagToString(POSTag t);
/**
* @brief .
* @param t
* @return
*/
const kchar_t* tagToKString(POSTag t);
const char* tagRToString(char16_t form, POSTag t);
const kchar_t* tagRToKString(char16_t form, POSTag t);
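/**
* @brief Checks whether a value lies within a half-open range.
* @tparam A type of the value
* @tparam B type of the lower bound
* @tparam C type of the upper bound
* @param value value to test
* @param lower lower bound (inclusive)
* @param upper upper bound (exclusive)
* @return true if lower <= value < upper
*/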
template<class A, class B, class C>
inline bool within(A value, B lower, C upper)
{
@ -61,41 +135,82 @@ namespace kiwi
return cont.data() <= value && value < cont.data() + cont.size();
}
/**
 * @brief Checks whether a UTF-16 code unit is a precomposed Hangul syllable.
 * @param chr code unit to test
 * @return true if chr is in [0xAC00, 0xD7A4), i.e. U+AC00..U+D7A3, given the
 *         half-open contract documented for within()
 */
inline bool isHangulSyllable(char16_t chr)
{
	const int firstSyllable = 0xAC00;
	const int pastLastSyllable = 0xD7A4;
	return within(chr, firstSyllable, pastLastSyllable);
}
/**
 * @brief Checks whether a UTF-16 code unit is one of the 19 conjoining
 *        leading consonants starting at U+1100.
 * @param chr code unit to test
 * @return true if chr is in [0x1100, 0x1100 + 19), given the half-open
 *         contract documented for within()
 */
inline bool isHangulOnset(char16_t chr)
{
	const int firstOnset = 0x1100;
	const int onsetCount = 19;
	return within(chr, firstOnset, firstOnset + onsetCount);
}
/**
 * @brief Checks whether a UTF-16 code unit is one of the 27 conjoining
 *        trailing consonants starting at U+11A8.
 * @param chr code unit to test
 * @return true if chr is in [0x11A8, 0x11A8 + 27), given the half-open
 *         contract documented for within()
 */
inline bool isHangulCoda(char16_t chr)
{
	const int firstCoda = 0x11A8;
	const int codaCount = 27;
	return within(chr, firstCoda, firstCoda + codaCount);
}
/**
 * @brief Checks whether a UTF-16 code unit is a Hangul Compatibility Jamo vowel.
 * @param chr code unit to test
 * @return true if chr is in [0x314F, 0x3164), i.e. U+314F..U+3163, given the
 *         half-open contract documented for within()
 * @note The range is in the compatibility jamo block, not the conjoining
 *       jamo block used by isHangulOnset() and isHangulCoda().
 */
inline bool isHangulVowel(char16_t chr)
{
	const int firstVowel = 0x314F;
	const int pastLastVowel = 0x3164;
	return within(chr, firstVowel, pastLastVowel);
}
/**
 * @brief Composes the precomposed Hangul syllable for an onset/vowel pair
 *        with no final consonant.
 * @param onset zero-based index of the leading consonant
 * @param vowel zero-based index of the vowel
 * @return 0xAC00 + (onset * 21 + vowel) * 28, truncated to char16_t
 */
inline char16_t joinOnsetVowel(size_t onset, size_t vowel)
{
	// 21 vowels per onset, 28 coda slots per onset/vowel pair.
	const size_t onsetVowelIndex = onset * 21 + vowel;
	const char16_t syllableOffset = (char16_t)(onsetVowelIndex * 28);
	return 0xAC00 + syllableOffset;
}
/**
 * @brief Extracts the zero-based vowel index from a precomposed Hangul syllable.
 * @param chr code unit to decompose; no range check is performed, so the
 *        result is only meaningful for values in U+AC00..U+D7A3
 * @return ((chr - 0xAC00) / 28) % 21
 */
inline int extractVowel(char16_t chr)
{
	const int syllableOffset = chr - 0xAC00;
	// Dropping the 28 coda slots leaves onset * 21 + vowel.
	const int onsetVowelIndex = syllableOffset / 28;
	return onsetVowelIndex % 21;
}
/**
 * @brief Checks whether a UTF-16 code unit falls in either leading-consonant
 *        range, including the archaic letters.
 * @param chr code unit to test
 * @return true if chr is in [0x1100, 0x1160) or in [0xA960, 0xA980), given
 *         the half-open contract documented for within()
 */
inline bool isOldHangulOnset(char16_t chr)
{
	if (within(chr, 0x1100, 0x1160))
	{
		return true;
	}
	return within(chr, 0xA960, 0xA980);
}
/**
* @brief .
* @param chr
* @return true
*/
inline bool isOldHangulVowel(char16_t chr)
{
return within(chr, 0x1160, 0x11A8) || within(chr, 0xD7B0, 0xD7CB);

View file

@ -1,15 +1,33 @@
#pragma once
/**
* @file WordDetector.h
* @author bab2min (bab2min@gmail.com)
* @brief WordDetector
* @version 0.22.1
* @date 2025-11-21
*
* .
* (cohesion) (branching entropy) .
*/
#pragma once
#include <kiwi/Types.h>
namespace kiwi
{
/**
* @brief
*/
struct WordInfo
{
std::u16string form;
float score, lBranch, rBranch, lCohesion, rCohesion;
uint32_t freq;
std::map<POSTag, float> posScore;
std::u16string form; /**< 단어의 표면형 */
float score; /**< 단어 점수 */
float lBranch; /**< 좌측 분기 엔트로피 */
float rBranch; /**< 우측 분기 엔트로피 */
float lCohesion; /**< 좌측 응집도 */
float rCohesion; /**< 우측 응집도 */
uint32_t freq; /**< 출현 빈도 */
std::map<POSTag, float> posScore; /**< 품사별 점수 */
WordInfo(std::u16string _form = {},
float _score = 0, float _lBranch = 0, float _rBranch = 0,
@ -20,6 +38,12 @@ namespace kiwi
{}
};
/**
* @brief
*
* .
* .
*/
class WordDetector
{
struct Counter;
@ -38,20 +62,59 @@ namespace kiwi
std::map<POSTag, float> getPosScore(Counter&, const std::map<std::u16string, uint32_t>& cnt, std::map<std::u16string, uint32_t>::iterator it, bool coda, const std::u16string& realForm) const;
public:
/**
* @brief
*/
struct FromRawData {};
static constexpr FromRawData fromRawDataTag = {};
WordDetector() = default;
/**
* @brief WordDetector를 .
* @param modelPath
* @param _numThreads (-1 )
*/
WordDetector(const std::string& modelPath, size_t _numThreads = -1);
/**
* @brief WordDetector를 .
* @param tag FromRawData
* @param modelPath
* @param _numThreads (-1 )
*/
WordDetector(FromRawData, const std::string& modelPath, size_t _numThreads = -1);
/**
* @brief WordDetector를 .
* @param streamProvider
* @param _numThreads (-1 )
*/
WordDetector(const std::function<std::unique_ptr<std::istream>(const std::string&)>& streamProvider, size_t _numThreads = -1);
/**
 * @brief Reports whether this WordDetector holds any score data.
 * @return true if the posScore member is non-empty, false otherwise
 */
bool ready() const
{
	if (posScore.empty())
	{
		return false;
	}
	return true;
}
/**
* @brief .
* @param modelPath
*/
void saveModel(const std::string& modelPath) const;
/**
* @brief .
* @param reader
* @param minCnt
* @param maxWordLen
* @param minScore
* @return
*/
std::vector<WordInfo> extractWords(const U16MultipleReader& reader, size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.1f) const;
};

@ -1 +1 @@
Subproject commit 226ce3aed24702bef1b03dba4b3cb55bc0bf31dd
Subproject commit f38e229e754f90fa06b0a99ae7fbbcfcbe7dcabc

2
third_party/cpuinfo vendored

@ -1 +1 @@
Subproject commit 05dd959fa26c7e68fa229495a35f55e06a3b9655
Subproject commit c4b4f4bf08c0cf486fc3111d0244ebf2a48ad01b

@ -1 +1 @@
Subproject commit 52eb8108c5bdec04579160ae17225d66034bd723
Subproject commit ff6133ab49b364a883a55ba75c39e520fea6245b

2
third_party/json vendored

@ -1 +1 @@
Subproject commit 55f93686c01528224f448c19128836e7df245f72
Subproject commit 5ed07097faa6c50199c4a3b66e5ed37d4fbfccc2

@ -1 +1 @@
Subproject commit fbd8b99c2b828428947d70fdc046bb55609be93e
Subproject commit f0cd5505aa102cee991be0367b82506638a16281