// NOTE(review): in this copy of the header the four #include directives have no
// targets and every template argument list (the text between '<' and '>') is
// absent, e.g. `std::vector>`, `UnorderedMap token2id`, `template void ...`.
// The declarations below are documented as they appear; element/key types that
// are not visible here must be confirmed against the upstream header.
#pragma once #include #include #include #include

namespace kiwi {

	/**
	 * Free function declared over a `char16_t` pointer pair.
	 *
	 * Defaults visible in this declaration: minLength = 2, maxLength = 32,
	 * longestOnly = true, stopChr = 0. `minCnt` has no default.
	 *
	 * NOTE(review): presumably `[first, last)` is the text to scan and the
	 * result lists substrings that occur at least `minCnt` times with a length
	 * between `minLength` and `maxLength` — this is inferred from the names
	 * only; confirm against the implementation. The return type's template
	 * arguments are not visible in this copy.
	 */
	std::vector> extractSubstrings(
		const char16_t* first,
		const char16_t* last,
		size_t minCnt,
		size_t minLength = 2,
		size_t maxLength = 32,
		bool longestOnly = true,
		char16_t stopChr = 0);

	/**
	 * Accumulates integer arrays via addArray() and exposes count() and
	 * buildLM() as const queries over what was added.
	 *
	 * NOTE(review): presumably counts token prefixes up to `prefixSize` and
	 * drops those below `minCf` — inferred from member names; verify.
	 */
	class PrefixCounter {
		// Scalar configuration/state; all three start at zero.
		size_t prefixSize = 0, minCf = 0, numArrays = 0;
		// Lookup tables and buffers. Key/element types are not visible here.
		// NOTE(review): token2id / id2Token look like a forward and reverse
		// token-id mapping — confirm.
		UnorderedMap token2id;
		Vector id2Token;
		Vector buf;
		Vector tokenClusters;
		Vector tokenCnts;
		// Shared handle to a worker pool; the pointee type is not visible here.
		std::shared_ptr threadPool;

		// Private template helper taking an iterator pair; the template
		// parameter list is missing in this copy (`It` is presumably its
		// parameter). The four public addArray overloads have the same shape.
		template void _addArray(It first, It last);

		// Private const helper; return element type not visible here.
		Vector> computeClusterScore() const;

	public:
		/**
		 * @param _prefixSize  NOTE(review): presumably stored in `prefixSize`.
		 * @param _minCf       NOTE(review): presumably stored in `minCf`.
		 * @param _numWorkers  NOTE(review): presumably sizes `threadPool`.
		 * @param clusters     Optional; defaults to an empty container.
		 */
		PrefixCounter(size_t _prefixSize, size_t _minCf, size_t _numWorkers,
			const std::vector>& clusters = {} );

		// Overloads accepting pointer ranges of 16-, 32- (unsigned and signed)
		// and 64-bit integers.
		void addArray(const uint16_t* first, const uint16_t* last);
		void addArray(const uint32_t* first, const uint32_t* last);
		void addArray(const int32_t* first, const int32_t* last);
		void addArray(const uint64_t* first, const uint64_t* last);

		// Const query returning a utils::FrozenTrie (template arguments not
		// visible here).
		utils::FrozenTrie count() const;

		/**
		 * Const factory returning a std::unique_ptr (pointee type not visible
		 * in this copy).
		 *
		 * @param minCfByOrder  Container passed by const reference.
		 * @param bosTokenId    NOTE(review): presumably the begin-of-sequence id.
		 * @param eosTokenId    NOTE(review): presumably the end-of-sequence id.
		 * @param unkTokenId    NOTE(review): presumably the unknown-token id.
		 * @param archType      Defaults to ArchType::none.
		 */
		std::unique_ptr buildLM(
			const std::vector& minCfByOrder,
			size_t bosTokenId,
			size_t eosTokenId,
			size_t unkTokenId,
			ArchType archType = ArchType::none ) const;
	};

	/**
	 * Accessor over an array of std::pair entries (pair element types are not
	 * visible in this copy), exposing size(), cluster(i) and score(i).
	 *
	 * NOTE(review): the class holds a raw const pointer and is constructed
	 * from `const void*` + size, so it looks like a non-owning view over
	 * external memory — confirm ownership and lifetime with the implementation.
	 */
	class ClusterData {
		const std::pair* clusterScores = nullptr;
		size_t clusterSize = 0;

	public:
		ClusterData();
		ClusterData(const void* _ptr, size_t _size);

		size_t size() const;
		size_t cluster(size_t i) const;
		float score(size_t i) const;
	};

	// Forward declaration; NgramExtractor only stores a `const Kiwi*`.
	class Kiwi;

	/**
	 * Collects text through addText()/addTexts() and returns Candidate
	 * records from extract().
	 *
	 * All five special member functions (copy/move construction, copy/move
	 * assignment, destructor) are declared explicitly; both move operations
	 * are noexcept.
	 */
	class NgramExtractor {
		// Pointer to the analyzer; null for a default-initialized member.
		const Kiwi* kiwi = nullptr;
		bool gatherLmScore = true;
		// Internal tables and buffers. Key/element types are not visible here.
		// NOTE(review): morph2id / id2morph look like a forward and reverse
		// morpheme-id mapping — confirm.
		UnorderedMap morph2id;
		Vector id2morph;
		Vector buf;
		Vector scores;
		Vector docBoundaries;
		Vector positions;
		Vector rawDocs;

		// Private helper taking a token container by const reference.
		size_t addTokens(const std::vector& tokens);

	public:
		/**
		 * One result record. Every numeric field is initialized to zero.
		 * NOTE(review): field meanings (cnt = occurrence count, df = document
		 * frequency, npmi, branching scores, ...) are inferred from the names
		 * only — confirm against the implementation.
		 */
		struct Candidate {
			std::u16string text;
			std::vector tokens;
			std::vector tokenScores;
			size_t cnt = 0;
			size_t df = 0;
			float score = 0;
			float npmi = 0;
			float leftBranch = 0;
			float rightBranch = 0;
			float lmScore = 0;
		};

		NgramExtractor();
		// Takes the analyzer by const reference; the class stores a
		// `const Kiwi*`. NOTE(review): if the address of `kiwi` is kept, the
		// referenced object must outlive this extractor — confirm.
		NgramExtractor(const Kiwi& kiwi, bool gatherLmScore = true);
		NgramExtractor(const NgramExtractor&);
		NgramExtractor(NgramExtractor&&) noexcept;
		NgramExtractor& operator=(const NgramExtractor&);
		NgramExtractor& operator=(NgramExtractor&&) noexcept;
		~NgramExtractor();

		// Both return size_t. NOTE(review): the meaning of the returned value
		// is not visible here — confirm.
		size_t addText(const std::u16string& text);
		size_t addTexts(const U16Reader& reader);

		/**
		 * Const query returning a std::vector (element type not visible in
		 * this copy; presumably Candidate).
		 *
		 * Defaults visible in this declaration: maxCandidates = 1000,
		 * minCnt = 10, maxLength = 5, minScore = 1e-3, numWorkers = 1.
		 * The `1e-3` default is a double literal converted to float.
		 */
		std::vector extract(size_t maxCandidates = 1000, size_t minCnt = 10, size_t maxLength = 5, float minScore = 1e-3, size_t numWorkers = 1) const;
	};
}