Compare commits

..

119 commits

Author SHA1 Message Date
bab2min
693d6f2e02 Bump version to 0.23.2 in CMakeLists.txt and Macro.h 2026-06-11 00:29:55 +09:00
bab2min
dab6f2a665 Fix bab2min/kiwipiepy#221 2026-06-07 22:12:51 +09:00
Minchul Lee
f916576479
Merge pull request #256 from bab2min/dev/oov_global_consistency
Implement Global Consistency of OOV Detection
2026-06-07 21:42:30 +09:00
bab2min
d7d305fa90 Fix workflows to prevent OOM 2026-06-07 16:56:12 +09:00
bab2min
cbca2f4159 Fix errors on consistency computation 2026-05-10 01:55:26 +09:00
bab2min
4f502793e4 implement oovTotalConsistency 2026-05-09 16:31:16 +09:00
bab2min
84bb01f051 Refactor findBestPath 2026-04-25 00:54:37 +09:00
bab2min
270bdcacb5 Optimize Path Evaluation 2026-04-19 01:18:40 +09:00
bab2min
303586ad22 Fix duplicated oov detection in default morphemes 2026-04-19 01:13:10 +09:00
bab2min
d4e3e63b08 Bump version to 0.23.1 2026-04-05 00:45:11 +09:00
bab2min
b30e388e4a Update condition for score discount in FormEvaluator 2026-04-05 00:42:16 +09:00
Minchul Lee
8e3fe94a05
Merge pull request #252 from bab2min/dev/fix_new_splitter
Splitter 신규 구현체에서 종종 빈 결과값을 리턴하는 문제를 해결
2026-04-05 00:26:19 +09:00
bab2min
d446d3da74 Add more test cases for empty result 2026-04-04 22:03:33 +09:00
bab2min
a841678d50 Fix empty result more 2026-04-04 22:02:58 +09:00
Minchul Lee
6308709f3d
Merge pull request #251 from bab2min/dev/fix_new_splitter
Splitter 신규 구현체에서 종종 메모리 과다 사용, 빈 결과값 리턴 등의 문제를 해결
2026-04-04 20:15:37 +09:00
bab2min
cc322a9a1a Add more test cases for empty result bug 2026-04-04 18:41:21 +09:00
bab2min
9e2870b81a Prevent nan at OOV Handling 2026-04-04 18:40:53 +09:00
bab2min
cb7ae0d6b4 Fix empty result when typo correction is enabled 2026-04-04 18:40:16 +09:00
bab2min
599ca9aee2 Add upper limits of lengthening size for memory efficiency 2026-04-04 18:39:40 +09:00
bab2min
cce6e1aa77 Split long non-space string into fixed-length chunk for preventing OOM 2026-04-04 18:39:07 +09:00
Minchul Lee
8839bb5a11
Merge pull request #250 from bab2min/dev/issue_246
문장 시작 위치의 1., 2., 3. 등이 종종 SB 대신 SN로 분석되는 오류 수정
2026-04-04 18:15:26 +09:00
bab2min
17360c4318 Improve RuleBasedScorer for SN ending with points 2026-04-04 17:25:47 +09:00
bab2min
d23c7d395d Add test cases for #246 2026-04-04 17:24:39 +09:00
Minchul Lee
4e732fa615
Update README.md 2026-03-29 00:31:52 +09:00
Copilot
d9a43a8fd0
Add Quantized GEMM kernel for Arm NEON on macOS ARM (#249) 2026-03-28 21:20:11 +09:00
Minchul Lee
4e8805a6e3
Merge pull request #247 from bab2min/dev/avx_vnni_on_msvc
Fix Support of AVX-VNNI on MSVC
2026-03-27 12:22:22 +09:00
bab2min
b8516fdb05 Fix AVX-VNNI support for MSVC 2026-03-27 01:23:47 +09:00
bab2min
f3a8f228e1 Fix wrong AVX-VNNI detection logic 2026-03-27 01:23:16 +09:00
bab2min
25cc639c96 Fix Swift Workflows 2026-03-17 01:00:15 +09:00
bab2min
11baa0b010 Fix npm publish 2026-03-17 00:35:30 +09:00
Minchul Lee
605c2e78c5
Fix tar command to include all dict files 2026-03-17 00:11:47 +09:00
bab2min
86d4733fdd Set default typo transformer for non-standard dialects 2026-03-16 00:42:05 +09:00
Minchul Lee
5c220fcf0e
Merge pull request #245 from bab2min/dev/improve_typo_correction
Improve Typo Correction
2026-03-15 22:52:19 +09:00
bab2min
2f04fa83dc Add more typo corrections for dialects 2026-03-15 21:56:58 +09:00
bab2min
a137b4830a Fix dataset errors 2026-03-15 21:55:17 +09:00
bab2min
b196855830 Remove redundant entries for 충청 and 함경 dialects in combiningRule.txt 2026-03-15 02:35:12 +09:00
bab2min
ee7c4776e1 Update out-of-vocabulary rule parameters for improved typo correction 2026-03-15 02:33:17 +09:00
bab2min
03d6065248 Add boundary condition for typo correction and enhance dialect handling 2026-03-15 02:33:06 +09:00
bab2min
dd63f3bfdd Bump version to 0.23.0 in CMakeLists.txt and Macro.h 2026-03-14 20:29:23 +09:00
Minchul Lee
f68dc34126
Merge pull request #244 from bab2min/dev/optimize_typo_process
Optimize Typo Correction
2026-03-14 20:19:19 +09:00
bab2min
96bffaaffd Add continual typo corrections and adjust lengthening typo cost in Java Binding 2026-03-14 17:50:13 +09:00
bab2min
d7ce2915cf Fixed typo correction bug in bindings 2026-03-14 17:34:26 +09:00
bab2min
f2f24de2d3 Update Swift Binding 2026-03-14 17:22:07 +09:00
bab2min
71aa41cbd8 Update WASM Binding 2026-03-14 17:11:49 +09:00
bab2min
ce2a5184f0 Update Java Binding 2026-03-14 17:11:28 +09:00
bab2min
fc203ca652 Remove support for TypoTransformer in build method and update related calls of KiwiJava 2026-03-14 16:17:45 +09:00
bab2min
ab8f209690 Update tools compatible to the new typo correction API 2026-03-14 16:12:26 +09:00
bab2min
808d90ad1c Update C API compatible to a new typo correction 2026-03-14 15:38:56 +09:00
bab2min
a3c149d173 Fix KiwiBuilder initialization 2026-03-14 01:57:31 +09:00
bab2min
bf8a3964bd Add test for MultiWordTypo handling 2026-03-14 01:49:30 +09:00
bab2min
c77d229d53 Fix errors in Splitter's position calculations & Improve early termination 2026-03-14 01:48:58 +09:00
bab2min
368e7915b0 Add option oldSplitter to evaluator 2026-03-12 21:40:40 +09:00
bab2min
8692e7ea81 Fix wrong computation of NounEvaluator::computeScore 2026-03-12 21:39:45 +09:00
bab2min
683314ab7a Fix test cases 2026-03-12 21:38:29 +09:00
bab2min
6b078693e5 Fix wrong behaviors of new Splitter 2026-03-12 21:37:46 +09:00
bab2min
b455855653 Fix chunking & unk handling of Splitter 2026-03-12 02:27:34 +09:00
bab2min
7a2288456c Update test cases for the new typo function 2026-03-12 00:37:21 +09:00
bab2min
9163b0583b Add utility functions to AnalyzeOption for match and typo transformer configuration 2026-03-12 00:34:09 +09:00
bab2min
12423b0164 Implement optimized splitByTrieUsingTypo() 2026-03-12 00:33:01 +09:00
bab2min
71fddc801a Add getDefaultPreparedTypoSet() 2026-03-12 00:30:03 +09:00
bab2min
fe3cb43be6 Implement continual typo handling & pretokenization to TypoTransformer::generateGraph 2026-03-12 00:29:31 +09:00
bab2min
1e2069e115 Add test cases for PreparedTypoTransformer::generateGraph 2026-02-28 13:51:22 +09:00
bab2min
640fd8c77a Implement prototype of typo-aware splitting 2026-02-28 13:50:30 +09:00
Minchul Lee
54a34a9e15
Update README.md 2026-02-23 21:17:15 +09:00
bab2min
ba17128e25 Add testing instructions for WASM package in README 2026-02-23 01:29:30 +09:00
Minchul Lee
9e5e384dd9
Merge pull request #243 from bab2min/dev/ci_cd_for_wasm
Improve CI & CD for WASM
2026-02-23 01:28:40 +09:00
bab2min
ca6c6e0156 Add a workflow for wasm 2026-02-23 01:22:20 +09:00
bab2min
84521c8985 Fix integer overflows in constexpr computation 2026-02-23 00:55:30 +09:00
bab2min
322801eca0 Add basic unit test for wasm 2026-02-23 00:54:30 +09:00
bab2min
78307330c5 Improve wasm binding compatible to recent Kiwi 2026-02-23 00:48:27 +09:00
bab2min
3c68055666 Fix xcframework build script 2026-02-22 20:44:41 +09:00
Minchul Lee
4580bcd3f4
Merge pull request #242 from bab2min/dev/fix_macos_intel_fail
Fix macOS build configuration
2026-02-22 16:32:21 +09:00
bab2min
89bc4eff46 Fix macOS build configuration by simplifying compiler setup and ensuring correct deployment target 2026-02-22 16:04:50 +09:00
Minchul Lee
83a6f64851
Clarify Swift binding availability for iOS and macOS
Update Swift binding availability note in README.
2026-02-22 15:17:09 +09:00
Minchul Lee
b5990e17b5
Merge pull request #241 from bab2min/copilot/add-kiwi-swift-binding
Fix project root path in build script for XCFramework
2026-02-22 15:10:21 +09:00
bab2min
9aa03a40ae Fix project root path in build script for XCFramework 2026-02-22 15:09:25 +09:00
Minchul Lee
ca6247914f
Merge pull request #239 from bab2min/copilot/add-kiwi-swift-binding
Add Swift bindings for iOS/macOS support
2026-02-22 14:47:09 +09:00
bab2min
95ab533a10 Add joinParticleYo flag to PatternMatcher and C API enums 2026-02-22 02:48:32 +09:00
Minchul Lee
28b9c55250
Merge pull request #240 from bab2min/dev/oov-chr-model
Advanced OOV Handling
2026-02-22 02:34:22 +09:00
bab2min
b900cc7faf Update ZCoda test to incorporate oovChrFreqModel in analysis 2026-02-22 01:54:44 +09:00
bab2min
d7ab4334ec Merge branch 'dev/oov-chr-model' of https://github.com/bab2min/Kiwi into dev/oov-chr-model 2026-02-22 01:22:03 +09:00
bab2min
92a43c1652 Refactor SubstringCounter to use Vector instead of std::vector for table and chars 2026-02-22 01:20:40 +09:00
bab2min
6f4c92bd5d Fix test cases compatible to a new OOV handling 2026-02-22 01:16:43 +09:00
bab2min
d32fdd95f3 Add max_unk_form_size_followed_by_j_class to kiwi_config_t and update global config functions to C API 2026-02-22 01:15:09 +09:00
Minchul Lee
227fad2b4f
Merge branch 'main' into dev/oov-chr-model 2026-02-21 21:11:58 +09:00
bab2min
ada8636259 Add UnkFormScorer.cpp to project compilation 2026-02-21 21:09:34 +09:00
bab2min
b021352fac Refactor NounEvaluator to use unordered_map for golds and update scoring configurations 2026-02-21 21:09:01 +09:00
bab2min
a42cf4094b Add tests for ChrTokenizer, ChrModel, and ChrDataset functionality 2026-02-21 21:08:18 +09:00
bab2min
a6f6bc97b4 Add overrideConfig parameter to analyze function overloads 2026-02-21 21:07:49 +09:00
bab2min
1d94c7f5e3 Improve OOV handling based on Chr, ChrFreq 2026-02-21 21:07:30 +09:00
Minchul Lee
eaea80f978
Update README to include Rust and Flutter wrappers
Added sections for Rust and Flutter wrappers in README.
2026-02-20 20:51:55 +09:00
bab2min
f54dbb96e6 Optimize computing frequency of substrings in oovChrFreq 2026-02-03 02:19:52 +09:00
bab2min
044dcd0583 Add UnkFormScorer.cpp to CMakeLists.txt 2026-02-01 01:53:08 +09:00
bab2min
f6590d9943 Update OOV Chr model binary 2026-02-01 00:39:30 +09:00
bab2min
da1506de3c Refactor UnkFormScorer & Implement OOV Handling based on Chr & ChrFreq 2026-02-01 00:39:12 +09:00
bab2min
3bff514f7e Change the default value of hyper-parameter related to OOV handling 2026-02-01 00:33:06 +09:00
bab2min
9ef8748b9b Enhance CI workflow for XCFramework build and release
- Updated swift.yml to streamline XCFramework build process and add checksum calculation.
- Improved error handling in Kiwi class for joiner creation.
- Refined README for clarity and added installation instructions.
- Enhanced POSTag enum with irregular conjugation support.
- Updated tests to validate joiner functionality and score checks.
2026-01-23 15:14:52 +09:00
bab2min
d551c7cc91 Rename OOV scoring parameters 2026-01-23 01:51:22 +09:00
copilot-swe-agent[bot]
e95a8712c6 Fix CI check step and add integration tests
- Removed 'swift package diagnose' command (not available in Swift 6.1)
- Added integration tests for KiwiBuilder, tokenize, analyze, splitIntoSentences, and Joiner
- Tests gracefully skip if model files not available (for local development)
- Tests verify actual C library functionality with real tokenization
- All integration tests use ../../models/cong/base path (available in CI)

Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 13:50:35 +00:00
copilot-swe-agent[bot]
1e25c9781e Build C++ library in CI before running Swift tests
- Reverted the commented-out testVersion() test
- Updated CI workflow to build libkiwi_static.a before Swift tests
- Added linker flags to swift build/test commands to link against the C++ library
- Added linkerSettings to Package.swift for C++ and zlib dependencies
- Tests now run with full C library integration

Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 13:32:19 +00:00
copilot-swe-agent[bot]
11b0d53dbb Skip version test that requires C library in CI
The testVersion() test calls Kiwi.version which requires the C library to be linked. Since CI doesn't build the C++ library before running Swift tests, this test causes undefined symbol errors. Commented out the test with explanation.

Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 12:53:06 +00:00
copilot-swe-agent[bot]
6db9c7de98 Fix Swift CI workflow - use system Swift instead of separate installation
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 12:34:53 +00:00
copilot-swe-agent[bot]
737952f203 Add Swift bindings development guide
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 04:54:15 +00:00
copilot-swe-agent[bot]
d87da45590 Add Swift build configuration and GitHub workflow
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 04:53:07 +00:00
copilot-swe-agent[bot]
fed126e1df Add Swift bindings foundation structure and core types
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-22 04:51:37 +00:00
copilot-swe-agent[bot]
6ae75babda Initial plan 2026-01-22 04:45:49 +00:00
Minchul Lee
0ea44c26c0
Merge pull request #237 from bab2min/copilot/add-join-yo-option
Add `Match::joinParticleYo` option to merge EC/EF verbal endings with 요/JX auxiliary particle
2026-01-18 21:50:02 +09:00
bab2min
fade6bcd66 Update test cases for joinParticleYo 2026-01-18 21:09:36 +09:00
bab2min
d3b103c75c Rename joinYo to joinParticleYo for clarity 2026-01-18 20:48:54 +09:00
bab2min
d59673a829 Add a noun character model binary 2026-01-18 15:01:41 +09:00
bab2min
8bad7c6371 Update tools for ChrModel 2026-01-18 15:00:52 +09:00
bab2min
53f5289d55 Update senseId handling for OOV tokens and improve TokenInfo structure 2026-01-18 14:59:05 +09:00
bab2min
13c37d03bd Add nounChrMdl to Kiwi and enhance OOV scoring options 2026-01-18 14:58:12 +09:00
bab2min
d0b6300c64 Enhanced functionality of CoNgramModel for ChrModel
- Introduced new methods for frequency quantization and dequantization.
- Enhanced CoNgramModel constructor to handle different key types and added support for trie frequency.
- Implemented context frequency retrieval and node depth querying.
- Updated progress methods to utilize unpacked context IDs and improved handling of output embedding biases.
- Added a new method to build character models from context definitions and embeddings.
- Improved memory management and alignment for various data structures.
- Refined context node progression logic to accommodate new key handling.
2026-01-18 14:56:11 +09:00
bab2min
54979a5f6c Implement ChrTokenizer & ChrDataset 2026-01-18 14:48:23 +09:00
bab2min
4a7cd6a88b Merge branch 'main' of https://github.com/bab2min/Kiwi 2026-01-18 14:45:33 +09:00
bab2min
03d010a965 Fix errors in eval_data/dialect 2026-01-18 14:45:28 +09:00
copilot-swe-agent[bot]
2f637046a0 Add joinYo option to Match enum and implement joining logic
Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
2026-01-16 09:33:22 +00:00
copilot-swe-agent[bot]
26175c6850 Initial plan 2026-01-16 09:27:08 +00:00
108 changed files with 11382 additions and 1886 deletions

View file

@ -34,13 +34,12 @@ jobs:
echo "CC=gcc-${{ matrix.version }}" >> $GITHUB_ENV
echo "CXX=g++-${{ matrix.version }}" >> $GITHUB_ENV
else
ls -ls /Applications/
sudo xcode-select -switch /Applications/Xcode_${{ matrix.version }}.app
echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV
echo "CXX=$(brew --prefix llvm@18)/bin/clang++" >> $GITHUB_ENV
echo "CC=clang" >> $GITHUB_ENV
echo "CXX=clang++" >> $GITHUB_ENV
fi
- name: Configure Build
run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 ..
run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 -DKIWI_JAVA_BINDING=1 ..
- name: Build
run: cd build && make -j2
- name: Run Unit Test

View file

@ -114,7 +114,7 @@ jobs:
asset_name: kiwi-java-${{ steps.get_release.outputs.tag_name }}-mac-${{ matrix.arch }}.jar
asset_content_type: application/octet-stream
- if: matrix.arch == 'arm64'
run: tar -zcvf model.tgz models/cong/base/sj.* models/cong/base/extract.mdl models/cong/base/*.dict models/cong/base/combiningRule.txt models/cong/base/*.mdl
run: tar -zcvf model.tgz models/cong/base/sj.* models/cong/base/*.dict models/cong/base/combiningRule.txt models/cong/base/*.mdl
- name: Upload release binary
if: matrix.arch == 'arm64'
uses: actions/upload-release-asset@v1.0.2
@ -319,20 +319,28 @@ jobs:
build-emscripten:
name: Emscripten
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write
steps:
- uses: actions/checkout@v3
with:
submodules: true
lfs: true
- uses: mymindstorm/setup-emsdk@v14
- uses: actions/setup-node@v4
with:
node-version: '20'
registry-url: 'https://registry.npmjs.org'
- name: Build
run: |
cd bindings/wasm
./build.sh
- uses: JS-DevTools/npm-publish@v3
with:
token: ${{ secrets.NPM_TOKEN }}
package: bindings/wasm/package
- name: Publish to npm
run: |
npm install -g npm@latest
cd bindings/wasm/package
npm publish --provenance --access public
- name: Build documentation
run: |
cd bindings/wasm/package

183
.github/workflows/swift.yml vendored Normal file
View file

@ -0,0 +1,183 @@
name: Swift Bindings
on:
pull_request:
branches: [ main ]
paths:
- 'bindings/swift/**'
- 'include/kiwi/capi.h'
- 'include/kiwi/Macro.h'
- '.github/workflows/swift.yml'
push:
branches: [ main ]
paths:
- 'bindings/swift/**'
- 'include/kiwi/capi.h'
- 'include/kiwi/Macro.h'
tags:
- 'v*'
jobs:
# Job: build the C++ core as a static library on macOS, then build and test
# the Swift package in bindings/swift against it.
# Steps, in order:
#   1. Check out the repository including submodules and LFS objects.
#   2. Select /Applications/Xcode.app and print the Swift version.
#   3. Configure with CMake (Release, KIWI_BUILD_DYNAMIC=OFF and the other
#      KIWI_* switches listed in the step) and run make.
#   4. Fail early if build/libkiwi_static.a was not produced.
#   5. Run `swift build` and `swift test`, passing -L../../build and
#      -lkiwi_static to the linker so the package links the library from step 3.
swift-build-test:
name: Swift Build and Test
runs-on: macos-latest
steps:
- uses: actions/checkout@v3
with:
submodules: true
lfs: true
- name: Select Xcode
run: sudo xcode-select -switch /Applications/Xcode.app
- name: Swift Version
run: swift --version
- name: Build C++ Library
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release \
-DKIWI_BUILD_DYNAMIC=OFF \
-DKIWI_BUILD_CLI=OFF \
-DKIWI_BUILD_EVALUATOR=OFF \
-DKIWI_BUILD_MODEL_BUILDER=OFF \
-DKIWI_BUILD_TEST=OFF \
-DKIWI_JAVA_BINDING=OFF \
-DKIWI_USE_MIMALLOC=ON \
..
make -j$(sysctl -n hw.ncpu)
- name: Verify Static Library
run: |
if [ ! -f build/libkiwi_static.a ]; then
echo "Error: libkiwi_static.a not found"
exit 1
fi
file build/libkiwi_static.a
ls -lh build/libkiwi_static.a
- name: Build Swift Package
run: |
cd bindings/swift
swift build -v -Xlinker -L../../build -Xlinker -lkiwi_static
- name: Run Swift Tests
run: |
cd bindings/swift
swift test -v -Xlinker -L../../build -Xlinker -lkiwi_static
# Job: build the Kiwi XCFramework and keep it as a workflow artifact.
# Runs only for push events whose ref is refs/heads/main (see `if:` below).
# Steps, in order:
#   1. Check out the repository including submodules and LFS objects.
#   2. Run bindings/swift/scripts/build-xcframework.sh.
#   3. Fail if either Kiwi.xcframework or Kiwi.xcframework.zip is missing
#      under bindings/swift/xcframework.
#   4. Write the `swift package compute-checksum` result to checksum.txt.
#   5. Upload the zip and checksum.txt as the artifact "Kiwi-xcframework".
swift-xcframework:
name: Build XCFramework
runs-on: macos-latest
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
steps:
- uses: actions/checkout@v3
with:
submodules: true
lfs: true
- name: Setup Xcode
run: sudo xcode-select -switch /Applications/Xcode.app
- name: Build XCFramework
run: |
chmod +x bindings/swift/scripts/build-xcframework.sh
bindings/swift/scripts/build-xcframework.sh
- name: Verify XCFramework
run: |
if [ ! -d bindings/swift/xcframework/Kiwi.xcframework ]; then
echo "Error: Kiwi.xcframework not found"
exit 1
fi
if [ ! -f bindings/swift/xcframework/Kiwi.xcframework.zip ]; then
echo "Error: Kiwi.xcframework.zip not found"
exit 1
fi
ls -lh bindings/swift/xcframework/
- name: Calculate Checksum
run: |
cd bindings/swift/xcframework
swift package compute-checksum Kiwi.xcframework.zip > checksum.txt
echo "Checksum: $(cat checksum.txt)"
- name: Archive XCFramework
uses: actions/upload-artifact@v4
with:
name: Kiwi-xcframework
path: |
bindings/swift/xcframework/Kiwi.xcframework.zip
bindings/swift/xcframework/checksum.txt
# Job: build the Kiwi XCFramework and attach it to the release for a tag.
# Runs only for push events whose ref starts with refs/tags/v (see `if:` below).
# Steps, in order:
#   1. Check out the repository including submodules and LFS objects.
#   2. Run bindings/swift/scripts/build-xcframework.sh.
#   3. Compute the zip checksum and expose it as the step output `checksum`
#      (step id: checksum).
#   4. Upload Kiwi.xcframework.zip to the release with
#      softprops/action-gh-release@v1.
#   5. Print a `.binaryTarget(...)` snippet containing the release download
#      URL for the tag and the checksum from step 3.
# NOTE(review): unlike the main-branch XCFramework job, this job has no
# explicit "verify the zip exists" step before computing the checksum.
swift-release:
name: Release XCFramework
runs-on: macos-latest
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
steps:
- uses: actions/checkout@v3
with:
submodules: true
lfs: true
- name: Setup Xcode
run: sudo xcode-select -switch /Applications/Xcode.app
- name: Build XCFramework
run: |
chmod +x bindings/swift/scripts/build-xcframework.sh
bindings/swift/scripts/build-xcframework.sh
- name: Calculate Checksum
id: checksum
run: |
cd bindings/swift/xcframework
CHECKSUM=$(swift package compute-checksum Kiwi.xcframework.zip)
echo "checksum=$CHECKSUM" >> $GITHUB_OUTPUT
echo "Checksum: $CHECKSUM"
- name: Upload to Release
uses: softprops/action-gh-release@v1
with:
files: |
bindings/swift/xcframework/Kiwi.xcframework.zip
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Output SPM Configuration
run: |
TAG=${GITHUB_REF#refs/tags/}
echo ""
echo "=== Swift Package Manager Configuration ==="
echo ""
echo "Add this to your Package.swift:"
echo ""
echo ".binaryTarget("
echo " name: \"CKiwi\","
echo " url: \"https://github.com/${{ github.repository }}/releases/download/$TAG/Kiwi.xcframework.zip\","
echo " checksum: \"${{ steps.checksum.outputs.checksum }}\""
echo ")"
# Job: best-effort check of the Swift package on Linux.
# Installs Swift 5.10 via swift-actions/setup-swift@v2 and runs
# `swift package diagnose` inside bindings/swift.
# The command is followed by `|| true`, so this step succeeds regardless of the
# command's exit status; the job is informational and cannot fail on it.
# NOTE(review): confirm that `swift package diagnose` is a valid subcommand for
# the installed toolchain - because of `|| true`, an unknown subcommand would
# go unnoticed and the step would check nothing.
swift-linux-check:
name: Swift Linux Compatibility Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
submodules: true
lfs: true
- name: Setup Swift
uses: swift-actions/setup-swift@v2
with:
swift-version: "5.10"
- name: Check Package Format
run: |
cd bindings/swift
swift package diagnose || true
echo "Note: Linux build may not work without modifications, but checking package structure"

View file

@ -36,7 +36,7 @@ jobs:
- name: Print CPU Info
run: cat /proc/cpuinfo
- name: Build
run: cd build && make -j2
run: cd build && make -j1
- name: Run Unit Test
run: ./build/test/kiwi-test
- name: Run Unit Test in Debug mode

50
.github/workflows/wasm.yml vendored Normal file
View file

@ -0,0 +1,50 @@
name: WASM Bindings
on:
pull_request:
branches: [ main ]
paths:
- 'bindings/wasm/**'
- 'src/**'
- 'include/**'
- 'CMakeLists.txt'
- '.github/workflows/wasm.yml'
push:
branches: [ main ]
paths:
- 'bindings/wasm/**'
- 'src/**'
- 'include/**'
- 'CMakeLists.txt'
- '.github/workflows/wasm.yml'
jobs:
# Job: build the WASM binding on Ubuntu and run its unit tests.
# Steps, in order:
#   1. Check out the repository including submodules and LFS objects.
#   2. Install Node.js 20 (actions/setup-node@v3).
#   3. Install the Emscripten SDK (mymindstorm/setup-emsdk@v14) at the version
#      given below.
#   4. Run bindings/wasm/build.sh.
#   5. Run `npm run test` inside bindings/wasm/package.
wasm-build-test:
name: WASM Build and Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
submodules: true
lfs: true
- name: Setup Node.js
uses: actions/setup-node@v3
with:
node-version: '20'
- name: Setup Emscripten
uses: mymindstorm/setup-emsdk@v14
with:
version: '3.1.64' # Pinned emsdk version; change this value to upgrade
- name: Build WASM
run: |
chmod +x bindings/wasm/build.sh
bindings/wasm/build.sh
- name: Run WASM Unit Test
run: |
cd bindings/wasm/package
npm run test

View file

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.12)
project(kiwi VERSION 0.22.2 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
project(kiwi VERSION 0.23.2 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
set ( CMAKE_CXX_STANDARD 17 )
set ( CMAKE_VERBOSE_MAKEFILE true )
@ -53,7 +53,7 @@ else()
set ( AVX_VNNI_SUPPORTED OFF )
endif()
if(APPLE)
if(APPLE AND NOT CMAKE_OSX_ARCHITECTURES)
set(CMAKE_OSX_ARCHITECTURES "${KIWI_CPU_ARCH}")
endif()
@ -79,6 +79,7 @@ set ( CORE_SRCS
src/TagUtils.cpp
src/TypoTransformer.cpp
src/UnicodeCase.cpp
src/UnkFormScorer.cpp
src/Utils.cpp
src/WordDetector.cpp
src/archImpl/none.cpp

View file

@ -11,6 +11,7 @@ x86_64:
Other:
[![Action Status ARM64](https://github.com/bab2min/Kiwi/workflows/Arm64-Centos7/badge.svg)](https://github.com/bab2min/Kiwi/actions)
[![Action Status PPC64LE](https://github.com/bab2min/Kiwi/workflows/PPC64LE-Centos7/badge.svg)](https://github.com/bab2min/Kiwi/actions)
[![WASM Bindings](https://github.com/bab2min/Kiwi/actions/workflows/wasm.yml/badge.svg)](https://github.com/bab2min/Kiwi/actions)
Kiwi는 빠른 속도와 범용적인 성능을 지향하는 한국어 형태소 분석기 라이브러리입니다. 한국어 처리에 관심 있는 사람이면 누구나 쉽게 사용할 수 있도록 오픈 소스로 공개 중이며, C++로 구현된 코어 라이브러리를 래핑하여 다양한 프로그래밍 언어에 사용할 수 있도록 준비 중입니다.
@ -32,7 +33,7 @@ Kiwi는 빠른 속도와 범용적인 성능을 지향하는 한국어 형태소
문장 분리 기능을 비롯한 다양한 편의기능을 제공합니다. (문장 분리 성능 평가는 [이 페이지](https://github.com/bab2min/kiwipiepy/tree/main/benchmark/sentence_split)에서 수행가능합니다. )
라이브러리 차원에서 멀티스레딩을 지원하기 때문에 대량의 텍스트를 분석해야할 경우 멀티코어를 활용하여 빠른 분석이 가능합니다. 또한 다양한 시스템에서 상황에 맞춰 선택할 수 있도록 소형/중형/대형 모델을 제공합니다.
라이브러리 차원에서 멀티스레딩을 지원하기 때문에 대량의 텍스트를 분석해야할 경우 멀티코어를 활용하여 빠른 분석이 가능합니다.
## 웹 데모 페이지
최신 버전의 Kiwi를 사용해볼 수 있는 [웹 데모 페이지](https://kiwi.bab2min.pe.kr/)를 제공하고 있습니다.
@ -149,6 +150,12 @@ Android NDK를 통해 Android 앱에서 사용할 수 있는 AAR 라이브러리
- **사용법**: [bindings/java](bindings/java)의 README 참조
- **패키지**: AAR 형태로 제공되어 Gradle 프로젝트에 쉽게 통합 가능
### Swift Wrapper
iOS 12.0 이상 및 macOS 10.14 이상에서 사용 가능한 Swift binding이 제공 예정입니다. 조금만 기다려주세요.
- **최소 요구사항**: iOS 12.0+ / macOS 10.14+, Swift 5.7+
- **사용법**: [bindings/swift](bindings/swift)의 README 참조
- **설치**: Swift Package Manager 지원
### R Wrapper
[mrchypark](https://github.com/mrchypark)님께서 기여해주신 R언어용 wrapper인 [elbird](https://mrchypark.github.io/elbird/)가 있습니다.
@ -158,6 +165,12 @@ Android NDK를 통해 Android 앱에서 사용할 수 있는 AAR 라이브러리
### Web Assembly (Javascript/Typescript)
[RicBent](https://github.com/RicBent)님께서 기여해주신 Web Assembly binding이 있습니다. 이에 대해서는 [bindings/wasm](bindings/wasm)를 참조하시길 바랍니다.
### Rust Wrapper
[JAICHANGPARK](https://github.com/JAICHANGPARK)님께서 개발하신 Rust용 wrapper인 [kiwi-rs](https://github.com/JAICHANGPARK/kiwi-rs)가 있습니다.
### Flutter Wrapper
[JAICHANGPARK](https://github.com/JAICHANGPARK)님께서 개발하신 Flutter용 wrapper인 [flutter_kiwi_nlp](https://github.com/JAICHANGPARK/flutter_kiwi_nlp)가 있습니다.
### 응용 프로그램
Kiwi는 C# 기반의 GUI 형태로도 제공됩니다.
형태소 분석기는 사용해야하지만 별도의 프로그래밍 지식이 없는 경우 이 프로그램을 사용하시면 됩니다.

View file

@ -278,6 +278,17 @@ namespace jni
};
}
// JNI-side wrapper around kiwi::PreparedTypoTransformer.
// Inherits publicly from kiwi::PreparedTypoTransformer, so a pointer to this
// wrapper can be used where a kiwi::PreparedTypoTransformer* is expected, and
// from jni::JObject<JPreparedTypoTransformer> for the binding layer (private
// inheritance, since no access specifier is given on a `class`).
class JPreparedTypoTransformer : public kiwi::PreparedTypoTransformer, jni::JObject<JPreparedTypoTransformer>
{
public:
// JNI name of the Java peer: the nested class KiwiBuilder.PreparedTypoTransformer
// in package kr.pe.bab2min ('$' separates outer and nested class names).
static constexpr std::string_view className = "kr/pe/bab2min/KiwiBuilder$PreparedTypoTransformer";
// Default-constructs the wrapped base object.
JPreparedTypoTransformer() : PreparedTypoTransformer{} {}
// Intentionally non-explicit converting constructor: takes ownership of an
// existing kiwi::PreparedTypoTransformer by moving it into the base subobject.
JPreparedTypoTransformer(kiwi::PreparedTypoTransformer&& inst) : PreparedTypoTransformer{ std::move(inst) } {}
// Only move operations are declared, so the implicit copy constructor and
// copy assignment are deleted and the wrapper is move-only.
JPreparedTypoTransformer(JPreparedTypoTransformer&&) = default;
JPreparedTypoTransformer& operator=(JPreparedTypoTransformer&&) = default;
};
class JKiwi;
class JMorphemeSet : jni::JObject<JMorphemeSet>
@ -322,8 +333,7 @@ public:
JMorphemeSet* _blocklist,
kiwi::Dialect _allowedDialects,
float _dialectCost,
jni::JIterator<jni::JIterator<kiwi::PretokenizedSpan>> _pretokenized
);
jni::JIterator<jni::JIterator<kiwi::PretokenizedSpan>> _pretokenized);
JMultipleTokenResult(JMultipleTokenResult&&) = default;
JMultipleTokenResult& operator=(JMultipleTokenResult&&) = default;
@ -385,8 +395,9 @@ public:
return KIWI_VERSION_STRING;
}
auto analyze(const std::u16string& text, uint64_t topN,
auto analyze(const std::u16string& text, uint64_t topN,
kiwi::Match matchOption, JMorphemeSet* blocklist, kiwi::Dialect allowedDialects, float dialectCost,
JPreparedTypoTransformer* typoTransformer, float typoThreshold,
jni::JIterator<kiwi::PretokenizedSpan> pretokenized) const
{
std::vector<kiwi::PretokenizedSpan> pretokenizedSpans;
@ -394,13 +405,15 @@ public:
{
while (pretokenized.hasNext()) pretokenizedSpans.emplace_back(pretokenized.next());
}
return Kiwi::analyze(text, topN,
kiwi::AnalyzeOption{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost },
pretokenizedSpans);
kiwi::AnalyzeOption opt{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost };
opt.typoTransformer = typoTransformer;
opt.typoThreshold = typoThreshold;
return Kiwi::analyze(text, topN, opt, pretokenizedSpans);
}
JFutureTokenResult asyncAnalyze(jni::JRef<JKiwi> _ref, const std::u16string& text, uint64_t topN,
JFutureTokenResult asyncAnalyze(jni::JRef<JKiwi> _ref, const std::u16string& text, uint64_t topN,
kiwi::Match matchOption, JMorphemeSet* blocklist, kiwi::Dialect allowedDialects, float dialectCost,
JPreparedTypoTransformer* typoTransformer, float typoThreshold,
jni::JIterator<kiwi::PretokenizedSpan> pretokenized) const
{
std::vector<kiwi::PretokenizedSpan> pretokenizedSpans;
@ -408,13 +421,15 @@ public:
{
while (pretokenized.hasNext()) pretokenizedSpans.emplace_back(pretokenized.next());
}
return { _ref, Kiwi::asyncAnalyze(text, topN,
kiwi::AnalyzeOption{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost },
pretokenizedSpans) };
kiwi::AnalyzeOption opt{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost };
opt.typoTransformer = typoTransformer;
opt.typoThreshold = typoThreshold;
return { _ref, Kiwi::asyncAnalyze(text, topN, opt, pretokenizedSpans) };
}
JMultipleTokenResult analyze2(jni::JRef<JKiwi> _ref, jni::JIterator<std::u16string> texts, uint64_t topN,
JMultipleTokenResult analyze2(jni::JRef<JKiwi> _ref, jni::JIterator<std::u16string> texts, uint64_t topN,
kiwi::Match matchOption, JMorphemeSet* blocklist, kiwi::Dialect allowedDialects, float dialectCost,
JPreparedTypoTransformer* typoTransformer, float typoThreshold,
jni::JIterator<jni::JIterator<kiwi::PretokenizedSpan>> pretokenized) const
{
if (!texts) throw std::bad_optional_access{};
@ -557,6 +572,11 @@ public:
{
TypoTransformer::update(o);
}
// Builds a JPreparedTypoTransformer from this transformer by forwarding to
// TypoTransformer::prepare(true); the result is converted to the JNI wrapper
// type on return.
// NOTE(review): the meaning of the `true` argument and the exact return type of
// TypoTransformer::prepare are not visible here - confirm against its declaration.
JPreparedTypoTransformer prepare() const
{
return TypoTransformer::prepare(true);
}
};
class JStreamProvider : jni::JPureObject<JStreamProvider>
@ -720,16 +740,9 @@ public:
return KiwiBuilder::addPreAnalyzedWord(form, morphs, positions, score);
}
JKiwi build(JTypoTransformer* typos, float typoCostThreshold) const
JKiwi build() const
{
if (typos)
{
return KiwiBuilder::build(*typos, typoCostThreshold);
}
else
{
return KiwiBuilder::build();
}
return KiwiBuilder::build();
}
};
@ -739,6 +752,8 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
{
return gModule.load(vm,
jni::define<JPreparedTypoTransformer>(),
jni::define<JTypoTransformer>()
.template ctor<>()
.template method<&JTypoTransformer::addTypo>("_addTypo")
@ -746,7 +761,8 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
.template method<&JTypoTransformer::setLengtheningTypoCost>("_setLengtheningTypoCost")
.template method<&JTypoTransformer::copy>("copy")
.template method<&JTypoTransformer::update>("_update")
.template method<&JTypoTransformer::scaleCost>("_scaleCost"),
.template method<&JTypoTransformer::scaleCost>("_scaleCost")
.template method<&JTypoTransformer::prepare>("prepare"),
jni::define<JKiwiBuilder>()
.template ctor<std::string, size_t, kiwi::BuildOption, kiwi::ModelType, kiwi::Dialect>()

View file

@ -21,6 +21,11 @@ public class Kiwi implements AutoCloseable {
mention = 1 << 3,
serial = 1 << 4,
emoji = 1 << 5,
oovRuleOnly = 0 << 8,
oovChrModel = 1 << 8,
oovChrFreqModel = 2 << 8,
oovChrFreqBranchModel = 3 << 8,
oovMask = 3 << 8,
normalizeCoda = 1 << 16,
joinNounPrefix = 1 << 17,
joinNounSuffix = 1 << 18,
@ -32,6 +37,7 @@ public class Kiwi implements AutoCloseable {
compatibleJamo = 1 << 24,
splitSaisiot = 1 << 25,
mergeSaisiot = 1 << 26,
joinParticleYo = 1 << 27,
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | zCoda,
@ -167,12 +173,20 @@ public class Kiwi implements AutoCloseable {
public MorphemeSet blocklist;
public short allowedDialects;
public float dialectCost;
public KiwiBuilder.PreparedTypoTransformer typoTransformer;
public float typoThreshold;
public AnalyzeOption(int match, MorphemeSet blocklist, short allowedDialects, float dialectCost) {
public AnalyzeOption(int match, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold) {
this.match = match;
this.blocklist = blocklist;
this.allowedDialects = allowedDialects;
this.dialectCost = dialectCost;
this.typoTransformer = typoTransformer;
this.typoThreshold = typoThreshold;
}
/**
 * Convenience constructor that omits the typo-correction settings.
 * Delegates to the six-argument constructor with {@code typoTransformer = null}
 * and {@code typoThreshold = 2.5f}.
 * NOTE(review): a null transformer presumably means "no typo correction" -
 * confirm against the native analyze implementation.
 *
 * @param match           match option flags
 * @param blocklist       morpheme set stored in {@code blocklist}; may be null
 * @param allowedDialects dialect flags stored in {@code allowedDialects}
 * @param dialectCost     value stored in {@code dialectCost}
 */
public AnalyzeOption(int match, MorphemeSet blocklist, short allowedDialects, float dialectCost) {
this(match, blocklist, allowedDialects, dialectCost, null, 2.5f);
}
public AnalyzeOption(int match, MorphemeSet blocklist) {
@ -427,16 +441,16 @@ public class Kiwi implements AutoCloseable {
return _inst != 0;
}
public native TokenResult[] analyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, Iterator<PretokenizedSpan> pretokenized);
public native FutureTokenResult asyncAnalyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, Iterator<PretokenizedSpan> pretokenized);
public native MultipleTokenResult analyze(Iterator<String> texts, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, Iterator<Iterator<PretokenizedSpan>> pretokenized);
public native TokenResult[] analyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold, Iterator<PretokenizedSpan> pretokenized);
public native FutureTokenResult asyncAnalyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold, Iterator<PretokenizedSpan> pretokenized);
public native MultipleTokenResult analyze(Iterator<String> texts, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold, Iterator<Iterator<PretokenizedSpan>> pretokenized);
public native Sentence[] splitIntoSents(String text, int matchOption, boolean returnTokens);
public native String join(JoinableToken[] tokens);
public static native String getVersion();
public TokenResult[] analyze(String text, int topN, AnalyzeOption option, Iterator<PretokenizedSpan> pretokenized) {
return analyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, pretokenized);
return analyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, option.typoTransformer, option.typoThreshold, pretokenized);
}
public TokenResult[] analyze(String text, int topN, AnalyzeOption option) {
@ -444,7 +458,7 @@ public class Kiwi implements AutoCloseable {
}
public FutureTokenResult asyncAnalyze(String text, int topN, AnalyzeOption option, Iterator<PretokenizedSpan> pretokenized) {
return asyncAnalyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, pretokenized);
return asyncAnalyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, option.typoTransformer, option.typoThreshold, pretokenized);
}
public FutureTokenResult asyncAnalyze(String text, int topN, AnalyzeOption option) {
@ -452,7 +466,7 @@ public class Kiwi implements AutoCloseable {
}
public MultipleTokenResult analyze(Iterator<String> texts, int topN, AnalyzeOption option, Iterator<Iterator<PretokenizedSpan>> pretokenized) {
return analyze(texts, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, pretokenized);
return analyze(texts, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, option.typoTransformer, option.typoThreshold, pretokenized);
}
public MultipleTokenResult analyze(Iterator<String> texts, int topN, AnalyzeOption option) {

View file

@ -59,7 +59,8 @@ public class KiwiBuilder implements AutoCloseable {
final static public byte none = 0,
any = 1,
vowel = 2,
applosive = 8;
applosive = 8,
continual = 9;
}
public static class TypoTransformer implements AutoCloseable {
@ -137,6 +138,27 @@ public class KiwiBuilder implements AutoCloseable {
_scaleCost(scale);
return this;
}
public native PreparedTypoTransformer prepare();
}
/**
 * Handle object returned by {@code TypoTransformer.prepare()} and passed to the
 * analyzer through {@code AnalyzeOption.typoTransformer}.
 *
 * The instance only stores a {@code long}; presumably this is a pointer to a
 * native object owned by the JNI layer (TODO confirm in the native sources).
 */
public static class PreparedTypoTransformer implements AutoCloseable {
// Opaque value supplied by the creator; 0 is treated as "not alive" by isAlive().
// NOTE(review): native code may access this field by name, so do not rename it without checking the JNI side.
private long _inst;
/**
 * Wraps an existing handle value.
 *
 * @param _inst handle value to store; stored as-is without validation
 */
public PreparedTypoTransformer(long _inst) {
this._inst = _inst;
}
/**
 * Calls {@link #close()} when the object is finalized, as a fallback for
 * callers that never close it explicitly.
 * NOTE(review): Object.finalize() is deprecated since Java 9; callers should
 * prefer explicit close() or try-with-resources.
 */
protected void finalize() throws Exception {
close();
}
/**
 * @return true while the stored handle value is non-zero
 */
public boolean isAlive() {
return _inst != 0;
}
/**
 * Releases the underlying resource; implemented in native code.
 * Presumably resets the stored handle to 0 so that isAlive() turns false and a
 * later finalize() is harmless - verify against the native implementation.
 */
@Override
public native void close() throws Exception;
}
public KiwiBuilder(long _inst) {
@ -197,20 +219,12 @@ public class KiwiBuilder implements AutoCloseable {
@Override
public native void close() throws Exception;
public native Kiwi build(TypoTransformer typos, float typoCostThreshold);
public native Kiwi build();
public native boolean addWord(String form, byte tag, float score);
public native boolean addWord(String form, byte tag, float score, String origForm);
public native boolean addPreAnalyzedWord(String form, AnalyzedMorph[] analyzed, float score);
public native int loadDictionary(String path);
public Kiwi build() {
return build(null, 0);
}
public Kiwi build(TypoTransformer typos) {
return build(typos, 2.5f);
}
static {
Kiwi.loadLibrary();
}
@ -317,9 +331,49 @@ public class KiwiBuilder implements AutoCloseable {
.addTypo(new String[]{""}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{""}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{""}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{""}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none);
.addTypo(new String[]{""}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆨᄋ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆩᄋ", "ᆨᄀ"}, new String[]{"", "ᆨᄀ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆪᄋ", "ᆪᄒ"}, new String[]{"ᆨᄉ", "ᆨᄊ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆫᄋ", "ᆫᄒ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆬᄋ", "ᆫᄌ"}, new String[]{"ᆬᄋ", "ᆫᄌ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆭᄋ"}, new String[]{"ᆫᄒ", ""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆭᄀ"}, new String[]{"ᆫᄏ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆭᄃ"}, new String[]{"ᆫᄐ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆭᄇ"}, new String[]{"ᆫᄑ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆭᄉ"}, new String[]{"ᆫᄉ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆭᄌ"}, new String[]{"ᆫᄎ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆮᄋ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆯᄋ", "ᆯᄒ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆰᄋ"}, new String[]{"ᆯᄀ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆰᄀ"}, new String[]{"ᆯᄁ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆰᄒ"}, new String[]{"ᆯᄏ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆱᄋ", "ᆱᄒ"}, new String[]{"ᆯᄆ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆲᄋ"}, new String[]{"ᆯᄇ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆲᄇ"}, new String[]{"ᆯᄈ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆲᄒ"}, new String[]{"ᆯᄑ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆳᄋ"}, new String[]{"ᆯᄉ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆳᄉ"}, new String[]{"ᆯᄊ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆴᄋ", "ᆴᄐ", "ᆴᄒ"}, new String[]{"ᆯᄐ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆵᄋ", "ᆵᄑ", "ᆵᄒ"}, new String[]{"ᆯᄑ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆶᄉ"}, new String[]{"ᆯᄉ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆶᄋ", "ᆶᄒ"}, new String[]{"ᆯᄒ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆷᄋ", "ᆷᄒ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆸᄋ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆸᄇ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆹᄋ", "ᆹᄒ"}, new String[]{"ᆸᄉ", "ᆸᄊ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆺᄋ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆻᄋ", "ᆺᄉ"}, new String[]{"", "ᆺᄉ"}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆽᄋ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆽᄌ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᇂᄌ", "ᇂᄎ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᇂᄀ", "ᇂᄏ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᇂᄃ", "ᇂᄐ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᇂᄇ", "ᇂᄑ"}, new String[]{""}, 1.f, CondVowel.continual)
.addTypo(new String[]{"ᇂᄋ"}, new String[]{""}, 1.f, CondVowel.continual);
final public static TypoTransformer basicTypoSetWithContinual = basicTypoSet.copy().update(continualTypoSet);
final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.5f);
final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.25f);
}

View file

@ -6,6 +6,7 @@ import java.util.concurrent.Future;
import org.junit.Test;
import kr.pe.bab2min.KiwiBuilder.TypoTransformer;
import kr.pe.bab2min.KiwiBuilder.PreparedTypoTransformer;
import static org.junit.Assert.*;
@ -122,8 +123,10 @@ public class KiwiTest {
public void testTypos() throws Exception {
System.gc();
KiwiBuilder builder = new KiwiBuilder(modelPath);
Kiwi kiwi = builder.build(KiwiBuilder.basicTypoSet);
Kiwi.Token[] tokens = kiwi.tokenize("나 죰 도와죠.", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
Kiwi kiwi = builder.build();
PreparedTypoTransformer preparedTypo = KiwiBuilder.basicTypoSet.prepare();
AnalyzeOption option = new AnalyzeOption(Kiwi.Match.allWithNormalizing, null, Kiwi.Dialect.standard, 0.0f, preparedTypo, 2.5f);
Kiwi.Token[] tokens = kiwi.tokenize("나 죰 도와죠.", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[1].form, "");
assertEquals(tokens[4].form, "");
@ -134,29 +137,31 @@ public class KiwiTest {
public void testContinualTypos() throws Exception {
System.gc();
KiwiBuilder builder = new KiwiBuilder(modelPath);
Kiwi kiwi = builder.build(KiwiBuilder.continualTypoSet);
Kiwi kiwi = builder.build();
PreparedTypoTransformer preparedTypo = KiwiBuilder.continualTypoSet.prepare();
AnalyzeOption option = new AnalyzeOption(Kiwi.Match.allWithNormalizing, null, Kiwi.Dialect.standard, 0.0f, preparedTypo, 2.5f);
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "프로그램");
assertEquals(tokens[1].form, "");
tokens = kiwi.tokenize("프로그래믈", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
tokens = kiwi.tokenize("프로그래믈", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "프로그램");
assertEquals(tokens[1].form, "");
tokens = kiwi.tokenize("오늘사무시레서", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
tokens = kiwi.tokenize("오늘사무시레서", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[1].form, "사무실");
assertEquals(tokens[2].form, "에서");
tokens = kiwi.tokenize("법원이 기가캤다.", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
tokens = kiwi.tokenize("법원이 기가캤다.", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[2].form, "기각");
assertEquals(tokens[3].form, "");
tokens = kiwi.tokenize("하나도 업써.", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
tokens = kiwi.tokenize("하나도 업써.", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[2].form, "");
assertEquals(tokens[3].form, "");
@ -169,19 +174,21 @@ public class KiwiTest {
TypoTransformer typoSet = KiwiBuilder.basicTypoSet.copy()
.update(KiwiBuilder.continualTypoSet)
.update(KiwiBuilder.lengtheningTypoSet);
Kiwi kiwi = builder.build(typoSet);
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
Kiwi kiwi = builder.build();
PreparedTypoTransformer preparedTypo = typoSet.prepare();
AnalyzeOption option = new AnalyzeOption(Kiwi.Match.allWithNormalizing, null, Kiwi.Dialect.standard, 0.0f, preparedTypo, 2.5f);
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "프로그램");
assertEquals(tokens[1].form, "");
tokens = kiwi.tokenize("지인짜?", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
tokens = kiwi.tokenize("지인짜?", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "진짜");
assertEquals(tokens[1].form, "?");
tokens = kiwi.tokenize("맗은 물", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
tokens = kiwi.tokenize("맗은 물", option);
System.out.println(Arrays.deepToString(tokens));
assertEquals(tokens[0].form, "");
}

12
bindings/swift/.gitignore vendored Normal file
View file

@ -0,0 +1,12 @@
.DS_Store
/.build
/Packages
/*.xcodeproj
xcuserdata/
DerivedData/
.swiftpm/config/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
*.xcframework
build/
xcframework/

View file

@ -0,0 +1,25 @@
# CMakeLists.txt for iOS/macOS Swift bindings
cmake_minimum_required(VERSION 3.19)
# This file is used to build Kiwi as a static library for iOS/macOS
# to be bundled into an XCFramework for Swift Package Manager
project(kiwi_swift LANGUAGES CXX)
# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Disable options that are not needed for Swift bindings.
# FORCE is used so that stale cache entries from an earlier configure cannot
# re-enable these components.
set(KIWI_BUILD_DYNAMIC OFF CACHE BOOL "Build dynamic library" FORCE)
set(KIWI_BUILD_CLI OFF CACHE BOOL "Build CLI tool" FORCE)
set(KIWI_BUILD_EVALUATOR OFF CACHE BOOL "Build Evaluator" FORCE)
set(KIWI_BUILD_MODEL_BUILDER OFF CACHE BOOL "Build Model Builder" FORCE)
set(KIWI_BUILD_TEST OFF CACHE BOOL "Build Test sets" FORCE)
set(KIWI_JAVA_BINDING OFF CACHE BOOL "Build Java binding" FORCE)
# Add the main Kiwi project as a subdirectory.
# include() would execute the root CMakeLists.txt in THIS directory's scope, so
# CMAKE_CURRENT_SOURCE_DIR would still point at bindings/swift and every
# relative path in the root file (sources, include dirs, nested
# add_subdirectory calls) would resolve against the wrong directory.
# add_subdirectory() gives the root file its own directory scope; because the
# source directory lies outside this tree, an explicit binary dir is required.
add_subdirectory(
    "${CMAKE_CURRENT_LIST_DIR}/../.."
    "${CMAKE_CURRENT_BINARY_DIR}/kiwi"
)
# The main CMakeLists.txt should create the kiwi_static target
# which we'll use for the XCFramework

View file

@ -0,0 +1,220 @@
# Swift Bindings Development Guide
## Overview
This document provides technical details for developers working on the Kiwi Swift bindings.
## Architecture
The Swift bindings use a direct C interoperability approach:
```
┌─────────────────────┐
│ Swift API Layer │ User-friendly Swift interface
│ (Kiwi.swift, etc) │
└──────────┬──────────┘
┌──────────┴──────────┐
│ CKiwi Module │ C API bridging via module.modulemap
└──────────┬──────────┘
┌──────────┴──────────┐
│ libkiwi_static.a │ Static C++ library
└─────────────────────┘
```
## File Structure
```
bindings/swift/
├── Package.swift # Swift Package Manager manifest
├── README.md # User documentation
├── CMakeLists.txt # Build configuration
├── .gitignore # Git ignore rules
├── Sources/
│ ├── CKiwi/ # C module for bridging
│ │ ├── module.modulemap # C module definition
│ │ └── include/ # Symbolic links to C headers
│ │ ├── capi.h -> ../../../../../include/kiwi/capi.h
│ │ └── Macro.h -> ../../../../../include/kiwi/Macro.h
│ └── Kiwi/ # Swift wrapper layer
│ ├── Kiwi.swift # Main analyzer class
│ ├── KiwiBuilder.swift # Builder pattern
│ ├── Token.swift # Token structures
│ ├── POSTag.swift # POS tag enum
│ ├── MatchOptions.swift # Analysis options
│ ├── Dialect.swift # Dialect flags
│ ├── Joiner.swift # Morpheme joiner
│ ├── MorphemeSet.swift # Morpheme blacklist
│ ├── TypoTransformer.swift # Typo correction
│ ├── Errors.swift # Error types
│ └── Internal/
│ └── HandleWrapper.swift # RAII wrapper for C handles
├── Tests/
│ └── KiwiTests/
│ └── KiwiTests.swift # Unit tests
└── scripts/
└── build-xcframework.sh # XCFramework build script
```
## Key Design Patterns
### 1. RAII via HandleWrapper
C handles are wrapped in a Swift class that automatically releases resources:
```swift
internal final class HandleWrapper<H> {
let handle: H
private let cleanup: (H) -> Void
init(_ handle: H, cleanup: @escaping (H) -> Void) {
self.handle = handle
self.cleanup = cleanup
}
deinit {
cleanup(handle)
}
}
```
### 2. Swift-Friendly Types
C types are mapped to Swift types:
- C `kiwi_h` → Swift `Kiwi` class
- C `kiwi_token_info_t` → Swift `Token` struct
- C flags → Swift `OptionSet` (MatchOptions, Dialect)
- C enums → Swift `enum` (POSTag)
### 3. Error Handling
C error codes are converted to Swift errors:
```swift
if result != 0 {
if let errorMsg = kiwi_error() {
let error = String(cString: errorMsg)
kiwi_clear_error()
throw KiwiError.operationFailed(error)
}
}
```
## Building
### For Development (macOS)
```bash
cd bindings/swift
swift build
swift test
```
### For Production (XCFramework)
```bash
cd bindings/swift
./scripts/build-xcframework.sh
```
This creates `xcframework/Kiwi.xcframework` containing:
- iOS Device (arm64)
- iOS Simulator (arm64 + x86_64)
- macOS (arm64 + x86_64)
## Testing
### Unit Tests
Run basic unit tests:
```bash
cd bindings/swift
swift test
```
### Integration Tests
Integration tests require actual Kiwi model files. These are not included in unit tests to keep them lightweight.
## CI/CD
GitHub Actions workflow (`.github/workflows/swift.yml`) runs:
1. Swift package build and test
2. XCFramework build (on main branch)
3. Linux compatibility check
## Memory Management
All C handles are automatically released via Swift's ARC system:
- `Kiwi` → calls `kiwi_close()`
- `KiwiBuilder` → calls `kiwi_builder_close()`
- `Joiner` → calls `kiwi_joiner_close()`
- `MorphemeSet` → calls `kiwi_morphset_close()`
- `TypoTransformer` → calls `kiwi_typo_close()` (if owned)
## Thread Safety
The Swift bindings maintain the same thread safety guarantees as the underlying C API:
- Multiple `Kiwi` instances can be used concurrently
- Individual `Kiwi` instances should not be shared across threads without synchronization
## Future Enhancements
Potential areas for improvement:
1. **Binary Distribution**:
- Publish pre-built XCFramework via GitHub Releases
- Update Package.swift to reference binary framework
2. **Additional Features**:
- Word extraction APIs
- Substring extractor
- Pattern matching
3. **Async/Await**:
- Swift async/await wrapper for long-running operations
- Currently only sync APIs are provided
4. **Documentation**:
- DocC documentation comments
- Swift DocC build for hosted documentation
## Contributing
When adding new features:
1. Add C API function calls in appropriate Swift wrapper
2. Convert C types to Swift types appropriately
3. Add error handling
4. Update tests
5. Update README with examples
6. Update this guide if architecture changes
## Troubleshooting
### Symbol Not Found
If you get "symbol not found" errors, ensure:
- Symbolic links in `Sources/CKiwi/include/` are valid
- Headers are properly exposed in module.modulemap
- Library is correctly linked
### Build Failures
Common issues:
- Missing Git LFS files (model files)
- Incorrect symbolic link paths
- Platform-specific build settings
### Runtime Errors
Check:
- Model files are accessible
- Correct path to model directory
- Sufficient memory available
## License
Swift bindings are licensed under the same license as Kiwi.

View file

@ -0,0 +1,39 @@
// swift-tools-version: 5.7
// The swift-tools-version declares the minimum version of Swift required to build this package.
//
// Manifest for the Kiwi Swift bindings. It declares one library product ("Kiwi")
// built from a Swift wrapper target layered on top of a C bridging target.
import PackageDescription
let package = Package(
name: "Kiwi",
// Minimum deployment targets for the package.
platforms: [
.iOS(.v12),
.macOS(.v10_14)
],
// Only the Swift wrapper is exported; CKiwi is reachable through it as a dependency.
products: [
.library(
name: "Kiwi",
targets: ["Kiwi"]),
],
dependencies: [],
targets: [
// C bridging target rooted at Sources/CKiwi (module map + headers).
// Links the C++ runtime and zlib.
// NOTE(review): nothing in this manifest links the Kiwi static library itself;
// presumably it is supplied separately (e.g. via the XCFramework) - confirm.
.target(
name: "CKiwi",
dependencies: [],
path: "Sources/CKiwi",
linkerSettings: [
.linkedLibrary("c++"),
.linkedLibrary("z"),
]
),
// Swift API layer built on top of CKiwi.
.target(
name: "Kiwi",
dependencies: ["CKiwi"],
path: "Sources/Kiwi"
),
// Unit tests for the Swift API layer.
.testTarget(
name: "KiwiTests",
dependencies: ["Kiwi"],
path: "Tests/KiwiTests"
),
]
)

383
bindings/swift/README.md Normal file
View file

@ -0,0 +1,383 @@
# Kiwi Swift 바인딩
한국어 형태소 분석기 Kiwi의 Swift 바인딩입니다. iOS 및 macOS 앱에서 한국어 자연어 처리를 수행할 수 있습니다.
## 목차
- [요구 사항](#요구-사항)
- [설치](#설치)
- [모델 파일 설정](#모델-파일-설정)
- [기본 사용법](#기본-사용법)
- [고급 기능](#고급-기능)
- [API 레퍼런스](#api-레퍼런스)
- [품사 태그](#품사-태그)
## 요구 사항
- iOS 12.0+ / macOS 10.14+
- Swift 5.7+
- Xcode 14.0+
## 설치
### Swift Package Manager (권장)
#### 방법 1: Xcode에서 추가
1. Xcode에서 **File → Add Package Dependencies...** 선택
2. 저장소 URL 입력: `https://github.com/bab2min/Kiwi.git`
3. 버전 선택 후 **Add Package** 클릭
#### 방법 2: Package.swift에 직접 추가
```swift
// Package.swift
dependencies: [
.package(url: "https://github.com/bab2min/Kiwi.git", from: "0.22.0")
],
targets: [
.target(
name: "YourApp",
dependencies: ["Kiwi"]
)
]
```
## 모델 파일 설정
Kiwi를 사용하려면 모델 파일이 필요합니다. 모델 파일은 [Kiwi 릴리즈 페이지](https://github.com/bab2min/Kiwi/releases)에서 다운로드할 수 있습니다.
### iOS/macOS 앱에서 모델 번들링
1. 모델 폴더를 Xcode 프로젝트에 드래그하여 추가
2. **Copy items if needed** 체크
3. **Create folder references** 선택 (중요!)
4. 타겟에 추가되었는지 확인
```
YourApp/
├── Resources/
│ └── KiwiModels/ ← 모델 폴더
│ ├── combiningRule.txt
│ ├── default.dict
│ ├── extract.mdl
│ └── ...
```
## 기본 사용법
### 형태소 분석
```swift
import Kiwi
do {
// 1. KiwiBuilder 생성 (번들에서 모델 로드)
let builder = try KiwiBuilder(
bundle: .main,
modelDirectory: "KiwiModels"
)
// 2. Kiwi 인스턴스 빌드
let kiwi = try builder.build()
// 3. 형태소 분석
let tokens = try kiwi.tokenize("안녕하세요, 키위 형태소 분석기입니다!")
for token in tokens {
print("\(token.form)/\(token.tag.description)")
}
// 출력:
// 안녕/NNG
// 하/XSA
// 시/EP
// 어요/EF
// ,/SP
// 키위/NNG
// 형태소/NNG
// 분석기/NNG
// 이/VCP
// ㅂ니다/EF
// !/SF
} catch {
print("오류: \(error)")
}
```
### 경로로 모델 로드
```swift
// 직접 경로 지정
let builder = try KiwiBuilder(
modelPath: "/path/to/models",
numThreads: 4 // 스레드 수 지정 (-1: 자동)
)
```
### 다중 분석 결과 얻기
```swift
// topN 개의 분석 후보 반환
let results = try kiwi.analyze("감기는 감기다", topN: 3)
for (index, result) in results.enumerated() {
print("후보 \(index + 1) (점수: \(result.score)):")
for token in result.tokens {
print(" \(token.form)/\(token.tag.description)")
}
}
```
### 문장 분리
```swift
let text = "안녕하세요. 키위입니다. 형태소 분석을 합니다."
let sentences = try kiwi.splitIntoSentences(text)
for sentence in sentences {
print("문장: \(sentence.text)")
print(" 시작: \(sentence.start), 길이: \(sentence.length)")
}
// 출력 예시 (시작/길이 수치는 설명용이며 실제 값과 다를 수 있습니다):
// 문장: 안녕하세요.
// 시작: 0, 길이: 18
// 문장: 키위입니다.
// 시작: 19, 길이: 16
// 문장: 형태소 분석을 합니다.
// 시작: 36, 길이: 28
```
## 고급 기능
### 사용자 사전 추가
```swift
let builder = try KiwiBuilder(bundle: .main, modelDirectory: "KiwiModels")
// 단어 직접 추가
try builder.addWord("키위피", tag: .nnp, score: 0.0) // 고유명사로 추가
try builder.addWord("딥러닝", tag: .nng, score: 0.0) // 일반명사로 추가
// 사전 파일 로드 (탭으로 구분된 형식: 단어\t품사\t점수)
try builder.loadDict("/path/to/user_dict.txt")
let kiwi = try builder.build()
```
### 분석 옵션 설정
```swift
// 기본 옵션으로 분석
let tokens1 = try kiwi.tokenize("www.example.com", options: .all)
// URL, 이메일 등 패턴 매칭 + 정규화
let tokens2 = try kiwi.tokenize("www.example.com", options: .allWithNormalizing)
// 개별 옵션 조합
let customOptions: MatchOptions = [.url, .email, .normalizeCoda]
let tokens3 = try kiwi.tokenize("test@test.com", options: customOptions)
```
**MatchOptions 목록:**
| 옵션 | 설명 |
|------|------|
| `.url` | URL 패턴 인식 |
| `.email` | 이메일 패턴 인식 |
| `.hashtag` | 해시태그 인식 |
| `.mention` | 멘션(@) 인식 |
| `.serial` | 일련번호 인식 |
| `.normalizeCoda` | 덧붙은 받침 정규화 (먹었엌ㅋㅋ → 먹었어 ㅋㅋ) |
| `.joinNounPrefix` | 체언 접두사 결합 |
| `.joinNounSuffix` | 체언 접미사 결합 |
| `.joinVerbSuffix` | 동사 접미사 결합 |
| `.joinAdjSuffix` | 형용사 접미사 결합 |
| `.splitComplex` | 복합 형태소 분리 |
| `.all` | 기본 전체 옵션 |
| `.allWithNormalizing` | 전체 + 정규화 |
### 방언 지원
```swift
let builder = try KiwiBuilder(
bundle: .main,
modelDirectory: "KiwiModels",
enabledDialects: [.standard, .gyeongsang, .jeolla]
)
let kiwi = try builder.build()
```
**Dialect 목록:**
| 옵션 | 설명 |
|------|------|
| `.standard` | 표준어 (기본) |
| `.gyeonggi` | 경기 방언 |
| `.chungcheong` | 충청 방언 |
| `.gangwon` | 강원 방언 |
| `.gyeongsang` | 경상 방언 |
| `.jeolla` | 전라 방언 |
| `.jeju` | 제주 방언 |
| `.hwanghae` | 황해 방언 |
| `.hamgyeong` | 함경 방언 |
| `.pyeongan` | 평안 방언 |
| `.archaic` | 고어 |
### 오타 교정
```swift
// 기본 오타 교정기 사용
let typoTransformer = try TypoTransformer.basic()
let kiwi = try builder.build(
typoTransformer: typoTransformer,
typoCostThreshold: 2.5
)
let tokens = try kiwi.tokenize("장례희망이 뭐야?") // 오타 자동 교정
```
**TypoTransformer 유형:**
```swift
// 빈 트랜스포머
let empty = try TypoTransformer()
// 기본 오타 세트
let basic = try TypoTransformer.basic()
// 다양한 오타 세트
let continual = try TypoTransformer.default(.continualTypoSet)
let withLengthening = try TypoTransformer.default(.basicTypoSetWithContinualAndLengthening)
```
### 형태소 결합 (Joiner)
형태소를 결합하여 자연스러운 문장을 생성합니다.
```swift
let joiner = try kiwi.createJoiner()
try joiner.add(form: "먹", tag: .vv) // 동사 어간
try joiner.add(form: "었", tag: .ep) // 선어말 어미
try joiner.add(form: "다", tag: .ef) // 종결 어미
let text = try joiner.join()
print(text) // "먹었다"
```
```swift
// 불규칙 활용 자동 처리
let joiner = try kiwi.createJoiner()
try joiner.add(form: "듣", tag: .vvi) // ㄷ불규칙 동사
try joiner.add(form: "어", tag: .ec)
let text = try joiner.join()
print(text) // "들어" (ㄷ → ㄹ 불규칙 적용)
```
### 형태소 블랙리스트
특정 형태소를 분석에서 제외합니다.
```swift
let morphset = try kiwi.createMorphemeSet()
try morphset.add(form: "가", tag: .jks) // 주격조사 '가' 제외
// analyze 시 blocklist로 사용 (향후 지원 예정)
```
### Token 정보 활용
```swift
let tokens = try kiwi.tokenize("서울에서 부산까지")
for token in tokens {
print("""
형태: \(token.form)
품사: \(token.tag.description)
위치: \(token.position) (길이: \(token.length))
어절 번호: \(token.wordPosition)
문장 번호: \(token.sentencePosition)
점수: \(token.score)
오타 비용: \(token.typoCost)
""")
}
```
### JSON 직렬화
`Token`, `TokenResult`, `Sentence`는 모두 `Codable`을 준수합니다.
```swift
let tokens = try kiwi.tokenize("안녕하세요")
let encoder = JSONEncoder()
encoder.outputFormatting = .prettyPrinted
let jsonData = try encoder.encode(tokens)
let jsonString = String(data: jsonData, encoding: .utf8)!
print(jsonString)
```
## API 레퍼런스
### KiwiBuilder
| 메서드 | 설명 |
|--------|------|
| `init(modelPath:numThreads:options:enabledDialects:)` | 경로로 초기화 |
| `init(bundle:modelDirectory:numThreads:options:enabledDialects:)` | 번들로 초기화 |
| `addWord(_:tag:score:)` | 사용자 단어 추가 |
| `loadDict(_:)` | 사전 파일 로드 |
| `build(typoTransformer:typoCostThreshold:)` | Kiwi 인스턴스 생성 |
### Kiwi
| 메서드/프로퍼티 | 설명 |
|----------------|------|
| `version` (static) | Kiwi 버전 문자열 |
| `analyze(_:topN:options:typoTransformer:typoThreshold:)` | 형태소 분석 (다중 결과, 오타 교정 인자는 생략 가능) |
| `tokenize(_:options:typoTransformer:typoThreshold:)` | 형태소 분석 (최상위 결과만, 오타 교정 인자는 생략 가능) |
| `splitIntoSentences(_:options:)` | 문장 분리 |
| `createJoiner(useLMSearch:)` | Joiner 생성 |
| `createMorphemeSet()` | MorphemeSet 생성 |
### Token
| 프로퍼티 | 타입 | 설명 |
|----------|------|------|
| `form` | `String` | 형태소 문자열 |
| `tag` | `POSTag` | 품사 태그 |
| `position` | `Int` | 원문에서의 위치 (UTF-16) |
| `length` | `Int` | 길이 (UTF-16) |
| `score` | `Float` | 언어 모델 점수 |
| `wordPosition` | `Int` | 어절 번호 |
| `sentencePosition` | `Int` | 문장 번호 |
| `typoCost` | `Float` | 오타 교정 비용 (0이면 교정 안 됨) |
### Joiner
| 메서드 | 설명 |
|--------|------|
| `add(form:tag:autoDetectIrregular:)` | 형태소 추가 |
| `join()` | 결합된 텍스트 반환 |
## 에러 처리
```swift
do {
let builder = try KiwiBuilder(modelPath: "/invalid/path")
} catch KiwiError.modelNotFound(let path) {
print("모델을 찾을 수 없습니다: \(path)")
} catch KiwiError.operationFailed(let message) {
print("작업 실패: \(message)")
} catch KiwiError.invalidHandle {
print("잘못된 핸들")
} catch {
print("알 수 없는 오류: \(error)")
}
```

View file

@ -0,0 +1 @@
../../../../../include/kiwi/Macro.h

View file

@ -0,0 +1 @@
../../../../../include/kiwi/capi.h

View file

@ -0,0 +1,5 @@
// Clang module that exposes the Kiwi C headers to Swift under the name `CKiwi`.
module CKiwi {
// Header paths are relative to this module map's directory.
header "include/capi.h"
header "include/Macro.h"
// Re-export every module imported by the headers above.
export *
}

View file

@ -0,0 +1,46 @@
import Foundation
/// Bit-flag set selecting which Korean dialects are enabled.
///
/// Each dialect occupies one bit of `rawValue`; `standard` is the empty set.
public struct Dialect: OptionSet, Codable {
    public let rawValue: Int32

    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// Builds the single-bit flag located at bit position `index`.
    private static func flag(_ index: Int32) -> Dialect {
        return Dialect(rawValue: 1 << index)
    }

    /// Standard Korean: no dialect bit set (raw value 0).
    public static let standard: Dialect = []

    /// Gyeonggi dialect (bit 0)
    public static let gyeonggi = flag(0)

    /// Chungcheong dialect (bit 1)
    public static let chungcheong = flag(1)

    /// Gangwon dialect (bit 2)
    public static let gangwon = flag(2)

    /// Gyeongsang dialect (bit 3)
    public static let gyeongsang = flag(3)

    /// Jeolla dialect (bit 4)
    public static let jeolla = flag(4)

    /// Jeju dialect (bit 5)
    public static let jeju = flag(5)

    /// Hwanghae dialect (bit 6)
    public static let hwanghae = flag(6)

    /// Hamgyeong dialect (bit 7)
    public static let hamgyeong = flag(7)

    /// Pyeongan dialect (bit 8)
    public static let pyeongan = flag(8)

    /// Archaic Korean (bit 9)
    public static let archaic = flag(9)

    /// Every dialect bit (bits 0 through 9) set at once.
    public static let all = Dialect(rawValue: 0x3FF)
}

View file

@ -0,0 +1,39 @@
import Foundation
/// Error values surfaced by the Kiwi Swift API.
public enum KiwiError: Error, LocalizedError {
    /// A required native handle was missing or unusable.
    case invalidHandle
    /// An index or parameter was out of range.
    case invalidIndex
    /// An operation failed; the payload is the accompanying message.
    case operationFailed(String)
    /// Generic failure; the payload is used verbatim as the description.
    case failure(String)
    /// No model was found; the payload is the path that was tried.
    case modelNotFound(String)
    /// A string was not valid UTF-8.
    case invalidString

    /// Human-readable description for each case.
    public var errorDescription: String? {
        let text: String
        switch self {
        case .invalidHandle:
            text = "Invalid handle"
        case .invalidIndex:
            text = "Invalid index"
        case .invalidString:
            text = "Invalid UTF-8 string"
        case .failure(let detail):
            text = detail
        case .operationFailed(let detail):
            text = "Operation failed: \(detail)"
        case .modelNotFound(let location):
            text = "Model not found at path: \(location)"
        }
        return text
    }
}

View file

@ -0,0 +1,16 @@
import Foundation
/// Holds a handle together with the closure that releases it.
/// The closure is invoked with the handle when the wrapper is deallocated.
internal final class HandleWrapper<H> {
    /// The wrapped handle value.
    let handle: H
    /// Release closure supplied at construction; called from `deinit`.
    private let release: (H) -> Void

    /// - Parameters:
    ///   - handle: Value to keep until deallocation.
    ///   - cleanup: Closure that receives `handle` when the wrapper goes away.
    init(_ handle: H, cleanup: @escaping (H) -> Void) {
        self.release = cleanup
        self.handle = handle
    }

    deinit {
        release(handle)
    }
}

View file

@ -0,0 +1,54 @@
import Foundation
import CKiwi
/// Accumulates morphemes and renders them as joined text through the Kiwi C joiner API.
public final class Joiner {
    /// Owns the native joiner handle; `kiwi_joiner_close` runs when it is deallocated.
    private var wrapper: HandleWrapper<kiwi_joiner_h>?

    internal init(handle: kiwi_joiner_h) {
        wrapper = HandleWrapper(handle, cleanup: { kiwi_joiner_close($0) })
    }

    /// Builds the error to throw after a failed C call.
    /// Uses the message from `kiwi_error()` when one is available (and then clears it);
    /// otherwise falls back to `fallback`.
    private func makeError(fallback: String) -> KiwiError {
        guard let rawMessage = kiwi_error() else {
            return .operationFailed(fallback)
        }
        let message = String(cString: rawMessage)
        kiwi_clear_error()
        return .operationFailed(message)
    }

    /// Appends one morpheme to the joiner.
    /// - Parameters:
    ///   - form: Surface form of the morpheme.
    ///   - tag: Part-of-speech tag; its `description` is what gets passed to the C API.
    ///   - autoDetectIrregular: Forwarded to the C API as 1 (true) or 0 (false). Defaults to `true`.
    /// - Throws: `KiwiError.invalidHandle` when no handle is held, or
    ///   `KiwiError.operationFailed` when the C call returns a non-zero status.
    public func add(form: String, tag: POSTag, autoDetectIrregular: Bool = true) throws {
        guard let joinerHandle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        let status = kiwi_joiner_add(joinerHandle, form, tag.description, autoDetectIrregular ? 1 : 0)
        guard status == 0 else {
            throw makeError(fallback: "Failed to add morpheme to joiner")
        }
    }

    /// Returns the text produced from all morphemes added so far.
    /// - Returns: The joined text.
    /// - Throws: `KiwiError.invalidHandle` when no handle is held, or
    ///   `KiwiError.operationFailed` when the C call returns no string.
    public func join() throws -> String {
        guard let joinerHandle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        guard let joinedPtr = kiwi_joiner_get(joinerHandle) else {
            throw makeError(fallback: "Failed to get joined text")
        }
        return String(cString: joinedPtr)
    }
}

View file

@ -0,0 +1,206 @@
import Foundation
import CKiwi
/// Main Kiwi morphological analyzer class
public final class Kiwi {
private var wrapper: HandleWrapper<kiwi_h>?
/// Takes ownership of a native Kiwi handle; `kiwi_close` is called on it when the wrapper is deallocated.
internal init(handle: kiwi_h) {
    wrapper = HandleWrapper(handle, cleanup: { kiwiHandle in kiwi_close(kiwiHandle) })
}
/// Version string reported by `kiwi_version()`, or `"unknown"` when the C API returns no string.
public static var version: String {
    guard let rawVersion = kiwi_version() else {
        return "unknown"
    }
    return String(cString: rawVersion)
}
/// Runs morphological analysis on `text` and returns the candidate analyses.
///
/// The analysis option handed to the C API always uses no blocklist, `open_ending = 0`,
/// `allowed_dialects = 0` and `dialect_cost = 3.0`; only the match options and the typo
/// settings come from the arguments. The final argument of `kiwi_analyze` is passed as `nil`.
/// - Parameters:
///   - text: Text to analyze.
///   - topN: Number of candidates requested from the C API (default: 1).
///   - options: Match options (default: `.allWithNormalizing`).
///   - typoTransformer: Optional prepared typo transformer; `nil` passes no transformer.
///   - typoThreshold: Typo cost threshold (default: 2.5).
/// - Returns: One `TokenResult` per candidate. Candidates whose token count is negative
///   are skipped, as are individual tokens whose form or info pointer is missing.
/// - Throws: `KiwiError.invalidHandle` when no handle is held; `KiwiError.operationFailed`
///   when the C call returns no result or reports a negative result size.
public func analyze(
    _ text: String,
    topN: Int = 1,
    options: MatchOptions = .allWithNormalizing,
    typoTransformer: PreparedTypoTransformer? = nil,
    typoThreshold: Float = 2.5
) throws -> [TokenResult] {
    guard let kiwiHandle = wrapper?.handle else {
        throw KiwiError.invalidHandle
    }

    // Caller-controlled settings first, then the fixed defaults.
    var option = kiwi_analyze_option_t()
    option.match_options = options.rawValue
    option.typo_transformer = typoTransformer?.handle
    option.typo_threshold = typoThreshold
    option.blocklist = nil
    option.open_ending = 0
    option.allowed_dialects = 0
    option.dialect_cost = 3.0

    guard let resultHandle = kiwi_analyze(kiwiHandle, text, Int32(topN), option, nil) else {
        // Prefer the message recorded by the C API; fall back to a generic one.
        guard let rawMessage = kiwi_error() else {
            throw KiwiError.operationFailed("Analysis failed")
        }
        let message = String(cString: rawMessage)
        kiwi_clear_error()
        throw KiwiError.operationFailed(message)
    }
    // Release the native result on every exit path below.
    defer { kiwi_res_close(resultHandle) }

    let candidateCount = kiwi_res_size(resultHandle)
    if candidateCount < 0 {
        throw KiwiError.operationFailed("Invalid result size")
    }

    var candidates: [TokenResult] = []
    candidates.reserveCapacity(Int(candidateCount))

    for candidateIndex in 0..<candidateCount {
        let score = kiwi_res_prob(resultHandle, candidateIndex)
        let tokenCount = kiwi_res_word_num(resultHandle, candidateIndex)
        if tokenCount < 0 {
            continue
        }

        var tokens: [Token] = []
        tokens.reserveCapacity(Int(tokenCount))

        for tokenIndex in 0..<tokenCount {
            guard let formPtr = kiwi_res_form(resultHandle, candidateIndex, tokenIndex),
                  let infoPtr = kiwi_res_token_info(resultHandle, candidateIndex, tokenIndex) else {
                continue
            }
            tokens.append(Token(form: String(cString: formPtr), tokenInfo: infoPtr.pointee))
        }

        candidates.append(TokenResult(score: score, tokens: tokens))
    }

    return candidates
}
/// Tokenize `text` using only the best analysis candidate.
///
/// This is a convenience wrapper around `analyze` with `topN` fixed to 1.
///
/// - Parameters:
///   - text: Text to tokenize
///   - options: Match options (default: .allWithNormalizing)
///   - typoTransformer: Optional prepared typo transformer for typo correction
///   - typoThreshold: Typo cost threshold (default: 2.5)
/// - Returns: Tokens of the first candidate, or an empty array when there is none
/// - Throws: Whatever `analyze` throws
public func tokenize(
    _ text: String,
    options: MatchOptions = .allWithNormalizing,
    typoTransformer: PreparedTypoTransformer? = nil,
    typoThreshold: Float = 2.5
) throws -> [Token] {
    let candidates = try analyze(
        text,
        topN: 1,
        options: options,
        typoTransformer: typoTransformer,
        typoThreshold: typoThreshold
    )
    guard let best = candidates.first else {
        return []
    }
    return best.tokens
}
/// Split `text` into sentences.
///
/// The begin/end positions reported by the library are applied as offsets into
/// the UTF-8 view of `text`. A range that is negative, inverted, past the end
/// of the text, or that does not convert back to a `String` is skipped.
///
/// - Parameters:
///   - text: Text to split
///   - options: Match options (default: .all)
/// - Returns: Sentences in the order reported by the library
/// - Throws: `KiwiError.invalidHandle` if the instance has no handle,
///   `KiwiError.operationFailed` if splitting fails or reports a negative count.
public func splitIntoSentences(
    _ text: String,
    options: MatchOptions = .all
) throws -> [Sentence] {
    guard let kiwiHandle = wrapper?.handle else {
        throw KiwiError.invalidHandle
    }

    guard let rawResult = kiwi_split_into_sents(kiwiHandle, text, options.rawValue, nil) else {
        guard let messagePtr = kiwi_error() else {
            throw KiwiError.operationFailed("Sentence splitting failed")
        }
        let message = String(cString: messagePtr)
        kiwi_clear_error()
        throw KiwiError.operationFailed(message)
    }
    defer { kiwi_ss_close(rawResult) }

    let count = kiwi_ss_size(rawResult)
    if count < 0 {
        throw KiwiError.operationFailed("Invalid sentence count")
    }

    let bytes = text.utf8
    let byteCount = bytes.count
    var sentences: [Sentence] = []
    sentences.reserveCapacity(Int(count))

    for index in 0..<count {
        let begin = kiwi_ss_begin_position(rawResult, index)
        let finish = kiwi_ss_end_position(rawResult, index)

        // Reject ranges that cannot be applied to the UTF-8 view.
        guard begin >= 0, finish >= begin, Int(finish) <= byteCount else {
            continue
        }

        let lower = bytes.index(bytes.startIndex, offsetBy: Int(begin))
        let upper = bytes.index(lower, offsetBy: Int(finish - begin))
        guard let sentenceText = String(bytes[lower..<upper]) else {
            continue
        }

        sentences.append(Sentence(
            text: sentenceText,
            start: Int(begin),
            length: Int(finish - begin)
        ))
    }
    return sentences
}
/// Create a Joiner that combines morphemes back into text.
/// - Parameter useLMSearch: Use language model search for optimal POS selection (default: true)
/// - Returns: A new Joiner wrapping the handle created by the library
/// - Throws: `KiwiError.invalidHandle` if the instance has no handle,
///   `KiwiError.operationFailed` if the joiner cannot be created
public func createJoiner(useLMSearch: Bool = true) throws -> Joiner {
    guard let kiwiHandle = wrapper?.handle else {
        throw KiwiError.invalidHandle
    }

    // The C API takes the flag as an integer.
    if let created = kiwi_new_joiner(kiwiHandle, useLMSearch ? 1 : 0) {
        return Joiner(handle: created)
    }

    guard let messagePtr = kiwi_error() else {
        throw KiwiError.operationFailed("Failed to create joiner")
    }
    let message = String(cString: messagePtr)
    kiwi_clear_error()
    throw KiwiError.operationFailed(message)
}
/// Create an empty MorphemeSet (intended for use as a blacklist in analysis).
/// - Returns: A new MorphemeSet wrapping the handle created by the library
/// - Throws: `KiwiError.invalidHandle` if the instance has no handle,
///   `KiwiError.operationFailed` if the set cannot be created
public func createMorphemeSet() throws -> MorphemeSet {
    guard let kiwiHandle = wrapper?.handle else {
        throw KiwiError.invalidHandle
    }

    if let created = kiwi_new_morphset(kiwiHandle) {
        return MorphemeSet(handle: created)
    }

    guard let messagePtr = kiwi_error() else {
        throw KiwiError.operationFailed("Failed to create morpheme set")
    }
    let message = String(cString: messagePtr)
    kiwi_clear_error()
    throw KiwiError.operationFailed(message)
}
}

View file

@ -0,0 +1,169 @@
import Foundation
import CKiwi
/// Bit flags passed to `KiwiBuilder` to select what is loaded at build time.
public struct BuildOptions: OptionSet {
    public let rawValue: Int32

    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// Integrate allomorphs (bit 0)
    public static let integrateAllomorph = BuildOptions(rawValue: 1 << 0)
    /// Load default dictionary (bit 1)
    public static let loadDefaultDict = BuildOptions(rawValue: 1 << 1)
    /// Load typo dictionary (bit 2)
    public static let loadTypoDict = BuildOptions(rawValue: 1 << 2)
    /// Load multi-dict (bit 3)
    public static let loadMultiDict = BuildOptions(rawValue: 1 << 3)

    /// Default build options: all four flags above combined
    public static let `default`: BuildOptions = [
        .integrateAllomorph,
        .loadDefaultDict,
        .loadTypoDict,
        .loadMultiDict
    ]
}
/// Builder class for creating Kiwi instances.
///
/// Owns a `kiwi_builder_h` handle through a `HandleWrapper` whose cleanup
/// closure calls `kiwi_builder_close`.
public final class KiwiBuilder {
    private var wrapper: HandleWrapper<kiwi_builder_h>?

    /// Fetch the pending library error message, if any, and clear it.
    /// - Returns: The message reported by `kiwi_error()`, or nil when none is pending
    private static func takeErrorMessage() -> String? {
        guard let messagePtr = kiwi_error() else {
            return nil
        }
        // Copy before clearing: the pointer belongs to the library.
        let message = String(cString: messagePtr)
        kiwi_clear_error()
        return message
    }

    /// Initialize KiwiBuilder with model path
    /// - Parameters:
    ///   - modelPath: Path to the model directory
    ///   - numThreads: Number of threads to use (-1 for automatic)
    ///   - options: Build options
    ///   - enabledDialects: Enabled dialects
    /// - Throws: `KiwiError.operationFailed` if the library cannot create the builder
    public init(
        modelPath: String,
        numThreads: Int = -1,
        options: BuildOptions = .default,
        enabledDialects: Dialect = .standard
    ) throws {
        let created = kiwi_builder_init(
            modelPath,
            Int32(numThreads),
            Int32(options.rawValue),
            Int32(enabledDialects.rawValue)
        )
        guard let handle = created else {
            throw KiwiError.operationFailed(
                KiwiBuilder.takeErrorMessage() ?? "Failed to initialize KiwiBuilder"
            )
        }
        self.wrapper = HandleWrapper(handle) { kiwi_builder_close($0) }
    }

    /// Initialize KiwiBuilder from a Bundle
    /// - Parameters:
    ///   - bundle: Bundle containing the model files
    ///   - modelDirectory: Name of the model directory in the bundle (default: "KiwiModels")
    ///   - numThreads: Number of threads to use (-1 for automatic)
    ///   - options: Build options
    ///   - enabledDialects: Enabled dialects
    /// - Throws: `KiwiError.modelNotFound` if the bundle has no resource path or the
    ///   model directory does not exist in it; `KiwiError.operationFailed` if the
    ///   library cannot create the builder
    public convenience init(
        bundle: Bundle,
        modelDirectory: String = "KiwiModels",
        numThreads: Int = -1,
        options: BuildOptions = .default,
        enabledDialects: Dialect = .standard
    ) throws {
        guard let resourcePath = bundle.resourcePath else {
            throw KiwiError.modelNotFound("Model directory not found in bundle")
        }
        let modelPath = resourcePath.appending("/\(modelDirectory)")

        // Verify the directory is really there so a missing model is reported as
        // `modelNotFound` instead of surfacing as a generic initialization failure.
        var isDirectory: ObjCBool = false
        guard FileManager.default.fileExists(atPath: modelPath, isDirectory: &isDirectory),
              isDirectory.boolValue else {
            throw KiwiError.modelNotFound("Model directory not found in bundle: \(modelPath)")
        }

        try self.init(
            modelPath: modelPath,
            numThreads: numThreads,
            options: options,
            enabledDialects: enabledDialects
        )
    }

    /// Add a user word to the dictionary
    /// - Parameters:
    ///   - word: Word to add
    ///   - tag: Part-of-speech tag
    ///   - score: Score for the word (default: 0)
    /// - Returns: true if the library reported success; false if it reported
    ///   failure without an error message
    /// - Throws: `KiwiError.invalidHandle` if the builder has no handle,
    ///   `KiwiError.operationFailed` if the library reported failure with a message
    @discardableResult
    public func addWord(_ word: String, tag: POSTag, score: Float = 0) throws -> Bool {
        guard let handle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }
        let status = kiwi_builder_add_word(handle, word, tag.description, score)
        guard status != 0 else {
            return true
        }
        if let message = KiwiBuilder.takeErrorMessage() {
            throw KiwiError.operationFailed(message)
        }
        return false
    }

    /// Load user dictionary from file
    /// - Parameter dictPath: Path to the dictionary file
    /// - Returns: Number of words added
    /// - Throws: `KiwiError.invalidHandle` if the builder has no handle,
    ///   `KiwiError.operationFailed` if the library returns a negative count
    @discardableResult
    public func loadDict(_ dictPath: String) throws -> Int {
        guard let handle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }
        let added = kiwi_builder_load_dict(handle, dictPath)
        guard added >= 0 else {
            throw KiwiError.operationFailed(
                KiwiBuilder.takeErrorMessage() ?? "Failed to load dictionary"
            )
        }
        return Int(added)
    }

    /// Build a Kiwi instance
    /// - Returns: A new Kiwi instance
    /// - Throws: `KiwiError.invalidHandle` if the builder has no handle,
    ///   `KiwiError.operationFailed` if the build fails
    public func build() throws -> Kiwi {
        guard let handle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }
        guard let kiwiHandle = kiwi_builder_build(handle, nil, 0) else {
            throw KiwiError.operationFailed(
                KiwiBuilder.takeErrorMessage() ?? "Failed to build Kiwi"
            )
        }
        return Kiwi(handle: kiwiHandle)
    }
}

View file

@ -0,0 +1,91 @@
import Foundation
/// Bit flags that configure pattern matching, OOV handling and affix joining
/// during text analysis. The raw value is passed unchanged to the C API.
public struct MatchOptions: OptionSet, Codable {
    public let rawValue: Int32

    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// Option whose only set bit is at `position`.
    private static func bit(_ position: Int32) -> MatchOptions {
        return MatchOptions(rawValue: 1 << position)
    }

    /// OOV mode value; the mode number is stored shifted left by 8 bits.
    private static func oovMode(_ mode: Int32) -> MatchOptions {
        return MatchOptions(rawValue: mode << 8)
    }

    // MARK: Pattern matching (bits 0-5)

    /// Match URL patterns
    public static let url = bit(0)
    /// Match email addresses
    public static let email = bit(1)
    /// Match hashtags
    public static let hashtag = bit(2)
    /// Match mentions (@username)
    public static let mention = bit(3)
    /// Match serial numbers
    public static let serial = bit(4)
    /// Match emoji
    public static let emoji = bit(5)

    // MARK: OOV handling (two-bit field at bits 8-9)

    /// OOV: use rule-based matching only (raw value 0, i.e. the empty set)
    public static let oovRuleOnly = oovMode(0)
    /// OOV: use character model
    public static let oovChrModel = oovMode(1)
    /// OOV: use character frequency model
    public static let oovChrFreqModel = oovMode(2)
    /// OOV: use character frequency and branch model
    public static let oovChrFreqBranchModel = oovMode(3)
    /// OOV option mask (same raw value as `oovChrFreqBranchModel`)
    public static let oovMask = oovMode(3)

    // MARK: Normalization and affix joining (bits 16-21)

    /// Normalize coda
    public static let normalizeCoda = bit(16)
    /// Join noun prefix
    public static let joinNounPrefix = bit(17)
    /// Join noun suffix
    public static let joinNounSuffix = bit(18)
    /// Join verb suffix
    public static let joinVerbSuffix = bit(19)
    /// Join adjective suffix
    public static let joinAdjSuffix = bit(20)
    /// Join adverb suffix
    public static let joinAdvSuffix = bit(21)
    /// Join verb and adjective suffixes
    public static let joinVSuffix: MatchOptions = [.joinVerbSuffix, .joinAdjSuffix]
    /// Join all affixes
    public static let joinAffix: MatchOptions = [
        .joinNounPrefix,
        .joinNounSuffix,
        .joinVerbSuffix,
        .joinAdjSuffix,
        .joinAdvSuffix
    ]

    // MARK: Miscellaneous (bits 22-27)

    /// Split complex morphemes
    public static let splitComplex = bit(22)
    /// Match Z coda
    public static let zCoda = bit(23)
    /// Match compatible jamo
    public static let compatibleJamo = bit(24)
    /// Split saisiot
    public static let splitSaisiot = bit(25)
    /// Merge saisiot
    public static let mergeSaisiot = bit(26)
    /// Join particle yo
    public static let joinParticleYo = bit(27)

    // MARK: Presets

    /// All basic matching options: the six pattern flags plus `zCoda`
    public static let all: MatchOptions = [
        .url, .email, .hashtag, .mention, .serial, .emoji, .zCoda
    ]
    /// `all` combined with `normalizeCoda`
    public static let allWithNormalizing: MatchOptions = [.all, .normalizeCoda]
}

View file

@ -0,0 +1,42 @@
import Foundation
import CKiwi
/// Set of morphemes (used as blacklist in analysis).
///
/// Owns a `kiwi_morphset_h` handle through a `HandleWrapper` whose cleanup
/// closure calls `kiwi_morphset_close`.
public final class MorphemeSet {
    private var wrapper: HandleWrapper<kiwi_morphset_h>?

    /// Wrap a morpheme-set handle created by the C API.
    internal init(handle: kiwi_morphset_h) {
        self.wrapper = HandleWrapper(handle) { kiwi_morphset_close($0) }
    }

    /// Underlying C handle, or nil when no wrapper is present.
    internal var handle: kiwi_morphset_h? {
        return wrapper?.handle
    }

    /// Add a morpheme to the set
    /// - Parameters:
    ///   - form: Form of the morpheme
    ///   - tag: Part-of-speech tag (nil to match all tags)
    /// - Returns: Number of morphemes added
    /// - Throws: `KiwiError.invalidHandle` if the set has no handle,
    ///   `KiwiError.operationFailed` if the library returns a negative count
    @discardableResult
    public func add(form: String, tag: POSTag? = nil) throws -> Int {
        guard let setHandle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        // A nil tag is forwarded to the C API as a null string.
        let tagName: String? = tag?.description
        let added = kiwi_morphset_add(setHandle, form, tagName)
        if added >= 0 {
            return Int(added)
        }

        guard let messagePtr = kiwi_error() else {
            throw KiwiError.operationFailed("Failed to add morpheme to set")
        }
        let message = String(cString: messagePtr)
        kiwi_clear_error()
        throw KiwiError.operationFailed(message)
    }
}

View file

@ -0,0 +1,268 @@
import Foundation
/// Part-of-speech tags.
///
/// Raw values are explicit because `Token` builds a tag directly from the
/// numeric tag field of the C token info. Irregular-conjugation variants are
/// the base tag's raw value with bit 0x80 set.
public enum POSTag: UInt8, CaseIterable, Codable {
    case unknown = 0
    // Nouns
    case nng = 1, nnp = 2, nnb = 3
    // Verb and adjective
    case vv = 4, va = 5
    // Adverb
    case mag = 6
    // Numeral and pronoun
    case nr = 7, np = 8
    // Auxiliary predicate
    case vx = 9
    // Determiner and conjunctive adverb
    case mm = 10, maj = 11
    // Interjection
    case ic = 12
    // Prefixes, suffixes and roots
    case xpn = 13, xsn = 14, xsv = 15, xsa = 16, xsm = 17, xr = 18
    // Copulas
    case vcp = 19, vcn = 20
    // Symbols
    case sf = 21, sp = 22, ss = 23, sso = 24, ssc = 25, se = 26, so = 27, sw = 28, sb = 29
    case sl = 30, sh = 31, sn = 32
    // Web entities
    case w_url = 33, w_email = 34, w_mention = 35, w_hashtag = 36, w_serial = 37, w_emoji = 38
    // Particles
    case jks = 39, jkc = 40, jkg = 41, jko = 42, jkb = 43, jkv = 44, jkq = 45, jx = 46, jc = 47
    // Endings
    case ep = 48, ef = 49, ec = 50, etn = 51, etm = 52
    // Special
    case z_coda = 53, z_siot = 54
    // User defined
    case user0 = 55, user1 = 56, user2 = 57, user3 = 58, user4 = 59
    // Irregular conjugation tags (base tag | 0x80)
    case vvi = 132, vai = 133, vxi = 137, xsai = 144

    /// Canonical upper-case tag name (for example "NNG" or "VV-I").
    public var description: String {
        switch self {
        case .unknown: return "UNK"
        case .nng: return "NNG"
        case .nnp: return "NNP"
        case .nnb: return "NNB"
        case .vv: return "VV"
        case .va: return "VA"
        case .mag: return "MAG"
        case .nr: return "NR"
        case .np: return "NP"
        case .vx: return "VX"
        case .mm: return "MM"
        case .maj: return "MAJ"
        case .ic: return "IC"
        case .xpn: return "XPN"
        case .xsn: return "XSN"
        case .xsv: return "XSV"
        case .xsa: return "XSA"
        case .xsm: return "XSM"
        case .xr: return "XR"
        case .vcp: return "VCP"
        case .vcn: return "VCN"
        case .sf: return "SF"
        case .sp: return "SP"
        case .ss: return "SS"
        case .sso: return "SSO"
        case .ssc: return "SSC"
        case .se: return "SE"
        case .so: return "SO"
        case .sw: return "SW"
        case .sb: return "SB"
        case .sl: return "SL"
        case .sh: return "SH"
        case .sn: return "SN"
        case .w_url: return "W_URL"
        case .w_email: return "W_EMAIL"
        case .w_mention: return "W_MENTION"
        case .w_hashtag: return "W_HASHTAG"
        case .w_serial: return "W_SERIAL"
        case .w_emoji: return "W_EMOJI"
        case .jks: return "JKS"
        case .jkc: return "JKC"
        case .jkg: return "JKG"
        case .jko: return "JKO"
        case .jkb: return "JKB"
        case .jkv: return "JKV"
        case .jkq: return "JKQ"
        case .jx: return "JX"
        case .jc: return "JC"
        case .ep: return "EP"
        case .ef: return "EF"
        case .ec: return "EC"
        case .etn: return "ETN"
        case .etm: return "ETM"
        case .z_coda: return "Z_CODA"
        case .z_siot: return "Z_SIOT"
        case .user0: return "USER0"
        case .user1: return "USER1"
        case .user2: return "USER2"
        case .user3: return "USER3"
        case .user4: return "USER4"
        case .vvi: return "VV-I"
        case .vai: return "VA-I"
        case .vxi: return "VX-I"
        case .xsai: return "XSA-I"
        }
    }

    /// Lookup table from accepted upper-case names to tags: every canonical
    /// `description` plus the alternative spellings listed below.
    private static let tagsByName: [String: POSTag] = {
        var table: [String: POSTag] = [:]
        for tag in POSTag.allCases {
            table[tag.description] = tag
        }
        // Long form of the unknown tag.
        table["UNKNOWN"] = .unknown
        // "-R" spellings map to the plain base tag.
        table["VV-R"] = .vv
        table["VA-R"] = .va
        table["VX-R"] = .vx
        table["XSV-R"] = .xsv
        table["XSA-R"] = .xsa
        // Irregular tags are also accepted without the hyphen.
        table["VVI"] = .vvi
        table["VAI"] = .vai
        table["VXI"] = .vxi
        table["XSAI"] = .xsai
        return table
    }()

    /// Initialize from a tag name; matching is case-insensitive.
    /// - Parameter string: Tag name such as "NNG", "vv" or "VV-I"
    /// - Returns: nil when the name is not recognized
    public init?(string: String) {
        guard let tag = POSTag.tagsByName[string.uppercased()] else {
            return nil
        }
        self = tag
    }

    /// Whether this tag represents an irregular conjugation (bit 0x80 is set)
    public var isIrregular: Bool {
        return (rawValue & 0x80) != 0
    }

    /// Returns the base tag without the irregular flag
    public var baseTag: POSTag {
        guard isIrregular else {
            return self
        }
        return POSTag(rawValue: rawValue & 0x7F) ?? self
    }

    /// Returns the irregular version of this tag, or nil if there is none.
    ///
    /// Only VV, VA, VX and XSA have an irregular counterpart in this enum;
    /// tags that are already irregular also yield nil.
    public var irregularTag: POSTag? {
        if isIrregular {
            return nil
        }
        // Setting the flag yields a valid case only for the four tags above.
        return POSTag(rawValue: rawValue | 0x80)
    }
}

extension POSTag: CustomStringConvertible {}

View file

@ -0,0 +1,128 @@
import Foundation
import CKiwi
/// A single morphological token produced by analysis.
public struct Token: Codable {
    /// The surface form of the token
    public let form: String
    /// Part-of-speech tag
    public let tag: POSTag
    /// Character position in the original text (UTF-16 based)
    public let position: Int
    /// Length of the token (UTF-16 based)
    public let length: Int
    /// Language model score for this token
    public let score: Float
    /// Word position (space-delimited)
    public let wordPosition: Int
    /// Sentence position
    public let sentencePosition: Int
    /// Line number
    public let lineNumber: Int
    /// Sense ID
    public let senseId: Int
    /// Typo cost (0 if not corrected)
    public let typoCost: Float
    /// Paired token index for SSO/SSC tags (-1 if none)
    public let pairedToken: Int
    /// Sub-sentence position (0 if not in sub-sentence)
    public let subSentencePosition: Int
    /// Dialect information
    public let dialect: Dialect

    /// Build a token from the C token info struct.
    ///
    /// A numeric tag with no matching `POSTag` case becomes `.unknown`.
    /// - Parameters:
    ///   - form: Surface form already converted to a Swift string
    ///   - tokenInfo: Token info struct copied from the C result
    internal init(form: String, tokenInfo: kiwi_token_info_t) {
        self.init(
            form: form,
            tag: POSTag(rawValue: tokenInfo.tag) ?? .unknown,
            position: Int(tokenInfo.chr_position),
            length: Int(tokenInfo.length),
            score: tokenInfo.score,
            wordPosition: Int(tokenInfo.word_position),
            sentencePosition: Int(tokenInfo.sent_position),
            lineNumber: Int(tokenInfo.line_number),
            senseId: Int(tokenInfo.sense_id),
            typoCost: tokenInfo.typo_cost,
            pairedToken: Int(tokenInfo.paired_token),
            subSentencePosition: Int(tokenInfo.sub_sent_position),
            dialect: Dialect(rawValue: Int32(tokenInfo.dialect))
        )
    }

    /// Memberwise initializer; every field except `form` and `tag` has a default.
    public init(
        form: String,
        tag: POSTag,
        position: Int = 0,
        length: Int = 0,
        score: Float = 0.0,
        wordPosition: Int = 0,
        sentencePosition: Int = 0,
        lineNumber: Int = 0,
        senseId: Int = 0,
        typoCost: Float = 0.0,
        pairedToken: Int = -1,
        subSentencePosition: Int = 0,
        dialect: Dialect = .standard
    ) {
        // Identity
        self.form = form
        self.tag = tag
        // Location
        self.position = position
        self.length = length
        self.wordPosition = wordPosition
        self.sentencePosition = sentencePosition
        self.subSentencePosition = subSentencePosition
        self.lineNumber = lineNumber
        self.pairedToken = pairedToken
        // Scoring and classification
        self.score = score
        self.typoCost = typoCost
        self.senseId = senseId
        self.dialect = dialect
    }
}

extension Token: CustomStringConvertible {
    /// Renders the token as "form/TAG".
    public var description: String {
        return form + "/" + tag.description
    }
}
/// One analysis candidate: a token sequence together with its score.
public struct TokenResult: Codable {
    /// Probability score for this analysis result
    public let score: Float
    /// Array of tokens in this analysis
    public let tokens: [Token]

    /// - Parameters:
    ///   - score: Probability score of the candidate
    ///   - tokens: Tokens making up the candidate
    public init(score: Float, tokens: [Token]) {
        self.tokens = tokens
        self.score = score
    }
}
/// One sentence produced by sentence splitting.
public struct Sentence: Codable {
    /// The sentence text
    public let text: String
    /// Starting position in original text
    public let start: Int
    /// Length of the sentence
    public let length: Int

    /// - Parameters:
    ///   - text: The sentence text
    ///   - start: Starting position in the original text
    ///   - length: Length of the sentence
    public init(text: String, start: Int, length: Int) {
        self.length = length
        self.start = start
        self.text = text
    }
}

View file

@ -0,0 +1,123 @@
import Foundation
import CKiwi
/// Typo transformer for automatic typo correction.
///
/// Wraps a `kiwi_typo_h` handle. The handle is closed in `deinit` only when
/// `shouldClose` is true; instances returned by `basic()` and `default(_:)`
/// are created with `shouldClose` set to false.
public final class TypoTransformer {
    internal let handle: kiwi_typo_h
    private let shouldClose: Bool

    /// Build the error for a failed call: the pending library message when one
    /// exists (clearing it afterwards), otherwise `fallback`.
    private static func failure(_ fallback: String) -> KiwiError {
        guard let messagePtr = kiwi_error() else {
            return KiwiError.operationFailed(fallback)
        }
        let message = String(cString: messagePtr)
        kiwi_clear_error()
        return KiwiError.operationFailed(message)
    }

    /// Wrap an existing handle.
    /// - Parameters:
    ///   - handle: Typo transformer handle from the C API
    ///   - shouldClose: Whether `deinit` closes the handle (default: true)
    internal init(handle: kiwi_typo_h, shouldClose: Bool = true) {
        self.handle = handle
        self.shouldClose = shouldClose
    }

    /// Create a new empty typo transformer
    /// - Throws: KiwiError if creation fails
    public init() throws {
        guard let created = kiwi_typo_init() else {
            throw TypoTransformer.failure("Failed to create typo transformer")
        }
        self.handle = created
        self.shouldClose = true
    }

    /// Get the default basic typo transformer
    /// - Returns: A typo transformer with basic typo set (not closed on deinit)
    /// - Throws: KiwiError if creation fails
    public static func basic() throws -> TypoTransformer {
        guard let shared = kiwi_typo_get_basic() else {
            throw failure("Failed to get basic typo transformer")
        }
        return TypoTransformer(handle: shared, shouldClose: false)
    }

    /// Typo set types; the raw value is passed to `kiwi_typo_get_default`.
    public enum TypoSet: Int32 {
        case withoutTypo = 0
        case basicTypoSet = 1
        case continualTypoSet = 2
        case basicTypoSetWithContinual = 3
        case lengtheningTypoSet = 4
        case basicTypoSetWithContinualAndLengthening = 5
    }

    /// Get default typo transformer with specified typo set
    /// - Parameter typoSet: The typo set to use
    /// - Returns: A typo transformer (not closed on deinit)
    /// - Throws: KiwiError if creation fails
    public static func `default`(_ typoSet: TypoSet = .basicTypoSet) throws -> TypoTransformer {
        guard let shared = kiwi_typo_get_default(typoSet.rawValue) else {
            throw failure("Failed to get default typo transformer")
        }
        return TypoTransformer(handle: shared, shouldClose: false)
    }

    /// Copy this typo transformer
    /// - Returns: A new typo transformer that closes its own handle on deinit
    /// - Throws: KiwiError if copy fails
    public func copy() throws -> TypoTransformer {
        guard let duplicate = kiwi_typo_copy(handle) else {
            throw TypoTransformer.failure("Failed to copy typo transformer")
        }
        return TypoTransformer(handle: duplicate, shouldClose: true)
    }

    /// Prepare the typo transformer for use in analysis
    /// - Returns: A PreparedTypoTransformer ready for use
    /// - Throws: KiwiError if preparation fails
    public func prepare() throws -> PreparedTypoTransformer {
        guard let prepared = kiwi_typo_prepare(handle) else {
            throw TypoTransformer.failure("Failed to prepare typo transformer")
        }
        return PreparedTypoTransformer(handle: prepared)
    }

    deinit {
        // Handles created with shouldClose == false are left open.
        guard shouldClose else {
            return
        }
        kiwi_typo_close(handle)
    }
}
/// Prepared typo transformer for use in analysis.
///
/// Wraps a `kiwi_prepared_typo_h` handle and always closes it on deinit.
public final class PreparedTypoTransformer {
/// Underlying C handle; `Kiwi.analyze` passes it as the typo transformer option.
internal let handle: kiwi_prepared_typo_h
/// Wrap a prepared handle.
/// - Parameter handle: Prepared typo transformer handle from the C API
internal init(handle: kiwi_prepared_typo_h) {
self.handle = handle
}
deinit {
// Release the prepared handle when the wrapper goes away.
kiwi_prepared_typo_close(handle)
}
}

View file

@ -0,0 +1,263 @@
import XCTest
@testable import Kiwi
/// Unit and integration tests for the Swift bindings.
///
/// Integration tests need the model files on disk; when they are missing the
/// test prints a notice and returns without asserting anything.
final class KiwiTests: XCTestCase {
    /// Model location relative to the working directory
    /// (in CI it should be at ../../models/cong/base).
    private let modelPath = "../../models/cong/base"

    /// Build a single-threaded Kiwi instance from `modelPath`.
    /// - Returns: The instance, or nil when the model directory is not available
    ///   (for local development without models)
    /// - Throws: Any error raised while creating the builder or building Kiwi
    private func makeKiwi() throws -> Kiwi? {
        guard FileManager.default.fileExists(atPath: modelPath) else {
            print("Model not found at \(modelPath), skipping test")
            return nil
        }
        let builder = try KiwiBuilder(modelPath: modelPath, numThreads: 1)
        return try builder.build()
    }

    // MARK: - Unit Tests

    func testPOSTagDescription() {
        XCTAssertEqual(POSTag.nng.description, "NNG")
        XCTAssertEqual(POSTag.nnp.description, "NNP")
        XCTAssertEqual(POSTag.vv.description, "VV")
        XCTAssertEqual(POSTag.jks.description, "JKS")
    }

    func testPOSTagFromString() {
        XCTAssertEqual(POSTag(string: "NNG"), .nng)
        XCTAssertEqual(POSTag(string: "VV"), .vv)
        XCTAssertEqual(POSTag(string: "nng"), .nng)
        XCTAssertNil(POSTag(string: "INVALID"))
    }

    func testMatchOptionsBasic() {
        let options: MatchOptions = [.url, .email]
        XCTAssertTrue(options.contains(.url))
        XCTAssertTrue(options.contains(.email))
        XCTAssertFalse(options.contains(.hashtag))
    }

    func testMatchOptionsAll() {
        let options = MatchOptions.all
        XCTAssertTrue(options.contains(.url))
        XCTAssertTrue(options.contains(.email))
        XCTAssertTrue(options.contains(.hashtag))
        XCTAssertTrue(options.contains(.mention))
    }

    func testDialectOptions() {
        let dialects: Dialect = [.gyeonggi, .jeju]
        XCTAssertTrue(dialects.contains(.gyeonggi))
        XCTAssertTrue(dialects.contains(.jeju))
        XCTAssertFalse(dialects.contains(.gangwon))
    }

    func testTokenInitialization() {
        let token = Token(
            form: "테스트",
            tag: .nng,
            position: 0,
            length: 3,
            score: 1.0
        )
        XCTAssertEqual(token.form, "테스트")
        XCTAssertEqual(token.tag, .nng)
        XCTAssertEqual(token.position, 0)
        XCTAssertEqual(token.length, 3)
        XCTAssertEqual(token.score, 1.0)
    }

    func testTokenDescription() {
        let token = Token(form: "테스트", tag: .nng)
        XCTAssertEqual(token.description, "테스트/NNG")
    }

    func testVersion() {
        let version = Kiwi.version
        XCTAssertFalse(version.isEmpty)
        XCTAssertNotEqual(version, "unknown")
    }

    // MARK: - Integration Tests

    func testKiwiBuilderAndTokenize() throws {
        guard let kiwi = try makeKiwi() else { return }

        // Test basic tokenization
        let tokens = try kiwi.tokenize("안녕하세요")

        // Verify we got some tokens
        XCTAssertFalse(tokens.isEmpty, "Tokenization should return tokens")

        // Verify token structure
        for token in tokens {
            XCTAssertFalse(token.form.isEmpty, "Token form should not be empty")
            XCTAssertGreaterThanOrEqual(token.position, 0, "Token position should be non-negative")
            XCTAssertGreaterThan(token.length, 0, "Token length should be positive")
        }
    }

    func testKiwiAnalyze() throws {
        guard let kiwi = try makeKiwi() else { return }

        // Test analyze with multiple results
        let results = try kiwi.analyze("형태소 분석", topN: 2)

        // Should have at least one result
        XCTAssertFalse(results.isEmpty, "Analysis should return results")

        // First result should have tokens
        if let firstResult = results.first {
            XCTAssertFalse(firstResult.tokens.isEmpty, "Result should have tokens")
            // Score is log probability, so it can be negative
            XCTAssertFalse(firstResult.score.isNaN, "Result score should not be NaN")
        }
    }

    func testSplitIntoSentences() throws {
        guard let kiwi = try makeKiwi() else { return }

        // Test sentence splitting
        let text = "안녕하세요. 키위입니다. 형태소 분석을 합니다."
        let sentences = try kiwi.splitIntoSentences(text)

        // Should have 3 sentences
        XCTAssertEqual(sentences.count, 3, "Should split into 3 sentences")

        // Verify sentence structure
        for sentence in sentences {
            XCTAssertFalse(sentence.text.isEmpty, "Sentence text should not be empty")
            XCTAssertGreaterThanOrEqual(sentence.start, 0, "Sentence start should be non-negative")
            XCTAssertGreaterThan(sentence.length, 0, "Sentence length should be positive")
        }
    }

    func testJoiner() throws {
        guard let kiwi = try makeKiwi() else { return }

        // Test joiner
        let joiner = try kiwi.createJoiner()
        try joiner.add(form: "형태소", tag: .nng)
        try joiner.add(form: "분석", tag: .nng)
        let joined = try joiner.join()
        XCTAssertFalse(joined.isEmpty, "Joined text should not be empty")
    }

    // MARK: - Typo Correction Tests

    func testBasicTypoCorrection() throws {
        guard let kiwi = try makeKiwi() else { return }

        // Prepare basic typo transformer
        let typoTransformer = try TypoTransformer.default(.basicTypoSet)
        let preparedTypo = try typoTransformer.prepare()

        // Without typo correction: '죰' remains as-is
        let tokensNoTypo = try kiwi.tokenize("나 죰 도와죠.")
        let formsNoTypo = tokensNoTypo.map { $0.form }
        XCTAssertTrue(formsNoTypo.contains("죰"), "Without typo correction, '죰' should remain")

        // With typo correction: '죰' -> '좀'
        let tokensWithTypo = try kiwi.tokenize("나 죰 도와죠.", typoTransformer: preparedTypo)
        let formsWithTypo = tokensWithTypo.map { $0.form }
        XCTAssertTrue(formsWithTypo.contains("좀"), "With typo correction, '죰' should become '좀'")
    }

    func testContinualTypoCorrection() throws {
        guard let kiwi = try makeKiwi() else { return }

        let typoTransformer = try TypoTransformer.default(.continualTypoSet)
        let preparedTypo = try typoTransformer.prepare()

        // continual typo: '프로그래미' -> '프로그램' + '이'
        let tokens = try kiwi.tokenize("프로그래미", typoTransformer: preparedTypo)

        // Fail the test instead of crashing on an out-of-range subscript.
        XCTAssertGreaterThanOrEqual(tokens.count, 2, "Expected at least two tokens")
        guard tokens.count >= 2 else { return }

        XCTAssertEqual(tokens[0].form, "프로그램")
        XCTAssertEqual(tokens[1].form, "이")
    }

    func testTypoCorrectionViaAnalyze() throws {
        guard let kiwi = try makeKiwi() else { return }

        let typoTransformer = try TypoTransformer.default(.basicTypoSet)
        let preparedTypo = try typoTransformer.prepare()

        let results = try kiwi.analyze("나 죰 도와죠.", topN: 1, typoTransformer: preparedTypo)
        XCTAssertFalse(results.isEmpty)
        guard let best = results.first else { return }

        let forms = best.tokens.map { $0.form }
        XCTAssertTrue(forms.contains("좀"))
    }

    func testBasicWithContinualTypoCorrection() throws {
        guard let kiwi = try makeKiwi() else { return }

        let typoTransformer = try TypoTransformer.default(.basicTypoSetWithContinual)
        let preparedTypo = try typoTransformer.prepare()

        // continual typo: '프로그래미' -> '프로그램' + '이'
        let tokens1 = try kiwi.tokenize("프로그래미", typoTransformer: preparedTypo)
        XCTAssertEqual(tokens1.first?.form, "프로그램")

        // basic typo: '죰' -> '좀'
        let tokens2 = try kiwi.tokenize("나 죰 도와죠.", typoTransformer: preparedTypo)
        let forms = tokens2.map { $0.form }
        XCTAssertTrue(forms.contains("좀"))
    }
}

View file

@ -0,0 +1,124 @@
#!/bin/bash
set -e
# Build script for creating XCFramework for iOS and macOS
# This script builds the Kiwi library for multiple platforms and architectures
# and combines them into a single XCFramework
# All paths are derived from this script's own location, so the working
# directory of the caller does not matter.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
SWIFT_DIR="$PROJECT_ROOT/bindings/swift"
BUILD_DIR="$SWIFT_DIR/build"
XCFRAMEWORK_DIR="$SWIFT_DIR/xcframework"
# Wipe both output directories, then recreate them empty
echo "Cleaning previous builds..."
rm -rf "$BUILD_DIR" "$XCFRAMEWORK_DIR"
mkdir -p "$BUILD_DIR" "$XCFRAMEWORK_DIR"
# Builds the static Kiwi library for one platform and wraps it in Kiwi.framework.
#
# Arguments:
#   $1  value for CMAKE_SYSTEM_NAME (e.g. "iOS", "Darwin")
#   $2  SDK name passed as CMAKE_OSX_SYSROOT (e.g. "iphoneos")
#   $3  semicolon-separated list for CMAKE_OSX_ARCHITECTURES
#   $4  value for CMAKE_OSX_DEPLOYMENT_TARGET
#   $5  subdirectory of $BUILD_DIR that receives this platform's build tree
#
# Side effects: changes the working directory to the platform build directory
# and leaves it there; exits the script with status 1 if the built library
# cannot be found.
build_platform() {
    local PLATFORM=$1
    local SDK=$2
    local ARCHS=$3
    local DEPLOYMENT_TARGET=$4
    local BUILD_SUBDIR=$5
    echo "Building for $PLATFORM ($ARCHS)..."
    local PLATFORM_BUILD_DIR="$BUILD_DIR/$BUILD_SUBDIR"
    mkdir -p "$PLATFORM_BUILD_DIR"
    cd "$PLATFORM_BUILD_DIR"
    cmake "$PROJECT_ROOT" \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_SYSTEM_NAME="$PLATFORM" \
        -DCMAKE_OSX_SYSROOT="$SDK" \
        -DCMAKE_OSX_ARCHITECTURES="$ARCHS" \
        -DCMAKE_OSX_DEPLOYMENT_TARGET="$DEPLOYMENT_TARGET" \
        -DKIWI_BUILD_DYNAMIC=OFF \
        -DKIWI_BUILD_CLI=OFF \
        -DKIWI_BUILD_EVALUATOR=OFF \
        -DKIWI_BUILD_MODEL_BUILDER=OFF \
        -DKIWI_BUILD_TEST=OFF \
        -DKIWI_JAVA_BINDING=OFF \
        -DKIWI_USE_MIMALLOC=ON \
        -DKIWI_USE_CPUINFO=OFF \
        -GXcode
    cmake --build . --config Release
    # Create framework structure
    local FRAMEWORK_DIR="$PLATFORM_BUILD_DIR/Kiwi.framework"
    mkdir -p "$FRAMEWORK_DIR/Headers"
    # Copy library
    # Use find to locate the library because Xcode puts it in different places
    # depending on the platform (e.g., Release-iphoneos, Release-iphonesimulator, Release)
    # Declared separately from the assignment so `local` does not mask the
    # command status; `|| true` keeps "no match" from aborting under `set -e`
    # so the explicit check below can report it.
    local LIB_FILE
    LIB_FILE=$(find . -name "libkiwi_static.a" | grep "Release" | head -n 1) || true
    if [ -z "$LIB_FILE" ]; then
        echo "Error: libkiwi_static.a not found in $PLATFORM_BUILD_DIR" >&2
        exit 1
    fi
    cp "$LIB_FILE" "$FRAMEWORK_DIR/Kiwi"
    # Copy headers
    cp "$PROJECT_ROOT/include/kiwi/capi.h" "$FRAMEWORK_DIR/Headers/"
    cp "$PROJECT_ROOT/include/kiwi/Macro.h" "$FRAMEWORK_DIR/Headers/"
    # Create module map (quoted delimiter: the body is written out literally)
    mkdir -p "$FRAMEWORK_DIR/Modules"
    cat > "$FRAMEWORK_DIR/Modules/module.modulemap" << 'EOF'
framework module Kiwi {
umbrella header "capi.h"
export *
module * { export * }
}
EOF
    echo "✓ Built $PLATFORM"
}
# Build one framework slice per target:
#   iOS device (arm64), iOS simulator (arm64 + x86_64), macOS universal (arm64 + x86_64)
build_platform "iOS" "iphoneos" "arm64" "12.0" "ios-arm64"
build_platform "iOS" "iphonesimulator" "arm64;x86_64" "12.0" "ios-simulator"
build_platform "Darwin" "macosx" "arm64;x86_64" "10.14" "macos"
# Merge the slices into a single XCFramework
echo "Creating XCFramework..."
FRAMEWORK_ARGS=()
for SLICE in "ios-arm64" "ios-simulator" "macos"; do
    FRAMEWORK_ARGS+=(-framework "$BUILD_DIR/$SLICE/Kiwi.framework")
done
xcodebuild -create-xcframework "${FRAMEWORK_ARGS[@]}" -output "$XCFRAMEWORK_DIR/Kiwi.xcframework"
echo "✓ XCFramework created at $XCFRAMEWORK_DIR/Kiwi.xcframework"
# Package the XCFramework as a zip for distribution
echo "Creating zip archive..."
cd "$XCFRAMEWORK_DIR"
zip -r -y Kiwi.xcframework.zip Kiwi.xcframework
# Checksum for a Swift Package Manager binaryTarget; "N/A" when it cannot be computed
CHECKSUM=$(swift package compute-checksum Kiwi.xcframework.zip 2>/dev/null || echo "N/A")
printf 'Checksum: %s\n' "$CHECKSUM"
echo
echo "Build complete!"
printf 'XCFramework location: %s\n' "$XCFRAMEWORK_DIR/Kiwi.xcframework"
printf 'Zip archive: %s\n' "$XCFRAMEWORK_DIR/Kiwi.xcframework.zip"
echo
echo "To use with Swift Package Manager binaryTarget:"
echo " .binaryTarget("
echo " name: \"CKiwi\","
echo " url: \"<RELEASE_URL>/Kiwi.xcframework.zip\","
echo " checksum: \"$CHECKSUM\""
echo " )"

View file

@ -15,6 +15,17 @@ Running the above command also automatically upgrades to package version if it d
You can also find the recent pre-built package at npm: https://www.npmjs.com/package/kiwi-nlp.
## Testing
To run unit tests for the WASM package, first build the package using `./build.sh`, then run:
```bash
cd package
npm test
```
Tests are powered by [Vitest](https://vitest.dev/) and run in a Node.js environment.
## Documentation
The documentation for the package can be generated by running `npm run doc` inside the `package` directory.

View file

@ -22,6 +22,8 @@ int nextInstanceId() {
static std::map<int, std::unordered_set<const Morpheme*>> morphemeSets;
static std::map<std::string, PreparedTypoTransformer> preparedTypoCache;
int nextMorphemeSetId() {
static int id = 0;
return id++;
@ -132,6 +134,70 @@ std::vector<PretokenizedSpan> parsePretokenizedArg(const json& args, size_t inde
}
/**
 * @brief Resolves the typo argument at `args[index]` into a prepared typo transformer.
 *
 * The argument may be missing or null, a string naming a built-in typo set
 * ("none", "basic", "continual", "basicWithContinual"), or an object with
 * "defs" and "continualTypoCost" describing a custom transformer. Custom
 * transformers are prepared once and kept in `preparedTypoCache`, keyed by the
 * JSON dump of the argument.
 *
 * @param args              Positional call arguments.
 * @param index             Position of the typo argument; `index + 1` is read as the cost threshold.
 * @param typoCostThreshold Set to `args[index + 1]` (default 2.5) once the typo argument is known
 *                          to be present and non-null; left untouched otherwise.
 * @return nullptr when the argument is missing, null or "none"; otherwise a pointer to a cached
 *         entry or to whatever `getDefaultPreparedTypoSet` returns for the selected built-in set.
 */
const PreparedTypoTransformer* parseTypoArg(const json& args, size_t index, float& typoCostThreshold) {
	if (index >= args.size()) return nullptr;

	const auto& typoArg = args.at(index);
	if (typoArg.is_null()) return nullptr;

	typoCostThreshold = getAtOrDefault(args, index + 1, 2.5f);

	const bool isNamedSet = typoArg.is_string();
	const std::string cacheKey = isNamedSet ? typoArg.get<std::string>() : typoArg.dump();
	if (isNamedSet && cacheKey == "none") return nullptr;

	// The cache is consulted for every key, named sets included, before anything is built.
	const auto cached = preparedTypoCache.find(cacheKey);
	if (cached != preparedTypoCache.end()) return &cached->second;

	if (isNamedSet) {
		// Names other than the three below keep the `withoutTypo` default.
		DefaultTypoSet typoSet = DefaultTypoSet::withoutTypo;
		if (cacheKey == "basic") typoSet = DefaultTypoSet::basicTypoSet;
		else if (cacheKey == "continual") typoSet = DefaultTypoSet::continualTypoSet;
		else if (cacheKey == "basicWithContinual") typoSet = DefaultTypoSet::basicTypoSetWithContinual;
		return getDefaultPreparedTypoSet(typoSet);
	}

	// Custom definition: register every (orig, error) pair of each entry in "defs".
	TypoTransformer typoTransformer;
	for (const auto& def : typoArg.value("defs", json::array())) {
		const float cost = def.value("cost", 1.0f);
		const std::string condVowelStr = def.value("condition", "none");
		CondVowel condVowel = CondVowel::none;
		if (condVowelStr == "any") condVowel = CondVowel::any;
		else if (condVowelStr == "vowel") condVowel = CondVowel::vowel;
		else if (condVowelStr == "applosive") condVowel = CondVowel::applosive;

		for (const auto& orig8 : def["orig"]) {
			const auto orig16 = utf8To16(orig8);
			for (const auto& error8 : def["error"]) {
				typoTransformer.addTypo(orig16, utf8To16(error8), cost, condVowel);
			}
		}
	}
	typoTransformer.setContinualTypoCost(typoArg.value("continualTypoCost", 1.0f));

	// Store the prepared transformer and hand back a pointer to the stored entry.
	const auto inserted = preparedTypoCache.emplace(cacheKey, typoTransformer.prepare(true));
	return &inserted.first->second;
}
inline json serializeTokenInfo(const Kiwi& kiwi, const TokenInfo& tokenInfo) {
return {
{ "str", utf16To8(tokenInfo.str) },
@ -345,54 +411,7 @@ json build(const json& args) {
builder.addPreAnalyzedWord(form, analyzed, positions, score);
}
const auto typos = buildArgs.value("typos", json(nullptr));
const float typoCostThreshold = buildArgs.value("typoCostThreshold", 2.5f);
if (typos.is_null()) {
instances.emplace(id, builder.build(DefaultTypoSet::withoutTypo, typoCostThreshold));
} else if (typos.is_string()) {
DefaultTypoSet typoSet = DefaultTypoSet::withoutTypo;
const std::string typosStr = typos.get<std::string>();
if (typosStr == "basic") {
typoSet = DefaultTypoSet::basicTypoSet;
} else if (typosStr == "continual") {
typoSet = DefaultTypoSet::continualTypoSet;
} else if (typosStr == "basicWithContinual") {
typoSet = DefaultTypoSet::basicTypoSetWithContinual;
}
instances.emplace(id, builder.build(typoSet, typoCostThreshold));
} else {
TypoTransformer typoTransformer;
for (const auto& def : typos.value("defs", json::array())) {
const float cost = def.value("cost", 1.0f);
CondVowel condVowel = CondVowel::none;
const std::string condVowelStr = def.value("condVowel", "none");
if (condVowelStr == "any") {
condVowel = CondVowel::any;
} else if (condVowelStr == "vowel") {
condVowel = CondVowel::vowel;
} else if (condVowelStr == "applosive") {
condVowel = CondVowel::applosive;
}
for (const auto& orig8 : def["orig"]) {
const auto orig16 = utf8To16(orig8);
for (const auto& error8 : def["error"]) {
typoTransformer.addTypo(orig16, utf8To16(error8), cost, condVowel);
}
}
}
const float continualTypoCost = typos.value("continualTypoCost", 1.0f);
typoTransformer.setContinualTypoCost(continualTypoCost);
instances.emplace(id, builder.build(typoTransformer, typoCostThreshold));
}
instances.emplace(id, builder.build());
return id;
}
@ -410,10 +429,15 @@ json kiwiAnalyze(Kiwi& kiwi, const json& args) {
const Match matchOptions = getAtOrDefault(args, 1, Match::allWithNormalizing);
const BlockListArg blockListArg(kiwi, args, 2);
const auto pretokenized = parsePretokenizedArg(args, 3);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 4, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 5, 3.0f);
const TokenResult tokenResult = kiwi.analyze(str, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
float typoCostThreshold = 2.5f;
const auto* typoTransformer = parseTypoArg(args, 4, typoCostThreshold);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 6, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 7, 3.0f);
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
opt.typoTransformer = typoTransformer;
opt.typoThreshold = typoCostThreshold;
const TokenResult tokenResult = kiwi.analyze(str, opt, pretokenized);
return serializeTokenResult(kiwi, tokenResult);
}
@ -424,10 +448,15 @@ json kiwiAnalyzeTopN(Kiwi& kiwi, const json& args) {
const Match matchOptions = getAtOrDefault(args, 2, Match::allWithNormalizing);
const BlockListArg blockListArg(kiwi, args, 3);
const auto pretokenized = parsePretokenizedArg(args, 4);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 4, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 5, 3.0f);
float typoCostThreshold = 2.5f;
const auto* typoTransformer = parseTypoArg(args, 5, typoCostThreshold);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 7, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 8, 3.0f);
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
opt.typoTransformer = typoTransformer;
opt.typoThreshold = typoCostThreshold;
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, opt, pretokenized);
return serializeTokenResultVec(kiwi, tokenResults);
}
@ -437,10 +466,15 @@ json kiwiTokenize(Kiwi& kiwi, const json& args) {
const Match matchOptions = getAtOrDefault(args, 1, Match::allWithNormalizing);
const BlockListArg blockListArg(kiwi, args, 2);
const auto pretokenized = parsePretokenizedArg(args, 3);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 4, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 5, 3.0f);
const TokenResult tokenResult = kiwi.analyze(str, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
float typoCostThreshold = 2.5f;
const auto* typoTransformer = parseTypoArg(args, 4, typoCostThreshold);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 6, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 7, 3.0f);
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
opt.typoTransformer = typoTransformer;
opt.typoThreshold = typoCostThreshold;
const TokenResult tokenResult = kiwi.analyze(str, opt, pretokenized);
return serializeTokenInfoVec(kiwi, tokenResult.first);
}
@ -451,10 +485,15 @@ json kiwiTokenizeTopN(Kiwi& kiwi, const json& args) {
const Match matchOptions = getAtOrDefault(args, 2, Match::allWithNormalizing);
const BlockListArg blockListArg(kiwi, args, 3);
const auto pretokenized = parsePretokenizedArg(args, 4);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 5, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 6, 3.0f);
float typoCostThreshold = 2.5f;
const auto* typoTransformer = parseTypoArg(args, 5, typoCostThreshold);
const auto allowedDialects = parseDialects(getAtOrDefault(args, 7, std::string{ "standard" }));
const auto dialectCost = getAtOrDefault(args, 8, 3.0f);
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
opt.typoTransformer = typoTransformer;
opt.typoThreshold = typoCostThreshold;
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, opt, pretokenized);
json result = json::array();
for (const TokenResult& tokenResult : tokenResults) {
@ -528,25 +567,36 @@ json kiwiGetGlobalConfig(Kiwi& kiwi, const json& args) {
auto config = kiwi.getGlobalConfig();
obj["integrateAllomorph"] = config.integrateAllomorph;
obj["cutOffThreshold"] = config.cutOffThreshold;
obj["unkFormScoreScale"] = config.unkFormScoreScale;
obj["unkFormScoreBias"] = config.unkFormScoreBias;
obj["oovRuleScale"] = config.oovRuleScale;
obj["oovRuleBias"] = config.oovRuleBias;
obj["oovChrBias"] = config.oovChrBias;
obj["oovGlobalWeight"] = config.oovGlobalWeight;
obj["oovLocalWeight"] = config.oovLocalWeight;
obj["oovGlobalMinFreq"] = config.oovGlobalMinFreq;
obj["spacePenalty"] = config.spacePenalty;
obj["typoCostWeight"] = config.typoCostWeight;
obj["maxUnkFormSize"] = config.maxUnkFormSize;
obj["maxUnkFormSizeFollowedByJClass"] = config.maxUnkFormSizeFollowedByJClass;
obj["spaceTolerance"] = config.spaceTolerance;
return obj;
}
json kiwiSetGlobalConfig(Kiwi& kiwi, const json& args) {
KiwiConfig config;
if (args.contains("integrateAllomorph")) config.integrateAllomorph = args["integrateAllomorph"];
if (args.contains("cutOffThreshold")) config.cutOffThreshold = args["cutOffThreshold"];
if (args.contains("unkFormScoreScale")) config.unkFormScoreScale = args["unkFormScoreScale"];
if (args.contains("unkFormScoreBias")) config.unkFormScoreBias = args["unkFormScoreBias"];
if (args.contains("spacePenalty")) config.spacePenalty = args["spacePenalty"];
if (args.contains("typoCostWeight")) config.typoCostWeight = args["typoCostWeight"];
if (args.contains("maxUnkFormSize")) config.maxUnkFormSize = args["maxUnkFormSize"];
if (args.contains("spaceTolerance")) config.spaceTolerance = args["spaceTolerance"];
auto config = kiwi.getGlobalConfig();
const json& configArg = args[0];
if (configArg.contains("integrateAllomorph")) config.integrateAllomorph = configArg["integrateAllomorph"];
if (configArg.contains("cutOffThreshold")) config.cutOffThreshold = configArg["cutOffThreshold"];
if (configArg.contains("oovRuleScale")) config.oovRuleScale = configArg["oovRuleScale"];
if (configArg.contains("oovRuleBias")) config.oovRuleBias = configArg["oovRuleBias"];
if (configArg.contains("oovChrBias")) config.oovChrBias = configArg["oovChrBias"];
if (configArg.contains("oovGlobalWeight")) config.oovGlobalWeight = configArg["oovGlobalWeight"];
if (configArg.contains("oovLocalWeight")) config.oovLocalWeight = configArg["oovLocalWeight"];
if (configArg.contains("oovGlobalMinFreq")) config.oovGlobalMinFreq = configArg["oovGlobalMinFreq"];
if (configArg.contains("spacePenalty")) config.spacePenalty = configArg["spacePenalty"];
if (configArg.contains("typoCostWeight")) config.typoCostWeight = configArg["typoCostWeight"];
if (configArg.contains("maxUnkFormSize")) config.maxUnkFormSize = configArg["maxUnkFormSize"];
if (configArg.contains("maxUnkFormSizeFollowedByJClass")) config.maxUnkFormSizeFollowedByJClass = configArg["maxUnkFormSizeFollowedByJClass"];
if (configArg.contains("spaceTolerance")) config.spaceTolerance = configArg["spaceTolerance"];
kiwi.setGlobalConfig(config);
return nullptr;
}

File diff suppressed because it is too large Load diff

View file

@ -21,12 +21,20 @@
"url": "https://github.com/bab2min/Kiwi/issues"
},
"homepage": "https://lab.bab2min.pe.kr/kiwi",
"publishConfig": {
"provenance": true,
"access": "public"
},
"devDependencies": {
"@types/node": "^20.0.0",
"typedoc": "^0.26.2",
"typescript": "^5.4.5"
"typescript": "^5.4.5",
"undici-types": "^7.22.0",
"vitest": "^1.6.0"
},
"scripts": {
"build": "tsc",
"doc": "typedoc --out doc src"
"doc": "typedoc --out doc src",
"test": "vitest run"
}
}

View file

@ -148,14 +148,4 @@ export interface BuildArgs {
* - `cong-global`: (experimental) Contextual N-gram embedding Language Model. It consists of lightweighted neural networks that can estimate the relationships between morphemes over large distances (up to 7 real morphemes) with high accuracy.
*/
modelType?: 'none' | 'largest' | 'knlm' | 'sbg' | 'cong' | 'cong-global';
/**
* The typo information to use for correction.
* Can be one of the built in `none`, `basic`, `continual`, `basicWithContinual` typo sets, or a custom {@link TypoTransformer}.
* Defaults to `none`, which disables typo correction.
*/
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer;
/**
* The maximum typo cost to consider when correcting typos. Typos beyond this cost will not be explored. Defaults to 2.5.
*/
typoCostThreshold?: number;
};

View file

@ -1,4 +1,5 @@
import { AsyncMethods } from './util.js';
import { TypoTransformer } from './build-args.js';
/**
* Describes a single morpheme in the input string of the morphological analysis.
@ -82,6 +83,11 @@ export enum Match {
mention = 1 << 3,
serial = 1 << 4,
emoji = 1 << 5,
oovRuleOnly = 0 << 8,
oovChrModel = 1 << 8,
oovChrFreqModel = 2 << 8,
oovChrFreqBranchModel = 3 << 8,
oovMask = 3 << 8,
normalizeCoda = 1 << 16,
joinNounPrefix = 1 << 17,
joinNounSuffix = 1 << 18,
@ -91,6 +97,9 @@ export enum Match {
splitComplex = 1 << 22,
zCoda = 1 << 23,
compatibleJamo = 1 << 24,
splitSaisiot = 1 << 25,
mergeSaisiot = 1 << 26,
joinParticleYo = 1 << 27,
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix |
joinNounSuffix |
@ -150,6 +159,25 @@ export interface PretokenizedSpan {
tokenization: PretokenizedToken[];
}
/**
 * Describes global configuration for Kiwi.
 *
 * Every field is optional. This is the type returned by `getGlobalConfig`
 * and accepted by `setGlobalConfig` on the `Kiwi` interface.
 *
 * NOTE(review): the wasm-side `kiwiSetGlobalConfig` appears to start from the
 * current configuration and override only the keys that are present, so a
 * partial object would leave the other settings unchanged — confirm.
 */
export interface KiwiConfig {
integrateAllomorph?: boolean;
cutOffThreshold?: number;
oovRuleScale?: number;
oovRuleBias?: number;
oovChrBias?: number;
oovGlobalWeight?: number;
oovLocalWeight?: number;
oovGlobalMinFreq?: number;
spacePenalty?: number;
typoCostWeight?: number;
maxUnkFormSize?: number;
maxUnkFormSizeFollowedByJClass?: number;
spaceTolerance?: number;
}
/**
* Interface that performs the actual morphological analysis.
* Cannot be constructed directly, use {@link KiwiBuilder} to create a new instance.
@ -177,7 +205,9 @@ export interface Kiwi {
str: string,
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet,
pretokenized?: PretokenizedSpan[]
pretokenized?: PretokenizedSpan[],
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
typoCostThreshold?: number
) => TokenResult;
/**
* Performs morphological analysis. Returns multiple list of tokens along with an analysis score. Use `tokenizeTopN` if the result scores are not needed. Use `analyze` if you need only one result.
@ -193,7 +223,9 @@ export interface Kiwi {
n: number,
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet,
pretokenized?: PretokenizedSpan[]
pretokenized?: PretokenizedSpan[],
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
typoCostThreshold?: number
) => TokenResult[];
/**
* Performs morphological analysis. Returns a single list of tokens. Use `analyze` if the result score is needed. Use `tokenizeTopN` if you need multiple results.
@ -207,7 +239,9 @@ export interface Kiwi {
str: string,
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet,
pretokenized?: PretokenizedSpan[]
pretokenized?: PretokenizedSpan[],
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
typoCostThreshold?: number
) => TokenInfo[];
/**
* Performs morphological analysis. Returns multiple lists of tokens. Use `analyzeTopN` if the result scores are needed. Use `tokenize` if you need only one result.
@ -223,7 +257,9 @@ export interface Kiwi {
n: number,
matchOptions?: Match,
blockList?: Morph[] | MorphemeSet,
pretokenized?: PretokenizedSpan[]
pretokenized?: PretokenizedSpan[],
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
typoCostThreshold?: number
) => TokenInfo[][];
/**
* Returns the input text split into sentences. This method uses stemming internally during the sentence splitting process, so it can also be used to get stemming results simultaneously with sentence splitting.
@ -249,22 +285,10 @@ export interface Kiwi {
lmSearch?: boolean,
withRanges?: boolean
) => SentenceJoinResult;
getCutOffThreshold: () => number;
setCutOffThreshold: (v: number) => void;
getUnkScoreBias: () => number;
setUnkScoreBias: (v: number) => void;
getUnkScoreScale: () => number;
setUnkScoreScale: (v: number) => void;
getMaxUnkFormSize: () => number;
setMaxUnkFormSize: (v: number) => void;
getSpaceTolerance: () => number;
setSpaceTolerance: (v: number) => void;
getSpacePenalty: () => number;
setSpacePenalty: (v: number) => void;
getTypoCostWeight: () => number;
setTypoCostWeight: (v: number) => void;
getIntegrateAllomorphic: () => boolean;
setIntegrateAllomorphic: (v: boolean) => void;
getGlobalConfig: () => KiwiConfig;
setGlobalConfig: (config: KiwiConfig) => void;
/**
* Creates a reusable morpheme set from a list of morphemes. This is intended to be used as the `blockList` parameter for the analyse and tokenize methods.
* NOTE: The morpheme set must be destroyed using `destroyMorphemeSet` when it is no longer needed. Otherwise, it will cause a memory leak.
@ -280,6 +304,7 @@ export interface Kiwi {
destroyMorphemeSet: (id: MorphemeSet) => void;
}
/**
* Interface that performs the actual morphological analysis.
* Same as `Kiwi`, but with all methods returning promises. This can be used when the original `Kiwi` object is constructed with a Web Worker.

View file

@ -0,0 +1,132 @@
import { describe, it, expect, beforeAll } from 'vitest';
import { KiwiBuilder } from '../src/index.js';
import { Kiwi, Match } from '../src/kiwi.js';
import * as fs from 'fs';
import * as path from 'path';
const PROJECT_ROOT = path.resolve(__dirname, '../../../../');
const WASM_PATH = path.resolve(PROJECT_ROOT, 'bindings/wasm/build/bindings/wasm/kiwi-wasm.wasm');
const MODEL_DIR = path.resolve(PROJECT_ROOT, 'models/cong/base');
/**
 * Reads the known model files found under `MODEL_DIR` into memory.
 * Names that do not exist on disk are left out of the result.
 *
 * @returns Map from file name to the file's raw contents.
 */
function loadModelFiles(): Record<string, Uint8Array> {
    const candidateNames = [
        'combiningRule.txt', 'cong.mdl', 'default.dict',
        'dialect.dict', 'extract.mdl', 'multi.dict',
        'nounchr.mdl', 'sj.morph', 'typo.dict'
    ];
    const loaded: Record<string, Uint8Array> = {};
    candidateNames.forEach((name) => {
        const fullPath = path.join(MODEL_DIR, name);
        if (!fs.existsSync(fullPath)) return;
        loaded[name] = fs.readFileSync(fullPath);
    });
    return loaded;
}
/**
 * Integration tests for the Kiwi WASM binding.
 *
 * One builder and one Kiwi instance are created in `beforeAll` and shared by
 * every test. When the WASM binary is missing, `beforeAll` logs a warning and
 * leaves both unset, and each test returns early without asserting.
 */
describe('Kiwi WASM', () => {
    let kiwiBuilder: KiwiBuilder;
    let kiwi: Kiwi;

    beforeAll(async () => {
        if (!fs.existsSync(WASM_PATH)) {
            console.warn(`WASM file not found at ${WASM_PATH}. Skipping tests.`);
            return;
        }
        kiwiBuilder = await KiwiBuilder.create(WASM_PATH);
        kiwi = await kiwiBuilder.build({
            modelFiles: loadModelFiles(),
            modelType: 'cong',
            integrateAllomorph: true,
        });
    });

    it('should be initialized', async () => {
        if (!kiwiBuilder) return;
        expect(kiwiBuilder.version()).toBeTypeOf('string');
    });

    it('should tokenize text', async () => {
        if (!kiwi) return;
        expect(kiwi.ready()).toBe(true);
        const result = kiwi.tokenize('안녕하세요 세계');
        expect(result.length).toBeGreaterThan(0);
        const tokens = result.map(t => t.str);
        expect(tokens).toContain('안녕');
    });

    it('should split sentences', async () => {
        if (!kiwi) return;
        const result = kiwi.splitIntoSents('안녕하세요. 반갑습니다!');
        expect(result.spans.length).toBe(2);
    });

    it('should correct typos with basic typo set via tokenize', async () => {
        if (!kiwi) return;
        // Without typo correction
        const tokensNoTypo = kiwi.tokenize('나 죰 도와죠.');
        expect(tokensNoTypo.map(t => t.str)).toContain('죰');
        // With basic typo correction at analyze time
        const tokensWithTypo = kiwi.tokenize('나 죰 도와죠.', Match.allWithNormalizing, undefined, undefined, 'basic');
        expect(tokensWithTypo.map(t => t.str)).toContain('좀');
    });

    it('should correct typos with basic typo set via analyze', async () => {
        if (!kiwi) return;
        const result = kiwi.analyze('나 죰 도와죠.', Match.allWithNormalizing, undefined, undefined, 'basic');
        expect(result.tokens.map(t => t.str)).toContain('좀');
    });

    it('should correct typos with basic typo set via analyzeTopN', async () => {
        if (!kiwi) return;
        const results = kiwi.analyzeTopN('나 죰 도와죠.', 3, Match.allWithNormalizing, undefined, undefined, 'basic');
        expect(results.length).toBeGreaterThan(0);
        expect(results[0].tokens.map(t => t.str)).toContain('좀');
    });

    it('should correct continual typos', async () => {
        if (!kiwi) return;
        const tokens = kiwi.tokenize('프로그래미', Match.allWithNormalizing, undefined, undefined, 'continual');
        const forms = tokens.map(t => t.str);
        expect(forms[0]).toBe('프로그램');
        expect(forms[1]).toBe('이');
    });

    it('should correct typos with basicWithContinual', async () => {
        if (!kiwi) return;
        // continual typo
        const tokens1 = kiwi.tokenize('프로그래미', Match.allWithNormalizing, undefined, undefined, 'basicWithContinual');
        expect(tokens1.map(t => t.str)[0]).toBe('프로그램');
        // basic typo
        const tokens2 = kiwi.tokenize('나 죰 도와죠.', Match.allWithNormalizing, undefined, undefined, 'basicWithContinual');
        expect(tokens2.map(t => t.str)).toContain('좀');
    });

    it('should get and set global config', async () => {
        if (!kiwi) return;
        const config = kiwi.getGlobalConfig();
        expect(config.cutOffThreshold).toBeTypeOf('number');
        const originalThreshold = config.cutOffThreshold;
        try {
            kiwi.setGlobalConfig({ cutOffThreshold: 10 });
            const newConfig = kiwi.getGlobalConfig();
            expect(newConfig.cutOffThreshold).toBe(10);
        } finally {
            // Restore even when an assertion above throws: the instance is shared by the whole suite.
            kiwi.setGlobalConfig({ cutOffThreshold: originalThreshold });
        }
    });
});

View file

@ -1,9 +1,11 @@
{
"compilerOptions": {
"module": "ES6",
"moduleResolution": "node",
"target": "ES2017",
"allowJs": true,
"declaration": true,
"skipLibCheck": true,
"outDir": "./dist"
},
"include": ["src/**/*"],

View file

@ -11,19 +11,19 @@
아 때는 싸드바리로 배깥을 돌아댕기기도 하다와. 아__23/NNG 때/NNG 는/JX 싸드바리__1/NNG 로/XSM 배깥__1/NNG 을/JKO 돌아댕기__1/VV 기/ETN 도/JX 하/VX 다와/EF ./SF 아이 때는 알몸으로 바깥을 돌아다니기도 합디다
이따가 지냑에 동네 으른들 모세 놓구 술대즙으 한다든데요. 이따가/MAG 지냑__1/NNG 에/JKB 동네/NNG 으른__1/NNG 들/XSN 모시/VV 어/EC 놓/VV 구/EC 술대즙__1/NNG 으/JKO 하/VV ᆫ다든데/EF 요/JX ./SF 이따가 저녁에 동네 어른들 모셔 놓고 술대접을 한다던데요.
할루근네 한 번씩 말으 일기든 우리 아덜내미거 이제는 다 커서 취직으 했잖소. 할루근네__1/MAG 한/MM 번/NNB 씩/XSN 말__34/NNG 으/JKO 일기__1/VV 든/ETM 우리/NP 아덜내미__1/NNG 거/JKS 이제/NNG 는/JX 다/MAG 크/VV 어서/EC 취직/NNG 으/JKO 하/XSV 었/EP 지/EC 않/VX 소/EC ./SF 하루건너 한 번씩 말썽을 일으키던 우리 아들내미가 이제는 다 커서 취직을 했잖소.
되바래진 어린눔이 버르장머리라군 웂구 아주 앨미르와. 되바래지/VV ᆫ/ETM 어린눔__1/NNG 이/JKS 버르장머리/NNG 이/VCP 라구/EC ᆫ/JX 웂__1/VA 구/EC 아주/MAG 앨미릅/VA-I 어/EF ./SF 되바라진 어린놈이 버르장머리라곤 없고 아주 얄미워.
되바래진 어린눔이 버르장머리라군 웂구 아주 앨미르와. 되바래지/VV ᆫ/ETM 어린눔__1/NNG 이/JKS 버르장머리/NNG 이/VCP 라구/EC ᆫ/JX 웂__1/VA 구/EC 아주/MAG 앨미릅/VA-I 어/EF ./SF 되바라진 어린놈이 버르장머리라곤 없고 아주 얄미워.
지가 잘못해 놓구 외루 화르 내드라니! 지__49/NNG 가/JKS 잘못/MAG 하/VV 어/EC 놓/VX 구/EC 외루__3/MAG 화/NNG 르/JKO 내/VV 드라니/EF !/SF 자기가 잘못해 놓고 외려 화를 내더라니!
맛없는 음석이래도 개 주자니 아꿉다. 맛없/VA 는/ETM 음석__1/NNG 이/VCP 래도/EC 개/NNG 주/VV 자니/EC 아꿉__1/VA-I 다/EF ./SF 맛없는 음식이라도 개 주자니 아깝다.
말 안 듣구 나가 놀다가 넘어졌다니 그것 아주 싸고지다. 말/NNG 안/MAG 듣/VV-I 구/EC 나가/VV 어/EC 놀/VV 다가/EC 넘어지/VV 었/EP 다니/EC 그것/NP 아주/MAG 싸고지__1/NNG 이/VCP 다/EF ./SF 말 안 듣고 나가 놀다가 넘어졌다니 그것 아주 쌤통이다.
자는 어렜을 적부터 대두 쫄라 대는 버릇이 있었아. 자/NP 는/JX 어리/VA 었/EP 을/ETM 적/NNB 부터/JX 대두__9/MAG 쪼르/VV 어/EC 대/VX 는/ETM 버릇/NNG 이/JKS 있/VV 었/EP 아/EF ./SF 쟤는 어렸을 적부터 되우 졸라 대는 버릇이 있었어.
초저욹이래두 바람이 차니까 옷으 뜨시게 입구 나가라. 초저욹__1/NNG 이/VCP 래두/EC 바람/NNG 이/JKS 차/VA 니까/EC 옷/NNG 으/JKO 뜨시__1/VA 게/EC 입/VV-R 구/EC 나가/VV 라/EF ./SF 초겨울이라도 바람이 차니까 옷을 따뜻하게 입고 나가라.
책상 모새이에 부닺헤서 멍당구가 시퍼렇게 들었아. 책상/NNG 모새이__1/NNG 에/JKB 부닺히/VV 어서/EC 멍당구__1/NNG 가/NNG 시퍼렇/VA-I 게/EC 들/VV 었/EP 아/EF ./SF 책상 모서리에 부딪혀서 멍이 시퍼렇게 들었어.
책상 모새이에 부닺헤서 멍당구가 시퍼렇게 들었아. 책상/NNG 모새이__1/NNG 에/JKB 부닺히/VV 어서/EC 멍당구__1/NNG 가/JKS 시퍼렇/VA-I 게/EC 들/VV 었/EP 아/EF ./SF 책상 모서리에 부딪혀서 멍이 시퍼렇게 들었어.
올해는 가믐더우 땜에 나락이 쨀고 그래서 걱정이래. 올해/NNG 는/JX 가믐더우__1/NNG 땜/NNB 에/JKB 나락__2/NNG 이/MM 쨀__1/VA 고/EC 그래서/MAJ 걱정/NNG 이/VCP 래/EF ./SF 올해는 가뭄더위 땜에 벼가 잘고 그래서 걱정이래.
버점은 허옇기 일어나는 것도 있고 자꾸 번지 나가는 것도 있어여. 버점__1/NNG 은/JX 허옇/VA-I 기/ETN 일어나/VV 는/ETM 것/NNB 도/JX 있/VA 고/EC 자꾸/MAG 번지/VV 어/EC 나가/VX 는/ETM 것/NNB 도/JX 있/VA 어여/EF ./SF 버짐은 허옇게 일어나는 것도 있고 자꾸 번져 나가는 것도 있어요.
우산 없이 길으 가다가 소나기르 좔락 다 맞았다. 우산/NNG 없이/MAG 길/NNG 으/JKO 가/VV 다가/EC 소나기/NNG 르/JKO 좔락__1/MAG 다/MAG 맞/VV 었/EP 다/EF ./SF 우산 없이 길을 가다가 소나기를 쫄딱 다 맞았다.
콩국시가 아주 걸찌한 기 맛이 참 고만이네. 콩국시__1/NNG 가/JKS 아주/MAG 걸찌/XR 하/XSA ᆫ/ETM 기__76/NNB 이/JKS 맛/NNG 이/JKS 참/MAG 고만/MAG 이/VCP 네/EF ./SF 콩국수가 아주 걸쭉한 게 맛이 참 고만이네.
가서 쇠궁이에 여물 좀 줘라. 가/VV 어서/EC 쇠궁이__1/NNG 에/JKB 여물/NNG 좀/MAG 주/VV 어라/EF ./SF 가서 소구유에 여물 좀 줘라.
구리가 궁게이로 드가 뿠어. 구렁이/NNG 가/JKS 궁게이__1/NNG 로/XSM 드가/VV 어/EC 뿌/VX 었/EP 어/EF ./SF 구렁이가 구멍으로 들어가 버렸어.
구리가 궁게이로 드가 뿠어. 구리__14/NNG 가/JKS 궁게이__1/NNG 로/XSM 드가/VV 어/EC 뿌/VX 었/EP 어/EF ./SF 구렁이가 구멍으로 들어가 버렸어.
엉차 내가 다 할 일이잖소. 엉차__1/MAG 나/NP 가/JKS 다/MAG 하/VV ᆯ/ETM 일/NNG 이/VCP 잖소/EF ./SF 어차피 내가 다 할 일이잖소.
어제 자에 갔더거 어릴 적 친구르 맞주이했잖소. 어제/MAG 자__53/NNG 에/JKB 가/VV 었/EP 더거/EC 어리/VA ᆯ/ETM 적/NNB 친구/NNG 르/JKO 맞주이하/VV 었/EP 잖소/EF ./SF 어제 장에 갔다가 어릴 적 친구와 마주쳤잖소.
떡으 할라모 실그가 있어야 하잖소. 떡/NNG 으/JKO 하/VV ᆯ라모/EC 실그__1/NNG 가/JKS 있/VV 어야/EC 하/VX 잖소/EF ./SF 떡을 하려면 시루가 있어야 하잖소.

View file

@ -28,7 +28,7 @@
나는 그 음식을 음작음작 먹엇주. 나/NP 는/JX 그/MM 음식/NNG 을/JKO 음작음작__1/MAG 먹/VV 엇/EP 주/EF ./SF 나는 그 음식을 우물우물 먹었지.
작년에 입어난 바진디 오돌랑한 거 보난 지레가 하영 큰 셍이라. 작년/NNG 에/JKB 입/VV-R 어나/EP ᆫ/ETM 바지/NNG 이/VCP ᆫ디/EC 오돌랑하__1/VA ᆫ/ETM 거/NNB 보/VV 난/EC 지레__3/NNG 가/JKS 하영__1/MAG 크/VV ᆫ/ETM 셍__2/NNB 이/VCP 라/EF ./SF 작년에 입었던 바지인데 껑충한 것을 보니 키가 많이 큰 모양이야.
낭이 잘 자라젠 ᄒᆞ민 알거시려 줘사주게. 낭__2/NNG 이/JKS 잘/MAG 자라/VV 젠/EC ᄒᆞ/VX 민/EC 알거시리__1/VV 어/EC 주/VX 어사/EC 주/EF 게/EF ./SF 나무가 잘 자라려고 하면 가지치기해 줘야지.
보릿낭을 긏앙 입에 대영 불민 소리가 나는디 걸 보리피리렌 ᄀᆞᆮ주. 보릿낭__1/NNG 을/JKO 긏/VV 엉/EC 입/NNG 에/JKB 대/VV 엉/EC 불/VV 민/EC 소리/NNG 가/JKS 나/VV 는디/EC 거/NP ᆯ/JKO 보리피리/NNG 이/VCP 렌/EC ᄀᆞᆮ/VV-I 주/EF ./SF 보릿짚을 잘라서 입에 대고 불면 소리가 나는데 그걸 보리피리라고 말하지.
보릿낭을 긏앙 입에 대영 불민 소리가 나는디 걸 보리피리렌 ᄀᆞᆮ주. 보릿낭__1/NNG 을/JKO 긏/VV 엉/EC 입/NNG 에/JKB 대/VV 엉/EC 불/VV 민/EC 소리/NNG 가/JKS 나/VV 는디/EC 거/NP ᆯ/JKO 보리피리/NNG 이/VCP 렌/EC ᄀᆞᆮ/VV-I 주/EF ./SF 보릿짚을 잘라서 입에 대고 불면 소리가 나는데 그걸 보리피리라고 말하지.
부리땡이에 데나네 조심허라. 부리땡이__1/NNG 에/JKB 데/VV 나네/EC 조심/NNG ᄒᆞ/XSV 어라/EF ./SF 부지깽이에 데니까 조심해라.
친구덜끼리 모연 춤추곡 놀레 부르곡 잠베질하곡 놀앗주게. 친구/NNG 덜/XSN 끼리/XSN 모이/VV 언/EC 춤추/VV 곡/EC 놀래__1/NNG 부르/VV 곡/EC 잠베질하__1/VV 곡/EC 놀/VV 엇/EP 주/EF 게/EF ./SF 친구들끼리 모여서 춤추고 노래하고 재잘거리고 놀았지요.
보리 ᄀᆞᆯ앙 보리ᄊᆞᆯ은 사름 먹곡 보리체는 쉐나 뒈지 것으로 주곡 헷엇지. 보리/NNG ᄀᆞᆯ__2/VV 엉/EC 보리/NNG ᄊᆞᆯ/NNG 은/JX 사름__2/NNG 먹/VV 곡/EC 보리/NNG 체/NNG 는/JX 쉐__4/NNG 나/JC 뒈지__1/NNG 것__7/NNG 으로/JKB 주/VV 곡/EC ᄒᆞ/VX 엇엇/EP 지/EF ./SF 보리 갈아서 보리쌀은 사람 먹고 보리체는 소나 돼지 먹이로 주고 했었지.
@ -80,7 +80,7 @@
아이덜신디 밥 ᄒᆞ꼼 줄 때도 가지깽이에 떵 주민 안 뒈여. 아이/NNG 덜/XSN 신디__1/JKB 밥/NNG ᄒᆞ꼼__3/MAG 주/VV ᆯ/ETM 때/NNG 도/JX 가지깽이__1/NNG 에/JKB 뜨/VV 엉/EC 주/VX 민/EC 안/MAG 뒈__3/VV 어/EF ./SF 아이들에게 밥 조금 줄 때도 바리뚜껑에 떠서 주면 안 돼.
옛날엔 배 골르지 아녀게 지내민 것으로 뒈엇엇주. 옛날/NNG 에/JKB ᆫ/JX 배/NNG 골르__4/VV 지/EC 아니/VCN 어게/EC 지내/VV 민/EC 것__9/NP 으로/JKB 뒈__3/VV 엇엇/EP 주/EF ./SF 옛날엔 배를 곯지 않게 지내면 그것으로 되었었지.
여이난 씨언한 물 좀 줍서. 여이__2/VV 난/EC 씨언하__1/VA ᆫ/ETM 물/NNG 좀/MAG 주/VV ᆸ서/EF ./SF 목마르니 시원한 물 좀 주세요.
ᄒᆞ꼼허민 벨착벨착, 그 성질머리부터 고쳐사 헤. ᄒᆞ꼼__3/MAG ᄒᆞ/VV 민/EC 벨착벨착__1/MAG ,/SP 그/MM 성질/NNG 머리/NNG 부터/JX 고치/VV 어사/EC ᄒᆞ/VX 어/EF ./SF 툭하면 발끈발끈, 그 성질부터 고쳐야 해.
ᄒᆞ꼼허민 벨착벨착, 그 성질머리부터 고쳐사 헤. ᄒᆞ꼼__3/MAG ᄒᆞ/VV 민/EC 벨착벨착__1/MAG ,/SP 그/MM 성질/NNG 머리/NNG 부터/JX 고치/VV 어사/EC ᄒᆞ/VX 어/EF ./SF 툭하면 발끈발끈, 그 성질부터 고쳐야 해.
체허영 소화 안 뒈곡 헐 때 바농땡이로 손 따민 내려가메. 체/NNG ᄒᆞ/XSV 엉/EC 소화/NNG 안/MAG 뒈__3/VV 곡/EC ᄒᆞ/VV ᆯ/ETM 때/NNG 바농땡이__1/NNG 로/JKB 손/NNG 따/VV 민/EC 내려가/VV 메/EF ./SF 체해서 소화 안 되고 할 때 바늘로 손을 따면 내려가지.
그 사름은 경 심보가 나쁘난 일찍 뒈여졋주. 그/MM 사름__2/NNG 은/JX 경__1/MAG 심보/NNG 가/JKS 나쁘/VA 난/EC 일찍/MAG 뒈여지__1/VV 엇/EP 주/EF ./SF 그 사람은 그리 심보가 나쁘니까 일찍 죽었지.
밥맛도 엇고 ᄒᆞ영 그냥 물에 잠아먹엇주게. 밥맛/NNG 도/JX 엇__2/VA 고/EC ᄒᆞ/VX 엉/EC 그냥/MAG 물/NNG 에/JKB 잠아먹__1/VV 엇/EP 주/EF 게/EF ./SF 밥맛도 없고 해서 그냥 물에 말아먹었지요.

View file

@ -1,14 +1,3 @@
/**
* @file CoNgramModel.h
* @author bab2min (bab2min@gmail.com)
* @brief N-gram (Contextual N-gram Model)
* @version 0.22.1
* @date 2025-11-21
*
* .
* N-gram .
*/
#pragma once
#include <array>
@ -26,49 +15,53 @@ namespace kiwi
{
namespace lm
{
/**
* @brief N-gram
*/
struct CoNgramModelHeader
{
uint64_t vocabSize; /**< 어휘 크기 */
uint64_t contextSize; /**< 문맥 크기 */
uint16_t dim; /**< 임베딩 차원 */
uint8_t contextType; /**< 문맥 타입 */
uint8_t outputType; /**< 출력 타입 */
uint8_t keySize; /**< 키 크기 */
uint8_t windowSize; /**< 윈도우 크기 */
uint8_t qbit; /**< 양자화 비트 수 */
uint8_t qgroup; /**< 양자화 그룹 크기 */
uint64_t numNodes; /**< 노드 개수 */
uint64_t nodeOffset; /**< 노드 데이터 오프셋 */
uint64_t keyOffset; /**< 키 데이터 오프셋 */
uint64_t valueOffset; /**< 값 데이터 오프셋 */
uint64_t embOffset; /**< 임베딩 데이터 오프셋 */
enum
{
hasOutputEmbBias = 1 << 0,
hasReorderedVocab = 1 << 1,
hasTrieFrequency = 1 << 2,
};
uint64_t vocabSize, contextSize;
uint16_t dim;
uint16_t flags;
uint8_t keySize, windowSize, qbit, qgroup;
uint64_t numNodes;
uint64_t nodeOffset, keyOffset, valueOffset, embOffset;
};
/**
* @brief N-gram
*
* @tparam KeyType
* @tparam ValueType
* @tparam DiffType diff
*/
template<class KeyType, class ValueType, class DiffType = int32_t>
struct Node
{
KeyType numNexts = 0; /**< 다음 노드의 개수 */
ValueType value = 0; /**< 노드 값 */
DiffType lower = 0; /**< 하위 노드로의 오프셋 */
uint32_t nextOffset = 0; /**< 다음 노드들의 시작 오프셋 */
KeyType numNexts = 0;
ValueType value = 0;
DiffType lower = 0;
uint32_t nextOffset = 0;
};
/**
* @brief Full specialization of Node for KeyType = uint16_t, ValueType = uint32_t
* and DiffType = int32_t.
*
* It declares the same members as the generic Node (numNexts, value, lower,
* nextOffset) plus one additional 16-bit member, `depth`, placed between
* `numNexts` and `value`.
*/
template<>
struct Node<uint16_t, uint32_t, int32_t>
{
uint16_t numNexts = 0;
// Extra member that only this specialization has.
// NOTE(review): presumably the depth of the node inside the trie - confirm.
uint16_t depth = 0;
uint32_t value = 0;
int32_t lower = 0;
uint32_t nextOffset = 0;
};
/**
 * @brief Compile-time trait telling whether a node type declares a `depth` member.
 *
 * The primary template yields std::false_type; only
 * Node<uint16_t, uint32_t, int32_t> is specialized to std::true_type.
 */
template<class T>
struct HasDepthField : std::false_type {};

template<>
struct HasDepthField<Node<uint16_t, uint32_t, int32_t>> : std::true_type {};
class CoNgramModelBase : public ILangModel
{
protected:
@ -84,84 +77,25 @@ namespace kiwi
size_t vocabSize() const override { return header.vocabSize; }
size_t getMemorySize() const override { return memorySize; }
/**
* @brief Gives read-only access to the model header stored in this object.
* @return const reference to the `header` member (a CoNgramModelHeader)
*/
const CoNgramModelHeader& getHeader() const { return header; }
/**
* @brief Looks up the words most similar to a given word (pure virtual).
* @param vocabId ID of the query word
* @param topN maximum number of results requested
* @param output caller-provided buffer of (ID, float) pairs;
*               NOTE(review): presumably must hold at least topN entries and the
*               float is a similarity score - confirm against the implementation
* @return NOTE(review): presumably the number of entries written to output - confirm
*/
virtual size_t mostSimilarWords(uint32_t vocabId, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief Computes a similarity value between two words (pure virtual).
* @param vocabId1 ID of the first word
* @param vocabId2 ID of the second word
* @return similarity as a float; the scale is defined by the implementation
*/
virtual float wordSimilarity(uint32_t vocabId1, uint32_t vocabId2) const = 0;
/**
* @brief Looks up the contexts most similar to a given context (pure virtual).
* @param contextId ID of the query context
* @param topN maximum number of results requested
* @param output caller-provided buffer of (ID, float) pairs;
*               NOTE(review): presumably must hold at least topN entries - confirm
* @return NOTE(review): presumably the number of entries written to output - confirm
*/
virtual size_t mostSimilarContexts(uint32_t contextId, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief Computes a similarity value between two contexts (pure virtual).
* @param contextId1 ID of the first context
* @param contextId2 ID of the second context
* @return similarity as a float; the scale is defined by the implementation
*/
virtual float contextSimilarity(uint32_t contextId1, uint32_t contextId2) const = 0;
/**
* @brief Predicts words for a given context (pure virtual).
* @param contextId ID of the context to predict from
* @param topN maximum number of results requested
* @param output caller-provided buffer of (ID, float) pairs;
*               NOTE(review): presumably (word ID, score) with room for topN entries - confirm
* @return NOTE(review): presumably the number of entries written to output - confirm
*/
virtual size_t predictWordsFromContext(uint32_t contextId, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief Predicts words for a context relative to a background context (pure virtual).
* @param contextId ID of the target context
* @param bgContextId ID of the background context
* @param weight weighting factor;
*               NOTE(review): presumably applied to the background context's contribution - confirm
* @param topN maximum number of results requested
* @param output caller-provided buffer of (ID, float) pairs
* @return NOTE(review): presumably the number of entries written to output - confirm
*/
virtual size_t predictWordsFromContextDiff(uint32_t contextId, uint32_t bgContextId, float weight, size_t topN, std::pair<uint32_t, float>* output) const = 0;
/**
* @brief Converts a sequence of vocabulary IDs into a single context ID (pure virtual).
* @param vocabIds pointer to the first vocabulary ID of the sequence
* @param size number of IDs in the sequence
* @return the resulting context ID
*/
virtual uint32_t toContextId(const uint32_t* vocabIds, size_t size) const = 0;
/**
* @brief .
* @return -
*/
virtual std::vector<std::vector<uint32_t>> getContextWordMap() const = 0;
virtual float getContextFrequency(uint32_t contextId) const = 0;
virtual float getContextEntropy(uint32_t contextId) const = 0;
virtual size_t getNodeDepth(uint32_t nodeId) const = 0;
virtual std::vector<std::vector<uint32_t>> getContextWordMap() const = 0;
virtual float progressOneStep(int32_t& nodeIdx, uint32_t& contextIdx, uint32_t next) const = 0;
/**
* @brief - .
* @return - const
*/
const std::vector<std::vector<uint32_t>>& getContextWordMapCached() const
{
if (contextWordMapCache.empty())
@ -171,22 +105,15 @@ namespace kiwi
return contextWordMapCache;
}
/**
* @brief Builds a model and returns it as a utils::MemoryObject.
* @param contextDefinition NOTE(review): presumably a path to (or the content of) the context definition - confirm
* @param embedding NOTE(review): presumably a path to (or the content of) the embedding data - confirm
* @param maxContextLength upper bound on context length; the default is -1 converted
*                         to size_t, i.e. the largest size_t value
* @param useVLE whether to use VLE (Variable Length Encoding); defaults to true
* @param reorderContextIdx whether context indices are reordered; defaults to true
* @param selectedEmbIdx optional list of embedding indices; nullptr (the default) passes no list
* @return the built model data
*/
static utils::MemoryObject build(const std::string& contextDefinition, const std::string& embedding,
size_t maxContextLength = -1,
bool useVLE = true,
bool reorderContextIdx = true,
const std::vector<size_t>* selectedEmbIdx = nullptr);
static utils::MemoryObject buildChrModel(const std::string& contextDefinition, const std::string& embedding,
size_t maxContextLength = -1, bool reorderContextIdx = true, bool eraseRedundantContexts = false);
static std::unique_ptr<CoNgramModelBase> create(utils::MemoryObject&& mem,
ArchType archType = ArchType::none,
bool useDistantTokens = false,

View file

@ -136,4 +136,81 @@ namespace kiwi
std::vector<std::pair<std::vector<uint32_t>, size_t>> extractPrefixes(size_t minCnt, size_t maxLength, size_t numWorkers = 1, bool exclusiveCnt = false) const;
};
/**
* @brief Character-level tokenizer interface: converts between text and int32_t token IDs.
*
* Only vocabSize() is defined here; the other member functions are declared
* and implemented elsewhere.
*/
class ChrTokenizer
{
public:
/**
* @brief Token ID layout.
*
* With the enumerator order below the values are: bos = eos = 0, sf..sh = 1..9,
* hangulSyllableStart = 10, hangulCodaStart = 409, asciiStart = 436, max = 530.
*/
enum class Token : int32_t
{
bos = 0,
eos = 0, // deliberately shares the value 0 with bos
sf, sp, ss, sso, ssc, se, so, sw, sh,
hangulSyllableStart,
// NOTE(review): 399 presumably = 19 initial consonants x 21 vowels - confirm
hangulCodaStart = hangulSyllableStart + 399,
// NOTE(review): 27 presumably = number of final consonants (codas) - confirm
asciiStart = hangulCodaStart + 27,
// NOTE(review): 94 presumably = printable ASCII characters excluding space - confirm
max = asciiStart + 94,
};
/** @brief Encodes a single code point. NOTE(review): presumably returns the number of tokens or a token ID - confirm. */
size_t encodeOne(char32_t ch) const;
/** @brief Encodes text into outBuf, which has room for bufSize tokens. NOTE(review): return value presumably is the token count - confirm. */
size_t encode(std::string_view text, int32_t* outBuf, size_t bufSize) const;
/** @brief Decodes tokenCnt tokens starting at tokenBuf into a UTF-16 string. */
std::u16string decode(const int32_t* tokenBuf, size_t tokenCnt) const;
/** @brief Vocabulary size, equal to the numeric value of Token::max. */
size_t vocabSize() const { return static_cast<size_t>(Token::max); }
};
/**
* @brief Dataset of weighted sentences that is consumed in batches through next().
*
* The class is move-only: copy construction and copy assignment are deleted.
* Apart from the trivial getters, all member functions are implemented elsewhere.
*/
class ChrDataset
{
static constexpr int32_t nonVocab = -1;
// Sentence storage; HiddenMember hides the concrete RaggedVector type behind a fixed-size buffer.
HiddenMember<RaggedVector<int32_t>, sizeof(Vector<size_t>) * 2> sents;
Vector<float> sentWeights, sentSampled;
Vector<uint32_t> shuffledIdcs;
Vector<uint32_t> nonLabelPrefixSizes;
double totalWeight = 0.;
size_t totalSampled = 0;
std::unique_ptr<utils::ThreadPool> workers;
float prefixDropoutProb = 0.f;
std::mt19937_64 rng;
utils::FrozenTrie<uint32_t, uint32_t> contextualMapper;
size_t batchSize = 0;
size_t causalContextSize = 0;
size_t windowSize = 0;
size_t currentSeed = 0;
size_t consumedSents = 0;
bool sampleWithoutWeights = false;
// Private template declared next to the two public next() overloads.
// NOTE(review): presumably their shared implementation - confirm.
template<class InTy, class OutTy>
size_t _next(InTy in, OutTy out);
public:
/**
* @brief Constructs a dataset; every parameter has a default, so this also
* serves as the default constructor.
*/
ChrDataset(size_t _batchSize = 0,
size_t _causalContextSize = 0,
size_t _windowSize = 0,
float _prefixDropoutProb = 0.f,
bool _sampleWithoutWeights = false,
const std::vector<std::pair<size_t, std::vector<uint32_t>>>& contextualMapper = {}
);
~ChrDataset();
ChrDataset(const ChrDataset&) = delete;
// noexcept is intentionally commented out on the move operations below.
ChrDataset(ChrDataset&&) /*noexcept*/;
ChrDataset& operator=(const ChrDataset&) = delete;
ChrDataset& operator=(ChrDataset&&) /*noexcept*/;
/** @brief Adds one sentence with an optional weight (default 1) and an optional non-label prefix (default empty). */
void addSentence(std::string_view sentence, float weight = 1.f, std::string_view nonLabelPrefix = {});
size_t numSents() const;
/** @brief Returns the totalWeight member. */
double getTotalWeight() const { return totalWeight; }
/** @brief Returns the batchSize member. */
size_t getBatchSize() const { return batchSize; }
/** @brief Returns the causalContextSize member. */
size_t getCausalContextSize() const { return causalContextSize; }
/** @brief Returns the windowSize member. */
size_t getWindowSize() const { return windowSize; }
/** @brief Returns the vocabulary size of a default-constructed ChrTokenizer. */
size_t vocabSize() const { return ChrTokenizer{}.vocabSize(); }
std::vector<float> getVocabProbs(double epsilon = 0.1) const;
void seed(size_t newSeed);
void reset();
/** @brief Produces the next batch into the in/out buffers; overloads exist for 32-bit and 64-bit integers. */
size_t next(int32_t* in, int32_t* out);
size_t next(int64_t* in, int64_t* out);
std::vector<std::pair<std::vector<uint32_t>, double>> extractPrefixes(float resolution, float minWeight, size_t maxLength, size_t numWorkers = 1, bool exclusiveCnt = false) const;
};
}

View file

@ -2,8 +2,8 @@
* @file Form.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
* @version 0.23.1
* @date 2026-04-05
*
*
*/
@ -238,6 +238,9 @@ namespace kiwi
uint8_t formHash = 0;
uint8_t zCodaAppendable : 1;
uint8_t zSiotAppendable : 1;
uint8_t hasJClass : 1;
uint8_t hasAnyFullMorphemes : 1;
uint8_t _reserved : 4;
Dialect dialect = Dialect::standard;
Form();

View file

@ -1,16 +1,4 @@
/**
* @file FrozenTrie.h
* @author bab2min (bab2min@gmail.com)
* @brief (immutable) Trie
* @version 0.22.1
* @date 2025-11-21
*
* Trie , .
* Aho-Corasick (fail link) .
* .
*/
#pragma once
#pragma once
#include <array>
#include <vector>
@ -27,10 +15,6 @@ namespace kiwi
{
namespace detail
{
/**
* @brief
* @tparam Value
*/
template<class Value, class = void>
struct HasSubmatch {};
@ -82,18 +66,6 @@ namespace kiwi
};
}
/**
* @brief (frozen) Trie
*
* Trie로, .
* Aho-Corasick (fail function)
* .
*
* @tparam _Key ()
* @tparam _Value
* @tparam _Diff diff
* @tparam _HasSubmatch
*/
template<class _Key, class _Value, class _Diff = int32_t, class _HasSubmatch = detail::HasSubmatch<_Value>>
class FrozenTrie : public _HasSubmatch
{
@ -102,46 +74,20 @@ namespace kiwi
using Value = _Value;
using Diff = _Diff;
/**
* @brief Trie의
*/
struct Node
{
Key numNexts = 0; /**< 자식 노드의 개수 */
Diff lower = 0; /**< 하위 노드로의 오프셋 */
uint32_t nextOffset = 0; /**< 다음 노드들의 시작 오프셋 */
Key numNexts = 0;
uint16_t depth = 0;
Diff lower = 0;
uint32_t nextOffset = 0;
/**
* @brief .
* @tparam arch ( )
* @param ft FrozenTrie
* @param c
* @return , nullptr
*/
template<ArchType arch>
const Node* nextOpt(const FrozenTrie& ft, Key c) const;
/**
* @brief .
* @tparam arch
* @param ft FrozenTrie
* @param c
* @return
*/
template<ArchType arch>
const Node* findFail(const FrozenTrie& ft, Key c) const;
/**
* @brief .
* @return
*/
const Node* fail() const;
/**
* @brief .
* @param ft FrozenTrie
* @return const
*/
const Value& val(const FrozenTrie& ft) const;
};
private:

View file

@ -1,14 +1,3 @@
/**
* @file Joiner.h
* @author bab2min (bab2min@gmail.com)
* @brief Joiner
* @version 0.22.1
* @date 2025-11-21
*
* , .
* .
*/
#pragma once
#include "Types.h"
#include "ArchUtils.h"
@ -24,23 +13,13 @@ namespace kiwi
class CompiledRule;
class AutoJoiner;
/**
* @brief
*/
enum class Space
{
none = 0, /**< 공백 처리 없음 */
no_space = 1, /**< 공백을 삽입하지 않음 */
insert_space = 2, /**< 공백을 삽입함 */
none = 0,
no_space = 1,
insert_space = 2,
};
/**
* @brief
*
*
* .
* CompiledRule을 .
*/
class Joiner
{
friend class CompiledRule;
@ -63,51 +42,19 @@ namespace kiwi
Joiner& operator=(const Joiner&);
Joiner& operator=(Joiner&&);
/**
* @brief Appends a morpheme given as a UTF-16 string.
* @param form surface form of the morpheme
* @param tag part-of-speech tag of the morpheme
* @param space spacing behaviour for this morpheme; defaults to Space::none
*/
void add(const std::u16string& form, POSTag tag, Space space = Space::none);
/**
* @brief Appends a morpheme given as a C-style UTF-16 string.
* @param form surface form of the morpheme (C string)
* @param tag part-of-speech tag of the morpheme
* @param space spacing behaviour for this morpheme; defaults to Space::none
*/
void add(const char16_t* form, POSTag tag, Space space = Space::none);
/**
* @brief Returns the joined result as a UTF-16 string.
* @param rangesOut optional output (may be nullptr, the default);
*                  NOTE(review): presumably receives one (begin, end) range per added morpheme - confirm
* @return the joined text in UTF-16
*/
std::u16string getU16(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
/**
* @brief Returns the joined result as a UTF-8 string.
* @param rangesOut optional output (may be nullptr, the default);
*                  NOTE(review): presumably receives one (begin, end) range per added morpheme - confirm
* @return the joined text in UTF-8
*/
std::string getU8(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
};
/**
* @brief
*
*
* .
*
* @tparam LmState
*/
template<class LmState>
struct Candidate
{
Joiner joiner; /**< 형태소 결합기 */
LmState lmState; /**< 언어 모델 상태 */
float score = 0; /**< 현재까지의 누적 점수 */
Joiner joiner;
LmState lmState;
float score = 0;
Candidate(const CompiledRule& _cr, const lm::ILangModel* lm)
: joiner{ _cr }, lmState{ lm }
@ -115,17 +62,10 @@ namespace kiwi
}
};
/**
* @brief VoidLangModel을 Candidate
*
* .
*
* @tparam arch
*/
template<ArchType arch>
struct Candidate<lm::VoidState<arch>>
{
Joiner joiner; /**< 형태소 결합기 */
Joiner joiner;
Candidate(const CompiledRule& _cr, const lm::ILangModel* lm)
: joiner{ _cr }
@ -133,12 +73,6 @@ namespace kiwi
}
};
/**
* @brief
*
* 릿 (type erasure) .
* Candidate를 .
*/
class ErasedVector
{
using FnDestruct = void(*)(ErasedVector*);
@ -227,13 +161,6 @@ namespace kiwi
}
};
/**
* @brief
*
*
* .
* .
*/
class AutoJoiner
{
friend class kiwi::Kiwi;
@ -274,51 +201,12 @@ namespace kiwi
AutoJoiner& operator=(const AutoJoiner&);
AutoJoiner& operator=(AutoJoiner&&);
/**
* @brief Appends a morpheme identified by its ID.
* @param morphemeId ID of the morpheme to append
* @param space spacing behaviour for this morpheme; defaults to Space::none
*/
void add(size_t morphemeId, Space space = Space::none);
/**
* @brief Appends a morpheme given as a UTF-16 string view.
* @param form surface form of the morpheme
* @param tag part-of-speech tag of the morpheme
* @param space spacing behaviour for this morpheme; defaults to Space::none
*/
void add(U16StringView form, POSTag tag, Space space = Space::none);
/**
* @brief Appends a morpheme given as a std::u16string.
* @param form surface form of the morpheme
* @param tag part-of-speech tag of the morpheme
* @param inferRegularity defaults to true;
*                        NOTE(review): presumably lets the joiner infer regular/irregular conjugation - confirm
* @param space spacing behaviour for this morpheme; defaults to Space::none
*/
void add(const std::u16string& form, POSTag tag, bool inferRegularity = true, Space space = Space::none);
/**
* @brief Appends a morpheme given as a C-style UTF-16 string.
* @param form surface form of the morpheme (C string)
* @param tag part-of-speech tag of the morpheme
* @param inferRegularity defaults to true;
*                        NOTE(review): presumably lets the joiner infer regular/irregular conjugation - confirm
* @param space spacing behaviour for this morpheme; defaults to Space::none
*/
void add(const char16_t* form, POSTag tag, bool inferRegularity = true, Space space = Space::none);
/**
* @brief Returns the joined result as a UTF-16 string.
* @param rangesOut optional output (may be nullptr, the default);
*                  NOTE(review): presumably receives one (begin, end) range per added morpheme - confirm
* @return the joined text in UTF-16
*/
std::u16string getU16(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
/**
* @brief Returns the joined result as a UTF-8 string.
* @param rangesOut optional output (may be nullptr, the default);
*                  NOTE(review): presumably receives one (begin, end) range per added morpheme - confirm
* @return the joined text in UTF-8
*/
std::string getU8(std::vector<std::pair<uint32_t, uint32_t>>* rangesOut = nullptr) const;
};
}

View file

@ -2,8 +2,8 @@
* @file Kiwi.h
* @author bab2min (bab2min@gmail.com)
* @brief Kiwi C++ API를
* @version 0.22.1
* @date 2025-11-21
* @version 0.23.1
* @date 2026-04-05
*
*
*/
@ -68,21 +68,69 @@ namespace kiwi
struct AnalyzeOption
{
/**
* @brief
*/
Match match = Match::allWithNormalizing;
/**
* @brief
*/
const std::unordered_set<const Morpheme*>* blocklist = nullptr;
/**
* @brief
*/
bool openEnding = false;
/**
* @brief . `Dialect::standard`. OR .
*/
Dialect allowedDialects = Dialect::standard;
/**
* @brief .
*/
float dialectCost = 3.f;
/**
* @brief PreparedTypoTransformer
* TypoTransformer::prepare(true) .
* allowedDialects에 , typoTransformer가 nullptr인 getDefaultPreparedTypoSet(DefaultTypoSet::dialect) PreparedTypoTransformer가 .
*/
const PreparedTypoTransformer* typoTransformer = nullptr;
/**
* @brief
*/
float typoThreshold = 2.5f;
AnalyzeOption() = default;
AnalyzeOption(Match m,
const std::unordered_set<const Morpheme*>* bl = nullptr,
bool oe = false,
Dialect ad = Dialect::standard,
float dc = 3.f
)
: match{ m }, blocklist{ bl }, openEnding{ oe }, allowedDialects{ ad }, dialectCost{ dc }
float dc = 3.f,
const PreparedTypoTransformer* tt = nullptr,
float tth = 2.5f
)
: match{ m }, blocklist{ bl }, openEnding{ oe }, allowedDialects{ ad }, dialectCost{ dc }, typoTransformer{ tt }, typoThreshold{ tth }
{}
/**
 * @brief Returns a copy of these options with only `match` replaced.
 * @param m the match options to store in the copy
 * @return the modified copy; this object itself is left unchanged
 */
AnalyzeOption withMatch(Match m) const
{
    auto updated = *this;
    updated.match = m;
    return updated;
}
/**
 * @brief Returns a copy of these options with the typo settings replaced.
 * @param tt the prepared typo transformer to store in the copy (may be nullptr)
 * @param typoThreshold the typo threshold to store in the copy; defaults to 2.5
 * @return the modified copy; this object itself is left unchanged
 */
AnalyzeOption withTypoTransformer(const PreparedTypoTransformer* tt, float typoThreshold = 2.5f) const
{
    auto updated = *this;
    updated.typoThreshold = typoThreshold; // the parameter shadows the member of the same name
    updated.typoTransformer = tt;
    return updated;
}
};
struct MorphemeDef
@ -103,12 +151,19 @@ namespace kiwi
{
bool integrateAllomorph = true;
float cutOffThreshold = 8;
float unkFormScoreScale = 5;
float unkFormScoreBias = 5;
float oovRuleScale = 4;
float oovRuleBias = 4;
float oovChrBias = 0;
float oovGlobalWeight = 35;
float oovLocalWeight = 3;
float oovGlobalMinFreq = 4;
float spacePenalty = 7;
float typoCostWeight = 6;
uint32_t maxUnkFormSize = 6;
uint32_t maxUnkFormSizeFollowedByJClass = (uint32_t)-1;
uint32_t spaceTolerance = 0;
float oovCutOffThreshold = 5;
float oovTotalSmoothness = 0.1f;
void validate() const;
};
@ -121,7 +176,7 @@ namespace kiwi
{
friend class KiwiBuilder;
template<class LangModel> friend struct BestPathFinder;
template<class LmState, class> friend struct PathEvaluator;
template<class WordLL, class> friend struct PathEvaluator;
template<class LmState> friend struct MorphemeEvaluator;
friend class cmb::AutoJoiner;
template<template<ArchType> class LmState> friend struct NewAutoJoinerGetter;
@ -141,6 +196,7 @@ namespace kiwi
Vector<TypoForm> typoForms;
utils::FrozenTrie<kchar_t, const Form*> formTrie;
std::shared_ptr<lm::ILangModel> langMdl;
std::shared_ptr<lm::CoNgramModelBase> nounChrMdl;
std::shared_ptr<cmb::CompiledRule> combiningRule;
std::unique_ptr<utils::ThreadPool> pool;
@ -232,7 +288,7 @@ namespace kiwi
const std::optional<KiwiConfig>& overrideConfig = {}
) const
{
return analyze(str, 1, option, pretokenized)[0];
return analyze(str, 1, option, pretokenized, overrideConfig)[0];
}
/**
@ -249,7 +305,7 @@ namespace kiwi
{
std::vector<size_t> bytePositions;
auto u16str = utf8To16(str, bytePositions);
return analyze(u16str, option, mapPretokenizedSpansToU16(pretokenized, bytePositions));
return analyze(u16str, option, mapPretokenizedSpansToU16(pretokenized, bytePositions), overrideConfig);
}
/**
@ -280,7 +336,7 @@ namespace kiwi
{
std::vector<size_t> bytePositions;
auto u16str = utf8To16(str, bytePositions);
return analyze(u16str, topN, option, mapPretokenizedSpansToU16(pretokenized, bytePositions));
return analyze(u16str, topN, option, mapPretokenizedSpansToU16(pretokenized, bytePositions), overrideConfig);
}
/**
@ -537,6 +593,7 @@ namespace kiwi
Vector<MorphemeRaw> morphemes;
UnorderedMap<KString, size_t> formMap;
std::shared_ptr<lm::ILangModel> langMdl;
std::shared_ptr<lm::CoNgramModelBase> nounChrMdl;
std::shared_ptr<cmb::CompiledRule> combiningRule;
WordDetector detector;
Map<int, int> ruleProfilingCnt;

View file

@ -1,16 +1,4 @@
/**
* @file Knlm.h
* @author bab2min (bab2min@gmail.com)
* @brief Kneser-Ney
* @version 0.22.1
* @date 2025-11-21
*
* Kneser-Ney N-gram .
* 퀀 .
* .
*/
#pragma once
#pragma once
#include "LangModel.h"
@ -18,55 +6,23 @@ namespace kiwi
{
namespace lm
{
/**
* @brief Kneser-Ney
*
* .
*/
struct KnLangModelHeader
{
uint64_t num_nodes; /**< 노드의 총 개수 */
uint64_t node_offset; /**< 노드 데이터의 시작 오프셋 */
uint64_t key_offset; /**< 키 데이터의 시작 오프셋 */
uint64_t ll_offset; /**< 로그 우도(log-likelihood) 데이터의 시작 오프셋 */
uint64_t gamma_offset; /**< 감마(백오프 가중치) 데이터의 시작 오프셋 */
uint64_t qtable_offset; /**< 양자화 테이블의 시작 오프셋 */
uint64_t htx_offset; /**< 히스토리 변환 데이터의 시작 오프셋 */
uint64_t unk_id; /**< 미등록어(unknown) ID */
uint64_t bos_id; /**< 문장 시작(beginning of sentence) ID */
uint64_t eos_id; /**< 문장 종료(end of sentence) ID */
uint64_t vocab_size; /**< 어휘 크기 */
uint8_t order; /**< N-gram 차수 */
uint8_t key_size; /**< 키의 크기 (바이트) */
uint8_t diff_size; /**< diff 값의 크기 (바이트) */
uint8_t quantized; /**< 양자화 여부 */
uint32_t extra_buf_size; /**< 추가 버퍼 크기 */
uint64_t num_nodes, node_offset, key_offset, ll_offset, gamma_offset, qtable_offset, htx_offset;
uint64_t unk_id, bos_id, eos_id, vocab_size;
uint8_t order, key_size, diff_size, quantized;
uint32_t extra_buf_size;
};
/**
* @brief Kneser-Ney
*
* N-gram을 .
*
* @tparam KeyType ( )
* @tparam DiffType diff
*/
template<class KeyType, class DiffType = int32_t>
struct KnLangModelNode
{
KeyType num_nexts = 0; /**< 다음 노드의 개수 */
DiffType lower = 0; /**< 하위 노드로의 오프셋 */
uint32_t next_offset = 0; /**< 다음 노드들의 시작 오프셋 */
float ll = 0; /**< 로그 우도 */
float gamma = 0; /**< 백오프 가중치 */
KeyType num_nexts = 0;
DiffType lower = 0;
uint32_t next_offset = 0;
float ll = 0, gamma = 0;
};
/**
* @brief Kneser-Ney
*
* Kneser-Ney .
* N-gram .
*/
class KnLangModelBase : public ILangModel
{
protected:
@ -88,34 +44,13 @@ namespace kiwi
size_t vocabSize() const override { return getHeader().vocab_size; }
size_t getMemorySize() const override { return base.size(); }
/**
* @brief Gives read-only access to the model header.
*
* The header is not stored separately: the start of the memory held by `base`
* is reinterpreted as a KnLangModelHeader.
* @return const reference to the header located at the beginning of `base`
*/
const KnLangModelHeader& getHeader() const { return *reinterpret_cast<const KnLangModelHeader*>(base.get()); }
/**
* @brief Returns the index of the lower node of a given node (pure virtual).
* @param node_idx index of the node to query
* @return index of the lower node;
*         NOTE(review): presumably the back-off (shorter-context) node - confirm
*/
virtual ptrdiff_t getLowerNode(ptrdiff_t node_idx) const = 0;
virtual size_t nonLeafNodeSize() const = 0;
/**
* @brief Returns a pointer to the extra buffer of the model (pure virtual).
* @return untyped pointer to the extra buffer;
*         NOTE(review): its size presumably is the header's extra_buf_size - confirm
*/
virtual const void* getExtraBuf() const = 0;
/**
* @brief Creates a Kneser-Ney language model instance from a memory object.
* @param mem model data; taken by rvalue reference
* @param archType architecture selector; defaults to ArchType::none
* @param transposed defaults to false;
*                   NOTE(review): presumably selects a transposed internal layout - confirm
* @return owning pointer to the created model
*/
static std::unique_ptr<KnLangModelBase> create(utils::MemoryObject&& mem, ArchType archType = ArchType::none, bool transposed = false);
template<class VocabTy, class Trie, class HistoryTx = std::vector<VocabTy>>
@ -129,30 +64,14 @@ namespace kiwi
size_t extra_buf_size = 0
);
/**
* @brief Gives read-only access to the memory object that backs this model.
* @return const reference to the `base` member
*/
const utils::MemoryObject& getMemory() const { return base; }
/**
* @brief Advances the model state by one token; forwards directly to _progress().
* @param node_idx current node index, passed by non-const reference so the
*                 callee can update it to the node reached after consuming `next`
* @param next the next token
* @return the float produced by _progress();
*         NOTE(review): presumably the log-likelihood of `next` - confirm
*/
template<class Ty>
float progress(ptrdiff_t& node_idx, Ty next) const
{
return _progress(node_idx, next);
}
/**
* @brief 퀀 .
* @param in_first 퀀
* @param in_last 퀀
* @param out_first
*/
template<class InTy, class OutTy>
void evaluate(InTy in_first, InTy in_last, OutTy out_first) const
{
@ -177,13 +96,6 @@ namespace kiwi
}
}
/**
* @brief 퀀 .
* @param in_first 퀀
* @param in_last 퀀
* @param min_score
* @return
*/
template<class InTy>
float sum(InTy in_first, InTy in_last, float min_score = -100) const
{
@ -196,12 +108,6 @@ namespace kiwi
return ret;
}
/**
* @brief .
* @param in_first 퀀
* @param in_last 퀀
* @return
*/
template<class InTy>
std::vector<float> getNextLL(InTy in_first, InTy in_last) const
{

View file

@ -1,14 +1,3 @@
/**
* @file LangModel.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* .
* 퀀 .
*/
#pragma once
#include <array>
@ -27,61 +16,21 @@ namespace kiwi
{
namespace lm
{
/**
* @brief Abstract interface shared by all language models.
*
* Every member function is pure virtual; the destructor is virtual, so
* instances can be destroyed through an ILangModel pointer.
*/
class ILangModel
{
public:
virtual ~ILangModel() = default;
/**
* @brief Reports which kind of model this is.
* @return the model type as a ModelType value
*/
virtual ModelType getType() const = 0;
/**
* @brief Reports the size of the vocabulary.
* @return number of entries in the vocabulary
*/
virtual size_t vocabSize() const = 0;
/**
* @brief Reports how much memory the model uses.
* @return memory size; NOTE(review): presumably in bytes - confirm
*/
virtual size_t getMemorySize() const = 0;
/**
* @brief Returns the best-path search function for this model as an untyped pointer.
* @return void pointer; NOTE(review): the caller presumably casts it to the
*         concrete function pointer type, which is not visible here - confirm
*/
virtual void* getFindBestPathFn() const = 0;
/**
* @brief Returns the function that creates a new Joiner for this model as an untyped pointer.
* @return void pointer; NOTE(review): the caller presumably casts it to the
*         concrete function pointer type, which is not visible here - confirm
*/
virtual void* getNewJoinerFn() const = 0;
};
/**
* @brief 릿
*
* CRTP(Curiously Recurring Template Pattern)
* .
*
* @tparam DerivedLM
*/
template<class DerivedLM>
struct LmStateBase
{
/**
* @brief .
* @param langMdl
* @param nextToken
* @return
*/
float next(const ILangModel* langMdl, typename DerivedLM::VocabType nextToken)
{
using LmStateType = typename DerivedLM::LmStateType;
@ -92,14 +41,6 @@ namespace kiwi
template<ArchType arch>
class VoidLangModel;
/**
* @brief VoidLangModel의
*
* .
* 0 .
*
* @tparam arch
*/
template<ArchType arch>
struct VoidState : public LmStateBase<VoidLangModel<arch>>
{
@ -114,14 +55,6 @@ namespace kiwi
}
};
/**
* @brief
*
* .
* 0 .
*
* @tparam arch
*/
template<ArchType arch>
class VoidLangModel : public ILangModel
{

View file

@ -4,7 +4,7 @@
#define KIWI_STR(x) KIWI_STR_HELPER(x)
#define KIWI_VERSION_MAJOR 0
#define KIWI_VERSION_MINOR 22
#define KIWI_VERSION_MINOR 23
#define KIWI_VERSION_PATCH 2
#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH)

View file

@ -1,15 +1,4 @@
/**
* @file Mmap.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* .
* , .
*/
#pragma once
#pragma once
#include <string>
#include <iostream>
#include <fstream>
@ -23,9 +12,6 @@ namespace kiwi
{
namespace detail
{
/**
* @brief Windows RAII
*/
class HandleGuard
{
HANDLE handle = nullptr;

View file

@ -1,14 +1,3 @@
/**
* @file PatternMatcher.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* URL, , ,
* .
*/
#pragma once
#include <vector>
@ -18,11 +7,6 @@
namespace kiwi
{
/**
* @brief
*
* .
*/
enum class Match : size_t
{
none = 0,
@ -32,6 +16,14 @@ namespace kiwi
mention = 1 << 3, /**< 멘션 형태의 텍스트(@멘션)를 w_mention 태그에 매칭한다 */
serial = 1 << 4, /**< 일련 번호 형태의 텍스트를 w_serial 태그에 매칭한다 */
emoji = 1 << 5, /**< 이모지 문자를 w_emoji 태그에 매칭한다 */
oovRuleOnly = 0 << 8, /**< 사전에 없는 단어에 대해 규칙만을 사용하여 매칭한다 */
oovChrModel = 1 << 8, /**< 사전에 없는 단어에 대해 문자 기반 OOV 모델을 사용하여 매칭한다 */
oovChrFreqModel = 2 << 8, /**< 사전에 없는 단어에 대해 문자 빈도 기반 OOV 모델을 사용하여 매칭한다 */
oovChrFreqBranchModel = 3 << 8, /**< 사전에 없는 단어에 대해 문자 빈도 및 브랜치 기반 OOV 모델을 사용하여 매칭한다 */
oovMask = 3 << 8, /**< OOV 옵션들에 대한 마스크 */
oovTotalConsistency = 1 << 10, /**< */
normalizeCoda = 1 << 16, /**< 초성체가 앞 어절의 받침에 따라붙은 경우를 정규화하여 매칭한다 */
joinNounPrefix = 1 << 17, /**< 체언접두사(XPN)를 분리하지 않고 합쳐서 매칭한다 */
joinNounSuffix = 1 << 18, /**< 명사파생접미사(XSN)를 분리하지 않고 합쳐서 매칭한다 */
@ -43,20 +35,16 @@ namespace kiwi
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
splitSaisiot = 1 << 25, /**< 사이시옷이 포함된 합성명사를 분리하여 매칭한다. */
mergeSaisiot = 1 << 26, /**< 사이시옷이 포함된 것으로 추정되는 명사를 결합하여 매칭한다. */
joinVSuffix = joinVerbSuffix | joinAdjSuffix, /**< 용언 파생접미사 결합 */
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix, /**< 모든 접사 결합 */
all = url | email | hashtag | mention | serial | emoji | zCoda, /**< 모든 웹 패턴 매칭 */
allWithNormalizing = all | normalizeCoda, /**< 모든 패턴과 정규화 */
joinParticleYo = 1 << 27, /**< 어미(EC/EF)와 조사 "요/JX"를 통합하여 매칭한다 (예: 고/EC + 요/JX => 고요/EC) */
useOldSplitter = 1 << 30,
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
all = url | email | hashtag | mention | serial | emoji | zCoda,
allWithNormalizing = all | normalizeCoda,
};
/**
* @brief .
* @param left ()
* @param first
* @param last
* @param matchOptions
* @return ( , )
*/
std::pair<size_t, kiwi::POSTag> matchPattern(char16_t left, const char16_t* first, const char16_t* last, Match matchOptions);
}

View file

@ -1,15 +1,3 @@
/**
* @file SkipBigramModel.h
* @author bab2min (bab2min@gmail.com)
* @brief Skip-bigram
* @version 0.22.1
* @date 2025-11-21
*
* Skip-bigram은 .
* bigram이 ,
* .
*/
#pragma once
#include "Knlm.h"
@ -18,25 +6,12 @@ namespace kiwi
{
namespace lm
{
/**
* @brief Skip-bigram
*/
struct SkipBigramModelHeader
{
uint64_t vocabSize; /**< 어휘 크기 */
uint8_t keySize; /**< 키의 크기 */
uint8_t windowSize; /**< 윈도우 크기 (skip 거리) */
uint8_t compressed; /**< 압축 여부 */
uint8_t quantize; /**< 양자화 비트 수 */
uint8_t _rsv[4]; /**< 예약 필드 */
uint64_t vocabSize;
uint8_t keySize, windowSize, compressed, quantize, _rsv[4];
};
/**
* @brief Skip-bigram
*
* .
* .
*/
class SkipBigramModelBase : public ILangModel
{
protected:
@ -50,19 +25,8 @@ namespace kiwi
size_t vocabSize() const override { return getHeader().vocabSize; }
ModelType getType() const override { return ModelType::sbg; }
/**
* @brief Gives read-only access to the model header.
*
* The start of the memory held by `base` is reinterpreted as a SkipBigramModelHeader.
* @return const reference to the header located at the beginning of `base`
*/
const SkipBigramModelHeader& getHeader() const { return *reinterpret_cast<const SkipBigramModelHeader*>(base.get()); }
/**
* @brief Creates a skip-bigram model instance.
* @param knlmMem memory object with the Kneser-Ney model data; taken by rvalue reference
* @param sbgMem memory object with the skip-bigram model data; taken by rvalue reference
* @param archType architecture selector; defaults to ArchType::none
* @return owning pointer to the created skip-bigram model
*/
static std::unique_ptr<SkipBigramModelBase> create(utils::MemoryObject&& knlmMem, utils::MemoryObject&& sbgMem, ArchType archType = ArchType::none);
};
}

View file

@ -1,9 +1,9 @@
/**
* @file SwTokenizer.h
* @author bab2min (bab2min@gmail.com)
* @brief Subword Tokenizer
* @version 0.22.1
* @date 2025-11-21
* @brief Subword Tokenizer의
* @version 0.23.1
* @date 2026-04-05
*
*
*/

View file

@ -1,13 +1,3 @@
/**
* @file TagUtils.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* , , .
*/
#pragma once
#include <algorithm>
@ -15,107 +5,49 @@
namespace kiwi
{
/**
* @brief 퀀
*
*
* 퀀 .
* .
*/
class TagSequenceScorer
{
float leftBoundaryScores[2][(size_t)POSTag::max] = { { 0, }, };
public:
float weight; /**< 점수 가중치 */
float weight;
/**
* @brief TagSequenceScorer
* @param _weight (: 5)
*/
TagSequenceScorer(float _weight = 5);
/**
* @brief .
* @param hasLeftBoundary
* @param right
* @return
*/
float evalLeftBoundary(bool hasLeftBoundary, POSTag right) const
{
return leftBoundaryScores[hasLeftBoundary ? 1 : 0][(size_t)clearIrregular(right)] * weight;
}
};
/**
* @brief Tests whether a tag belongs to the noun class (defined elsewhere).
* @param tag the part-of-speech tag to test
* @return true for noun-class tags;
*         NOTE(review): the exact set of tags is decided by the out-of-line definition - confirm there
*/
bool isNounClass(POSTag tag);
/**
* @brief Tests whether a tag belongs to the verb class (defined elsewhere).
* @param tag the part-of-speech tag to test
* @return true for verb-class tags;
*         NOTE(review): the exact set of tags is decided by the out-of-line definition - confirm there
*/
bool isVerbClass(POSTag tag);
/**
 * @brief Tests whether a tag lies in the inclusive range [POSTag::ep, POSTag::etm].
 * @param tag the part-of-speech tag to test
 * @return true when POSTag::ep <= tag <= POSTag::etm
 */
inline bool isEClass(POSTag tag)
{
    return tag >= POSTag::ep && tag <= POSTag::etm;
}
/**
 * @brief Tests whether a tag lies in the inclusive range [POSTag::jks, POSTag::jc].
 * @param tag the part-of-speech tag to test
 * @return true when POSTag::jks <= tag <= POSTag::jc
 */
inline bool isJClass(POSTag tag)
{
    return tag >= POSTag::jks && tag <= POSTag::jc;
}
/**
 * @brief Tests whether a tag lies in the inclusive range [POSTag::nng, POSTag::nnb].
 * @param tag the part-of-speech tag to test
 * @return true when POSTag::nng <= tag <= POSTag::nnb
 */
inline bool isNNClass(POSTag tag)
{
    return tag >= POSTag::nng && tag <= POSTag::nnb;
}
/**
 * @brief Tests whether a tag, after clearIrregular() has been applied to it,
 *        lies in the inclusive range [POSTag::xsn, POSTag::xsm].
 * @param tag the part-of-speech tag to test
 * @return true when POSTag::xsn <= clearIrregular(tag) <= POSTag::xsm
 */
inline bool isSuffix(POSTag tag)
{
    const POSTag baseTag = clearIrregular(tag);
    return baseTag >= POSTag::xsn && baseTag <= POSTag::xsm;
}
/**
 * @brief Tests whether a tag lies in the inclusive range [POSTag::sf, POSTag::sn].
 * @param tag the part-of-speech tag to test
 * @return true when POSTag::sf <= tag <= POSTag::sn
 */
inline bool isSpecialClass(POSTag tag)
{
    return tag >= POSTag::sf && tag <= POSTag::sn;
}
/**
* @brief .
* @param tag
* @return true
*/
inline bool isUserClass(POSTag tag)
{
return POSTag::user0 <= tag && tag <= POSTag::user4;

View file

@ -1,18 +1,4 @@
/**
* @file ThreadPool.h
* @author bab2min (bab2min@gmail.com)
* @brief C++11 Thread Pool
* @version 0.22.1
* @date 2025-11-21
*
* A simple C++11 Thread Pool implementation(https://github.com/progschj/ThreadPool)
* modified by bab2min to have additional parameter threadId
*
* .
* , .
*/
#pragma once
#pragma once
/*
A simple C++11 Thread Pool implementation(https://github.com/progschj/ThreadPool)
@ -33,53 +19,18 @@ namespace kiwi
{
namespace utils
{
/**
* @brief
*
* ,
* .
*/
class ThreadPool
{
public:
/**
* @brief Constructs a thread pool.
* @param threads number of worker threads; defaults to 0.
*                NOTE(review): the meaning of 0 (e.g. "use hardware concurrency") is
*                decided by the out-of-line definition - confirm there
* @param maxQueued limit on queued tasks; defaults to 0.
*                  NOTE(review): 0 presumably means "no limit" - confirm
*/
ThreadPool(size_t threads = 0, size_t maxQueued = 0);
~ThreadPool();
/**
* @brief Submits a task to the pool.
*
* As the return type shows, the callable is invoked with an extra leading
* size_t argument (the thread ID) followed by `args`.
*
* @tparam F type of the callable
* @tparam Args types of the additional arguments
* @param f the callable to run; must be invocable as f(size_t, Args...)
* @param args arguments forwarded to the callable after the thread ID
* @return a future for the callable's result
*/
template<class F, class... Args>
auto enqueue(F&& f, Args&&... args)
->std::future<typename std::invoke_result<F, size_t, Args...>::type>;
/**
* @brief Reports the number of worker threads.
* @return the size of the `workers` container
*/
size_t size() const { return workers.size(); }
/**
* @brief Reports the number of tasks currently waiting in the queue.
* @return the size of the `tasks` container.
*         NOTE(review): no lock is taken here, so the value is presumably only a
*         snapshot when other threads are enqueueing or running tasks - confirm
*/
size_t numEnqueued() const { return tasks.size(); }
/**
* @brief Joins the worker threads (defined elsewhere).
* NOTE(review): whether the pool can be reused afterwards is not visible here - confirm.
*/
void joinAll();
private:
std::vector<std::thread> workers;

View file

@ -2,8 +2,8 @@
* @file Types.h
* @author bab2min (bab2min@gmail.com)
* @brief Kiwi C++ API에
* @version 0.22.1
* @date 2025-11-21
* @version 0.23.1
* @date 2026-04-05
*
*
*/
@ -31,23 +31,23 @@
#include "ScriptType.h"
#define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \
inline Type operator~(Type a)\
inline constexpr Type operator~(Type a)\
{\
return static_cast<Type>(~static_cast<typename std::underlying_type<Type>::type>(a));\
}\
inline bool operator!(Type a)\
inline constexpr bool operator!(Type a)\
{\
return a == static_cast<Type>(0);\
}\
inline Type operator|(Type a, Type b)\
inline constexpr Type operator|(Type a, Type b)\
{\
return static_cast<Type>(static_cast<typename std::underlying_type<Type>::type>(a) | static_cast<typename std::underlying_type<Type>::type>(b));\
}\
inline Type operator&(Type a, Type b)\
inline constexpr Type operator&(Type a, Type b)\
{\
return static_cast<Type>(static_cast<typename std::underlying_type<Type>::type>(a) & static_cast<typename std::underlying_type<Type>::type>(b));\
}\
inline Type operator^(Type a, Type b)\
inline constexpr Type operator^(Type a, Type b)\
{\
return static_cast<Type>(static_cast<typename std::underlying_type<Type>::type>(a) ^ static_cast<typename std::underlying_type<Type>::type>(b));\
}\
@ -267,7 +267,9 @@ namespace kiwi
non_vowel, /**< `vowel`의 부정 */
non_vocalic, /**< `vocalic`의 부정 */
non_vocalic_h, /**< `vocalic_h`의 부정 */
applosive = 8, /**< 불파음 받침(ㄴㄹㅁㅇ을 제외한 모든 받침)*/ // not necessary, but fixed MSVC's weird bug
applosive, /**< 오타 교정용: 불파음 받침(ㄴㄹㅁㅇ을 제외한 모든 받침) */
continual, /**< 오타 교정용: 연철 환경임을 표시 */
boundary, /**< 오타 교정용: 형태소 경계임을 표시 */
};
/**
@ -349,7 +351,7 @@ namespace kiwi
uint16_t length = 0; /**< 길이(UTF16 문자 기준) */
POSTag tag = POSTag::unknown; /**< 품사 태그 */
union {
uint8_t senseId = 0; /**< 의미 번호 */
uint8_t senseId = 0; /**< 의미 번호 (OOV인 경우 -1)*/
ScriptType script; /**< 유니코드 영역에 기반한 문자 타입 */
};
float score = 0; /**< 해당 형태소의 언어모델 점수 */
@ -358,7 +360,7 @@ namespace kiwi
uint32_t pairedToken = -1; /**< SSO, SSC 태그에 속하는 형태소의 경우 쌍을 이루는 반대쪽 형태소의 위치(-1인 경우 해당하는 형태소가 없는 것을 뜻함) */
uint32_t subSentPosition = 0; /**< 인용부호나 괄호로 둘러싸인 하위 문장의 번호. 1부터 시작. 0인 경우 하위 문장이 아님을 뜻함 */
Dialect dialect = Dialect::standard; /**< 방언 정보 */
const Morpheme* morph = nullptr; /**< 기타 형태소 정보에 대한 포인터 (OOV인 경우 nullptr) */
const Morpheme* morph = nullptr; /**< 기타 형태소 정보에 대한 포인터 */
TokenInfo() = default;
@ -384,6 +386,8 @@ namespace kiwi
}
/** @brief End position of the token: the sum of `position` and `length`. */
uint32_t endPos() const
{
    return position + length;
}
/**
 * @brief Tells whether this token is out-of-vocabulary.
 * @return true when `senseId` equals the sentinel (uint8_t)(-1), i.e. 255
 */
bool isOOV() const
{
    return senseId == static_cast<uint8_t>(-1);
}
};
struct BasicToken

View file

@ -2,8 +2,8 @@
* @file TypoTransformer.h
* @author bab2min (bab2min@gmail.com)
* @brief TypoTransformer .
* @version 0.22.1
* @date 2025-11-21
* @version 0.23.1
* @date 2026-04-05
*
*
*/
@ -127,6 +127,34 @@ namespace kiwi
class KiwiBuilder;
class TypoTransformer;
/**
 * @brief One node of the graph written into the `graphOut` vector of
 *        PreparedTypoTransformer::generateGraph().
 *
 * Links to other nodes are stored as element offsets relative to this node,
 * so the accessors below are only meaningful while the node lives inside a
 * contiguous array of TypoGraphNode. An offset of 0 means "no link".
 */
struct TypoGraphNode
{
    U16StringView form;                   // form carried by this node
    uint32_t endPos = 0;                  // end position (NOTE(review): presumably in the normalized input string - confirm)
    float typoCost = 0;                   // typo cost attached to this node
    uint32_t prevOffset = 0;              // distance back to the previous node, 0 if none
    uint32_t siblingOffset = 0;           // distance forward to the sibling node, 0 if none
    uint8_t continualTypoIdx = 0;
    Dialect dialect = Dialect::standard;

    TypoGraphNode(U16StringView _form = {}, uint32_t _endPos = 0, float _typoCost = 0, uint32_t _prevOffset = 0, uint32_t _siblingOffset = 0)
        : form{ _form }, endPos{ _endPos }, typoCost{ _typoCost }, prevOffset{ _prevOffset }, siblingOffset{ _siblingOffset }
    {
    }

    /** @return the node `prevOffset` elements before this one, or nullptr when prevOffset is 0 */
    const TypoGraphNode* getPrev() const
    {
        return prevOffset ? this - prevOffset : nullptr;
    }

    /** @return the node `siblingOffset` elements after this one, or nullptr when siblingOffset is 0 */
    const TypoGraphNode* getSibling() const
    {
        return siblingOffset ? this + siblingOffset : nullptr;
    }
};
/**
* @brief . kiwi::TypoTransformer::prepare() .
*/
@ -191,7 +219,7 @@ namespace kiwi
public:
PreparedTypoTransformer();
PreparedTypoTransformer(const TypoTransformer& tt);
PreparedTypoTransformer(const TypoTransformer& tt, bool inverse = false);
~PreparedTypoTransformer();
PreparedTypoTransformer(const PreparedTypoTransformer&) = delete;
PreparedTypoTransformer(PreparedTypoTransformer&&) noexcept;
@ -217,6 +245,13 @@ namespace kiwi
* @param costThreshold
*/
TypoCandidates<true> generate(const std::u16string& orig, float costThreshold = 2.5f) const;
template<class Alloc>
size_t generateGraph(U16StringView normalizedStr, std::vector<TypoGraphNode, Alloc>& graphOut,
Dialect allowedDialect = Dialect::standard,
const std::pair<uint32_t, uint32_t>* pretokenizedFirst = nullptr,
const std::pair<uint32_t, uint32_t>* pretokenizedLast = nullptr,
size_t* maxContinualTypoIdxOut = nullptr) const;
};
/**
@ -389,10 +424,12 @@ namespace kiwi
/**
* @brief TypoTransformer를 PreparedTypoTransformer를 .
* PreparedTypoTransformer는 kiwi::KiwiBuilder에 .
*
* @param inverse false일 , true일 . false입니다.
*/
PreparedTypoTransformer prepare() const
PreparedTypoTransformer prepare(bool inverse = false) const
{
return { *this };
return { *this, inverse };
}
};
@ -413,4 +450,6 @@ namespace kiwi
* @param set
*/
const TypoTransformer& getDefaultTypoSet(DefaultTypoSet set);
const PreparedTypoTransformer* getDefaultPreparedTypoSet(DefaultTypoSet set);
}

View file

@ -1,15 +1,4 @@
/**
* @file Utils.h
* @author bab2min (bab2min@gmail.com)
* @brief
* @version 0.22.1
* @date 2025-11-21
*
* UTF-8/UTF-16 , ,
* .
*/
#pragma once
#pragma once
#include <iostream>
#include <string>
#include <memory>
@ -41,88 +30,25 @@ namespace kiwi
return std::unique_ptr<T>(new typename std::remove_extent<T>::type[size]);
}
/**
* @brief UTF-8 UTF-16 .
* @param str UTF-8
* @return UTF-16
*/
std::u16string utf8To16(const std::string& str);
/**
* @brief UTF-8 UTF-16 .
* @param str UTF-8
* @param bytePositions UTF-8
* @return UTF-16
*/
std::u16string utf8To16(const std::string& str, std::vector<size_t>& bytePositions);
/**
* @brief UTF-8 .
* @param code
* @return UTF-8
*/
std::string utf8FromCode(char32_t code);
size_t utf8FromCode(std::string& ret, char32_t code);
/**
* @brief UTF-16 UTF-8 .
* @param str UTF-16
* @return UTF-8
*/
std::string utf16To8(const std::u16string& str);
/**
* @brief .
* @param hangul
* @return
*/
KString normalizeHangul(const std::u16string& hangul);
/**
 * @brief Tells whether a POS tag lies in the block of web-related tags.
 * @param t tag to test
 * @return true when t is between POSTag::w_url and POSTag::w_emoji, both inclusive
 */
inline bool isWebTag(POSTag t)
{
    const bool atOrAfterFirst = POSTag::w_url <= t;
    const bool atOrBeforeLast = t <= POSTag::w_emoji;
    return atOrAfterFirst && atOrBeforeLast;
}
/**
* @brief .
* @param tagStr
* @return
*/
POSTag toPOSTag(const std::u16string& tagStr);
/**
* @brief .
* @param t
* @return
*/
const char* tagToString(POSTag t);
/**
* @brief .
* @param t
* @return
*/
const kchar_t* tagToKString(POSTag t);
const char* tagRToString(char16_t form, POSTag t);
const kchar_t* tagRToKString(char16_t form, POSTag t);
/**
* @brief .
* @tparam A
* @tparam B
* @tparam C
* @param value
* @param lower ()
* @param upper ()
* @return lower <= value < upper이면 true
*/
template<class A, class B, class C>
inline bool within(A value, B lower, C upper)
{
@ -135,82 +61,41 @@ namespace kiwi
return cont.data() <= value && value < cont.data() + cont.size();
}
/**
 * @brief Tests whether a UTF-16 code unit is a precomposed Hangul syllable.
 * @param chr code unit to test
 * @return result of within(chr, 0xAC00, 0xD7A4); assuming within() checks
 *         lower <= value < upper, that is the range U+AC00..U+D7A3
 */
inline bool isHangulSyllable(char16_t chr)
{
    constexpr int syllableBegin = 0xAC00;
    constexpr int syllableEnd = 0xD7A4; // exclusive bound
    return within(chr, syllableBegin, syllableEnd);
}
/**
 * @brief Tests whether a UTF-16 code unit is one of the 19 conjoining
 *        initial consonants starting at U+1100.
 * @param chr code unit to test
 * @return result of within(chr, 0x1100, 0x1100 + 19)
 */
inline bool isHangulOnset(char16_t chr)
{
    constexpr int onsetBegin = 0x1100;
    constexpr int onsetCount = 19;
    return within(chr, onsetBegin, onsetBegin + onsetCount);
}
/**
 * @brief Tests whether a UTF-16 code unit is one of the 27 conjoining
 *        final consonants starting at U+11A8.
 * @param chr code unit to test
 * @return result of within(chr, 0x11A8, 0x11A8 + 27)
 */
inline bool isHangulCoda(char16_t chr)
{
    constexpr int codaBegin = 0x11A8;
    constexpr int codaCount = 27;
    return within(chr, codaBegin, codaBegin + codaCount);
}
/**
 * @brief Tests whether a UTF-16 code unit falls in the vowel range that
 *        starts at U+314F (compatibility jamo block).
 * @param chr code unit to test
 * @return result of within(chr, 0x314F, 0x3164)
 */
inline bool isHangulVowel(char16_t chr)
{
    constexpr int vowelBegin = 0x314F;
    constexpr int vowelEnd = 0x3164; // exclusive bound, assuming within() is half-open
    return within(chr, vowelBegin, vowelEnd);
}
/**
 * @brief Builds the coda-less precomposed Hangul syllable for an
 *        onset/vowel index pair.
 * @param onset index of the initial consonant
 * @param vowel index of the vowel
 * @return 0xAC00 + (onset * 21 + vowel) * 28, truncated to char16_t
 */
inline char16_t joinOnsetVowel(size_t onset, size_t vowel)
{
    // 21 vowels per onset, 28 coda slots per onset+vowel pair.
    const size_t pairIndex = onset * 21 + vowel;
    const char16_t syllableOffset = (char16_t)(pairIndex * 28);
    return 0xAC00 + syllableOffset;
}
/**
 * @brief Extracts the vowel index from a precomposed Hangul syllable.
 * @param chr code unit, expected to be a precomposed syllable (U+AC00 or above)
 * @return ((chr - 0xAC00) / 28) % 21
 */
inline int extractVowel(char16_t chr)
{
    const int syllableOffset = chr - 0xAC00;
    const int onsetVowelIndex = syllableOffset / 28; // drops the coda slot
    return onsetVowelIndex % 21;
}
/**
 * @brief Tests whether a UTF-16 code unit lies in one of the two
 *        initial-consonant ranges that also cover archaic jamo.
 * @param chr code unit to test
 * @return true when within(chr, 0x1100, 0x1160) or within(chr, 0xA960, 0xA980) holds
 */
inline bool isOldHangulOnset(char16_t chr)
{
    if (within(chr, 0x1100, 0x1160)) return true;
    return within(chr, 0xA960, 0xA980);
}
/**
* @brief .
* @param chr
* @return true
*/
inline bool isOldHangulVowel(char16_t chr)
{
return within(chr, 0x1160, 0x11A8) || within(chr, 0xD7B0, 0xD7CB);
@ -246,6 +131,39 @@ namespace kiwi
return os << utf16To8({ str.begin(), str.end() });
}
/**
 * @brief Appends a normalized copy of [first, last) to `ret`.
 *
 * Each precomposed syllable is written as its coda-less syllable followed,
 * when it has a coda, by the conjoining coda jamo (coda index + 0x11A7).
 * A conjoining vowel U+1161..U+1175 that directly follows an onset jamo
 * already at the end of `ret` is merged with it into a precomposed syllable.
 * Every other code unit is copied unchanged.
 *
 * @tparam Str output string type providing reserve/empty/back/push_back
 * @tparam It  iterator type yielding UTF-16 code units
 * @param ret   output string; new characters are appended
 * @param first start of the input range
 * @param last  end of the input range
 */
template<class Str, class It>
inline void normalizeHangul(Str& ret, It first, It last)
{
    ret.reserve((size_t)(std::distance(first, last) * 1.5));
    while (first != last)
    {
        char16_t ch = *first;
        ++first;
        if (ch == 0xB42C) ch = 0xB410; // force-correct U+B42C to U+B410

        if (isHangulSyllable(ch))
        {
            const int coda = (ch - 0xAC00) % 28;
            ret.push_back(ch - coda);
            if (coda != 0) ret.push_back(coda + 0x11A7);
            continue;
        }

        // Conjoining onset + vowel pairs that form a modern syllable are joined.
        const bool followsOnset = !ret.empty() && isHangulOnset(ret.back());
        if (followsOnset && 0x1161 <= ch && ch < 0x1176)
        {
            ret.back() = (char16_t)(0xAC00 + ((ret.back() - 0x1100) * 21 * 28) + ((ch - 0x1161) * 28));
        }
        else
        {
            ret.push_back(ch);
        }
    }
}
/**
* @brief Convenience overload: normalizes a whole string view into `ret`.
* @param ret output string, passed on to the iterator-based normalizeHangul
* @param sv  UTF-16 input; its begin()/end() range is forwarded
*/
template<class Str>
inline void normalizeHangul(Str& ret, std::u16string_view sv)
{
normalizeHangul(ret, sv.begin(), sv.end());
}
template<class It>
inline std::u16string joinHangul(It first, It last)
{

View file

@ -1,33 +1,15 @@
/**
* @file WordDetector.h
* @author bab2min (bab2min@gmail.com)
* @brief WordDetector
* @version 0.22.1
* @date 2025-11-21
*
* .
* (cohesion) (branching entropy) .
*/
#pragma once
#pragma once
#include <kiwi/Types.h>
namespace kiwi
{
/**
* @brief
*/
struct WordInfo
{
std::u16string form; /**< 단어의 표면형 */
float score; /**< 단어 점수 */
float lBranch; /**< 좌측 분기 엔트로피 */
float rBranch; /**< 우측 분기 엔트로피 */
float lCohesion; /**< 좌측 응집도 */
float rCohesion; /**< 우측 응집도 */
uint32_t freq; /**< 출현 빈도 */
std::map<POSTag, float> posScore; /**< 품사별 점수 */
std::u16string form;
float score, lBranch, rBranch, lCohesion, rCohesion;
uint32_t freq;
std::map<POSTag, float> posScore;
WordInfo(std::u16string _form = {},
float _score = 0, float _lBranch = 0, float _rBranch = 0,
@ -38,12 +20,6 @@ namespace kiwi
{}
};
/**
* @brief
*
* .
* .
*/
class WordDetector
{
struct Counter;
@ -62,59 +38,20 @@ namespace kiwi
std::map<POSTag, float> getPosScore(Counter&, const std::map<std::u16string, uint32_t>& cnt, std::map<std::u16string, uint32_t>::iterator it, bool coda, const std::u16string& realForm) const;
public:
/**
* @brief
*/
struct FromRawData {};
static constexpr FromRawData fromRawDataTag = {};
WordDetector() = default;
/**
* @brief WordDetector를 .
* @param modelPath
* @param _numThreads (-1 )
*/
WordDetector(const std::string& modelPath, size_t _numThreads = -1);
/**
* @brief WordDetector를 .
* @param tag FromRawData
* @param modelPath
* @param _numThreads (-1 )
*/
WordDetector(FromRawData, const std::string& modelPath, size_t _numThreads = -1);
/**
* @brief WordDetector를 .
* @param streamProvider
* @param _numThreads (-1 )
*/
WordDetector(const std::function<std::unique_ptr<std::istream>(const std::string&)>& streamProvider, size_t _numThreads = -1);
/**
* @brief Reports whether this WordDetector can be used.
* @return true when the `posScore` member is non-empty
*         (NOTE(review): presumably filled when a model is loaded - confirm)
*/
bool ready() const
{
return !posScore.empty();
}
/**
* @brief .
* @param modelPath
*/
void saveModel(const std::string& modelPath) const;
/**
* @brief .
* @param reader
* @param minCnt
* @param maxWordLen
* @param minScore
* @return
*/
std::vector<WordInfo> extractWords(const U16MultipleReader& reader, size_t minCnt = 10, size_t maxWordLen = 10, float minScore = 0.1f) const;
};

View file

@ -2,8 +2,8 @@
* @file capi.h
* @author bab2min (bab2min@gmail.com)
* @brief Kiwi C API를
* @version 0.22.1
* @date 2025-11-21
* @version 0.23.1
* @date 2026-04-05
*
*
*/
@ -35,6 +35,7 @@ typedef struct kiwi_joiner* kiwi_joiner_h;
typedef struct kiwi_typo* kiwi_typo_h;
typedef struct kiwi_morphset* kiwi_morphset_h;
typedef struct kiwi_pretokenized* kiwi_pretokenized_h;
typedef struct kiwi_prepared_typo* kiwi_prepared_typo_h;
typedef unsigned short kchar16_t;
typedef struct kiwi_swtokenizer* kiwi_swtokenizer_h;
@ -71,11 +72,16 @@ typedef struct {
typedef struct {
uint8_t integrate_allomorph; /**< 이형태 형태소의 통합 여부 */
float cut_off_threshold; /**< 분석 과정에서 이 값보다 더 크게 차이가 나는 후보들은 제거합니다. */
float unk_form_score_scale; /**< 미등재 형태 추출 시 사용하는 기울기 값 */
float unk_form_score_bias; /**< 미등재 형태 추출 시 사용하는 편향 값 */
float oov_rule_scale; /**< 미등재 형태 추출 시 사용하는 기울기 값 */
float oov_rule_bias; /**< 미등재 형태 추출 시 사용하는 편향 값 */
float oov_chr_bias; /**< 미등재 형태 추출 시 사용하는 문자 기반 점수의 편향 값 */
float oov_global_weight; /**< 미등재 형태 추출 시 사용하는 전역 빈도 가중치 */
float oov_local_weight; /**< 미등재 형태 추출 시 사용하는 국부 빈도 가중치 */
float oov_global_min_freq; /**< 미등재 형태 추출 시 사용하는 전역 최소 빈도 */
float space_penalty; /**< 공백 패널티 */
float typo_cost_weight; /**< 오타 비용의 가중치 */
uint32_t max_unk_form_size; /**< 미등재 형태의 최대 크기 */
uint32_t max_unk_form_size_followed_by_j_class; /**< (조사가 뒤따르는 경우) 미등재 형태의 최대 크기 */
uint32_t space_tolerance; /**< 공백 허용치 */
} kiwi_config_t;
@ -171,11 +177,18 @@ enum
enum
{
KIWI_MATCH_URL = 1,
KIWI_MATCH_EMAIL = 2,
KIWI_MATCH_HASHTAG = 4,
KIWI_MATCH_MENTION = 8,
KIWI_MATCH_SERIAL = 16,
KIWI_MATCH_URL = 1 << 0,
KIWI_MATCH_EMAIL = 1 << 1,
KIWI_MATCH_HASHTAG = 1 << 2,
KIWI_MATCH_MENTION = 1 << 3,
KIWI_MATCH_SERIAL = 1 << 4,
KIWI_MATCH_EMOJI = 1 << 5,
KIWI_MATCH_OOV_RULE_ONLY = 0 << 8,
KIWI_MATCH_OOV_CHR_MODEL = 1 << 8,
KIWI_MATCH_OOV_CHR_FREQ_MODEL = 2 << 8,
KIWI_MATCH_OOV_CHR_FREQ_BRANCH_MODEL = 3 << 8,
KIWI_MATCH_OOV_MASK = 3 << 8,
KIWI_MATCH_NORMALIZE_CODA = 1 << 16,
KIWI_MATCH_JOIN_NOUN_PREFIX = 1 << 17,
@ -190,8 +203,10 @@ enum
KIWI_MATCH_COMPATIBLE_JAMO = 1 << 24,
KIWI_MATCH_SPLIT_SAISIOT = 1 << 25,
KIWI_MATCH_MERGE_SAISIOT = 1 << 26,
KIWI_MATCH_JOIN_PARTICLE_YO = 1 << 27,
KIWI_MATCH_USE_OLD_SPLITTER = 1 << 30,
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_Z_CODA,
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_EMOJI | KIWI_MATCH_Z_CODA,
KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA,
};
@ -307,6 +322,33 @@ DECL_DLL int kiwi_builder_add_word(kiwi_builder_h handle, const char* word, cons
*/
DECL_DLL int kiwi_builder_add_alias_word(kiwi_builder_h handle, const char* alias, const char* pos, float score, const char* orig_word);
/**
* @brief . .
*
* @param handle KiwiBuilder의 .
* @param word (utf-8).
* @param pos (kiwi#POSTag).
* @param sense_id .
* @param dialect . KIWI_DIALECT_* .
* @param score .
* @return 0 .
*/
DECL_DLL int kiwi_builder_add_word_with_def(kiwi_builder_h handle, const char* word, const char* pos, int sense_id, int dialect, float score);
/**
* @brief . .
*
* @param handle KiwiBuilder의 .
* @param alias (utf-8)
* @param pos (kiwi#POSTag).
* @param sense_id .
* @param dialect . KIWI_DIALECT_* .
* @param score .
* @param orig_word (utf-8)
* @return 0 .
*/
DECL_DLL int kiwi_builder_add_alias_word_with_def(kiwi_builder_h handle, const char* alias, const char* pos, int sense_id, int dialect, float score, const char* orig_word);
/**
* @brief .
* .
@ -446,6 +488,7 @@ enum
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3,
KIWI_TYPO_LENGTHENING_TYPO_SET = 4,
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL_AND_LENGTHENING = 5,
KIWI_TYPO_DIALECT = 6,
};
/**
@ -526,15 +569,34 @@ DECL_DLL int kiwi_typo_set_lengthening_typo_cost(kiwi_typo_h handle, float thres
*/
DECL_DLL int kiwi_typo_close(kiwi_typo_h handle);
/**
* @brief .
*
* @param handle
* @return . null를 .
*
* @note kiwi_prepared_typo_close를 .
*/
DECL_DLL kiwi_prepared_typo_h kiwi_typo_prepare(kiwi_typo_h handle);
/**
* @brief .
*
* @param handle
* @return 0 . .
*/
DECL_DLL int kiwi_prepared_typo_close(kiwi_prepared_typo_h handle);
/**
* @brief KiwiBuilder를 Kiwi instance를 .
*
* @param model_path (e.g., ./models/base).
* @param num_threads (-1 , ).
* @param options . KIWI_BUILD_* .
* @param enabled_dialects . KIWI_DIALECT_* .
* @return Kiwi의 .
*/
DECL_DLL kiwi_h kiwi_init(const char* model_path, int num_threads, int options);
DECL_DLL kiwi_h kiwi_init(const char* model_path, int num_threads, int options, int enabled_dialects);
/**
* @brief .
@ -603,6 +665,8 @@ typedef struct {
int open_ending; /**< 마지막 형태소 다음 문장을 종결하지 않고 열린 상태로 끝낼지를 설정니다. 기본값은 0으로 마지막 형태소 다음 바로 문장을 종결합니다. */
int allowed_dialects; /**< KIWI_DIALECT_* 열거형 참고 */
float dialect_cost; /**< 방언 형태소에 추가되는 비용. 기본값은 3 */
kiwi_prepared_typo_h typo_transformer; /**< 분석 시 사용할 오타 교정기. null인 경우 사용하지 않습니다. */
float typo_threshold; /**< 오타 교정 비용 임계값. 기본값은 2.5 */
} kiwi_analyze_option_t;
/**

View file

@ -132,9 +132,6 @@ NNB VCP,JK
^터 이 테-1
^것 이 게-1
NNB VCP,JK <충청>
NP VCP,JK
터 이 테-1
것 이 게-1
@ -235,22 +232,5 @@ E E
리 어 려
시 어 셔
NN JK <함경>
ㅣ 이 ㅣ
ㅐ 이 ㅐ
ㅔ 이 ㅔ
ㅚ 이 ㅚ
ㅟ 이 ㅟ
ㅏ 이 ㅐ
ㅓ 이 ㅔ
ㅘ 이 ㅙ
ㅝ 이 ㅞ
ㅜ 우 ㅜ
ㅠ 우 ㅠ
EC VX <함경>
디 애 대,댸
VX E <함경>
#ᇂ 고 )(코
#ᇂ 구 )(쿠

BIN
models/cong/base/nounchr.mdl (Stored with Git LFS) Normal file

Binary file not shown.

View file

@ -59,7 +59,7 @@ namespace kiwi
static_cast<std::ptrdiff_t>(ArchType::sse4_1)
#endif
#if CPUINFO_ARCH_ARM64
//static_cast<std::ptrdiff_t>(ArchType::neon)
static_cast<std::ptrdiff_t>(ArchType::neon)
#endif
#else
#ifdef KIWI_ARCH_X86_64
@ -72,7 +72,7 @@ namespace kiwi
static_cast<std::ptrdiff_t>(ArchType::sse4_1)
#endif
#ifdef KIWI_ARCH_ARM64
//static_cast<std::ptrdiff_t>(ArchType::neon)
static_cast<std::ptrdiff_t>(ArchType::neon)
#endif
#endif
>;

View file

@ -15,7 +15,7 @@ ArchType kiwi::getBestArch()
if (cpuinfo_has_x86_avx512vnni()) return ArchType::avx512vnni;
if (cpuinfo_has_x86_avx512bw()) return ArchType::avx512bw;
#ifdef KIWI_AVX_VNNI_SUPPORTED
if (cpuinfo_has_x86_avx_vnni_int8()) return ArchType::avx_vnni;
if (cpuinfo_has_x86_avxvnni()) return ArchType::avx_vnni;
#endif
if (cpuinfo_has_x86_avx2()) return ArchType::avx2;
if (cpuinfo_has_x86_sse4_1()) return ArchType::sse4_1;

View file

@ -5,9 +5,6 @@
namespace kiwi
{
template<class LmState>
struct WordLL;
using Wid = uint32_t;
enum class PathEvaluatingMode
@ -18,40 +15,50 @@ namespace kiwi
top1,
};
template<class LmState>
template<class _LmState, bool _hasOovCounter = false>
struct WordLL
{
using LmState = _LmState;
static constexpr bool hasOovCounter = _hasOovCounter;
LmState lmState;
float accScore = 0, firstChunkScore = 0;
uint32_t parent = 0;
Wid wid = 0;
uint16_t ownFormId = 0;
uint8_t combineSocket = 0;
uint8_t prevRootId = 0;
SpecialState spState;
uint8_t rootId = 0;
std::conditional_t<hasOovCounter, uint16_t, uint8_t> oovFlag = 0;
std::conditional_t<hasOovCounter, uint32_t, uint8_t> oovCntArenaPtr = 0;
const Morpheme* morpheme = nullptr;
float accScore = 0, firstChunkScore = 0, accTypoCost = 0, accDialectCost = 0;
const WordLL* parent = nullptr;
Wid wid = 0;
uint16_t ownFormId = 0;
uint8_t combineSocket = 0;
WordLL() = default;
WordLL(const Morpheme* _morph, float _accScore, float _firstChunkScore, float _accTypoCost, float _accDialectCost,
const WordLL* _parent, LmState _lmState, SpecialState _spState)
WordLL(const Morpheme* _morph, float _accScore, float _firstChunkScore,
uint32_t _parent, LmState _lmState, SpecialState _spState, uint8_t _rootId,
uint16_t _oovFlag = 0,
uint32_t _oovCntArenaPtr = 0
)
: morpheme{ _morph },
accScore{ _accScore },
firstChunkScore{ _firstChunkScore },
accTypoCost{ _accTypoCost },
accDialectCost{ _accDialectCost },
parent{ _parent },
lmState{ _lmState },
spState{ _spState },
rootId{ _parent ? _parent->rootId : (uint8_t)0 }
rootId{ _rootId },
oovFlag{ (decltype(oovFlag))_oovFlag },
oovCntArenaPtr{ (decltype(oovCntArenaPtr))_oovCntArenaPtr }
{
}
const WordLL* root() const
const WordLL* root(const WordLL* base) const
{
if (parent) return parent->root();
if (parent) return base[parent].root(base);
else return this;
}
@ -66,10 +73,10 @@ namespace kiwi
}
};
template<class LmState>
struct Hash<WordLL<LmState>>
template<class LmState, bool useOOVGlobalConsistency>
struct Hash<WordLL<LmState, useOOVGlobalConsistency>>
{
size_t operator()(const WordLL<LmState>& p) const
size_t operator()(const WordLL<LmState, useOOVGlobalConsistency>& p) const
{
size_t ret = Hash<LmState>{}(p.lmState);
ret = *reinterpret_cast<const uint16_t*>(&p.prevRootId) ^ ((ret << 3) | (ret >> (sizeof(size_t) * 8 - 3)));
@ -121,28 +128,33 @@ namespace kiwi
struct WordLLGreater
{
template<class LmState>
bool operator()(const WordLL<LmState>& a, const WordLL<LmState>& b) const
template<class WordLL>
bool operator()(const WordLL& a, const WordLL& b) const
{
return a.accScore > b.accScore;
}
};
template<class LmState>
inline std::ostream& printDebugPath(std::ostream& os, const WordLL<LmState>& src)
template<class Vector, class FormVector>
inline std::ostream& printDebugPath(std::ostream& os, Vector&& pathes, size_t pathIdx, FormVector&& ownFormList)
{
if (src.parent)
auto& path = pathes[pathIdx];
if (path.parent != pathIdx)
{
printDebugPath(os, *src.parent);
printDebugPath(os, pathes, path.parent, ownFormList);
}
if (src.morpheme) src.morpheme->print(os);
if (path.morpheme)
{
if (path.ownFormId) os << utf16To8(joinHangul(ownFormList[path.ownFormId - 1].begin(), ownFormList[path.ownFormId - 1].end()));
path.morpheme->print(os);
}
else os << "NULL";
os << " , ";
return os;
}
template<PathEvaluatingMode mode, class LmState>
template<PathEvaluatingMode mode, class WordLL>
class BestPathConatiner;
template<PathEvaluatingMode mode>
@ -151,12 +163,13 @@ namespace kiwi
static constexpr size_t maxSize = -1;
};
template<class LmState>
class BestPathConatiner<PathEvaluatingMode::topN, LmState>
template<class WordLL>
class BestPathConatiner<PathEvaluatingMode::topN, WordLL>
{
using LmState = typename WordLL::LmState;
// pair: [index, size]
UnorderedMap<PathHash<LmState>, std::pair<uint32_t, uint32_t>> bestPathIndex;
Vector<WordLL<LmState>> bestPathValues;
Vector<WordLL> bestPathValues;
public:
inline void clear()
@ -166,15 +179,18 @@ namespace kiwi
}
inline void insert(size_t topN, uint8_t prevRootId, uint8_t rootId,
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
const Morpheme* morph, float accScore, float firstChunkScore,
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
{
PathHash<LmState> ph{ lmState, prevRootId, spState };
auto inserted = bestPathIndex.emplace(ph, std::make_pair((uint32_t)bestPathValues.size(), 1));
if (inserted.second)
{
bestPathValues.emplace_back(morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState);
bestPathValues.emplace_back(morph, accScore, firstChunkScore,
parent, std::move(lmState), spState,
parent ? base[parent].rootId : (uint8_t)0,
(uint16_t)(parent ? base[parent].oovFlag : 0),
parent ? base[parent].oovCntArenaPtr : (uint32_t)0);
if (rootId != commonRootId) bestPathValues.back().rootId = rootId;
bestPathValues.resize(bestPathValues.size() + topN - 1);
}
@ -184,8 +200,12 @@ namespace kiwi
auto bestPathLast = bestPathValues.begin() + inserted.first->second.first + inserted.first->second.second;
if (std::distance(bestPathFirst, bestPathLast) < topN)
{
*bestPathLast = WordLL<LmState>{ morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState };
*bestPathLast = WordLL{ morph, accScore, firstChunkScore,
parent, std::move(lmState), spState,
parent ? base[parent].rootId : (uint8_t)0,
(uint16_t)(parent ? base[parent].oovFlag : 0),
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
};
if (rootId != commonRootId) bestPathLast->rootId = rootId;
std::push_heap(bestPathFirst, bestPathLast + 1, WordLLGreater{});
++inserted.first->second.second;
@ -195,8 +215,12 @@ namespace kiwi
if (accScore > bestPathFirst->accScore)
{
std::pop_heap(bestPathFirst, bestPathLast, WordLLGreater{});
*(bestPathLast - 1) = WordLL<LmState>{ morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState };
*(bestPathLast - 1) = WordLL{ morph, accScore, firstChunkScore,
parent, std::move(lmState), spState,
parent ? base[parent].rootId : (uint8_t)0,
(uint16_t)(parent ? base[parent].oovFlag : 0),
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
};
if (rootId != commonRootId) (*(bestPathLast - 1)).rootId = rootId;
std::push_heap(bestPathFirst, bestPathLast, WordLLGreater{});
}
@ -204,7 +228,7 @@ namespace kiwi
}
}
inline void writeTo(Vector<WordLL<LmState>>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
inline void writeTo(Vector<WordLL>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
{
for (auto& p : bestPathIndex)
{
@ -227,10 +251,11 @@ namespace kiwi
}
};
template<class LmState>
class BestPathConatiner<PathEvaluatingMode::top1, LmState>
template<class WordLL>
class BestPathConatiner<PathEvaluatingMode::top1, WordLL>
{
UnorderedSet<WordLL<LmState>> bestPathes;
using LmState = typename WordLL::LmState;
UnorderedSet<WordLL> bestPathes;
public:
inline void clear()
{
@ -238,11 +263,15 @@ namespace kiwi
}
inline void insert(size_t topN, uint8_t prevRootId, uint8_t rootId,
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
const Morpheme* morph, float accScore, float firstChunkScore,
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
{
WordLL<LmState> newPath{ morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState };
WordLL newPath{ morph, accScore, firstChunkScore,
parent, std::move(lmState), spState,
parent ? base[parent].rootId : (uint8_t)0,
(uint16_t)(parent ? base[parent].oovFlag : 0),
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
};
newPath.prevRootId = prevRootId;
if (rootId != commonRootId) newPath.rootId = rootId;
auto inserted = bestPathes.emplace(newPath);
@ -250,7 +279,7 @@ namespace kiwi
{
// this is dangerous, but we can update the key safely
// because an equality between the two objects is guaranteed
auto& target = const_cast<WordLL<LmState>&>(*inserted.first);
auto& target = const_cast<WordLL&>(*inserted.first);
if (accScore > target.accScore)
{
target = newPath;
@ -258,7 +287,7 @@ namespace kiwi
}
}
inline void writeTo(Vector<WordLL<LmState>>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
inline void writeTo(Vector<WordLL>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
{
for (auto& p : bestPathes)
{
@ -288,13 +317,14 @@ namespace kiwi
static constexpr size_t maxSize = BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize * 4;
};
template<class LmState, size_t bucketBits>
template<class WordLL, size_t bucketBits>
class BucketedHashContainer
{
using LmState = typename WordLL::LmState;
static constexpr size_t bucketSize = 1 << bucketBits;
std::array<std::array<uint8_t, BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize>, bucketSize> hashes;
std::array<Vector<WordLL<LmState>>, bucketSize> values;
std::array<Vector<WordLL>, bucketSize> values;
public:
BucketedHashContainer()
@ -315,11 +345,11 @@ namespace kiwi
template<ArchType archType>
inline void insertOptimized(size_t topN, uint8_t prevRootId, uint8_t rootId,
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
const Morpheme* morph, float accScore, float firstChunkScore,
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
{
static constexpr size_t numBits = sizeof(size_t) * 8;
const size_t h = Hash<WordLL<LmState>>{}(lmState, prevRootId, spState);
const size_t h = Hash<WordLL>{}(lmState, prevRootId, spState);
const size_t bucket = (h >> 8) & (bucketSize - 1);
auto& hash = hashes[bucket];
auto& value = values[bucket];
@ -355,8 +385,12 @@ namespace kiwi
if (value.size() < hash.size())
{
hash[value.size()] = h;
value.emplace_back(morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState);
value.emplace_back(morph, accScore, firstChunkScore,
parent, std::move(lmState), spState,
parent ? base[parent].rootId : (uint8_t)0,
(uint16_t)(parent ? base[parent].oovFlag : 0),
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
);
value.back().prevRootId = prevRootId;
if (rootId != commonRootId) value.back().rootId = rootId;
}
@ -374,29 +408,29 @@ namespace kiwi
target.morpheme = morph;
target.accScore = accScore;
target.firstChunkScore = firstChunkScore;
target.accTypoCost = accTypoCost;
target.accDialectCost = accDialectCost;
target.parent = parent;
target.lmState = std::move(lmState);
target.spState = spState;
target.rootId = parent ? parent->rootId : 0;
target.rootId = parent ? base[parent].rootId : 0;
if (rootId != commonRootId) target.rootId = rootId;
target.oovFlag = parent ? base[parent].oovFlag : 0;
target.oovCntArenaPtr = parent ? base[parent].oovCntArenaPtr : 0;
}
}
}
inline void insert(size_t topN, uint8_t prevRootId, uint8_t rootId,
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
const Morpheme* morph, float accScore, float firstChunkScore,
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
{
static constexpr ArchType archType = LmState::arch;
if constexpr (archType != ArchType::none && archType != ArchType::balanced)
{
return insertOptimized<archType>(topN, prevRootId, rootId, morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState);
return insertOptimized<archType>(topN, prevRootId, rootId, morph, accScore, firstChunkScore,
base, parent, std::move(lmState), spState);
}
const size_t h = Hash<WordLL<LmState>>{}(lmState, prevRootId, spState);
const size_t h = Hash<WordLL>{}(lmState, prevRootId, spState);
const size_t bucket = (h >> 8) & (bucketSize - 1);
auto& hash = hashes[bucket];
auto& value = values[bucket];
@ -418,8 +452,12 @@ namespace kiwi
if (value.size() < hash.size())
{
hash[value.size()] = h;
value.emplace_back(morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
parent, std::move(lmState), spState);
value.emplace_back(morph, accScore, firstChunkScore,
parent, std::move(lmState), spState,
parent ? base[parent].rootId : (uint8_t)0,
(uint16_t)(parent ? base[parent].oovFlag : 0),
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
);
value.back().prevRootId = prevRootId;
if (rootId != commonRootId) value.back().rootId = rootId;
}
@ -437,18 +475,18 @@ namespace kiwi
target.morpheme = morph;
target.accScore = accScore;
target.firstChunkScore = firstChunkScore;
target.accTypoCost = accTypoCost;
target.accDialectCost = accDialectCost;
target.parent = parent;
target.lmState = std::move(lmState);
target.spState = spState;
target.rootId = parent ? parent->rootId : 0;
target.rootId = parent ? base[parent].rootId : 0;
if (rootId != commonRootId) target.rootId = rootId;
target.oovFlag = parent ? base[parent].oovFlag : 0;
target.oovCntArenaPtr = parent ? base[parent].oovCntArenaPtr : 0;
}
}
}
inline void writeTo(Vector<WordLL<LmState>>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
inline void writeTo(Vector<WordLL>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
{
for (auto& v : values)
{
@ -470,15 +508,15 @@ namespace kiwi
};
template<class LmState>
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Small, LmState>
: public BucketedHashContainer<LmState, 0>
template<class WordLL>
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Small, WordLL>
: public BucketedHashContainer<WordLL, 0>
{
};
template<class LmState>
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Medium, LmState>
: public BucketedHashContainer<LmState, 2>
template<class WordLL>
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Medium, WordLL>
: public BucketedHashContainer<WordLL, 2>
{
};
}

File diff suppressed because it is too large Load diff

View file

@ -15,6 +15,29 @@ namespace kiwi
{
namespace lm
{
/*
 * Quantize a frequency value into one byte.
 *
 * - values <= 0 (and NaN) map to 0
 * - values <= 16 are mapped linearly (truncated toward zero)
 * - values > 16 are mapped logarithmically:
 *       q = round(log2(freq) * 8) - 16, saturated at 255
 *
 * dequantizeFrequencyScale() applies the inverse mapping.
 */
inline uint8_t quantizeFrequencyScale(float freq)
{
    // Negated comparison on purpose: it is also true for NaN, which would
    // otherwise fall through to a float -> uint8_t cast (undefined behaviour).
    if (!(freq > 0)) return 0;
    if (freq <= 16) return (uint8_t)freq;
    const float logFreq = log2f(freq);
    const float rounded = roundf(logFreq * 8) - 16;
    // Saturate before the cast so out-of-range values (including +inf) stay defined.
    if (rounded >= 255) return 255;
    return (uint8_t)rounded;
}
/*
 * Expand a quantized frequency byte back into a float.
 *
 * Codes 0..16 are returned as-is; larger codes are decoded as
 * 2 ^ ((qfreq + 16) / 8).
 */
inline float dequantizeFrequencyScale(uint8_t qfreq)
{
    const bool isLinearCode = qfreq <= 16;
    return isLinearCode
        ? (float)qfreq
        : powf(2.0f, ((float)qfreq + 16) / 8.0f);
}
template<size_t windowSize, ArchType _arch, class VocabTy, class VlVocabTy, bool quantized>
class CoNgramState;
@ -28,13 +51,40 @@ namespace kiwi
const uint8_t* alignedKeyValueData = nullptr;
std::unique_ptr<int32_t[]> allRootValueData;
std::unique_ptr<uint8_t[]> allEmbs;
const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)]
const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)] (quantized NEON: dim stores S8 values)
const uint8_t* outputEmbPtr = nullptr; // [numOutputs, (dim + scale? + sum?)]
const uint8_t* distantEmbPtr = nullptr; // [numOutputs, (dim + scale? + bias + confid + pad?)]
const float* positionConfidPtr = nullptr;
const uint8_t* distantMaskPtr = nullptr;
const float* outputEmbBiasPtr = nullptr;
const KeyType* invertedContextVocabPtr = nullptr;
const float* invNormContextPtr = nullptr;
const float* invNormOutputPtr = nullptr;
const float* contextEmbEntropyPtr = nullptr;
/**
 * Extracts the context id from a stored value.
 * When the header's hasTrieFrequency flag is set, only the low 24 bits
 * are the id; otherwise the value is returned unchanged.
 */
inline uint32_t unpackContextId(uint32_t v) const
{
    if (!(header.flags & header.hasTrieFrequency)) return v;
    return v & 0x00FFFFFF;
}
/**
 * Extracts the trie frequency from a stored value.
 * When the header's hasTrieFrequency flag is set, the top 8 bits are
 * decoded with dequantizeFrequencyScale(); otherwise 0 is returned.
 */
inline float unpackTrieFrequency(uint32_t v) const
{
    if (!(header.flags & header.hasTrieFrequency)) return 0.f;
    return dequantizeFrequencyScale(v >> 24);
}
inline size_t contextEmbStride() const
{
@ -59,11 +109,16 @@ namespace kiwi
return reinterpret_cast<const float*>(contextEmbPtr + idx * contextEmbStride());
}
inline const uint8_t* getContextQuantEmb(uint32_t idx) const
inline const uint8_t* getContextQuantEmb(size_t idx) const
{
return contextEmbPtr + idx * contextEmbStride();
}
// Same address as getContextQuantEmb(idx) - contextEmbPtr + idx * contextEmbStride() -
// but viewed as signed 8-bit values.
inline const int8_t* getContextQuantEmbS8(size_t idx) const
{
return reinterpret_cast<const int8_t*>(contextEmbPtr + idx * contextEmbStride());
}
inline float getContextBias(uint32_t idx) const
{
const size_t offset = quantized ?
@ -204,26 +259,53 @@ namespace kiwi
float contextSimilarity(uint32_t contextId1, uint32_t contextId2) const override;
size_t predictWordsFromContext(uint32_t contextId, size_t topN, std::pair<uint32_t, float>* output) const override;
size_t predictWordsFromContextDiff(uint32_t contextId, uint32_t bgContextId, float weight, size_t topN, std::pair<uint32_t, float>* output) const override;
float progressOneStep(int32_t& nodeIdx, uint32_t& contextIdx, uint32_t next) const override;
float getContextFrequency(uint32_t contextId) const override;
float getContextEntropy(uint32_t contextId) const override;
size_t getNodeDepth(uint32_t nodeId) const override;
uint32_t toContextId(const uint32_t* vocabIds, size_t size) const override;
std::vector<std::vector<uint32_t>> getContextWordMap() const override;
uint32_t progressContextNode(int32_t& nodeIdx, KeyType next) const
{
if (invertedContextVocabPtr)
{
next = invertedContextVocabPtr[next];
}
if constexpr (std::is_same_v<KeyType, VlKeyType>)
{
return progressContextNodeVl(nodeIdx, next);
}
static constexpr size_t tMax = (1 << 16) - (1 << 10) * 2;
static constexpr size_t keyWidth = sizeof(VlKeyType) * 8,
surrogateBitWidth = keyWidth == 16 ? 10 : 5,
surrogateBitMask = (1 << surrogateBitWidth) - 1;
static constexpr size_t tMax = (keyWidth >= sizeof(size_t) * 8) ? (size_t)-1 - (((size_t)1 << surrogateBitWidth) * 2 - 1) : (((size_t)1 << keyWidth) - ((size_t)1 << surrogateBitWidth) * 2);
if (next < tMax)
{
return progressContextNodeVl(nodeIdx, next);
}
next -= tMax;
const size_t high = next >> 10, low = next & 0x3FF;
const size_t high = next >> surrogateBitWidth, low = next & surrogateBitMask;
progressContextNodeVl(nodeIdx, tMax + high);
return progressContextNodeVl(nodeIdx, tMax + (1 << 10) + low);
return progressContextNodeVl(nodeIdx, tMax + (1 << surrogateBitWidth) + low);
}
/// Tells whether `k` falls in the upper reserved key range, i.e. at or above
/// `tMax + 2^surrogateBits`, the base that progressContextNode() adds to the
/// low half of a key it splits in two.
/// Always false when KeyType and VlKeyType are the same type, because no
/// splitting happens in that configuration.
bool isSecondSurrogate(VlKeyType k) const
{
    if constexpr (std::is_same_v<KeyType, VlKeyType>)
    {
        return false;
    }
    else
    {
        static constexpr size_t keyBits = sizeof(VlKeyType) * 8;
        // 10 payload bits per half for 16-bit keys, 5 bits for any other width.
        static constexpr size_t surrogateBits = (keyBits == 16) ? 10 : 5;
        static constexpr size_t surrogateSpan = (size_t)1 << surrogateBits;
        // First key value reserved for the two surrogate ranges.
        static constexpr size_t tMax = (keyBits >= sizeof(size_t) * 8)
            ? (size_t)-1 - (surrogateSpan * 2 - 1)
            : (((size_t)1 << keyBits) - surrogateSpan * 2);
        return k >= tMax + surrogateSpan;
    }
}
uint32_t progressContextNodeVl(int32_t& nodeIdx, VlKeyType next) const
@ -426,21 +508,17 @@ namespace kiwi
return (x << r) | (x >> (sizeof(size_t) * 8 - r));
}
template<>
struct Hash<uint32_t>
inline size_t hashUint32(uint32_t v)
{
size_t operator()(uint32_t v) const
{
return ((size_t)v * largePrime) ^ rol((size_t)v, sizeof(size_t) * 4 + 1);
}
};
return ((size_t)v * largePrime) ^ rol((size_t)v, sizeof(size_t) * 4 + 1);
}
template<size_t windowSize, ArchType arch, class VocabTy, class VlVocabTy, bool quantized>
struct Hash<lm::CoNgramState<windowSize, arch, VocabTy, VlVocabTy, quantized>>
{
size_t operator()(const lm::CoNgramState<windowSize, arch, VocabTy, VlVocabTy, quantized>& state) const
{
size_t ret = Hash<uint32_t>{}(state.node);
size_t ret = hashUint32(state.node);
static constexpr size_t cmpStart = windowSize - sizeof(size_t) / sizeof(VocabTy);
size_t h = *reinterpret_cast<const size_t*>(&state.history[cmpStart]);
h = (h * largePrime) ^ rol(h, sizeof(size_t) * 4 - 1);
@ -454,7 +532,7 @@ namespace kiwi
{
size_t operator()(const lm::CoNgramState<0, arch, VocabTy, VlVocabTy, quantized>& state) const
{
size_t ret = Hash<uint32_t>{}(state.node);
size_t ret = hashUint32(state.node);
return ret;
}
};

View file

@ -2,6 +2,7 @@
#include <kiwi/SubstringExtractor.h>
#include "FrozenTrie.hpp"
#include "RaggedVector.hpp"
#include "StrUtils.h"
using namespace kiwi;
@ -43,9 +44,7 @@ HSDataset::HSDataset(size_t _batchSize,
{
}
HSDataset::~HSDataset()
{
}
HSDataset::~HSDataset() = default;
HSDataset::HSDataset(HSDataset&& o) /*noexcept*/ = default;
@ -802,3 +801,452 @@ std::vector<std::pair<std::vector<uint32_t>, size_t>> HSDataset::extractPrefixes
});
return ret;
}
size_t ChrTokenizer::encodeOne(char32_t c) const
{
if (isHangulSyllable(c))
{
int32_t i = (c - 0xAC00) / 28;
return (int32_t)Token::hangulSyllableStart + i;
}
else if (isHangulCoda(c))
{
int32_t i = c - 0x11A8;
return (int32_t)Token::hangulCodaStart + i;
}
else if (0x21 <= c && c < 0x7F)
{
return (int32_t)Token::asciiStart + (c - 0x21);
}
else
{
const POSTag type = identifySpecialChr(c);
switch (type)
{
case POSTag::sf:
return (int32_t)Token::sf;
case POSTag::sp:
return (int32_t)Token::sp;
case POSTag::ss:
return (int32_t)Token::ss;
case POSTag::sso:
return (int32_t)Token::sso;
case POSTag::ssc:
return (int32_t)Token::ssc;
case POSTag::se:
return (int32_t)Token::se;
case POSTag::so:
return (int32_t)Token::so;
case POSTag::sh:
return (int32_t)Token::sh;
default:
return (int32_t)Token::sw;
}
}
return 0;
}
/// Encodes UTF-8 `text` into token ids.
///
/// The text is converted to UTF-16 and passed through normalizeHangul(); each
/// resulting UTF-16 code unit is then encoded with encodeOne().
///
/// @param text    UTF-8 input.
/// @param outBuf  destination buffer for token ids.
/// @param bufSize capacity of `outBuf`; encoding stops once it is full.
/// @return number of tokens written (never more than `bufSize`).
size_t ChrTokenizer::encode(std::string_view text, int32_t* outBuf, size_t bufSize) const
{
    const auto units = normalizeHangul(utf8To16(text));
    size_t count = 0;
    for (auto it = units.begin(); it != units.end() && count < bufSize; ++it, ++count)
    {
        outBuf[count] = encodeOne(*it);
    }
    return count;
}
// Converts `tokenCnt` token ids from `tokenBuf` back into text.
// Range tokens (Hangul syllable / Hangul coda / ASCII) are mapped back to the
// code unit they were derived from; category tokens are rendered with one
// fixed representative character each; any other id is silently skipped.
// The collected code units are passed through joinHangul() before returning.
std::u16string ChrTokenizer::decode(const int32_t* tokenBuf, size_t tokenCnt) const
{
KString result;
for (size_t i = 0; i < tokenCnt; ++i)
{
int32_t t = tokenBuf[i];
// [hangulSyllableStart, hangulCodaStart): coda-less syllable, 28 code points apart.
if (Token::hangulSyllableStart <= (Token)t && (Token)t < Token::hangulCodaStart)
{
char16_t c = 0xAC00 + (uint16_t)(t - (int32_t)Token::hangulSyllableStart) * 28;
result.push_back(c);
}
// [hangulCodaStart, asciiStart): coda characters starting at U+11A8.
else if (Token::hangulCodaStart <= (Token)t && (Token)t < Token::asciiStart)
{
char16_t c = 0x11A8 + (uint16_t)(t - (int32_t)Token::hangulCodaStart);
result.push_back(c);
}
// [asciiStart, max): printable ASCII starting at 0x21.
else if (Token::asciiStart <= (Token)t && (Token)t < Token::max)
{
char16_t c = 0x21 + (uint16_t)(t - (int32_t)Token::asciiStart);
result.push_back(c);
}
else
{
// Category tokens carry no information about the original character,
// so a single stand-in character is emitted per category.
switch ((Token)t)
{
case Token::sf:
result.push_back(u'.');
break;
case Token::sp:
result.push_back(u',');
break;
case Token::ss:
result.push_back(u'"');
break;
case Token::sso:
result.push_back(u'(');
break;
case Token::ssc:
result.push_back(u')');
break;
case Token::se:
result.push_back(u'\u2026');
break;
case Token::so:
result.push_back(u'\u223c');
break;
case Token::sh:
// NOTE(review): this literal appears empty in this view; an empty
// character literal is not valid C++, so the original character was
// presumably lost in extraction -- confirm against the repository.
result.push_back(u'');
break;
case Token::sw:
// NOTE(review): same as above -- the stand-in character is missing here.
result.push_back(u'');
break;
default:
// Unknown ids produce no output.
break;
}
}
}
return joinHangul(result);
}
/// Builds a character-level dataset.
///
/// Stores the sampling parameters, seeds the random generator with the current
/// seed and, if `_contextualMapper` is non-empty, compiles it into a frozen
/// trie. Each entry is a (mapped id, token sequence) pair; the trie stores
/// `mapped id + 1` as the value of the sequence.
ChrDataset::ChrDataset(size_t _batchSize, size_t _causalContextSize, size_t _windowSize, float _prefixDropoutProb, bool _sampleWithoutWeights,
    const std::vector<std::pair<size_t, std::vector<uint32_t>>>& _contextualMapper
)
    : batchSize(_batchSize), causalContextSize(_causalContextSize), windowSize(_windowSize), prefixDropoutProb(_prefixDropoutProb), sampleWithoutWeights(_sampleWithoutWeights)
{
    rng.seed(currentSeed);
    if (_contextualMapper.empty())
    {
        return;
    }

    utils::ContinuousTrie<utils::TrieNodeEx<uint32_t, uint32_t>> builder(1);
    for (const auto& [mappedId, tokenSeq] : _contextualMapper)
    {
        builder.build(tokenSeq.begin(), tokenSeq.end(), mappedId + 1);
    }
    builder.fillFail();
    contextualMapper = utils::FrozenTrie<uint32_t, uint32_t>{ builder, ArchTypeHolder<ArchType::balanced>{} };
}
ChrDataset::~ChrDataset() = default;
ChrDataset::ChrDataset(ChrDataset&&) = default;
ChrDataset& ChrDataset::operator=(ChrDataset&&) = default;
/// Tokenizes `nonLabelPrefix + sentence` and appends it to the dataset.
///
/// @param sentence        UTF-8 text to add.
/// @param weight          sampling weight of the sentence; also added to the running total.
/// @param nonLabelPrefix  UTF-8 text placed before the sentence; its token count
///                        (when encoded on its own) is recorded alongside the sentence.
void ChrDataset::addSentence(std::string_view sentence, float weight, std::string_view nonLabelPrefix)
{
    thread_local Vector<int32_t> tokenBuf;
    ChrTokenizer tokenizer;

    // encode() never writes past the size it is given, so this is the hard
    // upper bound on the number of tokens kept.
    tokenBuf.resize(sentence.size() + nonLabelPrefix.size());

    std::string joined{ nonLabelPrefix };
    joined += sentence;

    // The prefix is encoded first only to learn its length in tokens; the
    // buffer is then overwritten with the tokens of the full text.
    const size_t prefixSize = tokenizer.encode(nonLabelPrefix, tokenBuf.data(), tokenBuf.size());
    const size_t tokenCnt = tokenizer.encode(joined, tokenBuf.data(), tokenBuf.size());

    auto& storage = this->sents.get();
    storage.emplace_back();
    storage.insert_data(tokenBuf.begin(), tokenBuf.begin() + tokenCnt);

    nonLabelPrefixSizes.emplace_back(prefixSize);
    sentWeights.emplace_back(weight);
    totalWeight += weight;
}
/// Number of sentences currently stored in the dataset.
size_t ChrDataset::numSents() const
{
    const auto& storage = sents.get();
    return storage.size();
}
/// Re-seeds the random generator and remembers the seed so that reset() can
/// restore the same state later.
void ChrDataset::seed(size_t newSeed)
{
    rng.seed(newSeed);
    currentSeed = newSeed;
}
/// Restores the sampling state to its initial condition: clears the sampling
/// counters and the shuffled index list, then re-seeds the generator with the
/// remembered seed. Stored sentences and weights are left untouched.
void ChrDataset::reset()
{
    consumedSents = 0;
    totalSampled = 0;
    shuffledIdcs.clear();
    sentSampled.clear();
    seed(currentSeed);
}
/// Streaming matcher that rewrites input tokens through a contextual-mapper trie.
///
/// The object keeps the trie node reached by the tokens fed so far. Each call
/// advances that state by one token (following failure links when there is no
/// direct transition) and returns:
///  - the token unchanged when the trie is empty or the token is 0;
///  - `val - 1` of the matched node, or of the first matching node on the
///    failure chain when the reached node only reports a submatch;
///  - -1 when nothing matches.
class InputTokenMapper
{
    const utils::FrozenTrie<uint32_t, uint32_t>& cmTrie;
    const utils::FrozenTrie<uint32_t, uint32_t>::Node* node = nullptr;
public:
    InputTokenMapper(const utils::FrozenTrie<uint32_t, uint32_t>& trie)
        : cmTrie{ trie }
    {
        if (!cmTrie.empty())
        {
            node = cmTrie.root();
        }
    }

    int32_t operator()(int32_t inputToken)
    {
        if (cmTrie.empty() || inputToken == 0)
        {
            return inputToken;
        }

        // Follow failure links until a transition on inputToken exists or the
        // chain is exhausted.
        auto* next = node->template nextOpt<ArchType::balanced>(cmTrie, inputToken);
        while (!next)
        {
            node = node->fail();
            if (!node) break;
            next = node->template nextOpt<ArchType::balanced>(cmTrie, inputToken);
        }

        if (!next)
        {
            // No transition anywhere: restart matching from the root.
            node = cmTrie.root();
            return -1;
        }

        node = next;
        auto val = next->val(cmTrie);
        if (cmTrie.hasMatch(val))
        {
            return val - 1;
        }
        if (cmTrie.hasSubmatch(val))
        {
            for (auto sub = next->fail(); sub; sub = sub->fail())
            {
                val = sub->val(cmTrie);
                if (cmTrie.hasMatch(val))
                {
                    return val - 1;
                }
            }
        }
        // The reached node carries neither a match nor a usable submatch.
        // Previously control fell off the end of the function here, which is
        // undefined behaviour for a non-void function; report "no mapping".
        return -1;
    }
};
template<class InTy, class OutTy>
size_t ChrDataset::_next(InTy in, OutTy out)
{
if (sentSampled.size() != sentWeights.size())
{
sentSampled.resize(sentWeights.size());
}
if (sampleWithoutWeights)
{
if (shuffledIdcs.size() != sentWeights.size())
{
shuffledIdcs.resize(sentWeights.size());
std::iota(shuffledIdcs.begin(), shuffledIdcs.end(), 0);
std::shuffle(shuffledIdcs.begin(), shuffledIdcs.end(), rng);
consumedSents = 0;
}
}
else
{
if (totalSampled <= 0)
{
shuffledIdcs.resize(sentWeights.size());
std::iota(shuffledIdcs.begin(), shuffledIdcs.end(), 0);
}
else
{
shuffledIdcs.clear();
const float totalWeight = this->totalWeight,
totalSampled = this->totalSampled;
for (size_t i = 0; i < sentWeights.size(); ++i)
{
const float w = sentWeights[i] / totalWeight;
const float s = sentSampled[i] / totalSampled;
if (s < w)
{
shuffledIdcs.emplace_back(i);
}
}
}
std::shuffle(shuffledIdcs.begin(), shuffledIdcs.end(), rng);
}
auto& sents = this->sents.get();
size_t b;
for (b = 0; b < batchSize; ++b)
{
if (sampleWithoutWeights && b + consumedSents >= shuffledIdcs.size())
{
break;
}
const size_t i = sampleWithoutWeights ? shuffledIdcs[b + consumedSents] : shuffledIdcs[b % shuffledIdcs.size()];
sentSampled[i] += 1.f;
totalSampled += 1;
size_t start = 0;
if (prefixDropoutProb > 0 && std::generate_canonical<float, 32>(rng) < prefixDropoutProb)
{
start = (size_t)((std::max(sents[i].size(), (size_t)2) - 2) * std::generate_canonical<float, 32>(rng));
}
const size_t nonLabelPrefixSize = nonLabelPrefixSizes[i];
const size_t end = std::min(sents[i].size() + 1, causalContextSize);
InputTokenMapper tokenMapper{ contextualMapper };
for (size_t j = start; j < end; ++j)
{
const auto inputToken = j > 0 ? sents[i][j - 1] : 0;
*in = tokenMapper(inputToken);
++in;
*out = j < nonLabelPrefixSize ? nonVocab : (j < sents[i].size() ? sents[i][j] : 0);
++out;
}
for (size_t j = end - start; j < causalContextSize; ++j)
{
*in = nonVocab;
++in;
*out = nonVocab;
++out;
}
}
if (sampleWithoutWeights)
{
consumedSents += b;
}
return b;
}
size_t ChrDataset::next(int32_t* in, int32_t* out)
{
return _next(in, out);
}
size_t ChrDataset::next(int64_t* in, int64_t* out)
{
return _next(in, out);
}
std::vector<float> ChrDataset::getVocabProbs(double epsilon) const
{
Vector<double> weights(vocabSize(), epsilon);
for (size_t i = 0; i < sentWeights.size(); ++i)
{
auto sent = sents.get()[i];
for (auto token : sent)
{
auto v = token;
if (v < 0 || v >= vocabSize())
{
continue;
}
weights[v] += sentWeights[i];
}
weights[0] += sentWeights[i]; // for EOS
}
const double totalWeight = std::accumulate(weights.begin(), weights.end(), 0.0);
std::vector<float> probs(vocabSize());
for (size_t i = 0; i < vocabSize(); ++i)
{
probs[i] = (float)(weights[i] / totalWeight);
}
return probs;
}
// Extracts frequent token sequences together with their accumulated weight.
//
// Sentence weights are discretised: a sentence of weight w is fed to the
// counter ceil(w / resolution) times, and a count c is reported back as
// c * resolution. Sequences with a count below ceil(minWeight / resolution),
// or containing token 0 anywhere except the first position, are dropped.
//
// resolution   weight represented by one count.
// minWeight    minimum weight a sequence needs in order to be reported.
// maxLength    passed to PrefixCounter; also the number of length buckets
//              used in exclusive mode.
// numWorkers   passed to PrefixCounter.
// exclusiveCnt when true, the count of each sequence is reduced by the counts
//              of the one-token-longer sequences that end with it.
// Returns (sequence, weight) pairs sorted by descending weight.
// Throws std::runtime_error in exclusive mode if a sequence's suffix is
// missing from the counts or has a smaller count than the sequence itself.
std::vector<std::pair<std::vector<uint32_t>, double>> ChrDataset::extractPrefixes(
float resolution, float minWeight,
size_t maxLength, size_t numWorkers, bool exclusiveCnt) const
{
using Pair = std::pair<std::vector<uint32_t>, double>;
std::vector<Pair> ret;
const size_t minCnt = (size_t)ceil(minWeight / resolution);
PrefixCounter counter{ maxLength, minCnt, numWorkers };
Vector<int32_t> tokenBuf;
for (size_t i = 0; i < sents.get().size(); ++i)
{
const auto sent = sents.get()[i];
// Each sentence is counted with a leading 0 token.
tokenBuf.clear();
tokenBuf.emplace_back(0);
tokenBuf.insert(tokenBuf.end(), sent.begin(), sent.end());
// Repeat the sentence in proportion to its weight.
const size_t n = (size_t)ceil(sentWeights[i] / resolution);
for (size_t j = 0; j < n; ++j)
{
counter.addArray(tokenBuf.data(), tokenBuf.data() + tokenBuf.size());
}
}
auto trie = counter.count();
if (exclusiveCnt)
{
// cnts_by_length[k] holds the sequences of length k + 1.
Vector<UnorderedMap<Vector<uint32_t>, size_t>> cnts_by_length(maxLength);
trie.traverse([&](size_t cnt, const std::vector<uint32_t>& prefix)
{
if (cnt < minCnt) return;
// Skip sequences that contain token 0 after the first position.
if (std::find_if(prefix.begin() + 1, prefix.end(), [](uint32_t t) { return t == 0; }) != prefix.end())
{
return;
}
Vector<uint32_t> p(prefix.begin(), prefix.end());
cnts_by_length[p.size() - 1].emplace(move(p), cnt);
});
Vector<uint32_t> suffix;
suffix.reserve(maxLength);
// Going from shorter to longer lengths, subtract the count of every
// sequence from the entry of that sequence with its first token removed.
// The subtrahend is always the not-yet-reduced count of the longer
// sequence, because bucket i is only modified in iteration i + 1.
for (size_t i = 1; i < maxLength; ++i)
{
for (auto& p : cnts_by_length[i])
{
suffix.clear();
suffix.insert(suffix.end(), p.first.begin() + 1, p.first.end());
auto it = cnts_by_length[i - 1].find(suffix);
if (it == cnts_by_length[i - 1].end() || it->second < p.second)
{
throw std::runtime_error("This should not happen");
}
it->second -= p.second;
}
}
// Emit whatever still reaches the threshold after the subtraction.
for (auto& cnts : cnts_by_length)
{
for (auto& p : cnts)
{
if (p.second < minCnt) continue;
ret.emplace_back(std::vector<uint32_t>{ p.first.begin(), p.first.end() }, (double)p.second * resolution);
}
}
}
else
{
// Plain mode: report the raw counts of all sequences passing the filters.
trie.traverse([&](size_t cnt, const std::vector<uint32_t>& prefix)
{
if (cnt < minCnt) return;
if (std::find_if(prefix.begin() + 1, prefix.end(), [](uint32_t t) { return t == 0; }) != prefix.end())
{
return;
}
ret.emplace_back(prefix, (double)cnt * resolution);
});
}
// Heaviest sequences first.
std::sort(ret.begin(), ret.end(), [](const Pair& a, const Pair& b)
{
return a.second > b.second;
});
return ret;
}

View file

@ -106,6 +106,9 @@ namespace kiwi
}
ret.zCodaAppendable = zCodaAppendable ? 1 : 0;
ret.zSiotAppendable = zSiotAppendable ? 1 : 0;
// 다음 값들은 KiwiBuilder::build에서 채워 넣는다
ret.hasJClass = 0;
ret.hasAnyFullMorphemes = 0;
return ret;
}

View file

@ -136,6 +136,7 @@ namespace kiwi
auto v = nextDiffs[p->nextOffset + i];
if (v <= 0) continue;
auto* child = &p[v];
child->depth = p->depth + 1;
child->lower = p->template findFail<archType>(*this, k) - child;
dq.emplace_back(child);
}

File diff suppressed because it is too large Load diff

View file

@ -5,6 +5,7 @@
#include <kiwi/Form.h>
#include <kiwi/PatternMatcher.h>
#include <kiwi/FrozenTrie.h>
#include <kiwi/TypoTransformer.h>
#include "StrUtils.h"
@ -97,7 +98,10 @@ namespace kiwi
Match matchOptions,
Dialect allowedDialect,
size_t maxUnkFormSize,
size_t maxUnkFormSizeFollowedByJClass,
size_t spaceTolerance,
const PreparedTypoTransformer* typoTransformer,
float typoThreshold,
float continualTypoCost,
float lengtheningTypoCost,
const PretokenizedSpanGroup::Span*& pretokenizedFirst,

View file

@ -499,7 +499,8 @@ namespace kiwi
| Match::joinVerbSuffix
| Match::joinAdjSuffix
| Match::joinAdvSuffix
| Match::mergeSaisiot))) return last;
| Match::mergeSaisiot
| Match::joinParticleYo))) return last;
if (std::distance(first, last) < 2) return last;
auto next = first;
@ -566,6 +567,15 @@ namespace kiwi
++next;
++next;
}
// (EC | EF) + JX(요) => (EC | EF)
else if (!!(matchOptions & Match::joinParticleYo)
&& nextToken.tag == POSTag::jx
&& nextToken.morph && *nextToken.morph->kform == u""
&& (current.tag == POSTag::ec || current.tag == POSTag::ef))
{
concatTokens(current, nextToken, current.tag);
++next;
}
else
{
++first;
@ -603,7 +613,8 @@ namespace kiwi
inline void insertPathIntoResults(
vector<TokenResult>& ret,
Vector<SpecialState>& spStatesByRet,
Vector<PackedState>& spStatesByRet,
Vector<uint8_t>& oovTotalCnt,
const Vector<PathResult>& pathes,
size_t topN,
Match matchOptions,
@ -616,6 +627,15 @@ namespace kiwi
{
Vector<size_t> parentMap;
uint32_t oovCntArenaMinPtr = -1;
if (!!(matchOptions & Match::oovTotalConsistency))
{
for (auto& p : pathes)
{
oovCntArenaMinPtr = min(oovCntArenaMinPtr, p.curState.oovCntArenaPtr());
}
}
if (ret.empty())
{
const size_t n = min(pathes.size(), topN * 2);
@ -626,7 +646,7 @@ namespace kiwi
}
else
{
UnorderedMap<uint8_t, uint32_t> prevParents;
UnorderedMap<PackedState, uint32_t> prevParents;
Vector<uint8_t> selectedPathes(pathes.size());
for (size_t i = 0; i < ret.size(); ++i)
{
@ -667,7 +687,7 @@ namespace kiwi
}
}
UnorderedMap<uint8_t, uint32_t> spStateCnt;
UnorderedMap<PackedState, uint32_t> spStateCnt;
size_t validTarget = 0;
for (size_t i = 0; i < ret.size(); ++i)
{
@ -728,6 +748,10 @@ namespace kiwi
token.typoCost = s.typoCost;
token.typoFormId = s.typoFormId;
token.senseId = s.morph->senseId;
if ((s.morph->tag == POSTag::nng || s.morph->tag == POSTag::nnp) && !s.str.empty())
{
token.senseId = -1; // OOV인 경우에는 senseId를 -1로 설정
}
updateTokenInfoScript(token);
token.dialect = s.morph->dialect;
auto ptId = nodeInWhichPretokenized[s.nodeId] + 1;
@ -751,7 +775,7 @@ namespace kiwi
sort(idx.begin(), idx.end(), [&](size_t a, size_t b) { return ret[a].second > ret[b].second; });
Vector<TokenResult> sortedRet;
Vector<SpecialState> sortedSpStatesByRet;
Vector<PackedState> sortedSpStatesByRet;
const size_t maxCands = min(topN * 2, validTarget);
for (size_t i = 0; i < maxCands; ++i)
{
@ -763,8 +787,16 @@ namespace kiwi
for (size_t i = 0; i < maxCands; ++i)
{
ret.emplace_back(move(sortedRet[i]));
spStatesByRet.emplace_back(sortedSpStatesByRet[i]);
if (!!(matchOptions & Match::oovTotalConsistency))
{
spStatesByRet.emplace_back(sortedSpStatesByRet[i].specialState(), sortedSpStatesByRet[i].oovCntArenaPtr() - oovCntArenaMinPtr);
}
else
{
spStatesByRet.emplace_back(sortedSpStatesByRet[i]);
}
}
if (!!(matchOptions & Match::oovTotalConsistency)) oovTotalCnt.erase(oovTotalCnt.begin(), oovTotalCnt.begin() + oovCntArenaMinPtr);
}
inline void makePretokenizedSpanGroup(
@ -960,14 +992,24 @@ namespace kiwi
throw invalid_argument{ "`cutOffThreshold` should be >= 0." };
}
if (unkFormScoreScale < 0)
if (oovRuleScale < 0)
{
throw invalid_argument{ "`unkFormScoreScale` should be >= 0." };
throw invalid_argument{ "`oovRuleScale` should be >= 0." };
}
if (unkFormScoreBias < 0)
if (oovGlobalWeight <= 0)
{
throw invalid_argument{ "`unkFormScoreBias` should be >= 0." };
throw invalid_argument{ "`oovGlobalWeight` should be > 0." };
}
if (oovLocalWeight <= 0)
{
throw invalid_argument{ "`oovLocalWeight` should be > 0." };
}
// Zero is rejected as well, so the message must say "> 0" (it used to claim
// ">= 0", contradicting the condition and the sibling weight checks).
if (oovGlobalMinFreq <= 0)
{
throw invalid_argument{ "`oovGlobalMinFreq` should be > 0." };
}
if (spacePenalty <= 0)
@ -986,6 +1028,9 @@ namespace kiwi
}
}
std::ostream* logStream = &std::cerr;
int doLogging = 0;
vector<TokenResult> Kiwi::analyze(const u16string& str, size_t topN, AnalyzeOption option,
const vector<PretokenizedSpan>& pretokenized,
const optional<KiwiConfig>& overrideConfig
@ -1004,6 +1049,17 @@ namespace kiwi
if (!!(option.match & Match::normalizeCoda)) normalizeCoda(normalizedStr.begin(), normalizedStr.end());
if ((option.match & Match::oovMask) >= Match::oovChrModel && !nounChrMdl)
{
throw invalid_argument{ "`oovChrModel` option is set but the character-level noun model is not loaded." };
}
if (option.allowedDialects != Dialect::standard && option.typoTransformer == nullptr)
{
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::dialect);
option.typoThreshold = 2.5f;
}
makePretokenizedSpanGroup(
pretokenizedGroup,
pretokenized,
@ -1019,18 +1075,59 @@ namespace kiwi
wordPositions.clear();
getWordPositions(wordPositions, str.begin(), str.end());
SubstringCounter substringCounter;
if ((option.match & Match::oovMask) >= Match::oovChrFreqModel || !!(option.match & Match::oovTotalConsistency))
{
thread_local Vector<char16_t> filteredStr;
filteredStr.clear();
filteredStr.reserve(normalizedStr.size());
for (size_t i = 0; i < normalizedStr.size(); ++i)
{
auto c = normalizedStr[i];
const POSTag chrType = identifySpecialChr(c);
switch (chrType)
{
case POSTag::unknown:
case POSTag::sf:
case POSTag::sp:
case POSTag::ss:
case POSTag::sso:
case POSTag::ssc:
case POSTag::se:
case POSTag::so:
case POSTag::sw:
case POSTag::sb:
c = u' ';
break;
}
filteredStr.emplace_back(c);
}
substringCounter = SubstringCounter{ filteredStr.data(), filteredStr.size() };
}
vector<TokenResult> ret;
Vector<SpecialState> spStatesByRet;
Vector<PackedState> spStatesByRet;
thread_local Vector<KGraphNode> nodes;
thread_local Vector<uint32_t> nodeInWhichPretokenized;
thread_local UnorderedMap<U16StringView, size_t> oovTotalMap;
thread_local UnorderedMap<OovOrForm,Vector<uint16_t>> oovPrefixLists;
thread_local Vector<uint8_t> oovTotalCnt;
oovTotalMap.clear();
oovPrefixLists.clear();
oovTotalCnt.clear();
const auto* pretokenizedFirst = pretokenizedGroup.spans.data();
const auto* pretokenizedLast = pretokenizedFirst + pretokenizedGroup.spans.size();
size_t splitEnd = 0;
chrono::steady_clock::time_point startTime;
while (splitEnd < normalizedStr.size())
{
if (doLogging)
{
startTime = chrono::steady_clock::now();
}
nodes.clear();
auto* pretokenizedPrev = pretokenizedFirst;
splitEnd = (*reinterpret_cast<FnSplitByTrie>(dfSplitByTrie))(
const size_t newSplitEnd = (*reinterpret_cast<FnSplitByTrie>(dfSplitByTrie))(
nodes,
forms.data(),
typoPtrs.data(),
@ -1040,17 +1137,27 @@ namespace kiwi
option.match,
option.allowedDialects,
config.maxUnkFormSize,
config.maxUnkFormSizeFollowedByJClass,
config.spaceTolerance,
option.typoTransformer,
option.typoThreshold,
continualTypoCost,
lengtheningTypoCost,
pretokenizedFirst,
pretokenizedLast
);
if (doLogging)
{
auto input = utf16To8(joinHangul(normalizedStr.substr(splitEnd, newSplitEnd - splitEnd)));
*logStream << "Input: " << input << "\nNodes: " << nodes.size() << endl;
}
splitEnd = newSplitEnd;
if (nodes.size() <= 2) continue;
findPretokenizedGroupOfNode(nodeInWhichPretokenized, nodes, pretokenizedPrev, pretokenizedFirst);
Vector<PathResult> res = (*reinterpret_cast<FnFindBestPath>(dfFindBestPath))(
Vector<PathResult> res = (*reinterpret_cast<FnFindBestPath>(dfFindBestPath))(FindBestPathArgs{
this,
config,
spStatesByRet,
@ -1058,15 +1165,28 @@ namespace kiwi
nodes.data(),
nodes.size(),
topN,
(size_t)(option.match & Match::oovMask),
!!(option.match & Match::oovTotalConsistency) ? &oovTotalMap : nullptr,
!!(option.match & Match::oovTotalConsistency) ? &oovTotalCnt : nullptr,
!!(option.match & Match::oovTotalConsistency) ? &oovPrefixLists : nullptr,
!!(option.match & Match::oovTotalConsistency) ? &ret : nullptr,
option.openEnding && splitEnd == normalizedStr.size(),
!!(option.match & Match::splitComplex),
!!(option.match & Match::splitSaisiot),
!!(option.match & Match::mergeSaisiot),
option.blocklist,
option.allowedDialects,
option.dialectCost
);
insertPathIntoResults(ret, spStatesByRet, res, topN, option.match, config.integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
option.dialectCost,
(option.match & Match::oovMask) >= Match::oovChrFreqModel ? &substringCounter : nullptr
});
insertPathIntoResults(ret, spStatesByRet, oovTotalCnt,
res, topN, option.match, config.integrateAllomorph,
positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
if (doLogging)
{
auto duration = chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - startTime).count();
*logStream << "Time: " << duration << "ms\n" << endl;
}
}
sort(ret.begin(), ret.end(), [](const TokenResult& a, const TokenResult& b)

View file

@ -1090,6 +1090,14 @@ KiwiBuilder::KiwiBuilder(StreamProvider streamProvider, size_t _numThreads, Buil
throw IOException{ "Cannot open required file: combiningRule.txt" };
}
}
if (auto stream = streamProvider("nounchr.mdl"))
{
nounChrMdl = lm::CoNgramModelBase::create(utils::createMemoryObjectFromStream(*stream),
archType,
false,
(modelType == ModelType::cong || modelType == ModelType::congGlobal));
}
}
KiwiBuilder::KiwiBuilder(const string& modelPath, size_t _numThreads, BuildOption _options, ModelType _modelType, Dialect _enabledDialects)
@ -2378,6 +2386,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
{
Kiwi ret{ archType, langMdl, !typos.empty(), typos.isContinualTypoEnabled(), typos.isLengtheningTypoEnabled() };
ret.enabledDialects = enabledDialects;
ret.nounChrMdl = nounChrMdl;
Vector<FormRaw> combinedForms;
Vector<MorphemeRaw> combinedMorphemes;
@ -2459,6 +2468,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
for (size_t i = 0; i < defaultFormSize; ++i)
{
formTrie[i + 1].val = &ret.forms[i];
ret.forms[i].hasAnyFullMorphemes = true;
}
Vector<const Form*> sortedForms;
@ -2477,6 +2487,17 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
f.polar = accumulate(f.candidate.begin(), f.candidate.end(), f.candidate[0]->polar, reducePolar);
}
f.hasJClass = any_of(f.candidate.begin(), f.candidate.end(), [&](const Morpheme* m)
{
return isJClass(m->tag) || m->tag == POSTag::ec || m->tag == POSTag::ef;
});
f.hasAnyFullMorphemes = any_of(f.candidate.begin(), f.candidate.end(), [&](const Morpheme* m)
{
const auto tag = clearIrregular(m->tag);
return m->dialect == Dialect::standard && tag != POSTag::unknown && tag != POSTag::pa && tag != POSTag::pv;
});
f.dialect = accumulate(f.candidate.begin(), f.candidate.end(), f.candidate[0]->dialect, reduceDialect);
if (f.dialect != Dialect::standard && !(enabledDialects & f.dialect))
{

View file

@ -1,5 +1,6 @@
#pragma once
#include <kiwi/Kiwi.h>
#include "UnkFormScorer.h"
namespace kiwi
{
@ -13,6 +14,11 @@ namespace kiwi
{
}
explicit SpecialState(uint8_t val)
{
reinterpret_cast<uint8_t&>(*this) = val;
}
operator uint8_t() const
{
return reinterpret_cast<const uint8_t&>(*this);
@ -68,43 +74,231 @@ namespace kiwi
};
using Path = Vector<PathNode>;
/// A SpecialState and an OOV-count arena offset packed into one 32-bit word.
///
/// Layout of `data`: bits 0..7 hold the SpecialState byte, bits 8..31 hold the
/// arena offset. Because the offset is shifted left by 8 inside a 32-bit
/// field, only its low 24 bits are kept.
/// Ordering and equality compare the packed word as a whole.
struct PackedState
{
    uint32_t data = 0;

    PackedState() = default;

    PackedState(SpecialState state, uint32_t oovCntArenaPtr = 0)
        : data((oovCntArenaPtr << 8) | (uint8_t)state)
    {
    }

    /// The SpecialState stored in the low byte.
    SpecialState specialState() const
    {
        const uint8_t low = (uint8_t)(data & 0xFF);
        return (SpecialState)low;
    }

    /// The arena offset stored in the upper 24 bits.
    uint32_t oovCntArenaPtr() const
    {
        return data >> 8;
    }

    /// Replaces the low byte, leaving the arena offset untouched.
    void setSpecialState(SpecialState state)
    {
        const uint32_t upper = data & 0xFFFFFF00;
        data = upper | (uint8_t)state;
    }

    /// Replaces the arena offset, leaving the state byte untouched.
    void setOovCntArenaPtr(uint32_t ptr)
    {
        const uint32_t low = data & 0xFF;
        data = low | (ptr << 8);
    }

    bool operator<(const PackedState& o) const
    {
        return data < o.data;
    }

    bool operator==(const PackedState& o) const
    {
        return data == o.data;
    }
};
/// Hash support for PackedState: hashes the packed 32-bit word with std::hash.
template<>
struct Hash<PackedState>
{
    size_t operator()(const PackedState& s) const
    {
        const std::hash<uint32_t> hasher{};
        return hasher(s.data);
    }
};
struct PathResult
{
Path path;
float score = 0;
SpecialState prevState;
SpecialState curState;
PackedState prevState;
PackedState curState;
PathResult(Path&& _path = {}, float _score = 0, SpecialState _prevState = {}, SpecialState _curState = {})
PathResult(Path&& _path = {}, float _score = 0, PackedState _prevState = {}, PackedState _curState = {})
: path{ move(_path) }, score{ _score }, prevState{ _prevState }, curState{ _curState }
{
sizeof(PathResult);
}
PathResult(const Path& _path, float _score = 0, SpecialState _prevState = {}, SpecialState _curState = {})
PathResult(const Path& _path, float _score = 0, PackedState _prevState = {}, PackedState _curState = {})
: path{ _path }, score{ _score }, prevState{ _prevState }, curState{ _curState }
{
}
};
template<class LangModel>
struct BestPathFinder
class OovOrForm : public U16StringView
{
static Vector<PathResult> findBestPath(const Kiwi* kw,
const KiwiConfig& config,
const Vector<SpecialState>& prevSpStates,
const KString& normForm,
const KGraphNode* graph,
const size_t graphSize,
const size_t topN,
bool openEnding,
bool splitComplex = false,
bool splitSaisiot = false,
bool mergeSaisiot = false,
const std::unordered_set<const Morpheme*>* blocklist = nullptr,
Dialect allowedDialects = Dialect::standard,
float dialectCost = 0.f
);
public:
explicit OovOrForm(const char16_t* str, size_t len) : U16StringView{ len ? str : nullptr, len } {}
OovOrForm(U16StringView str) : OovOrForm{ str.data(), str.size() } {}
explicit OovOrForm(const Form* form) : U16StringView{ reinterpret_cast<const char16_t*>(form), 0 } {}
const Form* asForm() const
{
if (size() > 0) return nullptr;
return reinterpret_cast<const Form*>(data());
}
U16StringView asOov() const
{
return *this;
}
bool operator==(const OovOrForm& o) const
{
const Form* form1 = asForm();
const Form* form2 = o.asForm();
if (form1 && form2)
{
return form1 == form2;
}
else if (!form1 && !form2)
{
return asOov() == o.asOov();
}
else
{
return false;
}
}
};
using FnFindBestPath = decltype(&BestPathFinder<void>::findBestPath);
template<>
struct Hash<OovOrForm>
{
size_t operator()(const OovOrForm& o) const
{
const Form* form = o.asForm();
if (form)
{
return Hash<const Form*>{}(form);
}
else
{
return Hash<U16StringView>{}(o.asOov());
}
}
};
// Argument bundle for BestPathFinder::findBestPath().
// It is filled positionally with brace initialisation at the call site in
// Kiwi::analyze, so the order of the members is part of the interface.
// The two reference members mean an instance cannot be default-constructed
// and must not outlive the objects it refers to.
struct FindBestPathArgs
{
const Kiwi* kw;
const KiwiConfig& config;
// Packed states of the results accumulated so far (the caller passes spStatesByRet).
const Vector<PackedState>& prevSpStates;
const KString& normForm;
// Node array and its length.
const KGraphNode* graph;
size_t graphSize;
size_t topN;
// The caller passes (size_t)(option.match & Match::oovMask).
size_t oovScoringType;
// The next three pointers are non-null only when the caller enables
// Match::oovTotalConsistency; findBestPath() dispatches on oovTotalMap.
UnorderedMap<U16StringView, size_t>* oovTotalMap;
Vector<uint8_t>* oovTotalCnt;
UnorderedMap<OovOrForm, Vector<uint16_t>>* oovPrefixLists;
// Results produced so far; also passed only with Match::oovTotalConsistency.
const std::vector<TokenResult>* prevResults = nullptr;
bool openEnding;
bool splitComplex = false;
bool splitSaisiot = false;
bool mergeSaisiot = false;
const std::unordered_set<const Morpheme*>* blocklist = nullptr;
Dialect allowedDialects = Dialect::standard;
float dialectCost = 0.f;
// Passed by the caller only when the OOV mode is at least Match::oovChrFreqModel.
const SubstringCounter* substringCounter = nullptr;
};
// Non-owning helper that bundles the inputs needed to score OOV candidates:
// the OOV map and count arena, the node array, a list of candidate indices
// and a smoothing constant. All pointers are borrowed from the caller.
// The scoring itself (score()) is defined outside this header section.
class OovUnigramScorer
{
const UnorderedMap<U16StringView, size_t>* oovTotalMap = nullptr;
const Vector<uint8_t>* oovTotalCnt = nullptr;
const KGraphNode* graph = nullptr;
// Array of oovCandSize candidate indices.
const uint32_t* oovCands = nullptr;
size_t oovCandSize = 0;
float smoothness = 0;
public:
// Stores the given pointers and values as-is; nothing is copied or validated.
OovUnigramScorer(
const UnorderedMap<U16StringView, size_t>* _oovTotalMap,
const Vector<uint8_t>* _oovTotalCnt,
const KGraphNode* _graph,
const uint32_t* _oovCands,
size_t _oovCandSize,
float _smoothness
)
: oovTotalMap{ _oovTotalMap }, oovTotalCnt{ _oovTotalCnt }, graph{ _graph }, oovCands{ _oovCands }, oovCandSize{ _oovCandSize }, smoothness{ _smoothness }
{
}
// True when there are no OOV candidates.
bool empty() const
{
return oovCandSize == 0;
}
// NOTE(review): presumably cntArenaPtr is an offset into oovTotalCnt and
// nodeIdx an index into graph -- confirm against the definition.
float score(uint32_t cntArenaPtr, uint32_t nodeIdx) const;
};
// Best-path search over the node graph, parameterised by the language model.
// The finder inherits its inputs from FindBestPathArgs, so every argument is
// available as a member. The member functions declared here are defined
// outside this header section.
template<class LangModel>
struct BestPathFinder : public FindBestPathArgs
{
using LmState = typename LangModel::LmStateType;
size_t insertOovPrefices(size_t targetNodeIdx, size_t oovIdx);
template<class WordLL, class Func>
void traverseNodesWithEndPos(
Vector<WordLL>& pathes,
const Vector<size_t>& pathIndices,
size_t targetNodeIdx,
Func&& func
) const;
template<class WordLL>
void updateOovTotalMap(
Vector<WordLL>& pathes,
Vector<size_t>& pathIndices,
size_t prevOovIdx, size_t bit, size_t i = -1);
template<class WordLL>
void updatePrefixCnts(
Vector<WordLL>& pathes,
Vector<size_t>& pathIndices,
size_t nodeIdx,
const Vector<uint32_t>& currentOovNodeIdcs);
void findOovNodes(
size_t nodeIdx,
Vector<uint32_t>& oovNodeIdcs
) const;
// Actual search; the template flag selects the variant at compile time.
template<bool useOovTotalConsistency>
Vector<PathResult> findBestPathDispatched();
// Entry point stored behind the FnFindBestPath function pointer.
// Builds a finder from `args` and runs the consistency-aware variant when
// an OOV map was supplied (args.oovTotalMap is non-null), the plain one
// otherwise.
static Vector<PathResult> findBestPath(const FindBestPathArgs& args)
{
BestPathFinder<LangModel> finder{ args };
if (args.oovTotalMap)
{
return finder.findBestPathDispatched<true>();
}
else
{
return finder.findBestPathDispatched<false>();
}
}
};
using FnFindBestPath = Vector<PathResult>(*)(const FindBestPathArgs&);
}

File diff suppressed because it is too large Load diff

View file

@ -622,7 +622,11 @@ namespace kiwi
{
pa = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i]));
pb = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[i]));
#ifdef _MSC_VER
acc = _mm256_dpbusd_avx_epi32(acc, pa, pb);
#else
acc = _mm256_dpbusd_epi32(acc, pa, pb);
#endif
}
// reduce sum of eight int32_t to one int32_t
__m256i sum = _mm256_hadd_epi32(acc, acc);
@ -892,11 +896,18 @@ namespace kiwi
static STRONG_INLINE int32_t dotprod(const uint8_t* a, const int8_t* b, size_t size)
{
int32x4_t sum = vdupq_n_s32(0);
uint16x8_t pa;
int8x16_t pb;
for (size_t i = 0; i < size; i += 16)
{
//
uint8x16_t pa = vld1q_u8(a + i);
int8x16_t pb = vld1q_s8(b + i);
// Extend a (uint8, 0-255) to int16 via zero-extend, b (int8) via sign-extend
// Product range: 0*(-128) to 255*127 = [-32640, 32385], fits in int16
int16x8_t pa_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pa)));
int16x8_t pa_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pa)));
int16x8_t pb_lo = vmovl_s8(vget_low_s8(pb));
int16x8_t pb_hi = vmovl_s8(vget_high_s8(pb));
sum = vpadalq_s16(sum, vmulq_s16(pa_lo, pb_lo));
sum = vpadalq_s16(sum, vmulq_s16(pa_hi, pb_hi));
}
sum = vpaddq_s32(sum, sum);
sum = vpaddq_s32(sum, sum);

View file

@ -260,27 +260,20 @@ namespace kiwi
}
}
template<class Ty>
class ContainerSearcher
{
std::vector<const Ty*> data;
std::vector<size_t> idx;
const size_t* idcs;
const size_t size;
public:
template<class AllocA, class AllocB>
ContainerSearcher(const std::vector<std::vector<Ty, AllocB>, AllocA>& v)
: data(v.size()), idx(v.size())
template<class Alloc>
ContainerSearcher(const std::vector<size_t, Alloc>& _idcs)
: idcs(_idcs.data()), size(_idcs.size())
{
for (size_t i = 0; i < v.size(); ++i)
{
data[i] = v[i].data();
}
sortWriteIdx(data.begin(), data.end(), idx.begin());
}
size_t operator()(const Ty* v) const
size_t operator()(size_t v) const
{
return idx[(std::upper_bound(data.begin(), data.end(), v) - data.begin()) - 1];
return std::upper_bound(idcs, idcs + size, v) - idcs - 1;
}
};
}

View file

@ -481,28 +481,7 @@ namespace kiwi
inline KString normalizeHangul(It first, It last)
{
KString ret;
ret.reserve((size_t)(std::distance(first, last) * 1.5));
for (; first != last; ++first)
{
char16_t c = *first;
if (c == 0xB42C) c = 0xB410; // '됬'을 '됐'으로 강제교정
if (isHangulSyllable(c))
{
int coda = (c - 0xAC00) % 28;
ret.push_back(c - coda);
if (coda) ret.push_back(coda + 0x11A7);
}
else if (!ret.empty() && isHangulOnset(ret.back())
&& 0x1161 <= c && c < 0x1176)
{
// 첫가끝 초성 + 중성 중 현대한글 음절로 가능한 것은 결합
ret.back() = (char16_t)(0xAC00 + ((ret.back() - 0x1100) * 21 * 28) + ((c - 0x1161) * 28));
}
else
{
ret.push_back(c);
}
}
normalizeHangul(ret, first, last);
return ret;
}

194
src/SubstringCounter.hpp Normal file
View file

@ -0,0 +1,194 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <algorithm>
#include <bitset>
#include <vector>
#include <limits>
#include <kiwi/Types.h>
namespace kiwi
{
// Counts how often each substring occurs inside a UTF-16 buffer.
//
// The buffer is split into segments at U+0020 (space); for every start
// position inside a segment, each substring of length 1..maxLen that stays
// inside the segment is counted. Counts live in an open-addressing hash table
// with linear probing, keyed by a rolling hash that can be extended one
// character at a time (initHash / extendHash).
//
// NOTE: the table stores raw pointers into the buffer handed to the
// constructor, so that buffer must stay alive and unchanged for as long as
// count() is called on this object.
class SubstringCounter
{
	struct Entry
	{
		const char16_t* ptr = nullptr; // first character of the substring; nullptr marks an empty slot
		uint32_t hash = 0;             // rolling hash of the substring
		uint16_t length = 0;           // substring length in UTF-16 code units
		uint16_t count = 0;            // number of occurrences (saturates at the uint16_t maximum)
	};

	Vector<Entry> table;    // power-of-two sized slot array
	uint32_t mask = 0;      // table.size() - 1, used to wrap slot indices
	size_t entryCount = 0;  // number of occupied slots
	Vector<char16_t> chars; // distinct characters seen, in ascending code unit order

	static constexpr uint32_t kPrime = 0x01000193;
	static constexpr uint32_t kOffset = 0x811c9dc5;

	// Doubles the table and re-inserts every occupied slot.
	void grow()
	{
		const size_t newSize = table.size() * 2;
		const uint32_t newMask = (uint32_t)(newSize - 1);
		Vector<Entry> newTable(newSize);
		for (const auto& e : table)
		{
			if (!e.ptr) continue;
			size_t slot = e.hash & newMask;
			while (newTable[slot].ptr)
			{
				slot = (slot + 1) & newMask;
			}
			newTable[slot] = e;
		}
		table = std::move(newTable);
		mask = newMask;
	}

	// Adds one occurrence of the substring [ptr, ptr + length) whose hash is `hash`.
	void insertOrIncrement(uint32_t hash, const char16_t* ptr, size_t length)
	{
		if (entryCount * 5 > table.size() * 3) // load factor > 0.6
		{
			grow();
		}

		size_t slot = hash & mask;
		while (true)
		{
			auto& e = table[slot];
			if (!e.ptr)
			{
				e.hash = hash;
				e.ptr = ptr;
				e.length = (uint16_t)length;
				e.count = 1;
				++entryCount;
				return;
			}
			if (e.hash == hash && e.length == length &&
				std::memcmp(e.ptr, ptr, length * sizeof(char16_t)) == 0)
			{
				if (e.count < std::numeric_limits<decltype(e.count)>::max())
				{
					++e.count;
				}
				return;
			}
			slot = (slot + 1) & mask;
		}
	}

public:
	SubstringCounter() = default;

	// Builds the counter over data[0 .. size).
	// `maxLen` is the longest substring length to count. It is clamped to the
	// largest value Entry::length can hold: a larger value would be truncated
	// when stored (so the entry could never be matched again), and a huge
	// value could make `i + maxLen` wrap around.
	SubstringCounter(const char16_t* data, size_t size, size_t maxLen = 32)
	{
		const size_t lengthLimit = std::numeric_limits<uint16_t>::max();
		maxLen = std::min(maxLen, lengthLimit);

		// estimate initial table size
		const size_t estimatedEntries = size * 8;
		size_t tableSize = 16;
		while (tableSize < estimatedEntries * 2) tableSize *= 2;
		table.resize(tableSize);
		mask = (uint32_t)(tableSize - 1);
		entryCount = 0;

		// collect unique chars
		std::bitset<0x10000> seen;

		size_t segStart = 0;
		for (size_t s = 0; s <= size; ++s)
		{
			// a segment ends at a space or at the end of the buffer
			if (s != size && data[s] != u' ') continue;

			for (size_t i = segStart; i < s; ++i)
			{
				uint32_t rollingHash = 0;
				const size_t jEnd = std::min(i + maxLen, s);
				for (size_t j = i; j < jEnd; ++j)
				{
					const auto c = data[j];
					rollingHash = (j == i) ? initHash(c) : extendHash(rollingHash, c);
					insertOrIncrement(rollingHash, &data[i], j - i + 1);
					seen.set(c);
				}
			}
			segStart = s + 1;
		}

		// build sorted unique chars vector
		for (size_t i = 0; i < 0x10000; ++i)
		{
			if (seen[i])
			{
				chars.push_back((char16_t)i);
			}
		}
	}

	// Returns the stored count of the substring [data, data + len), where `hash`
	// must be the value produced by initHash/extendHash (or hash()) for it.
	// Returns 0 when the substring was never inserted or the table is empty.
	size_t count(uint32_t hash, const char16_t* data, size_t len) const
	{
		if (table.empty()) return 0;
		size_t slot = hash & mask;
		while (true)
		{
			const auto& e = table[slot];
			if (!e.ptr) return 0;
			if (e.hash == hash && e.length == len &&
				std::memcmp(e.ptr, data, len * sizeof(char16_t)) == 0)
			{
				return e.count;
			}
			slot = (slot + 1) & mask;
		}
	}

	// Convenience overload that hashes `str` first.
	size_t count(U16StringView str) const
	{
		return count(hash(str), str.data(), str.size());
	}

	// Distinct characters encountered while counting, in ascending order.
	const Vector<char16_t>& getUniqueChars() const
	{
		return chars;
	}

	// Hash of the one-character string `c`.
	static uint32_t initHash(char16_t c)
	{
		return (uint32_t)c * kPrime + kOffset;
	}

	// Hash of a string extended by `c`, given the hash `prev` of the string so far.
	static uint32_t extendHash(uint32_t prev, char16_t c)
	{
		return prev * kPrime + (uint32_t)c;
	}

	// Hash of data[0 .. len); the empty string hashes to kOffset.
	static uint32_t hash(const char16_t* data, size_t len)
	{
		if (len == 0) return kOffset;
		uint32_t h = initHash(data[0]);
		for (size_t i = 1; i < len; ++i)
		{
			h = extendHash(h, data[i]);
		}
		return h;
	}

	static uint32_t hash(U16StringView str)
	{
		return hash(str.data(), str.size());
	}
};
}

View file

@ -3,6 +3,7 @@
#include <kiwi/Utils.h>
#include "StrUtils.h"
#include "FrozenTrie.hpp"
#include "FeatureTestor.h"
using namespace std;
using namespace kiwi;
@ -224,7 +225,7 @@ void TypoTransformer::addTypoWithCond(const KString& orig, const KString& error,
{
if (orig == error) return;
if (leftCond == CondVowel::none || leftCond == CondVowel::vowel || leftCond == CondVowel::any)
if (leftCond == CondVowel::none || leftCond == CondVowel::vowel || leftCond == CondVowel::any || leftCond == CondVowel::continual || leftCond == CondVowel::boundary)
{
auto inserted = typos.emplace(make_tuple(orig, error, leftCond, dialect), cost);
if (!inserted.second)
@ -431,13 +432,19 @@ namespace kiwi
PreparedTypoTransformer::PreparedTypoTransformer() = default;
PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt)
PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt, bool inverse)
: continualTypoThreshold{ tt.continualTypoThreshold }, lengtheningTypoThreshold{ tt.lengtheningTypoThreshold }
{
IntermediateTypoTransformer itt;
for (auto& t : tt.typos)
{
itt.addTypo(get<0>(t.first), get<1>(t.first), t.second, get<2>(t.first), get<3>(t.first));
itt.addTypo(
inverse ? get<1>(t.first) : get<0>(t.first),
inverse ? get<0>(t.first) : get<1>(t.first),
t.second,
get<2>(t.first),
get<3>(t.first)
);
}
strPool = std::move(itt.strPool);
@ -445,20 +452,25 @@ PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt)
for (auto& rs : itt.replacements) tot += rs.size();
replacements.reserve(tot);
Vector<std::pair<const ReplInfo*, uint32_t>> patData;
Vector<pair<const ReplInfo*, uint32_t>> patData;
for (auto& rs : itt.replacements)
{
patData.emplace_back(replacements.data() + replacements.size(), rs.size());
for (auto& r : rs)
{
replacements.emplace_back(strPool.data() + r.begin, r.end - r.begin, r.cost, r.leftCond, r.dialect);
auto rBegin = r.begin;
if (inverse && r.leftCond == CondVowel::applosive && strPool[rBegin] == 0)
{
rBegin++;
}
replacements.emplace_back(strPool.data() + rBegin, r.end - rBegin, r.cost, r.leftCond, r.dialect);
}
}
patTrie = decltype(patTrie){ itt.patTrie, ArchTypeHolder<ArchType::none>{}, [&](const IntermediateTypoTransformer::TrieNode& o) -> PatInfo
{
uint32_t depth = o.depth;
if (o.val && patData[o.val - 1].first->leftCond == CondVowel::applosive)
if (!inverse && o.val && patData[o.val - 1].first->leftCond == CondVowel::applosive)
{
depth--;
}
@ -569,6 +581,463 @@ TypoCandidates<true> PreparedTypoTransformer::generate(const u16string& orig, fl
return _generate<true>(normalizeHangul(orig), costThreshold);
}
/*
* ->
'' Char DAG는 .
Idx 0 1 2
BOS -> -> -> EOS
-> -> -> EOS
*/
// Appends a node for `form` to `nodes` and links it into the graph.
//
// endPosMap[pos - endPosMapOffset] holds (first node id, last node id) of the
// nodes that end at text position `pos`; a `first` of npos means none exists.
//  - startPos == -1: the new node's predecessor is the node appended just before it.
//  - otherwise the predecessor is the first node ending at `startPos`; when no
//    such node exists nothing is appended and false is returned.
// Nodes ending at the same position are chained through siblingOffset.
// Extra `args` are forwarded to the TypoGraphNode constructor.
template<class Alloc, class... Args>
inline bool appendNewNode(vector<TypoGraphNode, Alloc>& nodes, Vector<pair<uint32_t, uint32_t>>& endPosMap, size_t endPosMapOffset, U16StringView form, size_t startPos, size_t endPos, Args&&... args)
{
	static constexpr uint32_t npos = -1;

	const bool followsPrevious = (startPos == -1);
	if (!followsPrevious && endPosMap[startPos - endPosMapOffset].first == npos)
	{
		return false;
	}

	const size_t newId = nodes.size();
	nodes.emplace_back(form, endPos, forward<Args>(args)...);
	TypoGraphNode& added = nodes.back();

	if (followsPrevious)
	{
		added.prevOffset = newId - 1;
	}
	else
	{
		// absolute offset for now, will be converted to relative offset later
		added.prevOffset = endPosMap[startPos - endPosMapOffset].first;
	}

	// End position lies outside the window tracked by endPosMap: nothing to register.
	if (added.endPos >= endPosMap.size() + endPosMapOffset) return true;

	auto& ending = endPosMap[added.endPos - endPosMapOffset];
	if (ending.first == npos)
	{
		ending.first = newId;
	}
	else
	{
		// absolute offset for now, will be converted to relative offset later
		nodes[ending.second].siblingOffset = newId;
	}
	ending.second = newId;
	return true;
}
// Rebuilds the precomposed Hangul syllable `c` with its initial consonant
// replaced by the one at index `onset` (ㅇ = 11, ㅎ = 18), keeping the vowel
// and the final consonant. Returns 0 when isHangulSyllable(c) is false.
inline char16_t overrideOnset(char16_t c, const int onset = 11)
{
	if (!isHangulSyllable(c)) return 0;
	const int offset = c - 0xAC00;
	const int vowel = offset / 28 % 21;
	const int coda = offset % 28;
	return 0xAC00 + (onset * 21 + vowel) * 28 + coda;
}
// Handles the case where a coda followed by an onset ㅇ was written with
// liaison spelling (the coda carried over into the next syllable's onset).
struct ContinualIeungDecomposer
{
static constexpr size_t boundaryId = 1;
// Returns the final-consonant (jongseong) code point that corresponds to the
// initial consonant of `c`, or 0 when there is no mapping.
// `c` may be a precomposed syllable (its onset index selects the table entry)
// or one of the single characters listed in the switch below.
// `prev` is not used by this implementation.
char16_t onsetToCoda(char16_t c, char16_t prev)
{
// Indexed by onset index; 0 marks onsets without a matching coda here.
static constexpr char16_t o2c[] = {
0x11A8, // ㄱ
0x11A9, // ㄲ
0x11AB, // ㄴ
0x11AE, // ㄷ
0, // ㄸ
0x11AF, // ㄹ
0x11B7, // ㅁ
0x11B8, // ㅂ
0, // ㅃ
0x11BA, // ㅅ
0x11BB, // ㅆ
0, // ㅇ
0x11BD, // ㅈ
0, // ㅉ
0x11BE, // ㅊ
0x11BF, // ㅋ
0x11C0, // ㅌ
0x11C1, // ㅍ
0x11C2, // ㅎ
};
if (isHangulSyllable(c))
{
int onset = (c - 0xAC00) / 28 / 21;
return o2c[onset];
}
// Not a precomposed syllable: map a standalone consonant character.
switch (c)
{
case u'': return 0x11A8;
case u'': return 0x11A9;
case u'': return 0x11AB;
case u'': return 0x11AE;
case u'': return 0x11AF;
case u'': return 0x11B7;
case u'': return 0x11B8;
case u'': return 0x11BA;
case u'': return 0x11BB;
case u'': return 0x11BD;
case u'': return 0x11BE;
case u'': return 0x11BF;
case u'': return 0x11C0;
case u'': return 0x11C1;
case u'': return 0x11C2;
default: return 0;
}
// Not reached: every switch path above returns.
return 0;
}
// Returns `c` with its onset replaced by ㅇ (index 11); 0 if `c` is not a Hangul syllable.
char16_t dropRightSyllable(char16_t c)
{
return overrideOnset(c, 11);
}
};
// Handles the case where a coda followed by an onset ㅎ was written with
// liaison spelling (the two merged into the next syllable's onset).
struct ContinualHieutDecomposer
{
	static constexpr size_t boundaryId = 2;

	// Returns the final-consonant (jongseong) code point implied by the onset of
	// the precomposed syllable `c`, or 0 when `c` is not a Hangul syllable or
	// its onset has no mapping in the table.
	char16_t onsetToCoda(char16_t c)
	{
		// Indexed by onset index; 0 means "no decomposition".
		static constexpr char16_t codaByOnset[] = {
			0, // ㄱ
			0, // ㄲ
			0x11AB, // ㄴ
			0, // ㄷ
			0, // ㄸ
			0x11AF, // ㄹ
			0x11B7, // ㅁ
			0, // ㅂ
			0, // ㅃ
			0x11BA, // ㅅ
			0, // ㅆ
			0, // ㅇ
			0, // ㅈ
			0, // ㅉ
			0x11BD, // ㅊ->ㅈ
			0x11A8, // ㅋ->ㄱ
			0x11AE, // ㅌ->ㄷ
			0x11B8, // ㅍ->ㅂ
			0, // ㅎ
		};
		if (!isHangulSyllable(c)) return 0;
		const int onsetIdx = (c - 0xAC00) / 28 / 21;
		return codaByOnset[onsetIdx];
	}

	// Returns `c` with its onset replaced by ㅎ (index 18); 0 if `c` is not a Hangul syllable.
	char16_t dropRightSyllable(char16_t c)
	{
		return overrideOnset(c, 18);
	}
};
// Handles the case where a coda ㅎ followed by a non-ㅎ onset was written with
// liaison spelling (the two merged into an aspirated onset).
struct ContinualCodaDecomposer
{
	static constexpr size_t boundaryId = 3;

	// Returns the coda ㅎ (0x11C2) when the onset of the precomposed syllable `c`
	// is one of ㅊ/ㅋ/ㅌ/ㅍ; returns 0 otherwise, including when `c` is not a
	// Hangul syllable.
	char16_t onsetToCoda(char16_t c)
	{
		// Indexed by onset index; 0 means "no decomposition".
		static constexpr char16_t codaByOnset[] = {
			0, // ㄱ
			0, // ㄲ
			0, // ㄴ
			0, // ㄷ
			0, // ㄸ
			0, // ㄹ
			0, // ㅁ
			0, // ㅂ
			0, // ㅃ
			0, // ㅅ
			0, // ㅆ
			0, // ㅇ
			0, // ㅈ
			0, // ㅉ
			0x11C2, // ㅊ->ㅎ
			0x11C2, // ㅋ->ㅎ
			0x11C2, // ㅌ->ㅎ
			0x11C2, // ㅍ->ㅎ
			0, // ㅎ
		};
		if (!isHangulSyllable(c)) return 0;
		const int onsetIdx = (c - 0xAC00) / 28 / 21;
		return codaByOnset[onsetIdx];
	}

	// Rebuilds `c` with its onset replaced through `plainOnset`, keeping the
	// vowel and the coda. `c` is decomposed without a syllable check here.
	char16_t dropRightSyllable(char16_t c)
	{
		// Replacement onset index per original onset index.
		static constexpr char16_t plainOnset[] = {
			0, // ㄱ
			0, // ㄲ
			0, // ㄴ
			0, // ㄷ
			0, // ㄸ
			0, // ㄹ
			0, // ㅁ
			0, // ㅂ
			0, // ㅃ
			0, // ㅅ
			0, // ㅆ
			0, // ㅇ
			0, // ㅈ
			0, // ㅉ
			12, // ㅊ->ㅈ
			0, // ㅋ->ㄱ
			3, // ㅌ->ㄷ
			7, // ㅍ->ㅂ
			0, // ㅎ
		};
		const int offset = c - 0xAC00;
		const int onset = offset / 28 / 21;
		const int vowel = offset / 28 % 21;
		const int coda = offset % 28;
		return 0xAC00 + (plainOnset[onset] * 21 + vowel) * 28 + coda;
	}
};
// Builds a graph of `str` together with its typo-replacement alternatives.
//
// The text is scanned once with the pattern trie (following fail links on a
// mismatch). Matches that overlap are collected into one cluster; for each
// cluster `insertBranch` emits nodes for the original text pieces and for every
// applicable replacement. Spans listed in [pretokenizedFirst, pretokenizedLast)
// are emitted as a single node and skipped by the matcher.
// Finally the nodes are stably ordered by end position and their prev/sibling
// links are converted from absolute indices to relative offsets.
//
// graphOut                is cleared and receives the resulting nodes.
// allowedDialect          replacements whose dialect is neither standard nor
//                         included in this mask are ignored.
// maxContinualTypoIdxOut  optional; raised to the largest
//                         (number of continual/boundary heads + 1) seen for a match.
// Returns graphOut.size().
template<class Alloc>
size_t PreparedTypoTransformer::generateGraph(U16StringView str,
vector<TypoGraphNode, Alloc>& graphOut,
Dialect allowedDialect,
const pair<uint32_t, uint32_t>* pretokenizedFirst,
const pair<uint32_t, uint32_t>* pretokenizedLast,
size_t* maxContinualTypoIdxOut
) const
{
// NOTE(review): not referenced again in this function; the check is repeated inline below.
const bool continualTypoEnabled = isfinite(continualTypoThreshold);
static constexpr size_t npos = -1;
using MatchInfo = tuple<size_t, PatInfo>; // (endPos, patternInfo)
// Scratch buffers are thread_local and are cleared before use on each call.
thread_local Vector<TypoGraphNode> tempGraph;
thread_local Vector<MatchInfo> matches;
thread_local Vector<size_t> breakPoints;
thread_local Vector<pair<uint32_t, uint32_t>> endPosMap; // (first position, last position)
thread_local UnorderedMap<char16_t, pair<size_t, size_t>> continualTypoIdxMap;
matches.clear();
endPosMap.clear();
endPosMap.emplace_back(0, 0);
// `last` is the text position up to which nodes have already been emitted.
size_t last = 0;
tempGraph.clear();
// Node 0: an empty node ending at position 0.
tempGraph.emplace_back(U16StringView{ str.data(), 0 }, 0);
// Emits the nodes for the current cluster of matches, then advances `last`
// to the end of the cluster and clears `matches`.
const auto& insertBranch = [&]()
{
const size_t totStartPos = get<size_t>(matches[0]) - get<PatInfo>(matches[0]).patLength;
const size_t totEndPos = get<size_t>(matches.back());
// Re-base endPosMap on `last`: slot 0 keeps the nodes ending at `last`,
// every other slot starts out empty (npos).
const auto v = endPosMap.back();
endPosMap.clear();
endPosMap.resize((totEndPos - last) + 1, make_pair(npos, npos));
endPosMap[0] = v;
// Break points: the cluster start plus every match end position.
breakPoints.clear();
breakPoints.emplace_back(totStartPos);
for (auto& m : matches)
{
const size_t e = get<size_t>(m);
// NOTE(review): `s` is computed but not used inside this loop.
const size_t s = e - get<PatInfo>(m).patLength;
breakPoints.emplace_back(e);
}
breakPoints.emplace_back(totEndPos);
sort(breakPoints.begin(), breakPoints.end());
breakPoints.erase(unique(breakPoints.begin(), breakPoints.end()), breakPoints.end());
// Order matches by start position.
sort(matches.begin(), matches.end(), [](const MatchInfo& a, const MatchInfo& b)
{
return get<size_t>(a) - get<PatInfo>(a).patLength < get<size_t>(b) - get<PatInfo>(b).patLength;
}
);
// Unmodified text between the previous cluster and this one.
if (last < totStartPos)
{
appendNewNode(tempGraph, endPosMap, last, U16StringView{ str.data() + last, totStartPos - last }, last, totStartPos);
}
// Original text, cut at the break points.
for (size_t i = 1; i < breakPoints.size(); ++i)
{
appendNewNode(tempGraph, endPosMap, last, U16StringView{ str.data() + breakPoints[i - 1], breakPoints[i] - breakPoints[i - 1] }, breakPoints[i - 1], breakPoints[i]);
}
// Replacement nodes for every match.
for (auto& m : matches)
{
auto [endPos, patInfo] = m;
const size_t e = endPos;
const size_t s = e - patInfo.patLength;
// Maps the first character of a continual/boundary replacement to
// (1-based index, index of its head node); reset for each match.
continualTypoIdxMap.clear();
for (size_t j = 0; j < patInfo.size; ++j)
{
auto& repl = patInfo.repl[j];
if (repl.dialect != Dialect::standard && !(allowedDialect & repl.dialect)) continue;
if (repl.leftCond == CondVowel::vowel)
{
if (s == 0 || !isHangulSyllable(str[s - 1])) continue;
}
else if (repl.leftCond == CondVowel::any)
{
if (s == 0) continue;
}
else if (repl.leftCond == CondVowel::continual || repl.leftCond == CondVowel::boundary)
{
if (repl.leftCond == CondVowel::continual && (s == 0 || !isHangulSyllable(str[s - 1]))) continue;
if (repl.leftCond == CondVowel::continual && !isfinite(continualTypoThreshold)) continue;
const float scale = repl.leftCond == CondVowel::continual ? continualTypoThreshold : 1.f;
// These replacements become two nodes: a head holding the first character
// and a tail holding the rest, each carrying half of the scaled cost.
// Replacements sharing the same first character share one head node.
const auto [it, inserted] = continualTypoIdxMap.emplace(*repl.str, make_pair(continualTypoIdxMap.size() + 1, 0));
auto& [continualTypoIdx, continualTypoNodeIdx] = it->second;
if (inserted)
{
// The head is appended with endPos = -1 so it is not registered in
// endPosMap; its real end position is assigned right after.
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str, 1 }, s, -1, repl.cost * scale / 2))
{
tempGraph.back().endPos = e;
tempGraph.back().continualTypoIdx = continualTypoIdx;
tempGraph.back().dialect = repl.dialect;
continualTypoNodeIdx = tempGraph.size() - 1;
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str + 1, repl.length - 1 }, -1, e, repl.cost * scale / 2))
{
tempGraph.back().prevOffset = continualTypoNodeIdx;
tempGraph.back().dialect = repl.dialect;
}
}
else
{
// No node ends at `s`, so the head could not be attached: forget it.
continualTypoIdxMap.erase(it);
}
}
else
{
// Head already exists: add only the tail and link it to that head.
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str + 1, repl.length - 1 }, -1, e, repl.cost * scale / 2))
{
tempGraph.back().prevOffset = continualTypoNodeIdx;
tempGraph.back().dialect = repl.dialect;
}
}
continue;
}
else
{
if (!FeatureTestor::isMatched(str.data(), str.data() + s, repl.leftCond)) continue;
}
// Ordinary replacement: a single node spanning [s, e).
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str, repl.length }, s, e, repl.cost))
{
tempGraph.back().dialect = repl.dialect;
}
}
if (maxContinualTypoIdxOut)
{
*maxContinualTypoIdxOut = max(*maxContinualTypoIdxOut, continualTypoIdxMap.size() + 1);
}
}
last = totEndPos;
matches.clear();
};
// Start matching from the trie state reached by feeding character 0 to the root.
auto node = patTrie.root()->nextOpt<ArchType::none>(patTrie, 0);
for (size_t i = 0; i < str.size(); ++i)
{
// A pretokenized span starts here: flush pending matches, emit the text from
// `last` to the end of the span as one node, and continue after the span.
if (pretokenizedFirst < pretokenizedLast && pretokenizedFirst->first == i)
{
const auto prevLast = last;
if (!matches.empty())
{
insertBranch();
}
node = patTrie.root();
appendNewNode(tempGraph, endPosMap, prevLast,
U16StringView{ str.data() + last, pretokenizedFirst->second - last },
last, pretokenizedFirst->second
);
last = pretokenizedFirst->second;
endPosMap.clear();
endPosMap.emplace_back(tempGraph.size() - 1, tempGraph.size() - 1);
i += pretokenizedFirst->second - pretokenizedFirst->first - 1;
++pretokenizedFirst;
continue;
}
// Advance the trie; on a mismatch follow fail links until a transition
// exists, or restart from the root when the links run out.
auto nnode = node->nextOpt<ArchType::none>(patTrie, str[i]);
while (!nnode)
{
node = node->fail();
if (node)
{
nnode = node->nextOpt<ArchType::none>(patTrie, str[i]);
}
else
{
node = patTrie.root();
break;
}
}
if (!nnode) continue;
node = nnode;
auto& v = node->val(patTrie);
if (patTrie.isNull(v)) continue;
const size_t endPos = i + 1;
const size_t startPos = endPos - v.patLength;
// The new match starts after the end of the last collected match, so the
// pending cluster is complete: emit it before starting a new one.
if (!matches.empty() && get<size_t>(matches.back()) < startPos)
{
insertBranch();
}
// Record this match and the matches reachable through fail links.
for (auto sub = node; sub; sub = sub->fail())
{
auto& sv = sub->val(patTrie);
if (patTrie.isNull(sv)) break;
if (patTrie.hasSubmatch(sv)) continue;
matches.emplace_back(endPos, sv);
}
}
if (!matches.empty())
{
insertBranch();
}
// Final node covering the rest of the text.
const auto v = endPosMap.back();
endPosMap.clear();
endPosMap.resize(1);
endPosMap[0] = v;
// endPos is passed as str.size() + 1 so that appendNewNode returns before
// indexing the one-element endPosMap; the real end position is set afterwards.
appendNewNode(tempGraph, endPosMap, last, U16StringView{ str.data() + last, str.size() - last }, last, str.size() + 1);
tempGraph.back().endPos = str.size();
// Reuse `breakPoints` as index storage: the first half holds the node order
// sorted by endPos, the second half holds the inverse permutation.
auto& sortIdx = breakPoints;
sortIdx.clear();
sortIdx.resize(tempGraph.size() * 2);
auto reverseIdx = sortIdx.begin() + tempGraph.size();
iota(sortIdx.begin(), reverseIdx, 0);
stable_sort(sortIdx.begin(), reverseIdx, [&](size_t a, size_t b)
{
return tempGraph[a].endPos < tempGraph[b].endPos;
}
);
for (size_t i = 0; i < tempGraph.size(); ++i)
{
reverseIdx[sortIdx[i]] = i;
}
// Copy the nodes out in sorted order and turn the absolute links into
// relative offsets (prev: distance backwards, sibling: distance forwards).
graphOut.clear();
graphOut.reserve(tempGraph.size());
for (size_t i = 0; i < tempGraph.size(); ++i)
{
graphOut.push_back(tempGraph[sortIdx[i]]);
auto& n = graphOut.back();
n.prevOffset = i - reverseIdx[n.prevOffset];
if (n.siblingOffset != 0) n.siblingOffset = reverseIdx[n.siblingOffset] - i;
}
return graphOut.size();
}
namespace kiwi
{
template class TypoCandidates<true>;
@ -579,6 +1048,12 @@ namespace kiwi
template TypoCandidates<true> PreparedTypoTransformer::_generate<true>(const KString&, float) const;
template TypoCandidates<false> PreparedTypoTransformer::_generate<false>(const KString&, float) const;
template size_t PreparedTypoTransformer::generateGraph<allocator<TypoGraphNode>>(
U16StringView, vector<TypoGraphNode, allocator<TypoGraphNode>>&, Dialect, const pair<uint32_t, uint32_t>*, const pair<uint32_t, uint32_t>*, size_t*) const;
#ifdef KIWI_USE_MIMALLOC
template size_t PreparedTypoTransformer::generateGraph<mi_stl_allocator<TypoGraphNode>>(
U16StringView, vector<TypoGraphNode, mi_stl_allocator<TypoGraphNode>>&, Dialect, const pair<uint32_t, uint32_t>*, const pair<uint32_t, uint32_t>*, size_t*) const;
#endif
const TypoTransformer& getDefaultTypoSet(DefaultTypoSet set)
{
@ -688,6 +1163,46 @@ namespace kiwi
TypoDef{ {u""}, {u"ᆯᇁ"}, 1e-12f, CondVowel::none },
TypoDef{ {u""}, {u"ᆯᇂ"}, 1e-12f, CondVowel::none },
TypoDef{ {u""}, {u"ᆸᆺ", u"ᆸᆻ"}, 1e-12f, CondVowel::none },
TypoDef{ {u"ᆨᄋ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆩᄋ", u"ᆨᄀ"}, {u"", u"ᆨᄀ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆪᄋ", u"ᆪᄒ"}, {u"ᆨᄉ", u"ᆨᄊ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆫᄋ", u"ᆫᄒ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆬᄋ", u"ᆫᄌ"}, {u"ᆬᄋ", u"ᆫᄌ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆭᄋ"}, {u"ᆫᄒ", u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆭᄀ"}, {u"ᆫᄏ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆭᄃ"}, {u"ᆫᄐ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆭᄇ"}, {u"ᆫᄑ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆭᄉ"}, {u"ᆫᄉ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆭᄌ"}, {u"ᆫᄎ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆮᄋ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆯᄋ", u"ᆯᄒ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆰᄋ"}, {u"ᆯᄀ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆰᄀ"}, {u"ᆯᄁ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆰᄒ"}, {u"ᆯᄏ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆱᄋ", u"ᆱᄒ"}, {u"ᆯᄆ"}, 1.f, CondVowel::continual},
TypoDef{ {u"ᆲᄋ"}, {u"ᆯᄇ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆲᄇ"}, {u"ᆯᄈ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆲᄒ"}, {u"ᆯᄑ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆳᄋ"}, {u"ᆯᄉ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆳᄉ"}, {u"ᆯᄊ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆴᄋ", u"ᆴᄐ", u"ᆴᄒ"}, {u"ᆯᄐ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆵᄋ", u"ᆵᄑ", u"ᆵᄒ"}, {u"ᆯᄑ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆶᄉ"}, {u"ᆯᄉ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆶᄋ", u"ᆶᄒ"}, {u"ᆯᄒ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆷᄋ", u"ᆷᄒ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆸᄋ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆸᄇ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆹᄋ", u"ᆹᄒ"}, {u"ᆸᄉ", u"ᆸᄊ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆺᄋ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆻᄋ", u"ᆺᄉ"}, {u"", u"ᆺᄉ"}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆽᄋ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆽᄌ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆾᄋ", u"ᆾᄒ", u"ᆽᄒ", u"ᇂᄌ", u"ᇂᄎ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᆿᄋ", u"ᆿᄒ", u"ᆨᄒ", u"ᇂᄀ", u"ᇂᄏ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᇀᄋ", u"ᇀᄒ", u"ᆮᄒ", u"ᇂᄃ", u"ᇂᄐ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᇁᄋ", u"ᇁᄒ", u"ᆸᄒ", u"ᇂᄇ", u"ᇂᄑ"}, {u""}, 1.f, CondVowel::continual },
TypoDef{ {u"ᇂᄋ"}, {u""}, 1.f, CondVowel::continual },
});
static const TypoTransformer basicTypoSetWithContinual = basicTypoSet | continualTypoSet;
@ -700,7 +1215,22 @@ namespace kiwi
TypoDef{ {u""}, {u""}, 0.5f, CondVowel::none },
TypoDef{ {u""}, {u""}, 0.5f, CondVowel::none },
TypoDef{ {u"", u""}, {u"", u""}, 1.f, CondVowel::none },
}.copyWithDialectOverriding(Dialect::jeju);
}.copyWithDialectOverriding(Dialect::jeju) | TypoTransformer{
TypoDef{ {u"ㅣ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅏ이", u"ㅐ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅓ이", u"ㅔ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅘ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅚ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅝ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅟ이"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅜ우"}, {u""}, 0.2f, CondVowel::boundary },
TypoDef{ {u"ㅠ우"}, {u""}, 0.2f, CondVowel::boundary },
}.copyWithDialectOverriding(Dialect::hamgyeong) | TypoTransformer{
TypoDef{ {u"ㅣ어"}, {u""}, 0.25f, CondVowel::boundary },
TypoDef{ {u"ㅣ어"}, {u""}, 0.5f, CondVowel::boundary },
}.copyWithDialectOverriding(Dialect::hamgyeong | Dialect::gyeongsang | Dialect::gangwon) | TypoTransformer{
TypoDef{ {u"ㅣ었"}, {u"ㅣᆻ"}, 0.25f, CondVowel::boundary },
}.copyWithDialectOverriding(Dialect::gyeongsang);
switch (set)
{
@ -723,4 +1253,33 @@ namespace kiwi
}
}
const PreparedTypoTransformer* getDefaultPreparedTypoSet(DefaultTypoSet set)
{
static const auto defaultTypoSet = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare(true);
static const auto defaultTypoSetWithContinual = getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual).prepare(true);
static const auto defaultTypoSetWithContinualAndLengthening = getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinualAndLengthening).prepare(true);
static const auto continualTypoSet = getDefaultTypoSet(DefaultTypoSet::continualTypoSet).prepare(true);
static const auto lengtheningTypoSet = getDefaultTypoSet(DefaultTypoSet::lengtheningTypoSet).prepare(true);
static const auto dialect = getDefaultTypoSet(DefaultTypoSet::dialect).prepare(true);
switch (set)
{
case kiwi::DefaultTypoSet::withoutTypo:
return nullptr;
case kiwi::DefaultTypoSet::basicTypoSet:
return &defaultTypoSet;
case kiwi::DefaultTypoSet::continualTypoSet:
return &continualTypoSet;
case kiwi::DefaultTypoSet::basicTypoSetWithContinual:
return &defaultTypoSetWithContinual;
case kiwi::DefaultTypoSet::lengtheningTypoSet:
return &lengtheningTypoSet;
case kiwi::DefaultTypoSet::basicTypoSetWithContinualAndLengthening:
return &defaultTypoSetWithContinualAndLengthening;
case kiwi::DefaultTypoSet::dialect:
return &dialect;
default:
throw invalid_argument{ "Invalid `DefaultTypoSet`" };
}
}
}

190
src/UnkFormScorer.cpp Normal file
View file

@ -0,0 +1,190 @@
#include <kiwi/Dataset.h>
#include "UnkFormScorer.h"
#include "SubstringCounter.hpp"
using namespace std;
using namespace kiwi;
// Stores the scoring parameters. When a character model is supplied, token 0
// (BOS) is fed to it once, starting from bosNodeIdx/bosContextIdx.
// NOTE(review): progressOneStep presumably updates both indices in place so
// that later scoring can start right after BOS — confirm against CoNgramModelBase.
UnkFormScorer::UnkFormScorer(float scale, float bias,
	const lm::CoNgramModelBase* _chrModel, float _chrBias,
	const SubstringCounter* _substringCounter,
	float _globalWeight,
	float _localWeight,
	float _globalMinFreq,
	bool _useChrFreqBranchModel)
	: chrModel{ _chrModel },
	substringCounter{ _substringCounter },
	oovRuleScale{ scale },
	oovRuleBias{ bias },
	chrBias{ _chrBias },
	globalWeight{ _globalWeight },
	localWeight{ _localWeight },
	globalMinFreq{ _globalMinFreq },
	useChrFreqBranchModel{ _useChrFreqBranchModel }
{
	if (!chrModel) return;
	chrModel->progressOneStep(bosNodeIdx, bosContextIdx, 0); // BOS
}
// Length-based score of an unknown form: -(size * oovRuleScale + oovRuleBias),
// with an additional -10 when the first one or two code points of the form
// are recognized by isEmoji().
float UnkFormScorer::ruleBasedScore(const U16StringView& form) const
{
	float penalty = 0;
	if (form.size() > 0)
	{
		// Decode up to two leading code points.
		char32_t chrs[2] = { 0,0 };
		for (size_t i = 0, j = 0; i < form.size() && j < 2; ++j)
		{
			// Merge a surrogate pair only when the second half is actually present.
			// A high surrogate at the very end of the form is taken as a single
			// code unit instead of reading one element past the end.
			if (isHighSurrogate(form[i]) && i + 1 < form.size())
			{
				chrs[j] = mergeSurrogate(form[i], form[i + 1]);
				i += 2;
			}
			else
			{
				chrs[j] = form[i];
				++i;
			}
		}
		if (isEmoji(chrs[0], chrs[1])) penalty = -10;
	}
	return penalty - (form.size() * oovRuleScale + oovRuleBias);
}
float UnkFormScorer::chrBasedScore(const U16StringView& form) const
{
int32_t nodeIdx = bosNodeIdx;
uint32_t contextIdx = bosContextIdx;
ChrTokenizer tokenizer;
float score = 0;
for (char16_t c : form)
{
const size_t token = tokenizer.encodeOne(c);
score += chrModel->progressOneStep(nodeIdx, contextIdx, token);
}
score += chrModel->progressOneStep(nodeIdx, contextIdx, 0); // EOS
score -= chrBias;
return score;
}
float UnkFormScorer::chrFreqBasedScore(const U16StringView& form) const
{
int32_t nodeIdx = bosNodeIdx;
uint32_t contextIdx = bosContextIdx;
ChrTokenizer tokenizer;
float score = 0;
uint32_t rollingHash = 0;
for (size_t i = 0; i < form.size(); ++i)
{
const auto c = form[i];
const size_t depth = chrModel->getNodeDepth(nodeIdx);
const float globalContextFreq = depth < i ? globalMinFreq : max(chrModel->getContextFrequency(contextIdx), globalMinFreq);
const float globalContextFreqSat = tanhf(globalContextFreq / globalWeight) * globalWeight;
const size_t token = tokenizer.encodeOne(c);
const float lprob = chrModel->progressOneStep(nodeIdx, contextIdx, token);
if (i == 0)
{
rollingHash = SubstringCounter::initHash(c);
score += lprob;
}
else
{
const float localContextFreq = (float)substringCounter->count(rollingHash, form.data(), i) - 1;
rollingHash = SubstringCounter::extendHash(rollingHash, c);
if (localContextFreq > 0)
{
const float curFreq = (float)substringCounter->count(rollingHash, form.data(), i + 1) - 1;
if (curFreq < 0) return -99999.f; // should not happen, but just in case
const float localContextFreqSat = tanhf(localContextFreq / localWeight) * localWeight;
const float localFreq = curFreq * (localContextFreqSat / localContextFreq);
const float globalFreq = globalContextFreqSat * expf(lprob);
const float mixedProb = logf((localFreq + globalFreq) / (localContextFreqSat + globalContextFreqSat));
score += mixedProb;
}
else
{
score += lprob;
}
}
}
score += chrModel->progressOneStep(nodeIdx, contextIdx, 0); // EOS
score -= chrBias;
return score;
}
// Branch-aware variant of chrFreqBasedScore().
//
// TODO: not implemented yet. The function currently delegates to
// chrFreqBasedScore(), so both variants return identical scores.
// The unreachable draft that used to follow the return statement was removed:
// it could never run, and it copied the form into a fixed 33-element buffer
// without checking the form's length.
float UnkFormScorer::chrFreqBranchBasedScore(const U16StringView& form) const
{
	return chrFreqBasedScore(form);
}

62
src/UnkFormScorer.h Normal file
View file

@ -0,0 +1,62 @@
#pragma once
#include <kiwi/CoNgramModel.h>
namespace kiwi
{
class SubstringCounter;
// Scores unknown (out-of-vocabulary) forms.
// The scoring strategy is chosen per call from what was supplied at
// construction time; see operator().
class UnkFormScorer
{
	const lm::CoNgramModelBase* chrModel = nullptr;      // optional character model
	const SubstringCounter* substringCounter = nullptr;  // optional substring counts
	float oovRuleScale = 0;
	float oovRuleBias = 0;
	float chrBias = 0;
	float globalWeight = 0;
	float localWeight = 0;
	float globalMinFreq = 4.f;
	int32_t bosNodeIdx = 0;
	uint32_t bosContextIdx = 0;
	bool useChrFreqBranchModel = false;

public:
	UnkFormScorer(float scale, float bias,
		const lm::CoNgramModelBase* _chrModel, float _chrBias,
		const SubstringCounter* _substringCounter,
		float _globalWeight = 60.f,
		float _localWeight = 3.f,
		float _globalMinFreq = 4.f,
		bool _useChrFreqBranchModel = false);

	float ruleBasedScore(const U16StringView& form) const;
	float chrBasedScore(const U16StringView& form) const;
	float chrFreqBasedScore(const U16StringView& form) const;
	float chrFreqBranchBasedScore(const U16StringView& form) const;

	// Dispatches to the scoring strategy matching the available resources:
	//  - no character model                        -> ruleBasedScore
	//  - character model only                      -> chrBasedScore
	//  - character model and substring counter     -> chrFreqBranchBasedScore when
	//    useChrFreqBranchModel is set, chrFreqBasedScore otherwise
	float operator()(const U16StringView& form) const
	{
		if (!chrModel)
		{
			return ruleBasedScore(form);
		}
		if (!substringCounter)
		{
			return chrBasedScore(form);
		}
		return useChrFreqBranchModel
			? chrFreqBranchBasedScore(form)
			: chrFreqBasedScore(form);
	}
};
}

View file

@ -609,7 +609,11 @@ namespace kiwi
{
return [modelPath](const std::string& filename) -> std::unique_ptr<std::istream> {
std::string fullPath = modelPath + "/" + filename;
#if defined(_WIN32) || defined(_WIN64)
auto stream = std::make_unique<std::ifstream>((const wchar_t*)utf8To16(fullPath).c_str(), std::ios::binary);
#else
auto stream = std::make_unique<std::ifstream>(fullPath, std::ios::binary);
#endif
if (!stream->is_open()) {
return nullptr;
}

View file

@ -3,7 +3,11 @@
#include <cstring>
#ifdef USE_VNNI
#ifdef _MSC_VER
#define DPBUSD _mm256_dpbusd_avx_epi32
#else
#define DPBUSD _mm256_dpbusd_epi32
#endif
#define DETAIL detailVnni
#else
#define DPBUSD emulated_dpbusd

View file

@ -1,5 +1,6 @@
#include "../MathFunc.hpp"
#include "../qgemm.hpp"
#include <arm_neon.h>
namespace kiwi
{
@ -22,34 +23,246 @@ namespace kiwi
float* c, size_t ldc
);
// Horizontal sum of the four int32 lanes of `v`.
// vaddvq_s32 performs the across-vector add directly; it is an A64 intrinsic,
// as is the vpaddq_s32 it replaces, and produces the same wrapped 32-bit sum.
static FORCE_INLINE int32_t reduce_sum_s32(int32x4_t v)
{
	return vaddvq_s32(v);
}
// gemv: compute c[i] = (dotprod(a_uint8, b_int8[i]) - bSum[i]) * aScale * bScale[i]
// a: [k uint8][float aScale], b rows: [k int8][float bScale][int32 bSum]
//
// Row i of b starts at b + ldb * i. Rows are consumed four at a time and columns
// sixteen at a time with no tail handling, so this presumably requires
// m % 4 == 0 and k % 16 == 0 -- TODO confirm against the callers.
inline void gemv_neon(size_t m, size_t k, const uint8_t* a, const int8_t* b, size_t ldb, float* c)
{
	// The scale of a is stored as a float directly after its k quantized values.
	const float aScale = *reinterpret_cast<const float*>(a + k);
	float bScale[4];
	int32_t bSum[4];
	const float32x4_t vaScale = vdupq_n_f32(aScale);
	for (size_t mi = 0; mi < m; mi += 4)
	{
		const int8_t* bPtr0 = b + ldb * (mi + 0);
		const int8_t* bPtr1 = b + ldb * (mi + 1);
		const int8_t* bPtr2 = b + ldb * (mi + 2);
		const int8_t* bPtr3 = b + ldb * (mi + 3);
		// One int32x4 accumulator per output row; lanes are summed after the loop.
		int32x4_t sum0 = vdupq_n_s32(0);
		int32x4_t sum1 = vdupq_n_s32(0);
		int32x4_t sum2 = vdupq_n_s32(0);
		int32x4_t sum3 = vdupq_n_s32(0);
		for (size_t j = 0; j < k; j += 16)
		{
			uint8x16_t pa = vld1q_u8(a + j);
			int8x16_t pb0 = vld1q_s8(bPtr0 + j);
			int8x16_t pb1 = vld1q_s8(bPtr1 + j);
			int8x16_t pb2 = vld1q_s8(bPtr2 + j);
			int8x16_t pb3 = vld1q_s8(bPtr3 + j);
			// Extend a (uint8) to int16 via zero-extend; b (int8) via sign-extend
			// Product fits in int16: range [-32640, 32385]
			int16x8_t pa_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pa)));
			int16x8_t pa_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pa)));
			// vpadalq_s16 adds adjacent int16 products pairwise into the int32 lanes.
			sum0 = vpadalq_s16(sum0, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb0))));
			sum0 = vpadalq_s16(sum0, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb0))));
			sum1 = vpadalq_s16(sum1, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb1))));
			sum1 = vpadalq_s16(sum1, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb1))));
			sum2 = vpadalq_s16(sum2, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb2))));
			sum2 = vpadalq_s16(sum2, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb2))));
			sum3 = vpadalq_s16(sum3, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb3))));
			sum3 = vpadalq_s16(sum3, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb3))));
		}
		// Per-row trailer: float scale at offset k, int32 correction term at offset k + 4.
		bScale[0] = *reinterpret_cast<const float*>(bPtr0 + k);
		bScale[1] = *reinterpret_cast<const float*>(bPtr1 + k);
		bScale[2] = *reinterpret_cast<const float*>(bPtr2 + k);
		bScale[3] = *reinterpret_cast<const float*>(bPtr3 + k);
		bSum[0] = *reinterpret_cast<const int32_t*>(bPtr0 + k + 4);
		bSum[1] = *reinterpret_cast<const int32_t*>(bPtr1 + k + 4);
		bSum[2] = *reinterpret_cast<const int32_t*>(bPtr2 + k + 4);
		bSum[3] = *reinterpret_cast<const int32_t*>(bPtr3 + k + 4);
		// Subtract the stored correction in integer arithmetic before converting to float.
		const int32_t sArr[4] = {
			reduce_sum_s32(sum0) - bSum[0],
			reduce_sum_s32(sum1) - bSum[1],
			reduce_sum_s32(sum2) - bSum[2],
			reduce_sum_s32(sum3) - bSum[3]
		};
		const float32x4_t vbScale = vld1q_f32(bScale);
		const float32x4_t vfsums = vcvtq_f32_s32(vld1q_s32(sArr));
		// Write four consecutive outputs c[mi .. mi + 3].
		vst1q_f32(c + mi, vmulq_f32(vmulq_f32(vfsums, vaScale), vbScale));
	}
}
// gemvS8S8: native int8 x int8 GEMV, no bias correction needed
// a: [k int8][float aScale], b rows: [k int8][float bScale][int32 bSum (unused)]
// result[i] = dotprod(a, b[i]) * aScale * bScale[i]
//
// Row i of b starts at b + ldb * i. Rows are consumed four at a time and columns
// sixteen at a time with no tail handling, so this presumably requires
// m % 4 == 0 and k % 16 == 0 -- TODO confirm against the callers.
inline void gemvS8S8_neon(size_t m, size_t k, const int8_t* a, const int8_t* b, size_t ldb, float* c)
{
	// The scale of a is stored as a float directly after its k quantized values.
	const float aScale = *reinterpret_cast<const float*>(a + k);
	float bScale[4];
	const float32x4_t vaScale = vdupq_n_f32(aScale);
	for (size_t mi = 0; mi < m; mi += 4)
	{
		const int8_t* bPtr0 = b + ldb * (mi + 0);
		const int8_t* bPtr1 = b + ldb * (mi + 1);
		const int8_t* bPtr2 = b + ldb * (mi + 2);
		const int8_t* bPtr3 = b + ldb * (mi + 3);
		// One int32x4 accumulator per output row; lanes are summed after the loop.
		int32x4_t sum0 = vdupq_n_s32(0);
		int32x4_t sum1 = vdupq_n_s32(0);
		int32x4_t sum2 = vdupq_n_s32(0);
		int32x4_t sum3 = vdupq_n_s32(0);
		for (size_t j = 0; j < k; j += 16)
		{
			int8x16_t pa = vld1q_s8(a + j);
			int8x16_t pb0 = vld1q_s8(bPtr0 + j);
			int8x16_t pb1 = vld1q_s8(bPtr1 + j);
			int8x16_t pb2 = vld1q_s8(bPtr2 + j);
			int8x16_t pb3 = vld1q_s8(bPtr3 + j);
			// Native int8 x int8 dot product using vmull_s8
			// Product range: [-128*127, 127*127] = [-16256, 16129], fits in int16
			// vpadalq_s16 adds adjacent int16 products pairwise into the int32 lanes.
			sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa), vget_low_s8(pb0)));
			sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa), vget_high_s8(pb0)));
			sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa), vget_low_s8(pb1)));
			sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa), vget_high_s8(pb1)));
			sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa), vget_low_s8(pb2)));
			sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa), vget_high_s8(pb2)));
			sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa), vget_low_s8(pb3)));
			sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa), vget_high_s8(pb3)));
		}
		// Per-row trailer: only the float scale at offset k is read here.
		bScale[0] = *reinterpret_cast<const float*>(bPtr0 + k);
		bScale[1] = *reinterpret_cast<const float*>(bPtr1 + k);
		bScale[2] = *reinterpret_cast<const float*>(bPtr2 + k);
		bScale[3] = *reinterpret_cast<const float*>(bPtr3 + k);
		// bSum correction is not needed: native int8 x int8 gives exact result
		const int32_t sArr[4] = {
			reduce_sum_s32(sum0),
			reduce_sum_s32(sum1),
			reduce_sum_s32(sum2),
			reduce_sum_s32(sum3)
		};
		const float32x4_t vbScale = vld1q_f32(bScale);
		const float32x4_t vfsums = vcvtq_f32_s32(vld1q_s32(sArr));
		// Write four consecutive outputs c[mi .. mi + 3].
		vst1q_f32(c + mi, vmulq_f32(vmulq_f32(vfsums, vaScale), vbScale));
	}
}
// gemvU8U8: centered uint8 x uint8 GEMV (both a and b represent int8 biased by +128)
// result[i] = sum((a-128) * (b[i]-128)) * aScale * bScale[i]
//
// Row i of b starts at b + ldb * i, and each vector carries a float scale right
// after its k quantized values. Rows are consumed four at a time and columns
// sixteen at a time with no tail handling, so this presumably requires
// m % 4 == 0 and k % 16 == 0 -- TODO confirm against the callers.
inline void gemvU8U8_neon(size_t m, size_t k, const uint8_t* a, const uint8_t* b, size_t ldb, float* c)
{
	// XOR mask that flips the top bit of every byte (see the conversion below).
	const uint8x16_t bias = vdupq_n_u8(128);
	const float aScale = *reinterpret_cast<const float*>(a + k);
	float bScale[4];
	const float32x4_t vaScale = vdupq_n_f32(aScale);
	for (size_t mi = 0; mi < m; mi += 4)
	{
		const uint8_t* bPtr0 = b + ldb * (mi + 0);
		const uint8_t* bPtr1 = b + ldb * (mi + 1);
		const uint8_t* bPtr2 = b + ldb * (mi + 2);
		const uint8_t* bPtr3 = b + ldb * (mi + 3);
		// One int32x4 accumulator per output row; lanes are summed after the loop.
		int32x4_t sum0 = vdupq_n_s32(0);
		int32x4_t sum1 = vdupq_n_s32(0);
		int32x4_t sum2 = vdupq_n_s32(0);
		int32x4_t sum3 = vdupq_n_s32(0);
		for (size_t j = 0; j < k; j += 16)
		{
			// Convert from uint8 (0-255) to int8 (-128 to 127) via XOR 0x80
			int8x16_t pa = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(a + j), bias));
			int8x16_t pb0 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr0 + j), bias));
			int8x16_t pb1 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr1 + j), bias));
			int8x16_t pb2 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr2 + j), bias));
			int8x16_t pb3 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr3 + j), bias));
			// Widening int8 x int8 multiply, then pairwise accumulate into int32 lanes.
			sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa), vget_low_s8(pb0)));
			sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa), vget_high_s8(pb0)));
			sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa), vget_low_s8(pb1)));
			sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa), vget_high_s8(pb1)));
			sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa), vget_low_s8(pb2)));
			sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa), vget_high_s8(pb2)));
			sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa), vget_low_s8(pb3)));
			sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa), vget_high_s8(pb3)));
		}
		// Per-row trailer: the float scale stored at offset k.
		bScale[0] = *reinterpret_cast<const float*>(bPtr0 + k);
		bScale[1] = *reinterpret_cast<const float*>(bPtr1 + k);
		bScale[2] = *reinterpret_cast<const float*>(bPtr2 + k);
		bScale[3] = *reinterpret_cast<const float*>(bPtr3 + k);
		const int32_t sArr[4] = {
			reduce_sum_s32(sum0),
			reduce_sum_s32(sum1),
			reduce_sum_s32(sum2),
			reduce_sum_s32(sum3)
		};
		const float32x4_t vbScale = vld1q_f32(bScale);
		const float32x4_t vfsums = vcvtq_f32_s32(vld1q_s32(sArr));
		// Write four consecutive outputs c[mi .. mi + 3].
		vst1q_f32(c + mi, vmulq_f32(vmulq_f32(vfsums, vaScale), vbScale));
	}
}
// NEON specialization of gemv (uint8 activations x int8 weights).
// Forwards to gemv_neon; the earlier "Not implemented yet" throw made the
// implementation unreachable and has been dropped.
template<>
void gemv<ArchType::neon>(size_t m, size_t k, const uint8_t* a, const int8_t* b, size_t ldb, float* c)
{
	gemv_neon(m, k, a, b, ldb, c);
}
// NEON specialization of gemvS8S8 (int8 x int8).
// Forwards to gemvS8S8_neon; the earlier "Not implemented yet" throw made the
// implementation unreachable and has been dropped.
template<>
void gemvS8S8<ArchType::neon>(size_t m, size_t k, const int8_t* a, const int8_t* b, size_t ldb, float* c)
{
	gemvS8S8_neon(m, k, a, b, ldb, c);
}
// NEON specialization of gemvU8U8 (uint8 x uint8, both biased by +128).
// Forwards to gemvU8U8_neon; the earlier "Not implemented yet" throw made the
// implementation unreachable and has been dropped.
template<>
void gemvU8U8<ArchType::neon>(size_t m, size_t k, const uint8_t* a, const uint8_t* b, size_t ldb, float* c)
{
	gemvU8U8_neon(m, k, a, b, ldb, c);
}
// NEON specialization of dotS8S8: dot product of two int8 vectors, scaled.
// Each vector is laid out as [k int8][float scale]; the result is
// dot(a, b) * aScale * bScale.
// Sixteen elements are consumed per step with no tail handling, so k is
// presumably a multiple of 16 -- TODO confirm against the callers.
// The earlier "Not implemented yet" throw made this body unreachable and has
// been dropped.
template<>
float dotS8S8<ArchType::neon>(size_t k, const int8_t* a, const int8_t* b)
{
	const float aScale = *reinterpret_cast<const float*>(a + k);
	const float bScale = *reinterpret_cast<const float*>(b + k);
	// No bSum correction needed for native int8 x int8
	int32x4_t sum = vdupq_n_s32(0);
	for (size_t i = 0; i < k; i += 16)
	{
		const int8x16_t pa = vld1q_s8(a + i);
		const int8x16_t pb = vld1q_s8(b + i);
		// Widening multiply to int16, then pairwise accumulate into int32 lanes.
		sum = vpadalq_s16(sum, vmull_s8(vget_low_s8(pa), vget_low_s8(pb)));
		sum = vpadalq_s16(sum, vmull_s8(vget_high_s8(pa), vget_high_s8(pb)));
	}
	return static_cast<float>(reduce_sum_s32(sum)) * aScale * bScale;
}
// NEON specialization of dotU8U8: dot product of two uint8 vectors that are
// re-centred to int8 by XOR with 0x80 before multiplying, then scaled.
// Each vector is laid out as [k uint8][float scale]; the result is
// sum((a ^ 0x80) * (b ^ 0x80)) * aScale * bScale with the XORed bytes read as int8.
// Sixteen elements are consumed per step with no tail handling, so k is
// presumably a multiple of 16 -- TODO confirm against the callers.
// The earlier "Not implemented yet" throw made this body unreachable and has
// been dropped.
template<>
float dotU8U8<ArchType::neon>(size_t k, const uint8_t* a, const uint8_t* b)
{
	const float aScale = *reinterpret_cast<const float*>(a + k);
	const float bScale = *reinterpret_cast<const float*>(b + k);
	const uint8x16_t bias = vdupq_n_u8(128);
	int32x4_t sum = vdupq_n_s32(0);
	for (size_t i = 0; i < k; i += 16)
	{
		const int8x16_t pa = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(a + i), bias));
		const int8x16_t pb = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(b + i), bias));
		// Widening multiply to int16, then pairwise accumulate into int32 lanes.
		sum = vpadalq_s16(sum, vmull_s8(vget_low_s8(pa), vget_low_s8(pb)));
		sum = vpadalq_s16(sum, vmull_s8(vget_high_s8(pa), vget_high_s8(pb)));
	}
	return static_cast<float>(reduce_sum_s32(sum)) * aScale * bScale;
}
template<>
@ -59,7 +272,50 @@ namespace kiwi
float* out
)
{
throw std::runtime_error("Not implemented yet");
for (size_t mi = 0; mi < m; mi += 4)
{
const int8_t* aPtr0 = a + lda * (mi + 0);
const int8_t* aPtr1 = a + lda * (mi + 1);
const int8_t* aPtr2 = a + lda * (mi + 2);
const int8_t* aPtr3 = a + lda * (mi + 3);
int32x4_t sum0 = vdupq_n_s32(0);
int32x4_t sum1 = vdupq_n_s32(0);
int32x4_t sum2 = vdupq_n_s32(0);
int32x4_t sum3 = vdupq_n_s32(0);
for (size_t j = 0; j < k; j += 16)
{
int8x16_t pa0 = vld1q_s8(aPtr0 + j);
int8x16_t pa1 = vld1q_s8(aPtr1 + j);
int8x16_t pa2 = vld1q_s8(aPtr2 + j);
int8x16_t pa3 = vld1q_s8(aPtr3 + j);
// Compute a^2 using native int8 x int8 multiply
// Max product: (-128)*(-128) = 16384, fits in int16
sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa0), vget_low_s8(pa0)));
sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa0), vget_high_s8(pa0)));
sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa1), vget_low_s8(pa1)));
sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa1), vget_high_s8(pa1)));
sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa2), vget_low_s8(pa2)));
sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa2), vget_high_s8(pa2)));
sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa3), vget_low_s8(pa3)));
sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa3), vget_high_s8(pa3)));
}
const float aScale0 = *reinterpret_cast<const float*>(aPtr0 + k);
const float aScale1 = *reinterpret_cast<const float*>(aPtr1 + k);
const float aScale2 = *reinterpret_cast<const float*>(aPtr2 + k);
const float aScale3 = *reinterpret_cast<const float*>(aPtr3 + k);
const float rArr[4] = {
static_cast<float>(reduce_sum_s32(sum0)) * aScale0 * aScale0,
static_cast<float>(reduce_sum_s32(sum1)) * aScale1 * aScale1,
static_cast<float>(reduce_sum_s32(sum2)) * aScale2 * aScale2,
static_cast<float>(reduce_sum_s32(sum3)) * aScale3 * aScale3
};
vst1q_f32(out + mi, vrsqrteq_f32(vld1q_f32(rArr)));
}
}
template<>
@ -69,7 +325,52 @@ namespace kiwi
float* out
)
{
throw std::runtime_error("Not implemented yet");
const uint8x16_t bias = vdupq_n_u8(128);
for (size_t mi = 0; mi < m; mi += 4)
{
const uint8_t* aPtr0 = a + lda * (mi + 0);
const uint8_t* aPtr1 = a + lda * (mi + 1);
const uint8_t* aPtr2 = a + lda * (mi + 2);
const uint8_t* aPtr3 = a + lda * (mi + 3);
int32x4_t sum0 = vdupq_n_s32(0);
int32x4_t sum1 = vdupq_n_s32(0);
int32x4_t sum2 = vdupq_n_s32(0);
int32x4_t sum3 = vdupq_n_s32(0);
for (size_t j = 0; j < k; j += 16)
{
// Center uint8 to int8 via XOR 0x80: (a-128)
int8x16_t pa0 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr0 + j), bias));
int8x16_t pa1 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr1 + j), bias));
int8x16_t pa2 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr2 + j), bias));
int8x16_t pa3 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr3 + j), bias));
// Compute (a-128)^2
sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa0), vget_low_s8(pa0)));
sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa0), vget_high_s8(pa0)));
sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa1), vget_low_s8(pa1)));
sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa1), vget_high_s8(pa1)));
sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa2), vget_low_s8(pa2)));
sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa2), vget_high_s8(pa2)));
sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa3), vget_low_s8(pa3)));
sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa3), vget_high_s8(pa3)));
}
const float aScale0 = *reinterpret_cast<const float*>(aPtr0 + k);
const float aScale1 = *reinterpret_cast<const float*>(aPtr1 + k);
const float aScale2 = *reinterpret_cast<const float*>(aPtr2 + k);
const float aScale3 = *reinterpret_cast<const float*>(aPtr3 + k);
const float rArr[4] = {
static_cast<float>(reduce_sum_s32(sum0)) * aScale0 * aScale0,
static_cast<float>(reduce_sum_s32(sum1)) * aScale1 * aScale1,
static_cast<float>(reduce_sum_s32(sum2)) * aScale2 * aScale2,
static_cast<float>(reduce_sum_s32(sum3)) * aScale3 * aScale3
};
vst1q_f32(out + mi, vrsqrteq_f32(vld1q_f32(rArr)));
}
}
template<>

View file

@ -42,10 +42,10 @@ namespace kiwi
scale = (scale & 0x3F) + scaleBias;
lower = (lower - lzp) * scale;
lower += (lower >= 0) ? 4 : -4;
lower += (lower >= 0) ? 4 : -4; // for round up
lower /= scaleDivider;
upper = (upper - lzp) * scale;
upper += (upper >= 0) ? 4 : -4;
upper += (upper >= 0) ? 4 : -4; // for round up
upper /= scaleDivider;
if (toUint8)
{

View file

@ -48,6 +48,10 @@ struct kiwi_typo : public TypoTransformer
{
};
// Concrete type behind the C handle for a prepared typo transformer.
// It adds no members of its own; it only gives PreparedTypoTransformer a
// distinct struct name that the C API can point to.
struct kiwi_prepared_typo : public PreparedTypoTransformer
{
};
struct kiwi_morphset
{
Kiwi* inst = nullptr;
@ -284,6 +288,40 @@ int kiwi_builder_add_alias_word(kiwi_builder_h handle, const char* alias, const
}
}
// Adds `word` to the builder with a full morpheme definition (POS tag, sense id
// and dialect) and the given score.
// Returns 0 when the builder reports the word as added, KIWIERR_INVALID_HANDLE
// for a null handle, and KIWIERR_FAIL otherwise. If an exception is thrown it
// is stored in currentError and KIWIERR_FAIL is returned.
int kiwi_builder_add_word_with_def(kiwi_builder_h handle, const char* word, const char* pos, int sense_id, int dialect, float score)
{
	if (!handle)
	{
		return KIWIERR_INVALID_HANDLE;
	}

	auto* builder = (KiwiBuilder*)handle;
	try
	{
		MorphemeDef morphDef{ parse_tag(pos), (uint8_t)sense_id, (Dialect)dialect };
		const auto added = builder->addWord(utf8To16(word), morphDef, score);
		if (!added.second)
		{
			return KIWIERR_FAIL;
		}
		return 0;
	}
	catch (...)
	{
		currentError = current_exception();
		return KIWIERR_FAIL;
	}
}
// Adds `alias` to the builder as an alias of `orig_word`, with a full morpheme
// definition (POS tag, sense id and dialect) and the given score.
// Returns 0 when the builder reports the alias as added, KIWIERR_INVALID_HANDLE
// for a null handle, and KIWIERR_FAIL otherwise. If an exception is thrown it
// is stored in currentError and KIWIERR_FAIL is returned.
int kiwi_builder_add_alias_word_with_def(kiwi_builder_h handle, const char* alias, const char* pos, int sense_id, int dialect, float score, const char* orig_word)
{
	if (!handle)
	{
		return KIWIERR_INVALID_HANDLE;
	}

	auto* builder = (KiwiBuilder*)handle;
	try
	{
		MorphemeDef morphDef{ parse_tag(pos), (uint8_t)sense_id, (Dialect)dialect };
		const auto added = builder->addWord(utf8To16(alias), morphDef, score, utf8To16(orig_word));
		if (!added.second)
		{
			return KIWIERR_FAIL;
		}
		return 0;
	}
	catch (...)
	{
		currentError = current_exception();
		return KIWIERR_FAIL;
	}
}
int kiwi_builder_add_pre_analyzed_word(kiwi_builder_h handle, const char* form, int size, const char** analyzed_morphs, const char** analyzed_pos, float score, const int* positions)
{
if (!handle) return KIWIERR_INVALID_HANDLE;
@ -647,11 +685,48 @@ int kiwi_typo_close(kiwi_typo_h handle)
}
}
kiwi_h kiwi_init(const char * modelPath, int num_threads, int options)
// Creates a prepared typo transformer from `handle` by calling prepare(true)
// and returns it as a newly allocated handle; release it with
// kiwi_prepared_typo_close.
// Returns nullptr for a null handle, or when an exception is thrown (the
// exception is then stored in currentError).
kiwi_prepared_typo_h kiwi_typo_prepare(kiwi_typo_h handle)
{
	kiwi_prepared_typo_h prepared = nullptr;
	if (handle)
	{
		try
		{
			prepared = new kiwi_prepared_typo{ handle->prepare(true) };
		}
		catch (...)
		{
			currentError = current_exception();
			prepared = nullptr;
		}
	}
	return prepared;
}
// Destroys a prepared typo transformer handle.
// Returns 0 on success, KIWIERR_INVALID_HANDLE for a null handle, and -1 if
// destruction throws (the exception is then stored in currentError).
int kiwi_prepared_typo_close(kiwi_prepared_typo_h handle)
{
	if (!handle)
	{
		return KIWIERR_INVALID_HANDLE;
	}

	int status = 0;
	try
	{
		delete handle;
	}
	catch (...)
	{
		currentError = current_exception();
		status = -1;
	}
	return status;
}
kiwi_h kiwi_init(const char * modelPath, int num_threads, int options, int enabled_dialects)
{
try
{
return (kiwi_h)new Kiwi{ KiwiBuilder{ modelPath, (size_t)num_threads, (BuildOption)options }.build() };
BuildOption buildOption = (BuildOption)(options & 0xFF);
const auto mtMask = options & (KIWI_BUILD_MODEL_TYPE_LARGEST | KIWI_BUILD_MODEL_TYPE_KNLM | KIWI_BUILD_MODEL_TYPE_SBG | KIWI_BUILD_MODEL_TYPE_CONG | KIWI_BUILD_MODEL_TYPE_CONG_GLOBAL);
const ModelType modelType = (mtMask == KIWI_BUILD_MODEL_TYPE_LARGEST) ? ModelType::largest
: (mtMask == KIWI_BUILD_MODEL_TYPE_KNLM) ? ModelType::knlm
: (mtMask == KIWI_BUILD_MODEL_TYPE_SBG) ? ModelType::sbg
: (mtMask == KIWI_BUILD_MODEL_TYPE_CONG) ? ModelType::cong
: (mtMask == KIWI_BUILD_MODEL_TYPE_CONG_GLOBAL) ? ModelType::congGlobal
: ModelType::none;
return (kiwi_h)new Kiwi{ KiwiBuilder{ modelPath, (size_t)num_threads, buildOption, modelType, (Dialect)enabled_dialects }.build() };
}
catch (...)
{
@ -669,11 +744,16 @@ void kiwi_set_global_config(kiwi_h handle, kiwi_config_t config)
KiwiConfig kconfig{
!!config.integrate_allomorph,
config.cut_off_threshold,
config.unk_form_score_scale,
config.unk_form_score_bias,
config.oov_rule_scale,
config.oov_rule_bias,
config.oov_chr_bias,
config.oov_global_weight,
config.oov_local_weight,
config.oov_global_min_freq,
config.space_penalty,
config.typo_cost_weight,
config.max_unk_form_size,
config.max_unk_form_size_followed_by_j_class,
config.space_tolerance,
};
kiwi->setGlobalConfig(kconfig);
@ -694,11 +774,16 @@ kiwi_config_t kiwi_get_global_config(kiwi_h handle)
KiwiConfig kconfig = kiwi->getGlobalConfig();
config.integrate_allomorph = kconfig.integrateAllomorph;
config.cut_off_threshold = kconfig.cutOffThreshold;
config.unk_form_score_scale = kconfig.unkFormScoreScale;
config.unk_form_score_bias = kconfig.unkFormScoreBias;
config.oov_rule_scale = kconfig.oovRuleScale;
config.oov_rule_bias = kconfig.oovRuleBias;
config.oov_chr_bias = kconfig.oovChrBias;
config.oov_global_weight = kconfig.oovGlobalWeight;
config.oov_local_weight = kconfig.oovLocalWeight;
config.oov_global_min_freq = kconfig.oovGlobalMinFreq;
config.space_penalty = kconfig.spacePenalty;
config.typo_cost_weight = kconfig.typoCostWeight;
config.max_unk_form_size = kconfig.maxUnkFormSize;
config.max_unk_form_size_followed_by_j_class = kconfig.maxUnkFormSizeFollowedByJClass;
config.space_tolerance = kconfig.spaceTolerance;
}
catch (...)
@ -785,7 +870,9 @@ inline AnalyzeOption toAnalyzeOption(kiwi_analyze_option_t option)
option.blocklist ? &option.blocklist->morphemes : nullptr,
!!option.open_ending,
(Dialect)option.allowed_dialects,
option.dialect_cost
option.dialect_cost,
option.typo_transformer,
option.typo_threshold
};
}

View file

@ -61,12 +61,23 @@ namespace kiwi
{
const auto* aPtr = aBuffer + i * (k + 8);
const auto* bPtr = bBuffer + j * (k + 8);
int32_t acc = op.dotprod(aPtr, bPtr, k);
const float contextScale = *reinterpret_cast<const float*>(aPtr + k),
outputScale = *reinterpret_cast<const float*>(bPtr + k),
float contextBias;
if constexpr (archType == ArchType::neon)
{
const auto* aPtrS8 = reinterpret_cast<const int8_t*>(aPtr);
const float score = dotS8S8<archType>(k, aPtrS8, bPtr);
contextBias = *reinterpret_cast<const float*>(aPtr + k + 4);
const int32_t hsum = *reinterpret_cast<const int32_t*>(bPtr + k + 4);
c[i * ldc + j] = (acc - hsum) * contextScale * outputScale + contextBias;
c[i * ldc + j] = score + contextBias;
}
else
{
const int32_t acc = op.dotprod(aPtr, bPtr, k);
const float contextScale = *reinterpret_cast<const float*>(aPtr + k);
const float outputScale = *reinterpret_cast<const float*>(bPtr + k);
contextBias = *reinterpret_cast<const float*>(aPtr + k + 4);
const int32_t hsum = *reinterpret_cast<const int32_t*>(bPtr + k + 4);
c[i * ldc + j] = (acc - hsum) * contextScale * outputScale + contextBias;
}
}
}
}

View file

@ -228,5 +228,14 @@ namespace sais
});
return std::accumulate(numSuffices.begin(), numSuffices.end(), (size_t)0);
}
// Enumerates over the half-open-looking pair `range` via the wavelet tree and
// calls fn(chr, count) for every reported character, where count is the width
// (right - left) of the sub-range reported for that character.
// Returns whatever waveletTree.enumerate returns.
template<class Fn>
size_t enumNextChr(const std::pair<size_t, size_t>& range, Fn&& fn) const
{
	const size_t rangeBegin = range.first;
	const size_t rangeEnd = range.second;
	return waveletTree.enumerate(rangeBegin, rangeEnd, [&](ChrTy chr, size_t left, size_t right)
	{
		return fn(chr, right - left);
	});
}
};
}

View file

@ -5,13 +5,13 @@
// Returns a Kiwi handle that is created on the first call and reused afterwards.
// kiwi_init takes a fourth `enabled_dialects` argument; the stale three-argument
// call declared `kw` a second time and has been dropped.
kiwi_h reuse_kiwi_instance()
{
	static kiwi_h kw = kiwi_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT, 0);
	return kw;
}
// A handle can be created with the four-argument kiwi_init and closed cleanly.
// The stale three-argument call declared `kw` a second time and has been dropped.
TEST(KiwiC, InitClose)
{
	kiwi_h kw = kiwi_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT, 0);
	EXPECT_NE(kw, nullptr);
	EXPECT_EQ(kiwi_close(kw), 0);
}
@ -63,7 +63,7 @@ int mt_receiver(int idx, kiwi_res_h res, void* user)
TEST(KiwiC, AnalyzeMultithread)
{
auto data = loadTestCorpus();
kiwi_h kw = kiwi_init(MODEL_PATH, 2, KIWI_BUILD_DEFAULT);
kiwi_h kw = kiwi_init(MODEL_PATH, 2, KIWI_BUILD_DEFAULT, 0);
EXPECT_NE(kw, nullptr);
kiwi_analyze_option_t option = { KIWI_MATCH_ALL, };
EXPECT_EQ(kiwi_analyze_m(kw, mt_reader, mt_receiver, &data, 1, option), data.size());
@ -256,12 +256,16 @@ TEST(KiwiC, AnalyzeBasicTypoSet)
{
kiwi_h okw = reuse_kiwi_instance(), typo_kw;
kiwi_builder_h builder = kiwi_builder_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT, KIWI_DIALECT_STANDARD);
typo_kw = kiwi_builder_build(builder, kiwi_typo_get_default(KIWI_TYPO_BASIC_TYPO_SET), 2.5f);
typo_kw = kiwi_builder_build(builder, nullptr, 0);
kiwi_config_t config = kiwi_get_global_config(typo_kw);
config.typo_cost_weight = 5;
kiwi_set_global_config(typo_kw, config);
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING, };
kiwi_prepared_typo_h ptt = kiwi_typo_prepare(kiwi_typo_get_default(KIWI_TYPO_BASIC_TYPO_SET));
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING | KIWI_MATCH_OOV_CHR_FREQ_MODEL, };
option.typo_transformer = ptt;
option.typo_threshold = 2.5;
kiwi_res_h o, c;
for (const char* s : { u8"외않됀데?", u8"나 죰 도와죠.", u8"잘했따", u8"외구거 공부", u8"맗은 믈을 마셧다!" })
{
@ -272,6 +276,7 @@ TEST(KiwiC, AnalyzeBasicTypoSet)
EXPECT_EQ(kiwi_res_close(c), 0);
}
EXPECT_EQ(kiwi_prepared_typo_close(ptt), 0);
EXPECT_EQ(kiwi_builder_close(builder), 0);
EXPECT_EQ(kiwi_close(typo_kw), 0);
}
@ -289,12 +294,16 @@ TEST(KiwiC, CustomTypoSet)
kiwi_typo_update(custom_typo, continual_typo);
kiwi_typo_update(custom_typo, lengthening_typo);
typo_kw = kiwi_builder_build(builder, custom_typo, 2.5f);
kiwi_prepared_typo_h ptt = kiwi_typo_prepare(custom_typo);
typo_kw = kiwi_builder_build(builder, nullptr, 0);
kiwi_config_t config = kiwi_get_global_config(typo_kw);
config.typo_cost_weight = 5;
kiwi_set_global_config(typo_kw, config);
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING, };
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING | KIWI_MATCH_OOV_CHR_FREQ_MODEL, };
option.typo_transformer = ptt;
option.typo_threshold = 2.5;
kiwi_res_h o, c;
for (const char* s : { u8"외않됀데?", u8"나 죰 도와죠.", u8"자알했따", u8"외구거 공부", u8"맗은 믈을 마셧다!" })
{
@ -305,6 +314,7 @@ TEST(KiwiC, CustomTypoSet)
EXPECT_EQ(kiwi_res_close(c), 0);
}
EXPECT_EQ(kiwi_prepared_typo_close(ptt), 0);
EXPECT_EQ(kiwi_typo_close(custom_typo), 0);
EXPECT_EQ(kiwi_builder_close(builder), 0);
EXPECT_EQ(kiwi_close(typo_kw), 0);

View file

@ -22,17 +22,17 @@ TestInitializer _global_initializer;
using namespace kiwi;
// Analyzes `s` with `option` and checks that the result is non-empty and that
// the last token ends exactly at the end of the input.
// `option` defaults to Match::all so that two-argument callers keep working.
// The duplicated signature and statements left over from the earlier revision
// have been dropped.
inline testing::AssertionResult testTokenization(Kiwi& kiwi, const std::u16string& s, AnalyzeOption option = Match::all)
{
	auto tokens = kiwi.analyze(s, option).first;
	if (tokens.empty())
	{
		return testing::AssertionFailure() << "kiwi.analyze(" << utf16To8(s) << ") yields an empty result.";
	}

	const auto endPos = tokens.back().position + tokens.back().length;
	if (endPos == s.size())
	{
		return testing::AssertionSuccess();
	}
	return testing::AssertionFailure() << "the result of kiwi.analyze(" << utf16To8(s) << ") ends at " << endPos;
}
@ -57,6 +57,120 @@ Kiwi& reuseKiwiInstance()
return kiwi;
}
// Round-trip check: encoding a mixed Hangul/ASCII string and decoding it again
// must give back the original text, and every token id must be below the
// tokenizer's vocabulary size.
TEST(KiwiCpp, ChrTokenizer)
{
	ChrTokenizer tokenizer;
	const std::string_view s = u8"안녕하세요.오늘날씨가참좋네요!Adx9810::~";

	// The buffer starts at the byte length of the input and is then trimmed to
	// the number of tokens that encode() reports.
	Vector<int32_t> encodedBuf(s.size());
	const auto numEncoded = tokenizer.encode(s, encodedBuf.data(), encodedBuf.size());
	encodedBuf.erase(encodedBuf.begin() + numEncoded, encodedBuf.end());

	EXPECT_TRUE(std::all_of(encodedBuf.begin(), encodedBuf.end(), [&](int32_t t) { return t < tokenizer.vocabSize(); }));

	std::string decoded = utf16To8(tokenizer.decode(encodedBuf.data(), encodedBuf.size()));
	EXPECT_EQ(s, decoded);
}
// Loads the character model "nounchr.mdl", feeds a list of sample strings
// through it token by token, logs the per-token statistics to stderr, and
// checks that every step score is below 0.01 and every accumulated score is
// negative.
TEST(KiwiCpp, ChrModel)
{
	ChrTokenizer tokenizer;
	Kiwi& kiwi = reuseKiwiInstance();

	auto openModelFile = utils::makeFilesystemProvider(MODEL_PATH);
	auto modelStream = openModelFile("nounchr.mdl");
	auto chrModel = lm::CoNgramModelBase::create(utils::createMemoryObjectFromStream(*modelStream),
		kiwi.archType(),
		false,
		true
	);
	EXPECT_EQ(chrModel->vocabSize(), tokenizer.vocabSize());

	std::array<int32_t, 256> tokenBuf = { 0, };
	const auto samples = {
		"한국어",
		"됐습니다.",
		"AS365버전",
		"형태",
		"형태를",
		"바다를",
		"샤를",
		"카를",
		"아를",
		"자갈을",
		"생선마을",
		"북구을",
		"분당을",
		"사람을",
		"도서관을",
		"이민철",
		"김민철",
		"황보민수",
		"남궁민수",
	};
	for (auto str : samples)
	{
		// Encode the sample and append token 0 as a terminator.
		size_t numTokens = tokenizer.encode(str, tokenBuf.data(), tokenBuf.size());
		tokenBuf[numTokens] = 0;
		++numTokens;

		float accScore = 0;
		int32_t nodeIdx = 0;
		uint32_t contextIdx = 0;
		// Prime the model state with token 0 before scoring the sample.
		chrModel->progressOneStep(nodeIdx, contextIdx, 0);
		for (size_t pos = 0; pos < numTokens; ++pos)
		{
			// Depth is read before the step; frequency and entropy after it.
			const size_t depth = chrModel->getNodeDepth(nodeIdx);
			const float score = chrModel->progressOneStep(nodeIdx, contextIdx, tokenBuf[pos]);
			const float freq = chrModel->getContextFrequency(contextIdx);
			const float entropy = chrModel->getContextEntropy(contextIdx);
			auto tokenStr = utf16To8(tokenizer.decode(&tokenBuf[pos], 1));
			std::cerr << " Token: " << tokenStr << "(" << tokenBuf[pos] << ") Score: " << score << " Depth: " << depth << " Freq: " << freq << " Entropy: " << entropy << std::endl;
			EXPECT_LT(score, 0.01);
			accScore += score;
		}
		std::cerr << "AccScore for \"" << str << "\": " << accScore << " AvgScore: " << (accScore / (numTokens - 1)) << std::endl;
		EXPECT_LT(accScore, 0);
	}
}
// Adds 1000 sentences ("0" .. "999") with weight 1 / (i + 2), draws 10000
// batches, and checks that each sentence is sampled with a frequency within
// 10% of its normalized weight.
TEST(KiwiCpp, ChrDataset)
{
	constexpr size_t batchSize = 64, contextSize = 8, sentSize = 1000;
	ChrDataset dataset{ batchSize, contextSize, 0, 0.f };

	double totalWeight = 0.f;
	for (size_t sentIdx = 0; sentIdx < sentSize; ++sentIdx)
	{
		const float weight = 1.f / (sentIdx + 2.f);
		dataset.addSentence(std::to_string(sentIdx), weight, "0");
		totalWeight += weight;
	}

	auto vocabProbs = dataset.getVocabProbs();
	EXPECT_EQ(vocabProbs.size(), dataset.vocabSize());

	std::array<int32_t, batchSize* contextSize> inBuf, outBuf;
	ChrTokenizer tokenizer;
	Vector<size_t> hits(sentSize);
	size_t totalSampled = 0;
	for (size_t batch = 0; batch < 10000; ++batch)
	{
		const size_t numRows = dataset.next(inBuf.data(), outBuf.data());
		for (size_t row = 0; row < numRows; ++row)
		{
			// Skip the first token of each row and decode the rest back to the
			// sentence text, which is the decimal sentence index.
			const auto decoded = tokenizer.decode(&inBuf[row * contextSize + 1], contextSize - 1);
			const size_t sentIdx = std::stoi(utf16To8(decoded));
			++hits[sentIdx];
			++totalSampled;
		}
	}

	for (size_t i = 0; i < sentSize; ++i)
	{
		const float expectedProb = (float)((1.f / (i + 2.f)) / totalWeight);
		const float actualProb = (float)(hits[i] / (double)totalSampled);
		EXPECT_NEAR(expectedProb, actualProb, expectedProb * 0.1f) << " for sentence " << i;
	}
}
TEST(KiwiCpp, ExtractSubstrings)
{
const std::u16string s = u"자, 너 오늘 하루 뭐 했니? "
@ -120,10 +234,21 @@ TEST(KiwiCpp, EmptyResult)
u"스틸블루",
u"15살이었므로",
u"타란튤라",
u"꽃게 맛이 가장 좋다는 봄철에는 알이 통통하게 든 암 꽃게가 많이 잡히며, 게 딱지 속에 노란 알과 내장이 가득하여 게장으로 담그면 좋고, 가을에는 살이 통통하게 오른 숫 꽃게가 많이 잡히는데 살이 많고 찌더라도 퍽퍽하지 않고 부드러워 찜으로 요리하면 좋다.",
};
for (auto s : testCases)
{
EXPECT_TRUE(testTokenization(kiwi, s));
EXPECT_TRUE(testTokenization(kiwi, s, Match::all));
}
AnalyzeOption option = Match::allWithNormalizing | Match::oovChrFreqModel | Match::mergeSaisiot;
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinualAndLengthening);
auto testCases2 = {
u"해물톳짜장 나왔습니당 쫄깃한 면발 위에 듬뿍 얹어진 톳 그 위에 방풍나물 마라도 짜장면 맛집 인정!",
};
for (auto s : testCases2)
{
EXPECT_TRUE(testTokenization(kiwi, s, option));
}
}
@ -154,15 +279,32 @@ TEST(KiwiCpp, SingleConsonantMorpheme)
// Analyzes a sentence containing a leading-hyphen word with the continual typo
// set enabled and checks that the hyphen still comes out as its own token.
// The typo set is passed through AnalyzeOption::typoTransformer; the stale
// lines that built the Kiwi with a typo set and analyzed with a bare Match
// declared `typoKiwi` and `res` a second time and have been dropped.
TEST(KiwiCpp, SpecialTokenErrorOnContinualTypo)
{
	KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, ModelType::none };
	Kiwi typoKiwi = builder.build();
	AnalyzeOption option = Match::allWithNormalizing;
	option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::continualTypoSet);

	auto res = typoKiwi.analyze(u"감사합니다 -친구들과", option).first;
	EXPECT_EQ(res[0].str, u"감사");
	// NOTE(review): the expected string below is empty as shown here -- confirm
	// that no characters were lost from the literal.
	EXPECT_EQ(res[1].str, u"");
	EXPECT_EQ(res[3].str, u"-");
	EXPECT_EQ(res[3].tag, POSTag::so);
}
// A multi-word entry must come out as a single first token when written
// correctly. A misspelled variant must not match without a typo transformer,
// but must match once the basic typo set is enabled, with or without spaces.
TEST(KiwiCpp, MultiWordTypo)
{
	Kiwi& kiwi = reuseKiwiInstance();
	AnalyzeOption option = Match::allWithNormalizing;

	// Without a typo transformer: exact spelling matches, misspelling does not.
	auto res = kiwi.analyze(u"존 F. 케네디 주니어", option).first;
	EXPECT_EQ(res[0].str, u"존 F. 케네디 주니어");

	res = kiwi.analyze(u"존 F. 캐네디 주니어", option).first;
	EXPECT_NE(res[0].str, u"존 F. 케네디 주니어");

	// With the basic typo set: the misspelling is recovered.
	option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSet);

	res = kiwi.analyze(u"존 F. 캐네디 주니어", option).first;
	EXPECT_EQ(res[0].str, u"존 F. 케네디 주니어");

	// Same misspelling with all spaces removed.
	res = kiwi.analyze(u"존F.캐네디주니어", option).first;
	EXPECT_EQ(res[0].str, u"존 F. 케네디 주니어");
}
TEST(KiwiCpp, SplitComplex)
{
Kiwi& kiwi = reuseKiwiInstance();
@ -308,7 +450,8 @@ TEST(KiwiCpp, Pretokenized)
{
Kiwi& kiwi = reuseKiwiInstance();
auto str = u"드디어패트와 매트가 2017년에 국내 개봉했다. 패트와매트는 2016년...";
AnalyzeOption option = Match::allWithNormalizing;
std::vector<TokenInfo> res;
{
std::vector<PretokenizedSpan> pretokenized = {
@ -317,7 +460,7 @@ TEST(KiwiCpp, Pretokenized)
PretokenizedSpan{ 34, 39, {} },
};
res = kiwi.analyze(str, Match::allWithNormalizing, pretokenized).first;
res = kiwi.analyze(str, option, pretokenized).first;
EXPECT_EQ(res[1].str, u"패트와 매트");
EXPECT_EQ(res[3].str, u"2017년");
EXPECT_EQ(res[13].str, u"2016년");
@ -330,7 +473,7 @@ TEST(KiwiCpp, Pretokenized)
PretokenizedSpan{ 21, 24, { BasicToken{ u"개봉하", 0, 3, POSTag::vv }, BasicToken{ u"", 2, 3, POSTag::ep } }},
};
res = kiwi.analyze(str, Match::allWithNormalizing, pretokenized).first;
res = kiwi.analyze(str, option, pretokenized).first;
EXPECT_EQ(res[7].str, u"개봉하");
EXPECT_EQ(res[7].tag, POSTag::vv);
EXPECT_EQ(res[7].position, 21);
@ -351,8 +494,8 @@ TEST(KiwiCpp, Pretokenized)
PretokenizedSpan{ 16, 17, { BasicToken{ u"", 0, 1, POSTag::jkb } } },
};
auto ref = kiwi.analyze(str, Match::allWithNormalizing).first;
res = kiwi.analyze(str, Match::allWithNormalizing, pretokenized).first;
auto ref = kiwi.analyze(str, option).first;
res = kiwi.analyze(str, option, pretokenized).first;
EXPECT_EQ(res[2].tag, POSTag::jks);
EXPECT_EQ(res[2].morph, ref[2].morph);
EXPECT_FLOAT_EQ(res[2].score, ref[2].score);
@ -367,8 +510,80 @@ TEST(KiwiCpp, Pretokenized)
PretokenizedSpan{ 3, 4, { BasicToken{ u"", 0, 1, POSTag::vv } } },
};
auto ref = kiwi.analyze(str2, Match::allWithNormalizing).first;
res = kiwi.analyze(str2, Match::allWithNormalizing, pretokenized).first;
auto ref = kiwi.analyze(str2, option).first;
res = kiwi.analyze(str2, option, pretokenized).first;
EXPECT_EQ(res[2].tag, POSTag::vvi);
EXPECT_EQ(res[2].morph, ref[2].morph);
}
}
TEST(KiwiCpp, PretokenizedWithTypo)
{
Kiwi& kiwi = reuseKiwiInstance();
auto str = u"드디어패트와 매트가 2017년에 국내 개봉했다. 패트와매트는 2016년...";
AnalyzeOption option = Match::allWithNormalizing;
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinualAndLengthening);
std::vector<TokenInfo> res;
{
std::vector<PretokenizedSpan> pretokenized = {
PretokenizedSpan{ 3, 9, {} },
PretokenizedSpan{ 11, 16, {} },
PretokenizedSpan{ 34, 39, {} },
};
res = kiwi.analyze(str, option, pretokenized).first;
EXPECT_EQ(res[1].str, u"패트와 매트");
EXPECT_EQ(res[3].str, u"2017년");
EXPECT_EQ(res[13].str, u"2016년");
}
{
std::vector<PretokenizedSpan> pretokenized = {
PretokenizedSpan{ 27, 29, { BasicToken{ u"페트", 0, 2, POSTag::nnb } } },
PretokenizedSpan{ 30, 32, {} },
PretokenizedSpan{ 21, 24, { BasicToken{ u"개봉하", 0, 3, POSTag::vv }, BasicToken{ u"", 2, 3, POSTag::ep } }},
};
res = kiwi.analyze(str, option, pretokenized).first;
EXPECT_EQ(res[7].str, u"개봉하");
EXPECT_EQ(res[7].tag, POSTag::vv);
EXPECT_EQ(res[7].position, 21);
EXPECT_EQ(res[7].length, 3);
EXPECT_EQ(res[8].str, u"");
EXPECT_EQ(res[8].tag, POSTag::ep);
EXPECT_EQ(res[8].position, 23);
EXPECT_EQ(res[8].length, 1);
EXPECT_EQ(res[11].str, u"페트");
EXPECT_EQ(res[11].tag, POSTag::nnb);
EXPECT_EQ(res[13].str, u"매트");
EXPECT_EQ(res[13].tag, POSTag::nng);
}
{
std::vector<PretokenizedSpan> pretokenized = {
PretokenizedSpan{ 9, 10, { BasicToken{ u"", 0, 1, POSTag::jks } } },
PretokenizedSpan{ 16, 17, { BasicToken{ u"", 0, 1, POSTag::jkb } } },
};
auto ref = kiwi.analyze(str, option).first;
res = kiwi.analyze(str, option, pretokenized).first;
EXPECT_EQ(res[2].tag, POSTag::jks);
EXPECT_EQ(res[2].morph, ref[2].morph);
EXPECT_FLOAT_EQ(res[2].score, ref[2].score);
EXPECT_EQ(res[5].tag, POSTag::jkb);
EXPECT_EQ(res[5].morph, ref[5].morph);
EXPECT_FLOAT_EQ(res[5].score, ref[5].score);
}
{
auto str2 = u"길을 걷다";
std::vector<PretokenizedSpan> pretokenized = {
PretokenizedSpan{ 3, 4, { BasicToken{ u"", 0, 1, POSTag::vv } } },
};
auto ref = kiwi.analyze(str2, option).first;
res = kiwi.analyze(str2, option, pretokenized).first;
EXPECT_EQ(res[2].tag, POSTag::vvi);
EXPECT_EQ(res[2].morph, ref[2].morph);
}
@ -1090,19 +1305,19 @@ TEST(KiwiCpp, AnalyzeError01)
TEST(KiwiCpp, NormalizeCoda)
{
Kiwi& kiwi = reuseKiwiInstance();
TokenResult res = kiwi.analyze(u"키윜ㅋㅋ", Match::allWithNormalizing);
TokenResult res = kiwi.analyze(u"키윜ㅋㅋ", Match::allWithNormalizing | Match::oovChrModel);
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㅋㅋㅋ" });
res = kiwi.analyze(u"키윟ㅎ", Match::allWithNormalizing);
res = kiwi.analyze(u"키윟ㅎ", Match::allWithNormalizing | Match::oovChrModel);
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㅎㅎ" });
res = kiwi.analyze(u"키윅ㄱ", Match::allWithNormalizing);
res = kiwi.analyze(u"키윅ㄱ", Match::allWithNormalizing | Match::oovChrModel);
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㄱㄱ" });
res = kiwi.analyze(u"키윈ㄴㄴ", Match::allWithNormalizing);
res = kiwi.analyze(u"키윈ㄴㄴ", Match::allWithNormalizing | Match::oovChrModel);
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㄴㄴㄴ" });
res = kiwi.analyze(u"키윊ㅎㅎ", Match::allWithNormalizing);
res = kiwi.analyze(u"키윊ㅎㅎ", Match::allWithNormalizing | Match::oovChrModel);
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㅎㅎ" });
res = kiwi.analyze(u"키윍ㄱㄱ", Match::allWithNormalizing);
res = kiwi.analyze(u"키윍ㄱㄱ", Match::allWithNormalizing | Match::oovChrModel);
EXPECT_EQ(res.first.back().str, std::u16string{u"ㄱㄱ"});
}
}
TEST(KiwiCpp, ZCoda)
{
@ -1117,9 +1332,9 @@ TEST(KiwiCpp, ZCoda)
};
for (auto s : testCases)
{
auto res1 = kiwi.analyze(s.first, Match::allWithNormalizing);
auto res2 = kiwi.analyze(s.second, Match::allWithNormalizing);
auto res3 = kiwi.analyze(s.second, Match::allWithNormalizing & ~Match::zCoda);
auto res1 = kiwi.analyze(s.first, Match::allWithNormalizing | Match::oovChrFreqModel);
auto res2 = kiwi.analyze(s.second, Match::allWithNormalizing | Match::oovChrFreqModel);
auto res3 = kiwi.analyze(s.second, (Match::allWithNormalizing | Match::oovChrFreqModel) & ~Match::zCoda);
EXPECT_GE(res1.second - kiwi.getGlobalConfig().typoCostWeight, res2.second);
EXPECT_GT(res2.second, res3.second);
EXPECT_EQ(res2.first[res2.first.size() - 2].tag, POSTag::z_coda);
@ -1130,8 +1345,9 @@ TEST(KiwiCpp, ZCoda)
TEST(KiwiCpp, ZSiot)
{
Kiwi& kiwi = reuseKiwiInstance();
auto resSplit = kiwi.analyze(u"찰랑찰랑한 머릿결과 볼륨감", Match::allWithNormalizing | Match::splitSaisiot);
KiwiConfig config = kiwi.getGlobalConfig();
config.oovRuleScale = 6;
auto resSplit = kiwi.analyze(u"찰랑찰랑한 머릿결과 볼륨감", Match::allWithNormalizing | Match::splitSaisiot, {}, config);
EXPECT_EQ(resSplit.first.size(), 8);
EXPECT_EQ(resSplit.first[3].str, u"머리");
EXPECT_EQ(resSplit.first[4].tag, POSTag::z_siot);
@ -1139,9 +1355,9 @@ TEST(KiwiCpp, ZSiot)
for (auto s : {u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방"})
{
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
auto resNone = kiwi.analyze(s, Match::allWithNormalizing, {}, config);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot, {}, config);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot, {}, config);
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
EXPECT_EQ(resSplit.first.size(), 3);
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
@ -1153,9 +1369,9 @@ TEST(KiwiCpp, ZSiot)
for (auto s : {u"발렛 파킹", u"미닛"})
{
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
auto resNone = kiwi.analyze(s, Match::allWithNormalizing, {}, config);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot, {}, config);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot, {}, config);
EXPECT_EQ(resNone.second, resSplit.second);
EXPECT_EQ(resNone.second, resMerge.second);
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
@ -1164,13 +1380,16 @@ TEST(KiwiCpp, ZSiot)
TEST(KiwiCpp, ZSiotWithTypo)
{
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build(getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual));
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build();
AnalyzeOption option;
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinual);
KiwiConfig config = kiwi.getGlobalConfig();
config.oovRuleScale = 6;
for (auto s : { u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방" })
{
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
auto resNone = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing), {}, config);
auto resSplit = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::splitSaisiot), {}, config);
auto resMerge = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::mergeSaisiot), {}, config);
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
EXPECT_EQ(resSplit.first.size(), 3);
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
@ -1182,9 +1401,9 @@ TEST(KiwiCpp, ZSiotWithTypo)
for (auto s : { u"발렛 파킹", u"미닛" })
{
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
auto resNone = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing), {}, config);
auto resSplit = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::splitSaisiot), {}, config);
auto resMerge = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::mergeSaisiot), {}, config);
EXPECT_EQ(resNone.second, resSplit.second);
EXPECT_EQ(resNone.second, resMerge.second);
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
@ -1468,6 +1687,19 @@ TEST(KiwiCpp, JoinAffix)
EXPECT_EQ(res5.first[5].str, u"배송되");
}
// Checks the effect of Match::joinParticleYo on the tail of the analysis:
// the same sentence is analyzed once with Match::none and once with
// Match::joinParticleYo, and the trailing tokens of both results are compared
// against the expected surface forms.
TEST(KiwiCpp, JoinParticleYo)
{
	Kiwi& kiwi = reuseKiwiInstance();
	auto sample1 = u"밥을 먹는다던가요";

	auto res_without = kiwi.analyze(sample1, Match::none).first;
	auto res_with = kiwi.analyze(sample1, Match::joinParticleYo).first;

	// The checks below index from the end of each result. size() is unsigned,
	// so `size() - 2` on a result with fewer than two tokens would wrap around
	// and read out of bounds; stop here with a regular test failure instead.
	ASSERT_GE(res_without.size(), 2u);
	ASSERT_FALSE(res_with.empty());

	// Without the option: the ending and the final token are separate tokens.
	EXPECT_EQ(res_without[res_without.size() - 2].str, u"는다던가");
	EXPECT_EQ(res_without[res_without.size() - 1].str, u"");

	// With the option: the last token carries the joined form.
	EXPECT_EQ(res_with[res_with.size() - 1].str, u"는다던가요");
}
TEST(KiwiCpp, CompatibleJamo)
{
Kiwi& kiwi = reuseKiwiInstance();
@ -1815,8 +2047,10 @@ TEST(KiwiCpp, Issue205)
EXPECT_EQ(res1[0].str, u"함박 스테이크");
auto kiwi2 = builder.build(DefaultTypoSet::basicTypoSetWithContinual);
auto res2 = kiwi2.analyze(u"함박 스테이크를 먹었습니다", Match::allWithNormalizing).first;
auto kiwi2 = builder.build();
AnalyzeOption option = Match::allWithNormalizing;
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinual);
auto res2 = kiwi2.analyze(u"함박 스테이크를 먹었습니다", option).first;
EXPECT_EQ(res2[0].str, u"함박 스테이크");
}
@ -1919,3 +2153,19 @@ TEST(KiwiCpp, Issue231)
EXPECT_EQ(tokens.size(), 1);
EXPECT_EQ(tokens[0].str, u"");
}
// Regression test for issue 246: for each numbered-list style input
// ("1. ..."), the first token of the first analysis candidate must be tagged
// POSTag::sb.
TEST(KiwiCpp, Issue246)
{
	auto& kiwi = reuseKiwiInstance();
	for (auto s : {
		u"1. 분석",
		u"1. 해야 하는 일",
		u"1. 해야 하는 업무",
		u"1. 수학적 증명",
		u"1. Dataset"
	})
	{
		// NOTE(review): the second argument (5) is presumably the number of
		// candidates requested - confirm against Kiwi::analyze.
		auto res = kiwi.analyze(s, 5, Match::allWithNormalizing);

		// Guard both indexing levels so an empty result is reported as a test
		// failure (with the offending input) rather than an out-of-bounds read.
		ASSERT_FALSE(res.empty()) << " for input: " << utf16To8(s);
		ASSERT_FALSE(res[0].first.empty()) << " for input: " << utf16To8(s);

		EXPECT_EQ(res[0].first[0].tag, POSTag::sb) << " for input: " << utf16To8(s);
	}
}

View file

@ -5,6 +5,27 @@
using namespace kiwi;
// Exercises generateGraph() on a prepared typo transformer twice: first with
// a hand-built transformer holding three addTypo() rules, then with the
// prepared default basicTypoSet. The same output vector and input buffer are
// reused for both runs.
TEST(KiwiTypo, GenerateGraph)
{
	TypoTransformer transformer;
	transformer.addTypo(u"", u"");
	transformer.addTypo(u"", u"");
	transformer.addTypo(u"", u"");
	auto prepared = transformer.prepare(true);

	std::vector<TypoGraphNode> nodes;
	std::u16string normalized;

	// First run: custom rules; the returned value is expected to be exactly 11.
	normalizeHangul(normalized, std::u16string_view{ u"그럼 내괴다룄네" });
	auto graphSize = prepared.generateGraph(normalized, nodes);
	EXPECT_EQ(graphSize, 11);

	// Second run: swap in the default basic typo set and only require a
	// positive return value. The input buffer is cleared before reuse.
	prepared = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare(true);
	normalized.clear();
	normalizeHangul(normalized, std::u16string_view{ u"앗뿔싸 그럼 오늘부터 다시 열심히 해보자꾸나." });
	graphSize = prepared.generateGraph(normalized, nodes);
	EXPECT_GT(graphSize, 0);
}
TEST(KiwiTypo, Generate)
{
TypoTransformer tt;
@ -13,14 +34,14 @@ TEST(KiwiTypo, Generate)
tt.addTypo(u"사에", u"사레");
auto ptt = tt.prepare();
UnorderedMap<std::u16string, float> typos;
typos.clear();
for (auto e : ptt.generate(u"%없어"))
{
typos.emplace(e.str, e.cost);
}
EXPECT_EQ(typos.size(), 1);
typos.clear();
for (auto e : ptt.generate(u"개가납네", 2))
{
@ -56,7 +77,7 @@ TEST(KiwiTypo, Generate)
TEST(KiwiTypo, BasicTypoSet)
{
auto ptt = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare();
for (auto t : ptt.generate(u""))
{
}
@ -75,25 +96,31 @@ TEST(KiwiTypo, Builder)
TypoTransformer tt;
tt.addTypo(u"", u"");
tt.addTypo(u"", u"");
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build(tt);
auto ptt = tt.prepare(true);
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build();
AnalyzeOption option;
option.match = Match::allWithNormalizing;
option.typoTransformer = &ptt;
auto config = kiwi.getGlobalConfig();
TokenResult ret;
config.typoCostWeight = 1e-9;
kiwi.setGlobalConfig(config);
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
ret = kiwi.analyze(u"문화제 보호", option);
config.typoCostWeight = 2;
kiwi.setGlobalConfig(config);
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
ret = kiwi.analyze(u"문화제 보호", option);
config.typoCostWeight = 4;
kiwi.setGlobalConfig(config);
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
ret = kiwi.analyze(u"문화제 보호", option);
config.typoCostWeight = 6;
kiwi.setGlobalConfig(config);
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
ret = kiwi.analyze(u"문화제 보호", option);
}
TEST(KiwiTypo, AnalyzeBasicTypoSet)
@ -101,73 +128,83 @@ TEST(KiwiTypo, AnalyzeBasicTypoSet)
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
Kiwi kiwi = builder.build();
Kiwi typoKiwi = builder.build(DefaultTypoSet::basicTypoSet);
auto config = typoKiwi.getGlobalConfig();
auto ptt = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare(true);
AnalyzeOption option;
option.match = Match::allWithNormalizing | Match::oovChrFreqModel;
option.typoTransformer = &ptt;
auto config = kiwi.getGlobalConfig();
config.typoCostWeight = 5;
typoKiwi.setGlobalConfig(config);
kiwi.setGlobalConfig(config);
TokenResult o = kiwi.analyze(u"외않됀데?", Match::allWithNormalizing);
TokenResult c = typoKiwi.analyze(u"외않됀데?", Match::allWithNormalizing);
TokenResult o = kiwi.analyze(u"외않됀데?", Match::allWithNormalizing | Match::oovChrFreqModel);
TokenResult c = kiwi.analyze(u"외않됀데?", option);
EXPECT_TRUE(o.second < c.second);
o = kiwi.analyze(u"나 죰 도와죠.", Match::allWithNormalizing);
c = typoKiwi.analyze(u"나 죰 도와죠.", Match::allWithNormalizing);
o = kiwi.analyze(u"나 죰 도와죠.", Match::allWithNormalizing | Match::oovChrFreqModel);
c = kiwi.analyze(u"나 죰 도와죠.", option);
EXPECT_TRUE(o.second < c.second);
o = kiwi.analyze(u"잘했따", Match::allWithNormalizing);
c = typoKiwi.analyze(u"잘했따", Match::allWithNormalizing);
o = kiwi.analyze(u"잘했따", Match::allWithNormalizing | Match::oovChrFreqModel);
c = kiwi.analyze(u"잘했따", option);
EXPECT_TRUE(o.second < c.second);
o = kiwi.analyze(u"외구거 공부", Match::allWithNormalizing);
c = typoKiwi.analyze(u"외구거 공부", Match::allWithNormalizing);
o = kiwi.analyze(u"외구거 공부", Match::allWithNormalizing | Match::oovChrFreqModel);
c = kiwi.analyze(u"외구거 공부", option);
EXPECT_TRUE(o.second < c.second);
o = kiwi.analyze(u"맗은 믈을 마셧다!", Match::allWithNormalizing);
c = typoKiwi.analyze(u"맗은 믈을 마셧다!", Match::allWithNormalizing);
o = kiwi.analyze(u"맗은 믈을 마셧다!", Match::allWithNormalizing | Match::oovChrFreqModel);
c = kiwi.analyze(u"맗은 믈을 마셧다!", option);
EXPECT_TRUE(o.second < c.second);
o = kiwi.analyze(u"Wertheimer)가 자신의 논문 <운동지각에 관한 실험연구>(Experimental studies on the perception of movement)을 통해 일상적인 지각 현상에 대한 새로운 시각을 제시한 시기이다.",
Match::allWithNormalizing);
c = typoKiwi.analyze(u"Wertheimer)가 자신의 논문 <운동지각에 관한 실험연구>(Experimental studies on the perception of movement)을 통해 일상적인 지각 현상에 대한 새로운 시각을 제시한 시기이다.",
Match::allWithNormalizing);
Match::allWithNormalizing | Match::oovChrFreqModel);
c = kiwi.analyze(u"Wertheimer)가 자신의 논문 <운동지각에 관한 실험연구>(Experimental studies on the perception of movement)을 통해 일상적인 지각 현상에 대한 새로운 시각을 제시한 시기이다.",
option);
}
TEST(KiwiTypo, ContinualTypoSet)
{
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
Kiwi typoKiwi = builder.build(DefaultTypoSet::continualTypoSet);
Kiwi kiwi = builder.build();
auto res = typoKiwi.analyze(u"프로그래미", Match::allWithNormalizing).first;
auto ptt = getDefaultTypoSet(DefaultTypoSet::continualTypoSet).prepare(true);
AnalyzeOption option{ Match::allWithNormalizing };
option.typoTransformer = &ptt;
auto res = kiwi.analyze(u"프로그래미", option).first;
EXPECT_EQ(res.size(), 2);
EXPECT_EQ(res[0].str, u"프로그램");
EXPECT_EQ(res[1].str, u"");
res = typoKiwi.analyze(u"프로그래믈", Match::allWithNormalizing).first;
res = kiwi.analyze(u"프로그래믈", option).first;
EXPECT_EQ(res.size(), 2);
EXPECT_EQ(res[0].str, u"프로그램");
EXPECT_EQ(res[1].str, u"");
res = typoKiwi.analyze(u"오늘사무시레서", Match::allWithNormalizing).first;
res = kiwi.analyze(u"오늘사무시레서", option).first;
EXPECT_EQ(res.size(), 3);
EXPECT_EQ(res[1].str, u"사무실");
EXPECT_EQ(res[2].str, u"에서");
res = typoKiwi.analyze(u"법원이 기가캤다.", Match::allWithNormalizing).first;
res = kiwi.analyze(u"법원이 기가캤다.", option).first;
EXPECT_EQ(res.size(), 7);
EXPECT_EQ(res[2].str, u"기각");
EXPECT_EQ(res[3].str, u"");
res = typoKiwi.analyze(u"하나도 업써.", Match::allWithNormalizing).first;
res = kiwi.analyze(u"하나도 업써.", option).first;
EXPECT_EQ(res.size(), 5);
EXPECT_EQ(res[2].str, u"");
EXPECT_EQ(res[3].str, u"");
res = typoKiwi.analyze(u"말근 하늘", Match::allWithNormalizing).first;
res = kiwi.analyze(u"말근 하늘", option).first;
EXPECT_EQ(res.size(), 3);
EXPECT_EQ(res[0].str, u"");
EXPECT_EQ(res[1].str, u"");
res = typoKiwi.analyze(u"아주 만타.", Match::allWithNormalizing).first;
res = kiwi.analyze(u"아주 만타.", option).first;
EXPECT_EQ(res.size(), 4);
EXPECT_EQ(res[1].str, u"");
EXPECT_EQ(res[2].str, u"");
@ -177,74 +214,84 @@ TEST(KiwiTypo, ContinualTypoSet)
TEST(KiwiTypo, BasicTypoSetWithContinual)
{
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
Kiwi typoKiwi = builder.build(DefaultTypoSet::basicTypoSetWithContinual);
Kiwi kiwi = builder.build();
auto res = typoKiwi.analyze(u"프로그레미", Match::allWithNormalizing).first;
auto ptt = getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual).prepare(true);
AnalyzeOption option;
option.match = Match::allWithNormalizing | Match::oovChrFreqModel;
option.typoTransformer = &ptt;
auto config = kiwi.getGlobalConfig();
auto res = kiwi.analyze(u"프로그레믈", option, {}, config).first;
EXPECT_EQ(res.size(), 2);
EXPECT_EQ(res[0].str, u"프로그램");
EXPECT_EQ(res[1].str, u"");
if (res.size() > 1) EXPECT_EQ(res[1].str, u"");
res = typoKiwi.analyze(u"프로그레믈", Match::allWithNormalizing).first;
EXPECT_EQ(res.size(), 2);
EXPECT_EQ(res[0].str, u"프로그램");
EXPECT_EQ(res[1].str, u"");
res = typoKiwi.analyze(u"오늘사므시레서", Match::allWithNormalizing).first;
res = kiwi.analyze(u"오늘사므시레서", option, {}, config).first;
EXPECT_EQ(res.size(), 3);
EXPECT_EQ(res[1].str, u"사무실");
EXPECT_EQ(res[2].str, u"에서");
if (res.size() > 1) EXPECT_EQ(res[1].str, u"사무실");
if (res.size() > 2) EXPECT_EQ(res[2].str, u"에서");
res = typoKiwi.analyze(u"버붠이 기가캤다.", Match::allWithNormalizing).first;
res = kiwi.analyze(u"버붠이 기가캤다.", option, {}, config).first;
EXPECT_EQ(res.size(), 7);
EXPECT_EQ(res[2].str, u"기각");
EXPECT_EQ(res[3].str, u"");
if (res.size() > 2) EXPECT_EQ(res[2].str, u"기각");
if (res.size() > 3) EXPECT_EQ(res[3].str, u"");
res = typoKiwi.analyze(u"하나도 업써.", Match::allWithNormalizing).first;
res = kiwi.analyze(u"하나도 업써.", option, {}, config).first;
EXPECT_EQ(res.size(), 5);
EXPECT_EQ(res[2].str, u"");
EXPECT_EQ(res[3].str, u"");
if (res.size() > 2) EXPECT_EQ(res[2].str, u"");
if (res.size() > 3) EXPECT_EQ(res[3].str, u"");
res = typoKiwi.analyze(u"말근 하늘", Match::allWithNormalizing).first;
res = kiwi.analyze(u"말근 하늘", option, {}, config).first;
EXPECT_EQ(res.size(), 3);
EXPECT_EQ(res[0].str, u"");
EXPECT_EQ(res[1].str, u"");
if (res.size() > 1) EXPECT_EQ(res[1].str, u"");
res = typoKiwi.analyze(u"아주 만타.", Match::allWithNormalizing).first;
res = kiwi.analyze(u"아주 만타.", option, {}, config).first;
EXPECT_EQ(res.size(), 4);
EXPECT_EQ(res[1].str, u"");
EXPECT_EQ(res[2].str, u"");
if (res.size() > 1) EXPECT_EQ(res[1].str, u"");
if (res.size() > 2) EXPECT_EQ(res[2].str, u"");
}
TEST(KiwiTypo, LengtheningTypoSet)
{
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
Kiwi typoKiwi = builder.build(DefaultTypoSet::lengtheningTypoSet);
const float typoCost = typoKiwi.getGlobalConfig().typoCostWeight * 0.25f;
Kiwi kiwi = builder.build();
auto ref = typoKiwi.analyze(u"진짜?", Match::allWithNormalizing);
auto res = typoKiwi.analyze(u"지인짜?", Match::allWithNormalizing);
auto ptt = getDefaultTypoSet(DefaultTypoSet::lengtheningTypoSet).prepare(true);
AnalyzeOption option;
option.match = Match::allWithNormalizing;
option.typoTransformer = &ptt;
const float typoCost = kiwi.getGlobalConfig().typoCostWeight * 0.25f;
auto ref = kiwi.analyze(u"진짜?", option);
auto res = kiwi.analyze(u"지인짜?", option);
EXPECT_FLOAT_EQ(ref.second - 4 * typoCost, res.second);
EXPECT_EQ(res.first.size(), 2);
EXPECT_EQ(res.first[0].str, u"진짜");
EXPECT_EQ(res.first[1].str, u"?");
res = typoKiwi.analyze(u"지인짜아?", Match::allWithNormalizing);
res = kiwi.analyze(u"지인짜아?", option);
EXPECT_FLOAT_EQ(ref.second - 5 * typoCost, res.second);
EXPECT_EQ(res.first.size(), 2);
EXPECT_EQ(res.first[0].str, u"진짜");
EXPECT_EQ(res.first[1].str, u"?");
res = typoKiwi.analyze(u"그으으래?", Match::allWithNormalizing);
res = kiwi.analyze(u"그으으래?", option);
EXPECT_EQ(res.first.size(), 2);
EXPECT_EQ(res.first[0].str, u"그래");
EXPECT_EQ(res.first[1].str, u"?");
res = typoKiwi.analyze(u"그으으으으래?", Match::allWithNormalizing);
res = kiwi.analyze(u"그으으으으래?", option);
EXPECT_EQ(res.first.size(), 2);
EXPECT_EQ(res.first[0].str, u"그래");
EXPECT_EQ(res.first[1].str, u"?");
res = typoKiwi.analyze(u"학교오를 가야아해", Match::allWithNormalizing);
res = kiwi.analyze(u"학교오를 가야아해", option);
EXPECT_EQ(res.first.size(), 6);
EXPECT_EQ(res.first[0].str, u"학교");
EXPECT_EQ(res.first[1].str, u"");

@ -1 +1 @@
Subproject commit f38e229e754f90fa06b0a99ae7fbbcfcbe7dcabc
Subproject commit 226ce3aed24702bef1b03dba4b3cb55bc0bf31dd

2
third_party/cpuinfo vendored

@ -1 +1 @@
Subproject commit c4b4f4bf08c0cf486fc3111d0244ebf2a48ad01b
Subproject commit 05dd959fa26c7e68fa229495a35f55e06a3b9655

@ -1 +1 @@
Subproject commit ff6133ab49b364a883a55ba75c39e520fea6245b
Subproject commit 52eb8108c5bdec04579160ae17225d66034bd723

2
third_party/json vendored

@ -1 +1 @@
Subproject commit 5ed07097faa6c50199c4a3b66e5ed37d4fbfccc2
Subproject commit 55f93686c01528224f448c19128836e7df245f72

@ -1 +1 @@
Subproject commit f0cd5505aa102cee991be0367b82506638a16281
Subproject commit fbd8b99c2b828428947d70fdc046bb55609be93e

Some files were not shown because too many files have changed in this diff Show more