mirror of
https://github.com/bab2min/Kiwi.git
synced 2026-06-17 01:54:27 +00:00
Compare commits
119 commits
copilot/en
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
693d6f2e02 | ||
|
|
dab6f2a665 | ||
|
|
f916576479 |
||
|
|
d7d305fa90 | ||
|
|
cbca2f4159 | ||
|
|
4f502793e4 | ||
|
|
84bb01f051 | ||
|
|
270bdcacb5 | ||
|
|
303586ad22 | ||
|
|
d4e3e63b08 | ||
|
|
b30e388e4a | ||
|
|
8e3fe94a05 |
||
|
|
d446d3da74 | ||
|
|
a841678d50 | ||
|
|
6308709f3d |
||
|
|
cc322a9a1a | ||
|
|
9e2870b81a | ||
|
|
cb7ae0d6b4 | ||
|
|
599ca9aee2 | ||
|
|
cce6e1aa77 | ||
|
|
8839bb5a11 |
||
|
|
17360c4318 | ||
|
|
d23c7d395d | ||
|
|
4e732fa615 |
||
|
|
d9a43a8fd0 |
||
|
|
4e8805a6e3 |
||
|
|
b8516fdb05 | ||
|
|
f3a8f228e1 | ||
|
|
25cc639c96 | ||
|
|
11baa0b010 | ||
|
|
605c2e78c5 |
||
|
|
86d4733fdd | ||
|
|
5c220fcf0e |
||
|
|
2f04fa83dc | ||
|
|
a137b4830a | ||
|
|
b196855830 | ||
|
|
ee7c4776e1 | ||
|
|
03d6065248 | ||
|
|
dd63f3bfdd | ||
|
|
f68dc34126 |
||
|
|
96bffaaffd | ||
|
|
d7ce2915cf | ||
|
|
f2f24de2d3 | ||
|
|
71aa41cbd8 | ||
|
|
ce2a5184f0 | ||
|
|
fc203ca652 | ||
|
|
ab8f209690 | ||
|
|
808d90ad1c | ||
|
|
a3c149d173 | ||
|
|
bf8a3964bd | ||
|
|
c77d229d53 | ||
|
|
368e7915b0 | ||
|
|
8692e7ea81 | ||
|
|
683314ab7a | ||
|
|
6b078693e5 | ||
|
|
b455855653 | ||
|
|
7a2288456c | ||
|
|
9163b0583b | ||
|
|
12423b0164 | ||
|
|
71fddc801a | ||
|
|
fe3cb43be6 | ||
|
|
1e2069e115 | ||
|
|
640fd8c77a | ||
|
|
54a34a9e15 |
||
|
|
ba17128e25 | ||
|
|
9e5e384dd9 |
||
|
|
ca6c6e0156 | ||
|
|
84521c8985 | ||
|
|
322801eca0 | ||
|
|
78307330c5 | ||
|
|
3c68055666 | ||
|
|
4580bcd3f4 |
||
|
|
89bc4eff46 | ||
|
|
83a6f64851 |
||
|
|
b5990e17b5 |
||
|
|
9aa03a40ae | ||
|
|
ca6247914f |
||
|
|
95ab533a10 | ||
|
|
28b9c55250 |
||
|
|
b900cc7faf | ||
|
|
d7ab4334ec | ||
|
|
92a43c1652 | ||
|
|
6f4c92bd5d | ||
|
|
d32fdd95f3 | ||
|
|
227fad2b4f |
||
|
|
ada8636259 | ||
|
|
b021352fac | ||
|
|
a42cf4094b | ||
|
|
a6f6bc97b4 | ||
|
|
1d94c7f5e3 | ||
|
|
eaea80f978 |
||
|
|
f54dbb96e6 | ||
|
|
044dcd0583 | ||
|
|
f6590d9943 | ||
|
|
da1506de3c | ||
|
|
3bff514f7e | ||
|
|
9ef8748b9b | ||
|
|
d551c7cc91 | ||
|
|
e95a8712c6 | ||
|
|
1e25c9781e | ||
|
|
11b0d53dbb | ||
|
|
6db9c7de98 | ||
|
|
737952f203 | ||
|
|
d87da45590 | ||
|
|
fed126e1df | ||
|
|
6ae75babda | ||
|
|
0ea44c26c0 |
||
|
|
fade6bcd66 | ||
|
|
d3b103c75c | ||
|
|
d59673a829 | ||
|
|
8bad7c6371 | ||
|
|
53f5289d55 | ||
|
|
13c37d03bd | ||
|
|
d0b6300c64 | ||
|
|
54979a5f6c | ||
|
|
4a7cd6a88b | ||
|
|
03d010a965 | ||
|
|
2f637046a0 | ||
|
|
26175c6850 |
95 changed files with 11332 additions and 1030 deletions
7
.github/workflows/macos.yml
vendored
7
.github/workflows/macos.yml
vendored
|
|
@ -34,13 +34,12 @@ jobs:
|
|||
echo "CC=gcc-${{ matrix.version }}" >> $GITHUB_ENV
|
||||
echo "CXX=g++-${{ matrix.version }}" >> $GITHUB_ENV
|
||||
else
|
||||
ls -ls /Applications/
|
||||
sudo xcode-select -switch /Applications/Xcode_${{ matrix.version }}.app
|
||||
echo "CC=$(brew --prefix llvm@18)/bin/clang" >> $GITHUB_ENV
|
||||
echo "CXX=$(brew --prefix llvm@18)/bin/clang++" >> $GITHUB_ENV
|
||||
echo "CC=clang" >> $GITHUB_ENV
|
||||
echo "CXX=clang++" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Configure Build
|
||||
run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DKIWI_JAVA_BINDING=1 ..
|
||||
run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_DEPLOYMENT_TARGET=10.15 -DKIWI_JAVA_BINDING=1 ..
|
||||
- name: Build
|
||||
run: cd build && make -j2
|
||||
- name: Run Unit Test
|
||||
|
|
|
|||
18
.github/workflows/release.yml
vendored
18
.github/workflows/release.yml
vendored
|
|
@ -114,7 +114,7 @@ jobs:
|
|||
asset_name: kiwi-java-${{ steps.get_release.outputs.tag_name }}-mac-${{ matrix.arch }}.jar
|
||||
asset_content_type: application/octet-stream
|
||||
- if: matrix.arch == 'arm64'
|
||||
run: tar -zcvf model.tgz models/cong/base/sj.* models/cong/base/extract.mdl models/cong/base/*.dict models/cong/base/combiningRule.txt models/cong/base/*.mdl
|
||||
run: tar -zcvf model.tgz models/cong/base/sj.* models/cong/base/*.dict models/cong/base/combiningRule.txt models/cong/base/*.mdl
|
||||
- name: Upload release binary
|
||||
if: matrix.arch == 'arm64'
|
||||
uses: actions/upload-release-asset@v1.0.2
|
||||
|
|
@ -319,20 +319,28 @@ jobs:
|
|||
build-emscripten:
|
||||
name: Emscripten
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
lfs: true
|
||||
- uses: mymindstorm/setup-emsdk@v14
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: '20'
|
||||
registry-url: 'https://registry.npmjs.org'
|
||||
- name: Build
|
||||
run: |
|
||||
cd bindings/wasm
|
||||
./build.sh
|
||||
- uses: JS-DevTools/npm-publish@v3
|
||||
with:
|
||||
token: ${{ secrets.NPM_TOKEN }}
|
||||
package: bindings/wasm/package
|
||||
- name: Publish to npm
|
||||
run: |
|
||||
npm install -g npm@latest
|
||||
cd bindings/wasm/package
|
||||
npm publish --provenance --access public
|
||||
- name: Build documenation
|
||||
run: |
|
||||
cd bindings/wasm/package
|
||||
|
|
|
|||
183
.github/workflows/swift.yml
vendored
Normal file
183
.github/workflows/swift.yml
vendored
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
name: Swift Bindings
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'bindings/swift/**'
|
||||
- 'include/kiwi/capi.h'
|
||||
- 'include/kiwi/Macro.h'
|
||||
- '.github/workflows/swift.yml'
|
||||
push:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'bindings/swift/**'
|
||||
- 'include/kiwi/capi.h'
|
||||
- 'include/kiwi/Macro.h'
|
||||
tags:
|
||||
- 'v*'
|
||||
|
||||
jobs:
|
||||
swift-build-test:
|
||||
name: Swift Build and Test
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
lfs: true
|
||||
|
||||
- name: Select Xcode
|
||||
run: sudo xcode-select -switch /Applications/Xcode.app
|
||||
|
||||
- name: Swift Version
|
||||
run: swift --version
|
||||
|
||||
- name: Build C++ Library
|
||||
run: |
|
||||
mkdir build && cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release \
|
||||
-DKIWI_BUILD_DYNAMIC=OFF \
|
||||
-DKIWI_BUILD_CLI=OFF \
|
||||
-DKIWI_BUILD_EVALUATOR=OFF \
|
||||
-DKIWI_BUILD_MODEL_BUILDER=OFF \
|
||||
-DKIWI_BUILD_TEST=OFF \
|
||||
-DKIWI_JAVA_BINDING=OFF \
|
||||
-DKIWI_USE_MIMALLOC=ON \
|
||||
..
|
||||
make -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Verify Static Library
|
||||
run: |
|
||||
if [ ! -f build/libkiwi_static.a ]; then
|
||||
echo "Error: libkiwi_static.a not found"
|
||||
exit 1
|
||||
fi
|
||||
file build/libkiwi_static.a
|
||||
ls -lh build/libkiwi_static.a
|
||||
|
||||
- name: Build Swift Package
|
||||
run: |
|
||||
cd bindings/swift
|
||||
swift build -v -Xlinker -L../../build -Xlinker -lkiwi_static
|
||||
|
||||
- name: Run Swift Tests
|
||||
run: |
|
||||
cd bindings/swift
|
||||
swift test -v -Xlinker -L../../build -Xlinker -lkiwi_static
|
||||
|
||||
swift-xcframework:
|
||||
name: Build XCFramework
|
||||
runs-on: macos-latest
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
lfs: true
|
||||
|
||||
- name: Setup Xcode
|
||||
run: sudo xcode-select -switch /Applications/Xcode.app
|
||||
|
||||
- name: Build XCFramework
|
||||
run: |
|
||||
chmod +x bindings/swift/scripts/build-xcframework.sh
|
||||
bindings/swift/scripts/build-xcframework.sh
|
||||
|
||||
- name: Verify XCFramework
|
||||
run: |
|
||||
if [ ! -d bindings/swift/xcframework/Kiwi.xcframework ]; then
|
||||
echo "Error: Kiwi.xcframework not found"
|
||||
exit 1
|
||||
fi
|
||||
if [ ! -f bindings/swift/xcframework/Kiwi.xcframework.zip ]; then
|
||||
echo "Error: Kiwi.xcframework.zip not found"
|
||||
exit 1
|
||||
fi
|
||||
ls -lh bindings/swift/xcframework/
|
||||
|
||||
- name: Calculate Checksum
|
||||
run: |
|
||||
cd bindings/swift/xcframework
|
||||
swift package compute-checksum Kiwi.xcframework.zip > checksum.txt
|
||||
echo "Checksum: $(cat checksum.txt)"
|
||||
|
||||
- name: Archive XCFramework
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: Kiwi-xcframework
|
||||
path: |
|
||||
bindings/swift/xcframework/Kiwi.xcframework.zip
|
||||
bindings/swift/xcframework/checksum.txt
|
||||
|
||||
swift-release:
|
||||
name: Release XCFramework
|
||||
runs-on: macos-latest
|
||||
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
lfs: true
|
||||
|
||||
- name: Setup Xcode
|
||||
run: sudo xcode-select -switch /Applications/Xcode.app
|
||||
|
||||
- name: Build XCFramework
|
||||
run: |
|
||||
chmod +x bindings/swift/scripts/build-xcframework.sh
|
||||
bindings/swift/scripts/build-xcframework.sh
|
||||
|
||||
- name: Calculate Checksum
|
||||
id: checksum
|
||||
run: |
|
||||
cd bindings/swift/xcframework
|
||||
CHECKSUM=$(swift package compute-checksum Kiwi.xcframework.zip)
|
||||
echo "checksum=$CHECKSUM" >> $GITHUB_OUTPUT
|
||||
echo "Checksum: $CHECKSUM"
|
||||
|
||||
- name: Upload to Release
|
||||
uses: softprops/action-gh-release@v1
|
||||
with:
|
||||
files: |
|
||||
bindings/swift/xcframework/Kiwi.xcframework.zip
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Output SPM Configuration
|
||||
run: |
|
||||
TAG=${GITHUB_REF#refs/tags/}
|
||||
echo ""
|
||||
echo "=== Swift Package Manager Configuration ==="
|
||||
echo ""
|
||||
echo "Add this to your Package.swift:"
|
||||
echo ""
|
||||
echo ".binaryTarget("
|
||||
echo " name: \"CKiwi\","
|
||||
echo " url: \"https://github.com/${{ github.repository }}/releases/download/$TAG/Kiwi.xcframework.zip\","
|
||||
echo " checksum: \"${{ steps.checksum.outputs.checksum }}\""
|
||||
echo ")"
|
||||
|
||||
swift-linux-check:
|
||||
name: Swift Linux Compatibility Check
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
lfs: true
|
||||
|
||||
- name: Setup Swift
|
||||
uses: swift-actions/setup-swift@v2
|
||||
with:
|
||||
swift-version: "5.10"
|
||||
|
||||
- name: Check Package Format
|
||||
run: |
|
||||
cd bindings/swift
|
||||
swift package diagnose || true
|
||||
echo "Note: Linux build may not work without modifications, but checking package structure"
|
||||
2
.github/workflows/ubuntu.yml
vendored
2
.github/workflows/ubuntu.yml
vendored
|
|
@ -36,7 +36,7 @@ jobs:
|
|||
- name: Print CPU Info
|
||||
run: cat /proc/cpuinfo
|
||||
- name: Build
|
||||
run: cd build && make -j2
|
||||
run: cd build && make -j1
|
||||
- name: Run Unit Test
|
||||
run: ./build/test/kiwi-test
|
||||
- name: Run Unit Test in Debug mode
|
||||
|
|
|
|||
50
.github/workflows/wasm.yml
vendored
Normal file
50
.github/workflows/wasm.yml
vendored
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
name: WASM Bindings
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'bindings/wasm/**'
|
||||
- 'src/**'
|
||||
- 'include/**'
|
||||
- 'CMakeLists.txt'
|
||||
- '.github/workflows/wasm.yml'
|
||||
push:
|
||||
branches: [ main ]
|
||||
paths:
|
||||
- 'bindings/wasm/**'
|
||||
- 'src/**'
|
||||
- 'include/**'
|
||||
- 'CMakeLists.txt'
|
||||
- '.github/workflows/wasm.yml'
|
||||
|
||||
jobs:
|
||||
wasm-build-test:
|
||||
name: WASM Build and Test
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
lfs: true
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v3
|
||||
with:
|
||||
node-version: '20'
|
||||
|
||||
- name: Setup Emscripten
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
with:
|
||||
version: '3.1.64' # Latest stable or specific version
|
||||
|
||||
- name: Build WASM
|
||||
run: |
|
||||
chmod +x bindings/wasm/build.sh
|
||||
bindings/wasm/build.sh
|
||||
|
||||
- name: Run WASM Unit Test
|
||||
run: |
|
||||
cd bindings/wasm/package
|
||||
npm run test
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
cmake_minimum_required(VERSION 3.12)
|
||||
|
||||
project(kiwi VERSION 0.22.2 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
|
||||
project(kiwi VERSION 0.23.2 DESCRIPTION "Kiwi, Korean Intelligent Word Identifier")
|
||||
|
||||
set ( CMAKE_CXX_STANDARD 17 )
|
||||
set ( CMAKE_VERBOSE_MAKEFILE true )
|
||||
|
|
@ -53,7 +53,7 @@ else()
|
|||
set ( AVX_VNNI_SUPPORTED OFF )
|
||||
endif()
|
||||
|
||||
if(APPLE)
|
||||
if(APPLE AND NOT CMAKE_OSX_ARCHITECTURES)
|
||||
set(CMAKE_OSX_ARCHITECTURES "${KIWI_CPU_ARCH}")
|
||||
endif()
|
||||
|
||||
|
|
@ -79,6 +79,7 @@ set ( CORE_SRCS
|
|||
src/TagUtils.cpp
|
||||
src/TypoTransformer.cpp
|
||||
src/UnicodeCase.cpp
|
||||
src/UnkFormScorer.cpp
|
||||
src/Utils.cpp
|
||||
src/WordDetector.cpp
|
||||
src/archImpl/none.cpp
|
||||
|
|
|
|||
15
README.md
15
README.md
|
|
@ -11,6 +11,7 @@ x86_64:
|
|||
Other:
|
||||
[](https://github.com/bab2min/Kiwi/actions)
|
||||
[](https://github.com/bab2min/Kiwi/actions)
|
||||
[](https://github.com/bab2min/Kiwi/actions)
|
||||
|
||||
Kiwi는 빠른 속도와 범용적인 성능을 지향하는 한국어 형태소 분석기 라이브러리입니다. 한국어 처리에 관심 있는 사람이면 누구나 쉽게 사용할 수 있도록 오픈 소스로 공개 중이며, C++로 구현된 코어 라이브러리를 래핑하여 다양한 프로그래밍 언어에 사용할 수 있도록 준비 중입니다.
|
||||
|
||||
|
|
@ -32,7 +33,7 @@ Kiwi는 빠른 속도와 범용적인 성능을 지향하는 한국어 형태소
|
|||
|
||||
문장 분리 기능을 비롯한 다양한 편의기능을 제공합니다. (문장 분리 성능 평가는 [이 페이지](https://github.com/bab2min/kiwipiepy/tree/main/benchmark/sentence_split)에서 수행가능합니다. )
|
||||
|
||||
라이브러리 차원에서 멀티스레딩을 지원하기 때문에 대량의 텍스트를 분석해야할 경우 멀티코어를 활용하여 빠른 분석이 가능합니다. 또한 다양한 시스템에서 상황에 맞춰 선택할 수 있도록 소형/중형/대형 모델을 제공합니다.
|
||||
라이브러리 차원에서 멀티스레딩을 지원하기 때문에 대량의 텍스트를 분석해야할 경우 멀티코어를 활용하여 빠른 분석이 가능합니다.
|
||||
|
||||
## 웹 데모 페이지
|
||||
최신 버전의 Kiwi를 사용해볼 수 있는 [웹 데모 페이지](https://kiwi.bab2min.pe.kr/)를 제공하고 있습니다.
|
||||
|
|
@ -149,6 +150,12 @@ Android NDK를 통해 Android 앱에서 사용할 수 있는 AAR 라이브러리
|
|||
- **사용법**: [bindings/java](bindings/java)의 README 참조
|
||||
- **패키지**: AAR 형태로 제공되어 Gradle 프로젝트에 쉽게 통합 가능
|
||||
|
||||
### Swift Wrapper
|
||||
iOS 12.0 이상 및 macOS 10.14 이상에서 사용 가능한 Swift binding이 제공 예정입니다. 조금만 기다려주세요.
|
||||
- **최소 요구사항**: iOS 12.0+ / macOS 10.14+, Swift 5.7+
|
||||
- **사용법**: [bindings/swift](bindings/swift)의 README 참조
|
||||
- **설치**: Swift Package Manager 지원
|
||||
|
||||
### R Wrapper
|
||||
[mrchypark](https://github.com/mrchypark)님께서 기여해주신 R언어용 wrapper인 [elbird](https://mrchypark.github.io/elbird/)가 있습니다.
|
||||
|
||||
|
|
@ -158,6 +165,12 @@ Android NDK를 통해 Android 앱에서 사용할 수 있는 AAR 라이브러리
|
|||
### Web Assembly (Javascript/Typescript)
|
||||
[RicBent](https://github.com/RicBent)님께서 기여해주신 Web Assembly binding이 있습니다. 이에 대해서는 [bindings/wasm](bindings/wasm)를 참조하시길 바랍니다.
|
||||
|
||||
### Rust Wrapper
|
||||
[JAICHANGPARK](https://github.com/JAICHANGPARK)님께서 개발하신 Rust용 wrapper인 [kiwi-rs](https://github.com/JAICHANGPARK/kiwi-rs)가 있습니다.
|
||||
|
||||
### Flutter Wrapper
|
||||
[JAICHANGPARK](https://github.com/JAICHANGPARK)님께서 개발하신 Flutter용 wrapper인 [flutter_kiwi_nlp](https://github.com/JAICHANGPARK/flutter_kiwi_nlp)가 있습니다.
|
||||
|
||||
### 응용 프로그램
|
||||
Kiwi는 C# 기반의 GUI 형태로도 제공됩니다.
|
||||
형태소 분석기는 사용해야하지만 별도의 프로그래밍 지식이 없는 경우 이 프로그램을 사용하시면 됩니다.
|
||||
|
|
|
|||
|
|
@ -278,6 +278,17 @@ namespace jni
|
|||
};
|
||||
}
|
||||
|
||||
class JPreparedTypoTransformer : public kiwi::PreparedTypoTransformer, jni::JObject<JPreparedTypoTransformer>
|
||||
{
|
||||
public:
|
||||
static constexpr std::string_view className = "kr/pe/bab2min/KiwiBuilder$PreparedTypoTransformer";
|
||||
|
||||
JPreparedTypoTransformer() : PreparedTypoTransformer{} {}
|
||||
JPreparedTypoTransformer(kiwi::PreparedTypoTransformer&& inst) : PreparedTypoTransformer{ std::move(inst) } {}
|
||||
JPreparedTypoTransformer(JPreparedTypoTransformer&&) = default;
|
||||
JPreparedTypoTransformer& operator=(JPreparedTypoTransformer&&) = default;
|
||||
};
|
||||
|
||||
class JKiwi;
|
||||
|
||||
class JMorphemeSet : jni::JObject<JMorphemeSet>
|
||||
|
|
@ -322,8 +333,7 @@ public:
|
|||
JMorphemeSet* _blocklist,
|
||||
kiwi::Dialect _allowedDialects,
|
||||
float _dialectCost,
|
||||
jni::JIterator<jni::JIterator<kiwi::PretokenizedSpan>> _pretokenized
|
||||
);
|
||||
jni::JIterator<jni::JIterator<kiwi::PretokenizedSpan>> _pretokenized);
|
||||
JMultipleTokenResult(JMultipleTokenResult&&) = default;
|
||||
JMultipleTokenResult& operator=(JMultipleTokenResult&&) = default;
|
||||
|
||||
|
|
@ -385,8 +395,9 @@ public:
|
|||
return KIWI_VERSION_STRING;
|
||||
}
|
||||
|
||||
auto analyze(const std::u16string& text, uint64_t topN,
|
||||
auto analyze(const std::u16string& text, uint64_t topN,
|
||||
kiwi::Match matchOption, JMorphemeSet* blocklist, kiwi::Dialect allowedDialects, float dialectCost,
|
||||
JPreparedTypoTransformer* typoTransformer, float typoThreshold,
|
||||
jni::JIterator<kiwi::PretokenizedSpan> pretokenized) const
|
||||
{
|
||||
std::vector<kiwi::PretokenizedSpan> pretokenizedSpans;
|
||||
|
|
@ -394,13 +405,15 @@ public:
|
|||
{
|
||||
while (pretokenized.hasNext()) pretokenizedSpans.emplace_back(pretokenized.next());
|
||||
}
|
||||
return Kiwi::analyze(text, topN,
|
||||
kiwi::AnalyzeOption{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost },
|
||||
pretokenizedSpans);
|
||||
kiwi::AnalyzeOption opt{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost };
|
||||
opt.typoTransformer = typoTransformer;
|
||||
opt.typoThreshold = typoThreshold;
|
||||
return Kiwi::analyze(text, topN, opt, pretokenizedSpans);
|
||||
}
|
||||
|
||||
JFutureTokenResult asyncAnalyze(jni::JRef<JKiwi> _ref, const std::u16string& text, uint64_t topN,
|
||||
JFutureTokenResult asyncAnalyze(jni::JRef<JKiwi> _ref, const std::u16string& text, uint64_t topN,
|
||||
kiwi::Match matchOption, JMorphemeSet* blocklist, kiwi::Dialect allowedDialects, float dialectCost,
|
||||
JPreparedTypoTransformer* typoTransformer, float typoThreshold,
|
||||
jni::JIterator<kiwi::PretokenizedSpan> pretokenized) const
|
||||
{
|
||||
std::vector<kiwi::PretokenizedSpan> pretokenizedSpans;
|
||||
|
|
@ -408,13 +421,15 @@ public:
|
|||
{
|
||||
while (pretokenized.hasNext()) pretokenizedSpans.emplace_back(pretokenized.next());
|
||||
}
|
||||
return { _ref, Kiwi::asyncAnalyze(text, topN,
|
||||
kiwi::AnalyzeOption{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost },
|
||||
pretokenizedSpans) };
|
||||
kiwi::AnalyzeOption opt{ matchOption, blocklist ? &blocklist->morphSet : nullptr, false, allowedDialects, dialectCost };
|
||||
opt.typoTransformer = typoTransformer;
|
||||
opt.typoThreshold = typoThreshold;
|
||||
return { _ref, Kiwi::asyncAnalyze(text, topN, opt, pretokenizedSpans) };
|
||||
}
|
||||
|
||||
JMultipleTokenResult analyze2(jni::JRef<JKiwi> _ref, jni::JIterator<std::u16string> texts, uint64_t topN,
|
||||
JMultipleTokenResult analyze2(jni::JRef<JKiwi> _ref, jni::JIterator<std::u16string> texts, uint64_t topN,
|
||||
kiwi::Match matchOption, JMorphemeSet* blocklist, kiwi::Dialect allowedDialects, float dialectCost,
|
||||
JPreparedTypoTransformer* typoTransformer, float typoThreshold,
|
||||
jni::JIterator<jni::JIterator<kiwi::PretokenizedSpan>> pretokenized) const
|
||||
{
|
||||
if (!texts) throw std::bad_optional_access{};
|
||||
|
|
@ -557,6 +572,11 @@ public:
|
|||
{
|
||||
TypoTransformer::update(o);
|
||||
}
|
||||
|
||||
JPreparedTypoTransformer prepare() const
|
||||
{
|
||||
return TypoTransformer::prepare(true);
|
||||
}
|
||||
};
|
||||
|
||||
class JStreamProvider : jni::JPureObject<JStreamProvider>
|
||||
|
|
@ -720,16 +740,9 @@ public:
|
|||
return KiwiBuilder::addPreAnalyzedWord(form, morphs, positions, score);
|
||||
}
|
||||
|
||||
JKiwi build(JTypoTransformer* typos, float typoCostThreshold) const
|
||||
JKiwi build() const
|
||||
{
|
||||
if (typos)
|
||||
{
|
||||
return KiwiBuilder::build(*typos, typoCostThreshold);
|
||||
}
|
||||
else
|
||||
{
|
||||
return KiwiBuilder::build();
|
||||
}
|
||||
return KiwiBuilder::build();
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -739,6 +752,8 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
|
|||
{
|
||||
return gModule.load(vm,
|
||||
|
||||
jni::define<JPreparedTypoTransformer>(),
|
||||
|
||||
jni::define<JTypoTransformer>()
|
||||
.template ctor<>()
|
||||
.template method<&JTypoTransformer::addTypo>("_addTypo")
|
||||
|
|
@ -746,7 +761,8 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
|
|||
.template method<&JTypoTransformer::setLengtheningTypoCost>("_setLengtheningTypoCost")
|
||||
.template method<&JTypoTransformer::copy>("copy")
|
||||
.template method<&JTypoTransformer::update>("_update")
|
||||
.template method<&JTypoTransformer::scaleCost>("_scaleCost"),
|
||||
.template method<&JTypoTransformer::scaleCost>("_scaleCost")
|
||||
.template method<&JTypoTransformer::prepare>("prepare"),
|
||||
|
||||
jni::define<JKiwiBuilder>()
|
||||
.template ctor<std::string, size_t, kiwi::BuildOption, kiwi::ModelType, kiwi::Dialect>()
|
||||
|
|
|
|||
|
|
@ -21,6 +21,11 @@ public class Kiwi implements AutoCloseable {
|
|||
mention = 1 << 3,
|
||||
serial = 1 << 4,
|
||||
emoji = 1 << 5,
|
||||
oovRuleOnly = 0 << 8,
|
||||
oovChrModel = 1 << 8,
|
||||
oovChrFreqModel = 2 << 8,
|
||||
oovChrFreqBranchModel = 3 << 8,
|
||||
oovMask = 3 << 8,
|
||||
normalizeCoda = 1 << 16,
|
||||
joinNounPrefix = 1 << 17,
|
||||
joinNounSuffix = 1 << 18,
|
||||
|
|
@ -32,6 +37,7 @@ public class Kiwi implements AutoCloseable {
|
|||
compatibleJamo = 1 << 24,
|
||||
splitSaisiot = 1 << 25,
|
||||
mergeSaisiot = 1 << 26,
|
||||
joinParticleYo = 1 << 27,
|
||||
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
|
||||
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
|
||||
all = url | email | hashtag | mention | serial | zCoda,
|
||||
|
|
@ -167,12 +173,20 @@ public class Kiwi implements AutoCloseable {
|
|||
public MorphemeSet blocklist;
|
||||
public short allowedDialects;
|
||||
public float dialectCost;
|
||||
public KiwiBuilder.PreparedTypoTransformer typoTransformer;
|
||||
public float typoThreshold;
|
||||
|
||||
public AnalyzeOption(int match, MorphemeSet blocklist, short allowedDialects, float dialectCost) {
|
||||
public AnalyzeOption(int match, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold) {
|
||||
this.match = match;
|
||||
this.blocklist = blocklist;
|
||||
this.allowedDialects = allowedDialects;
|
||||
this.dialectCost = dialectCost;
|
||||
this.typoTransformer = typoTransformer;
|
||||
this.typoThreshold = typoThreshold;
|
||||
}
|
||||
|
||||
public AnalyzeOption(int match, MorphemeSet blocklist, short allowedDialects, float dialectCost) {
|
||||
this(match, blocklist, allowedDialects, dialectCost, null, 2.5f);
|
||||
}
|
||||
|
||||
public AnalyzeOption(int match, MorphemeSet blocklist) {
|
||||
|
|
@ -427,16 +441,16 @@ public class Kiwi implements AutoCloseable {
|
|||
return _inst != 0;
|
||||
}
|
||||
|
||||
public native TokenResult[] analyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, Iterator<PretokenizedSpan> pretokenized);
|
||||
public native FutureTokenResult asyncAnalyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, Iterator<PretokenizedSpan> pretokenized);
|
||||
public native MultipleTokenResult analyze(Iterator<String> texts, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, Iterator<Iterator<PretokenizedSpan>> pretokenized);
|
||||
public native TokenResult[] analyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold, Iterator<PretokenizedSpan> pretokenized);
|
||||
public native FutureTokenResult asyncAnalyze(String text, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold, Iterator<PretokenizedSpan> pretokenized);
|
||||
public native MultipleTokenResult analyze(Iterator<String> texts, int topN, int matchOption, MorphemeSet blocklist, short allowedDialects, float dialectCost, KiwiBuilder.PreparedTypoTransformer typoTransformer, float typoThreshold, Iterator<Iterator<PretokenizedSpan>> pretokenized);
|
||||
public native Sentence[] splitIntoSents(String text, int matchOption, boolean returnTokens);
|
||||
public native String join(JoinableToken[] tokens);
|
||||
|
||||
public static native String getVersion();
|
||||
|
||||
public TokenResult[] analyze(String text, int topN, AnalyzeOption option, Iterator<PretokenizedSpan> pretokenized) {
|
||||
return analyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, pretokenized);
|
||||
return analyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, option.typoTransformer, option.typoThreshold, pretokenized);
|
||||
}
|
||||
|
||||
public TokenResult[] analyze(String text, int topN, AnalyzeOption option) {
|
||||
|
|
@ -444,7 +458,7 @@ public class Kiwi implements AutoCloseable {
|
|||
}
|
||||
|
||||
public FutureTokenResult asyncAnalyze(String text, int topN, AnalyzeOption option, Iterator<PretokenizedSpan> pretokenized) {
|
||||
return asyncAnalyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, pretokenized);
|
||||
return asyncAnalyze(text, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, option.typoTransformer, option.typoThreshold, pretokenized);
|
||||
}
|
||||
|
||||
public FutureTokenResult asyncAnalyze(String text, int topN, AnalyzeOption option) {
|
||||
|
|
@ -452,7 +466,7 @@ public class Kiwi implements AutoCloseable {
|
|||
}
|
||||
|
||||
public MultipleTokenResult analyze(Iterator<String> texts, int topN, AnalyzeOption option, Iterator<Iterator<PretokenizedSpan>> pretokenized) {
|
||||
return analyze(texts, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, pretokenized);
|
||||
return analyze(texts, topN, option.match, option.blocklist, option.allowedDialects, option.dialectCost, option.typoTransformer, option.typoThreshold, pretokenized);
|
||||
}
|
||||
|
||||
public MultipleTokenResult analyze(Iterator<String> texts, int topN, AnalyzeOption option) {
|
||||
|
|
|
|||
|
|
@ -59,7 +59,8 @@ public class KiwiBuilder implements AutoCloseable {
|
|||
final static public byte none = 0,
|
||||
any = 1,
|
||||
vowel = 2,
|
||||
applosive = 8;
|
||||
applosive = 8,
|
||||
continual = 9;
|
||||
}
|
||||
|
||||
public static class TypoTransformer implements AutoCloseable {
|
||||
|
|
@ -137,6 +138,27 @@ public class KiwiBuilder implements AutoCloseable {
|
|||
_scaleCost(scale);
|
||||
return this;
|
||||
}
|
||||
|
||||
public native PreparedTypoTransformer prepare();
|
||||
}
|
||||
|
||||
public static class PreparedTypoTransformer implements AutoCloseable {
|
||||
private long _inst;
|
||||
|
||||
public PreparedTypoTransformer(long _inst) {
|
||||
this._inst = _inst;
|
||||
}
|
||||
|
||||
protected void finalize() throws Exception {
|
||||
close();
|
||||
}
|
||||
|
||||
public boolean isAlive() {
|
||||
return _inst != 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public native void close() throws Exception;
|
||||
}
|
||||
|
||||
public KiwiBuilder(long _inst) {
|
||||
|
|
@ -197,20 +219,12 @@ public class KiwiBuilder implements AutoCloseable {
|
|||
@Override
|
||||
public native void close() throws Exception;
|
||||
|
||||
public native Kiwi build(TypoTransformer typos, float typoCostThreshold);
|
||||
public native Kiwi build();
|
||||
public native boolean addWord(String form, byte tag, float score);
|
||||
public native boolean addWord(String form, byte tag, float score, String origForm);
|
||||
public native boolean addPreAnalyzedWord(String form, AnalyzedMorph[] analyzed, float score);
|
||||
public native int loadDictionary(String path);
|
||||
|
||||
public Kiwi build() {
|
||||
return build(null, 0);
|
||||
}
|
||||
|
||||
public Kiwi build(TypoTransformer typos) {
|
||||
return build(typos, 2.5f);
|
||||
}
|
||||
|
||||
static {
|
||||
Kiwi.loadLibrary();
|
||||
}
|
||||
|
|
@ -317,9 +331,49 @@ public class KiwiBuilder implements AutoCloseable {
|
|||
.addTypo(new String[]{"ᆴ"}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none)
|
||||
.addTypo(new String[]{"ᆵ"}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none)
|
||||
.addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none)
|
||||
.addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none);
|
||||
.addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none)
|
||||
|
||||
.addTypo(new String[]{"ᆨᄋ"}, new String[]{"ᄀ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆩᄋ", "ᆨᄀ"}, new String[]{"ᄁ", "ᆨᄀ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆪᄋ", "ᆪᄒ"}, new String[]{"ᆨᄉ", "ᆨᄊ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆫᄋ", "ᆫᄒ"}, new String[]{"ᄂ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆬᄋ", "ᆫᄌ"}, new String[]{"ᆬᄋ", "ᆫᄌ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆭᄋ"}, new String[]{"ᆫᄒ", "ᄂ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆭᄀ"}, new String[]{"ᆫᄏ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆭᄃ"}, new String[]{"ᆫᄐ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆭᄇ"}, new String[]{"ᆫᄑ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆭᄉ"}, new String[]{"ᆫᄉ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆭᄌ"}, new String[]{"ᆫᄎ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆮᄋ"}, new String[]{"ᄃ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆯᄋ", "ᆯᄒ"}, new String[]{"ᄅ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆰᄋ"}, new String[]{"ᆯᄀ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆰᄀ"}, new String[]{"ᆯᄁ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆰᄒ"}, new String[]{"ᆯᄏ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆱᄋ", "ᆱᄒ"}, new String[]{"ᆯᄆ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆲᄋ"}, new String[]{"ᆯᄇ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆲᄇ"}, new String[]{"ᆯᄈ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆲᄒ"}, new String[]{"ᆯᄑ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆳᄋ"}, new String[]{"ᆯᄉ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆳᄉ"}, new String[]{"ᆯᄊ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆴᄋ", "ᆴᄐ", "ᆴᄒ"}, new String[]{"ᆯᄐ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆵᄋ", "ᆵᄑ", "ᆵᄒ"}, new String[]{"ᆯᄑ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆶᄉ"}, new String[]{"ᆯᄉ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆶᄋ", "ᆶᄒ"}, new String[]{"ᆯᄒ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆷᄋ", "ᆷᄒ"}, new String[]{"ᄆ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆸᄋ"}, new String[]{"ᄇ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆸᄇ"}, new String[]{"ᄈ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆹᄋ", "ᆹᄒ"}, new String[]{"ᆸᄉ", "ᆸᄊ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆺᄋ"}, new String[]{"ᄉ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆻᄋ", "ᆺᄉ"}, new String[]{"ᄊ", "ᆺᄉ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆽᄋ"}, new String[]{"ᄌ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆽᄌ"}, new String[]{"ᄍ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᇂᄌ", "ᇂᄎ"}, new String[]{"ᄎ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᇂᄀ", "ᇂᄏ"}, new String[]{"ᄏ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᇂᄃ", "ᇂᄐ"}, new String[]{"ᄐ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᇂᄇ", "ᇂᄑ"}, new String[]{"ᄑ"}, 1.f, CondVowel.continual)
|
||||
.addTypo(new String[]{"ᇂᄋ"}, new String[]{"ᄒ"}, 1.f, CondVowel.continual);
|
||||
|
||||
final public static TypoTransformer basicTypoSetWithContinual = basicTypoSet.copy().update(continualTypoSet);
|
||||
|
||||
final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.5f);
|
||||
final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.25f);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ import java.util.concurrent.Future;
|
|||
import org.junit.Test;
|
||||
|
||||
import kr.pe.bab2min.KiwiBuilder.TypoTransformer;
|
||||
import kr.pe.bab2min.KiwiBuilder.PreparedTypoTransformer;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
|
|
@ -122,8 +123,10 @@ public class KiwiTest {
|
|||
public void testTypos() throws Exception {
|
||||
System.gc();
|
||||
KiwiBuilder builder = new KiwiBuilder(modelPath);
|
||||
Kiwi kiwi = builder.build(KiwiBuilder.basicTypoSet);
|
||||
Kiwi.Token[] tokens = kiwi.tokenize("나 죰 도와죠.", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
Kiwi kiwi = builder.build();
|
||||
PreparedTypoTransformer preparedTypo = KiwiBuilder.basicTypoSet.prepare();
|
||||
AnalyzeOption option = new AnalyzeOption(Kiwi.Match.allWithNormalizing, null, Kiwi.Dialect.standard, 0.0f, preparedTypo, 2.5f);
|
||||
Kiwi.Token[] tokens = kiwi.tokenize("나 죰 도와죠.", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[1].form, "좀");
|
||||
assertEquals(tokens[4].form, "주");
|
||||
|
|
@ -134,29 +137,31 @@ public class KiwiTest {
|
|||
public void testContinualTypos() throws Exception {
|
||||
System.gc();
|
||||
KiwiBuilder builder = new KiwiBuilder(modelPath);
|
||||
Kiwi kiwi = builder.build(KiwiBuilder.continualTypoSet);
|
||||
Kiwi kiwi = builder.build();
|
||||
PreparedTypoTransformer preparedTypo = KiwiBuilder.continualTypoSet.prepare();
|
||||
AnalyzeOption option = new AnalyzeOption(Kiwi.Match.allWithNormalizing, null, Kiwi.Dialect.standard, 0.0f, preparedTypo, 2.5f);
|
||||
|
||||
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[0].form, "프로그램");
|
||||
assertEquals(tokens[1].form, "이");
|
||||
|
||||
tokens = kiwi.tokenize("프로그래믈", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
tokens = kiwi.tokenize("프로그래믈", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[0].form, "프로그램");
|
||||
assertEquals(tokens[1].form, "을");
|
||||
|
||||
tokens = kiwi.tokenize("오늘사무시레서", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
tokens = kiwi.tokenize("오늘사무시레서", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[1].form, "사무실");
|
||||
assertEquals(tokens[2].form, "에서");
|
||||
|
||||
tokens = kiwi.tokenize("법원이 기가캤다.", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
tokens = kiwi.tokenize("법원이 기가캤다.", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[2].form, "기각");
|
||||
assertEquals(tokens[3].form, "하");
|
||||
|
||||
tokens = kiwi.tokenize("하나도 업써.", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
tokens = kiwi.tokenize("하나도 업써.", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[2].form, "없");
|
||||
assertEquals(tokens[3].form, "어");
|
||||
|
|
@ -169,19 +174,21 @@ public class KiwiTest {
|
|||
TypoTransformer typoSet = KiwiBuilder.basicTypoSet.copy()
|
||||
.update(KiwiBuilder.continualTypoSet)
|
||||
.update(KiwiBuilder.lengtheningTypoSet);
|
||||
Kiwi kiwi = builder.build(typoSet);
|
||||
|
||||
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
Kiwi kiwi = builder.build();
|
||||
PreparedTypoTransformer preparedTypo = typoSet.prepare();
|
||||
AnalyzeOption option = new AnalyzeOption(Kiwi.Match.allWithNormalizing, null, Kiwi.Dialect.standard, 0.0f, preparedTypo, 2.5f);
|
||||
|
||||
Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[0].form, "프로그램");
|
||||
assertEquals(tokens[1].form, "이");
|
||||
|
||||
tokens = kiwi.tokenize("지인짜?", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
tokens = kiwi.tokenize("지인짜?", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[0].form, "진짜");
|
||||
assertEquals(tokens[1].form, "?");
|
||||
|
||||
tokens = kiwi.tokenize("맗은 물", new AnalyzeOption(Kiwi.Match.allWithNormalizing));
|
||||
tokens = kiwi.tokenize("맗은 물", option);
|
||||
System.out.println(Arrays.deepToString(tokens));
|
||||
assertEquals(tokens[0].form, "맑");
|
||||
}
|
||||
|
|
|
|||
12
bindings/swift/.gitignore
vendored
Normal file
12
bindings/swift/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
.DS_Store
|
||||
/.build
|
||||
/Packages
|
||||
/*.xcodeproj
|
||||
xcuserdata/
|
||||
DerivedData/
|
||||
.swiftpm/config/registries.json
|
||||
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
|
||||
.netrc
|
||||
*.xcframework
|
||||
build/
|
||||
xcframework/
|
||||
25
bindings/swift/CMakeLists.txt
Normal file
25
bindings/swift/CMakeLists.txt
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# CMakeLists.txt for iOS/macOS Swift bindings
cmake_minimum_required(VERSION 3.19)

# Builds Kiwi as a static library for iOS/macOS so that it can be bundled
# into an XCFramework for Swift Package Manager.

project(kiwi_swift LANGUAGES CXX)

# Set C++ standard
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Disable options that are not needed for Swift bindings.
# FORCE overrides any value that is already stored in the cache.
set(KIWI_BUILD_DYNAMIC OFF CACHE BOOL "Build dynamic library" FORCE)
set(KIWI_BUILD_CLI OFF CACHE BOOL "Build CLI tool" FORCE)
set(KIWI_BUILD_EVALUATOR OFF CACHE BOOL "Build Evaluator" FORCE)
set(KIWI_BUILD_MODEL_BUILDER OFF CACHE BOOL "Build Model Builder" FORCE)
set(KIWI_BUILD_TEST OFF CACHE BOOL "Build Test sets" FORCE)
set(KIWI_JAVA_BINDING OFF CACHE BOOL "Build Java binding" FORCE)

# Pull in the main Kiwi project.
# add_subdirectory() is used instead of include(): include() would evaluate the
# root CMakeLists.txt with CMAKE_CURRENT_SOURCE_DIR still pointing at
# bindings/swift, so relative paths in that file would resolve against the
# wrong directory. An explicit binary directory is mandatory because the source
# directory is not located below this one.
add_subdirectory(
    "${CMAKE_CURRENT_LIST_DIR}/../.."
    "${CMAKE_CURRENT_BINARY_DIR}/kiwi"
)

# The main CMakeLists.txt is expected to create the kiwi_static target,
# which is the library used for the XCFramework.
|
||||
220
bindings/swift/DEVELOPMENT.md
Normal file
220
bindings/swift/DEVELOPMENT.md
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
# Swift Bindings Development Guide
|
||||
|
||||
## Overview
|
||||
|
||||
This document provides technical details for developers working on the Kiwi Swift bindings.
|
||||
|
||||
## Architecture
|
||||
|
||||
The Swift bindings use a direct C interoperability approach:
|
||||
|
||||
```
|
||||
┌─────────────────────┐
|
||||
│ Swift API Layer │ User-friendly Swift interface
|
||||
│ (Kiwi.swift, etc) │
|
||||
└──────────┬──────────┘
|
||||
│
|
||||
┌──────────┴──────────┐
|
||||
│ CKiwi Module │ C API bridging via module.modulemap
|
||||
└──────────┬──────────┘
|
||||
│
|
||||
┌──────────┴──────────┐
|
||||
│ libkiwi_static.a │ Static C++ library
|
||||
└─────────────────────┘
|
||||
```
|
||||
|
||||
## File Structure
|
||||
|
||||
```
|
||||
bindings/swift/
|
||||
├── Package.swift # Swift Package Manager manifest
|
||||
├── README.md # User documentation
|
||||
├── CMakeLists.txt # Build configuration
|
||||
├── .gitignore # Git ignore rules
|
||||
├── Sources/
|
||||
│ ├── CKiwi/ # C module for bridging
|
||||
│ │ ├── module.modulemap # C module definition
|
||||
│ │ └── include/ # Symbolic links to C headers
|
||||
│ │ ├── capi.h -> ../../../../../include/kiwi/capi.h
|
||||
│ │ └── Macro.h -> ../../../../../include/kiwi/Macro.h
|
||||
│ └── Kiwi/ # Swift wrapper layer
|
||||
│ ├── Kiwi.swift # Main analyzer class
|
||||
│ ├── KiwiBuilder.swift # Builder pattern
|
||||
│ ├── Token.swift # Token structures
|
||||
│ ├── POSTag.swift # POS tag enum
|
||||
│ ├── MatchOptions.swift # Analysis options
|
||||
│ ├── Dialect.swift # Dialect flags
|
||||
│ ├── Joiner.swift # Morpheme joiner
|
||||
│ ├── MorphemeSet.swift # Morpheme blacklist
|
||||
│ ├── TypoTransformer.swift # Typo correction
|
||||
│ ├── Errors.swift # Error types
|
||||
│ └── Internal/
|
||||
│ └── HandleWrapper.swift # RAII wrapper for C handles
|
||||
├── Tests/
|
||||
│ └── KiwiTests/
|
||||
│ └── KiwiTests.swift # Unit tests
|
||||
└── scripts/
|
||||
└── build-xcframework.sh # XCFramework build script
|
||||
```
|
||||
|
||||
## Key Design Patterns
|
||||
|
||||
### 1. RAII via HandleWrapper
|
||||
|
||||
C handles are wrapped in a Swift class that automatically releases resources:
|
||||
|
||||
```swift
|
||||
internal final class HandleWrapper<H> {
|
||||
let handle: H
|
||||
private let cleanup: (H) -> Void
|
||||
|
||||
init(_ handle: H, cleanup: @escaping (H) -> Void) {
|
||||
self.handle = handle
|
||||
self.cleanup = cleanup
|
||||
}
|
||||
|
||||
deinit {
|
||||
cleanup(handle)
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 2. Swift-Friendly Types
|
||||
|
||||
C types are mapped to Swift types:
|
||||
- C `kiwi_h` → Swift `Kiwi` class
|
||||
- C `kiwi_token_info_t` → Swift `Token` struct
|
||||
- C flags → Swift `OptionSet` (MatchOptions, Dialect)
|
||||
- C enums → Swift `enum` (POSTag)
|
||||
|
||||
### 3. Error Handling
|
||||
|
||||
C error codes are converted to Swift errors:
|
||||
|
||||
```swift
|
||||
if result != 0 {
|
||||
if let errorMsg = kiwi_error() {
|
||||
let error = String(cString: errorMsg)
|
||||
kiwi_clear_error()
|
||||
throw KiwiError.operationFailed(error)
|
||||
}
throw KiwiError.operationFailed("Operation failed")
|
||||
}
|
||||
```
|
||||
|
||||
## Building
|
||||
|
||||
### For Development (macOS)
|
||||
|
||||
```bash
|
||||
cd bindings/swift
|
||||
swift build
|
||||
swift test
|
||||
```
|
||||
|
||||
### For Production (XCFramework)
|
||||
|
||||
```bash
|
||||
cd bindings/swift
|
||||
./scripts/build-xcframework.sh
|
||||
```
|
||||
|
||||
This creates `xcframework/Kiwi.xcframework` containing:
|
||||
- iOS Device (arm64)
|
||||
- iOS Simulator (arm64 + x86_64)
|
||||
- macOS (arm64 + x86_64)
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit Tests
|
||||
|
||||
Run basic unit tests:
|
||||
```bash
|
||||
cd bindings/swift
|
||||
swift test
|
||||
```
|
||||
|
||||
### Integration Tests
|
||||
|
||||
Integration tests require actual Kiwi model files. These are not included in unit tests to keep them lightweight.
|
||||
|
||||
## CI/CD
|
||||
|
||||
GitHub Actions workflow (`.github/workflows/swift.yml`) runs:
|
||||
1. Swift package build and test
|
||||
2. XCFramework build (on main branch)
|
||||
3. Linux compatibility check
|
||||
|
||||
## Memory Management
|
||||
|
||||
All C handles are automatically released via Swift's ARC system:
|
||||
|
||||
- `Kiwi` → calls `kiwi_close()`
|
||||
- `KiwiBuilder` → calls `kiwi_builder_close()`
|
||||
- `Joiner` → calls `kiwi_joiner_close()`
|
||||
- `MorphemeSet` → calls `kiwi_morphset_close()`
|
||||
- `TypoTransformer` → calls `kiwi_typo_close()` (if owned)
|
||||
|
||||
## Thread Safety
|
||||
|
||||
The Swift bindings maintain the same thread safety guarantees as the underlying C API:
|
||||
- Multiple `Kiwi` instances can be used concurrently
|
||||
- Individual `Kiwi` instances should not be shared across threads without synchronization
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential areas for improvement:
|
||||
|
||||
1. **Binary Distribution**:
|
||||
- Publish pre-built XCFramework via GitHub Releases
|
||||
- Update Package.swift to reference binary framework
|
||||
|
||||
2. **Additional Features**:
|
||||
- Word extraction APIs
|
||||
- Substring extractor
|
||||
- Pattern matching
|
||||
|
||||
3. **Async/Await**:
|
||||
- Swift async/await wrapper for long-running operations
|
||||
- Currently only sync APIs are provided
|
||||
|
||||
4. **Documentation**:
|
||||
- DocC documentation comments
|
||||
- Swift DocC build for hosted documentation
|
||||
|
||||
## Contributing
|
||||
|
||||
When adding new features:
|
||||
|
||||
1. Add C API function calls in appropriate Swift wrapper
|
||||
2. Convert C types to Swift types appropriately
|
||||
3. Add error handling
|
||||
4. Update tests
|
||||
5. Update README with examples
|
||||
6. Update this guide if architecture changes
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Symbol Not Found
|
||||
|
||||
If you get "symbol not found" errors, ensure:
|
||||
- Symbolic links in `Sources/CKiwi/include/` are valid
|
||||
- Headers are properly exposed in module.modulemap
|
||||
- Library is correctly linked
|
||||
|
||||
### Build Failures
|
||||
|
||||
Common issues:
|
||||
- Missing Git LFS files (model files)
|
||||
- Incorrect symbolic link paths
|
||||
- Platform-specific build settings
|
||||
|
||||
### Runtime Errors
|
||||
|
||||
Check:
|
||||
- Model files are accessible
|
||||
- Correct path to model directory
|
||||
- Sufficient memory available
|
||||
|
||||
## License
|
||||
|
||||
Swift bindings are licensed under the same license as Kiwi.
|
||||
39
bindings/swift/Package.swift
Normal file
39
bindings/swift/Package.swift
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
// swift-tools-version: 5.7
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
|
||||
// C target rooted at Sources/CKiwi; links the C++ runtime and zlib.
let cKiwiTarget: Target = .target(
    name: "CKiwi",
    dependencies: [],
    path: "Sources/CKiwi",
    linkerSettings: [
        .linkedLibrary("c++"),
        .linkedLibrary("z"),
    ]
)

// Swift wrapper layer built on top of the C target.
let kiwiTarget: Target = .target(
    name: "Kiwi",
    dependencies: ["CKiwi"],
    path: "Sources/Kiwi"
)

// Unit tests for the Swift wrapper.
let kiwiTestTarget: Target = .testTarget(
    name: "KiwiTests",
    dependencies: ["Kiwi"],
    path: "Tests/KiwiTests"
)

// Package manifest: a single library product named "Kiwi".
let package = Package(
    name: "Kiwi",
    platforms: [.iOS(.v12), .macOS(.v10_14)],
    products: [
        .library(name: "Kiwi", targets: ["Kiwi"]),
    ],
    dependencies: [],
    targets: [cKiwiTarget, kiwiTarget, kiwiTestTarget]
)
|
||||
383
bindings/swift/README.md
Normal file
383
bindings/swift/README.md
Normal file
|
|
@ -0,0 +1,383 @@
|
|||
# Kiwi Swift 바인딩
|
||||
|
||||
한국어 형태소 분석기 Kiwi의 Swift 바인딩입니다. iOS 및 macOS 앱에서 한국어 자연어 처리를 수행할 수 있습니다.
|
||||
|
||||
## 목차
|
||||
|
||||
- [요구 사항](#요구-사항)
|
||||
- [설치](#설치)
|
||||
- [모델 파일 설정](#모델-파일-설정)
|
||||
- [기본 사용법](#기본-사용법)
|
||||
- [고급 기능](#고급-기능)
|
||||
- [API 레퍼런스](#api-레퍼런스)
|
||||
- [에러 처리](#에러-처리)
|
||||
|
||||
## 요구 사항
|
||||
|
||||
- iOS 12.0+ / macOS 10.14+
|
||||
- Swift 5.7+
|
||||
- Xcode 14.0+
|
||||
|
||||
## 설치
|
||||
|
||||
### Swift Package Manager (권장)
|
||||
|
||||
#### 방법 1: Xcode에서 추가
|
||||
|
||||
1. Xcode에서 **File → Add Package Dependencies...** 선택
|
||||
2. 저장소 URL 입력: `https://github.com/bab2min/Kiwi.git`
|
||||
3. 버전 선택 후 **Add Package** 클릭
|
||||
|
||||
#### 방법 2: Package.swift에 직접 추가
|
||||
|
||||
```swift
|
||||
// Package.swift
|
||||
dependencies: [
|
||||
.package(url: "https://github.com/bab2min/Kiwi.git", from: "0.22.0")
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "YourApp",
|
||||
dependencies: ["Kiwi"]
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
## 모델 파일 설정
|
||||
|
||||
Kiwi를 사용하려면 모델 파일이 필요합니다. 모델 파일은 [Kiwi 릴리즈 페이지](https://github.com/bab2min/Kiwi/releases)에서 다운로드할 수 있습니다.
|
||||
|
||||
### iOS/macOS 앱에서 모델 번들링
|
||||
|
||||
1. 모델 폴더를 Xcode 프로젝트에 드래그하여 추가
|
||||
2. **Copy items if needed** 체크
|
||||
3. **Create folder references** 선택 (중요!)
|
||||
4. 타겟에 추가되었는지 확인
|
||||
|
||||
```
|
||||
YourApp/
|
||||
├── Resources/
|
||||
│ └── KiwiModels/ ← 모델 폴더
|
||||
│ ├── combiningRule.txt
|
||||
│ ├── default.dict
|
||||
│ ├── extract.mdl
|
||||
│ └── ...
|
||||
```
|
||||
|
||||
## 기본 사용법
|
||||
|
||||
### 형태소 분석
|
||||
|
||||
```swift
|
||||
import Kiwi
|
||||
|
||||
do {
|
||||
// 1. KiwiBuilder 생성 (번들에서 모델 로드)
|
||||
let builder = try KiwiBuilder(
|
||||
bundle: .main,
|
||||
modelDirectory: "KiwiModels"
|
||||
)
|
||||
|
||||
// 2. Kiwi 인스턴스 빌드
|
||||
let kiwi = try builder.build()
|
||||
|
||||
// 3. 형태소 분석
|
||||
let tokens = try kiwi.tokenize("안녕하세요, 키위 형태소 분석기입니다!")
|
||||
|
||||
for token in tokens {
|
||||
print("\(token.form)/\(token.tag.description)")
|
||||
}
|
||||
// 출력:
|
||||
// 안녕/NNG
|
||||
// 하/XSA
|
||||
// 시/EP
|
||||
// 어요/EF
|
||||
// ,/SP
|
||||
// 키위/NNG
|
||||
// 형태소/NNG
|
||||
// 분석기/NNG
|
||||
// 이/VCP
|
||||
// ㅂ니다/EF
|
||||
// !/SF
|
||||
|
||||
} catch {
|
||||
print("오류: \(error)")
|
||||
}
|
||||
```
|
||||
|
||||
### 경로로 모델 로드
|
||||
|
||||
```swift
|
||||
// 직접 경로 지정
|
||||
let builder = try KiwiBuilder(
|
||||
modelPath: "/path/to/models",
|
||||
numThreads: 4 // 스레드 수 지정 (-1: 자동)
|
||||
)
|
||||
```
|
||||
|
||||
### 다중 분석 결과 얻기
|
||||
|
||||
```swift
|
||||
// topN 개의 분석 후보 반환
|
||||
let results = try kiwi.analyze("감기는 감기다", topN: 3)
|
||||
|
||||
for (index, result) in results.enumerated() {
|
||||
print("후보 \(index + 1) (점수: \(result.score)):")
|
||||
for token in result.tokens {
|
||||
print(" \(token.form)/\(token.tag.description)")
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 문장 분리
|
||||
|
||||
```swift
|
||||
let text = "안녕하세요. 키위입니다. 형태소 분석을 합니다."
|
||||
let sentences = try kiwi.splitIntoSentences(text)
|
||||
|
||||
for sentence in sentences {
|
||||
print("문장: \(sentence.text)")
|
||||
print(" 시작: \(sentence.start), 길이: \(sentence.length)")
|
||||
}
|
||||
// 출력:
|
||||
// 문장: 안녕하세요.
|
||||
// 시작: 0, 길이: 18
|
||||
// 문장: 키위입니다.
|
||||
// 시작: 19, 길이: 16
|
||||
// 문장: 형태소 분석을 합니다.
|
||||
// 시작: 36, 길이: 28
|
||||
```
|
||||
|
||||
## 고급 기능
|
||||
|
||||
### 사용자 사전 추가
|
||||
|
||||
```swift
|
||||
let builder = try KiwiBuilder(bundle: .main, modelDirectory: "KiwiModels")
|
||||
|
||||
// 단어 직접 추가
|
||||
try builder.addWord("키위피", tag: .nnp, score: 0.0) // 고유명사로 추가
|
||||
try builder.addWord("딥러닝", tag: .nng, score: 0.0) // 일반명사로 추가
|
||||
|
||||
// 사전 파일 로드 (탭으로 구분된 형식: 단어\t품사\t점수)
|
||||
try builder.loadDict("/path/to/user_dict.txt")
|
||||
|
||||
let kiwi = try builder.build()
|
||||
```
|
||||
|
||||
### 분석 옵션 설정
|
||||
|
||||
```swift
|
||||
// 기본 옵션으로 분석
|
||||
let tokens1 = try kiwi.tokenize("www.example.com", options: .all)
|
||||
|
||||
// URL, 이메일 등 패턴 매칭 + 정규화
|
||||
let tokens2 = try kiwi.tokenize("www.example.com", options: .allWithNormalizing)
|
||||
|
||||
// 개별 옵션 조합
|
||||
let customOptions: MatchOptions = [.url, .email, .normalizeCoda]
|
||||
let tokens3 = try kiwi.tokenize("test@test.com", options: customOptions)
|
||||
```
|
||||
|
||||
**MatchOptions 목록:**
|
||||
|
||||
| 옵션 | 설명 |
|
||||
|------|------|
|
||||
| `.url` | URL 패턴 인식 |
|
||||
| `.email` | 이메일 패턴 인식 |
|
||||
| `.hashtag` | 해시태그 인식 |
|
||||
| `.mention` | 멘션(@) 인식 |
|
||||
| `.serial` | 일련번호 인식 |
|
||||
| `.normalizeCoda` | 덧붙은 받침 정규화 (먹었엌ㅋㅋ → 먹었어 ㅋㅋ) |
|
||||
| `.joinNounPrefix` | 체언 접두사 결합 |
|
||||
| `.joinNounSuffix` | 체언 접미사 결합 |
|
||||
| `.joinVerbSuffix` | 동사 접미사 결합 |
|
||||
| `.joinAdjSuffix` | 형용사 접미사 결합 |
|
||||
| `.splitComplex` | 복합 형태소 분리 |
|
||||
| `.all` | 기본 전체 옵션 |
|
||||
| `.allWithNormalizing` | 전체 + 정규화 |
|
||||
|
||||
### 방언 지원
|
||||
|
||||
```swift
|
||||
let builder = try KiwiBuilder(
|
||||
bundle: .main,
|
||||
modelDirectory: "KiwiModels",
|
||||
enabledDialects: [.standard, .gyeongsang, .jeolla]
|
||||
)
|
||||
|
||||
let kiwi = try builder.build()
|
||||
```
|
||||
|
||||
**Dialect 목록:**
|
||||
|
||||
| 옵션 | 설명 |
|
||||
|------|------|
|
||||
| `.standard` | 표준어 (기본) |
|
||||
| `.gyeonggi` | 경기 방언 |
|
||||
| `.chungcheong` | 충청 방언 |
|
||||
| `.gangwon` | 강원 방언 |
|
||||
| `.gyeongsang` | 경상 방언 |
|
||||
| `.jeolla` | 전라 방언 |
|
||||
| `.jeju` | 제주 방언 |
|
||||
| `.hwanghae` | 황해 방언 |
|
||||
| `.hamgyeong` | 함경 방언 |
|
||||
| `.pyeongan` | 평안 방언 |
|
||||
| `.archaic` | 고어 |
|
||||
|
||||
### 오타 교정
|
||||
|
||||
```swift
|
||||
// 기본 오타 교정기 사용
|
||||
let typoTransformer = try TypoTransformer.basic()
|
||||
|
||||
let kiwi = try builder.build(
|
||||
typoTransformer: typoTransformer,
|
||||
typoCostThreshold: 2.5
|
||||
)
|
||||
|
||||
let tokens = try kiwi.tokenize("장례희망이 뭐야?") // 오타 자동 교정
|
||||
```
|
||||
|
||||
**TypoTransformer 유형:**
|
||||
|
||||
```swift
|
||||
// 빈 트랜스포머
|
||||
let empty = try TypoTransformer()
|
||||
|
||||
// 기본 오타 세트
|
||||
let basic = try TypoTransformer.basic()
|
||||
|
||||
// 다양한 오타 세트
|
||||
let continual = try TypoTransformer.default(.continualTypoSet)
|
||||
let withLengthening = try TypoTransformer.default(.basicTypoSetWithContinualAndLengthening)
|
||||
```
|
||||
|
||||
### 형태소 결합 (Joiner)
|
||||
|
||||
형태소를 결합하여 자연스러운 문장을 생성합니다.
|
||||
|
||||
```swift
|
||||
let joiner = try kiwi.createJoiner()
|
||||
|
||||
try joiner.add(form: "먹", tag: .vv) // 동사 어간
|
||||
try joiner.add(form: "었", tag: .ep) // 선어말 어미
|
||||
try joiner.add(form: "다", tag: .ef) // 종결 어미
|
||||
|
||||
let text = try joiner.join()
|
||||
print(text) // "먹었다"
|
||||
```
|
||||
|
||||
```swift
|
||||
// 불규칙 활용 자동 처리
|
||||
let joiner = try kiwi.createJoiner()
|
||||
|
||||
try joiner.add(form: "듣", tag: .vvi) // ㄷ불규칙 동사
|
||||
try joiner.add(form: "어", tag: .ec)
|
||||
|
||||
let text = try joiner.join()
|
||||
print(text) // "들어" (ㄷ → ㄹ 불규칙 적용)
|
||||
```
|
||||
|
||||
### 형태소 블랙리스트
|
||||
|
||||
특정 형태소를 분석에서 제외합니다.
|
||||
|
||||
```swift
|
||||
let morphset = try kiwi.createMorphemeSet()
|
||||
try morphset.add(form: "가", tag: .jks) // 주격조사 '가' 제외
|
||||
|
||||
// analyze 시 blocklist로 사용 (향후 지원 예정)
|
||||
```
|
||||
|
||||
### Token 정보 활용
|
||||
|
||||
```swift
|
||||
let tokens = try kiwi.tokenize("서울에서 부산까지")
|
||||
|
||||
for token in tokens {
|
||||
print("""
|
||||
형태: \(token.form)
|
||||
품사: \(token.tag.description)
|
||||
위치: \(token.position) (길이: \(token.length))
|
||||
어절 번호: \(token.wordPosition)
|
||||
문장 번호: \(token.sentencePosition)
|
||||
점수: \(token.score)
|
||||
오타 비용: \(token.typoCost)
|
||||
""")
|
||||
}
|
||||
```
|
||||
|
||||
### JSON 직렬화
|
||||
|
||||
`Token`, `TokenResult`, `Sentence`는 모두 `Codable`을 준수합니다.
|
||||
|
||||
```swift
|
||||
let tokens = try kiwi.tokenize("안녕하세요")
|
||||
|
||||
let encoder = JSONEncoder()
|
||||
encoder.outputFormatting = .prettyPrinted
|
||||
|
||||
let jsonData = try encoder.encode(tokens)
|
||||
let jsonString = String(data: jsonData, encoding: .utf8)!
|
||||
print(jsonString)
|
||||
```
|
||||
|
||||
## API 레퍼런스
|
||||
|
||||
### KiwiBuilder
|
||||
|
||||
| 메서드 | 설명 |
|
||||
|--------|------|
|
||||
| `init(modelPath:numThreads:options:enabledDialects:)` | 경로로 초기화 |
|
||||
| `init(bundle:modelDirectory:numThreads:options:enabledDialects:)` | 번들로 초기화 |
|
||||
| `addWord(_:tag:score:)` | 사용자 단어 추가 |
|
||||
| `loadDict(_:)` | 사전 파일 로드 |
|
||||
| `build(typoTransformer:typoCostThreshold:)` | Kiwi 인스턴스 생성 |
|
||||
|
||||
### Kiwi
|
||||
|
||||
| 메서드/프로퍼티 | 설명 |
|
||||
|----------------|------|
|
||||
| `version` (static) | Kiwi 버전 문자열 |
|
||||
| `analyze(_:topN:options:typoTransformer:typoThreshold:)` | 형태소 분석 (다중 결과) |
|
||||
| `tokenize(_:options:typoTransformer:typoThreshold:)` | 형태소 분석 (최상위 결과만) |
|
||||
| `splitIntoSentences(_:options:)` | 문장 분리 |
|
||||
| `createJoiner(useLMSearch:)` | Joiner 생성 |
|
||||
| `createMorphemeSet()` | MorphemeSet 생성 |
|
||||
|
||||
### Token
|
||||
|
||||
| 프로퍼티 | 타입 | 설명 |
|
||||
|----------|------|------|
|
||||
| `form` | `String` | 형태소 문자열 |
|
||||
| `tag` | `POSTag` | 품사 태그 |
|
||||
| `position` | `Int` | 원문에서의 위치 (UTF-16) |
|
||||
| `length` | `Int` | 길이 (UTF-16) |
|
||||
| `score` | `Float` | 언어 모델 점수 |
|
||||
| `wordPosition` | `Int` | 어절 번호 |
|
||||
| `sentencePosition` | `Int` | 문장 번호 |
|
||||
| `typoCost` | `Float` | 오타 교정 비용 (0이면 교정 안 됨) |
|
||||
|
||||
### Joiner
|
||||
|
||||
| 메서드 | 설명 |
|
||||
|--------|------|
|
||||
| `add(form:tag:autoDetectIrregular:)` | 형태소 추가 |
|
||||
| `join()` | 결합된 텍스트 반환 |
|
||||
|
||||
## 에러 처리
|
||||
|
||||
```swift
|
||||
do {
|
||||
let builder = try KiwiBuilder(modelPath: "/invalid/path")
|
||||
} catch KiwiError.modelNotFound(let path) {
|
||||
print("모델을 찾을 수 없습니다: \(path)")
|
||||
} catch KiwiError.operationFailed(let message) {
|
||||
print("작업 실패: \(message)")
|
||||
} catch KiwiError.invalidHandle {
|
||||
print("잘못된 핸들")
|
||||
} catch {
|
||||
print("알 수 없는 오류: \(error)")
|
||||
}
|
||||
```
|
||||
1
bindings/swift/Sources/CKiwi/include/Macro.h
Symbolic link
1
bindings/swift/Sources/CKiwi/include/Macro.h
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
../../../../../include/kiwi/Macro.h
|
||||
1
bindings/swift/Sources/CKiwi/include/capi.h
Symbolic link
1
bindings/swift/Sources/CKiwi/include/capi.h
Symbolic link
|
|
@ -0,0 +1 @@
|
|||
../../../../../include/kiwi/capi.h
|
||||
5
bindings/swift/Sources/CKiwi/module.modulemap
Normal file
5
bindings/swift/Sources/CKiwi/module.modulemap
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
// Clang module definition for the Kiwi C API headers (module name: CKiwi).
module CKiwi {
    // Header paths are relative to the directory of this module map.
    header "include/capi.h"
    header "include/Macro.h"

    // Re-export every module imported by the headers above.
    export *
}
|
||||
46
bindings/swift/Sources/Kiwi/Dialect.swift
Normal file
46
bindings/swift/Sources/Kiwi/Dialect.swift
Normal file
|
|
@ -0,0 +1,46 @@
|
|||
import Foundation
|
||||
|
||||
/// Bit flags that select Korean dialects.
///
/// Each regional dialect occupies a single bit of `rawValue`, so several
/// dialects can be combined with set-literal syntax, for example
/// `[.gyeongsang, .jeolla]`.
public struct Dialect: OptionSet, Codable {
    /// Underlying bit mask.
    public let rawValue: Int32

    /// Creates a dialect set from a raw bit mask.
    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// Standard Korean. Its raw value is 0, i.e. no dialect bit is set.
    public static let standard = Dialect(rawValue: 0x000)

    /// Gyeonggi dialect (bit 0).
    public static let gyeonggi = Dialect(rawValue: 0x001)

    /// Chungcheong dialect (bit 1).
    public static let chungcheong = Dialect(rawValue: 0x002)

    /// Gangwon dialect (bit 2).
    public static let gangwon = Dialect(rawValue: 0x004)

    /// Gyeongsang dialect (bit 3).
    public static let gyeongsang = Dialect(rawValue: 0x008)

    /// Jeolla dialect (bit 4).
    public static let jeolla = Dialect(rawValue: 0x010)

    /// Jeju dialect (bit 5).
    public static let jeju = Dialect(rawValue: 0x020)

    /// Hwanghae dialect (bit 6).
    public static let hwanghae = Dialect(rawValue: 0x040)

    /// Hamgyeong dialect (bit 7).
    public static let hamgyeong = Dialect(rawValue: 0x080)

    /// Pyeongan dialect (bit 8).
    public static let pyeongan = Dialect(rawValue: 0x100)

    /// Archaic Korean (bit 9).
    public static let archaic = Dialect(rawValue: 0x200)

    /// All dialects: bits 0 through 9 set.
    public static let all = Dialect(rawValue: 0x3FF)
}
|
||||
39
bindings/swift/Sources/Kiwi/Errors.swift
Normal file
39
bindings/swift/Sources/Kiwi/Errors.swift
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
import Foundation
|
||||
|
||||
/// Error cases thrown by the Kiwi Swift wrappers.
public enum KiwiError: Error, LocalizedError {
    /// An invalid handle was passed to a function.
    case invalidHandle

    /// An index or parameter was invalid.
    case invalidIndex

    /// An operation failed; the payload is the error message.
    case operationFailed(String)

    /// General failure; the payload is used verbatim as the description.
    case failure(String)

    /// No model was found; the payload is the path that was tried.
    case modelNotFound(String)

    /// A string was not valid UTF-8.
    case invalidString

    /// Human-readable description for each case.
    public var errorDescription: String? {
        let description: String
        switch self {
        case .invalidHandle: description = "Invalid handle"
        case .invalidIndex: description = "Invalid index"
        case let .operationFailed(detail): description = "Operation failed: \(detail)"
        case let .failure(detail): description = detail
        case let .modelNotFound(location): description = "Model not found at path: \(location)"
        case .invalidString: description = "Invalid UTF-8 string"
        }
        return description
    }
}
|
||||
16
bindings/swift/Sources/Kiwi/Internal/HandleWrapper.swift
Normal file
16
bindings/swift/Sources/Kiwi/Internal/HandleWrapper.swift
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
import Foundation
|
||||
|
||||
/// Holds a C handle and passes it to a caller-supplied closure when this
/// object is deinitialized.
internal final class HandleWrapper<H> {
    /// The wrapped handle, stored unchanged.
    let handle: H

    /// Closure invoked with `handle` from `deinit`.
    private let release: (H) -> Void

    /// - Parameters:
    ///   - handle: Handle to keep for the lifetime of the wrapper.
    ///   - cleanup: Called exactly once with `handle` on deinitialization.
    init(_ handle: H, cleanup: @escaping (H) -> Void) {
        self.release = cleanup
        self.handle = handle
    }

    deinit { release(handle) }
}
|
||||
54
bindings/swift/Sources/Kiwi/Joiner.swift
Normal file
54
bindings/swift/Sources/Kiwi/Joiner.swift
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
import Foundation
|
||||
import CKiwi
|
||||
|
||||
/// Combines morphemes into text through the C joiner API.
public final class Joiner {
    private var wrapper: HandleWrapper<kiwi_joiner_h>?

    internal init(handle: kiwi_joiner_h) {
        // The joiner handle is closed when the wrapper is deinitialized.
        wrapper = HandleWrapper(handle, cleanup: { joiner in _ = kiwi_joiner_close(joiner) })
    }

    /// Builds the error for a failed C call.
    ///
    /// When `kiwi_error()` reports a message, it is copied and the stored error
    /// is cleared; otherwise `fallback` becomes the message.
    private static func consumeLastError(orElse fallback: String) -> KiwiError {
        guard let messagePtr = kiwi_error() else {
            return .operationFailed(fallback)
        }
        let message = String(cString: messagePtr)
        kiwi_clear_error()
        return .operationFailed(message)
    }

    /// Appends one morpheme to the joiner.
    /// - Parameters:
    ///   - form: Form of the morpheme
    ///   - tag: Part-of-speech tag
    ///   - autoDetectIrregular: Passed to the C API as 1 (true) or 0 (false); default: true
    /// - Throws: `KiwiError.invalidHandle` when no handle is held,
    ///   `KiwiError.operationFailed` when the C call returns non-zero
    public func add(form: String, tag: POSTag, autoDetectIrregular: Bool = true) throws {
        guard let joinerHandle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        let status = kiwi_joiner_add(joinerHandle, form, tag.description, autoDetectIrregular ? 1 : 0)
        guard status == 0 else {
            throw Joiner.consumeLastError(orElse: "Failed to add morpheme to joiner")
        }
    }

    /// Returns the text produced from all morphemes added so far.
    /// - Returns: Combined text
    /// - Throws: `KiwiError.invalidHandle` when no handle is held,
    ///   `KiwiError.operationFailed` when the C call returns a null pointer
    public func join() throws -> String {
        guard let joinerHandle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        if let joinedPtr = kiwi_joiner_get(joinerHandle) {
            return String(cString: joinedPtr)
        }
        throw Joiner.consumeLastError(orElse: "Failed to get joined text")
    }
}
|
||||
206
bindings/swift/Sources/Kiwi/Kiwi.swift
Normal file
206
bindings/swift/Sources/Kiwi/Kiwi.swift
Normal file
|
|
@ -0,0 +1,206 @@
|
|||
import Foundation
|
||||
import CKiwi
|
||||
|
||||
/// Main Kiwi morphological analyzer class
|
||||
public final class Kiwi {
|
||||
private var wrapper: HandleWrapper<kiwi_h>?
|
||||
|
||||
internal init(handle: kiwi_h) {
    // kiwi_close runs on the handle when the wrapper is deinitialized.
    wrapper = HandleWrapper(handle, cleanup: { kiwiHandle in _ = kiwi_close(kiwiHandle) })
}
|
||||
|
||||
/// Get Kiwi version string
|
||||
public static var version: String {
    // Falls back to "unknown" when the C API returns a null pointer.
    guard let rawVersion = kiwi_version() else {
        return "unknown"
    }
    return String(cString: rawVersion)
}
|
||||
|
||||
/// Analyze text and return morphological analysis results
|
||||
/// - Parameters:
|
||||
/// - text: Text to analyze
|
||||
/// - topN: Number of top results to return (default: 1)
|
||||
/// - options: Match options (default: .allWithNormalizing)
|
||||
/// - typoTransformer: Optional prepared typo transformer for typo correction
|
||||
/// - typoThreshold: Typo cost threshold (default: 2.5)
|
||||
/// - Returns: Array of token result candidates
|
||||
/// - Throws: KiwiError if analysis fails
|
||||
public func analyze(
    _ text: String,
    topN: Int = 1,
    options: MatchOptions = .allWithNormalizing,
    typoTransformer: PreparedTypoTransformer? = nil,
    typoThreshold: Float = 2.5
) throws -> [TokenResult] {
    guard let kiwiHandle = wrapper?.handle else {
        throw KiwiError.invalidHandle
    }

    // Only the match options and typo settings come from the caller;
    // the remaining option fields are fixed values.
    var request = kiwi_analyze_option_t()
    request.match_options = options.rawValue
    request.typo_transformer = typoTransformer?.handle
    request.typo_threshold = typoThreshold
    request.blocklist = nil
    request.open_ending = 0
    request.allowed_dialects = 0
    request.dialect_cost = 3.0

    guard let analysis = kiwi_analyze(kiwiHandle, text, Int32(topN), request, nil) else {
        // Prefer the message reported by the C API; clear it once copied.
        var reason = "Analysis failed"
        if let messagePtr = kiwi_error() {
            reason = String(cString: messagePtr)
            kiwi_clear_error()
        }
        throw KiwiError.operationFailed(reason)
    }

    // The result set is closed on every exit path below.
    defer { kiwi_res_close(analysis) }

    let candidateCount = kiwi_res_size(analysis)
    if candidateCount < 0 {
        throw KiwiError.operationFailed("Invalid result size")
    }

    return (0..<candidateCount).compactMap { candidate -> TokenResult? in
        let score = kiwi_res_prob(analysis, candidate)
        let tokenCount = kiwi_res_word_num(analysis, candidate)
        // Candidates that report a negative token count are skipped.
        guard tokenCount >= 0 else {
            return nil
        }

        // Tokens whose form or info pointer is null are left out.
        let tokens = (0..<tokenCount).compactMap { position -> Token? in
            guard let formPtr = kiwi_res_form(analysis, candidate, position),
                  let infoPtr = kiwi_res_token_info(analysis, candidate, position) else {
                return nil
            }
            return Token(form: String(cString: formPtr), tokenInfo: infoPtr.pointee)
        }

        return TokenResult(score: score, tokens: tokens)
    }
}
|
||||
|
||||
/// Tokenize text and return simple token array (uses best analysis result)
|
||||
/// - Parameters:
|
||||
/// - text: Text to tokenize
|
||||
/// - options: Match options (default: .allWithNormalizing)
|
||||
/// - typoTransformer: Optional prepared typo transformer for typo correction
|
||||
/// - typoThreshold: Typo cost threshold (default: 2.5)
|
||||
/// - Returns: Array of tokens
|
||||
/// - Throws: KiwiError if tokenization fails
|
||||
public func tokenize(
|
||||
_ text: String,
|
||||
options: MatchOptions = .allWithNormalizing,
|
||||
typoTransformer: PreparedTypoTransformer? = nil,
|
||||
typoThreshold: Float = 2.5
|
||||
) throws -> [Token] {
|
||||
let results = try analyze(text, topN: 1, options: options, typoTransformer: typoTransformer, typoThreshold: typoThreshold)
|
||||
return results.first?.tokens ?? []
|
||||
}
|
||||
|
||||
/// Split text into sentences
|
||||
/// - Parameters:
|
||||
/// - text: Text to split
|
||||
/// - options: Match options (default: .all)
|
||||
/// - Returns: Array of sentences
|
||||
/// - Throws: KiwiError if splitting fails
|
||||
public func splitIntoSentences(
|
||||
_ text: String,
|
||||
options: MatchOptions = .all
|
||||
) throws -> [Sentence] {
|
||||
guard let handle = wrapper?.handle else {
|
||||
throw KiwiError.invalidHandle
|
||||
}
|
||||
|
||||
guard let result = kiwi_split_into_sents(handle, text, options.rawValue, nil) else {
|
||||
if let errorMsg = kiwi_error() {
|
||||
let error = String(cString: errorMsg)
|
||||
kiwi_clear_error()
|
||||
throw KiwiError.operationFailed(error)
|
||||
}
|
||||
throw KiwiError.operationFailed("Sentence splitting failed")
|
||||
}
|
||||
|
||||
defer { kiwi_ss_close(result) }
|
||||
|
||||
let sentenceCount = kiwi_ss_size(result)
|
||||
guard sentenceCount >= 0 else {
|
||||
throw KiwiError.operationFailed("Invalid sentence count")
|
||||
}
|
||||
|
||||
var sentences: [Sentence] = []
|
||||
sentences.reserveCapacity(Int(sentenceCount))
|
||||
|
||||
let textUtf8Count = text.utf8.count
|
||||
for i in 0..<sentenceCount {
|
||||
let start = kiwi_ss_begin_position(result, i)
|
||||
let end = kiwi_ss_end_position(result, i)
|
||||
|
||||
// Validate bounds before indexing
|
||||
if start >= 0 && end >= start && Int(end) <= textUtf8Count {
|
||||
let startIdx = text.utf8.index(text.utf8.startIndex, offsetBy: Int(start))
|
||||
let endIdx = text.utf8.index(text.utf8.startIndex, offsetBy: Int(end))
|
||||
if let sentenceText = String(text.utf8[startIdx..<endIdx]) {
|
||||
sentences.append(Sentence(
|
||||
text: sentenceText,
|
||||
start: Int(start),
|
||||
length: Int(end - start)
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return sentences
|
||||
}
|
||||
|
||||
/// Create a new Joiner for combining morphemes into text
|
||||
/// - Parameter useLMSearch: Use language model search for optimal POS selection (default: true)
|
||||
/// - Returns: A new Joiner instance
|
||||
/// - Throws: KiwiError if creation fails
|
||||
public func createJoiner(useLMSearch: Bool = true) throws -> Joiner {
|
||||
guard let handle = wrapper?.handle else {
|
||||
throw KiwiError.invalidHandle
|
||||
}
|
||||
|
||||
guard let joinerHandle = kiwi_new_joiner(handle, useLMSearch ? 1 : 0) else {
|
||||
if let errorMsg = kiwi_error() {
|
||||
let error = String(cString: errorMsg)
|
||||
kiwi_clear_error()
|
||||
throw KiwiError.operationFailed(error)
|
||||
}
|
||||
throw KiwiError.operationFailed("Failed to create joiner")
|
||||
}
|
||||
|
||||
return Joiner(handle: joinerHandle)
|
||||
}
|
||||
|
||||
/// Create a new MorphemeSet (for use as blacklist in analysis)
|
||||
/// - Returns: A new MorphemeSet instance
|
||||
/// - Throws: KiwiError if creation fails
|
||||
public func createMorphemeSet() throws -> MorphemeSet {
|
||||
guard let handle = wrapper?.handle else {
|
||||
throw KiwiError.invalidHandle
|
||||
}
|
||||
|
||||
guard let morphsetHandle = kiwi_new_morphset(handle) else {
|
||||
if let errorMsg = kiwi_error() {
|
||||
let error = String(cString: errorMsg)
|
||||
kiwi_clear_error()
|
||||
throw KiwiError.operationFailed(error)
|
||||
}
|
||||
throw KiwiError.operationFailed("Failed to create morpheme set")
|
||||
}
|
||||
|
||||
return MorphemeSet(handle: morphsetHandle)
|
||||
}
|
||||
}
|
||||
169
bindings/swift/Sources/Kiwi/KiwiBuilder.swift
Normal file
169
bindings/swift/Sources/Kiwi/KiwiBuilder.swift
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
import Foundation
|
||||
import CKiwi
|
||||
|
||||
/// Bit flags passed to the native builder when a `KiwiBuilder` is created.
public struct BuildOptions: OptionSet {
    public let rawValue: Int32

    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// Integrate allomorphs (bit 0).
    public static let integrateAllomorph = BuildOptions(rawValue: 1 << 0)

    /// Load default dictionary (bit 1).
    public static let loadDefaultDict = BuildOptions(rawValue: 1 << 1)

    /// Load typo dictionary (bit 2).
    public static let loadTypoDict = BuildOptions(rawValue: 1 << 2)

    /// Load multi-dict (bit 3).
    public static let loadMultiDict = BuildOptions(rawValue: 1 << 3)

    /// Default build options: every flag above combined.
    public static let `default`: BuildOptions = BuildOptions.integrateAllomorph
        .union(.loadDefaultDict)
        .union(.loadTypoDict)
        .union(.loadMultiDict)
}
|
||||
|
||||
/// Builder class for creating Kiwi instances.
///
/// Wraps a native `kiwi_builder_h` handle. The handle is stored in a
/// `HandleWrapper` that is given `kiwi_builder_close` as its cleanup closure.
public final class KiwiBuilder {
    private var wrapper: HandleWrapper<kiwi_builder_h>?

    /// Fetches and clears the pending native error, if any.
    ///
    /// The message is copied into a Swift `String` before `kiwi_clear_error()`
    /// runs, so the returned error never refers to native memory.
    private static func takeNativeError() -> KiwiError? {
        guard let errorMsg = kiwi_error() else {
            return nil
        }
        let message = String(cString: errorMsg)
        kiwi_clear_error()
        return .operationFailed(message)
    }

    /// Initialize KiwiBuilder with model path
    /// - Parameters:
    ///   - modelPath: Path to the model directory
    ///   - numThreads: Number of threads to use (-1 for automatic). Values
    ///     outside the `Int32` range are clamped.
    ///   - options: Build options
    ///   - enabledDialects: Enabled dialects
    /// - Throws: KiwiError if initialization fails
    public init(
        modelPath: String,
        numThreads: Int = -1,
        options: BuildOptions = .default,
        enabledDialects: Dialect = .standard
    ) throws {
        let handle = kiwi_builder_init(
            modelPath,
            // `clamping` avoids the runtime trap `Int32(numThreads)` has on overflow.
            Int32(clamping: numThreads),
            options.rawValue,
            Int32(enabledDialects.rawValue)
        )

        guard let builderHandle = handle else {
            throw KiwiBuilder.takeNativeError()
                ?? KiwiError.operationFailed("Failed to initialize KiwiBuilder")
        }

        self.wrapper = HandleWrapper(builderHandle) { kiwi_builder_close($0) }
    }

    /// Initialize KiwiBuilder from a Bundle
    /// - Parameters:
    ///   - bundle: Bundle containing the model files
    ///   - modelDirectory: Name of the model directory in the bundle (default: "KiwiModels")
    ///   - numThreads: Number of threads to use (-1 for automatic)
    ///   - options: Build options
    ///   - enabledDialects: Enabled dialects
    /// - Throws: `KiwiError.modelNotFound` if the bundle has no resource path
    ///   or the model directory does not exist inside it; any other KiwiError
    ///   if initialization fails
    public convenience init(
        bundle: Bundle,
        modelDirectory: String = "KiwiModels",
        numThreads: Int = -1,
        options: BuildOptions = .default,
        enabledDialects: Dialect = .standard
    ) throws {
        guard let resourcePath = bundle.resourcePath else {
            throw KiwiError.modelNotFound("Model directory not found in bundle")
        }

        let modelPath = resourcePath.appending("/\(modelDirectory)")

        // Check the directory up front so a missing model surfaces as
        // `modelNotFound` instead of an opaque native initialization failure.
        var isDirectory: ObjCBool = false
        guard FileManager.default.fileExists(atPath: modelPath, isDirectory: &isDirectory),
              isDirectory.boolValue else {
            throw KiwiError.modelNotFound("Model directory not found in bundle: \(modelPath)")
        }

        try self.init(
            modelPath: modelPath,
            numThreads: numThreads,
            options: options,
            enabledDialects: enabledDialects
        )
    }

    /// Add a user word to the dictionary
    /// - Parameters:
    ///   - word: Word to add
    ///   - tag: Part-of-speech tag
    ///   - score: Score for the word (default: 0)
    /// - Returns: true if the native call reports success (0); false if it
    ///   reports failure without setting an error message
    /// - Throws: KiwiError if the native call fails with an error message
    @discardableResult
    public func addWord(_ word: String, tag: POSTag, score: Float = 0) throws -> Bool {
        guard let handle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        let result = kiwi_builder_add_word(handle, word, tag.description, score)
        if result == 0 {
            return true
        }

        // A non-zero status with no pending error is reported as `false`
        // rather than thrown.
        if let error = KiwiBuilder.takeNativeError() {
            throw error
        }
        return false
    }

    /// Load user dictionary from file
    /// - Parameter dictPath: Path to the dictionary file
    /// - Returns: Number of words added
    /// - Throws: KiwiError if operation fails
    @discardableResult
    public func loadDict(_ dictPath: String) throws -> Int {
        guard let handle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        let result = kiwi_builder_load_dict(handle, dictPath)
        guard result >= 0 else {
            throw KiwiBuilder.takeNativeError()
                ?? KiwiError.operationFailed("Failed to load dictionary")
        }

        return Int(result)
    }

    /// Build a Kiwi instance
    ///
    /// The native build is invoked with `nil` and `0` for its two trailing
    /// arguments (presumably "no typo transformer" and its threshold — confirm
    /// against the C API).
    /// - Returns: A new Kiwi instance
    /// - Throws: KiwiError if build fails
    public func build() throws -> Kiwi {
        guard let handle = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        guard let kiwiHandle = kiwi_builder_build(handle, nil, 0) else {
            throw KiwiBuilder.takeNativeError()
                ?? KiwiError.operationFailed("Failed to build Kiwi")
        }

        return Kiwi(handle: kiwiHandle)
    }
}
|
||||
91
bindings/swift/Sources/Kiwi/MatchOptions.swift
Normal file
91
bindings/swift/Sources/Kiwi/MatchOptions.swift
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
import Foundation
|
||||
|
||||
/// Options for matching patterns in text analysis.
///
/// Most members are independent single-bit flags. The `oov*` members are
/// different: they are values of a two-bit field at bits 8-9, so
/// `oovRuleOnly` has raw value 0 (and is therefore "contained" in every
/// option set), and `oovChrFreqBranchModel` shares its raw value with
/// `oovMask`. Compare `intersection(.oovMask)` against an `oov*` member rather
/// than using `contains` for those.
public struct MatchOptions: OptionSet, Codable {
    public let rawValue: Int32

    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// Builds the option whose only set bit is `position`.
    private static func bit(_ position: Int32) -> MatchOptions {
        return MatchOptions(rawValue: 1 << position)
    }

    /// Builds a value of the two-bit OOV field located at bits 8-9.
    private static func oov(_ value: Int32) -> MatchOptions {
        return MatchOptions(rawValue: value << 8)
    }

    // MARK: Pattern matching

    /// Match URL patterns
    public static let url: MatchOptions = bit(0)

    /// Match email addresses
    public static let email: MatchOptions = bit(1)

    /// Match hashtags
    public static let hashtag: MatchOptions = bit(2)

    /// Match mentions (@username)
    public static let mention: MatchOptions = bit(3)

    /// Match serial numbers
    public static let serial: MatchOptions = bit(4)

    /// Match emoji
    public static let emoji: MatchOptions = bit(5)

    // MARK: Out-of-vocabulary handling

    /// OOV: use rule-based matching only (field value 0)
    public static let oovRuleOnly: MatchOptions = oov(0)

    /// OOV: use character model (field value 1)
    public static let oovChrModel: MatchOptions = oov(1)

    /// OOV: use character frequency model (field value 2)
    public static let oovChrFreqModel: MatchOptions = oov(2)

    /// OOV: use character frequency and branch model (field value 3)
    public static let oovChrFreqBranchModel: MatchOptions = oov(3)

    /// OOV option mask (both bits of the field)
    public static let oovMask: MatchOptions = oov(3)

    // MARK: Normalization and joining

    /// Normalize coda
    public static let normalizeCoda: MatchOptions = bit(16)

    /// Join noun prefix
    public static let joinNounPrefix: MatchOptions = bit(17)

    /// Join noun suffix
    public static let joinNounSuffix: MatchOptions = bit(18)

    /// Join verb suffix
    public static let joinVerbSuffix: MatchOptions = bit(19)

    /// Join adjective suffix
    public static let joinAdjSuffix: MatchOptions = bit(20)

    /// Join adverb suffix
    public static let joinAdvSuffix: MatchOptions = bit(21)

    /// Join verb and adjective suffixes
    public static let joinVSuffix: MatchOptions = joinVerbSuffix.union(joinAdjSuffix)

    /// Join all affixes
    public static let joinAffix: MatchOptions = joinNounPrefix
        .union(joinNounSuffix)
        .union(joinVerbSuffix)
        .union(joinAdjSuffix)
        .union(joinAdvSuffix)

    // MARK: Miscellaneous

    /// Split complex morphemes
    public static let splitComplex: MatchOptions = bit(22)

    /// Match Z coda
    public static let zCoda: MatchOptions = bit(23)

    /// Match compatible jamo
    public static let compatibleJamo: MatchOptions = bit(24)

    /// Split saisiot
    public static let splitSaisiot: MatchOptions = bit(25)

    /// Merge saisiot
    public static let mergeSaisiot: MatchOptions = bit(26)

    /// Join particle yo
    public static let joinParticleYo: MatchOptions = bit(27)

    // MARK: Presets

    /// All basic matching options: the six pattern flags plus `zCoda`
    public static let all: MatchOptions = url
        .union(email)
        .union(hashtag)
        .union(mention)
        .union(serial)
        .union(emoji)
        .union(zCoda)

    /// All matching options with normalization (`all` plus `normalizeCoda`)
    public static let allWithNormalizing: MatchOptions = all.union(normalizeCoda)
}
|
||||
42
bindings/swift/Sources/Kiwi/MorphemeSet.swift
Normal file
42
bindings/swift/Sources/Kiwi/MorphemeSet.swift
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import Foundation
|
||||
import CKiwi
|
||||
|
||||
/// Set of morphemes (used as blacklist in analysis).
///
/// Wraps a native `kiwi_morphset_h` handle. The handle is stored in a
/// `HandleWrapper` that is given `kiwi_morphset_close` as its cleanup closure.
public final class MorphemeSet {
    private var wrapper: HandleWrapper<kiwi_morphset_h>?

    internal init(handle: kiwi_morphset_h) {
        self.wrapper = HandleWrapper(handle) { kiwi_morphset_close($0) }
    }

    /// Underlying native handle, or `nil` when no wrapper is held.
    internal var handle: kiwi_morphset_h? {
        return wrapper?.handle
    }

    /// Add a morpheme to the set
    /// - Parameters:
    ///   - form: Form of the morpheme
    ///   - tag: Part-of-speech tag (nil to match all tags)
    /// - Returns: Number of morphemes added
    /// - Throws: KiwiError if operation fails
    @discardableResult
    public func add(form: String, tag: POSTag? = nil) throws -> Int {
        guard let nativeSet = wrapper?.handle else {
            throw KiwiError.invalidHandle
        }

        // A nil tag reaches the C function as a null tag string.
        let added = kiwi_morphset_add(nativeSet, form, tag?.description)
        guard added < 0 else {
            return Int(added)
        }

        // Negative status: prefer the native message, clearing it after copying.
        guard let nativeMessage = kiwi_error() else {
            throw KiwiError.operationFailed("Failed to add morpheme to set")
        }
        let message = String(cString: nativeMessage)
        kiwi_clear_error()
        throw KiwiError.operationFailed(message)
    }
}
|
||||
268
bindings/swift/Sources/Kiwi/POSTag.swift
Normal file
268
bindings/swift/Sources/Kiwi/POSTag.swift
Normal file
|
|
@ -0,0 +1,268 @@
|
|||
import Foundation
|
||||
|
||||
/// Part-of-Speech tag enumeration.
///
/// Raw values 0-59 are the regular tags. An irregular-conjugation tag is its
/// base tag's raw value with bit `0x80` set.
public enum POSTag: UInt8, CaseIterable, Codable, CustomStringConvertible {
    case unknown = 0

    // Nouns
    case nng = 1
    case nnp = 2
    case nnb = 3

    // Verbs
    case vv = 4
    case va = 5

    // Adverbs
    case mag = 6

    // Numerals
    case nr = 7
    case np = 8

    // Auxiliary
    case vx = 9

    // Determiners
    case mm = 10
    case maj = 11

    // Interjections
    case ic = 12

    // Prefixes/Suffixes
    case xpn = 13
    case xsn = 14
    case xsv = 15
    case xsa = 16
    case xsm = 17
    case xr = 18

    // Copulas
    case vcp = 19
    case vcn = 20

    // Symbols
    case sf = 21
    case sp = 22
    case ss = 23
    case sso = 24
    case ssc = 25
    case se = 26
    case so = 27
    case sw = 28
    case sb = 29
    case sl = 30
    case sh = 31
    case sn = 32

    // Web entities
    case w_url = 33
    case w_email = 34
    case w_mention = 35
    case w_hashtag = 36
    case w_serial = 37
    case w_emoji = 38

    // Particles
    case jks = 39
    case jkc = 40
    case jkg = 41
    case jko = 42
    case jkb = 43
    case jkv = 44
    case jkq = 45
    case jx = 46
    case jc = 47

    // Endings
    case ep = 48
    case ef = 49
    case ec = 50
    case etn = 51
    case etm = 52

    // Special
    case z_coda = 53
    case z_siot = 54

    // User defined
    case user0 = 55
    case user1 = 56
    case user2 = 57
    case user3 = 58
    case user4 = 59

    // Irregular conjugation tags (base tag | 0x80)
    case vvi = 132
    case vai = 133
    case vxi = 137
    case xsai = 144

    /// Bit that distinguishes an irregular tag from its base tag.
    private static let irregularBit: UInt8 = 0x80

    /// Lookup table behind `init?(string:)`: every canonical `description`
    /// plus the accepted alternative spellings, all upper-case.
    private static let tagsByName: [String: POSTag] = {
        var table: [String: POSTag] = [:]
        for tag in POSTag.allCases {
            table[tag.description] = tag
        }
        let aliases: [(String, POSTag)] = [
            ("UNKNOWN", .unknown),
            ("VV-R", .vv),
            ("VA-R", .va),
            ("VX-R", .vx),
            ("XSV-R", .xsv),
            ("XSA-R", .xsa),
            ("VVI", .vvi),
            ("VAI", .vai),
            ("VXI", .vxi),
            ("XSAI", .xsai)
        ]
        for (name, tag) in aliases {
            table[name] = tag
        }
        return table
    }()

    /// String representation of the POS tag (upper-case; irregular tags carry
    /// an `-I` suffix).
    public var description: String {
        switch self {
        case .unknown: return "UNK"

        // Nouns
        case .nng: return "NNG"
        case .nnp: return "NNP"
        case .nnb: return "NNB"

        // Verbs, adverbs, numerals, auxiliary
        case .vv: return "VV"
        case .va: return "VA"
        case .mag: return "MAG"
        case .nr: return "NR"
        case .np: return "NP"
        case .vx: return "VX"

        // Determiners and interjections
        case .mm: return "MM"
        case .maj: return "MAJ"
        case .ic: return "IC"

        // Prefixes/Suffixes
        case .xpn: return "XPN"
        case .xsn: return "XSN"
        case .xsv: return "XSV"
        case .xsa: return "XSA"
        case .xsm: return "XSM"
        case .xr: return "XR"

        // Copulas
        case .vcp: return "VCP"
        case .vcn: return "VCN"

        // Symbols
        case .sf: return "SF"
        case .sp: return "SP"
        case .ss: return "SS"
        case .sso: return "SSO"
        case .ssc: return "SSC"
        case .se: return "SE"
        case .so: return "SO"
        case .sw: return "SW"
        case .sb: return "SB"
        case .sl: return "SL"
        case .sh: return "SH"
        case .sn: return "SN"

        // Web entities
        case .w_url: return "W_URL"
        case .w_email: return "W_EMAIL"
        case .w_mention: return "W_MENTION"
        case .w_hashtag: return "W_HASHTAG"
        case .w_serial: return "W_SERIAL"
        case .w_emoji: return "W_EMOJI"

        // Particles
        case .jks: return "JKS"
        case .jkc: return "JKC"
        case .jkg: return "JKG"
        case .jko: return "JKO"
        case .jkb: return "JKB"
        case .jkv: return "JKV"
        case .jkq: return "JKQ"
        case .jx: return "JX"
        case .jc: return "JC"

        // Endings
        case .ep: return "EP"
        case .ef: return "EF"
        case .ec: return "EC"
        case .etn: return "ETN"
        case .etm: return "ETM"

        // Special
        case .z_coda: return "Z_CODA"
        case .z_siot: return "Z_SIOT"

        // User defined
        case .user0: return "USER0"
        case .user1: return "USER1"
        case .user2: return "USER2"
        case .user3: return "USER3"
        case .user4: return "USER4"

        // Irregular conjugations
        case .vvi: return "VV-I"
        case .vai: return "VA-I"
        case .vxi: return "VX-I"
        case .xsai: return "XSA-I"
        }
    }

    /// Initialize from string tag name.
    ///
    /// Matching is case-insensitive. Besides each tag's `description`, the
    /// spellings `UNKNOWN`, the `-R` forms of VV/VA/VX/XSV/XSA (mapped to the
    /// regular tag) and the hyphen-less `VVI`/`VAI`/`VXI`/`XSAI` are accepted.
    /// Returns `nil` for any other string.
    public init?(string: String) {
        guard let tag = POSTag.tagsByName[string.uppercased()] else {
            return nil
        }
        self = tag
    }

    /// Whether this tag represents an irregular conjugation
    public var isIrregular: Bool {
        return (rawValue & POSTag.irregularBit) != 0
    }

    /// Returns the base tag without the irregular flag
    public var baseTag: POSTag {
        guard isIrregular,
              let base = POSTag(rawValue: rawValue & ~POSTag.irregularBit) else {
            return self
        }
        return base
    }

    /// Returns the irregular version of this tag (defined for VV, VA, VX and
    /// XSA only); `nil` for every other tag, including already-irregular ones.
    public var irregularTag: POSTag? {
        guard !isIrregular else {
            return nil
        }
        // Only VV, VA, VX and XSA have a case at `rawValue | 0x80`.
        return POSTag(rawValue: rawValue | POSTag.irregularBit)
    }
}
|
||||
128
bindings/swift/Sources/Kiwi/Token.swift
Normal file
128
bindings/swift/Sources/Kiwi/Token.swift
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
import Foundation
|
||||
import CKiwi
|
||||
|
||||
/// Represents a morphological token in the analyzed text
public struct Token: Codable {
    /// The surface form of the token
    public let form: String

    /// Part-of-speech tag
    public let tag: POSTag

    /// Character position in the original text (UTF-16 based)
    public let position: Int

    /// Length of the token (UTF-16 based)
    public let length: Int

    /// Language model score for this token
    public let score: Float

    /// Word position (space-delimited)
    public let wordPosition: Int

    /// Sentence position
    public let sentencePosition: Int

    /// Line number
    public let lineNumber: Int

    /// Sense ID
    public let senseId: Int

    /// Typo cost (0 if not corrected)
    public let typoCost: Float

    /// Paired token index for SSO/SSC tags (-1 if none)
    public let pairedToken: Int

    /// Sub-sentence position (0 if not in sub-sentence)
    public let subSentencePosition: Int

    /// Dialect information
    public let dialect: Dialect

    /// Builds a token from the native token record.
    /// - Parameters:
    ///   - form: Surface form, already converted to a Swift string
    ///   - tokenInfo: Native record the remaining fields are copied from
    internal init(form: String, tokenInfo: kiwi_token_info_t) {
        self.form = form
        // Raw tag values with no matching case fall back to `.unknown`.
        self.tag = POSTag(rawValue: tokenInfo.tag) ?? .unknown
        self.position = Int(tokenInfo.chr_position)
        self.length = Int(tokenInfo.length)
        self.score = tokenInfo.score
        self.wordPosition = Int(tokenInfo.word_position)
        self.sentencePosition = Int(tokenInfo.sent_position)
        self.lineNumber = Int(tokenInfo.line_number)
        self.senseId = Int(tokenInfo.sense_id)
        self.typoCost = tokenInfo.typo_cost
        // Reinterpret the field as a signed 32-bit value so the "no pair"
        // sentinel is -1 as documented. If the C field is unsigned
        // (NOTE(review): confirm against the C header), a plain `Int(...)`
        // would yield 4294967295 on 64-bit targets and trap on 32-bit ones.
        self.pairedToken = Int(Int32(truncatingIfNeeded: tokenInfo.paired_token))
        self.subSentencePosition = Int(tokenInfo.sub_sent_position)
        self.dialect = Dialect(rawValue: Int32(tokenInfo.dialect))
    }

    /// Creates a token from explicit field values.
    /// - Parameters:
    ///   - form: Surface form
    ///   - tag: Part-of-speech tag
    ///   - position: Position in the original text (default: 0)
    ///   - length: Token length (default: 0)
    ///   - score: Language model score (default: 0.0)
    ///   - wordPosition: Word position (default: 0)
    ///   - sentencePosition: Sentence position (default: 0)
    ///   - lineNumber: Line number (default: 0)
    ///   - senseId: Sense ID (default: 0)
    ///   - typoCost: Typo cost (default: 0.0)
    ///   - pairedToken: Paired token index (default: -1, i.e. none)
    ///   - subSentencePosition: Sub-sentence position (default: 0)
    ///   - dialect: Dialect information (default: .standard)
    public init(
        form: String,
        tag: POSTag,
        position: Int = 0,
        length: Int = 0,
        score: Float = 0.0,
        wordPosition: Int = 0,
        sentencePosition: Int = 0,
        lineNumber: Int = 0,
        senseId: Int = 0,
        typoCost: Float = 0.0,
        pairedToken: Int = -1,
        subSentencePosition: Int = 0,
        dialect: Dialect = .standard
    ) {
        self.form = form
        self.tag = tag
        self.position = position
        self.length = length
        self.score = score
        self.wordPosition = wordPosition
        self.sentencePosition = sentencePosition
        self.lineNumber = lineNumber
        self.senseId = senseId
        self.typoCost = typoCost
        self.pairedToken = pairedToken
        self.subSentencePosition = subSentencePosition
        self.dialect = dialect
    }
}
|
||||
|
||||
extension Token: CustomStringConvertible {
    /// Textual form `"<form>/<TAG>"`, e.g. `"테스트/NNG"`.
    public var description: String {
        let tagName = tag.description
        return form + "/" + tagName
    }
}
|
||||
|
||||
/// One analysis candidate: a token sequence together with its score.
public struct TokenResult: Codable {
    /// Probability score for this analysis result
    public let score: Float

    /// Array of tokens in this analysis
    public let tokens: [Token]

    /// Creates a candidate from its score and token sequence.
    public init(score: Float, tokens: [Token]) {
        self.tokens = tokens
        self.score = score
    }
}
|
||||
|
||||
/// Represents a sentence in the split result.
///
/// When produced by `Kiwi.splitIntoSentences`, `start` and `length` are
/// measured in UTF-8 code units of the original text.
public struct Sentence: Codable {
    /// The sentence text
    public let text: String

    /// Starting position in original text
    public let start: Int

    /// Length of the sentence
    public let length: Int

    /// Creates a sentence from its text and its span in the original text.
    public init(text: String, start: Int, length: Int) {
        self.start = start
        self.length = length
        self.text = text
    }
}
|
||||
123
bindings/swift/Sources/Kiwi/TypoTransformer.swift
Normal file
123
bindings/swift/Sources/Kiwi/TypoTransformer.swift
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
import Foundation
|
||||
import CKiwi
|
||||
|
||||
/// Typo transformer for automatic typo correction.
///
/// Wraps a native `kiwi_typo_h` handle. Handles obtained from `basic()` and
/// `default(_:)` are not closed by this object; handles created by `init()`
/// and `copy()` are closed in `deinit`.
public final class TypoTransformer {
    internal let handle: kiwi_typo_h
    private let shouldClose: Bool

    /// Wraps an existing native handle.
    /// - Parameters:
    ///   - handle: Native typo transformer handle
    ///   - shouldClose: Whether `deinit` closes the handle (default: true)
    internal init(handle: kiwi_typo_h, shouldClose: Bool = true) {
        self.handle = handle
        self.shouldClose = shouldClose
    }

    /// Builds the error to throw after a failed native call: the pending
    /// native message (copied, then cleared) when there is one, otherwise
    /// `fallback`.
    private static func failure(_ fallback: String) -> KiwiError {
        guard let nativeMessage = kiwi_error() else {
            return KiwiError.operationFailed(fallback)
        }
        let message = String(cString: nativeMessage)
        kiwi_clear_error()
        return KiwiError.operationFailed(message)
    }

    /// Create a new empty typo transformer
    /// - Throws: KiwiError if creation fails
    public init() throws {
        guard let created = kiwi_typo_init() else {
            throw TypoTransformer.failure("Failed to create typo transformer")
        }
        self.handle = created
        self.shouldClose = true
    }

    /// Get the default basic typo transformer
    /// - Returns: A typo transformer with basic typo set
    /// - Throws: KiwiError if creation fails
    public static func basic() throws -> TypoTransformer {
        guard let shared = kiwi_typo_get_basic() else {
            throw failure("Failed to get basic typo transformer")
        }
        return TypoTransformer(handle: shared, shouldClose: false)
    }

    /// Typo set types
    public enum TypoSet: Int32 {
        case withoutTypo = 0
        case basicTypoSet = 1
        case continualTypoSet = 2
        case basicTypoSetWithContinual = 3
        case lengtheningTypoSet = 4
        case basicTypoSetWithContinualAndLengthening = 5
    }

    /// Get default typo transformer with specified typo set
    /// - Parameter typoSet: The typo set to use
    /// - Returns: A typo transformer
    /// - Throws: KiwiError if creation fails
    public static func `default`(_ typoSet: TypoSet = .basicTypoSet) throws -> TypoTransformer {
        guard let shared = kiwi_typo_get_default(typoSet.rawValue) else {
            throw failure("Failed to get default typo transformer")
        }
        return TypoTransformer(handle: shared, shouldClose: false)
    }

    /// Copy this typo transformer
    /// - Returns: A new typo transformer with the same configuration
    /// - Throws: KiwiError if copy fails
    public func copy() throws -> TypoTransformer {
        guard let duplicate = kiwi_typo_copy(handle) else {
            throw TypoTransformer.failure("Failed to copy typo transformer")
        }
        return TypoTransformer(handle: duplicate, shouldClose: true)
    }

    /// Prepare the typo transformer for use in analysis
    /// - Returns: A PreparedTypoTransformer ready for use
    /// - Throws: KiwiError if preparation fails
    public func prepare() throws -> PreparedTypoTransformer {
        guard let prepared = kiwi_typo_prepare(handle) else {
            throw TypoTransformer.failure("Failed to prepare typo transformer")
        }
        return PreparedTypoTransformer(handle: prepared)
    }

    deinit {
        // Handles marked `shouldClose: false` are left open.
        guard shouldClose else {
            return
        }
        kiwi_typo_close(handle)
    }
}
|
||||
|
||||
/// Prepared typo transformer for use in analysis.
///
/// Takes over the native `kiwi_prepared_typo_h` handle it is created with and
/// closes it in `deinit`.
public final class PreparedTypoTransformer {
    internal let handle: kiwi_prepared_typo_h

    deinit {
        kiwi_prepared_typo_close(handle)
    }

    /// Wraps a native prepared-typo handle.
    internal init(handle preparedHandle: kiwi_prepared_typo_h) {
        self.handle = preparedHandle
    }
}
|
||||
263
bindings/swift/Tests/KiwiTests/KiwiTests.swift
Normal file
263
bindings/swift/Tests/KiwiTests/KiwiTests.swift
Normal file
|
|
@ -0,0 +1,263 @@
|
|||
import XCTest
|
||||
@testable import Kiwi
|
||||
|
||||
/// Unit and integration tests for the Swift Kiwi bindings.
///
/// The integration tests need the model directory at `modelPath`. When it is missing they are
/// reported as *skipped* (via `XCTSkip`) instead of silently passing.
final class KiwiTests: XCTestCase {

    /// Model location used by every integration test - in CI it should be at ../../models/cong/base
    private static let modelPath = "../../models/cong/base"

    /// Returns the model path, or skips the calling test when the model is not available
    /// (for local development without models).
    /// - Throws: `XCTSkip` when nothing exists at `modelPath`.
    private func requireModelPath() throws -> String {
        let path = KiwiTests.modelPath
        guard FileManager.default.fileExists(atPath: path) else {
            throw XCTSkip("Model not found at \(path), skipping integration test")
        }
        return path
    }

    func testPOSTagDescription() {
        XCTAssertEqual(POSTag.nng.description, "NNG")
        XCTAssertEqual(POSTag.nnp.description, "NNP")
        XCTAssertEqual(POSTag.vv.description, "VV")
        XCTAssertEqual(POSTag.jks.description, "JKS")
    }

    func testPOSTagFromString() {
        XCTAssertEqual(POSTag(string: "NNG"), .nng)
        XCTAssertEqual(POSTag(string: "VV"), .vv)
        XCTAssertEqual(POSTag(string: "nng"), .nng)
        XCTAssertNil(POSTag(string: "INVALID"))
    }

    func testMatchOptionsBasic() {
        let options: MatchOptions = [.url, .email]
        XCTAssertTrue(options.contains(.url))
        XCTAssertTrue(options.contains(.email))
        XCTAssertFalse(options.contains(.hashtag))
    }

    func testMatchOptionsAll() {
        let options = MatchOptions.all
        XCTAssertTrue(options.contains(.url))
        XCTAssertTrue(options.contains(.email))
        XCTAssertTrue(options.contains(.hashtag))
        XCTAssertTrue(options.contains(.mention))
    }

    func testDialectOptions() {
        let dialects: Dialect = [.gyeonggi, .jeju]
        XCTAssertTrue(dialects.contains(.gyeonggi))
        XCTAssertTrue(dialects.contains(.jeju))
        XCTAssertFalse(dialects.contains(.gangwon))
    }

    func testTokenInitialization() {
        let token = Token(
            form: "테스트",
            tag: .nng,
            position: 0,
            length: 3,
            score: 1.0
        )

        XCTAssertEqual(token.form, "테스트")
        XCTAssertEqual(token.tag, .nng)
        XCTAssertEqual(token.position, 0)
        XCTAssertEqual(token.length, 3)
        XCTAssertEqual(token.score, 1.0)
    }

    func testTokenDescription() {
        let token = Token(form: "테스트", tag: .nng)
        XCTAssertEqual(token.description, "테스트/NNG")
    }

    func testVersion() {
        let version = Kiwi.version
        XCTAssertFalse(version.isEmpty)
        XCTAssertNotEqual(version, "unknown")
    }

    // MARK: - Integration Tests

    func testKiwiBuilderAndTokenize() throws {
        // Create builder and build Kiwi instance
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        // Test basic tokenization
        let text = "안녕하세요"
        let tokens = try kiwi.tokenize(text)

        // Verify we got some tokens
        XCTAssertFalse(tokens.isEmpty, "Tokenization should return tokens")

        // Verify token structure
        for token in tokens {
            XCTAssertFalse(token.form.isEmpty, "Token form should not be empty")
            XCTAssertGreaterThanOrEqual(token.position, 0, "Token position should be non-negative")
            XCTAssertGreaterThan(token.length, 0, "Token length should be positive")
        }
    }

    func testKiwiAnalyze() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        // Test analyze with multiple results
        let text = "형태소 분석"
        let results = try kiwi.analyze(text, topN: 2)

        // Should have at least one result
        XCTAssertFalse(results.isEmpty, "Analysis should return results")

        // First result should have tokens
        if let firstResult = results.first {
            XCTAssertFalse(firstResult.tokens.isEmpty, "Result should have tokens")
            // Score is log probability, so it can be negative
            XCTAssertFalse(firstResult.score.isNaN, "Result score should not be NaN")
        }
    }

    func testSplitIntoSentences() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        // Test sentence splitting
        let text = "안녕하세요. 키위입니다. 형태소 분석을 합니다."
        let sentences = try kiwi.splitIntoSentences(text)

        // Should have 3 sentences
        XCTAssertEqual(sentences.count, 3, "Should split into 3 sentences")

        // Verify sentence structure
        for sentence in sentences {
            XCTAssertFalse(sentence.text.isEmpty, "Sentence text should not be empty")
            XCTAssertGreaterThanOrEqual(sentence.start, 0, "Sentence start should be non-negative")
            XCTAssertGreaterThan(sentence.length, 0, "Sentence length should be positive")
        }
    }

    func testJoiner() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        // Test joiner
        let joiner = try kiwi.createJoiner()
        try joiner.add(form: "형태소", tag: .nng)
        try joiner.add(form: "분석", tag: .nng)

        let joined = try joiner.join()
        XCTAssertFalse(joined.isEmpty, "Joined text should not be empty")
    }

    // MARK: - Typo Correction Tests

    func testBasicTypoCorrection() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        // Prepare basic typo transformer
        let typoTransformer = try TypoTransformer.default(.basicTypoSet)
        let preparedTypo = try typoTransformer.prepare()

        // Without typo correction: '죰' remains as-is
        let tokensNoTypo = try kiwi.tokenize("나 죰 도와죠.")
        let formsNoTypo = tokensNoTypo.map { $0.form }
        XCTAssertTrue(formsNoTypo.contains("죰"), "Without typo correction, '죰' should remain")

        // With typo correction: '죰' → '좀'
        let tokensWithTypo = try kiwi.tokenize("나 죰 도와죠.", typoTransformer: preparedTypo)
        let formsWithTypo = tokensWithTypo.map { $0.form }
        XCTAssertTrue(formsWithTypo.contains("좀"), "With typo correction, '죰' should become '좀'")
    }

    func testContinualTypoCorrection() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        let typoTransformer = try TypoTransformer.default(.continualTypoSet)
        let preparedTypo = try typoTransformer.prepare()

        let tokens = try kiwi.tokenize("프로그래미", typoTransformer: preparedTypo)
        // Compare the leading forms as a slice so a short result fails the assertion
        // instead of trapping on an out-of-range subscript.
        XCTAssertEqual(tokens.prefix(2).map { $0.form }, ["프로그램", "이"])
    }

    func testTypoCorrectionViaAnalyze() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        let typoTransformer = try TypoTransformer.default(.basicTypoSet)
        let preparedTypo = try typoTransformer.prepare()

        let results = try kiwi.analyze("나 죰 도와죠.", topN: 1, typoTransformer: preparedTypo)
        XCTAssertFalse(results.isEmpty)
        // `first` keeps an empty result a test failure rather than a crash.
        let forms = results.first?.tokens.map { $0.form } ?? []
        XCTAssertTrue(forms.contains("좀"))
    }

    func testBasicWithContinualTypoCorrection() throws {
        let builder = try KiwiBuilder(modelPath: try requireModelPath(), numThreads: 1)
        let kiwi = try builder.build()

        let typoTransformer = try TypoTransformer.default(.basicTypoSetWithContinual)
        let preparedTypo = try typoTransformer.prepare()

        // continual typo: '프로그래미' → '프로그램' + '이'
        let tokens1 = try kiwi.tokenize("프로그래미", typoTransformer: preparedTypo)
        XCTAssertEqual(tokens1.first?.form, "프로그램")

        // basic typo: '죰' → '좀'
        let tokens2 = try kiwi.tokenize("나 죰 도와죠.", typoTransformer: preparedTypo)
        let forms = tokens2.map { $0.form }
        XCTAssertTrue(forms.contains("좀"))
    }
}
|
||||
124
bindings/swift/scripts/build-xcframework.sh
Executable file
124
bindings/swift/scripts/build-xcframework.sh
Executable file
|
|
@ -0,0 +1,124 @@
|
|||
#!/bin/bash
set -e

# Builds the Kiwi static library for several Apple platforms/architectures and
# bundles the per-platform frameworks into one XCFramework.

# All paths are derived from this script's own location, so the script can be
# invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
SWIFT_DIR="$PROJECT_ROOT/bindings/swift"
BUILD_DIR="$SWIFT_DIR/build"
XCFRAMEWORK_DIR="$SWIFT_DIR/xcframework"

# Start from empty output directories so stale artifacts are never packaged.
echo "Cleaning previous builds..."
for output_dir in "$BUILD_DIR" "$XCFRAMEWORK_DIR"; do
    rm -rf "$output_dir"
done
for output_dir in "$BUILD_DIR" "$XCFRAMEWORK_DIR"; do
    mkdir -p "$output_dir"
done
|
||||
|
||||
# Configure and build Kiwi for one platform, then wrap the static library and
# the C API headers into "$BUILD_DIR/<subdir>/Kiwi.framework".
#   $1  CMAKE_SYSTEM_NAME value (e.g. iOS, Darwin)
#   $2  SDK passed as CMAKE_OSX_SYSROOT (e.g. iphoneos)
#   $3  semicolon-separated architecture list
#   $4  deployment target version
#   $5  sub-directory of $BUILD_DIR used for this build
# Leaves the working directory set to the platform build directory.
build_platform() {
    local platform=$1 sdk=$2 archs=$3 deployment_target=$4 build_subdir=$5
    local platform_build_dir="$BUILD_DIR/$build_subdir"
    local framework_dir="$platform_build_dir/Kiwi.framework"

    echo "Building for $platform ($archs)..."

    mkdir -p "$platform_build_dir"
    cd "$platform_build_dir"

    # Static library only; every optional tool and binding is switched off.
    local cmake_args=(
        -DCMAKE_BUILD_TYPE=Release
        "-DCMAKE_SYSTEM_NAME=$platform"
        "-DCMAKE_OSX_SYSROOT=$sdk"
        "-DCMAKE_OSX_ARCHITECTURES=$archs"
        "-DCMAKE_OSX_DEPLOYMENT_TARGET=$deployment_target"
        -DKIWI_BUILD_DYNAMIC=OFF
        -DKIWI_BUILD_CLI=OFF
        -DKIWI_BUILD_EVALUATOR=OFF
        -DKIWI_BUILD_MODEL_BUILDER=OFF
        -DKIWI_BUILD_TEST=OFF
        -DKIWI_JAVA_BINDING=OFF
        -DKIWI_USE_MIMALLOC=ON
        -DKIWI_USE_CPUINFO=OFF
        -GXcode
    )
    cmake "$PROJECT_ROOT" "${cmake_args[@]}"
    cmake --build . --config Release

    # Create framework structure
    mkdir -p "$framework_dir/Headers"

    # Xcode puts the library in different places depending on the platform
    # (e.g., Release-iphoneos, Release-iphonesimulator, Release), so search for it.
    local lib_file
    lib_file=$(find . -name "libkiwi_static.a" | grep "Release" | head -n 1)
    if [[ -z "$lib_file" ]]; then
        echo "Error: libkiwi_static.a not found in $platform_build_dir"
        exit 1
    fi
    cp "$lib_file" "$framework_dir/Kiwi"

    # Copy headers
    local header
    for header in capi.h Macro.h; do
        cp "$PROJECT_ROOT/include/kiwi/$header" "$framework_dir/Headers/"
    done

    # Create module map
    mkdir -p "$framework_dir/Modules"
    cat > "$framework_dir/Modules/module.modulemap" << EOF
framework module Kiwi {
umbrella header "capi.h"
export *
module * { export * }
}
EOF

    echo "✓ Built $platform"
}
|
||||
|
||||
# One framework per target: platform, SDK, architectures, deployment target, build sub-directory.
#   - iOS Device (arm64)
#   - iOS Simulator (arm64 + x86_64)
#   - macOS (arm64 + x86_64 universal)
platform_specs=(
    "iOS iphoneos arm64 12.0 ios-arm64"
    "iOS iphonesimulator arm64;x86_64 12.0 ios-simulator"
    "Darwin macosx arm64;x86_64 10.14 macos"
)

framework_args=()
for spec in "${platform_specs[@]}"; do
    read -r spec_platform spec_sdk spec_archs spec_target spec_subdir <<< "$spec"
    build_platform "$spec_platform" "$spec_sdk" "$spec_archs" "$spec_target" "$spec_subdir"
    framework_args+=(-framework "$BUILD_DIR/$spec_subdir/Kiwi.framework")
done

# Create XCFramework
echo "Creating XCFramework..."
xcodebuild -create-xcframework \
    "${framework_args[@]}" \
    -output "$XCFRAMEWORK_DIR/Kiwi.xcframework"

echo "✓ XCFramework created at $XCFRAMEWORK_DIR/Kiwi.xcframework"

# Create zip file for distribution
echo "Creating zip archive..."
cd "$XCFRAMEWORK_DIR"
zip -r -y Kiwi.xcframework.zip Kiwi.xcframework

# Calculate checksum for Swift Package Manager; fall back to "N/A" when the command fails
CHECKSUM=$(swift package compute-checksum Kiwi.xcframework.zip 2>/dev/null || echo "N/A")
echo "Checksum: $CHECKSUM"

# Summary plus a ready-to-paste binaryTarget snippet.
cat << EOF

Build complete!
XCFramework location: $XCFRAMEWORK_DIR/Kiwi.xcframework
Zip archive: $XCFRAMEWORK_DIR/Kiwi.xcframework.zip

To use with Swift Package Manager binaryTarget:
 .binaryTarget(
 name: "CKiwi",
 url: "<RELEASE_URL>/Kiwi.xcframework.zip",
 checksum: "$CHECKSUM"
 )
EOF
|
||||
|
|
@ -15,6 +15,17 @@ Running the above command also automatically upgrades to package version if it d
|
|||
|
||||
You can also find the recent pre-built package at npm: https://www.npmjs.com/package/kiwi-nlp.
|
||||
|
||||
## Testing
|
||||
|
||||
To run unit tests for the WASM package, first build the package using `./build.sh`, then run:
|
||||
|
||||
```bash
|
||||
cd package
|
||||
npm test
|
||||
```
|
||||
|
||||
Tests are powered by [Vitest](https://vitest.dev/) and run in a Node.js environment.
|
||||
|
||||
## Documentation
|
||||
|
||||
The documentation for the package can be generated by running `npm run doc` inside the `package` directory.
|
||||
|
|
|
|||
|
|
@ -22,6 +22,8 @@ int nextInstanceId() {
|
|||
|
||||
static std::map<int, std::unordered_set<const Morpheme*>> morphemeSets;
|
||||
|
||||
static std::map<std::string, PreparedTypoTransformer> preparedTypoCache;
|
||||
|
||||
int nextMorphemeSetId() {
|
||||
static int id = 0;
|
||||
return id++;
|
||||
|
|
@ -132,6 +134,70 @@ std::vector<PretokenizedSpan> parsePretokenizedArg(const json& args, size_t inde
|
|||
}
|
||||
|
||||
|
||||
/**
 * Resolves the typo argument of an analysis call into a prepared typo transformer.
 *
 * `args[index]` may be absent or null (no typo correction), a string naming a built-in
 * typo set ("none", "basic", "continual", "basicWithContinual"), or an object describing
 * a custom transformer with optional `defs` and `continualTypoCost` members.
 * Custom transformers are prepared once and stored in `preparedTypoCache`, keyed by the
 * JSON dump of the argument, so repeated calls with the same definition reuse the result.
 *
 * @param args               JSON array holding the call arguments.
 * @param index              Position of the typo argument; `index + 1` holds the cost threshold.
 * @param typoCostThreshold  Output. Set to `args[index + 1]` (default 2.5) whenever the typo
 *                           argument is present and non-null; left untouched otherwise.
 * @return Pointer to the transformer to use, or nullptr when the argument is missing, null
 *         or "none". For any other string the result of getDefaultPreparedTypoSet is returned;
 *         unrecognized names map to DefaultTypoSet::withoutTypo.
 */
const PreparedTypoTransformer* parseTypoArg(const json& args, size_t index, float& typoCostThreshold) {
	if (args.size() <= index) return nullptr;
	const auto& typoArg = args.at(index);
	if (typoArg.is_null()) return nullptr;

	typoCostThreshold = getAtOrDefault(args, index + 1, 2.5f);

	std::string cacheKey;
	if (typoArg.is_string()) {
		cacheKey = typoArg.get<std::string>();
		if (cacheKey == "none") return nullptr;
	} else {
		cacheKey = typoArg.dump();
	}

	auto it = preparedTypoCache.find(cacheKey);
	if (it != preparedTypoCache.end()) {
		return &it->second;
	}

	if (typoArg.is_string()) {
		const std::string typosStr = typoArg.get<std::string>();
		DefaultTypoSet typoSet = DefaultTypoSet::withoutTypo;
		if (typosStr == "basic") {
			typoSet = DefaultTypoSet::basicTypoSet;
		} else if (typosStr == "continual") {
			typoSet = DefaultTypoSet::continualTypoSet;
		} else if (typosStr == "basicWithContinual") {
			typoSet = DefaultTypoSet::basicTypoSetWithContinual;
		}
		return getDefaultPreparedTypoSet(typoSet);
	} else {
		TypoTransformer typoTransformer;
		for (const auto& def : typoArg.value("defs", json::array())) {
			const float cost = def.value("cost", 1.0f);

			CondVowel condVowel = CondVowel::none;
			const std::string condVowelStr = def.value("condition", "none");

			if (condVowelStr == "any") {
				condVowel = CondVowel::any;
			} else if (condVowelStr == "vowel") {
				condVowel = CondVowel::vowel;
			} else if (condVowelStr == "applosive") {
				condVowel = CondVowel::applosive;
			}

			// `def` is const, so operator[] on a missing key would be undefined behaviour;
			// at() reports a malformed definition through the library's normal error path.
			const auto& origList = def.at("orig");
			const auto& errorList = def.at("error");

			// Register every (orig, error) combination of this definition.
			for (const auto& orig8 : origList) {
				const auto orig16 = utf8To16(orig8);
				for (const auto& error8 : errorList) {
					typoTransformer.addTypo(orig16, utf8To16(error8), cost, condVowel);
				}
			}
		}

		const float continualTypoCost = typoArg.value("continualTypoCost", 1.0f);
		typoTransformer.setContinualTypoCost(continualTypoCost);

		// Cache the prepared transformer; the returned pointer refers to the cached entry.
		auto [insertIt, _] = preparedTypoCache.emplace(cacheKey, typoTransformer.prepare(true));
		return &insertIt->second;
	}
}
|
||||
|
||||
|
||||
inline json serializeTokenInfo(const Kiwi& kiwi, const TokenInfo& tokenInfo) {
|
||||
return {
|
||||
{ "str", utf16To8(tokenInfo.str) },
|
||||
|
|
@ -345,54 +411,7 @@ json build(const json& args) {
|
|||
builder.addPreAnalyzedWord(form, analyzed, positions, score);
|
||||
}
|
||||
|
||||
const auto typos = buildArgs.value("typos", json(nullptr));
|
||||
const float typoCostThreshold = buildArgs.value("typoCostThreshold", 2.5f);
|
||||
|
||||
if (typos.is_null()) {
|
||||
instances.emplace(id, builder.build(DefaultTypoSet::withoutTypo, typoCostThreshold));
|
||||
} else if (typos.is_string()) {
|
||||
DefaultTypoSet typoSet = DefaultTypoSet::withoutTypo;
|
||||
const std::string typosStr = typos.get<std::string>();
|
||||
|
||||
if (typosStr == "basic") {
|
||||
typoSet = DefaultTypoSet::basicTypoSet;
|
||||
} else if (typosStr == "continual") {
|
||||
typoSet = DefaultTypoSet::continualTypoSet;
|
||||
} else if (typosStr == "basicWithContinual") {
|
||||
typoSet = DefaultTypoSet::basicTypoSetWithContinual;
|
||||
}
|
||||
|
||||
instances.emplace(id, builder.build(typoSet, typoCostThreshold));
|
||||
} else {
|
||||
TypoTransformer typoTransformer;
|
||||
|
||||
for (const auto& def : typos.value("defs", json::array())) {
|
||||
const float cost = def.value("cost", 1.0f);
|
||||
|
||||
CondVowel condVowel = CondVowel::none;
|
||||
const std::string condVowelStr = def.value("condVowel", "none");
|
||||
|
||||
if (condVowelStr == "any") {
|
||||
condVowel = CondVowel::any;
|
||||
} else if (condVowelStr == "vowel") {
|
||||
condVowel = CondVowel::vowel;
|
||||
} else if (condVowelStr == "applosive") {
|
||||
condVowel = CondVowel::applosive;
|
||||
}
|
||||
|
||||
for (const auto& orig8 : def["orig"]) {
|
||||
const auto orig16 = utf8To16(orig8);
|
||||
for (const auto& error8 : def["error"]) {
|
||||
typoTransformer.addTypo(orig16, utf8To16(error8), cost, condVowel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const float continualTypoCost = typos.value("continualTypoCost", 1.0f);
|
||||
typoTransformer.setContinualTypoCost(continualTypoCost);
|
||||
|
||||
instances.emplace(id, builder.build(typoTransformer, typoCostThreshold));
|
||||
}
|
||||
instances.emplace(id, builder.build());
|
||||
|
||||
return id;
|
||||
}
|
||||
|
|
@ -410,10 +429,15 @@ json kiwiAnalyze(Kiwi& kiwi, const json& args) {
|
|||
const Match matchOptions = getAtOrDefault(args, 1, Match::allWithNormalizing);
|
||||
const BlockListArg blockListArg(kiwi, args, 2);
|
||||
const auto pretokenized = parsePretokenizedArg(args, 3);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 4, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 5, 3.0f);
|
||||
|
||||
const TokenResult tokenResult = kiwi.analyze(str, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
|
||||
float typoCostThreshold = 2.5f;
|
||||
const auto* typoTransformer = parseTypoArg(args, 4, typoCostThreshold);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 6, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 7, 3.0f);
|
||||
|
||||
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
|
||||
opt.typoTransformer = typoTransformer;
|
||||
opt.typoThreshold = typoCostThreshold;
|
||||
const TokenResult tokenResult = kiwi.analyze(str, opt, pretokenized);
|
||||
|
||||
return serializeTokenResult(kiwi, tokenResult);
|
||||
}
|
||||
|
|
@ -424,10 +448,15 @@ json kiwiAnalyzeTopN(Kiwi& kiwi, const json& args) {
|
|||
const Match matchOptions = getAtOrDefault(args, 2, Match::allWithNormalizing);
|
||||
const BlockListArg blockListArg(kiwi, args, 3);
|
||||
const auto pretokenized = parsePretokenizedArg(args, 4);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 4, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 5, 3.0f);
|
||||
float typoCostThreshold = 2.5f;
|
||||
const auto* typoTransformer = parseTypoArg(args, 5, typoCostThreshold);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 7, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 8, 3.0f);
|
||||
|
||||
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
|
||||
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
|
||||
opt.typoTransformer = typoTransformer;
|
||||
opt.typoThreshold = typoCostThreshold;
|
||||
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, opt, pretokenized);
|
||||
|
||||
return serializeTokenResultVec(kiwi, tokenResults);
|
||||
}
|
||||
|
|
@ -437,10 +466,15 @@ json kiwiTokenize(Kiwi& kiwi, const json& args) {
|
|||
const Match matchOptions = getAtOrDefault(args, 1, Match::allWithNormalizing);
|
||||
const BlockListArg blockListArg(kiwi, args, 2);
|
||||
const auto pretokenized = parsePretokenizedArg(args, 3);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 4, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 5, 3.0f);
|
||||
|
||||
const TokenResult tokenResult = kiwi.analyze(str, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
|
||||
float typoCostThreshold = 2.5f;
|
||||
const auto* typoTransformer = parseTypoArg(args, 4, typoCostThreshold);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 6, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 7, 3.0f);
|
||||
|
||||
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
|
||||
opt.typoTransformer = typoTransformer;
|
||||
opt.typoThreshold = typoCostThreshold;
|
||||
const TokenResult tokenResult = kiwi.analyze(str, opt, pretokenized);
|
||||
|
||||
return serializeTokenInfoVec(kiwi, tokenResult.first);
|
||||
}
|
||||
|
|
@ -451,10 +485,15 @@ json kiwiTokenizeTopN(Kiwi& kiwi, const json& args) {
|
|||
const Match matchOptions = getAtOrDefault(args, 2, Match::allWithNormalizing);
|
||||
const BlockListArg blockListArg(kiwi, args, 3);
|
||||
const auto pretokenized = parsePretokenizedArg(args, 4);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 5, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 6, 3.0f);
|
||||
float typoCostThreshold = 2.5f;
|
||||
const auto* typoTransformer = parseTypoArg(args, 5, typoCostThreshold);
|
||||
const auto allowedDialects = parseDialects(getAtOrDefault(args, 7, std::string{ "standard" }));
|
||||
const auto dialectCost = getAtOrDefault(args, 8, 3.0f);
|
||||
|
||||
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, AnalyzeOption{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost }, pretokenized);
|
||||
AnalyzeOption opt{ matchOptions, blockListArg.setPtr(), false, allowedDialects, dialectCost };
|
||||
opt.typoTransformer = typoTransformer;
|
||||
opt.typoThreshold = typoCostThreshold;
|
||||
const std::vector<TokenResult> tokenResults = kiwi.analyze(str, topN, opt, pretokenized);
|
||||
|
||||
json result = json::array();
|
||||
for (const TokenResult& tokenResult : tokenResults) {
|
||||
|
|
@ -528,25 +567,36 @@ json kiwiGetGlobalConfig(Kiwi& kiwi, const json& args) {
|
|||
auto config = kiwi.getGlobalConfig();
|
||||
obj["integrateAllomorph"] = config.integrateAllomorph;
|
||||
obj["cutOffThreshold"] = config.cutOffThreshold;
|
||||
obj["unkFormScoreScale"] = config.unkFormScoreScale;
|
||||
obj["unkFormScoreBias"] = config.unkFormScoreBias;
|
||||
obj["oovRuleScale"] = config.oovRuleScale;
|
||||
obj["oovRuleBias"] = config.oovRuleBias;
|
||||
obj["oovChrBias"] = config.oovChrBias;
|
||||
obj["oovGlobalWeight"] = config.oovGlobalWeight;
|
||||
obj["oovLocalWeight"] = config.oovLocalWeight;
|
||||
obj["oovGlobalMinFreq"] = config.oovGlobalMinFreq;
|
||||
obj["spacePenalty"] = config.spacePenalty;
|
||||
obj["typoCostWeight"] = config.typoCostWeight;
|
||||
obj["maxUnkFormSize"] = config.maxUnkFormSize;
|
||||
obj["maxUnkFormSizeFollowedByJClass"] = config.maxUnkFormSizeFollowedByJClass;
|
||||
obj["spaceTolerance"] = config.spaceTolerance;
|
||||
return obj;
|
||||
}
|
||||
|
||||
json kiwiSetGlobalConfig(Kiwi& kiwi, const json& args) {
|
||||
KiwiConfig config;
|
||||
if (args.contains("integrateAllomorph")) config.integrateAllomorph = args["integrateAllomorph"];
|
||||
if (args.contains("cutOffThreshold")) config.cutOffThreshold = args["cutOffThreshold"];
|
||||
if (args.contains("unkFormScoreScale")) config.unkFormScoreScale = args["unkFormScoreScale"];
|
||||
if (args.contains("unkFormScoreBias")) config.unkFormScoreBias = args["unkFormScoreBias"];
|
||||
if (args.contains("spacePenalty")) config.spacePenalty = args["spacePenalty"];
|
||||
if (args.contains("typoCostWeight")) config.typoCostWeight = args["typoCostWeight"];
|
||||
if (args.contains("maxUnkFormSize")) config.maxUnkFormSize = args["maxUnkFormSize"];
|
||||
if (args.contains("spaceTolerance")) config.spaceTolerance = args["spaceTolerance"];
|
||||
auto config = kiwi.getGlobalConfig();
|
||||
const json& configArg = args[0];
|
||||
if (configArg.contains("integrateAllomorph")) config.integrateAllomorph = configArg["integrateAllomorph"];
|
||||
if (configArg.contains("cutOffThreshold")) config.cutOffThreshold = configArg["cutOffThreshold"];
|
||||
if (configArg.contains("oovRuleScale")) config.oovRuleScale = configArg["oovRuleScale"];
|
||||
if (configArg.contains("oovRuleBias")) config.oovRuleBias = configArg["oovRuleBias"];
|
||||
if (configArg.contains("oovChrBias")) config.oovChrBias = configArg["oovChrBias"];
|
||||
if (configArg.contains("oovGlobalWeight")) config.oovGlobalWeight = configArg["oovGlobalWeight"];
|
||||
if (configArg.contains("oovLocalWeight")) config.oovLocalWeight = configArg["oovLocalWeight"];
|
||||
if (configArg.contains("oovGlobalMinFreq")) config.oovGlobalMinFreq = configArg["oovGlobalMinFreq"];
|
||||
if (configArg.contains("spacePenalty")) config.spacePenalty = configArg["spacePenalty"];
|
||||
if (configArg.contains("typoCostWeight")) config.typoCostWeight = configArg["typoCostWeight"];
|
||||
if (configArg.contains("maxUnkFormSize")) config.maxUnkFormSize = configArg["maxUnkFormSize"];
|
||||
if (configArg.contains("maxUnkFormSizeFollowedByJClass")) config.maxUnkFormSizeFollowedByJClass = configArg["maxUnkFormSizeFollowedByJClass"];
|
||||
if (configArg.contains("spaceTolerance")) config.spaceTolerance = configArg["spaceTolerance"];
|
||||
kiwi.setGlobalConfig(config);
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
|||
1825
bindings/wasm/package/package-lock.json
generated
1825
bindings/wasm/package/package-lock.json
generated
File diff suppressed because it is too large
Load diff
|
|
@ -21,12 +21,20 @@
|
|||
"url": "https://github.com/bab2min/Kiwi/issues"
|
||||
},
|
||||
"homepage": "https://lab.bab2min.pe.kr/kiwi",
|
||||
"publishConfig": {
|
||||
"provenance": true,
|
||||
"access": "public"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^20.0.0",
|
||||
"typedoc": "^0.26.2",
|
||||
"typescript": "^5.4.5"
|
||||
"typescript": "^5.4.5",
|
||||
"undici-types": "^7.22.0",
|
||||
"vitest": "^1.6.0"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"doc": "typedoc --out doc src"
|
||||
"doc": "typedoc --out doc src",
|
||||
"test": "vitest run"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -148,14 +148,4 @@ export interface BuildArgs {
|
|||
* - `cong-global`: (experimental) Contextual N-gram embedding Language Model. It consists of lightweighted neural networks that can estimate the relationships between morphemes over large distances (up to 7 real morphemes) with high accuracy.
|
||||
*/
|
||||
modelType?: 'none' | 'largest' | 'knlm' | 'sbg' | 'cong' | 'cong-global';
|
||||
/**
|
||||
* The typo information to use for correction.
|
||||
* Can be one of the built in `none`, `basic`, `continual`, `basicWithContinual` typo sets, or a custom {@link TypoTransformer}.
|
||||
* Defaults to `none`, which disables typo correction.
|
||||
*/
|
||||
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer;
|
||||
/**
|
||||
* The maximum typo cost to consider when correcting typos. Typos beyond this cost will not be explored. Defaults to 2.5.
|
||||
*/
|
||||
typoCostThreshold?: number;
|
||||
};
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
import { AsyncMethods } from './util.js';
|
||||
import { TypoTransformer } from './build-args.js';
|
||||
|
||||
/**
|
||||
* Describes a single morpheme in the input string of the morphological analysis.
|
||||
|
|
@ -82,6 +83,11 @@ export enum Match {
|
|||
mention = 1 << 3,
|
||||
serial = 1 << 4,
|
||||
emoji = 1 << 5,
|
||||
oovRuleOnly = 0 << 8,
|
||||
oovChrModel = 1 << 8,
|
||||
oovChrFreqModel = 2 << 8,
|
||||
oovChrFreqBranchModel = 3 << 8,
|
||||
oovMask = 3 << 8,
|
||||
normalizeCoda = 1 << 16,
|
||||
joinNounPrefix = 1 << 17,
|
||||
joinNounSuffix = 1 << 18,
|
||||
|
|
@ -91,6 +97,9 @@ export enum Match {
|
|||
splitComplex = 1 << 22,
|
||||
zCoda = 1 << 23,
|
||||
compatibleJamo = 1 << 24,
|
||||
splitSaisiot = 1 << 25,
|
||||
mergeSaisiot = 1 << 26,
|
||||
joinParticleYo = 1 << 27,
|
||||
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
|
||||
joinAffix = joinNounPrefix |
|
||||
joinNounSuffix |
|
||||
|
|
@ -150,6 +159,25 @@ export interface PretokenizedSpan {
|
|||
tokenization: PretokenizedToken[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Describes global configuration for Kiwi.
|
||||
*/
|
||||
export interface KiwiConfig {
|
||||
integrateAllomorph?: boolean;
|
||||
cutOffThreshold?: number;
|
||||
oovRuleScale?: number;
|
||||
oovRuleBias?: number;
|
||||
oovChrBias?: number;
|
||||
oovGlobalWeight?: number;
|
||||
oovLocalWeight?: number;
|
||||
oovGlobalMinFreq?: number;
|
||||
spacePenalty?: number;
|
||||
typoCostWeight?: number;
|
||||
maxUnkFormSize?: number;
|
||||
maxUnkFormSizeFollowedByJClass?: number;
|
||||
spaceTolerance?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Interface that performs the actual morphological analysis.
|
||||
* Cannot be constructed directly, use {@link KiwiBuilder} to create a new instance.
|
||||
|
|
@ -177,7 +205,9 @@ export interface Kiwi {
|
|||
str: string,
|
||||
matchOptions?: Match,
|
||||
blockList?: Morph[] | MorphemeSet,
|
||||
pretokenized?: PretokenizedSpan[]
|
||||
pretokenized?: PretokenizedSpan[],
|
||||
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
|
||||
typoCostThreshold?: number
|
||||
) => TokenResult;
|
||||
/**
|
||||
* Performs morphological analysis. Returns multiple lists of tokens, each with an analysis score. Use `tokenizeTopN` if the result scores are not needed. Use `analyze` if you need only one result.
|
||||
|
|
@ -193,7 +223,9 @@ export interface Kiwi {
|
|||
n: number,
|
||||
matchOptions?: Match,
|
||||
blockList?: Morph[] | MorphemeSet,
|
||||
pretokenized?: PretokenizedSpan[]
|
||||
pretokenized?: PretokenizedSpan[],
|
||||
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
|
||||
typoCostThreshold?: number
|
||||
) => TokenResult[];
|
||||
/**
|
||||
* Performs morphological analysis. Returns a single list of tokens. Use `analyze` if the result score is needed. Use `tokenizeTopN` if you need multiple results.
|
||||
|
|
@ -207,7 +239,9 @@ export interface Kiwi {
|
|||
str: string,
|
||||
matchOptions?: Match,
|
||||
blockList?: Morph[] | MorphemeSet,
|
||||
pretokenized?: PretokenizedSpan[]
|
||||
pretokenized?: PretokenizedSpan[],
|
||||
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
|
||||
typoCostThreshold?: number
|
||||
) => TokenInfo[];
|
||||
/**
|
||||
* Performs morphological analysis. Returns multiple lists of tokens. Use `analyzeTopN` if the result scores are needed. Use `tokenize` if you need only one result.
|
||||
|
|
@ -223,7 +257,9 @@ export interface Kiwi {
|
|||
n: number,
|
||||
matchOptions?: Match,
|
||||
blockList?: Morph[] | MorphemeSet,
|
||||
pretokenized?: PretokenizedSpan[]
|
||||
pretokenized?: PretokenizedSpan[],
|
||||
typos?: 'none' | 'basic' | 'continual' | 'basicWithContinual' | TypoTransformer,
|
||||
typoCostThreshold?: number
|
||||
) => TokenInfo[][];
|
||||
/**
|
||||
* Returns the input text split into sentences. This method uses stemming internally during the sentence splitting process, so it can also be used to get stemming results simultaneously with sentence splitting.
|
||||
|
|
@ -249,22 +285,10 @@ export interface Kiwi {
|
|||
lmSearch?: boolean,
|
||||
withRanges?: boolean
|
||||
) => SentenceJoinResult;
|
||||
getCutOffThreshold: () => number;
|
||||
setCutOffThreshold: (v: number) => void;
|
||||
getUnkScoreBias: () => number;
|
||||
setUnkScoreBias: (v: number) => void;
|
||||
getUnkScoreScale: () => number;
|
||||
setUnkScoreScale: (v: number) => void;
|
||||
getMaxUnkFormSize: () => number;
|
||||
setMaxUnkFormSize: (v: number) => void;
|
||||
getSpaceTolerance: () => number;
|
||||
setSpaceTolerance: (v: number) => void;
|
||||
getSpacePenalty: () => number;
|
||||
setSpacePenalty: (v: number) => void;
|
||||
getTypoCostWeight: () => number;
|
||||
setTypoCostWeight: (v: number) => void;
|
||||
getIntegrateAllomorphic: () => boolean;
|
||||
setIntegrateAllomorphic: (v: boolean) => void;
|
||||
|
||||
getGlobalConfig: () => KiwiConfig;
|
||||
setGlobalConfig: (config: KiwiConfig) => void;
|
||||
|
||||
/**
|
||||
* Creates a reusable morpheme set from a list of morphemes. This is intended to be used as the `blockList` parameter for the analyze and tokenize methods.
|
||||
* NOTE: The morpheme set must be destroyed using `destroyMorphemeSet` when it is no longer needed. Otherwise, it will cause a memory leak.
|
||||
|
|
@ -280,6 +304,7 @@ export interface Kiwi {
|
|||
destroyMorphemeSet: (id: MorphemeSet) => void;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Interface that performs the actual morphological analysis.
|
||||
* Same as `Kiwi`, but with all methods returning promises. This can be used when the original `Kiwi` object is constructed with a Web Worker.
|
||||
|
|
|
|||
132
bindings/wasm/package/test/kiwi.test.ts
Normal file
132
bindings/wasm/package/test/kiwi.test.ts
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
import { describe, it, expect, beforeAll } from 'vitest';
|
||||
import { KiwiBuilder } from '../src/index.js';
|
||||
import { Kiwi, Match } from '../src/kiwi.js';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const PROJECT_ROOT = path.resolve(__dirname, '../../../../');
|
||||
const WASM_PATH = path.resolve(PROJECT_ROOT, 'bindings/wasm/build/bindings/wasm/kiwi-wasm.wasm');
|
||||
const MODEL_DIR = path.resolve(PROJECT_ROOT, 'models/cong/base');
|
||||
|
||||
/**
 * Reads the model files from `MODEL_DIR` into memory.
 *
 * Only the fixed list of file names below is considered. A file that does not
 * exist on disk is skipped silently, so the returned record may hold fewer
 * entries than the list.
 *
 * @returns A record mapping each file name that was found to its contents.
 */
function loadModelFiles(): Record<string, Uint8Array> {
    const modelFiles: Record<string, Uint8Array> = {};
    const files = [
        'combiningRule.txt', 'cong.mdl', 'default.dict',
        'dialect.dict', 'extract.mdl', 'multi.dict',
        'nounchr.mdl', 'sj.morph', 'typo.dict'
    ];

    for (const file of files) {
        const filePath = path.join(MODEL_DIR, file);
        if (fs.existsSync(filePath)) {
            modelFiles[file] = fs.readFileSync(filePath);
        }
    }
    return modelFiles;
}
|
||||
|
||||
// The suite needs the compiled WASM binary. When it has not been built, the
// whole suite is reported as skipped instead of letting every test return
// early and be counted as a pass.
const wasmMissing = !fs.existsSync(WASM_PATH);
if (wasmMissing) {
    console.warn(`WASM file not found at ${WASM_PATH}. Skipping tests.`);
}

describe.skipIf(wasmMissing)('Kiwi WASM', () => {
    let kiwiBuilder: KiwiBuilder;
    let kiwi: Kiwi;

    beforeAll(async () => {
        // Defensive guard: never try to load a binary that is not there.
        if (wasmMissing) return;

        kiwiBuilder = await KiwiBuilder.create(WASM_PATH);
        kiwi = await kiwiBuilder.build({
            modelFiles: loadModelFiles(),
            modelType: 'cong',
            integrateAllomorph: true,
        });
    });

    it('should be initialized', async () => {
        expect(kiwiBuilder.version()).toBeTypeOf('string');
    });

    it('should tokenize text', async () => {
        expect(kiwi.ready()).toBe(true);

        const result = kiwi.tokenize('안녕하세요 세계');
        expect(result.length).toBeGreaterThan(0);

        const tokens = result.map(t => t.str);
        expect(tokens).toContain('안녕');
    });

    it('should split sentences', async () => {
        const result = kiwi.splitIntoSents('안녕하세요. 반갑습니다!');
        expect(result.spans.length).toBe(2);
    });

    it('should correct typos with basic typo set via tokenize', async () => {
        // Without typo correction
        const tokensNoTypo = kiwi.tokenize('나 죰 도와죠.');
        expect(tokensNoTypo.map(t => t.str)).toContain('죰');

        // With basic typo correction at analyze time
        const tokensWithTypo = kiwi.tokenize('나 죰 도와죠.', Match.allWithNormalizing, undefined, undefined, 'basic');
        expect(tokensWithTypo.map(t => t.str)).toContain('좀');
    });

    it('should correct typos with basic typo set via analyze', async () => {
        const result = kiwi.analyze('나 죰 도와죠.', Match.allWithNormalizing, undefined, undefined, 'basic');
        expect(result.tokens.map(t => t.str)).toContain('좀');
    });

    it('should correct typos with basic typo set via analyzeTopN', async () => {
        const results = kiwi.analyzeTopN('나 죰 도와죠.', 3, Match.allWithNormalizing, undefined, undefined, 'basic');
        expect(results.length).toBeGreaterThan(0);
        expect(results[0].tokens.map(t => t.str)).toContain('좀');
    });

    it('should correct continual typos', async () => {
        const tokens = kiwi.tokenize('프로그래미', Match.allWithNormalizing, undefined, undefined, 'continual');
        const forms = tokens.map(t => t.str);
        expect(forms[0]).toBe('프로그램');
        expect(forms[1]).toBe('이');
    });

    it('should correct typos with basicWithContinual', async () => {
        // continual typo
        const tokens1 = kiwi.tokenize('프로그래미', Match.allWithNormalizing, undefined, undefined, 'basicWithContinual');
        expect(tokens1.map(t => t.str)[0]).toBe('프로그램');

        // basic typo
        const tokens2 = kiwi.tokenize('나 죰 도와죠.', Match.allWithNormalizing, undefined, undefined, 'basicWithContinual');
        expect(tokens2.map(t => t.str)).toContain('좀');
    });

    it('should get and set global config', async () => {
        const config = kiwi.getGlobalConfig();
        expect(config.cutOffThreshold).toBeTypeOf('number');

        const originalThreshold = config.cutOffThreshold;
        try {
            kiwi.setGlobalConfig({ cutOffThreshold: 10 });

            const newConfig = kiwi.getGlobalConfig();
            expect(newConfig.cutOffThreshold).toBe(10);
        } finally {
            // Restore even when an assertion above fails, so later tests see
            // the original threshold.
            kiwi.setGlobalConfig({ cutOffThreshold: originalThreshold });
        }
    });
});
|
||||
|
|
@ -1,9 +1,11 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"module": "ES6",
|
||||
"moduleResolution": "node",
|
||||
"target": "ES2017",
|
||||
"allowJs": true,
|
||||
"declaration": true,
|
||||
"skipLibCheck": true,
|
||||
"outDir": "./dist"
|
||||
},
|
||||
"include": ["src/**/*"],
|
||||
|
|
|
|||
|
|
@ -11,19 +11,19 @@
|
|||
아 때는 싸드바리로 배깥을 돌아댕기기도 하다와. 아__23/NNG 때/NNG 는/JX 싸드바리__1/NNG 로/XSM 배깥__1/NNG 을/JKO 돌아댕기__1/VV 기/ETN 도/JX 하/VX 다와/EF ./SF 아이 때는 알몸으로 바깥을 돌아다니기도 합디다
|
||||
이따가 지냑에 동네 으른들 모세 놓구 술대즙으 한다든데요. 이따가/MAG 지냑__1/NNG 에/JKB 동네/NNG 으른__1/NNG 들/XSN 모시/VV 어/EC 놓/VV 구/EC 술대즙__1/NNG 으/JKO 하/VV ᆫ다든데/EF 요/JX ./SF 이따가 저녁에 동네 어른들 모셔 놓고 술대접을 한다던데요.
|
||||
할루근네 한 번씩 말으 일기든 우리 아덜내미거 이제는 다 커서 취직으 했잖소. 할루근네__1/MAG 한/MM 번/NNB 씩/XSN 말__34/NNG 으/JKO 일기__1/VV 든/ETM 우리/NP 아덜내미__1/NNG 거/JKS 이제/NNG 는/JX 다/MAG 크/VV 어서/EC 취직/NNG 으/JKO 하/XSV 었/EP 지/EC 않/VX 소/EC ./SF 하루건너 한 번씩 말썽을 일으키던 우리 아들내미가 이제는 다 커서 취직을 했잖소.
|
||||
되바래진 어린눔이 버르장머리라군 웂구 아주 앨미르와. 되바라래지/VV ᆫ/ETM 어린눔__1/NNG 이/JKS 버르장머리/NNG 이/VCP 라구/EC ᆫ/JX 웂__1/VA 구/EC 아주/MAG 앨미릅/VA-I 어/EF ./SF 되바라진 어린놈이 버르장머리라곤 없고 아주 얄미워.
|
||||
되바래진 어린눔이 버르장머리라군 웂구 아주 앨미르와. 되바래지/VV ᆫ/ETM 어린눔__1/NNG 이/JKS 버르장머리/NNG 이/VCP 라구/EC ᆫ/JX 웂__1/VA 구/EC 아주/MAG 앨미릅/VA-I 어/EF ./SF 되바라진 어린놈이 버르장머리라곤 없고 아주 얄미워.
|
||||
지가 잘못해 놓구 외루 화르 내드라니! 지__49/NNG 가/JKS 잘못/MAG 하/VV 어/EC 놓/VX 구/EC 외루__3/MAG 화/NNG 르/JKO 내/VV 드라니/EF !/SF 자기가 잘못해 놓고 외려 화를 내더라니!
|
||||
맛없는 음석이래도 개 주자니 아꿉다. 맛없/VA 는/ETM 음석__1/NNG 이/VCP 래도/EC 개/NNG 주/VV 자니/EC 아꿉__1/VA-I 다/EF ./SF 맛없는 음식이라도 개 주자니 아깝다.
|
||||
말 안 듣구 나가 놀다가 넘어졌다니 그것 아주 싸고지다. 말/NNG 안/MAG 듣/VV-I 구/EC 나가/VV 어/EC 놀/VV 다가/EC 넘어지/VV 었/EP 다니/EC 그것/NP 아주/MAG 싸고지__1/NNG 이/VCP 다/EF ./SF 말 안 듣고 나가 놀다가 넘어졌다니 그것 아주 쌤통이다.
|
||||
자는 어렜을 적부터 대두 쫄라 대는 버릇이 있었아. 자/NP 는/JX 어리/VA 었/EP 을/ETM 적/NNB 부터/JX 대두__9/MAG 쪼르/VV 어/EC 대/VX 는/ETM 버릇/NNG 이/JKS 있/VV 었/EP 아/EF ./SF 쟤는 어렸을 적부터 되우 졸라 대는 버릇이 있었어.
|
||||
초저욹이래두 바람이 차니까 옷으 뜨시게 입구 나가라. 초저욹__1/NNG 이/VCP 래두/EC 바람/NNG 이/JKS 차/VA 니까/EC 옷/NNG 으/JKO 뜨시__1/VA 게/EC 입/VV-R 구/EC 나가/VV 라/EF ./SF 초겨울이라도 바람이 차니까 옷을 따뜻하게 입고 나가라.
|
||||
책상 모새이에 부닺헤서 멍당구가 시퍼렇게 들었아. 책상/NNG 모새이__1/NNG 에/JKB 부닺히/VV 어서/EC 멍당구__1/NNG 가/NNG 시퍼렇/VA-I 게/EC 들/VV 었/EP 아/EF ./SF 책상 모서리에 부딪혀서 멍이 시퍼렇게 들었어.
|
||||
책상 모새이에 부닺헤서 멍당구가 시퍼렇게 들었아. 책상/NNG 모새이__1/NNG 에/JKB 부닺히/VV 어서/EC 멍당구__1/NNG 가/JKS 시퍼렇/VA-I 게/EC 들/VV 었/EP 아/EF ./SF 책상 모서리에 부딪혀서 멍이 시퍼렇게 들었어.
|
||||
올해는 가믐더우 땜에 나락이 쨀고 그래서 걱정이래. 올해/NNG 는/JX 가믐더우__1/NNG 땜/NNB 에/JKB 나락__2/NNG 이/MM 쨀__1/VA 고/EC 그래서/MAJ 걱정/NNG 이/VCP 래/EF ./SF 올해는 가뭄더위 땜에 벼가 잘고 그래서 걱정이래.
|
||||
버점은 허옇기 일어나는 것도 있고 자꾸 번지 나가는 것도 있어여. 버점__1/NNG 은/JX 허옇/VA-I 기/ETN 일어나/VV 는/ETM 것/NNB 도/JX 있/VA 고/EC 자꾸/MAG 번지/VV 어/EC 나가/VX 는/ETM 것/NNB 도/JX 있/VA 어여/EF ./SF 버짐은 허옇게 일어나는 것도 있고 자꾸 번져 나가는 것도 있어요.
|
||||
우산 없이 길으 가다가 소나기르 좔락 다 맞았다. 우산/NNG 없이/MAG 길/NNG 으/JKO 가/VV 다가/EC 소나기/NNG 르/JKO 좔락__1/MAG 다/MAG 맞/VV 었/EP 다/EF ./SF 우산 없이 길을 가다가 소나기를 쫄딱 다 맞았다.
|
||||
콩국시가 아주 걸찌한 기 맛이 참 고만이네. 콩국시__1/NNG 가/JKS 아주/MAG 걸찌/XR 하/XSA ᆫ/ETM 기__76/NNB 이/JKS 맛/NNG 이/JKS 참/MAG 고만/MAG 이/VCP 네/EF ./SF 콩국수가 아주 걸쭉한 게 맛이 참 고만이네.
|
||||
가서 쇠궁이에 여물 좀 줘라. 가/VV 어서/EC 쇠궁이__1/NNG 에/JKB 여물/NNG 좀/MAG 주/VV 어라/EF ./SF 가서 소구유에 여물 좀 줘라.
|
||||
구리가 궁게이로 드가 뿠어. 구렁이/NNG 가/JKS 궁게이__1/NNG 로/XSM 드가/VV 어/EC 뿌/VX 었/EP 어/EF ./SF 구렁이가 구멍으로 들어가 버렸어.
|
||||
구리가 궁게이로 드가 뿠어. 구리__14/NNG 가/JKS 궁게이__1/NNG 로/XSM 드가/VV 어/EC 뿌/VX 었/EP 어/EF ./SF 구렁이가 구멍으로 들어가 버렸어.
|
||||
엉차 내가 다 할 일이잖소. 엉차__1/MAG 나/NP 가/JKS 다/MAG 하/VV ᆯ/ETM 일/NNG 이/VCP 잖소/EF ./SF 어차피 내가 다 할 일이잖소.
|
||||
어제 자에 갔더거 어릴 적 친구르 맞주이했잖소. 어제/MAG 자__53/NNG 에/JKB 가/VV 었/EP 더거/EC 어리/VA ᆯ/ETM 적/NNB 친구/NNG 르/JKO 맞주이하/VV 었/EP 잖소/EF ./SF 어제 장에 갔다가 어릴 적 친구와 마주쳤잖소.
|
||||
떡으 할라모 실그가 있어야 하잖소. 떡/NNG 으/JKO 하/VV ᆯ라모/EC 실그__1/NNG 가/JKS 있/VV 어야/EC 하/VX 잖소/EF ./SF 떡을 하려면 시루가 있어야 하잖소.
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@
|
|||
나는 그 음식을 음작음작 먹엇주. 나/NP 는/JX 그/MM 음식/NNG 을/JKO 음작음작__1/MAG 먹/VV 엇/EP 주/EF ./SF 나는 그 음식을 우물우물 먹었지.
|
||||
작년에 입어난 바진디 오돌랑한 거 보난 지레가 하영 큰 셍이라. 작년/NNG 에/JKB 입/VV-R 어나/EP ᆫ/ETM 바지/NNG 이/VCP ᆫ디/EC 오돌랑하__1/VA ᆫ/ETM 거/NNB 보/VV 난/EC 지레__3/NNG 가/JKS 하영__1/MAG 크/VV ᆫ/ETM 셍__2/NNB 이/VCP 라/EF ./SF 작년에 입었던 바지인데 껑충한 것을 보니 키가 많이 큰 모양이야.
|
||||
낭이 잘 자라젠 ᄒᆞ민 알거시려 줘사주게. 낭__2/NNG 이/JKS 잘/MAG 자라/VV 젠/EC ᄒᆞ/VX 민/EC 알거시리__1/VV 어/EC 주/VX 어사/EC 주/EF 게/EF ./SF 나무가 잘 자라려고 하면 가지치기해 줘야지.
|
||||
보릿낭을 긏앙 입에 대영 불민 소리가 나는디 걸 보리피리렌 ᄀᆞᆮ주. 보릿낭__1/NNG 을/JKO 긏/VV 엉/EC 입/NNG 에/JKB 대/VV 엉/EC 불/VV 민/EC 소리/NNG 가/JKS 나/VV 는디/EC 거/NP ᆯ/JKO 보리피리/NNG 이/VCP 렌/EC ᄀᆞᆮ/VV-I 주/EF ./SF 보릿짚을 잘라서 입에 대고 불면 소리가 나는데 그걸 보리피리라고 말하지.
|
||||
보릿낭을 긏앙 입에 대영 불민 소리가 나는디 걸 보리피리렌 ᄀᆞᆮ주. 보릿낭__1/NNG 을/JKO 긏/VV 엉/EC 입/NNG 에/JKB 대/VV 엉/EC 불/VV 민/EC 소리/NNG 가/JKS 나/VV 는디/EC 거/NP ᆯ/JKO 보리피리/NNG 이/VCP 렌/EC ᄀᆞᆮ/VV-I 주/EF ./SF 보릿짚을 잘라서 입에 대고 불면 소리가 나는데 그걸 보리피리라고 말하지.
|
||||
부리땡이에 데나네 조심허라. 부리땡이__1/NNG 에/JKB 데/VV 나네/EC 조심/NNG ᄒᆞ/XSV 어라/EF ./SF 부지깽이에 데니까 조심해라.
|
||||
친구덜끼리 모연 춤추곡 놀레 부르곡 잠베질하곡 놀앗주게. 친구/NNG 덜/XSN 끼리/XSN 모이/VV 언/EC 춤추/VV 곡/EC 놀래__1/NNG 부르/VV 곡/EC 잠베질하__1/VV 곡/EC 놀/VV 엇/EP 주/EF 게/EF ./SF 친구들끼리 모여서 춤추고 노래하고 재잘거리고 놀았지요.
|
||||
보리 ᄀᆞᆯ앙 보리ᄊᆞᆯ은 사름 먹곡 보리체는 쉐나 뒈지 것으로 주곡 헷엇지. 보리/NNG ᄀᆞᆯ__2/VV 엉/EC 보리/NNG ᄊᆞᆯ/NNG 은/JX 사름__2/NNG 먹/VV 곡/EC 보리/NNG 체/NNG 는/JX 쉐__4/NNG 나/JC 뒈지__1/NNG 것__7/NNG 으로/JKB 주/VV 곡/EC ᄒᆞ/VX 엇엇/EP 지/EF ./SF 보리 갈아서 보리쌀은 사람 먹고 보리체는 소나 돼지 먹이로 주고 했었지.
|
||||
|
|
@ -80,7 +80,7 @@
|
|||
아이덜신디 밥 ᄒᆞ꼼 줄 때도 가지깽이에 떵 주민 안 뒈여. 아이/NNG 덜/XSN 신디__1/JKB 밥/NNG ᄒᆞ꼼__3/MAG 주/VV ᆯ/ETM 때/NNG 도/JX 가지깽이__1/NNG 에/JKB 뜨/VV 엉/EC 주/VX 민/EC 안/MAG 뒈__3/VV 어/EF ./SF 아이들에게 밥 조금 줄 때도 바리뚜껑에 떠서 주면 안 돼.
|
||||
옛날엔 배 골르지 아녀게 지내민 것으로 뒈엇엇주. 옛날/NNG 에/JKB ᆫ/JX 배/NNG 골르__4/VV 지/EC 아니/VCN 어게/EC 지내/VV 민/EC 것__9/NP 으로/JKB 뒈__3/VV 엇엇/EP 주/EF ./SF 옛날엔 배를 곯지 않게 지내면 그것으로 되었었지.
|
||||
여이난 씨언한 물 좀 줍서. 여이__2/VV 난/EC 씨언하__1/VA ᆫ/ETM 물/NNG 좀/MAG 주/VV ᆸ서/EF ./SF 목마르니 시원한 물 좀 주세요.
|
||||
ᄒᆞ꼼허민 벨착벨착, 그 성질머리부터 고쳐사 헤. ᄒᆞ꼼__3/MAG ᄒᆞ/VV 민/EC 벨착벨착__1/MAG ,/SP 그/MM 성질/NNG 머리/NNG 부터/JX 고치/VV 어사/EC ᄒᆞ/VX 어/EF ./SF 툭하면 발끈발끈, 그 성질부터 고쳐야 해.
|
||||
ᄒᆞ꼼허민 벨착벨착, 그 성질머리부터 고쳐사 헤. ᄒᆞ꼼__3/MAG ᄒᆞ/VV 민/EC 벨착벨착__1/MAG ,/SP 그/MM 성질/NNG 머리/NNG 부터/JX 고치/VV 어사/EC ᄒᆞ/VX 어/EF ./SF 툭하면 발끈발끈, 그 성질부터 고쳐야 해.
|
||||
체허영 소화 안 뒈곡 헐 때 바농땡이로 손 따민 내려가메. 체/NNG ᄒᆞ/XSV 엉/EC 소화/NNG 안/MAG 뒈__3/VV 곡/EC ᄒᆞ/VV ᆯ/ETM 때/NNG 바농땡이__1/NNG 로/JKB 손/NNG 따/VV 민/EC 내려가/VV 메/EF ./SF 체해서 소화 안 되고 할 때 바늘로 손을 따면 내려가지.
|
||||
그 사름은 경 심보가 나쁘난 일찍 뒈여졋주. 그/MM 사름__2/NNG 은/JX 경__1/MAG 심보/NNG 가/JKS 나쁘/VA 난/EC 일찍/MAG 뒈여지__1/VV 엇/EP 주/EF ./SF 그 사람은 그리 심보가 나쁘니까 일찍 죽었지.
|
||||
밥맛도 엇고 ᄒᆞ영 그냥 물에 잠아먹엇주게. 밥맛/NNG 도/JX 엇__2/VA 고/EC ᄒᆞ/VX 엉/EC 그냥/MAG 물/NNG 에/JKB 잠아먹__1/VV 엇/EP 주/EF 게/EF ./SF 밥맛도 없고 해서 그냥 물에 말아먹었지요.
|
||||
|
|
|
|||
|
|
@ -17,9 +17,16 @@ namespace kiwi
|
|||
{
|
||||
struct CoNgramModelHeader
|
||||
{
|
||||
enum
|
||||
{
|
||||
hasOutputEmbBias = 1 << 0,
|
||||
hasReorderedVocab = 1 << 1,
|
||||
hasTrieFrequency = 1 << 2,
|
||||
};
|
||||
|
||||
uint64_t vocabSize, contextSize;
|
||||
uint16_t dim;
|
||||
uint8_t contextType, outputType;
|
||||
uint16_t flags;
|
||||
uint8_t keySize, windowSize, qbit, qgroup;
|
||||
uint64_t numNodes;
|
||||
uint64_t nodeOffset, keyOffset, valueOffset, embOffset;
|
||||
|
|
@ -34,6 +41,27 @@ namespace kiwi
|
|||
uint32_t nextOffset = 0;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Node<uint16_t, uint32_t, int32_t>
|
||||
{
|
||||
uint16_t numNexts = 0;
|
||||
uint16_t depth = 0;
|
||||
uint32_t value = 0;
|
||||
int32_t lower = 0;
|
||||
uint32_t nextOffset = 0;
|
||||
};
|
||||
|
||||
/**
 * Type trait reporting whether node type T carries a `depth` field.
 * The primary template answers false for every type; node types that do
 * carry the field opt in through a specialization.
 */
template<class T>
struct HasDepthField : public std::false_type
{
};
|
||||
|
||||
template<>
|
||||
struct HasDepthField<Node<uint16_t, uint32_t, int32_t>> : public std::true_type
|
||||
{
|
||||
};
|
||||
|
||||
|
||||
class CoNgramModelBase : public ILangModel
|
||||
{
|
||||
protected:
|
||||
|
|
@ -61,7 +89,12 @@ namespace kiwi
|
|||
virtual size_t predictWordsFromContextDiff(uint32_t contextId, uint32_t bgContextId, float weight, size_t topN, std::pair<uint32_t, float>* output) const = 0;
|
||||
|
||||
virtual uint32_t toContextId(const uint32_t* vocabIds, size_t size) const = 0;
|
||||
virtual float getContextFrequency(uint32_t contextId) const = 0;
|
||||
virtual float getContextEntropy(uint32_t contextId) const = 0;
|
||||
virtual size_t getNodeDepth(uint32_t nodeId) const = 0;
|
||||
|
||||
virtual std::vector<std::vector<uint32_t>> getContextWordMap() const = 0;
|
||||
virtual float progressOneStep(int32_t& nodeIdx, uint32_t& contextIdx, uint32_t next) const = 0;
|
||||
|
||||
const std::vector<std::vector<uint32_t>>& getContextWordMapCached() const
|
||||
{
|
||||
|
|
@ -78,6 +111,9 @@ namespace kiwi
|
|||
bool reorderContextIdx = true,
|
||||
const std::vector<size_t>* selectedEmbIdx = nullptr);
|
||||
|
||||
static utils::MemoryObject buildChrModel(const std::string& contextDefinition, const std::string& embedding,
|
||||
size_t maxContextLength = -1, bool reorderContextIdx = true, bool eraseRedundantContexts = false);
|
||||
|
||||
static std::unique_ptr<CoNgramModelBase> create(utils::MemoryObject&& mem,
|
||||
ArchType archType = ArchType::none,
|
||||
bool useDistantTokens = false,
|
||||
|
|
|
|||
|
|
@ -136,4 +136,81 @@ namespace kiwi
|
|||
|
||||
std::vector<std::pair<std::vector<uint32_t>, size_t>> extractPrefixes(size_t minCnt, size_t maxLength, size_t numWorkers = 1, bool exclusiveCnt = false) const;
|
||||
};
|
||||
|
||||
/**
 * Character-level tokenizer interface: converts text to int32_t token ids
 * and back. Only declarations are given here; the id layout is fixed by
 * the Token enum.
 */
class ChrTokenizer
{
public:

    /**
     * Token id layout. `bos` and `eos` share id 0. The nine symbol tokens
     * take ids 1..9, so `hangulSyllableStart` is 10, `hangulCodaStart` is
     * 409, `asciiStart` is 436 and `max` is 530.
     */
    enum class Token : int32_t
    {
        bos = 0,
        eos = 0,
        sf, sp, ss, sso, ssc, se, so, sw, sh,
        hangulSyllableStart,
        // NOTE(review): 399 presumably covers the coda-less syllables (19 initials x 21 medials) — confirm
        hangulCodaStart = hangulSyllableStart + 399,
        // NOTE(review): 27 presumably covers the final consonants — confirm
        asciiStart = hangulCodaStart + 27,
        // NOTE(review): 94 presumably covers the printable ASCII characters — confirm
        max = asciiStart + 94,
    };

    /** Encodes a single code point. NOTE(review): presumably returns its token id — confirm. */
    size_t encodeOne(char32_t ch) const;

    /** Encodes `text` into `outBuf`, which holds `bufSize` slots. NOTE(review): presumably returns the number of tokens written — confirm. */
    size_t encode(std::string_view text, int32_t* outBuf, size_t bufSize) const;

    /** Decodes `tokenCnt` token ids from `tokenBuf` into a UTF-16 string. */
    std::u16string decode(const int32_t* tokenBuf, size_t tokenCnt) const;

    /** Returns the numeric value of Token::max. */
    size_t vocabSize() const { return static_cast<size_t>(Token::max); }
};
|
||||
|
||||
class ChrDataset
|
||||
{
|
||||
static constexpr int32_t nonVocab = -1;
|
||||
|
||||
HiddenMember<RaggedVector<int32_t>, sizeof(Vector<size_t>) * 2> sents;
|
||||
Vector<float> sentWeights, sentSampled;
|
||||
Vector<uint32_t> shuffledIdcs;
|
||||
Vector<uint32_t> nonLabelPrefixSizes;
|
||||
double totalWeight = 0.;
|
||||
size_t totalSampled = 0;
|
||||
std::unique_ptr<utils::ThreadPool> workers;
|
||||
float prefixDropoutProb = 0.f;
|
||||
std::mt19937_64 rng;
|
||||
utils::FrozenTrie<uint32_t, uint32_t> contextualMapper;
|
||||
size_t batchSize = 0;
|
||||
size_t causalContextSize = 0;
|
||||
size_t windowSize = 0;
|
||||
size_t currentSeed = 0;
|
||||
size_t consumedSents = 0;
|
||||
bool sampleWithoutWeights = false;
|
||||
|
||||
template<class InTy, class OutTy>
|
||||
size_t _next(InTy in, OutTy out);
|
||||
|
||||
public:
|
||||
ChrDataset(size_t _batchSize = 0,
|
||||
size_t _causalContextSize = 0,
|
||||
size_t _windowSize = 0,
|
||||
float _prefixDropoutProb = 0.f,
|
||||
bool _sampleWithoutWeights = false,
|
||||
const std::vector<std::pair<size_t, std::vector<uint32_t>>>& contextualMapper = {}
|
||||
);
|
||||
~ChrDataset();
|
||||
ChrDataset(const ChrDataset&) = delete;
|
||||
ChrDataset(ChrDataset&&) /*noexcept*/;
|
||||
ChrDataset& operator=(const ChrDataset&) = delete;
|
||||
ChrDataset& operator=(ChrDataset&&) /*noexcept*/;
|
||||
|
||||
void addSentence(std::string_view sentence, float weight = 1.f, std::string_view nonLabelPrefix = {});
|
||||
|
||||
size_t numSents() const;
|
||||
|
||||
double getTotalWeight() const { return totalWeight; }
|
||||
size_t getBatchSize() const { return batchSize; }
|
||||
size_t getCausalContextSize() const { return causalContextSize; }
|
||||
size_t getWindowSize() const { return windowSize; }
|
||||
size_t vocabSize() const { return ChrTokenizer{}.vocabSize(); }
|
||||
std::vector<float> getVocabProbs(double epsilon = 0.1) const;
|
||||
|
||||
void seed(size_t newSeed);
|
||||
void reset();
|
||||
size_t next(int32_t* in, int32_t* out);
|
||||
size_t next(int64_t* in, int64_t* out);
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, double>> extractPrefixes(float resolution, float minWeight, size_t maxLength, size_t numWorkers = 1, bool exclusiveCnt = false) const;
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
* @file Form.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief 형태 및 형태소에 관한 정보를 담는 구조체들이 선언된 헤더
|
||||
* @version 0.22.1
|
||||
* @date 2025-11-21
|
||||
* @version 0.23.1
|
||||
* @date 2026-04-05
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
|
@ -238,6 +238,9 @@ namespace kiwi
|
|||
uint8_t formHash = 0;
|
||||
uint8_t zCodaAppendable : 1;
|
||||
uint8_t zSiotAppendable : 1;
|
||||
uint8_t hasJClass : 1;
|
||||
uint8_t hasAnyFullMorphemes : 1;
|
||||
uint8_t _reserved : 4;
|
||||
Dialect dialect = Dialect::standard;
|
||||
|
||||
Form();
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ namespace kiwi
|
|||
struct Node
|
||||
{
|
||||
Key numNexts = 0;
|
||||
uint16_t depth = 0;
|
||||
Diff lower = 0;
|
||||
uint32_t nextOffset = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
* @file Kiwi.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief Kiwi C++ API를 담고 있는 헤더 파일
|
||||
* @version 0.22.1
|
||||
* @date 2025-11-21
|
||||
* @version 0.23.1
|
||||
* @date 2026-04-05
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
|
@ -68,21 +68,69 @@ namespace kiwi
|
|||
|
||||
struct AnalyzeOption
|
||||
{
|
||||
/**
|
||||
* @brief 형태소 분석 시 매칭 옵션
|
||||
*/
|
||||
Match match = Match::allWithNormalizing;
|
||||
|
||||
/**
|
||||
* @brief 분석 결과에서 제외할 형태소 목록
|
||||
*/
|
||||
const std::unordered_set<const Morpheme*>* blocklist = nullptr;
|
||||
|
||||
/**
|
||||
* @brief 열린 결말 허용 여부
|
||||
*/
|
||||
bool openEnding = false;
|
||||
|
||||
/**
|
||||
* @brief 분석에 허용할 방언을 설정한다. 기본값은 표준어만 허용하는 `Dialect::standard`이다. 여러 방언을 허용하려면 비트 OR 연산자로 조합해서 설정할 수 있다.
|
||||
*/
|
||||
Dialect allowedDialects = Dialect::standard;
|
||||
|
||||
/**
|
||||
* @brief 방언에 적용할 페널티 비용.
|
||||
*/
|
||||
float dialectCost = 3.f;
|
||||
|
||||
/**
|
||||
* @brief 오타 교정에 사용할 PreparedTypoTransformer 객체
|
||||
* 이 객체는 TypoTransformer::prepare(true)로 생성되어야 한다.
|
||||
* allowedDialects에 방언을 포함하고 있고, typoTransformer가 nullptr인 경우 자동으로 getDefaultPreparedTypoSet(DefaultTypoSet::dialect)에서 생성된 PreparedTypoTransformer가 사용된다.
|
||||
*/
|
||||
const PreparedTypoTransformer* typoTransformer = nullptr;
|
||||
|
||||
/**
|
||||
* @brief 오타 교정 시 허용할 최대 비용 임계값
|
||||
*/
|
||||
float typoThreshold = 2.5f;
|
||||
|
||||
AnalyzeOption() = default;
|
||||
AnalyzeOption(Match m,
|
||||
const std::unordered_set<const Morpheme*>* bl = nullptr,
|
||||
bool oe = false,
|
||||
Dialect ad = Dialect::standard,
|
||||
float dc = 3.f
|
||||
)
|
||||
: match{ m }, blocklist{ bl }, openEnding{ oe }, allowedDialects{ ad }, dialectCost{ dc }
|
||||
float dc = 3.f,
|
||||
const PreparedTypoTransformer* tt = nullptr,
|
||||
float tth = 2.5f
|
||||
)
|
||||
: match{ m }, blocklist{ bl }, openEnding{ oe }, allowedDialects{ ad }, dialectCost{ dc }, typoTransformer{ tt }, typoThreshold{ tth }
|
||||
{}
|
||||
|
||||
AnalyzeOption withMatch(Match m) const
|
||||
{
|
||||
AnalyzeOption copy = *this;
|
||||
copy.match = m;
|
||||
return copy;
|
||||
}
|
||||
|
||||
AnalyzeOption withTypoTransformer(const PreparedTypoTransformer* tt, float typoThreshold = 2.5f) const
|
||||
{
|
||||
AnalyzeOption copy = *this;
|
||||
copy.typoTransformer = tt;
|
||||
copy.typoThreshold = typoThreshold;
|
||||
return copy;
|
||||
}
|
||||
};
|
||||
|
||||
struct MorphemeDef
|
||||
|
|
@ -103,12 +151,19 @@ namespace kiwi
|
|||
{
|
||||
bool integrateAllomorph = true;
|
||||
float cutOffThreshold = 8;
|
||||
float unkFormScoreScale = 5;
|
||||
float unkFormScoreBias = 5;
|
||||
float oovRuleScale = 4;
|
||||
float oovRuleBias = 4;
|
||||
float oovChrBias = 0;
|
||||
float oovGlobalWeight = 35;
|
||||
float oovLocalWeight = 3;
|
||||
float oovGlobalMinFreq = 4;
|
||||
float spacePenalty = 7;
|
||||
float typoCostWeight = 6;
|
||||
uint32_t maxUnkFormSize = 6;
|
||||
uint32_t maxUnkFormSizeFollowedByJClass = (uint32_t)-1;
|
||||
uint32_t spaceTolerance = 0;
|
||||
float oovCutOffThreshold = 5;
|
||||
float oovTotalSmoothness = 0.1f;
|
||||
|
||||
void validate() const;
|
||||
};
|
||||
|
|
@ -121,7 +176,7 @@ namespace kiwi
|
|||
{
|
||||
friend class KiwiBuilder;
|
||||
template<class LangModel> friend struct BestPathFinder;
|
||||
template<class LmState, class> friend struct PathEvaluator;
|
||||
template<class WordLL, class> friend struct PathEvaluator;
|
||||
template<class LmState> friend struct MorphemeEvaluator;
|
||||
friend class cmb::AutoJoiner;
|
||||
template<template<ArchType> class LmState> friend struct NewAutoJoinerGetter;
|
||||
|
|
@ -141,6 +196,7 @@ namespace kiwi
|
|||
Vector<TypoForm> typoForms;
|
||||
utils::FrozenTrie<kchar_t, const Form*> formTrie;
|
||||
std::shared_ptr<lm::ILangModel> langMdl;
|
||||
std::shared_ptr<lm::CoNgramModelBase> nounChrMdl;
|
||||
std::shared_ptr<cmb::CompiledRule> combiningRule;
|
||||
std::unique_ptr<utils::ThreadPool> pool;
|
||||
|
||||
|
|
@ -232,7 +288,7 @@ namespace kiwi
|
|||
const std::optional<KiwiConfig>& overrideConfig = {}
|
||||
) const
|
||||
{
|
||||
return analyze(str, 1, option, pretokenized)[0];
|
||||
return analyze(str, 1, option, pretokenized, overrideConfig)[0];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -249,7 +305,7 @@ namespace kiwi
|
|||
{
|
||||
std::vector<size_t> bytePositions;
|
||||
auto u16str = utf8To16(str, bytePositions);
|
||||
return analyze(u16str, option, mapPretokenizedSpansToU16(pretokenized, bytePositions));
|
||||
return analyze(u16str, option, mapPretokenizedSpansToU16(pretokenized, bytePositions), overrideConfig);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -280,7 +336,7 @@ namespace kiwi
|
|||
{
|
||||
std::vector<size_t> bytePositions;
|
||||
auto u16str = utf8To16(str, bytePositions);
|
||||
return analyze(u16str, topN, option, mapPretokenizedSpansToU16(pretokenized, bytePositions));
|
||||
return analyze(u16str, topN, option, mapPretokenizedSpansToU16(pretokenized, bytePositions), overrideConfig);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -537,6 +593,7 @@ namespace kiwi
|
|||
Vector<MorphemeRaw> morphemes;
|
||||
UnorderedMap<KString, size_t> formMap;
|
||||
std::shared_ptr<lm::ILangModel> langMdl;
|
||||
std::shared_ptr<lm::CoNgramModelBase> nounChrMdl;
|
||||
std::shared_ptr<cmb::CompiledRule> combiningRule;
|
||||
WordDetector detector;
|
||||
Map<int, int> ruleProfilingCnt;
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
#define KIWI_STR(x) KIWI_STR_HELPER(x)
|
||||
|
||||
#define KIWI_VERSION_MAJOR 0
|
||||
#define KIWI_VERSION_MINOR 22
|
||||
#define KIWI_VERSION_MINOR 23
|
||||
#define KIWI_VERSION_PATCH 2
|
||||
|
||||
#define KIWI_VERSION_STRING KIWI_STR(KIWI_VERSION_MAJOR) "." KIWI_STR(KIWI_VERSION_MINOR) "." KIWI_STR(KIWI_VERSION_PATCH)
|
||||
|
|
|
|||
|
|
@ -16,6 +16,14 @@ namespace kiwi
|
|||
mention = 1 << 3, /**< 멘션 형태의 텍스트(@멘션)를 w_mention 태그에 매칭한다 */
|
||||
serial = 1 << 4, /**< 일련 번호 형태의 텍스트를 w_serial 태그에 매칭한다 */
|
||||
emoji = 1 << 5, /**< 이모지 문자를 w_emoji 태그에 매칭한다 */
|
||||
|
||||
oovRuleOnly = 0 << 8, /**< 사전에 없는 단어에 대해 규칙만을 사용하여 매칭한다 */
|
||||
oovChrModel = 1 << 8, /**< 사전에 없는 단어에 대해 문자 기반 OOV 모델을 사용하여 매칭한다 */
|
||||
oovChrFreqModel = 2 << 8, /**< 사전에 없는 단어에 대해 문자 빈도 기반 OOV 모델을 사용하여 매칭한다 */
|
||||
oovChrFreqBranchModel = 3 << 8, /**< 사전에 없는 단어에 대해 문자 빈도 및 브랜치 기반 OOV 모델을 사용하여 매칭한다 */
|
||||
oovMask = 3 << 8, /**< OOV 옵션들에 대한 마스크 */
|
||||
oovTotalConsistency = 1 << 10, /**< */
|
||||
|
||||
normalizeCoda = 1 << 16, /**< 초성체가 앞 어절의 받침에 따라붙은 경우를 정규화하여 매칭한다 */
|
||||
joinNounPrefix = 1 << 17, /**< 체언접두사(XPN)를 분리하지 않고 합쳐서 매칭한다 */
|
||||
joinNounSuffix = 1 << 18, /**< 명사파생접미사(XSN)를 분리하지 않고 합쳐서 매칭한다 */
|
||||
|
|
@ -27,6 +35,10 @@ namespace kiwi
|
|||
compatibleJamo = 1 << 24, /**< 출력시 한글 첫가끝 자모를 호환가능한 자모로 변환한다. */
|
||||
splitSaisiot = 1 << 25, /**< 사이시옷이 포함된 합성명사를 분리하여 매칭한다. */
|
||||
mergeSaisiot = 1 << 26, /**< 사이시옷이 포함된 것으로 추정되는 명사를 결합하여 매칭한다. */
|
||||
joinParticleYo = 1 << 27, /**< 어미(EC/EF)와 조사 "요/JX"를 통합하여 매칭한다 (예: 고/EC + 요/JX => 고요/EC) */
|
||||
|
||||
useOldSplitter = 1 << 30,
|
||||
|
||||
joinVSuffix = joinVerbSuffix | joinAdjSuffix,
|
||||
joinAffix = joinNounPrefix | joinNounSuffix | joinVerbSuffix | joinAdjSuffix | joinAdvSuffix,
|
||||
all = url | email | hashtag | mention | serial | emoji | zCoda,
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
/**
|
||||
* @file SwTokenizer.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief Subword Tokenizer
|
||||
* @version 0.22.1
|
||||
* @date 2025-11-21
|
||||
* @brief 형태소 기반의 Subword Tokenizer의 인터페이스를 담고 있는 헤더 파일
|
||||
* @version 0.23.1
|
||||
* @date 2026-04-05
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
* @file Types.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief Kiwi C++ API에 쓰이는 주요 타입들을 모아놓은 헤더 파일
|
||||
* @version 0.22.1
|
||||
* @date 2025-11-21
|
||||
* @version 0.23.1
|
||||
* @date 2026-04-05
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
|
@ -31,23 +31,23 @@
|
|||
#include "ScriptType.h"
|
||||
|
||||
#define KIWI_DEFINE_ENUM_FLAG_OPERATORS(Type) \
|
||||
inline Type operator~(Type a)\
|
||||
inline constexpr Type operator~(Type a)\
|
||||
{\
|
||||
return static_cast<Type>(~static_cast<typename std::underlying_type<Type>::type>(a));\
|
||||
}\
|
||||
inline bool operator!(Type a)\
|
||||
inline constexpr bool operator!(Type a)\
|
||||
{\
|
||||
return a == static_cast<Type>(0);\
|
||||
}\
|
||||
inline Type operator|(Type a, Type b)\
|
||||
inline constexpr Type operator|(Type a, Type b)\
|
||||
{\
|
||||
return static_cast<Type>(static_cast<typename std::underlying_type<Type>::type>(a) | static_cast<typename std::underlying_type<Type>::type>(b));\
|
||||
}\
|
||||
inline Type operator&(Type a, Type b)\
|
||||
inline constexpr Type operator&(Type a, Type b)\
|
||||
{\
|
||||
return static_cast<Type>(static_cast<typename std::underlying_type<Type>::type>(a) & static_cast<typename std::underlying_type<Type>::type>(b));\
|
||||
}\
|
||||
inline Type operator^(Type a, Type b)\
|
||||
inline constexpr Type operator^(Type a, Type b)\
|
||||
{\
|
||||
return static_cast<Type>(static_cast<typename std::underlying_type<Type>::type>(a) ^ static_cast<typename std::underlying_type<Type>::type>(b));\
|
||||
}\
|
||||
|
|
@ -267,7 +267,9 @@ namespace kiwi
|
|||
non_vowel, /**< `vowel`의 부정 */
|
||||
non_vocalic, /**< `vocalic`의 부정 */
|
||||
non_vocalic_h, /**< `vocalic_h`의 부정 */
|
||||
applosive = 8, /**< 불파음 받침(ㄴㄹㅁㅇ을 제외한 모든 받침)*/ // not necessary, but fixed MSVC's weird bug
|
||||
applosive, /**< 오타 교정용: 불파음 받침(ㄴㄹㅁㅇ을 제외한 모든 받침) */
|
||||
continual, /**< 오타 교정용: 연철 환경임을 표시 */
|
||||
boundary, /**< 오타 교정용: 형태소 경계임을 표시 */
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -349,7 +351,7 @@ namespace kiwi
|
|||
uint16_t length = 0; /**< 길이(UTF16 문자 기준) */
|
||||
POSTag tag = POSTag::unknown; /**< 품사 태그 */
|
||||
union {
|
||||
uint8_t senseId = 0; /**< 의미 번호 */
|
||||
uint8_t senseId = 0; /**< 의미 번호 (OOV인 경우 -1)*/
|
||||
ScriptType script; /**< 유니코드 영역에 기반한 문자 타입 */
|
||||
};
|
||||
float score = 0; /**< 해당 형태소의 언어모델 점수 */
|
||||
|
|
@ -358,7 +360,7 @@ namespace kiwi
|
|||
uint32_t pairedToken = -1; /**< SSO, SSC 태그에 속하는 형태소의 경우 쌍을 이루는 반대쪽 형태소의 위치(-1인 경우 해당하는 형태소가 없는 것을 뜻함) */
|
||||
uint32_t subSentPosition = 0; /**< 인용부호나 괄호로 둘러싸인 하위 문장의 번호. 1부터 시작. 0인 경우 하위 문장이 아님을 뜻함 */
|
||||
Dialect dialect = Dialect::standard; /**< 방언 정보 */
|
||||
const Morpheme* morph = nullptr; /**< 기타 형태소 정보에 대한 포인터 (OOV인 경우 nullptr) */
|
||||
const Morpheme* morph = nullptr; /**< 기타 형태소 정보에 대한 포인터 */
|
||||
|
||||
TokenInfo() = default;
|
||||
|
||||
|
|
@ -384,6 +386,8 @@ namespace kiwi
|
|||
}
|
||||
|
||||
uint32_t endPos() const { return position + length; }
|
||||
|
||||
bool isOOV() const { return senseId == (uint8_t)(-1); }
|
||||
};
|
||||
|
||||
struct BasicToken
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
* @file TypoTransformer.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief 오타 교정에 사용되는 TypoTransformer 및 관련 클래스들을 정의합니다.
|
||||
* @version 0.22.1
|
||||
* @date 2025-11-21
|
||||
* @version 0.23.1
|
||||
* @date 2026-04-05
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
|
@ -127,6 +127,34 @@ namespace kiwi
|
|||
class KiwiBuilder;
|
||||
class TypoTransformer;
|
||||
|
||||
struct TypoGraphNode
|
||||
{
|
||||
U16StringView form;
|
||||
uint32_t endPos = 0;
|
||||
float typoCost = 0;
|
||||
uint32_t prevOffset = 0;
|
||||
uint32_t siblingOffset = 0;
|
||||
uint8_t continualTypoIdx = 0;
|
||||
Dialect dialect = Dialect::standard;
|
||||
|
||||
TypoGraphNode(U16StringView _form = {}, uint32_t _endPos = 0, float _typoCost = 0, uint32_t _prevOffset = 0, uint32_t _siblingOffset = 0)
|
||||
: form{ _form }, endPos{ _endPos }, typoCost{ _typoCost }, prevOffset{ _prevOffset }, siblingOffset{ _siblingOffset }
|
||||
{
|
||||
}
|
||||
|
||||
const TypoGraphNode* getPrev() const
|
||||
{
|
||||
if (!prevOffset) return nullptr;
|
||||
return this - prevOffset;
|
||||
}
|
||||
|
||||
const TypoGraphNode* getSibling() const
|
||||
{
|
||||
if (!siblingOffset) return nullptr;
|
||||
return this + siblingOffset;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief 오타 생성 및 교정 준비가 완료된 오타 생성기. kiwi::TypoTransformer::prepare()로부터 생성됩니다.
|
||||
*/
|
||||
|
|
@ -191,7 +219,7 @@ namespace kiwi
|
|||
|
||||
public:
|
||||
PreparedTypoTransformer();
|
||||
PreparedTypoTransformer(const TypoTransformer& tt);
|
||||
PreparedTypoTransformer(const TypoTransformer& tt, bool inverse = false);
|
||||
~PreparedTypoTransformer();
|
||||
PreparedTypoTransformer(const PreparedTypoTransformer&) = delete;
|
||||
PreparedTypoTransformer(PreparedTypoTransformer&&) noexcept;
|
||||
|
|
@ -217,6 +245,13 @@ namespace kiwi
|
|||
* @param costThreshold 생성할 오타 후보의 비용 상한
|
||||
*/
|
||||
TypoCandidates<true> generate(const std::u16string& orig, float costThreshold = 2.5f) const;
|
||||
|
||||
template<class Alloc>
|
||||
size_t generateGraph(U16StringView normalizedStr, std::vector<TypoGraphNode, Alloc>& graphOut,
|
||||
Dialect allowedDialect = Dialect::standard,
|
||||
const std::pair<uint32_t, uint32_t>* pretokenizedFirst = nullptr,
|
||||
const std::pair<uint32_t, uint32_t>* pretokenizedLast = nullptr,
|
||||
size_t* maxContinualTypoIdxOut = nullptr) const;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -389,10 +424,12 @@ namespace kiwi
|
|||
/**
|
||||
* @brief 현재 TypoTransformer를 사용하여 PreparedTypoTransformer를 생성합니다.
|
||||
* PreparedTypoTransformer는 실제로 오타를 생성하거나 kiwi::KiwiBuilder에 전달되어 오타 교정에 사용될 수 있습니다.
|
||||
*
|
||||
* @param inverse false일 경우 원본을 오타로 변환하는 변환기를, true일 경우 오타를 원본으로 변환하는 변환기를 생성합니다. 기본값은 false입니다.
|
||||
*/
|
||||
PreparedTypoTransformer prepare() const
|
||||
PreparedTypoTransformer prepare(bool inverse = false) const
|
||||
{
|
||||
return { *this };
|
||||
return { *this, inverse };
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -413,4 +450,6 @@ namespace kiwi
|
|||
* @param set 사용할 기본 내장 오타 생성기의 종류
|
||||
*/
|
||||
const TypoTransformer& getDefaultTypoSet(DefaultTypoSet set);
|
||||
|
||||
const PreparedTypoTransformer* getDefaultPreparedTypoSet(DefaultTypoSet set);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -131,6 +131,39 @@ namespace kiwi
|
|||
return os << utf16To8({ str.begin(), str.end() });
|
||||
}
|
||||
|
||||
template<class Str, class It>
|
||||
inline void normalizeHangul(Str& ret, It first, It last)
|
||||
{
|
||||
ret.reserve((size_t)(std::distance(first, last) * 1.5));
|
||||
for (; first != last; ++first)
|
||||
{
|
||||
char16_t c = *first;
|
||||
if (c == 0xB42C) c = 0xB410; // '됬'을 '됐'으로 강제교정
|
||||
if (isHangulSyllable(c))
|
||||
{
|
||||
int coda = (c - 0xAC00) % 28;
|
||||
ret.push_back(c - coda);
|
||||
if (coda) ret.push_back(coda + 0x11A7);
|
||||
}
|
||||
else if (!ret.empty() && isHangulOnset(ret.back())
|
||||
&& 0x1161 <= c && c < 0x1176)
|
||||
{
|
||||
// 첫가끝 초성 + 중성 중 현대한글 음절로 가능한 것은 결합
|
||||
ret.back() = (char16_t)(0xAC00 + ((ret.back() - 0x1100) * 21 * 28) + ((c - 0x1161) * 28));
|
||||
}
|
||||
else
|
||||
{
|
||||
ret.push_back(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class Str>
|
||||
inline void normalizeHangul(Str& ret, std::u16string_view sv)
|
||||
{
|
||||
normalizeHangul(ret, sv.begin(), sv.end());
|
||||
}
|
||||
|
||||
template<class It>
|
||||
inline std::u16string joinHangul(It first, It last)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -2,8 +2,8 @@
|
|||
* @file capi.h
|
||||
* @author bab2min (bab2min@gmail.com)
|
||||
* @brief Kiwi C API를 담고 있는 헤더 파일
|
||||
* @version 0.22.1
|
||||
* @date 2025-11-21
|
||||
* @version 0.23.1
|
||||
* @date 2026-04-05
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
|
@ -35,6 +35,7 @@ typedef struct kiwi_joiner* kiwi_joiner_h;
|
|||
typedef struct kiwi_typo* kiwi_typo_h;
|
||||
typedef struct kiwi_morphset* kiwi_morphset_h;
|
||||
typedef struct kiwi_pretokenized* kiwi_pretokenized_h;
|
||||
typedef struct kiwi_prepared_typo* kiwi_prepared_typo_h;
|
||||
typedef unsigned short kchar16_t;
|
||||
|
||||
typedef struct kiwi_swtokenizer* kiwi_swtokenizer_h;
|
||||
|
|
@ -71,11 +72,16 @@ typedef struct {
|
|||
typedef struct {
|
||||
uint8_t integrate_allomorph; /**< 이형태 형태소의 통합 여부 */
|
||||
float cut_off_threshold; /**< 분석 과정에서 이 값보다 더 크게 차이가 나는 후보들은 제거합니다. */
|
||||
float unk_form_score_scale; /**< 미등재 형태 추출 시 사용하는 기울기 값 */
|
||||
float unk_form_score_bias; /**< 미등재 형태 추출 시 사용하는 편향 값 */
|
||||
float oov_rule_scale; /**< 미등재 형태 추출 시 사용하는 기울기 값 */
|
||||
float oov_rule_bias; /**< 미등재 형태 추출 시 사용하는 편향 값 */
|
||||
float oov_chr_bias; /**< 미등재 형태 추출 시 사용하는 문자 기반 점수의 편향 값 */
|
||||
float oov_global_weight; /**< 미등재 형태 추출 시 사용하는 전역 빈도 가중치 */
|
||||
float oov_local_weight; /**< 미등재 형태 추출 시 사용하는 국부 빈도 가중치 */
|
||||
float oov_global_min_freq; /**< 미등재 형태 추출 시 사용하는 전역 최소 빈도 */
|
||||
float space_penalty; /**< 공백 패널티 */
|
||||
float typo_cost_weight; /**< 오타 비용의 가중치 */
|
||||
uint32_t max_unk_form_size; /**< 미등재 형태의 최대 크기 */
|
||||
uint32_t max_unk_form_size_followed_by_j_class; /**< (조사가 뒤따르는 경우) 미등재 형태의 최대 크기 */
|
||||
uint32_t space_tolerance; /**< 공백 허용치 */
|
||||
} kiwi_config_t;
|
||||
|
||||
|
|
@ -171,11 +177,18 @@ enum
|
|||
|
||||
enum
|
||||
{
|
||||
KIWI_MATCH_URL = 1,
|
||||
KIWI_MATCH_EMAIL = 2,
|
||||
KIWI_MATCH_HASHTAG = 4,
|
||||
KIWI_MATCH_MENTION = 8,
|
||||
KIWI_MATCH_SERIAL = 16,
|
||||
KIWI_MATCH_URL = 1 << 0,
|
||||
KIWI_MATCH_EMAIL = 1 << 1,
|
||||
KIWI_MATCH_HASHTAG = 1 << 2,
|
||||
KIWI_MATCH_MENTION = 1 << 3,
|
||||
KIWI_MATCH_SERIAL = 1 << 4,
|
||||
KIWI_MATCH_EMOJI = 1 << 5,
|
||||
|
||||
KIWI_MATCH_OOV_RULE_ONLY = 0 << 8,
|
||||
KIWI_MATCH_OOV_CHR_MODEL = 1 << 8,
|
||||
KIWI_MATCH_OOV_CHR_FREQ_MODEL = 2 << 8,
|
||||
KIWI_MATCH_OOV_CHR_FREQ_BRANCH_MODEL = 3 << 8,
|
||||
KIWI_MATCH_OOV_MASK = 3 << 8,
|
||||
|
||||
KIWI_MATCH_NORMALIZE_CODA = 1 << 16,
|
||||
KIWI_MATCH_JOIN_NOUN_PREFIX = 1 << 17,
|
||||
|
|
@ -190,8 +203,10 @@ enum
|
|||
KIWI_MATCH_COMPATIBLE_JAMO = 1 << 24,
|
||||
KIWI_MATCH_SPLIT_SAISIOT = 1 << 25,
|
||||
KIWI_MATCH_MERGE_SAISIOT = 1 << 26,
|
||||
KIWI_MATCH_JOIN_PARTICLE_YO = 1 << 27,
|
||||
KIWI_MATCH_USE_OLD_SPLITTER = 1 << 30,
|
||||
|
||||
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_Z_CODA,
|
||||
KIWI_MATCH_ALL = KIWI_MATCH_URL | KIWI_MATCH_EMAIL | KIWI_MATCH_HASHTAG | KIWI_MATCH_MENTION | KIWI_MATCH_SERIAL | KIWI_MATCH_EMOJI | KIWI_MATCH_Z_CODA,
|
||||
KIWI_MATCH_ALL_WITH_NORMALIZING = KIWI_MATCH_ALL | KIWI_MATCH_NORMALIZE_CODA,
|
||||
};
|
||||
|
||||
|
|
@ -307,6 +322,33 @@ DECL_DLL int kiwi_builder_add_word(kiwi_builder_h handle, const char* word, cons
|
|||
*/
|
||||
DECL_DLL int kiwi_builder_add_alias_word(kiwi_builder_h handle, const char* alias, const char* pos, float score, const char* orig_word);
|
||||
|
||||
/**
|
||||
* @brief 사용자 형태소를 추가합니다. 의미 번호와 방언 정보를 지정할 수 있습니다.
|
||||
*
|
||||
* @param handle KiwiBuilder의 핸들.
|
||||
* @param word 추가할 형태소 (utf-8).
|
||||
* @param pos 품사 태그 (kiwi#POSTag).
|
||||
* @param sense_id 의미 번호.
|
||||
* @param dialect 방언 정보. KIWI_DIALECT_* 열거형을 참조하십시오.
|
||||
* @param score 점수.
|
||||
* @return 성공 시 0를 반환합니다.
|
||||
*/
|
||||
DECL_DLL int kiwi_builder_add_word_with_def(kiwi_builder_h handle, const char* word, const char* pos, int sense_id, int dialect, float score);
|
||||
|
||||
/**
|
||||
* @brief 원본 형태소를 기반으로하는 새 형태소를 추가합니다. 의미 번호와 방언 정보를 지정할 수 있습니다.
|
||||
*
|
||||
* @param handle KiwiBuilder의 핸들.
|
||||
* @param alias 새 형태소 (utf-8)
|
||||
* @param pos 품사 태그 (kiwi#POSTag).
|
||||
* @param sense_id 의미 번호.
|
||||
* @param dialect 방언 정보. KIWI_DIALECT_* 열거형을 참조하십시오.
|
||||
* @param score 점수.
|
||||
* @param orig_word 원 형태소 (utf-8)
|
||||
* @return 성공 시 0를 반환합니다.
|
||||
*/
|
||||
DECL_DLL int kiwi_builder_add_alias_word_with_def(kiwi_builder_h handle, const char* alias, const char* pos, int sense_id, int dialect, float score, const char* orig_word);
|
||||
|
||||
/**
|
||||
* @brief 기분석 형태소열을 추가합니다.
|
||||
* 불규칙적으로 분석되어야하는 패턴을 추가하는 데 용이합니다.
|
||||
|
|
@ -446,6 +488,7 @@ enum
|
|||
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL = 3,
|
||||
KIWI_TYPO_LENGTHENING_TYPO_SET = 4,
|
||||
KIWI_TYPO_BASIC_TYPO_SET_WITH_CONTINUAL_AND_LENGTHENING = 5,
|
||||
KIWI_TYPO_DIALECT = 6,
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -526,15 +569,34 @@ DECL_DLL int kiwi_typo_set_lengthening_typo_cost(kiwi_typo_h handle, float thres
|
|||
*/
|
||||
DECL_DLL int kiwi_typo_close(kiwi_typo_h handle);
|
||||
|
||||
/**
|
||||
* @brief 오타 교정기로부터 준비된 오타 교정기를 생성합니다.
|
||||
*
|
||||
* @param handle 오타 교정기의 핸들
|
||||
* @return 성공 시 준비된 오타 교정기의 핸들을 반환합니다. 실패 시 null를 반환하고 에러 메세지를 설정합니다.
|
||||
*
|
||||
* @note 생성된 핸들은 kiwi_prepared_typo_close를 통해 반드시 해제되어야 합니다.
|
||||
*/
|
||||
DECL_DLL kiwi_prepared_typo_h kiwi_typo_prepare(kiwi_typo_h handle);
|
||||
|
||||
/**
|
||||
* @brief 사용이 끝난 준비된 오타 교정기를 해제합니다.
|
||||
*
|
||||
* @param handle 준비된 오타 교정기의 핸들
|
||||
* @return 성공 시 0를 반환합니다. 실패 시 음수를 반환하고 에러 메세지를 설정합니다.
|
||||
*/
|
||||
DECL_DLL int kiwi_prepared_typo_close(kiwi_prepared_typo_h handle);
|
||||
|
||||
/**
|
||||
* @brief KiwiBuilder를 거치지 않고 바로 Kiwi instance를 생성합니다.
|
||||
*
|
||||
* @param model_path 모델이 들어있는 디렉토리 경로 (e.g., ./models/base).
|
||||
* @param num_threads 사용할 쓰레드의 수 (-1일 경우, 자동으로 설정).
|
||||
* @param options 생성 옵션. KIWI_BUILD_* 참조.
|
||||
* @param enabled_dialects 활성화할 방언. KIWI_DIALECT_* 열거형을 참조하십시오.
|
||||
* @return Kiwi의 핸들.
|
||||
*/
|
||||
DECL_DLL kiwi_h kiwi_init(const char* model_path, int num_threads, int options);
|
||||
DECL_DLL kiwi_h kiwi_init(const char* model_path, int num_threads, int options, int enabled_dialects);
|
||||
|
||||
/**
|
||||
* @brief 글로벌 설정 값을 변경합니다.
|
||||
|
|
@ -603,6 +665,8 @@ typedef struct {
|
|||
int open_ending; /**< 마지막 형태소 다음 문장을 종결하지 않고 열린 상태로 끝낼지를 설정니다. 기본값은 0으로 마지막 형태소 다음 바로 문장을 종결합니다. */
|
||||
int allowed_dialects; /**< KIWI_DIALECT_* 열거형 참고 */
|
||||
float dialect_cost; /**< 방언 형태소에 추가되는 비용. 기본값은 3 */
|
||||
kiwi_prepared_typo_h typo_transformer; /**< 분석 시 사용할 오타 교정기. null인 경우 사용하지 않습니다. */
|
||||
float typo_threshold; /**< 오타 교정 비용 임계값. 기본값은 2.5 */
|
||||
} kiwi_analyze_option_t;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -132,9 +132,6 @@ NNB VCP,JK
|
|||
^터 이 테-1
|
||||
^것 이 게-1
|
||||
|
||||
NNB VCP,JK <충청>
|
||||
|
||||
|
||||
NP VCP,JK
|
||||
터 이 테-1
|
||||
것 이 게-1
|
||||
|
|
@ -235,22 +232,5 @@ E E
|
|||
리 어 려
|
||||
시 어 셔
|
||||
|
||||
NN JK <함경>
|
||||
ㅣ 이 ㅣ
|
||||
ㅐ 이 ㅐ
|
||||
ㅔ 이 ㅔ
|
||||
ㅚ 이 ㅚ
|
||||
ㅟ 이 ㅟ
|
||||
ㅏ 이 ㅐ
|
||||
ㅓ 이 ㅔ
|
||||
ㅘ 이 ㅙ
|
||||
ㅝ 이 ㅞ
|
||||
ㅜ 우 ㅜ
|
||||
ㅠ 우 ㅠ
|
||||
|
||||
EC VX <함경>
|
||||
디 애 대,댸
|
||||
|
||||
VX E <함경>
|
||||
#ᇂ 고 )(코
|
||||
#ᇂ 구 )(쿠
|
||||
|
|
|
|||
BIN
models/cong/base/nounchr.mdl
(Stored with Git LFS)
Normal file
BIN
models/cong/base/nounchr.mdl
(Stored with Git LFS)
Normal file
Binary file not shown.
|
|
@ -59,7 +59,7 @@ namespace kiwi
|
|||
static_cast<std::ptrdiff_t>(ArchType::sse4_1)
|
||||
#endif
|
||||
#if CPUINFO_ARCH_ARM64
|
||||
//static_cast<std::ptrdiff_t>(ArchType::neon)
|
||||
static_cast<std::ptrdiff_t>(ArchType::neon)
|
||||
#endif
|
||||
#else
|
||||
#ifdef KIWI_ARCH_X86_64
|
||||
|
|
@ -72,7 +72,7 @@ namespace kiwi
|
|||
static_cast<std::ptrdiff_t>(ArchType::sse4_1)
|
||||
#endif
|
||||
#ifdef KIWI_ARCH_ARM64
|
||||
//static_cast<std::ptrdiff_t>(ArchType::neon)
|
||||
static_cast<std::ptrdiff_t>(ArchType::neon)
|
||||
#endif
|
||||
#endif
|
||||
>;
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ ArchType kiwi::getBestArch()
|
|||
if (cpuinfo_has_x86_avx512vnni()) return ArchType::avx512vnni;
|
||||
if (cpuinfo_has_x86_avx512bw()) return ArchType::avx512bw;
|
||||
#ifdef KIWI_AVX_VNNI_SUPPORTED
|
||||
if (cpuinfo_has_x86_avx_vnni_int8()) return ArchType::avx_vnni;
|
||||
if (cpuinfo_has_x86_avxvnni()) return ArchType::avx_vnni;
|
||||
#endif
|
||||
if (cpuinfo_has_x86_avx2()) return ArchType::avx2;
|
||||
if (cpuinfo_has_x86_sse4_1()) return ArchType::sse4_1;
|
||||
|
|
|
|||
|
|
@ -5,9 +5,6 @@
|
|||
|
||||
namespace kiwi
|
||||
{
|
||||
template<class LmState>
|
||||
struct WordLL;
|
||||
|
||||
using Wid = uint32_t;
|
||||
|
||||
enum class PathEvaluatingMode
|
||||
|
|
@ -18,40 +15,50 @@ namespace kiwi
|
|||
top1,
|
||||
};
|
||||
|
||||
template<class LmState>
|
||||
template<class _LmState, bool _hasOovCounter = false>
|
||||
struct WordLL
|
||||
{
|
||||
using LmState = _LmState;
|
||||
static constexpr bool hasOovCounter = _hasOovCounter;
|
||||
|
||||
LmState lmState;
|
||||
|
||||
float accScore = 0, firstChunkScore = 0;
|
||||
uint32_t parent = 0;
|
||||
Wid wid = 0;
|
||||
uint16_t ownFormId = 0;
|
||||
uint8_t combineSocket = 0;
|
||||
uint8_t prevRootId = 0;
|
||||
SpecialState spState;
|
||||
uint8_t rootId = 0;
|
||||
|
||||
std::conditional_t<hasOovCounter, uint16_t, uint8_t> oovFlag = 0;
|
||||
std::conditional_t<hasOovCounter, uint32_t, uint8_t> oovCntArenaPtr = 0;
|
||||
|
||||
const Morpheme* morpheme = nullptr;
|
||||
float accScore = 0, firstChunkScore = 0, accTypoCost = 0, accDialectCost = 0;
|
||||
const WordLL* parent = nullptr;
|
||||
Wid wid = 0;
|
||||
uint16_t ownFormId = 0;
|
||||
uint8_t combineSocket = 0;
|
||||
|
||||
WordLL() = default;
|
||||
|
||||
WordLL(const Morpheme* _morph, float _accScore, float _firstChunkScore, float _accTypoCost, float _accDialectCost,
|
||||
const WordLL* _parent, LmState _lmState, SpecialState _spState)
|
||||
WordLL(const Morpheme* _morph, float _accScore, float _firstChunkScore,
|
||||
uint32_t _parent, LmState _lmState, SpecialState _spState, uint8_t _rootId,
|
||||
uint16_t _oovFlag = 0,
|
||||
uint32_t _oovCntArenaPtr = 0
|
||||
)
|
||||
: morpheme{ _morph },
|
||||
accScore{ _accScore },
|
||||
firstChunkScore{ _firstChunkScore },
|
||||
accTypoCost{ _accTypoCost },
|
||||
accDialectCost{ _accDialectCost },
|
||||
parent{ _parent },
|
||||
lmState{ _lmState },
|
||||
spState{ _spState },
|
||||
rootId{ _parent ? _parent->rootId : (uint8_t)0 }
|
||||
rootId{ _rootId },
|
||||
oovFlag{ (decltype(oovFlag))_oovFlag },
|
||||
oovCntArenaPtr{ (decltype(oovCntArenaPtr))_oovCntArenaPtr }
|
||||
{
|
||||
}
|
||||
|
||||
const WordLL* root() const
|
||||
const WordLL* root(const WordLL* base) const
|
||||
{
|
||||
if (parent) return parent->root();
|
||||
if (parent) return base[parent].root(base);
|
||||
else return this;
|
||||
}
|
||||
|
||||
|
|
@ -66,10 +73,10 @@ namespace kiwi
|
|||
}
|
||||
};
|
||||
|
||||
template<class LmState>
|
||||
struct Hash<WordLL<LmState>>
|
||||
template<class LmState, bool useOOVGlobalConsistency>
|
||||
struct Hash<WordLL<LmState, useOOVGlobalConsistency>>
|
||||
{
|
||||
size_t operator()(const WordLL<LmState>& p) const
|
||||
size_t operator()(const WordLL<LmState, useOOVGlobalConsistency>& p) const
|
||||
{
|
||||
size_t ret = Hash<LmState>{}(p.lmState);
|
||||
ret = *reinterpret_cast<const uint16_t*>(&p.prevRootId) ^ ((ret << 3) | (ret >> (sizeof(size_t) * 8 - 3)));
|
||||
|
|
@ -121,28 +128,33 @@ namespace kiwi
|
|||
|
||||
struct WordLLGreater
|
||||
{
|
||||
template<class LmState>
|
||||
bool operator()(const WordLL<LmState>& a, const WordLL<LmState>& b) const
|
||||
template<class WordLL>
|
||||
bool operator()(const WordLL& a, const WordLL& b) const
|
||||
{
|
||||
return a.accScore > b.accScore;
|
||||
}
|
||||
};
|
||||
|
||||
template<class LmState>
|
||||
inline std::ostream& printDebugPath(std::ostream& os, const WordLL<LmState>& src)
|
||||
template<class Vector, class FormVector>
|
||||
inline std::ostream& printDebugPath(std::ostream& os, Vector&& pathes, size_t pathIdx, FormVector&& ownFormList)
|
||||
{
|
||||
if (src.parent)
|
||||
auto& path = pathes[pathIdx];
|
||||
if (path.parent != pathIdx)
|
||||
{
|
||||
printDebugPath(os, *src.parent);
|
||||
printDebugPath(os, pathes, path.parent, ownFormList);
|
||||
}
|
||||
|
||||
if (src.morpheme) src.morpheme->print(os);
|
||||
if (path.morpheme)
|
||||
{
|
||||
if (path.ownFormId) os << utf16To8(joinHangul(ownFormList[path.ownFormId - 1].begin(), ownFormList[path.ownFormId - 1].end()));
|
||||
path.morpheme->print(os);
|
||||
}
|
||||
else os << "NULL";
|
||||
os << " , ";
|
||||
return os;
|
||||
}
|
||||
|
||||
template<PathEvaluatingMode mode, class LmState>
|
||||
template<PathEvaluatingMode mode, class WordLL>
|
||||
class BestPathConatiner;
|
||||
|
||||
template<PathEvaluatingMode mode>
|
||||
|
|
@ -151,12 +163,13 @@ namespace kiwi
|
|||
static constexpr size_t maxSize = -1;
|
||||
};
|
||||
|
||||
template<class LmState>
|
||||
class BestPathConatiner<PathEvaluatingMode::topN, LmState>
|
||||
template<class WordLL>
|
||||
class BestPathConatiner<PathEvaluatingMode::topN, WordLL>
|
||||
{
|
||||
using LmState = typename WordLL::LmState;
|
||||
// pair: [index, size]
|
||||
UnorderedMap<PathHash<LmState>, std::pair<uint32_t, uint32_t>> bestPathIndex;
|
||||
Vector<WordLL<LmState>> bestPathValues;
|
||||
Vector<WordLL> bestPathValues;
|
||||
public:
|
||||
|
||||
inline void clear()
|
||||
|
|
@ -166,15 +179,18 @@ namespace kiwi
|
|||
}
|
||||
|
||||
inline void insert(size_t topN, uint8_t prevRootId, uint8_t rootId,
|
||||
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
|
||||
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
|
||||
const Morpheme* morph, float accScore, float firstChunkScore,
|
||||
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
|
||||
{
|
||||
PathHash<LmState> ph{ lmState, prevRootId, spState };
|
||||
auto inserted = bestPathIndex.emplace(ph, std::make_pair((uint32_t)bestPathValues.size(), 1));
|
||||
if (inserted.second)
|
||||
{
|
||||
bestPathValues.emplace_back(morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState);
|
||||
bestPathValues.emplace_back(morph, accScore, firstChunkScore,
|
||||
parent, std::move(lmState), spState,
|
||||
parent ? base[parent].rootId : (uint8_t)0,
|
||||
(uint16_t)(parent ? base[parent].oovFlag : 0),
|
||||
parent ? base[parent].oovCntArenaPtr : (uint32_t)0);
|
||||
if (rootId != commonRootId) bestPathValues.back().rootId = rootId;
|
||||
bestPathValues.resize(bestPathValues.size() + topN - 1);
|
||||
}
|
||||
|
|
@ -184,8 +200,12 @@ namespace kiwi
|
|||
auto bestPathLast = bestPathValues.begin() + inserted.first->second.first + inserted.first->second.second;
|
||||
if (std::distance(bestPathFirst, bestPathLast) < topN)
|
||||
{
|
||||
*bestPathLast = WordLL<LmState>{ morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState };
|
||||
*bestPathLast = WordLL{ morph, accScore, firstChunkScore,
|
||||
parent, std::move(lmState), spState,
|
||||
parent ? base[parent].rootId : (uint8_t)0,
|
||||
(uint16_t)(parent ? base[parent].oovFlag : 0),
|
||||
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
|
||||
};
|
||||
if (rootId != commonRootId) bestPathLast->rootId = rootId;
|
||||
std::push_heap(bestPathFirst, bestPathLast + 1, WordLLGreater{});
|
||||
++inserted.first->second.second;
|
||||
|
|
@ -195,8 +215,12 @@ namespace kiwi
|
|||
if (accScore > bestPathFirst->accScore)
|
||||
{
|
||||
std::pop_heap(bestPathFirst, bestPathLast, WordLLGreater{});
|
||||
*(bestPathLast - 1) = WordLL<LmState>{ morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState };
|
||||
*(bestPathLast - 1) = WordLL{ morph, accScore, firstChunkScore,
|
||||
parent, std::move(lmState), spState,
|
||||
parent ? base[parent].rootId : (uint8_t)0,
|
||||
(uint16_t)(parent ? base[parent].oovFlag : 0),
|
||||
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
|
||||
};
|
||||
if (rootId != commonRootId) (*(bestPathLast - 1)).rootId = rootId;
|
||||
std::push_heap(bestPathFirst, bestPathLast, WordLLGreater{});
|
||||
}
|
||||
|
|
@ -204,7 +228,7 @@ namespace kiwi
|
|||
}
|
||||
}
|
||||
|
||||
inline void writeTo(Vector<WordLL<LmState>>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
|
||||
inline void writeTo(Vector<WordLL>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
|
||||
{
|
||||
for (auto& p : bestPathIndex)
|
||||
{
|
||||
|
|
@ -227,10 +251,11 @@ namespace kiwi
|
|||
}
|
||||
};
|
||||
|
||||
template<class LmState>
|
||||
class BestPathConatiner<PathEvaluatingMode::top1, LmState>
|
||||
template<class WordLL>
|
||||
class BestPathConatiner<PathEvaluatingMode::top1, WordLL>
|
||||
{
|
||||
UnorderedSet<WordLL<LmState>> bestPathes;
|
||||
using LmState = typename WordLL::LmState;
|
||||
UnorderedSet<WordLL> bestPathes;
|
||||
public:
|
||||
inline void clear()
|
||||
{
|
||||
|
|
@ -238,11 +263,15 @@ namespace kiwi
|
|||
}
|
||||
|
||||
inline void insert(size_t topN, uint8_t prevRootId, uint8_t rootId,
|
||||
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
|
||||
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
|
||||
const Morpheme* morph, float accScore, float firstChunkScore,
|
||||
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
|
||||
{
|
||||
WordLL<LmState> newPath{ morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState };
|
||||
WordLL newPath{ morph, accScore, firstChunkScore,
|
||||
parent, std::move(lmState), spState,
|
||||
parent ? base[parent].rootId : (uint8_t)0,
|
||||
(uint16_t)(parent ? base[parent].oovFlag : 0),
|
||||
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
|
||||
};
|
||||
newPath.prevRootId = prevRootId;
|
||||
if (rootId != commonRootId) newPath.rootId = rootId;
|
||||
auto inserted = bestPathes.emplace(newPath);
|
||||
|
|
@ -250,7 +279,7 @@ namespace kiwi
|
|||
{
|
||||
// this is dangerous, but we can update the key safely
|
||||
// because an equality between the two objects is guaranteed
|
||||
auto& target = const_cast<WordLL<LmState>&>(*inserted.first);
|
||||
auto& target = const_cast<WordLL&>(*inserted.first);
|
||||
if (accScore > target.accScore)
|
||||
{
|
||||
target = newPath;
|
||||
|
|
@ -258,7 +287,7 @@ namespace kiwi
|
|||
}
|
||||
}
|
||||
|
||||
inline void writeTo(Vector<WordLL<LmState>>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
|
||||
inline void writeTo(Vector<WordLL>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
|
||||
{
|
||||
for (auto& p : bestPathes)
|
||||
{
|
||||
|
|
@ -288,13 +317,14 @@ namespace kiwi
|
|||
static constexpr size_t maxSize = BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize * 4;
|
||||
};
|
||||
|
||||
template<class LmState, size_t bucketBits>
|
||||
template<class WordLL, size_t bucketBits>
|
||||
class BucketedHashContainer
|
||||
{
|
||||
using LmState = typename WordLL::LmState;
|
||||
static constexpr size_t bucketSize = 1 << bucketBits;
|
||||
|
||||
std::array<std::array<uint8_t, BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize>, bucketSize> hashes;
|
||||
std::array<Vector<WordLL<LmState>>, bucketSize> values;
|
||||
std::array<Vector<WordLL>, bucketSize> values;
|
||||
|
||||
public:
|
||||
BucketedHashContainer()
|
||||
|
|
@ -315,11 +345,11 @@ namespace kiwi
|
|||
|
||||
template<ArchType archType>
|
||||
inline void insertOptimized(size_t topN, uint8_t prevRootId, uint8_t rootId,
|
||||
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
|
||||
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
|
||||
const Morpheme* morph, float accScore, float firstChunkScore,
|
||||
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
|
||||
{
|
||||
static constexpr size_t numBits = sizeof(size_t) * 8;
|
||||
const size_t h = Hash<WordLL<LmState>>{}(lmState, prevRootId, spState);
|
||||
const size_t h = Hash<WordLL>{}(lmState, prevRootId, spState);
|
||||
const size_t bucket = (h >> 8) & (bucketSize - 1);
|
||||
auto& hash = hashes[bucket];
|
||||
auto& value = values[bucket];
|
||||
|
|
@ -355,8 +385,12 @@ namespace kiwi
|
|||
if (value.size() < hash.size())
|
||||
{
|
||||
hash[value.size()] = h;
|
||||
value.emplace_back(morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState);
|
||||
value.emplace_back(morph, accScore, firstChunkScore,
|
||||
parent, std::move(lmState), spState,
|
||||
parent ? base[parent].rootId : (uint8_t)0,
|
||||
(uint16_t)(parent ? base[parent].oovFlag : 0),
|
||||
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
|
||||
);
|
||||
value.back().prevRootId = prevRootId;
|
||||
if (rootId != commonRootId) value.back().rootId = rootId;
|
||||
}
|
||||
|
|
@ -374,29 +408,29 @@ namespace kiwi
|
|||
target.morpheme = morph;
|
||||
target.accScore = accScore;
|
||||
target.firstChunkScore = firstChunkScore;
|
||||
target.accTypoCost = accTypoCost;
|
||||
target.accDialectCost = accDialectCost;
|
||||
target.parent = parent;
|
||||
target.lmState = std::move(lmState);
|
||||
target.spState = spState;
|
||||
target.rootId = parent ? parent->rootId : 0;
|
||||
target.rootId = parent ? base[parent].rootId : 0;
|
||||
if (rootId != commonRootId) target.rootId = rootId;
|
||||
target.oovFlag = parent ? base[parent].oovFlag : 0;
|
||||
target.oovCntArenaPtr = parent ? base[parent].oovCntArenaPtr : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void insert(size_t topN, uint8_t prevRootId, uint8_t rootId,
|
||||
const Morpheme* morph, float accScore, float firstChunkScore, float accTypoCost, float accDialectCost,
|
||||
const WordLL<LmState>* parent, LmState&& lmState, SpecialState spState)
|
||||
const Morpheme* morph, float accScore, float firstChunkScore,
|
||||
const WordLL* base, uint32_t parent, LmState&& lmState, SpecialState spState)
|
||||
{
|
||||
static constexpr ArchType archType = LmState::arch;
|
||||
if constexpr (archType != ArchType::none && archType != ArchType::balanced)
|
||||
{
|
||||
return insertOptimized<archType>(topN, prevRootId, rootId, morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState);
|
||||
return insertOptimized<archType>(topN, prevRootId, rootId, morph, accScore, firstChunkScore,
|
||||
base, parent, std::move(lmState), spState);
|
||||
}
|
||||
|
||||
const size_t h = Hash<WordLL<LmState>>{}(lmState, prevRootId, spState);
|
||||
const size_t h = Hash<WordLL>{}(lmState, prevRootId, spState);
|
||||
const size_t bucket = (h >> 8) & (bucketSize - 1);
|
||||
auto& hash = hashes[bucket];
|
||||
auto& value = values[bucket];
|
||||
|
|
@ -418,8 +452,12 @@ namespace kiwi
|
|||
if (value.size() < hash.size())
|
||||
{
|
||||
hash[value.size()] = h;
|
||||
value.emplace_back(morph, accScore, firstChunkScore, accTypoCost, accDialectCost,
|
||||
parent, std::move(lmState), spState);
|
||||
value.emplace_back(morph, accScore, firstChunkScore,
|
||||
parent, std::move(lmState), spState,
|
||||
parent ? base[parent].rootId : (uint8_t)0,
|
||||
(uint16_t)(parent ? base[parent].oovFlag : 0),
|
||||
parent ? base[parent].oovCntArenaPtr : (uint32_t)0
|
||||
);
|
||||
value.back().prevRootId = prevRootId;
|
||||
if (rootId != commonRootId) value.back().rootId = rootId;
|
||||
}
|
||||
|
|
@ -437,18 +475,18 @@ namespace kiwi
|
|||
target.morpheme = morph;
|
||||
target.accScore = accScore;
|
||||
target.firstChunkScore = firstChunkScore;
|
||||
target.accTypoCost = accTypoCost;
|
||||
target.accDialectCost = accDialectCost;
|
||||
target.parent = parent;
|
||||
target.lmState = std::move(lmState);
|
||||
target.spState = spState;
|
||||
target.rootId = parent ? parent->rootId : 0;
|
||||
target.rootId = parent ? base[parent].rootId : 0;
|
||||
if (rootId != commonRootId) target.rootId = rootId;
|
||||
target.oovFlag = parent ? base[parent].oovFlag : 0;
|
||||
target.oovCntArenaPtr = parent ? base[parent].oovCntArenaPtr : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void writeTo(Vector<WordLL<LmState>>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
|
||||
inline void writeTo(Vector<WordLL>& resultOut, const Morpheme* curMorph, Wid lastSeqId, size_t ownFormId)
|
||||
{
|
||||
for (auto& v : values)
|
||||
{
|
||||
|
|
@ -470,15 +508,15 @@ namespace kiwi
|
|||
};
|
||||
|
||||
|
||||
template<class LmState>
|
||||
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Small, LmState>
|
||||
: public BucketedHashContainer<LmState, 0>
|
||||
template<class WordLL>
|
||||
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Small, WordLL>
|
||||
: public BucketedHashContainer<WordLL, 0>
|
||||
{
|
||||
};
|
||||
|
||||
template<class LmState>
|
||||
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Medium, LmState>
|
||||
: public BucketedHashContainer<LmState, 2>
|
||||
template<class WordLL>
|
||||
class alignas(BestPathContainerTraits<PathEvaluatingMode::top1Small>::maxSize) BestPathConatiner<PathEvaluatingMode::top1Medium, WordLL>
|
||||
: public BucketedHashContainer<WordLL, 2>
|
||||
{
|
||||
};
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -15,6 +15,29 @@ namespace kiwi
|
|||
{
|
||||
namespace lm
|
||||
{
|
||||
/*
|
||||
* quantize frequency scale
|
||||
*
|
||||
* values <= 16 are linearly mapped
|
||||
* values > 16 are mapped logarithmically:
|
||||
*/
|
||||
inline uint8_t quantizeFrequencyScale(float freq)
|
||||
{
|
||||
if (freq <= 0) return 0;
|
||||
if (freq <= 16) return (uint8_t)freq;
|
||||
const float logFreq = log2f(freq);
|
||||
const float rounded = round(logFreq * 8) - 16;
|
||||
if (rounded >= 255) return 255;
|
||||
return (uint8_t)rounded;
|
||||
}
|
||||
|
||||
inline float dequantizeFrequencyScale(uint8_t qfreq)
|
||||
{
|
||||
if (qfreq <= 16) return (float)qfreq;
|
||||
const float logFreq = ((float)qfreq + 16) / 8.0f;
|
||||
return powf(2.0f, logFreq);
|
||||
}
|
||||
|
||||
template<size_t windowSize, ArchType _arch, class VocabTy, class VlVocabTy, bool quantized>
|
||||
class CoNgramState;
|
||||
|
||||
|
|
@ -28,13 +51,40 @@ namespace kiwi
|
|||
const uint8_t* alignedKeyValueData = nullptr;
|
||||
std::unique_ptr<int32_t[]> allRootValueData;
|
||||
std::unique_ptr<uint8_t[]> allEmbs;
|
||||
const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)]
|
||||
const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)] (quantized NEON: dim stores S8 values)
|
||||
const uint8_t* outputEmbPtr = nullptr; // [numOutputs, (dim + scale? + sum?)]
|
||||
const uint8_t* distantEmbPtr = nullptr; // [numOutputs, (dim + scale? + bias + confid + pad?)]
|
||||
const float* positionConfidPtr = nullptr;
|
||||
const uint8_t* distantMaskPtr = nullptr;
|
||||
const float* outputEmbBiasPtr = nullptr;
|
||||
const KeyType* invertedContextVocabPtr = nullptr;
|
||||
const float* invNormContextPtr = nullptr;
|
||||
const float* invNormOutputPtr = nullptr;
|
||||
const float* contextEmbEntropyPtr = nullptr;
|
||||
|
||||
inline uint32_t unpackContextId(uint32_t v) const
|
||||
{
|
||||
if (header.flags & header.hasTrieFrequency)
|
||||
{
|
||||
return v & 0x00FFFFFF;
|
||||
}
|
||||
else
|
||||
{
|
||||
return v;
|
||||
}
|
||||
}
|
||||
|
||||
inline float unpackTrieFrequency(uint32_t v) const
|
||||
{
|
||||
if (header.flags & header.hasTrieFrequency)
|
||||
{
|
||||
return dequantizeFrequencyScale(v >> 24);
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0.f;
|
||||
}
|
||||
}
|
||||
|
||||
inline size_t contextEmbStride() const
|
||||
{
|
||||
|
|
@ -59,11 +109,16 @@ namespace kiwi
|
|||
return reinterpret_cast<const float*>(contextEmbPtr + idx * contextEmbStride());
|
||||
}
|
||||
|
||||
inline const uint8_t* getContextQuantEmb(uint32_t idx) const
|
||||
inline const uint8_t* getContextQuantEmb(size_t idx) const
|
||||
{
|
||||
return contextEmbPtr + idx * contextEmbStride();
|
||||
}
|
||||
|
||||
inline const int8_t* getContextQuantEmbS8(size_t idx) const
|
||||
{
|
||||
return reinterpret_cast<const int8_t*>(contextEmbPtr + idx * contextEmbStride());
|
||||
}
|
||||
|
||||
inline float getContextBias(uint32_t idx) const
|
||||
{
|
||||
const size_t offset = quantized ?
|
||||
|
|
@ -204,26 +259,53 @@ namespace kiwi
|
|||
float contextSimilarity(uint32_t contextId1, uint32_t contextId2) const override;
|
||||
size_t predictWordsFromContext(uint32_t contextId, size_t topN, std::pair<uint32_t, float>* output) const override;
|
||||
size_t predictWordsFromContextDiff(uint32_t contextId, uint32_t bgContextId, float weight, size_t topN, std::pair<uint32_t, float>* output) const override;
|
||||
|
||||
float progressOneStep(int32_t& nodeIdx, uint32_t& contextIdx, uint32_t next) const override;
|
||||
float getContextFrequency(uint32_t contextId) const override;
|
||||
float getContextEntropy(uint32_t contextId) const override;
|
||||
size_t getNodeDepth(uint32_t nodeId) const override;
|
||||
|
||||
uint32_t toContextId(const uint32_t* vocabIds, size_t size) const override;
|
||||
std::vector<std::vector<uint32_t>> getContextWordMap() const override;
|
||||
|
||||
uint32_t progressContextNode(int32_t& nodeIdx, KeyType next) const
|
||||
{
|
||||
if (invertedContextVocabPtr)
|
||||
{
|
||||
next = invertedContextVocabPtr[next];
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<KeyType, VlKeyType>)
|
||||
{
|
||||
return progressContextNodeVl(nodeIdx, next);
|
||||
}
|
||||
|
||||
static constexpr size_t tMax = (1 << 16) - (1 << 10) * 2;
|
||||
static constexpr size_t keyWidth = sizeof(VlKeyType) * 8,
|
||||
surrogateBitWidth = keyWidth == 16 ? 10 : 5,
|
||||
surrogateBitMask = (1 << surrogateBitWidth) - 1;
|
||||
static constexpr size_t tMax = (keyWidth >= sizeof(size_t) * 8) ? (size_t)-1 - (((size_t)1 << surrogateBitWidth) * 2 - 1) : (((size_t)1 << keyWidth) - ((size_t)1 << surrogateBitWidth) * 2);
|
||||
if (next < tMax)
|
||||
{
|
||||
return progressContextNodeVl(nodeIdx, next);
|
||||
}
|
||||
next -= tMax;
|
||||
const size_t high = next >> 10, low = next & 0x3FF;
|
||||
const size_t high = next >> surrogateBitWidth, low = next & surrogateBitMask;
|
||||
progressContextNodeVl(nodeIdx, tMax + high);
|
||||
return progressContextNodeVl(nodeIdx, tMax + (1 << 10) + low);
|
||||
return progressContextNodeVl(nodeIdx, tMax + (1 << surrogateBitWidth) + low);
|
||||
}
|
||||
|
||||
// Tells whether `k` lies in the range used for the second (low) half of a
// surrogate pair, i.e. at or above tMax + 2^surrogateBitWidth.
// When KeyType and VlKeyType are the same type the answer is always false.
bool isSecondSurrogate(VlKeyType k) const
{
	if constexpr (std::is_same_v<KeyType, VlKeyType>)
	{
		return false;
	}
	else
	{
		static constexpr size_t keyWidth = sizeof(VlKeyType) * 8;
		static constexpr size_t surrogateBitWidth = keyWidth == 16 ? 10 : 5;
		// first key value reserved for surrogates
		static constexpr size_t tMax = (keyWidth >= sizeof(size_t) * 8)
			? (size_t)-1 - (((size_t)1 << surrogateBitWidth) * 2 - 1)
			: (((size_t)1 << keyWidth) - ((size_t)1 << surrogateBitWidth) * 2);
		return k >= tMax + ((size_t)1 << surrogateBitWidth);
	}
}
|
||||
|
||||
uint32_t progressContextNodeVl(int32_t& nodeIdx, VlKeyType next) const
|
||||
|
|
@ -426,21 +508,17 @@ namespace kiwi
|
|||
return (x << r) | (x >> (sizeof(size_t) * 8 - r));
|
||||
}
|
||||
|
||||
template<>
|
||||
struct Hash<uint32_t>
|
||||
inline size_t hashUint32(uint32_t v)
|
||||
{
|
||||
size_t operator()(uint32_t v) const
|
||||
{
|
||||
return ((size_t)v * largePrime) ^ rol((size_t)v, sizeof(size_t) * 4 + 1);
|
||||
}
|
||||
};
|
||||
return ((size_t)v * largePrime) ^ rol((size_t)v, sizeof(size_t) * 4 + 1);
|
||||
}
|
||||
|
||||
template<size_t windowSize, ArchType arch, class VocabTy, class VlVocabTy, bool quantized>
|
||||
struct Hash<lm::CoNgramState<windowSize, arch, VocabTy, VlVocabTy, quantized>>
|
||||
{
|
||||
size_t operator()(const lm::CoNgramState<windowSize, arch, VocabTy, VlVocabTy, quantized>& state) const
|
||||
{
|
||||
size_t ret = Hash<uint32_t>{}(state.node);
|
||||
size_t ret = hashUint32(state.node);
|
||||
static constexpr size_t cmpStart = windowSize - sizeof(size_t) / sizeof(VocabTy);
|
||||
size_t h = *reinterpret_cast<const size_t*>(&state.history[cmpStart]);
|
||||
h = (h * largePrime) ^ rol(h, sizeof(size_t) * 4 - 1);
|
||||
|
|
@ -454,7 +532,7 @@ namespace kiwi
|
|||
{
|
||||
size_t operator()(const lm::CoNgramState<0, arch, VocabTy, VlVocabTy, quantized>& state) const
|
||||
{
|
||||
size_t ret = Hash<uint32_t>{}(state.node);
|
||||
size_t ret = hashUint32(state.node);
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
|
|
|
|||
454
src/Dataset.cpp
454
src/Dataset.cpp
|
|
@ -2,6 +2,7 @@
|
|||
#include <kiwi/SubstringExtractor.h>
|
||||
#include "FrozenTrie.hpp"
|
||||
#include "RaggedVector.hpp"
|
||||
#include "StrUtils.h"
|
||||
|
||||
using namespace kiwi;
|
||||
|
||||
|
|
@ -43,9 +44,7 @@ HSDataset::HSDataset(size_t _batchSize,
|
|||
{
|
||||
}
|
||||
|
||||
HSDataset::~HSDataset()
|
||||
{
|
||||
}
|
||||
HSDataset::~HSDataset() = default;
|
||||
|
||||
HSDataset::HSDataset(HSDataset&& o) /*noexcept*/ = default;
|
||||
|
||||
|
|
@ -802,3 +801,452 @@ std::vector<std::pair<std::vector<uint32_t>, size_t>> HSDataset::extractPrefixes
|
|||
});
|
||||
return ret;
|
||||
}
|
||||
|
||||
size_t ChrTokenizer::encodeOne(char32_t c) const
|
||||
{
|
||||
if (isHangulSyllable(c))
|
||||
{
|
||||
int32_t i = (c - 0xAC00) / 28;
|
||||
return (int32_t)Token::hangulSyllableStart + i;
|
||||
}
|
||||
else if (isHangulCoda(c))
|
||||
{
|
||||
int32_t i = c - 0x11A8;
|
||||
return (int32_t)Token::hangulCodaStart + i;
|
||||
}
|
||||
else if (0x21 <= c && c < 0x7F)
|
||||
{
|
||||
return (int32_t)Token::asciiStart + (c - 0x21);
|
||||
}
|
||||
else
|
||||
{
|
||||
const POSTag type = identifySpecialChr(c);
|
||||
switch (type)
|
||||
{
|
||||
case POSTag::sf:
|
||||
return (int32_t)Token::sf;
|
||||
case POSTag::sp:
|
||||
return (int32_t)Token::sp;
|
||||
case POSTag::ss:
|
||||
return (int32_t)Token::ss;
|
||||
case POSTag::sso:
|
||||
return (int32_t)Token::sso;
|
||||
case POSTag::ssc:
|
||||
return (int32_t)Token::ssc;
|
||||
case POSTag::se:
|
||||
return (int32_t)Token::se;
|
||||
case POSTag::so:
|
||||
return (int32_t)Token::so;
|
||||
case POSTag::sh:
|
||||
return (int32_t)Token::sh;
|
||||
default:
|
||||
return (int32_t)Token::sw;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t ChrTokenizer::encode(std::string_view text, int32_t* outBuf, size_t bufSize) const
|
||||
{
|
||||
size_t written = 0;
|
||||
const auto normalizedText = normalizeHangul(utf8To16(text));
|
||||
for (auto c : normalizedText)
|
||||
{
|
||||
if (written >= bufSize) break;
|
||||
|
||||
outBuf[written++] = encodeOne(c);
|
||||
}
|
||||
return written;
|
||||
}
|
||||
|
||||
std::u16string ChrTokenizer::decode(const int32_t* tokenBuf, size_t tokenCnt) const
|
||||
{
|
||||
KString result;
|
||||
for (size_t i = 0; i < tokenCnt; ++i)
|
||||
{
|
||||
int32_t t = tokenBuf[i];
|
||||
if (Token::hangulSyllableStart <= (Token)t && (Token)t < Token::hangulCodaStart)
|
||||
{
|
||||
char16_t c = 0xAC00 + (uint16_t)(t - (int32_t)Token::hangulSyllableStart) * 28;
|
||||
result.push_back(c);
|
||||
}
|
||||
else if (Token::hangulCodaStart <= (Token)t && (Token)t < Token::asciiStart)
|
||||
{
|
||||
char16_t c = 0x11A8 + (uint16_t)(t - (int32_t)Token::hangulCodaStart);
|
||||
result.push_back(c);
|
||||
}
|
||||
else if (Token::asciiStart <= (Token)t && (Token)t < Token::max)
|
||||
{
|
||||
char16_t c = 0x21 + (uint16_t)(t - (int32_t)Token::asciiStart);
|
||||
result.push_back(c);
|
||||
}
|
||||
else
|
||||
{
|
||||
switch ((Token)t)
|
||||
{
|
||||
case Token::sf:
|
||||
result.push_back(u'.');
|
||||
break;
|
||||
case Token::sp:
|
||||
result.push_back(u',');
|
||||
break;
|
||||
case Token::ss:
|
||||
result.push_back(u'"');
|
||||
break;
|
||||
case Token::sso:
|
||||
result.push_back(u'(');
|
||||
break;
|
||||
case Token::ssc:
|
||||
result.push_back(u')');
|
||||
break;
|
||||
case Token::se:
|
||||
result.push_back(u'\u2026');
|
||||
break;
|
||||
case Token::so:
|
||||
result.push_back(u'\u223c');
|
||||
break;
|
||||
case Token::sh:
|
||||
result.push_back(u'漢');
|
||||
break;
|
||||
case Token::sw:
|
||||
result.push_back(u'※');
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return joinHangul(result);
|
||||
}
|
||||
|
||||
// Builds an empty dataset with the given sampling parameters and seeds the
// random generator with `currentSeed`.
// When `_contextualMapper` is non-empty, every (id, token sequence) pair is
// inserted into a trie under the value `id + 1`, fail links are filled in, and
// the frozen trie is stored in `contextualMapper`.
ChrDataset::ChrDataset(size_t _batchSize, size_t _causalContextSize, size_t _windowSize, float _prefixDropoutProb, bool _sampleWithoutWeights,
	const std::vector<std::pair<size_t, std::vector<uint32_t>>>& _contextualMapper
)
	: batchSize(_batchSize), causalContextSize(_causalContextSize), windowSize(_windowSize), prefixDropoutProb(_prefixDropoutProb), sampleWithoutWeights(_sampleWithoutWeights)
{
	rng.seed(currentSeed);

	if (_contextualMapper.empty()) return;

	utils::ContinuousTrie<utils::TrieNodeEx<uint32_t, uint32_t>> cmTrie(1);
	for (const auto& entry : _contextualMapper)
	{
		const auto& tokens = entry.second;
		// values are stored shifted by one
		cmTrie.build(tokens.begin(), tokens.end(), entry.first + 1);
	}
	cmTrie.fillFail();
	contextualMapper = utils::FrozenTrie<uint32_t, uint32_t>{ cmTrie, ArchTypeHolder<ArchType::balanced>{} };
}
|
||||
|
||||
ChrDataset::~ChrDataset() = default;
|
||||
|
||||
ChrDataset::ChrDataset(ChrDataset&&) = default;
|
||||
|
||||
ChrDataset& ChrDataset::operator=(ChrDataset&&) = default;
|
||||
|
||||
|
||||
void ChrDataset::addSentence(std::string_view sentence, float weight, std::string_view nonLabelPrefix)
|
||||
{
|
||||
ChrTokenizer tokenizer;
|
||||
thread_local Vector<int32_t> tokenBuf;
|
||||
tokenBuf.resize(sentence.size() + nonLabelPrefix.size());
|
||||
std::string joined;
|
||||
joined += nonLabelPrefix;
|
||||
joined += sentence;
|
||||
const size_t prefixSize = tokenizer.encode(nonLabelPrefix, tokenBuf.data(), tokenBuf.size());
|
||||
const size_t tokenCnt = tokenizer.encode(joined, tokenBuf.data(), tokenBuf.size());
|
||||
auto& sents = this->sents.get();
|
||||
sents.emplace_back();
|
||||
sents.insert_data(tokenBuf.begin(), tokenBuf.begin() + tokenCnt);
|
||||
sentWeights.emplace_back(weight);
|
||||
nonLabelPrefixSizes.emplace_back(prefixSize);
|
||||
totalWeight += weight;
|
||||
}
|
||||
|
||||
size_t ChrDataset::numSents() const
|
||||
{
|
||||
return sents.get().size();
|
||||
}
|
||||
|
||||
void ChrDataset::seed(size_t newSeed)
|
||||
{
|
||||
currentSeed = newSeed;
|
||||
rng.seed(newSeed);
|
||||
}
|
||||
|
||||
void ChrDataset::reset()
|
||||
{
|
||||
seed(currentSeed);
|
||||
sentSampled.clear();
|
||||
shuffledIdcs.clear();
|
||||
totalSampled = 0;
|
||||
consumedSents = 0;
|
||||
}
|
||||
|
||||
class InputTokenMapper
|
||||
{
|
||||
const utils::FrozenTrie<uint32_t, uint32_t>& cmTrie;
|
||||
const utils::FrozenTrie<uint32_t, uint32_t>::Node* node = nullptr;
|
||||
public:
|
||||
InputTokenMapper(const utils::FrozenTrie<uint32_t, uint32_t>& trie)
|
||||
: cmTrie{ trie }
|
||||
{
|
||||
if (!cmTrie.empty())
|
||||
{
|
||||
node = cmTrie.root();
|
||||
}
|
||||
}
|
||||
|
||||
int32_t operator()(int32_t inputToken)
|
||||
{
|
||||
if (cmTrie.empty() || inputToken == 0)
|
||||
{
|
||||
return inputToken;
|
||||
}
|
||||
|
||||
auto* next = node->template nextOpt<ArchType::balanced>(cmTrie, inputToken);
|
||||
while (!next)
|
||||
{
|
||||
node = node->fail();
|
||||
if (!node) break;
|
||||
next = node->template nextOpt<ArchType::balanced>(cmTrie, inputToken);
|
||||
}
|
||||
|
||||
if (next)
|
||||
{
|
||||
node = next;
|
||||
auto val = next->val(cmTrie);
|
||||
if (cmTrie.hasMatch(val))
|
||||
{
|
||||
return val - 1;
|
||||
}
|
||||
else if (cmTrie.hasSubmatch(val))
|
||||
{
|
||||
auto sub = next->fail();
|
||||
for (; sub; sub = sub->fail())
|
||||
{
|
||||
val = sub->val(cmTrie);
|
||||
if (cmTrie.hasMatch(val))
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (sub) return val - 1;
|
||||
else return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
node = cmTrie.root();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<class InTy, class OutTy>
|
||||
size_t ChrDataset::_next(InTy in, OutTy out)
|
||||
{
|
||||
if (sentSampled.size() != sentWeights.size())
|
||||
{
|
||||
sentSampled.resize(sentWeights.size());
|
||||
}
|
||||
|
||||
if (sampleWithoutWeights)
|
||||
{
|
||||
if (shuffledIdcs.size() != sentWeights.size())
|
||||
{
|
||||
shuffledIdcs.resize(sentWeights.size());
|
||||
std::iota(shuffledIdcs.begin(), shuffledIdcs.end(), 0);
|
||||
std::shuffle(shuffledIdcs.begin(), shuffledIdcs.end(), rng);
|
||||
consumedSents = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (totalSampled <= 0)
|
||||
{
|
||||
shuffledIdcs.resize(sentWeights.size());
|
||||
std::iota(shuffledIdcs.begin(), shuffledIdcs.end(), 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
shuffledIdcs.clear();
|
||||
const float totalWeight = this->totalWeight,
|
||||
totalSampled = this->totalSampled;
|
||||
for (size_t i = 0; i < sentWeights.size(); ++i)
|
||||
{
|
||||
const float w = sentWeights[i] / totalWeight;
|
||||
const float s = sentSampled[i] / totalSampled;
|
||||
if (s < w)
|
||||
{
|
||||
shuffledIdcs.emplace_back(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
std::shuffle(shuffledIdcs.begin(), shuffledIdcs.end(), rng);
|
||||
}
|
||||
|
||||
auto& sents = this->sents.get();
|
||||
size_t b;
|
||||
for (b = 0; b < batchSize; ++b)
|
||||
{
|
||||
if (sampleWithoutWeights && b + consumedSents >= shuffledIdcs.size())
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
const size_t i = sampleWithoutWeights ? shuffledIdcs[b + consumedSents] : shuffledIdcs[b % shuffledIdcs.size()];
|
||||
sentSampled[i] += 1.f;
|
||||
totalSampled += 1;
|
||||
|
||||
size_t start = 0;
|
||||
if (prefixDropoutProb > 0 && std::generate_canonical<float, 32>(rng) < prefixDropoutProb)
|
||||
{
|
||||
start = (size_t)((std::max(sents[i].size(), (size_t)2) - 2) * std::generate_canonical<float, 32>(rng));
|
||||
}
|
||||
const size_t nonLabelPrefixSize = nonLabelPrefixSizes[i];
|
||||
const size_t end = std::min(sents[i].size() + 1, causalContextSize);
|
||||
|
||||
InputTokenMapper tokenMapper{ contextualMapper };
|
||||
for (size_t j = start; j < end; ++j)
|
||||
{
|
||||
const auto inputToken = j > 0 ? sents[i][j - 1] : 0;
|
||||
*in = tokenMapper(inputToken);
|
||||
++in;
|
||||
*out = j < nonLabelPrefixSize ? nonVocab : (j < sents[i].size() ? sents[i][j] : 0);
|
||||
++out;
|
||||
}
|
||||
for (size_t j = end - start; j < causalContextSize; ++j)
|
||||
{
|
||||
*in = nonVocab;
|
||||
++in;
|
||||
*out = nonVocab;
|
||||
++out;
|
||||
}
|
||||
}
|
||||
if (sampleWithoutWeights)
|
||||
{
|
||||
consumedSents += b;
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
size_t ChrDataset::next(int32_t* in, int32_t* out)
|
||||
{
|
||||
return _next(in, out);
|
||||
}
|
||||
|
||||
size_t ChrDataset::next(int64_t* in, int64_t* out)
|
||||
{
|
||||
return _next(in, out);
|
||||
}
|
||||
|
||||
std::vector<float> ChrDataset::getVocabProbs(double epsilon) const
|
||||
{
|
||||
Vector<double> weights(vocabSize(), epsilon);
|
||||
|
||||
for (size_t i = 0; i < sentWeights.size(); ++i)
|
||||
{
|
||||
auto sent = sents.get()[i];
|
||||
for (auto token : sent)
|
||||
{
|
||||
auto v = token;
|
||||
if (v < 0 || v >= vocabSize())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
weights[v] += sentWeights[i];
|
||||
}
|
||||
weights[0] += sentWeights[i]; // for EOS
|
||||
}
|
||||
|
||||
const double totalWeight = std::accumulate(weights.begin(), weights.end(), 0.0);
|
||||
std::vector<float> probs(vocabSize());
|
||||
for (size_t i = 0; i < vocabSize(); ++i)
|
||||
{
|
||||
probs[i] = (float)(weights[i] / totalWeight);
|
||||
}
|
||||
return probs;
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, double>> ChrDataset::extractPrefixes(
|
||||
float resolution, float minWeight,
|
||||
size_t maxLength, size_t numWorkers, bool exclusiveCnt) const
|
||||
{
|
||||
using Pair = std::pair<std::vector<uint32_t>, double>;
|
||||
std::vector<Pair> ret;
|
||||
const size_t minCnt = (size_t)ceil(minWeight / resolution);
|
||||
PrefixCounter counter{ maxLength, minCnt, numWorkers };
|
||||
Vector<int32_t> tokenBuf;
|
||||
for (size_t i = 0; i < sents.get().size(); ++i)
|
||||
{
|
||||
const auto sent = sents.get()[i];
|
||||
tokenBuf.clear();
|
||||
tokenBuf.emplace_back(0);
|
||||
tokenBuf.insert(tokenBuf.end(), sent.begin(), sent.end());
|
||||
const size_t n = (size_t)ceil(sentWeights[i] / resolution);
|
||||
for (size_t j = 0; j < n; ++j)
|
||||
{
|
||||
counter.addArray(tokenBuf.data(), tokenBuf.data() + tokenBuf.size());
|
||||
}
|
||||
}
|
||||
auto trie = counter.count();
|
||||
if (exclusiveCnt)
|
||||
{
|
||||
Vector<UnorderedMap<Vector<uint32_t>, size_t>> cnts_by_length(maxLength);
|
||||
trie.traverse([&](size_t cnt, const std::vector<uint32_t>& prefix)
|
||||
{
|
||||
if (cnt < minCnt) return;
|
||||
if (std::find_if(prefix.begin() + 1, prefix.end(), [](uint32_t t) { return t == 0; }) != prefix.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
Vector<uint32_t> p(prefix.begin(), prefix.end());
|
||||
cnts_by_length[p.size() - 1].emplace(move(p), cnt);
|
||||
});
|
||||
|
||||
Vector<uint32_t> suffix;
|
||||
suffix.reserve(maxLength);
|
||||
for (size_t i = 1; i < maxLength; ++i)
|
||||
{
|
||||
for (auto& p : cnts_by_length[i])
|
||||
{
|
||||
suffix.clear();
|
||||
suffix.insert(suffix.end(), p.first.begin() + 1, p.first.end());
|
||||
auto it = cnts_by_length[i - 1].find(suffix);
|
||||
if (it == cnts_by_length[i - 1].end() || it->second < p.second)
|
||||
{
|
||||
throw std::runtime_error("This should not happen");
|
||||
}
|
||||
it->second -= p.second;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto& cnts : cnts_by_length)
|
||||
{
|
||||
for (auto& p : cnts)
|
||||
{
|
||||
if (p.second < minCnt) continue;
|
||||
ret.emplace_back(std::vector<uint32_t>{ p.first.begin(), p.first.end() }, (double)p.second * resolution);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
trie.traverse([&](size_t cnt, const std::vector<uint32_t>& prefix)
|
||||
{
|
||||
if (cnt < minCnt) return;
|
||||
if (std::find_if(prefix.begin() + 1, prefix.end(), [](uint32_t t) { return t == 0; }) != prefix.end())
|
||||
{
|
||||
return;
|
||||
}
|
||||
ret.emplace_back(prefix, (double)cnt * resolution);
|
||||
});
|
||||
}
|
||||
|
||||
std::sort(ret.begin(), ret.end(), [](const Pair& a, const Pair& b)
|
||||
{
|
||||
return a.second > b.second;
|
||||
});
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -106,6 +106,9 @@ namespace kiwi
|
|||
}
|
||||
ret.zCodaAppendable = zCodaAppendable ? 1 : 0;
|
||||
ret.zSiotAppendable = zSiotAppendable ? 1 : 0;
|
||||
// 다음 값들은 KiwiBuilder::build에서 채워 넣는다
|
||||
ret.hasJClass = 0;
|
||||
ret.hasAnyFullMorphemes = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -136,6 +136,7 @@ namespace kiwi
|
|||
auto v = nextDiffs[p->nextOffset + i];
|
||||
if (v <= 0) continue;
|
||||
auto* child = &p[v];
|
||||
child->depth = p->depth + 1;
|
||||
child->lower = p->template findFail<archType>(*this, k) - child;
|
||||
dq.emplace_back(child);
|
||||
}
|
||||
|
|
|
|||
1161
src/KTrie.cpp
1161
src/KTrie.cpp
File diff suppressed because it is too large
Load diff
|
|
@ -5,6 +5,7 @@
|
|||
#include <kiwi/Form.h>
|
||||
#include <kiwi/PatternMatcher.h>
|
||||
#include <kiwi/FrozenTrie.h>
|
||||
#include <kiwi/TypoTransformer.h>
|
||||
|
||||
#include "StrUtils.h"
|
||||
|
||||
|
|
@ -97,7 +98,10 @@ namespace kiwi
|
|||
Match matchOptions,
|
||||
Dialect allowedDialect,
|
||||
size_t maxUnkFormSize,
|
||||
size_t maxUnkFormSizeFollowedByJClass,
|
||||
size_t spaceTolerance,
|
||||
const PreparedTypoTransformer* typoTransformer,
|
||||
float typoThreshold,
|
||||
float continualTypoCost,
|
||||
float lengtheningTypoCost,
|
||||
const PretokenizedSpanGroup::Span*& pretokenizedFirst,
|
||||
|
|
|
|||
152
src/Kiwi.cpp
152
src/Kiwi.cpp
|
|
@ -499,7 +499,8 @@ namespace kiwi
|
|||
| Match::joinVerbSuffix
|
||||
| Match::joinAdjSuffix
|
||||
| Match::joinAdvSuffix
|
||||
| Match::mergeSaisiot))) return last;
|
||||
| Match::mergeSaisiot
|
||||
| Match::joinParticleYo))) return last;
|
||||
if (std::distance(first, last) < 2) return last;
|
||||
|
||||
auto next = first;
|
||||
|
|
@ -566,6 +567,15 @@ namespace kiwi
|
|||
++next;
|
||||
++next;
|
||||
}
|
||||
// (EC | EF) + JX(요) => (EC | EF)
|
||||
else if (!!(matchOptions & Match::joinParticleYo)
|
||||
&& nextToken.tag == POSTag::jx
|
||||
&& nextToken.morph && *nextToken.morph->kform == u"요"
|
||||
&& (current.tag == POSTag::ec || current.tag == POSTag::ef))
|
||||
{
|
||||
concatTokens(current, nextToken, current.tag);
|
||||
++next;
|
||||
}
|
||||
else
|
||||
{
|
||||
++first;
|
||||
|
|
@ -603,7 +613,8 @@ namespace kiwi
|
|||
|
||||
inline void insertPathIntoResults(
|
||||
vector<TokenResult>& ret,
|
||||
Vector<SpecialState>& spStatesByRet,
|
||||
Vector<PackedState>& spStatesByRet,
|
||||
Vector<uint8_t>& oovTotalCnt,
|
||||
const Vector<PathResult>& pathes,
|
||||
size_t topN,
|
||||
Match matchOptions,
|
||||
|
|
@ -616,6 +627,15 @@ namespace kiwi
|
|||
{
|
||||
Vector<size_t> parentMap;
|
||||
|
||||
uint32_t oovCntArenaMinPtr = -1;
|
||||
if (!!(matchOptions & Match::oovTotalConsistency))
|
||||
{
|
||||
for (auto& p : pathes)
|
||||
{
|
||||
oovCntArenaMinPtr = min(oovCntArenaMinPtr, p.curState.oovCntArenaPtr());
|
||||
}
|
||||
}
|
||||
|
||||
if (ret.empty())
|
||||
{
|
||||
const size_t n = min(pathes.size(), topN * 2);
|
||||
|
|
@ -626,7 +646,7 @@ namespace kiwi
|
|||
}
|
||||
else
|
||||
{
|
||||
UnorderedMap<uint8_t, uint32_t> prevParents;
|
||||
UnorderedMap<PackedState, uint32_t> prevParents;
|
||||
Vector<uint8_t> selectedPathes(pathes.size());
|
||||
for (size_t i = 0; i < ret.size(); ++i)
|
||||
{
|
||||
|
|
@ -667,7 +687,7 @@ namespace kiwi
|
|||
}
|
||||
}
|
||||
|
||||
UnorderedMap<uint8_t, uint32_t> spStateCnt;
|
||||
UnorderedMap<PackedState, uint32_t> spStateCnt;
|
||||
size_t validTarget = 0;
|
||||
for (size_t i = 0; i < ret.size(); ++i)
|
||||
{
|
||||
|
|
@ -728,6 +748,10 @@ namespace kiwi
|
|||
token.typoCost = s.typoCost;
|
||||
token.typoFormId = s.typoFormId;
|
||||
token.senseId = s.morph->senseId;
|
||||
if ((s.morph->tag == POSTag::nng || s.morph->tag == POSTag::nnp) && !s.str.empty())
|
||||
{
|
||||
token.senseId = -1; // OOV인 경우에는 senseId를 -1로 설정
|
||||
}
|
||||
updateTokenInfoScript(token);
|
||||
token.dialect = s.morph->dialect;
|
||||
auto ptId = nodeInWhichPretokenized[s.nodeId] + 1;
|
||||
|
|
@ -751,7 +775,7 @@ namespace kiwi
|
|||
sort(idx.begin(), idx.end(), [&](size_t a, size_t b) { return ret[a].second > ret[b].second; });
|
||||
|
||||
Vector<TokenResult> sortedRet;
|
||||
Vector<SpecialState> sortedSpStatesByRet;
|
||||
Vector<PackedState> sortedSpStatesByRet;
|
||||
const size_t maxCands = min(topN * 2, validTarget);
|
||||
for (size_t i = 0; i < maxCands; ++i)
|
||||
{
|
||||
|
|
@ -763,8 +787,16 @@ namespace kiwi
|
|||
for (size_t i = 0; i < maxCands; ++i)
|
||||
{
|
||||
ret.emplace_back(move(sortedRet[i]));
|
||||
spStatesByRet.emplace_back(sortedSpStatesByRet[i]);
|
||||
if (!!(matchOptions & Match::oovTotalConsistency))
|
||||
{
|
||||
spStatesByRet.emplace_back(sortedSpStatesByRet[i].specialState(), sortedSpStatesByRet[i].oovCntArenaPtr() - oovCntArenaMinPtr);
|
||||
}
|
||||
else
|
||||
{
|
||||
spStatesByRet.emplace_back(sortedSpStatesByRet[i]);
|
||||
}
|
||||
}
|
||||
if (!!(matchOptions & Match::oovTotalConsistency)) oovTotalCnt.erase(oovTotalCnt.begin(), oovTotalCnt.begin() + oovCntArenaMinPtr);
|
||||
}
|
||||
|
||||
inline void makePretokenizedSpanGroup(
|
||||
|
|
@ -960,14 +992,24 @@ namespace kiwi
|
|||
throw invalid_argument{ "`cutOffThreshold` should be >= 0." };
|
||||
}
|
||||
|
||||
if (unkFormScoreScale < 0)
|
||||
if (oovRuleScale < 0)
|
||||
{
|
||||
throw invalid_argument{ "`unkFormScoreScale` should be >= 0." };
|
||||
throw invalid_argument{ "`oovRuleScale` should be >= 0." };
|
||||
}
|
||||
|
||||
if (unkFormScoreBias < 0)
|
||||
if (oovGlobalWeight <= 0)
|
||||
{
|
||||
throw invalid_argument{ "`unkFormScoreBias` should be >= 0." };
|
||||
throw invalid_argument{ "`oovGlobalWeight` should be > 0." };
|
||||
}
|
||||
|
||||
if (oovLocalWeight <= 0)
|
||||
{
|
||||
throw invalid_argument{ "`oovLocalWeight` should be > 0." };
|
||||
}
|
||||
|
||||
if (oovGlobalMinFreq <= 0)
|
||||
{
|
||||
throw invalid_argument{ "`oovGlobalMinFreq` should be >= 0." };
|
||||
}
|
||||
|
||||
if (spacePenalty <= 0)
|
||||
|
|
@ -986,6 +1028,9 @@ namespace kiwi
|
|||
}
|
||||
}
|
||||
|
||||
std::ostream* logStream = &std::cerr;
|
||||
int doLogging = 0;
|
||||
|
||||
vector<TokenResult> Kiwi::analyze(const u16string& str, size_t topN, AnalyzeOption option,
|
||||
const vector<PretokenizedSpan>& pretokenized,
|
||||
const optional<KiwiConfig>& overrideConfig
|
||||
|
|
@ -1004,6 +1049,17 @@ namespace kiwi
|
|||
|
||||
if (!!(option.match & Match::normalizeCoda)) normalizeCoda(normalizedStr.begin(), normalizedStr.end());
|
||||
|
||||
if ((option.match & Match::oovMask) >= Match::oovChrModel && !nounChrMdl)
|
||||
{
|
||||
throw invalid_argument{ "`oovChrModel` option is set but the character-level noun model is not loaded." };
|
||||
}
|
||||
|
||||
if (option.allowedDialects != Dialect::standard && option.typoTransformer == nullptr)
|
||||
{
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::dialect);
|
||||
option.typoThreshold = 2.5f;
|
||||
}
|
||||
|
||||
makePretokenizedSpanGroup(
|
||||
pretokenizedGroup,
|
||||
pretokenized,
|
||||
|
|
@ -1019,18 +1075,59 @@ namespace kiwi
|
|||
wordPositions.clear();
|
||||
getWordPositions(wordPositions, str.begin(), str.end());
|
||||
|
||||
SubstringCounter substringCounter;
|
||||
if ((option.match & Match::oovMask) >= Match::oovChrFreqModel || !!(option.match & Match::oovTotalConsistency))
|
||||
{
|
||||
thread_local Vector<char16_t> filteredStr;
|
||||
filteredStr.clear();
|
||||
filteredStr.reserve(normalizedStr.size());
|
||||
for (size_t i = 0; i < normalizedStr.size(); ++i)
|
||||
{
|
||||
auto c = normalizedStr[i];
|
||||
const POSTag chrType = identifySpecialChr(c);
|
||||
switch (chrType)
|
||||
{
|
||||
case POSTag::unknown:
|
||||
case POSTag::sf:
|
||||
case POSTag::sp:
|
||||
case POSTag::ss:
|
||||
case POSTag::sso:
|
||||
case POSTag::ssc:
|
||||
case POSTag::se:
|
||||
case POSTag::so:
|
||||
case POSTag::sw:
|
||||
case POSTag::sb:
|
||||
c = u' ';
|
||||
break;
|
||||
}
|
||||
filteredStr.emplace_back(c);
|
||||
}
|
||||
substringCounter = SubstringCounter{ filteredStr.data(), filteredStr.size() };
|
||||
}
|
||||
|
||||
vector<TokenResult> ret;
|
||||
Vector<SpecialState> spStatesByRet;
|
||||
Vector<PackedState> spStatesByRet;
|
||||
thread_local Vector<KGraphNode> nodes;
|
||||
thread_local Vector<uint32_t> nodeInWhichPretokenized;
|
||||
thread_local UnorderedMap<U16StringView, size_t> oovTotalMap;
|
||||
thread_local UnorderedMap<OovOrForm,Vector<uint16_t>> oovPrefixLists;
|
||||
thread_local Vector<uint8_t> oovTotalCnt;
|
||||
oovTotalMap.clear();
|
||||
oovPrefixLists.clear();
|
||||
oovTotalCnt.clear();
|
||||
const auto* pretokenizedFirst = pretokenizedGroup.spans.data();
|
||||
const auto* pretokenizedLast = pretokenizedFirst + pretokenizedGroup.spans.size();
|
||||
size_t splitEnd = 0;
|
||||
chrono::steady_clock::time_point startTime;
|
||||
while (splitEnd < normalizedStr.size())
|
||||
{
|
||||
if (doLogging)
|
||||
{
|
||||
startTime = chrono::steady_clock::now();
|
||||
}
|
||||
nodes.clear();
|
||||
auto* pretokenizedPrev = pretokenizedFirst;
|
||||
splitEnd = (*reinterpret_cast<FnSplitByTrie>(dfSplitByTrie))(
|
||||
const size_t newSplitEnd = (*reinterpret_cast<FnSplitByTrie>(dfSplitByTrie))(
|
||||
nodes,
|
||||
forms.data(),
|
||||
typoPtrs.data(),
|
||||
|
|
@ -1040,17 +1137,27 @@ namespace kiwi
|
|||
option.match,
|
||||
option.allowedDialects,
|
||||
config.maxUnkFormSize,
|
||||
config.maxUnkFormSizeFollowedByJClass,
|
||||
config.spaceTolerance,
|
||||
option.typoTransformer,
|
||||
option.typoThreshold,
|
||||
continualTypoCost,
|
||||
lengtheningTypoCost,
|
||||
pretokenizedFirst,
|
||||
pretokenizedLast
|
||||
);
|
||||
|
||||
if (doLogging)
|
||||
{
|
||||
auto input = utf16To8(joinHangul(normalizedStr.substr(splitEnd, newSplitEnd - splitEnd)));
|
||||
*logStream << "Input: " << input << "\nNodes: " << nodes.size() << endl;
|
||||
}
|
||||
splitEnd = newSplitEnd;
|
||||
|
||||
if (nodes.size() <= 2) continue;
|
||||
findPretokenizedGroupOfNode(nodeInWhichPretokenized, nodes, pretokenizedPrev, pretokenizedFirst);
|
||||
|
||||
Vector<PathResult> res = (*reinterpret_cast<FnFindBestPath>(dfFindBestPath))(
|
||||
Vector<PathResult> res = (*reinterpret_cast<FnFindBestPath>(dfFindBestPath))(FindBestPathArgs{
|
||||
this,
|
||||
config,
|
||||
spStatesByRet,
|
||||
|
|
@ -1058,15 +1165,28 @@ namespace kiwi
|
|||
nodes.data(),
|
||||
nodes.size(),
|
||||
topN,
|
||||
(size_t)(option.match & Match::oovMask),
|
||||
!!(option.match & Match::oovTotalConsistency) ? &oovTotalMap : nullptr,
|
||||
!!(option.match & Match::oovTotalConsistency) ? &oovTotalCnt : nullptr,
|
||||
!!(option.match & Match::oovTotalConsistency) ? &oovPrefixLists : nullptr,
|
||||
!!(option.match & Match::oovTotalConsistency) ? &ret : nullptr,
|
||||
option.openEnding && splitEnd == normalizedStr.size(),
|
||||
!!(option.match & Match::splitComplex),
|
||||
!!(option.match & Match::splitSaisiot),
|
||||
!!(option.match & Match::mergeSaisiot),
|
||||
option.blocklist,
|
||||
option.allowedDialects,
|
||||
option.dialectCost
|
||||
);
|
||||
insertPathIntoResults(ret, spStatesByRet, res, topN, option.match, config.integrateAllomorph, positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
|
||||
option.dialectCost,
|
||||
(option.match & Match::oovMask) >= Match::oovChrFreqModel ? &substringCounter : nullptr
|
||||
});
|
||||
insertPathIntoResults(ret, spStatesByRet, oovTotalCnt,
|
||||
res, topN, option.match, config.integrateAllomorph,
|
||||
positionTable, wordPositions, pretokenizedGroup, nodeInWhichPretokenized);
|
||||
if (doLogging)
|
||||
{
|
||||
auto duration = chrono::duration_cast<chrono::milliseconds>(chrono::steady_clock::now() - startTime).count();
|
||||
*logStream << "Time: " << duration << "ms\n" << endl;
|
||||
}
|
||||
}
|
||||
|
||||
sort(ret.begin(), ret.end(), [](const TokenResult& a, const TokenResult& b)
|
||||
|
|
|
|||
|
|
@ -1090,6 +1090,14 @@ KiwiBuilder::KiwiBuilder(StreamProvider streamProvider, size_t _numThreads, Buil
|
|||
throw IOException{ "Cannot open required file: combiningRule.txt" };
|
||||
}
|
||||
}
|
||||
|
||||
if (auto stream = streamProvider("nounchr.mdl"))
|
||||
{
|
||||
nounChrMdl = lm::CoNgramModelBase::create(utils::createMemoryObjectFromStream(*stream),
|
||||
archType,
|
||||
false,
|
||||
(modelType == ModelType::cong || modelType == ModelType::congGlobal));
|
||||
}
|
||||
}
|
||||
|
||||
KiwiBuilder::KiwiBuilder(const string& modelPath, size_t _numThreads, BuildOption _options, ModelType _modelType, Dialect _enabledDialects)
|
||||
|
|
@ -2378,6 +2386,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
|
|||
{
|
||||
Kiwi ret{ archType, langMdl, !typos.empty(), typos.isContinualTypoEnabled(), typos.isLengtheningTypoEnabled() };
|
||||
ret.enabledDialects = enabledDialects;
|
||||
ret.nounChrMdl = nounChrMdl;
|
||||
|
||||
Vector<FormRaw> combinedForms;
|
||||
Vector<MorphemeRaw> combinedMorphemes;
|
||||
|
|
@ -2459,6 +2468,7 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
|
|||
for (size_t i = 0; i < defaultFormSize; ++i)
|
||||
{
|
||||
formTrie[i + 1].val = &ret.forms[i];
|
||||
ret.forms[i].hasAnyFullMorphemes = true;
|
||||
}
|
||||
|
||||
Vector<const Form*> sortedForms;
|
||||
|
|
@ -2477,6 +2487,17 @@ Kiwi KiwiBuilder::build(const TypoTransformer& typos, float typoCostThreshold) c
|
|||
f.polar = accumulate(f.candidate.begin(), f.candidate.end(), f.candidate[0]->polar, reducePolar);
|
||||
}
|
||||
|
||||
f.hasJClass = any_of(f.candidate.begin(), f.candidate.end(), [&](const Morpheme* m)
|
||||
{
|
||||
return isJClass(m->tag) || m->tag == POSTag::ec || m->tag == POSTag::ef;
|
||||
});
|
||||
|
||||
f.hasAnyFullMorphemes = any_of(f.candidate.begin(), f.candidate.end(), [&](const Morpheme* m)
|
||||
{
|
||||
const auto tag = clearIrregular(m->tag);
|
||||
return m->dialect == Dialect::standard && tag != POSTag::unknown && tag != POSTag::pa && tag != POSTag::pv;
|
||||
});
|
||||
|
||||
f.dialect = accumulate(f.candidate.begin(), f.candidate.end(), f.candidate[0]->dialect, reduceDialect);
|
||||
if (f.dialect != Dialect::standard && !(enabledDialects & f.dialect))
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
#include <kiwi/Kiwi.h>
|
||||
#include "UnkFormScorer.h"
|
||||
|
||||
namespace kiwi
|
||||
{
|
||||
|
|
@ -13,6 +14,11 @@ namespace kiwi
|
|||
{
|
||||
}
|
||||
|
||||
explicit SpecialState(uint8_t val)
|
||||
{
|
||||
reinterpret_cast<uint8_t&>(*this) = val;
|
||||
}
|
||||
|
||||
operator uint8_t() const
|
||||
{
|
||||
return reinterpret_cast<const uint8_t&>(*this);
|
||||
|
|
@ -68,43 +74,231 @@ namespace kiwi
|
|||
};
|
||||
using Path = Vector<PathNode>;
|
||||
|
||||
struct PackedState
|
||||
{
|
||||
uint32_t data = 0;
|
||||
PackedState() = default;
|
||||
PackedState(SpecialState state, uint32_t oovCntArenaPtr = 0)
|
||||
{
|
||||
data = (oovCntArenaPtr << 8) | (uint8_t)state;
|
||||
}
|
||||
|
||||
SpecialState specialState() const
|
||||
{
|
||||
return (SpecialState)(uint8_t)(data & 0xFF);
|
||||
}
|
||||
|
||||
uint32_t oovCntArenaPtr() const
|
||||
{
|
||||
return data >> 8;
|
||||
}
|
||||
|
||||
void setSpecialState(SpecialState state)
|
||||
{
|
||||
data = (data & 0xFFFFFF00) | (uint8_t)state;
|
||||
}
|
||||
|
||||
void setOovCntArenaPtr(uint32_t ptr)
|
||||
{
|
||||
data = (data & 0xFF) | (ptr << 8);
|
||||
}
|
||||
|
||||
bool operator<(const PackedState& o) const
|
||||
{
|
||||
return data < o.data;
|
||||
}
|
||||
|
||||
bool operator==(const PackedState& o) const
|
||||
{
|
||||
return data == o.data;
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Hash<PackedState>
|
||||
{
|
||||
size_t operator()(const PackedState& s) const
|
||||
{
|
||||
return std::hash<uint32_t>{}(s.data);
|
||||
}
|
||||
};
|
||||
|
||||
struct PathResult
|
||||
{
|
||||
Path path;
|
||||
float score = 0;
|
||||
SpecialState prevState;
|
||||
SpecialState curState;
|
||||
PackedState prevState;
|
||||
PackedState curState;
|
||||
|
||||
PathResult(Path&& _path = {}, float _score = 0, SpecialState _prevState = {}, SpecialState _curState = {})
|
||||
PathResult(Path&& _path = {}, float _score = 0, PackedState _prevState = {}, PackedState _curState = {})
|
||||
: path{ move(_path) }, score{ _score }, prevState{ _prevState }, curState{ _curState }
|
||||
{
|
||||
sizeof(PathResult);
|
||||
}
|
||||
|
||||
PathResult(const Path& _path, float _score = 0, SpecialState _prevState = {}, SpecialState _curState = {})
|
||||
// Copying overload: initialises the same four members as the `Path&&` constructor,
// but copies `_path` instead of moving from it.
PathResult(const Path& _path, float _score = 0, PackedState _prevState = {}, PackedState _curState = {})
|
||||
: path{ _path }, score{ _score }, prevState{ _prevState }, curState{ _curState }
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<class LangModel>
|
||||
struct BestPathFinder
|
||||
class OovOrForm : public U16StringView
|
||||
{
|
||||
static Vector<PathResult> findBestPath(const Kiwi* kw,
|
||||
const KiwiConfig& config,
|
||||
const Vector<SpecialState>& prevSpStates,
|
||||
const KString& normForm,
|
||||
const KGraphNode* graph,
|
||||
const size_t graphSize,
|
||||
const size_t topN,
|
||||
bool openEnding,
|
||||
bool splitComplex = false,
|
||||
bool splitSaisiot = false,
|
||||
bool mergeSaisiot = false,
|
||||
const std::unordered_set<const Morpheme*>* blocklist = nullptr,
|
||||
Dialect allowedDialects = Dialect::standard,
|
||||
float dialectCost = 0.f
|
||||
);
|
||||
public:
|
||||
// OOV string given as pointer + length. An empty string is stored with a null data
// pointer, so that asForm() (which reinterprets data() when size() == 0) yields nullptr.
explicit OovOrForm(const char16_t* str, size_t len) : U16StringView{ len ? str : nullptr, len } {}
|
||||
// OOV string given as a view; delegates to the pointer + length constructor above.
OovOrForm(U16StringView str) : OovOrForm{ str.data(), str.size() } {}
|
||||
// Known form: the Form pointer itself is stored in the view's data pointer with size 0.
// The pointer is only a tag here; it is never read as characters by this class.
explicit OovOrForm(const Form* form) : U16StringView{ reinterpret_cast<const char16_t*>(form), 0 } {}
|
||||
|
||||
// Returns the stored Form pointer, or nullptr when this object holds a non-empty
// OOV string. A zero-length view carries the Form pointer in data().
const Form* asForm() const
{
	return size() > 0 ? nullptr : reinterpret_cast<const Form*>(data());
}
|
||||
|
||||
// Returns the underlying string view (a copy of the base-class sub-object).
U16StringView asOov() const
{
	const U16StringView& base = *this;
	return base;
}
|
||||
|
||||
bool operator==(const OovOrForm& o) const
|
||||
{
|
||||
const Form* form1 = asForm();
|
||||
const Form* form2 = o.asForm();
|
||||
if (form1 && form2)
|
||||
{
|
||||
return form1 == form2;
|
||||
}
|
||||
else if (!form1 && !form2)
|
||||
{
|
||||
return asOov() == o.asOov();
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using FnFindBestPath = decltype(&BestPathFinder<void>::findBestPath);
|
||||
template<>
|
||||
struct Hash<OovOrForm>
|
||||
{
|
||||
size_t operator()(const OovOrForm& o) const
|
||||
{
|
||||
const Form* form = o.asForm();
|
||||
if (form)
|
||||
{
|
||||
return Hash<const Form*>{}(form);
|
||||
}
|
||||
else
|
||||
{
|
||||
return Hash<U16StringView>{}(o.asOov());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Argument bundle for BestPathFinder<LangModel>::findBestPath.
// It is brace-initialised positionally by the caller, so the ORDER of the fields below is
// part of the interface: do not reorder them, and add new fields (with defaults) at the end.
struct FindBestPathArgs
|
||||
{
|
||||
// The analyzer on whose behalf the search runs (the caller passes `this`).
const Kiwi* kw;
|
||||
const KiwiConfig& config;
|
||||
// Packed states carried over from earlier results (the caller passes `spStatesByRet`).
const Vector<PackedState>& prevSpStates;
|
||||
// NOTE(review): presumably the normalized input text the graph was built from; confirm at the call site.
const KString& normForm;
|
||||
// Graph nodes as a raw array plus its length (the caller passes `nodes.data()` / `nodes.size()`).
const KGraphNode* graph;
|
||||
size_t graphSize;
|
||||
size_t topN;
|
||||
// The caller passes `option.match & Match::oovMask` converted to size_t.
size_t oovScoringType;
|
||||
// The next four pointers are non-null only when the caller has Match::oovTotalConsistency set.
// A non-null `oovTotalMap` is also what makes findBestPath pick findBestPathDispatched<true>.
UnorderedMap<U16StringView, size_t>* oovTotalMap;
|
||||
Vector<uint8_t>* oovTotalCnt;
|
||||
UnorderedMap<OovOrForm, Vector<uint16_t>>* oovPrefixLists;
|
||||
const std::vector<TokenResult>* prevResults = nullptr;
|
||||
// The caller sets this when `option.openEnding` is on and the current split ends at the end of the text.
bool openEnding;
|
||||
// Mirror the Match::splitComplex / splitSaisiot / mergeSaisiot option bits.
bool splitComplex = false;
|
||||
bool splitSaisiot = false;
|
||||
bool mergeSaisiot = false;
|
||||
const std::unordered_set<const Morpheme*>* blocklist = nullptr;
|
||||
Dialect allowedDialects = Dialect::standard;
|
||||
float dialectCost = 0.f;
|
||||
// Non-null only when the OOV mode selected by the caller is Match::oovChrFreqModel or above.
const SubstringCounter* substringCounter = nullptr;
|
||||
};
|
||||
|
||||
class OovUnigramScorer
|
||||
{
|
||||
const UnorderedMap<U16StringView, size_t>* oovTotalMap = nullptr;
|
||||
const Vector<uint8_t>* oovTotalCnt = nullptr;
|
||||
const KGraphNode* graph = nullptr;
|
||||
const uint32_t* oovCands = nullptr;
|
||||
size_t oovCandSize = 0;
|
||||
float smoothness = 0;
|
||||
|
||||
public:
|
||||
OovUnigramScorer(
|
||||
const UnorderedMap<U16StringView, size_t>* _oovTotalMap,
|
||||
const Vector<uint8_t>* _oovTotalCnt,
|
||||
const KGraphNode* _graph,
|
||||
const uint32_t* _oovCands,
|
||||
size_t _oovCandSize,
|
||||
float _smoothness
|
||||
)
|
||||
: oovTotalMap{ _oovTotalMap }, oovTotalCnt{ _oovTotalCnt }, graph{ _graph }, oovCands{ _oovCands }, oovCandSize{ _oovCandSize }, smoothness{ _smoothness }
|
||||
{
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
return oovCandSize == 0;
|
||||
}
|
||||
|
||||
float score(uint32_t cntArenaPtr, uint32_t nodeIdx) const;
|
||||
};
|
||||
|
||||
template<class LangModel>
|
||||
struct BestPathFinder : public FindBestPathArgs
|
||||
{
|
||||
using LmState = typename LangModel::LmStateType;
|
||||
|
||||
size_t insertOovPrefices(size_t targetNodeIdx, size_t oovIdx);
|
||||
|
||||
template<class WordLL, class Func>
|
||||
void traverseNodesWithEndPos(
|
||||
Vector<WordLL>& pathes,
|
||||
const Vector<size_t>& pathIndices,
|
||||
size_t targetNodeIdx,
|
||||
Func&& func
|
||||
) const;
|
||||
|
||||
template<class WordLL>
|
||||
void updateOovTotalMap(
|
||||
Vector<WordLL>& pathes,
|
||||
Vector<size_t>& pathIndices,
|
||||
size_t prevOovIdx, size_t bit, size_t i = -1);
|
||||
|
||||
template<class WordLL>
|
||||
void updatePrefixCnts(
|
||||
Vector<WordLL>& pathes,
|
||||
Vector<size_t>& pathIndices,
|
||||
size_t nodeIdx,
|
||||
const Vector<uint32_t>& currentOovNodeIdcs);
|
||||
|
||||
void findOovNodes(
|
||||
size_t nodeIdx,
|
||||
Vector<uint32_t>& oovNodeIdcs
|
||||
) const;
|
||||
|
||||
template<bool useOovTotalConsistency>
|
||||
Vector<PathResult> findBestPathDispatched();
|
||||
|
||||
static Vector<PathResult> findBestPath(const FindBestPathArgs& args)
|
||||
{
|
||||
BestPathFinder<LangModel> finder{ args };
|
||||
if (args.oovTotalMap)
|
||||
{
|
||||
return finder.findBestPathDispatched<true>();
|
||||
}
|
||||
else
|
||||
{
|
||||
return finder.findBestPathDispatched<false>();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
using FnFindBestPath = Vector<PathResult>(*)(const FindBestPathArgs&);
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
17
src/SIMD.hpp
17
src/SIMD.hpp
|
|
@ -622,7 +622,11 @@ namespace kiwi
|
|||
{
|
||||
pa = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&a[i]));
|
||||
pb = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&b[i]));
|
||||
#ifdef _MSC_VER
|
||||
acc = _mm256_dpbusd_avx_epi32(acc, pa, pb);
|
||||
#else
|
||||
acc = _mm256_dpbusd_epi32(acc, pa, pb);
|
||||
#endif
|
||||
}
|
||||
// reduce sum of eight int32_t to one int32_t
|
||||
__m256i sum = _mm256_hadd_epi32(acc, acc);
|
||||
|
|
@ -892,11 +896,18 @@ namespace kiwi
|
|||
static STRONG_INLINE int32_t dotprod(const uint8_t* a, const int8_t* b, size_t size)
|
||||
{
|
||||
int32x4_t sum = vdupq_n_s32(0);
|
||||
uint16x8_t pa;
|
||||
int8x16_t pb;
|
||||
for (size_t i = 0; i < size; i += 16)
|
||||
{
|
||||
//
|
||||
uint8x16_t pa = vld1q_u8(a + i);
|
||||
int8x16_t pb = vld1q_s8(b + i);
|
||||
// Extend a (uint8, 0-255) to int16 via zero-extend, b (int8) via sign-extend
|
||||
// Product range: 0*(-128) to 255*127 = [-32640, 32385], fits in int16
|
||||
int16x8_t pa_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pa)));
|
||||
int16x8_t pa_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pa)));
|
||||
int16x8_t pb_lo = vmovl_s8(vget_low_s8(pb));
|
||||
int16x8_t pb_hi = vmovl_s8(vget_high_s8(pb));
|
||||
sum = vpadalq_s16(sum, vmulq_s16(pa_lo, pb_lo));
|
||||
sum = vpadalq_s16(sum, vmulq_s16(pa_hi, pb_hi));
|
||||
}
|
||||
sum = vpaddq_s32(sum, sum);
|
||||
sum = vpaddq_s32(sum, sum);
|
||||
|
|
|
|||
|
|
@ -260,27 +260,20 @@ namespace kiwi
|
|||
}
|
||||
}
|
||||
|
||||
template<class Ty>
|
||||
class ContainerSearcher
|
||||
{
|
||||
std::vector<const Ty*> data;
|
||||
std::vector<size_t> idx;
|
||||
const size_t* idcs;
|
||||
const size_t size;
|
||||
public:
|
||||
template<class AllocA, class AllocB>
|
||||
ContainerSearcher(const std::vector<std::vector<Ty, AllocB>, AllocA>& v)
|
||||
: data(v.size()), idx(v.size())
|
||||
template<class Alloc>
|
||||
ContainerSearcher(const std::vector<size_t, Alloc>& _idcs)
|
||||
: idcs(_idcs.data()), size(_idcs.size())
|
||||
{
|
||||
for (size_t i = 0; i < v.size(); ++i)
|
||||
{
|
||||
data[i] = v[i].data();
|
||||
}
|
||||
|
||||
sortWriteIdx(data.begin(), data.end(), idx.begin());
|
||||
}
|
||||
|
||||
size_t operator()(const Ty* v) const
|
||||
size_t operator()(size_t v) const
|
||||
{
|
||||
return idx[(std::upper_bound(data.begin(), data.end(), v) - data.begin()) - 1];
|
||||
return std::upper_bound(idcs, idcs + size, v) - idcs - 1;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -481,28 +481,7 @@ namespace kiwi
|
|||
inline KString normalizeHangul(It first, It last)
|
||||
{
|
||||
KString ret;
|
||||
ret.reserve((size_t)(std::distance(first, last) * 1.5));
|
||||
for (; first != last; ++first)
|
||||
{
|
||||
char16_t c = *first;
|
||||
if (c == 0xB42C) c = 0xB410; // '됬'을 '됐'으로 강제교정
|
||||
if (isHangulSyllable(c))
|
||||
{
|
||||
int coda = (c - 0xAC00) % 28;
|
||||
ret.push_back(c - coda);
|
||||
if (coda) ret.push_back(coda + 0x11A7);
|
||||
}
|
||||
else if (!ret.empty() && isHangulOnset(ret.back())
|
||||
&& 0x1161 <= c && c < 0x1176)
|
||||
{
|
||||
// 첫가끝 초성 + 중성 중 현대한글 음절로 가능한 것은 결합
|
||||
ret.back() = (char16_t)(0xAC00 + ((ret.back() - 0x1100) * 21 * 28) + ((c - 0x1161) * 28));
|
||||
}
|
||||
else
|
||||
{
|
||||
ret.push_back(c);
|
||||
}
|
||||
}
|
||||
normalizeHangul(ret, first, last);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
|||
194
src/SubstringCounter.hpp
Normal file
194
src/SubstringCounter.hpp
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <algorithm>
|
||||
#include <bitset>
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
|
||||
#include <kiwi/Types.h>
|
||||
|
||||
namespace kiwi
|
||||
{
|
||||
class SubstringCounter
|
||||
{
|
||||
struct Entry
|
||||
{
|
||||
const char16_t* ptr = nullptr;
|
||||
uint32_t hash = 0;
|
||||
uint16_t length = 0;
|
||||
uint16_t count = 0;
|
||||
};
|
||||
|
||||
Vector<Entry> table;
|
||||
uint32_t mask = 0;
|
||||
size_t entryCount = 0;
|
||||
Vector<char16_t> chars;
|
||||
|
||||
static constexpr uint32_t kPrime = 0x01000193;
|
||||
static constexpr uint32_t kOffset = 0x811c9dc5;
|
||||
|
||||
void grow()
|
||||
{
|
||||
size_t newSize = table.size() * 2;
|
||||
uint32_t newMask = (uint32_t)(newSize - 1);
|
||||
Vector<Entry> newTable(newSize);
|
||||
for (auto& e : table)
|
||||
{
|
||||
if (!e.ptr) continue;
|
||||
size_t slot = e.hash & newMask;
|
||||
while (newTable[slot].ptr)
|
||||
{
|
||||
slot = (slot + 1) & newMask;
|
||||
}
|
||||
newTable[slot] = e;
|
||||
}
|
||||
table = std::move(newTable);
|
||||
mask = newMask;
|
||||
}
|
||||
|
||||
void insertOrIncrement(uint32_t hash, const char16_t* ptr, size_t length)
|
||||
{
|
||||
if (entryCount * 5 > table.size() * 3) // load factor > 0.6
|
||||
{
|
||||
grow();
|
||||
}
|
||||
|
||||
size_t slot = hash & mask;
|
||||
while (true)
|
||||
{
|
||||
auto& e = table[slot];
|
||||
if (!e.ptr)
|
||||
{
|
||||
e.hash = hash;
|
||||
e.ptr = ptr;
|
||||
e.length = (uint16_t)length;
|
||||
e.count = 1;
|
||||
++entryCount;
|
||||
return;
|
||||
}
|
||||
if (e.hash == hash && e.length == length &&
|
||||
std::memcmp(e.ptr, ptr, length * sizeof(char16_t)) == 0)
|
||||
{
|
||||
if (e.count < std::numeric_limits<decltype(e.count)>::max())
|
||||
{
|
||||
++e.count;
|
||||
}
|
||||
return;
|
||||
}
|
||||
slot = (slot + 1) & mask;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
SubstringCounter() = default;
|
||||
|
||||
SubstringCounter(const char16_t* data, size_t size, size_t maxLen = 32)
|
||||
{
|
||||
// estimate initial table size
|
||||
size_t estimatedEntries = size * 8;
|
||||
size_t tableSize = 16;
|
||||
while (tableSize < estimatedEntries * 2) tableSize *= 2;
|
||||
table.resize(tableSize);
|
||||
mask = (uint32_t)(tableSize - 1);
|
||||
entryCount = 0;
|
||||
|
||||
// collect unique chars
|
||||
std::bitset<0x10000> seen;
|
||||
|
||||
size_t segStart = 0;
|
||||
for (size_t s = 0; s <= size; ++s)
|
||||
{
|
||||
if (s == size || data[s] == u' ')
|
||||
{
|
||||
for (size_t i = segStart; i < s; ++i)
|
||||
{
|
||||
uint32_t rollingHash = 0;
|
||||
const size_t jEnd = std::min(i + maxLen, s);
|
||||
for (size_t j = i; j < jEnd; ++j)
|
||||
{
|
||||
const auto c = data[j];
|
||||
if (j == i)
|
||||
rollingHash = initHash(c);
|
||||
else
|
||||
rollingHash = extendHash(rollingHash, c);
|
||||
|
||||
insertOrIncrement(rollingHash, &data[i], j - i + 1);
|
||||
|
||||
if (!seen[c])
|
||||
{
|
||||
seen[c] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
segStart = s + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// build sorted unique chars vector
|
||||
for (size_t i = 0; i < 0x10000; ++i)
|
||||
{
|
||||
if (seen[i])
|
||||
{
|
||||
chars.push_back((char16_t)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t count(uint32_t hash, const char16_t* data, size_t len) const
|
||||
{
|
||||
if (table.empty()) return 0;
|
||||
size_t slot = hash & mask;
|
||||
while (true)
|
||||
{
|
||||
auto& e = table[slot];
|
||||
if (!e.ptr) return 0;
|
||||
if (e.hash == hash && e.length == len &&
|
||||
std::memcmp(e.ptr, data, len * sizeof(char16_t)) == 0)
|
||||
{
|
||||
return e.count;
|
||||
}
|
||||
slot = (slot + 1) & mask;
|
||||
}
|
||||
}
|
||||
|
||||
size_t count(U16StringView str) const
|
||||
{
|
||||
return count(hash(str), str.data(), str.size());
|
||||
}
|
||||
|
||||
const Vector<char16_t>& getUniqueChars() const
|
||||
{
|
||||
return chars;
|
||||
}
|
||||
|
||||
static uint32_t initHash(char16_t c)
|
||||
{
|
||||
return (uint32_t)c * kPrime + kOffset;
|
||||
}
|
||||
|
||||
static uint32_t extendHash(uint32_t prev, char16_t c)
|
||||
{
|
||||
return prev * kPrime + (uint32_t)c;
|
||||
}
|
||||
|
||||
static uint32_t hash(const char16_t* data, size_t len)
|
||||
{
|
||||
if (len == 0) return kOffset;
|
||||
uint32_t h = initHash(data[0]);
|
||||
for (size_t i = 1; i < len; ++i)
|
||||
{
|
||||
h = extendHash(h, data[i]);
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
static uint32_t hash(U16StringView str)
|
||||
{
|
||||
return hash(str.data(), str.size());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
#include <kiwi/Utils.h>
|
||||
#include "StrUtils.h"
|
||||
#include "FrozenTrie.hpp"
|
||||
#include "FeatureTestor.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace kiwi;
|
||||
|
|
@ -224,7 +225,7 @@ void TypoTransformer::addTypoWithCond(const KString& orig, const KString& error,
|
|||
{
|
||||
if (orig == error) return;
|
||||
|
||||
if (leftCond == CondVowel::none || leftCond == CondVowel::vowel || leftCond == CondVowel::any)
|
||||
if (leftCond == CondVowel::none || leftCond == CondVowel::vowel || leftCond == CondVowel::any || leftCond == CondVowel::continual || leftCond == CondVowel::boundary)
|
||||
{
|
||||
auto inserted = typos.emplace(make_tuple(orig, error, leftCond, dialect), cost);
|
||||
if (!inserted.second)
|
||||
|
|
@ -431,13 +432,19 @@ namespace kiwi
|
|||
|
||||
PreparedTypoTransformer::PreparedTypoTransformer() = default;
|
||||
|
||||
PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt)
|
||||
PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt, bool inverse)
|
||||
: continualTypoThreshold{ tt.continualTypoThreshold }, lengtheningTypoThreshold{ tt.lengtheningTypoThreshold }
|
||||
{
|
||||
IntermediateTypoTransformer itt;
|
||||
for (auto& t : tt.typos)
|
||||
{
|
||||
itt.addTypo(get<0>(t.first), get<1>(t.first), t.second, get<2>(t.first), get<3>(t.first));
|
||||
itt.addTypo(
|
||||
inverse ? get<1>(t.first) : get<0>(t.first),
|
||||
inverse ? get<0>(t.first) : get<1>(t.first),
|
||||
t.second,
|
||||
get<2>(t.first),
|
||||
get<3>(t.first)
|
||||
);
|
||||
}
|
||||
strPool = std::move(itt.strPool);
|
||||
|
||||
|
|
@ -445,20 +452,25 @@ PreparedTypoTransformer::PreparedTypoTransformer(const TypoTransformer& tt)
|
|||
for (auto& rs : itt.replacements) tot += rs.size();
|
||||
replacements.reserve(tot);
|
||||
|
||||
Vector<std::pair<const ReplInfo*, uint32_t>> patData;
|
||||
Vector<pair<const ReplInfo*, uint32_t>> patData;
|
||||
for (auto& rs : itt.replacements)
|
||||
{
|
||||
patData.emplace_back(replacements.data() + replacements.size(), rs.size());
|
||||
for (auto& r : rs)
|
||||
{
|
||||
replacements.emplace_back(strPool.data() + r.begin, r.end - r.begin, r.cost, r.leftCond, r.dialect);
|
||||
auto rBegin = r.begin;
|
||||
if (inverse && r.leftCond == CondVowel::applosive && strPool[rBegin] == 0)
|
||||
{
|
||||
rBegin++;
|
||||
}
|
||||
replacements.emplace_back(strPool.data() + rBegin, r.end - rBegin, r.cost, r.leftCond, r.dialect);
|
||||
}
|
||||
}
|
||||
|
||||
patTrie = decltype(patTrie){ itt.patTrie, ArchTypeHolder<ArchType::none>{}, [&](const IntermediateTypoTransformer::TrieNode& o) -> PatInfo
|
||||
{
|
||||
uint32_t depth = o.depth;
|
||||
if (o.val && patData[o.val - 1].first->leftCond == CondVowel::applosive)
|
||||
if (!inverse && o.val && patData[o.val - 1].first->leftCond == CondVowel::applosive)
|
||||
{
|
||||
depth--;
|
||||
}
|
||||
|
|
@ -569,6 +581,463 @@ TypoCandidates<true> PreparedTypoTransformer::generate(const u16string& orig, fl
|
|||
return _generate<true>(normalizeHangul(orig), costThreshold);
|
||||
}
|
||||
|
||||
/*
|
||||
Given a typo rule such as
|
||||
* ㄷ이 -> 지
|
||||
the character DAG for the input '구지' is built as follows.
|
||||
|
||||
Idx 0 1 2
|
||||
BOS -> 구 -> 지 -> EOS
|
||||
-> ㄷ -> 이 -> EOS
|
||||
*/
|
||||
|
||||
template<class Alloc, class... Args>
|
||||
inline bool appendNewNode(vector<TypoGraphNode, Alloc>& nodes, Vector<pair<uint32_t, uint32_t>>& endPosMap, size_t endPosMapOffset, U16StringView form, size_t startPos, size_t endPos, Args&&... args)
|
||||
{
|
||||
static constexpr uint32_t npos = -1;
|
||||
|
||||
if (startPos != -1 && endPosMap[startPos - endPosMapOffset].first == npos)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t newId = nodes.size();
|
||||
nodes.emplace_back(form, endPos, forward<Args>(args)...);
|
||||
TypoGraphNode& nnode = nodes.back();
|
||||
|
||||
if (startPos == -1)
|
||||
{
|
||||
nnode.prevOffset = newId - 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
nnode.prevOffset = endPosMap[startPos - endPosMapOffset].first; // absolute offset for now, will be converted to relative offset later
|
||||
}
|
||||
if (nnode.endPos >= endPosMap.size() + endPosMapOffset) return true;
|
||||
|
||||
if (endPosMap[nnode.endPos - endPosMapOffset].first == npos)
|
||||
{
|
||||
endPosMap[nnode.endPos - endPosMapOffset].first = newId;
|
||||
}
|
||||
else
|
||||
{
|
||||
nodes[endPosMap[nnode.endPos - endPosMapOffset].second].siblingOffset = newId; // absolute offset for now, will be converted to relative offset later
|
||||
}
|
||||
endPosMap[nnode.endPos - endPosMapOffset].second = newId;
|
||||
return true;
|
||||
}
|
||||
|
||||
// onset: ㅇ=11, ㅎ=18
|
||||
inline char16_t overrideOnset(char16_t c, const int onset = 11)
|
||||
{
|
||||
if (!isHangulSyllable(c)) return 0;
|
||||
const int vowel = (c - 0xAC00) / 28 % 21;
|
||||
const int coda = (c - 0xAC00) % 28;
|
||||
return 0xAC00 + onset * 28 * 21 + vowel * 28 + coda;
|
||||
}
|
||||
|
||||
// 받침 + 초성 ㅇ이 연철된 경우
|
||||
struct ContinualIeungDecomposer
|
||||
{
|
||||
static constexpr size_t boundaryId = 1;
|
||||
char16_t onsetToCoda(char16_t c, char16_t prev)
|
||||
{
|
||||
static constexpr char16_t o2c[] = {
|
||||
0x11A8, // ㄱ
|
||||
0x11A9, // ㄲ
|
||||
0x11AB, // ㄴ
|
||||
0x11AE, // ㄷ
|
||||
0, // ㄸ
|
||||
0x11AF, // ㄹ
|
||||
0x11B7, // ㅁ
|
||||
0x11B8, // ㅂ
|
||||
0, // ㅃ
|
||||
0x11BA, // ㅅ
|
||||
0x11BB, // ㅆ
|
||||
0, // ㅇ
|
||||
0x11BD, // ㅈ
|
||||
0, // ㅉ
|
||||
0x11BE, // ㅊ
|
||||
0x11BF, // ㅋ
|
||||
0x11C0, // ㅌ
|
||||
0x11C1, // ㅍ
|
||||
0x11C2, // ㅎ
|
||||
};
|
||||
|
||||
if (isHangulSyllable(c))
|
||||
{
|
||||
int onset = (c - 0xAC00) / 28 / 21;
|
||||
return o2c[onset];
|
||||
}
|
||||
|
||||
switch (c)
|
||||
{
|
||||
case u'ㄱ': return 0x11A8;
|
||||
case u'ㄲ': return 0x11A9;
|
||||
case u'ㄴ': return 0x11AB;
|
||||
case u'ㄷ': return 0x11AE;
|
||||
case u'ㄹ': return 0x11AF;
|
||||
case u'ㅁ': return 0x11B7;
|
||||
case u'ㅂ': return 0x11B8;
|
||||
case u'ㅅ': return 0x11BA;
|
||||
case u'ㅆ': return 0x11BB;
|
||||
case u'ㅈ': return 0x11BD;
|
||||
case u'ㅊ': return 0x11BE;
|
||||
case u'ㅋ': return 0x11BF;
|
||||
case u'ㅌ': return 0x11C0;
|
||||
case u'ㅍ': return 0x11C1;
|
||||
case u'ㅎ': return 0x11C2;
|
||||
default: return 0;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
char16_t dropRightSyllable(char16_t c)
|
||||
{
|
||||
return overrideOnset(c, 11);
|
||||
}
|
||||
};
|
||||
|
||||
// 받침 + 초성 ㅎ이 연철된 경우
|
||||
struct ContinualHieutDecomposer
|
||||
{
|
||||
static constexpr size_t boundaryId = 2;
|
||||
char16_t onsetToCoda(char16_t c)
|
||||
{
|
||||
static constexpr char16_t o2c[] = {
|
||||
0, // ㄱ
|
||||
0, // ㄲ
|
||||
0x11AB, // ㄴ
|
||||
0, // ㄷ
|
||||
0, // ㄸ
|
||||
0x11AF, // ㄹ
|
||||
0x11B7, // ㅁ
|
||||
0, // ㅂ
|
||||
0, // ㅃ
|
||||
0x11BA, // ㅅ
|
||||
0, // ㅆ
|
||||
0, // ㅇ
|
||||
0, // ㅈ
|
||||
0, // ㅉ
|
||||
0x11BD, // ㅊ->ㅈ
|
||||
0x11A8, // ㅋ->ㄱ
|
||||
0x11AE, // ㅌ->ㄷ
|
||||
0x11B8, // ㅍ->ㅂ
|
||||
0, // ㅎ
|
||||
};
|
||||
|
||||
if (isHangulSyllable(c))
|
||||
{
|
||||
int onset = (c - 0xAC00) / 28 / 21;
|
||||
return o2c[onset];
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
char16_t dropRightSyllable(char16_t c)
|
||||
{
|
||||
return overrideOnset(c, 18);
|
||||
}
|
||||
};
|
||||
|
||||
// 받침 ㅎ + ㅎ이 아닌 초성이 연철된 경우
|
||||
struct ContinualCodaDecomposer
|
||||
{
|
||||
static constexpr size_t boundaryId = 3;
|
||||
char16_t onsetToCoda(char16_t c)
|
||||
{
|
||||
static constexpr char16_t o2c[] = {
|
||||
0, // ㄱ
|
||||
0, // ㄲ
|
||||
0, // ㄴ
|
||||
0, // ㄷ
|
||||
0, // ㄸ
|
||||
0, // ㄹ
|
||||
0, // ㅁ
|
||||
0, // ㅂ
|
||||
0, // ㅃ
|
||||
0, // ㅅ
|
||||
0, // ㅆ
|
||||
0, // ㅇ
|
||||
0, // ㅈ
|
||||
0, // ㅉ
|
||||
0x11C2, // ㅊ->ㅎ
|
||||
0x11C2, // ㅋ->ㅎ
|
||||
0x11C2, // ㅌ->ㅎ
|
||||
0x11C2, // ㅍ->ㅎ
|
||||
0, // ㅎ
|
||||
};
|
||||
|
||||
if (isHangulSyllable(c))
|
||||
{
|
||||
int onset = (c - 0xAC00) / 28 / 21;
|
||||
return o2c[onset];
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
char16_t dropRightSyllable(char16_t c)
|
||||
{
|
||||
const int onset = (c - 0xAC00) / 28 / 21;
|
||||
const int vowel = (c - 0xAC00) / 28 % 21;
|
||||
const int coda = (c - 0xAC00) % 28;
|
||||
static constexpr char16_t onsetMap[] = {
|
||||
0, // ㄱ
|
||||
0, // ㄲ
|
||||
0, // ㄴ
|
||||
0, // ㄷ
|
||||
0, // ㄸ
|
||||
0, // ㄹ
|
||||
0, // ㅁ
|
||||
0, // ㅂ
|
||||
0, // ㅃ
|
||||
0, // ㅅ
|
||||
0, // ㅆ
|
||||
0, // ㅇ
|
||||
0, // ㅈ
|
||||
0, // ㅉ
|
||||
12, // ㅊ->ㅈ
|
||||
0, // ㅋ->ㄱ
|
||||
3, // ㅌ->ㄷ
|
||||
7, // ㅍ->ㅂ
|
||||
0, // ㅎ
|
||||
};
|
||||
return 0xAC00 + (onsetMap[onset] * 21 + vowel) * 28 + coda;
|
||||
}
|
||||
};
|
||||
|
||||
template<class Alloc>
|
||||
size_t PreparedTypoTransformer::generateGraph(U16StringView str,
|
||||
vector<TypoGraphNode, Alloc>& graphOut,
|
||||
Dialect allowedDialect,
|
||||
const pair<uint32_t, uint32_t>* pretokenizedFirst,
|
||||
const pair<uint32_t, uint32_t>* pretokenizedLast,
|
||||
size_t* maxContinualTypoIdxOut
|
||||
) const
|
||||
{
|
||||
const bool continualTypoEnabled = isfinite(continualTypoThreshold);
|
||||
static constexpr size_t npos = -1;
|
||||
using MatchInfo = tuple<size_t, PatInfo>; // (endPos, patternInfo)
|
||||
thread_local Vector<TypoGraphNode> tempGraph;
|
||||
thread_local Vector<MatchInfo> matches;
|
||||
thread_local Vector<size_t> breakPoints;
|
||||
thread_local Vector<pair<uint32_t, uint32_t>> endPosMap; // (first position, last position)
|
||||
thread_local UnorderedMap<char16_t, pair<size_t, size_t>> continualTypoIdxMap;
|
||||
matches.clear();
|
||||
endPosMap.clear();
|
||||
endPosMap.emplace_back(0, 0);
|
||||
|
||||
size_t last = 0;
|
||||
tempGraph.clear();
|
||||
tempGraph.emplace_back(U16StringView{ str.data(), 0 }, 0);
|
||||
|
||||
const auto& insertBranch = [&]()
|
||||
{
|
||||
const size_t totStartPos = get<size_t>(matches[0]) - get<PatInfo>(matches[0]).patLength;
|
||||
const size_t totEndPos = get<size_t>(matches.back());
|
||||
|
||||
const auto v = endPosMap.back();
|
||||
endPosMap.clear();
|
||||
endPosMap.resize((totEndPos - last) + 1, make_pair(npos, npos));
|
||||
endPosMap[0] = v;
|
||||
|
||||
breakPoints.clear();
|
||||
breakPoints.emplace_back(totStartPos);
|
||||
for (auto& m : matches)
|
||||
{
|
||||
const size_t e = get<size_t>(m);
|
||||
const size_t s = e - get<PatInfo>(m).patLength;
|
||||
breakPoints.emplace_back(e);
|
||||
}
|
||||
breakPoints.emplace_back(totEndPos);
|
||||
sort(breakPoints.begin(), breakPoints.end());
|
||||
breakPoints.erase(unique(breakPoints.begin(), breakPoints.end()), breakPoints.end());
|
||||
|
||||
sort(matches.begin(), matches.end(), [](const MatchInfo& a, const MatchInfo& b)
|
||||
{
|
||||
return get<size_t>(a) - get<PatInfo>(a).patLength < get<size_t>(b) - get<PatInfo>(b).patLength;
|
||||
}
|
||||
);
|
||||
|
||||
if (last < totStartPos)
|
||||
{
|
||||
appendNewNode(tempGraph, endPosMap, last, U16StringView{ str.data() + last, totStartPos - last }, last, totStartPos);
|
||||
}
|
||||
for (size_t i = 1; i < breakPoints.size(); ++i)
|
||||
{
|
||||
appendNewNode(tempGraph, endPosMap, last, U16StringView{ str.data() + breakPoints[i - 1], breakPoints[i] - breakPoints[i - 1] }, breakPoints[i - 1], breakPoints[i]);
|
||||
}
|
||||
|
||||
for (auto& m : matches)
|
||||
{
|
||||
auto [endPos, patInfo] = m;
|
||||
const size_t e = endPos;
|
||||
const size_t s = e - patInfo.patLength;
|
||||
continualTypoIdxMap.clear();
|
||||
|
||||
for (size_t j = 0; j < patInfo.size; ++j)
|
||||
{
|
||||
auto& repl = patInfo.repl[j];
|
||||
if (repl.dialect != Dialect::standard && !(allowedDialect & repl.dialect)) continue;
|
||||
|
||||
if (repl.leftCond == CondVowel::vowel)
|
||||
{
|
||||
if (s == 0 || !isHangulSyllable(str[s - 1])) continue;
|
||||
}
|
||||
else if (repl.leftCond == CondVowel::any)
|
||||
{
|
||||
if (s == 0) continue;
|
||||
}
|
||||
else if (repl.leftCond == CondVowel::continual || repl.leftCond == CondVowel::boundary)
|
||||
{
|
||||
if (repl.leftCond == CondVowel::continual && (s == 0 || !isHangulSyllable(str[s - 1]))) continue;
|
||||
if (repl.leftCond == CondVowel::continual && !isfinite(continualTypoThreshold)) continue;
|
||||
const float scale = repl.leftCond == CondVowel::continual ? continualTypoThreshold : 1.f;
|
||||
const auto [it, inserted] = continualTypoIdxMap.emplace(*repl.str, make_pair(continualTypoIdxMap.size() + 1, 0));
|
||||
auto& [continualTypoIdx, continualTypoNodeIdx] = it->second;
|
||||
if (inserted)
|
||||
{
|
||||
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str, 1 }, s, -1, repl.cost * scale / 2))
|
||||
{
|
||||
tempGraph.back().endPos = e;
|
||||
tempGraph.back().continualTypoIdx = continualTypoIdx;
|
||||
tempGraph.back().dialect = repl.dialect;
|
||||
continualTypoNodeIdx = tempGraph.size() - 1;
|
||||
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str + 1, repl.length - 1 }, -1, e, repl.cost * scale / 2))
|
||||
{
|
||||
tempGraph.back().prevOffset = continualTypoNodeIdx;
|
||||
tempGraph.back().dialect = repl.dialect;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
continualTypoIdxMap.erase(it);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str + 1, repl.length - 1 }, -1, e, repl.cost * scale / 2))
|
||||
{
|
||||
tempGraph.back().prevOffset = continualTypoNodeIdx;
|
||||
tempGraph.back().dialect = repl.dialect;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!FeatureTestor::isMatched(str.data(), str.data() + s, repl.leftCond)) continue;
|
||||
}
|
||||
if (appendNewNode(tempGraph, endPosMap, last, U16StringView{ repl.str, repl.length }, s, e, repl.cost))
|
||||
{
|
||||
tempGraph.back().dialect = repl.dialect;
|
||||
}
|
||||
}
|
||||
if (maxContinualTypoIdxOut)
|
||||
{
|
||||
*maxContinualTypoIdxOut = max(*maxContinualTypoIdxOut, continualTypoIdxMap.size() + 1);
|
||||
}
|
||||
}
|
||||
last = totEndPos;
|
||||
matches.clear();
|
||||
};
|
||||
|
||||
auto node = patTrie.root()->nextOpt<ArchType::none>(patTrie, 0);
|
||||
for (size_t i = 0; i < str.size(); ++i)
|
||||
{
|
||||
if (pretokenizedFirst < pretokenizedLast && pretokenizedFirst->first == i)
|
||||
{
|
||||
const auto prevLast = last;
|
||||
if (!matches.empty())
|
||||
{
|
||||
insertBranch();
|
||||
}
|
||||
node = patTrie.root();
|
||||
appendNewNode(tempGraph, endPosMap, prevLast,
|
||||
U16StringView{ str.data() + last, pretokenizedFirst->second - last },
|
||||
last, pretokenizedFirst->second
|
||||
);
|
||||
last = pretokenizedFirst->second;
|
||||
endPosMap.clear();
|
||||
endPosMap.emplace_back(tempGraph.size() - 1, tempGraph.size() - 1);
|
||||
i += pretokenizedFirst->second - pretokenizedFirst->first - 1;
|
||||
++pretokenizedFirst;
|
||||
continue;
|
||||
}
|
||||
|
||||
auto nnode = node->nextOpt<ArchType::none>(patTrie, str[i]);
|
||||
while (!nnode)
|
||||
{
|
||||
node = node->fail();
|
||||
if (node)
|
||||
{
|
||||
nnode = node->nextOpt<ArchType::none>(patTrie, str[i]);
|
||||
}
|
||||
else
|
||||
{
|
||||
node = patTrie.root();
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!nnode) continue;
|
||||
node = nnode;
|
||||
|
||||
auto& v = node->val(patTrie);
|
||||
if (patTrie.isNull(v)) continue;
|
||||
|
||||
const size_t endPos = i + 1;
|
||||
const size_t startPos = endPos - v.patLength;
|
||||
if (!matches.empty() && get<size_t>(matches.back()) < startPos)
|
||||
{
|
||||
insertBranch();
|
||||
}
|
||||
for (auto sub = node; sub; sub = sub->fail())
|
||||
{
|
||||
auto& sv = sub->val(patTrie);
|
||||
if (patTrie.isNull(sv)) break;
|
||||
if (patTrie.hasSubmatch(sv)) continue;
|
||||
matches.emplace_back(endPos, sv);
|
||||
}
|
||||
}
|
||||
if (!matches.empty())
|
||||
{
|
||||
insertBranch();
|
||||
}
|
||||
const auto v = endPosMap.back();
|
||||
endPosMap.clear();
|
||||
endPosMap.resize(1);
|
||||
endPosMap[0] = v;
|
||||
appendNewNode(tempGraph, endPosMap, last, U16StringView{ str.data() + last, str.size() - last }, last, str.size() + 1);
|
||||
tempGraph.back().endPos = str.size();
|
||||
|
||||
auto& sortIdx = breakPoints;
|
||||
sortIdx.clear();
|
||||
sortIdx.resize(tempGraph.size() * 2);
|
||||
auto reverseIdx = sortIdx.begin() + tempGraph.size();
|
||||
iota(sortIdx.begin(), reverseIdx, 0);
|
||||
stable_sort(sortIdx.begin(), reverseIdx, [&](size_t a, size_t b)
|
||||
{
|
||||
return tempGraph[a].endPos < tempGraph[b].endPos;
|
||||
}
|
||||
);
|
||||
for (size_t i = 0; i < tempGraph.size(); ++i)
|
||||
{
|
||||
reverseIdx[sortIdx[i]] = i;
|
||||
}
|
||||
|
||||
graphOut.clear();
|
||||
graphOut.reserve(tempGraph.size());
|
||||
for (size_t i = 0; i < tempGraph.size(); ++i)
|
||||
{
|
||||
graphOut.push_back(tempGraph[sortIdx[i]]);
|
||||
auto& n = graphOut.back();
|
||||
n.prevOffset = i - reverseIdx[n.prevOffset];
|
||||
if (n.siblingOffset != 0) n.siblingOffset = reverseIdx[n.siblingOffset] - i;
|
||||
}
|
||||
return graphOut.size();
|
||||
}
|
||||
|
||||
namespace kiwi
|
||||
{
|
||||
template class TypoCandidates<true>;
|
||||
|
|
@ -579,6 +1048,12 @@ namespace kiwi
|
|||
template TypoCandidates<true> PreparedTypoTransformer::_generate<true>(const KString&, float) const;
|
||||
template TypoCandidates<false> PreparedTypoTransformer::_generate<false>(const KString&, float) const;
|
||||
|
||||
template size_t PreparedTypoTransformer::generateGraph<allocator<TypoGraphNode>>(
|
||||
U16StringView, vector<TypoGraphNode, allocator<TypoGraphNode>>&, Dialect, const pair<uint32_t, uint32_t>*, const pair<uint32_t, uint32_t>*, size_t*) const;
|
||||
#ifdef KIWI_USE_MIMALLOC
|
||||
template size_t PreparedTypoTransformer::generateGraph<mi_stl_allocator<TypoGraphNode>>(
|
||||
U16StringView, vector<TypoGraphNode, mi_stl_allocator<TypoGraphNode>>&, Dialect, const pair<uint32_t, uint32_t>*, const pair<uint32_t, uint32_t>*, size_t*) const;
|
||||
#endif
|
||||
|
||||
const TypoTransformer& getDefaultTypoSet(DefaultTypoSet set)
|
||||
{
|
||||
|
|
@ -688,6 +1163,46 @@ namespace kiwi
|
|||
TypoDef{ {u"ᆵ"}, {u"ᆯᇁ"}, 1e-12f, CondVowel::none },
|
||||
TypoDef{ {u"ᆶ"}, {u"ᆯᇂ"}, 1e-12f, CondVowel::none },
|
||||
TypoDef{ {u"ᆹ"}, {u"ᆸᆺ", u"ᆸᆻ"}, 1e-12f, CondVowel::none },
|
||||
|
||||
TypoDef{ {u"ᆨᄋ"}, {u"ᄀ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆩᄋ", u"ᆨᄀ"}, {u"ᄁ", u"ᆨᄀ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆪᄋ", u"ᆪᄒ"}, {u"ᆨᄉ", u"ᆨᄊ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆫᄋ", u"ᆫᄒ"}, {u"ᄂ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆬᄋ", u"ᆫᄌ"}, {u"ᆬᄋ", u"ᆫᄌ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆭᄋ"}, {u"ᆫᄒ", u"ᄂ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆭᄀ"}, {u"ᆫᄏ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆭᄃ"}, {u"ᆫᄐ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆭᄇ"}, {u"ᆫᄑ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆭᄉ"}, {u"ᆫᄉ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆭᄌ"}, {u"ᆫᄎ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆮᄋ"}, {u"ᄃ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆯᄋ", u"ᆯᄒ"}, {u"ᄅ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆰᄋ"}, {u"ᆯᄀ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆰᄀ"}, {u"ᆯᄁ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆰᄒ"}, {u"ᆯᄏ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆱᄋ", u"ᆱᄒ"}, {u"ᆯᄆ"}, 1.f, CondVowel::continual},
|
||||
TypoDef{ {u"ᆲᄋ"}, {u"ᆯᄇ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆲᄇ"}, {u"ᆯᄈ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆲᄒ"}, {u"ᆯᄑ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆳᄋ"}, {u"ᆯᄉ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆳᄉ"}, {u"ᆯᄊ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆴᄋ", u"ᆴᄐ", u"ᆴᄒ"}, {u"ᆯᄐ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆵᄋ", u"ᆵᄑ", u"ᆵᄒ"}, {u"ᆯᄑ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆶᄉ"}, {u"ᆯᄉ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆶᄋ", u"ᆶᄒ"}, {u"ᆯᄒ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆷᄋ", u"ᆷᄒ"}, {u"ᄆ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆸᄋ"}, {u"ᄇ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆸᄇ"}, {u"ᄈ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆹᄋ", u"ᆹᄒ"}, {u"ᆸᄉ", u"ᆸᄊ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆺᄋ"}, {u"ᄉ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆻᄋ", u"ᆺᄉ"}, {u"ᄊ", u"ᆺᄉ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆽᄋ"}, {u"ᄌ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆽᄌ"}, {u"ᄍ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆾᄋ", u"ᆾᄒ", u"ᆽᄒ", u"ᇂᄌ", u"ᇂᄎ"}, {u"ᄎ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᆿᄋ", u"ᆿᄒ", u"ᆨᄒ", u"ᇂᄀ", u"ᇂᄏ"}, {u"ᄏ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᇀᄋ", u"ᇀᄒ", u"ᆮᄒ", u"ᇂᄃ", u"ᇂᄐ"}, {u"ᄐ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᇁᄋ", u"ᇁᄒ", u"ᆸᄒ", u"ᇂᄇ", u"ᇂᄑ"}, {u"ᄑ"}, 1.f, CondVowel::continual },
|
||||
TypoDef{ {u"ᇂᄋ"}, {u"ᄒ"}, 1.f, CondVowel::continual },
|
||||
});
|
||||
|
||||
static const TypoTransformer basicTypoSetWithContinual = basicTypoSet | continualTypoSet;
|
||||
|
|
@ -700,7 +1215,22 @@ namespace kiwi
|
|||
TypoDef{ {u"ㅚ"}, {u"ㅞ"}, 0.5f, CondVowel::none },
|
||||
TypoDef{ {u"ᆻ"}, {u"ᆺ"}, 0.5f, CondVowel::none },
|
||||
TypoDef{ {u"ㅐ", u"ㅔ"}, {u"ㅐ", u"ㅔ"}, 1.f, CondVowel::none },
|
||||
}.copyWithDialectOverriding(Dialect::jeju);
|
||||
}.copyWithDialectOverriding(Dialect::jeju) | TypoTransformer{
|
||||
TypoDef{ {u"ㅣ이"}, {u"ㅣ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅏ이", u"ㅐ이"}, {u"ㅐ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅓ이", u"ㅔ이"}, {u"ㅔ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅘ이"}, {u"ㅙ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅚ이"}, {u"ㅚ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅝ이"}, {u"ㅞ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅟ이"}, {u"ㅟ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅜ우"}, {u"ㅜ"}, 0.2f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅠ우"}, {u"ㅠ"}, 0.2f, CondVowel::boundary },
|
||||
}.copyWithDialectOverriding(Dialect::hamgyeong) | TypoTransformer{
|
||||
TypoDef{ {u"ㅣ어"}, {u"ㅔ"}, 0.25f, CondVowel::boundary },
|
||||
TypoDef{ {u"ㅣ어"}, {u"ㅖ"}, 0.5f, CondVowel::boundary },
|
||||
}.copyWithDialectOverriding(Dialect::hamgyeong | Dialect::gyeongsang | Dialect::gangwon) | TypoTransformer{
|
||||
TypoDef{ {u"ㅣ었"}, {u"ㅣᆻ"}, 0.25f, CondVowel::boundary },
|
||||
}.copyWithDialectOverriding(Dialect::gyeongsang);
|
||||
|
||||
switch (set)
|
||||
{
|
||||
|
|
@ -723,4 +1253,33 @@ namespace kiwi
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Returns a pointer to a process-wide prepared (ready-to-use) version of one
 * of the built-in typo sets.
 *
 * Each prepared transformer is built lazily, the first time its own set is
 * requested, and then cached in a function-local static for the rest of the
 * process. Requesting `withoutTypo` (or passing an invalid value) therefore
 * no longer pays for preparing every built-in set.
 *
 * @param set which built-in typo set to return
 * @return pointer to the cached prepared transformer, or nullptr for
 *         `DefaultTypoSet::withoutTypo`
 * @throws invalid_argument if `set` is not a known `DefaultTypoSet` value
 */
const PreparedTypoTransformer* getDefaultPreparedTypoSet(DefaultTypoSet set)
{
	switch (set)
	{
	case kiwi::DefaultTypoSet::withoutTypo:
		return nullptr;
	case kiwi::DefaultTypoSet::basicTypoSet:
	{
		static const auto prepared = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare(true);
		return &prepared;
	}
	case kiwi::DefaultTypoSet::continualTypoSet:
	{
		static const auto prepared = getDefaultTypoSet(DefaultTypoSet::continualTypoSet).prepare(true);
		return &prepared;
	}
	case kiwi::DefaultTypoSet::basicTypoSetWithContinual:
	{
		static const auto prepared = getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual).prepare(true);
		return &prepared;
	}
	case kiwi::DefaultTypoSet::lengtheningTypoSet:
	{
		static const auto prepared = getDefaultTypoSet(DefaultTypoSet::lengtheningTypoSet).prepare(true);
		return &prepared;
	}
	case kiwi::DefaultTypoSet::basicTypoSetWithContinualAndLengthening:
	{
		static const auto prepared = getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinualAndLengthening).prepare(true);
		return &prepared;
	}
	case kiwi::DefaultTypoSet::dialect:
	{
		static const auto prepared = getDefaultTypoSet(DefaultTypoSet::dialect).prepare(true);
		return &prepared;
	}
	default:
		throw invalid_argument{ "Invalid `DefaultTypoSet`" };
	}
}
|
||||
|
||||
}
|
||||
|
|
|
|||
190
src/UnkFormScorer.cpp
Normal file
190
src/UnkFormScorer.cpp
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
#include <kiwi/Dataset.h>
|
||||
#include "UnkFormScorer.h"
|
||||
#include "SubstringCounter.hpp"
|
||||
|
||||
using namespace std;
|
||||
using namespace kiwi;
|
||||
|
||||
/**
 * Stores the scoring parameters and, when a character model is supplied,
 * feeds it token 0 (BOS) once so that `bosNodeIdx` / `bosContextIdx` hold the
 * model state every later scoring call starts from.
 *
 * @param scale                  per-code-unit factor used by ruleBasedScore
 * @param bias                   constant term used by ruleBasedScore
 * @param _chrModel              character-level model, may be null
 * @param _chrBias               constant subtracted from model-based scores
 * @param _substringCounter      local substring statistics, may be null
 * @param _globalWeight          saturation scale for global context frequency
 * @param _localWeight           saturation scale for local context frequency
 * @param _globalMinFreq         lower bound applied to global context frequency
 * @param _useChrFreqBranchModel selects chrFreqBranchBasedScore in operator()
 */
UnkFormScorer::UnkFormScorer(float scale, float bias,
	const lm::CoNgramModelBase* _chrModel, float _chrBias,
	const SubstringCounter* _substringCounter,
	float _globalWeight,
	float _localWeight,
	float _globalMinFreq,
	bool _useChrFreqBranchModel)
	: chrModel(_chrModel),
	substringCounter(_substringCounter),
	oovRuleScale(scale),
	oovRuleBias(bias),
	chrBias(_chrBias),
	globalWeight(_globalWeight),
	localWeight(_localWeight),
	globalMinFreq(_globalMinFreq),
	useChrFreqBranchModel(_useChrFreqBranchModel)
{
	// Without a character model there is no BOS state to precompute.
	if (!chrModel) return;

	// BOS: advance the member state past the begin-of-sequence token.
	chrModel->progressOneStep(bosNodeIdx, bosContextIdx, 0);
}
|
||||
|
||||
float UnkFormScorer::ruleBasedScore(const U16StringView& form) const
|
||||
{
|
||||
float penalty = 0;
|
||||
if (form.size() > 0)
|
||||
{
|
||||
char32_t chrs[2] = { 0,0 };
|
||||
for (size_t i = 0, j = 0; i < form.size() && j < 2; ++j)
|
||||
{
|
||||
if (isHighSurrogate(form[i]))
|
||||
{
|
||||
chrs[j] = mergeSurrogate(form[i], form[i + 1]);
|
||||
i += 2;
|
||||
}
|
||||
else
|
||||
{
|
||||
chrs[j] = form[i];
|
||||
++i;
|
||||
}
|
||||
}
|
||||
if (isEmoji(chrs[0], chrs[1])) penalty = -10;
|
||||
}
|
||||
|
||||
return penalty - (form.size() * oovRuleScale + oovRuleBias);
|
||||
}
|
||||
|
||||
float UnkFormScorer::chrBasedScore(const U16StringView& form) const
|
||||
{
|
||||
int32_t nodeIdx = bosNodeIdx;
|
||||
uint32_t contextIdx = bosContextIdx;
|
||||
ChrTokenizer tokenizer;
|
||||
float score = 0;
|
||||
for (char16_t c : form)
|
||||
{
|
||||
const size_t token = tokenizer.encodeOne(c);
|
||||
score += chrModel->progressOneStep(nodeIdx, contextIdx, token);
|
||||
}
|
||||
score += chrModel->progressOneStep(nodeIdx, contextIdx, 0); // EOS
|
||||
score -= chrBias;
|
||||
return score;
|
||||
}
|
||||
|
||||
float UnkFormScorer::chrFreqBasedScore(const U16StringView& form) const
|
||||
{
|
||||
int32_t nodeIdx = bosNodeIdx;
|
||||
uint32_t contextIdx = bosContextIdx;
|
||||
ChrTokenizer tokenizer;
|
||||
float score = 0;
|
||||
|
||||
uint32_t rollingHash = 0;
|
||||
for (size_t i = 0; i < form.size(); ++i)
|
||||
{
|
||||
const auto c = form[i];
|
||||
const size_t depth = chrModel->getNodeDepth(nodeIdx);
|
||||
const float globalContextFreq = depth < i ? globalMinFreq : max(chrModel->getContextFrequency(contextIdx), globalMinFreq);
|
||||
const float globalContextFreqSat = tanhf(globalContextFreq / globalWeight) * globalWeight;
|
||||
const size_t token = tokenizer.encodeOne(c);
|
||||
const float lprob = chrModel->progressOneStep(nodeIdx, contextIdx, token);
|
||||
if (i == 0)
|
||||
{
|
||||
rollingHash = SubstringCounter::initHash(c);
|
||||
score += lprob;
|
||||
}
|
||||
else
|
||||
{
|
||||
const float localContextFreq = (float)substringCounter->count(rollingHash, form.data(), i) - 1;
|
||||
rollingHash = SubstringCounter::extendHash(rollingHash, c);
|
||||
if (localContextFreq > 0)
|
||||
{
|
||||
const float curFreq = (float)substringCounter->count(rollingHash, form.data(), i + 1) - 1;
|
||||
if (curFreq < 0) return -99999.f; // should not happen, but just in case
|
||||
const float localContextFreqSat = tanhf(localContextFreq / localWeight) * localWeight;
|
||||
const float localFreq = curFreq * (localContextFreqSat / localContextFreq);
|
||||
const float globalFreq = globalContextFreqSat * expf(lprob);
|
||||
const float mixedProb = logf((localFreq + globalFreq) / (localContextFreqSat + globalContextFreqSat));
|
||||
score += mixedProb;
|
||||
}
|
||||
else
|
||||
{
|
||||
score += lprob;
|
||||
}
|
||||
}
|
||||
}
|
||||
score += chrModel->progressOneStep(nodeIdx, contextIdx, 0); // EOS
|
||||
score -= chrBias;
|
||||
return score;
|
||||
}
|
||||
|
||||
float UnkFormScorer::chrFreqBranchBasedScore(const U16StringView& form) const
|
||||
{
|
||||
return chrFreqBasedScore(form);
|
||||
|
||||
// not implemented yet
|
||||
int32_t nodeIdx = bosNodeIdx;
|
||||
uint32_t contextIdx = bosContextIdx;
|
||||
ChrTokenizer tokenizer;
|
||||
float score = 0;
|
||||
|
||||
array<char16_t, 33> buf;
|
||||
Vector<pair<char16_t, float>> nextChrs;
|
||||
uint32_t rollingHash = 0;
|
||||
for (size_t i = 0; i < form.size(); ++i)
|
||||
{
|
||||
const auto c = form[i];
|
||||
const size_t depth = chrModel->getNodeDepth(nodeIdx);
|
||||
const float globalContextFreq = depth < i ? globalMinFreq : max(chrModel->getContextFrequency(contextIdx), globalMinFreq);
|
||||
const float globalContextFreqSat = tanhf(globalContextFreq / globalWeight) * globalWeight;
|
||||
const size_t token = tokenizer.encodeOne(c);
|
||||
const float lprob = chrModel->progressOneStep(nodeIdx, contextIdx, token);
|
||||
const float branchEntropy = chrModel->getContextEntropy(contextIdx);
|
||||
if (i == 0)
|
||||
{
|
||||
rollingHash = SubstringCounter::initHash(c);
|
||||
// enumerate next characters
|
||||
buf[0] = c;
|
||||
nextChrs.clear();
|
||||
for (char16_t nextChr : substringCounter->getUniqueChars())
|
||||
{
|
||||
buf[1] = nextChr;
|
||||
auto h = SubstringCounter::extendHash(rollingHash, nextChr);
|
||||
auto cnt = substringCounter->count(h, buf.data(), 2);
|
||||
if (cnt > 0)
|
||||
{
|
||||
nextChrs.emplace_back(nextChr, (float)cnt);
|
||||
}
|
||||
}
|
||||
score += lprob;
|
||||
}
|
||||
else
|
||||
{
|
||||
const float localContextFreq = (float)substringCounter->count(rollingHash, form.data(), i) - 1;
|
||||
rollingHash = SubstringCounter::extendHash(rollingHash, c);
|
||||
if (localContextFreq > 0)
|
||||
{
|
||||
// enumerate next characters
|
||||
memcpy(buf.data(), form.data(), (i + 1) * sizeof(char16_t));
|
||||
nextChrs.clear();
|
||||
for (char16_t nextChr : substringCounter->getUniqueChars())
|
||||
{
|
||||
buf[i + 1] = nextChr;
|
||||
auto h = SubstringCounter::extendHash(rollingHash, nextChr);
|
||||
auto cnt = substringCounter->count(h, buf.data(), i + 2);
|
||||
if (cnt > 0)
|
||||
{
|
||||
nextChrs.emplace_back(nextChr, (float)cnt);
|
||||
}
|
||||
}
|
||||
const float curFreq = (float)substringCounter->count(rollingHash, form.data(), i + 1) - 1;
|
||||
const float localContextFreqSat = tanhf(localContextFreq / localWeight) * localWeight;
|
||||
const float localFreq = curFreq * (localContextFreqSat / localContextFreq);
|
||||
const float globalFreq = globalContextFreqSat * expf(lprob);
|
||||
const float mixedProb = logf((localFreq + globalFreq) / (localContextFreqSat + globalContextFreqSat));
|
||||
score += mixedProb;
|
||||
}
|
||||
else
|
||||
{
|
||||
score += lprob;
|
||||
}
|
||||
}
|
||||
}
|
||||
score += chrModel->progressOneStep(nodeIdx, contextIdx, 0); // EOS
|
||||
score -= chrBias;
|
||||
return score;
|
||||
}
|
||||
62
src/UnkFormScorer.h
Normal file
62
src/UnkFormScorer.h
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
#pragma once
|
||||
#include <kiwi/CoNgramModel.h>
|
||||
|
||||
namespace kiwi
|
||||
{
|
||||
class SubstringCounter;
|
||||
|
||||
class UnkFormScorer
|
||||
{
|
||||
const lm::CoNgramModelBase* chrModel = nullptr;
|
||||
const SubstringCounter* substringCounter = nullptr;
|
||||
float oovRuleScale = 0;
|
||||
float oovRuleBias = 0;
|
||||
float chrBias = 0;
|
||||
float globalWeight = 0;
|
||||
float localWeight = 0;
|
||||
float globalMinFreq = 4.f;
|
||||
int32_t bosNodeIdx = 0;
|
||||
uint32_t bosContextIdx = 0;
|
||||
bool useChrFreqBranchModel = false;
|
||||
|
||||
public:
|
||||
UnkFormScorer(float scale, float bias,
|
||||
const lm::CoNgramModelBase* _chrModel, float _chrBias,
|
||||
const SubstringCounter* _substringCounter,
|
||||
float _globalWeight = 60.f,
|
||||
float _localWeight = 3.f,
|
||||
float _globalMinFreq = 4.f,
|
||||
bool _useChrFreqBranchModel = false);
|
||||
|
||||
float ruleBasedScore(const U16StringView& form) const;
|
||||
|
||||
float chrBasedScore(const U16StringView& form) const;
|
||||
|
||||
float chrFreqBasedScore(const U16StringView& form) const;
|
||||
|
||||
float chrFreqBranchBasedScore(const U16StringView& form) const;
|
||||
|
||||
float operator()(const U16StringView& form) const
|
||||
{
|
||||
if (chrModel && substringCounter)
|
||||
{
|
||||
if (useChrFreqBranchModel)
|
||||
{
|
||||
return chrFreqBranchBasedScore(form);
|
||||
}
|
||||
else
|
||||
{
|
||||
return chrFreqBasedScore(form);
|
||||
}
|
||||
}
|
||||
else if (chrModel)
|
||||
{
|
||||
return chrBasedScore(form);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ruleBasedScore(form);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
@ -609,7 +609,11 @@ namespace kiwi
|
|||
{
|
||||
return [modelPath](const std::string& filename) -> std::unique_ptr<std::istream> {
|
||||
std::string fullPath = modelPath + "/" + filename;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
auto stream = std::make_unique<std::ifstream>((const wchar_t*)utf8To16(fullPath).c_str(), std::ios::binary);
|
||||
#else
|
||||
auto stream = std::make_unique<std::ifstream>(fullPath, std::ios::binary);
|
||||
#endif
|
||||
if (!stream->is_open()) {
|
||||
return nullptr;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,11 @@
|
|||
#include <cstring>
|
||||
|
||||
#ifdef USE_VNNI
|
||||
#ifdef _MSC_VER
|
||||
#define DPBUSD _mm256_dpbusd_avx_epi32
|
||||
#else
|
||||
#define DPBUSD _mm256_dpbusd_epi32
|
||||
#endif
|
||||
#define DETAIL detailVnni
|
||||
#else
|
||||
#define DPBUSD emulated_dpbusd
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include "../MathFunc.hpp"
|
||||
#include "../qgemm.hpp"
|
||||
#include <arm_neon.h>
|
||||
|
||||
namespace kiwi
|
||||
{
|
||||
|
|
@ -22,34 +23,246 @@ namespace kiwi
|
|||
float* c, size_t ldc
|
||||
);
|
||||
|
||||
// Horizontal sum of the four int32 lanes of `v`.
static FORCE_INLINE int32_t reduce_sum_s32(int32x4_t v)
{
	// Across-vector add; like the pairwise vpaddq_s32 form it is an
	// AArch64 intrinsic and yields the same wrapped 32-bit sum.
	return vaddvq_s32(v);
}
|
||||
|
||||
// gemv: compute c[i] = (dotprod(a_uint8, b_int8[i]) - bSum[i]) * aScale * bScale[i]
|
||||
// a: [k uint8][float aScale], b rows: [k int8][float bScale][int32 bSum]
|
||||
inline void gemv_neon(size_t m, size_t k, const uint8_t* a, const int8_t* b, size_t ldb, float* c)
|
||||
{
|
||||
const float aScale = *reinterpret_cast<const float*>(a + k);
|
||||
float bScale[4];
|
||||
int32_t bSum[4];
|
||||
const float32x4_t vaScale = vdupq_n_f32(aScale);
|
||||
|
||||
for (size_t mi = 0; mi < m; mi += 4)
|
||||
{
|
||||
const int8_t* bPtr0 = b + ldb * (mi + 0);
|
||||
const int8_t* bPtr1 = b + ldb * (mi + 1);
|
||||
const int8_t* bPtr2 = b + ldb * (mi + 2);
|
||||
const int8_t* bPtr3 = b + ldb * (mi + 3);
|
||||
|
||||
int32x4_t sum0 = vdupq_n_s32(0);
|
||||
int32x4_t sum1 = vdupq_n_s32(0);
|
||||
int32x4_t sum2 = vdupq_n_s32(0);
|
||||
int32x4_t sum3 = vdupq_n_s32(0);
|
||||
|
||||
for (size_t j = 0; j < k; j += 16)
|
||||
{
|
||||
uint8x16_t pa = vld1q_u8(a + j);
|
||||
int8x16_t pb0 = vld1q_s8(bPtr0 + j);
|
||||
int8x16_t pb1 = vld1q_s8(bPtr1 + j);
|
||||
int8x16_t pb2 = vld1q_s8(bPtr2 + j);
|
||||
int8x16_t pb3 = vld1q_s8(bPtr3 + j);
|
||||
|
||||
// Extend a (uint8) to int16 via zero-extend; b (int8) via sign-extend
|
||||
// Product fits in int16: range [-32640, 32385]
|
||||
int16x8_t pa_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pa)));
|
||||
int16x8_t pa_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pa)));
|
||||
|
||||
sum0 = vpadalq_s16(sum0, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb0))));
|
||||
sum0 = vpadalq_s16(sum0, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb0))));
|
||||
sum1 = vpadalq_s16(sum1, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb1))));
|
||||
sum1 = vpadalq_s16(sum1, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb1))));
|
||||
sum2 = vpadalq_s16(sum2, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb2))));
|
||||
sum2 = vpadalq_s16(sum2, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb2))));
|
||||
sum3 = vpadalq_s16(sum3, vmulq_s16(pa_lo, vmovl_s8(vget_low_s8(pb3))));
|
||||
sum3 = vpadalq_s16(sum3, vmulq_s16(pa_hi, vmovl_s8(vget_high_s8(pb3))));
|
||||
}
|
||||
|
||||
bScale[0] = *reinterpret_cast<const float*>(bPtr0 + k);
|
||||
bScale[1] = *reinterpret_cast<const float*>(bPtr1 + k);
|
||||
bScale[2] = *reinterpret_cast<const float*>(bPtr2 + k);
|
||||
bScale[3] = *reinterpret_cast<const float*>(bPtr3 + k);
|
||||
bSum[0] = *reinterpret_cast<const int32_t*>(bPtr0 + k + 4);
|
||||
bSum[1] = *reinterpret_cast<const int32_t*>(bPtr1 + k + 4);
|
||||
bSum[2] = *reinterpret_cast<const int32_t*>(bPtr2 + k + 4);
|
||||
bSum[3] = *reinterpret_cast<const int32_t*>(bPtr3 + k + 4);
|
||||
|
||||
const int32_t sArr[4] = {
|
||||
reduce_sum_s32(sum0) - bSum[0],
|
||||
reduce_sum_s32(sum1) - bSum[1],
|
||||
reduce_sum_s32(sum2) - bSum[2],
|
||||
reduce_sum_s32(sum3) - bSum[3]
|
||||
};
|
||||
const float32x4_t vbScale = vld1q_f32(bScale);
|
||||
const float32x4_t vfsums = vcvtq_f32_s32(vld1q_s32(sArr));
|
||||
vst1q_f32(c + mi, vmulq_f32(vmulq_f32(vfsums, vaScale), vbScale));
|
||||
}
|
||||
}
|
||||
|
||||
// gemvS8S8: native int8 x int8 GEMV, no bias correction needed
|
||||
// a: [k int8][float aScale], b rows: [k int8][float bScale][int32 bSum (unused)]
|
||||
// result[i] = dotprod(a, b[i]) * aScale * bScale[i]
|
||||
inline void gemvS8S8_neon(size_t m, size_t k, const int8_t* a, const int8_t* b, size_t ldb, float* c)
|
||||
{
|
||||
const float aScale = *reinterpret_cast<const float*>(a + k);
|
||||
float bScale[4];
|
||||
const float32x4_t vaScale = vdupq_n_f32(aScale);
|
||||
|
||||
for (size_t mi = 0; mi < m; mi += 4)
|
||||
{
|
||||
const int8_t* bPtr0 = b + ldb * (mi + 0);
|
||||
const int8_t* bPtr1 = b + ldb * (mi + 1);
|
||||
const int8_t* bPtr2 = b + ldb * (mi + 2);
|
||||
const int8_t* bPtr3 = b + ldb * (mi + 3);
|
||||
|
||||
int32x4_t sum0 = vdupq_n_s32(0);
|
||||
int32x4_t sum1 = vdupq_n_s32(0);
|
||||
int32x4_t sum2 = vdupq_n_s32(0);
|
||||
int32x4_t sum3 = vdupq_n_s32(0);
|
||||
|
||||
for (size_t j = 0; j < k; j += 16)
|
||||
{
|
||||
int8x16_t pa = vld1q_s8(a + j);
|
||||
int8x16_t pb0 = vld1q_s8(bPtr0 + j);
|
||||
int8x16_t pb1 = vld1q_s8(bPtr1 + j);
|
||||
int8x16_t pb2 = vld1q_s8(bPtr2 + j);
|
||||
int8x16_t pb3 = vld1q_s8(bPtr3 + j);
|
||||
|
||||
// Native int8 x int8 dot product using vmull_s8
|
||||
// Product range: [-128*127, 127*127] = [-16256, 16129], fits in int16
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa), vget_low_s8(pb0)));
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa), vget_high_s8(pb0)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa), vget_low_s8(pb1)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa), vget_high_s8(pb1)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa), vget_low_s8(pb2)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa), vget_high_s8(pb2)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa), vget_low_s8(pb3)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa), vget_high_s8(pb3)));
|
||||
}
|
||||
|
||||
bScale[0] = *reinterpret_cast<const float*>(bPtr0 + k);
|
||||
bScale[1] = *reinterpret_cast<const float*>(bPtr1 + k);
|
||||
bScale[2] = *reinterpret_cast<const float*>(bPtr2 + k);
|
||||
bScale[3] = *reinterpret_cast<const float*>(bPtr3 + k);
|
||||
// bSum correction is not needed: native int8 x int8 gives exact result
|
||||
|
||||
const int32_t sArr[4] = {
|
||||
reduce_sum_s32(sum0),
|
||||
reduce_sum_s32(sum1),
|
||||
reduce_sum_s32(sum2),
|
||||
reduce_sum_s32(sum3)
|
||||
};
|
||||
const float32x4_t vbScale = vld1q_f32(bScale);
|
||||
const float32x4_t vfsums = vcvtq_f32_s32(vld1q_s32(sArr));
|
||||
vst1q_f32(c + mi, vmulq_f32(vmulq_f32(vfsums, vaScale), vbScale));
|
||||
}
|
||||
}
|
||||
|
||||
// gemvU8U8: centered uint8 x uint8 GEMV (both a and b represent int8 biased by +128)
|
||||
// result[i] = sum((a-128) * (b[i]-128)) * aScale * bScale[i]
|
||||
inline void gemvU8U8_neon(size_t m, size_t k, const uint8_t* a, const uint8_t* b, size_t ldb, float* c)
|
||||
{
|
||||
const uint8x16_t bias = vdupq_n_u8(128);
|
||||
const float aScale = *reinterpret_cast<const float*>(a + k);
|
||||
float bScale[4];
|
||||
const float32x4_t vaScale = vdupq_n_f32(aScale);
|
||||
|
||||
for (size_t mi = 0; mi < m; mi += 4)
|
||||
{
|
||||
const uint8_t* bPtr0 = b + ldb * (mi + 0);
|
||||
const uint8_t* bPtr1 = b + ldb * (mi + 1);
|
||||
const uint8_t* bPtr2 = b + ldb * (mi + 2);
|
||||
const uint8_t* bPtr3 = b + ldb * (mi + 3);
|
||||
|
||||
int32x4_t sum0 = vdupq_n_s32(0);
|
||||
int32x4_t sum1 = vdupq_n_s32(0);
|
||||
int32x4_t sum2 = vdupq_n_s32(0);
|
||||
int32x4_t sum3 = vdupq_n_s32(0);
|
||||
|
||||
for (size_t j = 0; j < k; j += 16)
|
||||
{
|
||||
// Convert from uint8 (0-255) to int8 (-128 to 127) via XOR 0x80
|
||||
int8x16_t pa = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(a + j), bias));
|
||||
int8x16_t pb0 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr0 + j), bias));
|
||||
int8x16_t pb1 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr1 + j), bias));
|
||||
int8x16_t pb2 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr2 + j), bias));
|
||||
int8x16_t pb3 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(bPtr3 + j), bias));
|
||||
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa), vget_low_s8(pb0)));
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa), vget_high_s8(pb0)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa), vget_low_s8(pb1)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa), vget_high_s8(pb1)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa), vget_low_s8(pb2)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa), vget_high_s8(pb2)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa), vget_low_s8(pb3)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa), vget_high_s8(pb3)));
|
||||
}
|
||||
|
||||
bScale[0] = *reinterpret_cast<const float*>(bPtr0 + k);
|
||||
bScale[1] = *reinterpret_cast<const float*>(bPtr1 + k);
|
||||
bScale[2] = *reinterpret_cast<const float*>(bPtr2 + k);
|
||||
bScale[3] = *reinterpret_cast<const float*>(bPtr3 + k);
|
||||
|
||||
const int32_t sArr[4] = {
|
||||
reduce_sum_s32(sum0),
|
||||
reduce_sum_s32(sum1),
|
||||
reduce_sum_s32(sum2),
|
||||
reduce_sum_s32(sum3)
|
||||
};
|
||||
const float32x4_t vbScale = vld1q_f32(bScale);
|
||||
const float32x4_t vfsums = vcvtq_f32_s32(vld1q_s32(sArr));
|
||||
vst1q_f32(c + mi, vmulq_f32(vmulq_f32(vfsums, vaScale), vbScale));
|
||||
}
|
||||
}
|
||||
|
||||
template<>
|
||||
void gemv<ArchType::neon>(size_t m, size_t k, const uint8_t* a, const int8_t* b, size_t ldb, float* c)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
return gemv_neon(m, k, a, b, ldb, c);
|
||||
}
|
||||
|
||||
template<>
|
||||
void gemvS8S8<ArchType::neon>(size_t m, size_t k, const int8_t* a, const int8_t* b, size_t ldb, float* c)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
return gemvS8S8_neon(m, k, a, b, ldb, c);
|
||||
}
|
||||
|
||||
template<>
|
||||
void gemvU8U8<ArchType::neon>(size_t m, size_t k, const uint8_t* a, const uint8_t* b, size_t ldb, float* c)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
return gemvU8U8_neon(m, k, a, b, ldb, c);
|
||||
}
|
||||
|
||||
template<>
|
||||
float dotS8S8<ArchType::neon>(size_t k, const int8_t* a, const int8_t* b)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
const float aScale = *reinterpret_cast<const float*>(a + k);
|
||||
const float bScale = *reinterpret_cast<const float*>(b + k);
|
||||
// No bSum correction needed for native int8 x int8
|
||||
|
||||
int32x4_t sum = vdupq_n_s32(0);
|
||||
for (size_t i = 0; i < k; i += 16)
|
||||
{
|
||||
int8x16_t pa = vld1q_s8(a + i);
|
||||
int8x16_t pb = vld1q_s8(b + i);
|
||||
sum = vpadalq_s16(sum, vmull_s8(vget_low_s8(pa), vget_low_s8(pb)));
|
||||
sum = vpadalq_s16(sum, vmull_s8(vget_high_s8(pa), vget_high_s8(pb)));
|
||||
}
|
||||
return static_cast<float>(reduce_sum_s32(sum)) * aScale * bScale;
|
||||
}
|
||||
|
||||
template<>
|
||||
float dotU8U8<ArchType::neon>(size_t k, const uint8_t* a, const uint8_t* b)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
const float aScale = *reinterpret_cast<const float*>(a + k);
|
||||
const float bScale = *reinterpret_cast<const float*>(b + k);
|
||||
const uint8x16_t bias = vdupq_n_u8(128);
|
||||
|
||||
int32x4_t sum = vdupq_n_s32(0);
|
||||
for (size_t i = 0; i < k; i += 16)
|
||||
{
|
||||
int8x16_t pa = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(a + i), bias));
|
||||
int8x16_t pb = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(b + i), bias));
|
||||
sum = vpadalq_s16(sum, vmull_s8(vget_low_s8(pa), vget_low_s8(pb)));
|
||||
sum = vpadalq_s16(sum, vmull_s8(vget_high_s8(pa), vget_high_s8(pb)));
|
||||
}
|
||||
return static_cast<float>(reduce_sum_s32(sum)) * aScale * bScale;
|
||||
}
|
||||
|
||||
template<>
|
||||
|
|
@ -59,7 +272,50 @@ namespace kiwi
|
|||
float* out
|
||||
)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
for (size_t mi = 0; mi < m; mi += 4)
|
||||
{
|
||||
const int8_t* aPtr0 = a + lda * (mi + 0);
|
||||
const int8_t* aPtr1 = a + lda * (mi + 1);
|
||||
const int8_t* aPtr2 = a + lda * (mi + 2);
|
||||
const int8_t* aPtr3 = a + lda * (mi + 3);
|
||||
|
||||
int32x4_t sum0 = vdupq_n_s32(0);
|
||||
int32x4_t sum1 = vdupq_n_s32(0);
|
||||
int32x4_t sum2 = vdupq_n_s32(0);
|
||||
int32x4_t sum3 = vdupq_n_s32(0);
|
||||
|
||||
for (size_t j = 0; j < k; j += 16)
|
||||
{
|
||||
int8x16_t pa0 = vld1q_s8(aPtr0 + j);
|
||||
int8x16_t pa1 = vld1q_s8(aPtr1 + j);
|
||||
int8x16_t pa2 = vld1q_s8(aPtr2 + j);
|
||||
int8x16_t pa3 = vld1q_s8(aPtr3 + j);
|
||||
|
||||
// Compute a^2 using native int8 x int8 multiply
|
||||
// Max product: (-128)*(-128) = 16384, fits in int16
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa0), vget_low_s8(pa0)));
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa0), vget_high_s8(pa0)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa1), vget_low_s8(pa1)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa1), vget_high_s8(pa1)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa2), vget_low_s8(pa2)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa2), vget_high_s8(pa2)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa3), vget_low_s8(pa3)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa3), vget_high_s8(pa3)));
|
||||
}
|
||||
|
||||
const float aScale0 = *reinterpret_cast<const float*>(aPtr0 + k);
|
||||
const float aScale1 = *reinterpret_cast<const float*>(aPtr1 + k);
|
||||
const float aScale2 = *reinterpret_cast<const float*>(aPtr2 + k);
|
||||
const float aScale3 = *reinterpret_cast<const float*>(aPtr3 + k);
|
||||
|
||||
const float rArr[4] = {
|
||||
static_cast<float>(reduce_sum_s32(sum0)) * aScale0 * aScale0,
|
||||
static_cast<float>(reduce_sum_s32(sum1)) * aScale1 * aScale1,
|
||||
static_cast<float>(reduce_sum_s32(sum2)) * aScale2 * aScale2,
|
||||
static_cast<float>(reduce_sum_s32(sum3)) * aScale3 * aScale3
|
||||
};
|
||||
vst1q_f32(out + mi, vrsqrteq_f32(vld1q_f32(rArr)));
|
||||
}
|
||||
}
|
||||
|
||||
template<>
|
||||
|
|
@ -69,7 +325,52 @@ namespace kiwi
|
|||
float* out
|
||||
)
|
||||
{
|
||||
throw std::runtime_error("Not implemented yet");
|
||||
const uint8x16_t bias = vdupq_n_u8(128);
|
||||
|
||||
for (size_t mi = 0; mi < m; mi += 4)
|
||||
{
|
||||
const uint8_t* aPtr0 = a + lda * (mi + 0);
|
||||
const uint8_t* aPtr1 = a + lda * (mi + 1);
|
||||
const uint8_t* aPtr2 = a + lda * (mi + 2);
|
||||
const uint8_t* aPtr3 = a + lda * (mi + 3);
|
||||
|
||||
int32x4_t sum0 = vdupq_n_s32(0);
|
||||
int32x4_t sum1 = vdupq_n_s32(0);
|
||||
int32x4_t sum2 = vdupq_n_s32(0);
|
||||
int32x4_t sum3 = vdupq_n_s32(0);
|
||||
|
||||
for (size_t j = 0; j < k; j += 16)
|
||||
{
|
||||
// Center uint8 to int8 via XOR 0x80: (a-128)
|
||||
int8x16_t pa0 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr0 + j), bias));
|
||||
int8x16_t pa1 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr1 + j), bias));
|
||||
int8x16_t pa2 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr2 + j), bias));
|
||||
int8x16_t pa3 = vreinterpretq_s8_u8(veorq_u8(vld1q_u8(aPtr3 + j), bias));
|
||||
|
||||
// Compute (a-128)^2
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_low_s8(pa0), vget_low_s8(pa0)));
|
||||
sum0 = vpadalq_s16(sum0, vmull_s8(vget_high_s8(pa0), vget_high_s8(pa0)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_low_s8(pa1), vget_low_s8(pa1)));
|
||||
sum1 = vpadalq_s16(sum1, vmull_s8(vget_high_s8(pa1), vget_high_s8(pa1)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_low_s8(pa2), vget_low_s8(pa2)));
|
||||
sum2 = vpadalq_s16(sum2, vmull_s8(vget_high_s8(pa2), vget_high_s8(pa2)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_low_s8(pa3), vget_low_s8(pa3)));
|
||||
sum3 = vpadalq_s16(sum3, vmull_s8(vget_high_s8(pa3), vget_high_s8(pa3)));
|
||||
}
|
||||
|
||||
const float aScale0 = *reinterpret_cast<const float*>(aPtr0 + k);
|
||||
const float aScale1 = *reinterpret_cast<const float*>(aPtr1 + k);
|
||||
const float aScale2 = *reinterpret_cast<const float*>(aPtr2 + k);
|
||||
const float aScale3 = *reinterpret_cast<const float*>(aPtr3 + k);
|
||||
|
||||
const float rArr[4] = {
|
||||
static_cast<float>(reduce_sum_s32(sum0)) * aScale0 * aScale0,
|
||||
static_cast<float>(reduce_sum_s32(sum1)) * aScale1 * aScale1,
|
||||
static_cast<float>(reduce_sum_s32(sum2)) * aScale2 * aScale2,
|
||||
static_cast<float>(reduce_sum_s32(sum3)) * aScale3 * aScale3
|
||||
};
|
||||
vst1q_f32(out + mi, vrsqrteq_f32(vld1q_f32(rArr)));
|
||||
}
|
||||
}
|
||||
|
||||
template<>
|
||||
|
|
|
|||
|
|
@ -42,10 +42,10 @@ namespace kiwi
|
|||
scale = (scale & 0x3F) + scaleBias;
|
||||
|
||||
lower = (lower - lzp) * scale;
|
||||
lower += (lower >= 0) ? 4 : -4;
|
||||
lower += (lower >= 0) ? 4 : -4; // for round up
|
||||
lower /= scaleDivider;
|
||||
upper = (upper - lzp) * scale;
|
||||
upper += (upper >= 0) ? 4 : -4;
|
||||
upper += (upper >= 0) ? 4 : -4; // for round up
|
||||
upper /= scaleDivider;
|
||||
if (toUint8)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -48,6 +48,10 @@ struct kiwi_typo : public TypoTransformer
|
|||
{
|
||||
};
|
||||
|
||||
struct kiwi_prepared_typo : public PreparedTypoTransformer
|
||||
{
|
||||
};
|
||||
|
||||
struct kiwi_morphset
|
||||
{
|
||||
Kiwi* inst = nullptr;
|
||||
|
|
@ -284,6 +288,40 @@ int kiwi_builder_add_alias_word(kiwi_builder_h handle, const char* alias, const
|
|||
}
|
||||
}
|
||||
|
||||
int kiwi_builder_add_word_with_def(kiwi_builder_h handle, const char* word, const char* pos, int sense_id, int dialect, float score)
|
||||
{
|
||||
if (!handle) return KIWIERR_INVALID_HANDLE;
|
||||
auto* kb = (KiwiBuilder*)handle;
|
||||
try
|
||||
{
|
||||
MorphemeDef def{ parse_tag(pos), (uint8_t)sense_id, (Dialect)dialect };
|
||||
if (kb->addWord(utf8To16(word), def, score).second) return 0;
|
||||
return KIWIERR_FAIL;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
currentError = current_exception();
|
||||
return KIWIERR_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
int kiwi_builder_add_alias_word_with_def(kiwi_builder_h handle, const char* alias, const char* pos, int sense_id, int dialect, float score, const char* orig_word)
|
||||
{
|
||||
if (!handle) return KIWIERR_INVALID_HANDLE;
|
||||
auto* kb = (KiwiBuilder*)handle;
|
||||
try
|
||||
{
|
||||
MorphemeDef def{ parse_tag(pos), (uint8_t)sense_id, (Dialect)dialect };
|
||||
if (kb->addWord(utf8To16(alias), def, score, utf8To16(orig_word)).second) return 0;
|
||||
return KIWIERR_FAIL;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
currentError = current_exception();
|
||||
return KIWIERR_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
int kiwi_builder_add_pre_analyzed_word(kiwi_builder_h handle, const char* form, int size, const char** analyzed_morphs, const char** analyzed_pos, float score, const int* positions)
|
||||
{
|
||||
if (!handle) return KIWIERR_INVALID_HANDLE;
|
||||
|
|
@ -647,11 +685,48 @@ int kiwi_typo_close(kiwi_typo_h handle)
|
|||
}
|
||||
}
|
||||
|
||||
kiwi_h kiwi_init(const char * modelPath, int num_threads, int options)
|
||||
kiwi_prepared_typo_h kiwi_typo_prepare(kiwi_typo_h handle)
|
||||
{
|
||||
if (!handle) return nullptr;
|
||||
try
|
||||
{
|
||||
return new kiwi_prepared_typo{ handle->prepare(true) };
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
currentError = current_exception();
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
int kiwi_prepared_typo_close(kiwi_prepared_typo_h handle)
|
||||
{
|
||||
if (!handle) return KIWIERR_INVALID_HANDLE;
|
||||
try
|
||||
{
|
||||
delete handle;
|
||||
return 0;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
currentError = current_exception();
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
kiwi_h kiwi_init(const char * modelPath, int num_threads, int options, int enabled_dialects)
|
||||
{
|
||||
try
|
||||
{
|
||||
return (kiwi_h)new Kiwi{ KiwiBuilder{ modelPath, (size_t)num_threads, (BuildOption)options }.build() };
|
||||
BuildOption buildOption = (BuildOption)(options & 0xFF);
|
||||
const auto mtMask = options & (KIWI_BUILD_MODEL_TYPE_LARGEST | KIWI_BUILD_MODEL_TYPE_KNLM | KIWI_BUILD_MODEL_TYPE_SBG | KIWI_BUILD_MODEL_TYPE_CONG | KIWI_BUILD_MODEL_TYPE_CONG_GLOBAL);
|
||||
const ModelType modelType = (mtMask == KIWI_BUILD_MODEL_TYPE_LARGEST) ? ModelType::largest
|
||||
: (mtMask == KIWI_BUILD_MODEL_TYPE_KNLM) ? ModelType::knlm
|
||||
: (mtMask == KIWI_BUILD_MODEL_TYPE_SBG) ? ModelType::sbg
|
||||
: (mtMask == KIWI_BUILD_MODEL_TYPE_CONG) ? ModelType::cong
|
||||
: (mtMask == KIWI_BUILD_MODEL_TYPE_CONG_GLOBAL) ? ModelType::congGlobal
|
||||
: ModelType::none;
|
||||
return (kiwi_h)new Kiwi{ KiwiBuilder{ modelPath, (size_t)num_threads, buildOption, modelType, (Dialect)enabled_dialects }.build() };
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
|
|
@ -669,11 +744,16 @@ void kiwi_set_global_config(kiwi_h handle, kiwi_config_t config)
|
|||
KiwiConfig kconfig{
|
||||
!!config.integrate_allomorph,
|
||||
config.cut_off_threshold,
|
||||
config.unk_form_score_scale,
|
||||
config.unk_form_score_bias,
|
||||
config.oov_rule_scale,
|
||||
config.oov_rule_bias,
|
||||
config.oov_chr_bias,
|
||||
config.oov_global_weight,
|
||||
config.oov_local_weight,
|
||||
config.oov_global_min_freq,
|
||||
config.space_penalty,
|
||||
config.typo_cost_weight,
|
||||
config.max_unk_form_size,
|
||||
config.max_unk_form_size_followed_by_j_class,
|
||||
config.space_tolerance,
|
||||
};
|
||||
kiwi->setGlobalConfig(kconfig);
|
||||
|
|
@ -694,11 +774,16 @@ kiwi_config_t kiwi_get_global_config(kiwi_h handle)
|
|||
KiwiConfig kconfig = kiwi->getGlobalConfig();
|
||||
config.integrate_allomorph = kconfig.integrateAllomorph;
|
||||
config.cut_off_threshold = kconfig.cutOffThreshold;
|
||||
config.unk_form_score_scale = kconfig.unkFormScoreScale;
|
||||
config.unk_form_score_bias = kconfig.unkFormScoreBias;
|
||||
config.oov_rule_scale = kconfig.oovRuleScale;
|
||||
config.oov_rule_bias = kconfig.oovRuleBias;
|
||||
config.oov_chr_bias = kconfig.oovChrBias;
|
||||
config.oov_global_weight = kconfig.oovGlobalWeight;
|
||||
config.oov_local_weight = kconfig.oovLocalWeight;
|
||||
config.oov_global_min_freq = kconfig.oovGlobalMinFreq;
|
||||
config.space_penalty = kconfig.spacePenalty;
|
||||
config.typo_cost_weight = kconfig.typoCostWeight;
|
||||
config.max_unk_form_size = kconfig.maxUnkFormSize;
|
||||
config.max_unk_form_size_followed_by_j_class = kconfig.maxUnkFormSizeFollowedByJClass;
|
||||
config.space_tolerance = kconfig.spaceTolerance;
|
||||
}
|
||||
catch (...)
|
||||
|
|
@ -785,7 +870,9 @@ inline AnalyzeOption toAnalyzeOption(kiwi_analyze_option_t option)
|
|||
option.blocklist ? &option.blocklist->morphemes : nullptr,
|
||||
!!option.open_ending,
|
||||
(Dialect)option.allowed_dialects,
|
||||
option.dialect_cost
|
||||
option.dialect_cost,
|
||||
option.typo_transformer,
|
||||
option.typo_threshold
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -61,12 +61,23 @@ namespace kiwi
|
|||
{
|
||||
const auto* aPtr = aBuffer + i * (k + 8);
|
||||
const auto* bPtr = bBuffer + j * (k + 8);
|
||||
int32_t acc = op.dotprod(aPtr, bPtr, k);
|
||||
const float contextScale = *reinterpret_cast<const float*>(aPtr + k),
|
||||
outputScale = *reinterpret_cast<const float*>(bPtr + k),
|
||||
float contextBias;
|
||||
if constexpr (archType == ArchType::neon)
|
||||
{
|
||||
const auto* aPtrS8 = reinterpret_cast<const int8_t*>(aPtr);
|
||||
const float score = dotS8S8<archType>(k, aPtrS8, bPtr);
|
||||
contextBias = *reinterpret_cast<const float*>(aPtr + k + 4);
|
||||
const int32_t hsum = *reinterpret_cast<const int32_t*>(bPtr + k + 4);
|
||||
c[i * ldc + j] = (acc - hsum) * contextScale * outputScale + contextBias;
|
||||
c[i * ldc + j] = score + contextBias;
|
||||
}
|
||||
else
|
||||
{
|
||||
const int32_t acc = op.dotprod(aPtr, bPtr, k);
|
||||
const float contextScale = *reinterpret_cast<const float*>(aPtr + k);
|
||||
const float outputScale = *reinterpret_cast<const float*>(bPtr + k);
|
||||
contextBias = *reinterpret_cast<const float*>(aPtr + k + 4);
|
||||
const int32_t hsum = *reinterpret_cast<const int32_t*>(bPtr + k + 4);
|
||||
c[i * ldc + j] = (acc - hsum) * contextScale * outputScale + contextBias;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -228,5 +228,14 @@ namespace sais
|
|||
});
|
||||
return std::accumulate(numSuffices.begin(), numSuffices.end(), (size_t)0);
|
||||
}
|
||||
|
||||
template<class Fn>
|
||||
size_t enumNextChr(const std::pair<size_t, size_t>& range, Fn&& fn) const
|
||||
{
|
||||
return waveletTree.enumerate(range.first, range.second, [&](ChrTy c, size_t cl, size_t cr)
|
||||
{
|
||||
return fn(c, cr - cl);
|
||||
});
|
||||
}
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,13 +5,13 @@
|
|||
|
||||
kiwi_h reuse_kiwi_instance()
|
||||
{
|
||||
static kiwi_h kw = kiwi_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT);
|
||||
static kiwi_h kw = kiwi_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT, 0);
|
||||
return kw;
|
||||
}
|
||||
|
||||
TEST(KiwiC, InitClose)
|
||||
{
|
||||
kiwi_h kw = kiwi_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT);
|
||||
kiwi_h kw = kiwi_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT, 0);
|
||||
EXPECT_NE(kw, nullptr);
|
||||
EXPECT_EQ(kiwi_close(kw), 0);
|
||||
}
|
||||
|
|
@ -63,7 +63,7 @@ int mt_receiver(int idx, kiwi_res_h res, void* user)
|
|||
TEST(KiwiC, AnalyzeMultithread)
|
||||
{
|
||||
auto data = loadTestCorpus();
|
||||
kiwi_h kw = kiwi_init(MODEL_PATH, 2, KIWI_BUILD_DEFAULT);
|
||||
kiwi_h kw = kiwi_init(MODEL_PATH, 2, KIWI_BUILD_DEFAULT, 0);
|
||||
EXPECT_NE(kw, nullptr);
|
||||
kiwi_analyze_option_t option = { KIWI_MATCH_ALL, };
|
||||
EXPECT_EQ(kiwi_analyze_m(kw, mt_reader, mt_receiver, &data, 1, option), data.size());
|
||||
|
|
@ -256,12 +256,16 @@ TEST(KiwiC, AnalyzeBasicTypoSet)
|
|||
{
|
||||
kiwi_h okw = reuse_kiwi_instance(), typo_kw;
|
||||
kiwi_builder_h builder = kiwi_builder_init(MODEL_PATH, -1, KIWI_BUILD_DEFAULT, KIWI_DIALECT_STANDARD);
|
||||
typo_kw = kiwi_builder_build(builder, kiwi_typo_get_default(KIWI_TYPO_BASIC_TYPO_SET), 2.5f);
|
||||
typo_kw = kiwi_builder_build(builder, nullptr, 0);
|
||||
kiwi_config_t config = kiwi_get_global_config(typo_kw);
|
||||
config.typo_cost_weight = 5;
|
||||
kiwi_set_global_config(typo_kw, config);
|
||||
|
||||
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING, };
|
||||
kiwi_prepared_typo_h ptt = kiwi_typo_prepare(kiwi_typo_get_default(KIWI_TYPO_BASIC_TYPO_SET));
|
||||
|
||||
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING | KIWI_MATCH_OOV_CHR_FREQ_MODEL, };
|
||||
option.typo_transformer = ptt;
|
||||
option.typo_threshold = 2.5;
|
||||
kiwi_res_h o, c;
|
||||
for (const char* s : { u8"외않됀데?", u8"나 죰 도와죠.", u8"잘했따", u8"외구거 공부", u8"맗은 믈을 마셧다!" })
|
||||
{
|
||||
|
|
@ -272,6 +276,7 @@ TEST(KiwiC, AnalyzeBasicTypoSet)
|
|||
EXPECT_EQ(kiwi_res_close(c), 0);
|
||||
}
|
||||
|
||||
EXPECT_EQ(kiwi_prepared_typo_close(ptt), 0);
|
||||
EXPECT_EQ(kiwi_builder_close(builder), 0);
|
||||
EXPECT_EQ(kiwi_close(typo_kw), 0);
|
||||
}
|
||||
|
|
@ -289,12 +294,16 @@ TEST(KiwiC, CustomTypoSet)
|
|||
kiwi_typo_update(custom_typo, continual_typo);
|
||||
kiwi_typo_update(custom_typo, lengthening_typo);
|
||||
|
||||
typo_kw = kiwi_builder_build(builder, custom_typo, 2.5f);
|
||||
kiwi_prepared_typo_h ptt = kiwi_typo_prepare(custom_typo);
|
||||
|
||||
typo_kw = kiwi_builder_build(builder, nullptr, 0);
|
||||
kiwi_config_t config = kiwi_get_global_config(typo_kw);
|
||||
config.typo_cost_weight = 5;
|
||||
kiwi_set_global_config(typo_kw, config);
|
||||
|
||||
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING, };
|
||||
kiwi_analyze_option_t option = { KIWI_MATCH_ALL_WITH_NORMALIZING | KIWI_MATCH_OOV_CHR_FREQ_MODEL, };
|
||||
option.typo_transformer = ptt;
|
||||
option.typo_threshold = 2.5;
|
||||
kiwi_res_h o, c;
|
||||
for (const char* s : { u8"외않됀데?", u8"나 죰 도와죠.", u8"자알했따", u8"외구거 공부", u8"맗은 믈을 마셧다!" })
|
||||
{
|
||||
|
|
@ -305,6 +314,7 @@ TEST(KiwiC, CustomTypoSet)
|
|||
EXPECT_EQ(kiwi_res_close(c), 0);
|
||||
}
|
||||
|
||||
EXPECT_EQ(kiwi_prepared_typo_close(ptt), 0);
|
||||
EXPECT_EQ(kiwi_typo_close(custom_typo), 0);
|
||||
EXPECT_EQ(kiwi_builder_close(builder), 0);
|
||||
EXPECT_EQ(kiwi_close(typo_kw), 0);
|
||||
|
|
|
|||
|
|
@ -22,17 +22,17 @@ TestInitializer _global_initializer;
|
|||
|
||||
using namespace kiwi;
|
||||
|
||||
inline testing::AssertionResult testTokenization(Kiwi& kiwi, const std::u16string& s)
|
||||
inline testing::AssertionResult testTokenization(Kiwi& kiwi, const std::u16string& s, AnalyzeOption option)
|
||||
{
|
||||
auto tokens = kiwi.analyze(s, Match::all).first;
|
||||
if (tokens.empty()) return testing::AssertionFailure() << "kiwi.analyze(" << testing::PrintToString(s) << ") yields an empty result.";
|
||||
auto tokens = kiwi.analyze(s, option).first;
|
||||
if (tokens.empty()) return testing::AssertionFailure() << "kiwi.analyze(" << utf16To8(s) << ") yields an empty result.";
|
||||
if (tokens.back().position + tokens.back().length == s.size())
|
||||
{
|
||||
return testing::AssertionSuccess();
|
||||
}
|
||||
else
|
||||
{
|
||||
return testing::AssertionFailure() << "the result of kiwi.analyze(" << testing::PrintToString(s) << ") ends at " << (tokens.back().position + tokens.back().length);
|
||||
return testing::AssertionFailure() << "the result of kiwi.analyze(" << utf16To8(s) << ") ends at " << (tokens.back().position + tokens.back().length);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -57,6 +57,120 @@ Kiwi& reuseKiwiInstance()
|
|||
return kiwi;
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, ChrTokenizer)
|
||||
{
|
||||
ChrTokenizer tokenizer;
|
||||
const std::string_view s = u8"안녕하세요.오늘날씨가참좋네요!Adx9810::~";
|
||||
|
||||
Vector<int32_t> encodedBuf(s.size());
|
||||
encodedBuf.erase(encodedBuf.begin() + tokenizer.encode(s, encodedBuf.data(), encodedBuf.size()), encodedBuf.end());
|
||||
EXPECT_TRUE(std::all_of(encodedBuf.begin(), encodedBuf.end(), [&](int32_t t) { return t < tokenizer.vocabSize(); }));
|
||||
|
||||
std::string decoded = utf16To8(tokenizer.decode(encodedBuf.data(), encodedBuf.size()));
|
||||
EXPECT_EQ(s, decoded);
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, ChrModel)
|
||||
{
|
||||
ChrTokenizer tokenizer;
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
|
||||
auto streamProvider = utils::makeFilesystemProvider(MODEL_PATH);
|
||||
auto stream = streamProvider("nounchr.mdl");
|
||||
|
||||
auto chrModel = lm::CoNgramModelBase::create(utils::createMemoryObjectFromStream(*stream),
|
||||
kiwi.archType(),
|
||||
false,
|
||||
true
|
||||
);
|
||||
|
||||
EXPECT_EQ(chrModel->vocabSize(), tokenizer.vocabSize());
|
||||
|
||||
std::array<int32_t, 256> buf = { 0, };
|
||||
for (auto str : {
|
||||
"한국어",
|
||||
"됐습니다.",
|
||||
"AS365버전",
|
||||
"형태",
|
||||
"형태를",
|
||||
"바다를",
|
||||
"샤를",
|
||||
"카를",
|
||||
"아를",
|
||||
"자갈을",
|
||||
"생선마을",
|
||||
"북구을",
|
||||
"분당을",
|
||||
"사람을",
|
||||
"도서관을",
|
||||
"이민철",
|
||||
"김민철",
|
||||
"황보민수",
|
||||
"남궁민수",
|
||||
})
|
||||
{
|
||||
size_t size = tokenizer.encode(str, buf.data(), buf.size());
|
||||
buf[size++] = 0;
|
||||
float accScore = 0;
|
||||
int32_t nodeIdx = 0;
|
||||
uint32_t contextIdx = 0;
|
||||
chrModel->progressOneStep(nodeIdx, contextIdx, 0);
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
const size_t depth = chrModel->getNodeDepth(nodeIdx);
|
||||
const float score = chrModel->progressOneStep(nodeIdx, contextIdx, buf[i]);
|
||||
const float freq = chrModel->getContextFrequency(contextIdx);
|
||||
const float entropy = chrModel->getContextEntropy(contextIdx);
|
||||
auto tokenStr = utf16To8(tokenizer.decode(&buf[i], 1));
|
||||
std::cerr << " Token: " << tokenStr << "(" << buf[i] << ") Score: " << score << " Depth: " << depth << " Freq: " << freq << " Entropy: " << entropy << std::endl;
|
||||
EXPECT_LT(score, 0.01);
|
||||
accScore += score;
|
||||
}
|
||||
std::cerr << "AccScore for \"" << str << "\": " << accScore << " AvgScore: " << (accScore / (size - 1)) << std::endl;
|
||||
EXPECT_LT(accScore, 0);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, ChrDataset)
|
||||
{
|
||||
constexpr size_t batchSize = 64, contextSize = 8, sentSize = 1000;
|
||||
ChrDataset dataset{ batchSize, contextSize, 0, 0.f };
|
||||
double totalWeight = 0.f;
|
||||
for (size_t i = 0; i < sentSize; ++i)
|
||||
{
|
||||
const float weight = 1.f / (i + 2.f);
|
||||
dataset.addSentence(std::to_string(i), weight, "0");
|
||||
totalWeight += weight;
|
||||
}
|
||||
|
||||
auto vocabProbs = dataset.getVocabProbs();
|
||||
EXPECT_EQ(vocabProbs.size(), dataset.vocabSize());
|
||||
|
||||
std::array<int32_t, batchSize* contextSize> inBuf, outBuf;
|
||||
ChrTokenizer tokenizer;
|
||||
|
||||
Vector<size_t> cnts(sentSize);
|
||||
size_t totalSampled = 0;
|
||||
for (size_t b = 0; b < 10000; ++b)
|
||||
{
|
||||
const size_t n = dataset.next(inBuf.data(), outBuf.data());
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
{
|
||||
const auto decoded = tokenizer.decode(&inBuf[i * contextSize + 1], contextSize - 1);
|
||||
const size_t v = std::stoi(utf16To8(decoded));
|
||||
cnts[v] += 1;
|
||||
totalSampled += 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < sentSize; ++i)
|
||||
{
|
||||
const float expectedProb = (float)((1.f / (i + 2.f)) / totalWeight);
|
||||
const float actualProb = (float)(cnts[i] / (double)totalSampled);
|
||||
EXPECT_NEAR(expectedProb, actualProb, expectedProb * 0.1f) << " for sentence " << i;
|
||||
}
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, ExtractSubstrings)
|
||||
{
|
||||
const std::u16string s = u"자, 너 오늘 하루 뭐 했니? "
|
||||
|
|
@ -120,10 +234,21 @@ TEST(KiwiCpp, EmptyResult)
|
|||
u"스틸블루",
|
||||
u"15살이었므로",
|
||||
u"타란튤라",
|
||||
u"꽃게 맛이 가장 좋다는 봄철에는 알이 통통하게 든 암 꽃게가 많이 잡히며, 게 딱지 속에 노란 알과 내장이 가득하여 게장으로 담그면 좋고, 가을에는 살이 통통하게 오른 숫 꽃게가 많이 잡히는데 살이 많고 찌더라도 퍽퍽하지 않고 부드러워 찜으로 요리하면 좋다.",
|
||||
};
|
||||
for (auto s : testCases)
|
||||
{
|
||||
EXPECT_TRUE(testTokenization(kiwi, s));
|
||||
EXPECT_TRUE(testTokenization(kiwi, s, Match::all));
|
||||
}
|
||||
|
||||
AnalyzeOption option = Match::allWithNormalizing | Match::oovChrFreqModel | Match::mergeSaisiot;
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinualAndLengthening);
|
||||
auto testCases2 = {
|
||||
u"해물톳짜장 나왔습니당 쫄깃한 면발 위에 듬뿍 얹어진 톳 그 위에 방풍나물 마라도 짜장면 맛집 인정!",
|
||||
};
|
||||
for (auto s : testCases2)
|
||||
{
|
||||
EXPECT_TRUE(testTokenization(kiwi, s, option));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -154,15 +279,32 @@ TEST(KiwiCpp, SingleConsonantMorpheme)
|
|||
TEST(KiwiCpp, SpecialTokenErrorOnContinualTypo)
|
||||
{
|
||||
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, ModelType::none };
|
||||
Kiwi typoKiwi = builder.build(DefaultTypoSet::continualTypoSet);
|
||||
Kiwi typoKiwi = builder.build();
|
||||
AnalyzeOption option = Match::allWithNormalizing;
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::continualTypoSet);
|
||||
|
||||
auto res = typoKiwi.analyze(u"감사합니다 -친구들과", Match::allWithNormalizing).first;
|
||||
auto res = typoKiwi.analyze(u"감사합니다 -친구들과", option).first;
|
||||
EXPECT_EQ(res[0].str, u"감사");
|
||||
EXPECT_EQ(res[1].str, u"하");
|
||||
EXPECT_EQ(res[3].str, u"-");
|
||||
EXPECT_EQ(res[3].tag, POSTag::so);
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, MultiWordTypo)
|
||||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
AnalyzeOption option = Match::allWithNormalizing;
|
||||
auto res = kiwi.analyze(u"존 F. 케네디 주니어", option).first;
|
||||
EXPECT_EQ(res[0].str, u"존 F. 케네디 주니어");
|
||||
res = kiwi.analyze(u"존 F. 캐네디 주니어", option).first;
|
||||
EXPECT_NE(res[0].str, u"존 F. 케네디 주니어");
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSet);
|
||||
res = kiwi.analyze(u"존 F. 캐네디 주니어", option).first;
|
||||
EXPECT_EQ(res[0].str, u"존 F. 케네디 주니어");
|
||||
res = kiwi.analyze(u"존F.캐네디주니어", option).first;
|
||||
EXPECT_EQ(res[0].str, u"존 F. 케네디 주니어");
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, SplitComplex)
|
||||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
|
|
@ -308,7 +450,8 @@ TEST(KiwiCpp, Pretokenized)
|
|||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
auto str = u"드디어패트와 매트가 2017년에 국내 개봉했다. 패트와매트는 2016년...";
|
||||
|
||||
AnalyzeOption option = Match::allWithNormalizing;
|
||||
|
||||
std::vector<TokenInfo> res;
|
||||
{
|
||||
std::vector<PretokenizedSpan> pretokenized = {
|
||||
|
|
@ -317,7 +460,7 @@ TEST(KiwiCpp, Pretokenized)
|
|||
PretokenizedSpan{ 34, 39, {} },
|
||||
};
|
||||
|
||||
res = kiwi.analyze(str, Match::allWithNormalizing, pretokenized).first;
|
||||
res = kiwi.analyze(str, option, pretokenized).first;
|
||||
EXPECT_EQ(res[1].str, u"패트와 매트");
|
||||
EXPECT_EQ(res[3].str, u"2017년");
|
||||
EXPECT_EQ(res[13].str, u"2016년");
|
||||
|
|
@ -330,7 +473,7 @@ TEST(KiwiCpp, Pretokenized)
|
|||
PretokenizedSpan{ 21, 24, { BasicToken{ u"개봉하", 0, 3, POSTag::vv }, BasicToken{ u"었", 2, 3, POSTag::ep } }},
|
||||
};
|
||||
|
||||
res = kiwi.analyze(str, Match::allWithNormalizing, pretokenized).first;
|
||||
res = kiwi.analyze(str, option, pretokenized).first;
|
||||
EXPECT_EQ(res[7].str, u"개봉하");
|
||||
EXPECT_EQ(res[7].tag, POSTag::vv);
|
||||
EXPECT_EQ(res[7].position, 21);
|
||||
|
|
@ -351,8 +494,8 @@ TEST(KiwiCpp, Pretokenized)
|
|||
PretokenizedSpan{ 16, 17, { BasicToken{ u"에", 0, 1, POSTag::jkb } } },
|
||||
};
|
||||
|
||||
auto ref = kiwi.analyze(str, Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(str, Match::allWithNormalizing, pretokenized).first;
|
||||
auto ref = kiwi.analyze(str, option).first;
|
||||
res = kiwi.analyze(str, option, pretokenized).first;
|
||||
EXPECT_EQ(res[2].tag, POSTag::jks);
|
||||
EXPECT_EQ(res[2].morph, ref[2].morph);
|
||||
EXPECT_FLOAT_EQ(res[2].score, ref[2].score);
|
||||
|
|
@ -367,8 +510,80 @@ TEST(KiwiCpp, Pretokenized)
|
|||
PretokenizedSpan{ 3, 4, { BasicToken{ u"걷", 0, 1, POSTag::vv } } },
|
||||
};
|
||||
|
||||
auto ref = kiwi.analyze(str2, Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(str2, Match::allWithNormalizing, pretokenized).first;
|
||||
auto ref = kiwi.analyze(str2, option).first;
|
||||
res = kiwi.analyze(str2, option, pretokenized).first;
|
||||
EXPECT_EQ(res[2].tag, POSTag::vvi);
|
||||
EXPECT_EQ(res[2].morph, ref[2].morph);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, PretokenizedWithTypo)
|
||||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
auto str = u"드디어패트와 매트가 2017년에 국내 개봉했다. 패트와매트는 2016년...";
|
||||
AnalyzeOption option = Match::allWithNormalizing;
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinualAndLengthening);
|
||||
|
||||
std::vector<TokenInfo> res;
|
||||
{
|
||||
std::vector<PretokenizedSpan> pretokenized = {
|
||||
PretokenizedSpan{ 3, 9, {} },
|
||||
PretokenizedSpan{ 11, 16, {} },
|
||||
PretokenizedSpan{ 34, 39, {} },
|
||||
};
|
||||
|
||||
res = kiwi.analyze(str, option, pretokenized).first;
|
||||
EXPECT_EQ(res[1].str, u"패트와 매트");
|
||||
EXPECT_EQ(res[3].str, u"2017년");
|
||||
EXPECT_EQ(res[13].str, u"2016년");
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<PretokenizedSpan> pretokenized = {
|
||||
PretokenizedSpan{ 27, 29, { BasicToken{ u"페트", 0, 2, POSTag::nnb } } },
|
||||
PretokenizedSpan{ 30, 32, {} },
|
||||
PretokenizedSpan{ 21, 24, { BasicToken{ u"개봉하", 0, 3, POSTag::vv }, BasicToken{ u"었", 2, 3, POSTag::ep } }},
|
||||
};
|
||||
|
||||
res = kiwi.analyze(str, option, pretokenized).first;
|
||||
EXPECT_EQ(res[7].str, u"개봉하");
|
||||
EXPECT_EQ(res[7].tag, POSTag::vv);
|
||||
EXPECT_EQ(res[7].position, 21);
|
||||
EXPECT_EQ(res[7].length, 3);
|
||||
EXPECT_EQ(res[8].str, u"었");
|
||||
EXPECT_EQ(res[8].tag, POSTag::ep);
|
||||
EXPECT_EQ(res[8].position, 23);
|
||||
EXPECT_EQ(res[8].length, 1);
|
||||
EXPECT_EQ(res[11].str, u"페트");
|
||||
EXPECT_EQ(res[11].tag, POSTag::nnb);
|
||||
EXPECT_EQ(res[13].str, u"매트");
|
||||
EXPECT_EQ(res[13].tag, POSTag::nng);
|
||||
}
|
||||
|
||||
{
|
||||
std::vector<PretokenizedSpan> pretokenized = {
|
||||
PretokenizedSpan{ 9, 10, { BasicToken{ u"가", 0, 1, POSTag::jks } } },
|
||||
PretokenizedSpan{ 16, 17, { BasicToken{ u"에", 0, 1, POSTag::jkb } } },
|
||||
};
|
||||
|
||||
auto ref = kiwi.analyze(str, option).first;
|
||||
res = kiwi.analyze(str, option, pretokenized).first;
|
||||
EXPECT_EQ(res[2].tag, POSTag::jks);
|
||||
EXPECT_EQ(res[2].morph, ref[2].morph);
|
||||
EXPECT_FLOAT_EQ(res[2].score, ref[2].score);
|
||||
EXPECT_EQ(res[5].tag, POSTag::jkb);
|
||||
EXPECT_EQ(res[5].morph, ref[5].morph);
|
||||
EXPECT_FLOAT_EQ(res[5].score, ref[5].score);
|
||||
}
|
||||
|
||||
{
|
||||
auto str2 = u"길을 걷다";
|
||||
std::vector<PretokenizedSpan> pretokenized = {
|
||||
PretokenizedSpan{ 3, 4, { BasicToken{ u"걷", 0, 1, POSTag::vv } } },
|
||||
};
|
||||
|
||||
auto ref = kiwi.analyze(str2, option).first;
|
||||
res = kiwi.analyze(str2, option, pretokenized).first;
|
||||
EXPECT_EQ(res[2].tag, POSTag::vvi);
|
||||
EXPECT_EQ(res[2].morph, ref[2].morph);
|
||||
}
|
||||
|
|
@ -1090,19 +1305,19 @@ TEST(KiwiCpp, AnalyzeError01)
|
|||
TEST(KiwiCpp, NormalizeCoda)
|
||||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
TokenResult res = kiwi.analyze(u"키윜ㅋㅋ", Match::allWithNormalizing);
|
||||
TokenResult res = kiwi.analyze(u"키윜ㅋㅋ", Match::allWithNormalizing | Match::oovChrModel);
|
||||
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㅋㅋㅋ" });
|
||||
res = kiwi.analyze(u"키윟ㅎ", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"키윟ㅎ", Match::allWithNormalizing | Match::oovChrModel);
|
||||
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㅎㅎ" });
|
||||
res = kiwi.analyze(u"키윅ㄱ", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"키윅ㄱ", Match::allWithNormalizing | Match::oovChrModel);
|
||||
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㄱㄱ" });
|
||||
res = kiwi.analyze(u"키윈ㄴㄴ", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"키윈ㄴㄴ", Match::allWithNormalizing | Match::oovChrModel);
|
||||
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㄴㄴㄴ" });
|
||||
res = kiwi.analyze(u"키윊ㅎㅎ", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"키윊ㅎㅎ", Match::allWithNormalizing | Match::oovChrModel);
|
||||
EXPECT_EQ(res.first.back().str, std::u16string{ u"ㅎㅎ" });
|
||||
res = kiwi.analyze(u"키윍ㄱㄱ", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"키윍ㄱㄱ", Match::allWithNormalizing | Match::oovChrModel);
|
||||
EXPECT_EQ(res.first.back().str, std::u16string{u"ㄱㄱ"});
|
||||
}
|
||||
}
|
||||
|
||||
TEST(KiwiCpp, ZCoda)
|
||||
{
|
||||
|
|
@ -1117,9 +1332,9 @@ TEST(KiwiCpp, ZCoda)
|
|||
};
|
||||
for (auto s : testCases)
|
||||
{
|
||||
auto res1 = kiwi.analyze(s.first, Match::allWithNormalizing);
|
||||
auto res2 = kiwi.analyze(s.second, Match::allWithNormalizing);
|
||||
auto res3 = kiwi.analyze(s.second, Match::allWithNormalizing & ~Match::zCoda);
|
||||
auto res1 = kiwi.analyze(s.first, Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
auto res2 = kiwi.analyze(s.second, Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
auto res3 = kiwi.analyze(s.second, (Match::allWithNormalizing | Match::oovChrFreqModel) & ~Match::zCoda);
|
||||
EXPECT_GE(res1.second - kiwi.getGlobalConfig().typoCostWeight, res2.second);
|
||||
EXPECT_GT(res2.second, res3.second);
|
||||
EXPECT_EQ(res2.first[res2.first.size() - 2].tag, POSTag::z_coda);
|
||||
|
|
@ -1130,8 +1345,9 @@ TEST(KiwiCpp, ZCoda)
|
|||
TEST(KiwiCpp, ZSiot)
|
||||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
|
||||
auto resSplit = kiwi.analyze(u"찰랑찰랑한 머릿결과 볼륨감", Match::allWithNormalizing | Match::splitSaisiot);
|
||||
KiwiConfig config = kiwi.getGlobalConfig();
|
||||
config.oovRuleScale = 6;
|
||||
auto resSplit = kiwi.analyze(u"찰랑찰랑한 머릿결과 볼륨감", Match::allWithNormalizing | Match::splitSaisiot, {}, config);
|
||||
EXPECT_EQ(resSplit.first.size(), 8);
|
||||
EXPECT_EQ(resSplit.first[3].str, u"머리");
|
||||
EXPECT_EQ(resSplit.first[4].tag, POSTag::z_siot);
|
||||
|
|
@ -1139,9 +1355,9 @@ TEST(KiwiCpp, ZSiot)
|
|||
|
||||
for (auto s : {u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방"})
|
||||
{
|
||||
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
|
||||
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
|
||||
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
|
||||
auto resNone = kiwi.analyze(s, Match::allWithNormalizing, {}, config);
|
||||
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot, {}, config);
|
||||
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot, {}, config);
|
||||
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
|
||||
EXPECT_EQ(resSplit.first.size(), 3);
|
||||
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
|
||||
|
|
@ -1153,9 +1369,9 @@ TEST(KiwiCpp, ZSiot)
|
|||
|
||||
for (auto s : {u"발렛 파킹", u"미닛"})
|
||||
{
|
||||
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
|
||||
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
|
||||
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
|
||||
auto resNone = kiwi.analyze(s, Match::allWithNormalizing, {}, config);
|
||||
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot, {}, config);
|
||||
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot, {}, config);
|
||||
EXPECT_EQ(resNone.second, resSplit.second);
|
||||
EXPECT_EQ(resNone.second, resMerge.second);
|
||||
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
|
||||
|
|
@ -1164,13 +1380,16 @@ TEST(KiwiCpp, ZSiot)
|
|||
|
||||
TEST(KiwiCpp, ZSiotWithTypo)
|
||||
{
|
||||
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build(getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual));
|
||||
|
||||
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build();
|
||||
AnalyzeOption option;
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinual);
|
||||
KiwiConfig config = kiwi.getGlobalConfig();
|
||||
config.oovRuleScale = 6;
|
||||
for (auto s : { u"하굣길", u"만둣국", u"나뭇잎", u"세숫물", u"고춧가루", u"시곗바늘", u"사글셋방" })
|
||||
{
|
||||
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
|
||||
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
|
||||
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
|
||||
auto resNone = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing), {}, config);
|
||||
auto resSplit = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::splitSaisiot), {}, config);
|
||||
auto resMerge = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::mergeSaisiot), {}, config);
|
||||
EXPECT_FALSE(std::any_of(resNone.first.begin(), resNone.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
|
||||
EXPECT_EQ(resSplit.first.size(), 3);
|
||||
EXPECT_EQ(resSplit.first[0].tag, POSTag::nng);
|
||||
|
|
@ -1182,9 +1401,9 @@ TEST(KiwiCpp, ZSiotWithTypo)
|
|||
|
||||
for (auto s : { u"발렛 파킹", u"미닛" })
|
||||
{
|
||||
auto resNone = kiwi.analyze(s, Match::allWithNormalizing);
|
||||
auto resSplit = kiwi.analyze(s, Match::allWithNormalizing | Match::splitSaisiot);
|
||||
auto resMerge = kiwi.analyze(s, Match::allWithNormalizing | Match::mergeSaisiot);
|
||||
auto resNone = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing), {}, config);
|
||||
auto resSplit = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::splitSaisiot), {}, config);
|
||||
auto resMerge = kiwi.analyze(s, option.withMatch(Match::allWithNormalizing | Match::mergeSaisiot), {}, config);
|
||||
EXPECT_EQ(resNone.second, resSplit.second);
|
||||
EXPECT_EQ(resNone.second, resMerge.second);
|
||||
EXPECT_FALSE(std::any_of(resSplit.first.begin(), resSplit.first.end(), [](const TokenInfo& token) { return token.tag == POSTag::z_siot; }));
|
||||
|
|
@ -1468,6 +1687,19 @@ TEST(KiwiCpp, JoinAffix)
|
|||
EXPECT_EQ(res5.first[5].str, u"배송되");
|
||||
}
|
||||
|
||||
// Checks that Match::joinParticleYo merges the trailing particle "요" into the
// preceding ending, while Match::none leaves them as two separate tokens.
TEST(KiwiCpp, JoinParticleYo)
{
	Kiwi& kiwi = reuseKiwiInstance();
	auto sample1 = u"밥을 먹는다던가요";
	auto res_without = kiwi.analyze(sample1, Match::none).first;
	auto res_with = kiwi.analyze(sample1, Match::joinParticleYo).first;

	// Fatal size checks: the expectations below index from the back, which
	// would read out of bounds if the analyzer returned too few tokens.
	ASSERT_GE(res_without.size(), 2u);
	ASSERT_FALSE(res_with.empty());

	EXPECT_EQ(res_without[res_without.size() - 2].str, u"는다던가");
	EXPECT_EQ(res_without[res_without.size() - 1].str, u"요");

	EXPECT_EQ(res_with[res_with.size() - 1].str, u"는다던가요");
}
|
||||
|
||||
TEST(KiwiCpp, CompatibleJamo)
|
||||
{
|
||||
Kiwi& kiwi = reuseKiwiInstance();
|
||||
|
|
@ -1815,8 +2047,10 @@ TEST(KiwiCpp, Issue205)
|
|||
|
||||
EXPECT_EQ(res1[0].str, u"함박 스테이크");
|
||||
|
||||
auto kiwi2 = builder.build(DefaultTypoSet::basicTypoSetWithContinual);
|
||||
auto res2 = kiwi2.analyze(u"함박 스테이크를 먹었습니다", Match::allWithNormalizing).first;
|
||||
auto kiwi2 = builder.build();
|
||||
AnalyzeOption option = Match::allWithNormalizing;
|
||||
option.typoTransformer = getDefaultPreparedTypoSet(DefaultTypoSet::basicTypoSetWithContinual);
|
||||
auto res2 = kiwi2.analyze(u"함박 스테이크를 먹었습니다", option).first;
|
||||
|
||||
EXPECT_EQ(res2[0].str, u"함박 스테이크");
|
||||
}
|
||||
|
|
@ -1919,3 +2153,19 @@ TEST(KiwiCpp, Issue231)
|
|||
EXPECT_EQ(tokens.size(), 1);
|
||||
EXPECT_EQ(tokens[0].str, u"숫");
|
||||
}
|
||||
|
||||
// Regression test for issue 246: for each sample, the first token of the
// best of the top-5 analyses must be tagged POSTag::sb.
TEST(KiwiCpp, Issue246)
{
	auto& kiwi = reuseKiwiInstance();
	for (auto s : {
		u"1. 분석",
		u"1. 해야 하는 일",
		u"1. 해야 하는 업무",
		u"1. 수학적 증명",
		u"1. Dataset"
		})
	{
		auto res = kiwi.analyze(s, 5, Match::allWithNormalizing);
		// Fail cleanly (instead of indexing out of bounds) when no candidate
		// or no token comes back for this input.
		ASSERT_FALSE(res.empty()) << " for input: " << utf16To8(s);
		ASSERT_FALSE(res[0].first.empty()) << " for input: " << utf16To8(s);
		EXPECT_EQ(res[0].first[0].tag, POSTag::sb) << " for input: " << utf16To8(s);
	}
}
|
||||
|
|
|
|||
|
|
@ -5,6 +5,27 @@
|
|||
|
||||
using namespace kiwi;
|
||||
|
||||
// Exercises PreparedTypoTransformer::generateGraph, first with a small
// hand-built typo set and then with the default basic typo set.
TEST(KiwiTypo, GenerateGraph)
{
	std::vector<TypoGraphNode> graph;
	std::u16string normalized;

	// Custom transformer with three substitution rules.
	TypoTransformer transformer;
	transformer.addTypo(u"ㅐ", u"ㅚ");
	transformer.addTypo(u"레", u"뢰");
	transformer.addTypo(u"뢨", u"룄");
	auto prepared = transformer.prepare(true);

	const std::u16string_view customInput{ u"그럼 내괴다룄네" };
	normalizeHangul(normalized, customInput);
	auto nodeCount = prepared.generateGraph(normalized, graph);
	EXPECT_EQ(nodeCount, 11);

	// Same graph buffer, now driven by the default basic typo set.
	prepared = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare(true);
	normalized.clear();
	const std::u16string_view defaultInput{ u"앗뿔싸 그럼 오늘부터 다시 열심히 해보자꾸나." };
	normalizeHangul(normalized, defaultInput);
	nodeCount = prepared.generateGraph(normalized, graph);
	EXPECT_GT(nodeCount, 0);
}
|
||||
|
||||
TEST(KiwiTypo, Generate)
|
||||
{
|
||||
TypoTransformer tt;
|
||||
|
|
@ -13,14 +34,14 @@ TEST(KiwiTypo, Generate)
|
|||
tt.addTypo(u"사에", u"사레");
|
||||
auto ptt = tt.prepare();
|
||||
UnorderedMap<std::u16string, float> typos;
|
||||
|
||||
|
||||
typos.clear();
|
||||
for (auto e : ptt.generate(u"%없어"))
|
||||
{
|
||||
typos.emplace(e.str, e.cost);
|
||||
}
|
||||
EXPECT_EQ(typos.size(), 1);
|
||||
|
||||
|
||||
typos.clear();
|
||||
for (auto e : ptt.generate(u"개가납네", 2))
|
||||
{
|
||||
|
|
@ -56,7 +77,7 @@ TEST(KiwiTypo, Generate)
|
|||
TEST(KiwiTypo, BasicTypoSet)
|
||||
{
|
||||
auto ptt = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare();
|
||||
|
||||
|
||||
for (auto t : ptt.generate(u"의"))
|
||||
{
|
||||
}
|
||||
|
|
@ -75,25 +96,31 @@ TEST(KiwiTypo, Builder)
|
|||
TypoTransformer tt;
|
||||
tt.addTypo(u"ㅐ", u"ㅔ");
|
||||
tt.addTypo(u"ㅔ", u"ㅐ");
|
||||
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build(tt);
|
||||
auto ptt = tt.prepare(true);
|
||||
|
||||
Kiwi kiwi = KiwiBuilder{ MODEL_PATH, 0, BuildOption::default_, }.build();
|
||||
|
||||
AnalyzeOption option;
|
||||
option.match = Match::allWithNormalizing;
|
||||
option.typoTransformer = &ptt;
|
||||
|
||||
auto config = kiwi.getGlobalConfig();
|
||||
TokenResult ret;
|
||||
config.typoCostWeight = 1e-9;
|
||||
kiwi.setGlobalConfig(config);
|
||||
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
|
||||
|
||||
ret = kiwi.analyze(u"문화제 보호", option);
|
||||
|
||||
config.typoCostWeight = 2;
|
||||
kiwi.setGlobalConfig(config);
|
||||
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
|
||||
|
||||
ret = kiwi.analyze(u"문화제 보호", option);
|
||||
|
||||
config.typoCostWeight = 4;
|
||||
kiwi.setGlobalConfig(config);
|
||||
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
|
||||
ret = kiwi.analyze(u"문화제 보호", option);
|
||||
|
||||
config.typoCostWeight = 6;
|
||||
kiwi.setGlobalConfig(config);
|
||||
ret = kiwi.analyze(u"문화제 보호", Match::allWithNormalizing);
|
||||
ret = kiwi.analyze(u"문화제 보호", option);
|
||||
}
|
||||
|
||||
TEST(KiwiTypo, AnalyzeBasicTypoSet)
|
||||
|
|
@ -101,73 +128,83 @@ TEST(KiwiTypo, AnalyzeBasicTypoSet)
|
|||
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
|
||||
Kiwi kiwi = builder.build();
|
||||
|
||||
Kiwi typoKiwi = builder.build(DefaultTypoSet::basicTypoSet);
|
||||
auto config = typoKiwi.getGlobalConfig();
|
||||
auto ptt = getDefaultTypoSet(DefaultTypoSet::basicTypoSet).prepare(true);
|
||||
|
||||
AnalyzeOption option;
|
||||
option.match = Match::allWithNormalizing | Match::oovChrFreqModel;
|
||||
option.typoTransformer = &ptt;
|
||||
|
||||
auto config = kiwi.getGlobalConfig();
|
||||
config.typoCostWeight = 5;
|
||||
typoKiwi.setGlobalConfig(config);
|
||||
kiwi.setGlobalConfig(config);
|
||||
|
||||
TokenResult o = kiwi.analyze(u"외않됀데?", Match::allWithNormalizing);
|
||||
TokenResult c = typoKiwi.analyze(u"외않됀데?", Match::allWithNormalizing);
|
||||
TokenResult o = kiwi.analyze(u"외않됀데?", Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
TokenResult c = kiwi.analyze(u"외않됀데?", option);
|
||||
EXPECT_TRUE(o.second < c.second);
|
||||
|
||||
o = kiwi.analyze(u"나 죰 도와죠.", Match::allWithNormalizing);
|
||||
c = typoKiwi.analyze(u"나 죰 도와죠.", Match::allWithNormalizing);
|
||||
o = kiwi.analyze(u"나 죰 도와죠.", Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
c = kiwi.analyze(u"나 죰 도와죠.", option);
|
||||
EXPECT_TRUE(o.second < c.second);
|
||||
|
||||
o = kiwi.analyze(u"잘했따", Match::allWithNormalizing);
|
||||
c = typoKiwi.analyze(u"잘했따", Match::allWithNormalizing);
|
||||
o = kiwi.analyze(u"잘했따", Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
c = kiwi.analyze(u"잘했따", option);
|
||||
EXPECT_TRUE(o.second < c.second);
|
||||
|
||||
o = kiwi.analyze(u"외구거 공부", Match::allWithNormalizing);
|
||||
c = typoKiwi.analyze(u"외구거 공부", Match::allWithNormalizing);
|
||||
o = kiwi.analyze(u"외구거 공부", Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
c = kiwi.analyze(u"외구거 공부", option);
|
||||
EXPECT_TRUE(o.second < c.second);
|
||||
|
||||
o = kiwi.analyze(u"맗은 믈을 마셧다!", Match::allWithNormalizing);
|
||||
c = typoKiwi.analyze(u"맗은 믈을 마셧다!", Match::allWithNormalizing);
|
||||
o = kiwi.analyze(u"맗은 믈을 마셧다!", Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
c = kiwi.analyze(u"맗은 믈을 마셧다!", option);
|
||||
EXPECT_TRUE(o.second < c.second);
|
||||
|
||||
o = kiwi.analyze(u"Wertheimer)가 자신의 논문 <운동지각에 관한 실험연구>(Experimental studies on the perception of movement)을 통해 일상적인 지각 현상에 대한 새로운 시각을 제시한 시기이다.",
|
||||
Match::allWithNormalizing);
|
||||
c = typoKiwi.analyze(u"Wertheimer)가 자신의 논문 <운동지각에 관한 실험연구>(Experimental studies on the perception of movement)을 통해 일상적인 지각 현상에 대한 새로운 시각을 제시한 시기이다.",
|
||||
Match::allWithNormalizing);
|
||||
Match::allWithNormalizing | Match::oovChrFreqModel);
|
||||
c = kiwi.analyze(u"Wertheimer)가 자신의 논문 <운동지각에 관한 실험연구>(Experimental studies on the perception of movement)을 통해 일상적인 지각 현상에 대한 새로운 시각을 제시한 시기이다.",
|
||||
option);
|
||||
}
|
||||
|
||||
TEST(KiwiTypo, ContinualTypoSet)
|
||||
{
|
||||
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
|
||||
Kiwi typoKiwi = builder.build(DefaultTypoSet::continualTypoSet);
|
||||
Kiwi kiwi = builder.build();
|
||||
|
||||
auto res = typoKiwi.analyze(u"프로그래미", Match::allWithNormalizing).first;
|
||||
auto ptt = getDefaultTypoSet(DefaultTypoSet::continualTypoSet).prepare(true);
|
||||
|
||||
AnalyzeOption option{ Match::allWithNormalizing };
|
||||
option.typoTransformer = &ptt;
|
||||
|
||||
auto res = kiwi.analyze(u"프로그래미", option).first;
|
||||
EXPECT_EQ(res.size(), 2);
|
||||
EXPECT_EQ(res[0].str, u"프로그램");
|
||||
EXPECT_EQ(res[1].str, u"이");
|
||||
|
||||
res = typoKiwi.analyze(u"프로그래믈", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"프로그래믈", option).first;
|
||||
EXPECT_EQ(res.size(), 2);
|
||||
EXPECT_EQ(res[0].str, u"프로그램");
|
||||
EXPECT_EQ(res[1].str, u"을");
|
||||
|
||||
res = typoKiwi.analyze(u"오늘사무시레서", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"오늘사무시레서", option).first;
|
||||
EXPECT_EQ(res.size(), 3);
|
||||
EXPECT_EQ(res[1].str, u"사무실");
|
||||
EXPECT_EQ(res[2].str, u"에서");
|
||||
|
||||
res = typoKiwi.analyze(u"법원이 기가캤다.", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"법원이 기가캤다.", option).first;
|
||||
EXPECT_EQ(res.size(), 7);
|
||||
EXPECT_EQ(res[2].str, u"기각");
|
||||
EXPECT_EQ(res[3].str, u"하");
|
||||
|
||||
res = typoKiwi.analyze(u"하나도 업써.", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"하나도 업써.", option).first;
|
||||
EXPECT_EQ(res.size(), 5);
|
||||
EXPECT_EQ(res[2].str, u"없");
|
||||
EXPECT_EQ(res[3].str, u"어");
|
||||
|
||||
res = typoKiwi.analyze(u"말근 하늘", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"말근 하늘", option).first;
|
||||
EXPECT_EQ(res.size(), 3);
|
||||
EXPECT_EQ(res[0].str, u"맑");
|
||||
EXPECT_EQ(res[1].str, u"은");
|
||||
|
||||
res = typoKiwi.analyze(u"아주 만타.", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"아주 만타.", option).first;
|
||||
EXPECT_EQ(res.size(), 4);
|
||||
EXPECT_EQ(res[1].str, u"많");
|
||||
EXPECT_EQ(res[2].str, u"다");
|
||||
|
|
@ -177,74 +214,84 @@ TEST(KiwiTypo, ContinualTypoSet)
|
|||
TEST(KiwiTypo, BasicTypoSetWithContinual)
|
||||
{
|
||||
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
|
||||
Kiwi typoKiwi = builder.build(DefaultTypoSet::basicTypoSetWithContinual);
|
||||
Kiwi kiwi = builder.build();
|
||||
|
||||
auto res = typoKiwi.analyze(u"프로그레미", Match::allWithNormalizing).first;
|
||||
auto ptt = getDefaultTypoSet(DefaultTypoSet::basicTypoSetWithContinual).prepare(true);
|
||||
|
||||
AnalyzeOption option;
|
||||
option.match = Match::allWithNormalizing | Match::oovChrFreqModel;
|
||||
option.typoTransformer = &ptt;
|
||||
|
||||
auto config = kiwi.getGlobalConfig();
|
||||
|
||||
auto res = kiwi.analyze(u"프로그레믈", option, {}, config).first;
|
||||
EXPECT_EQ(res.size(), 2);
|
||||
EXPECT_EQ(res[0].str, u"프로그램");
|
||||
EXPECT_EQ(res[1].str, u"이");
|
||||
if (res.size() > 1) EXPECT_EQ(res[1].str, u"을");
|
||||
|
||||
res = typoKiwi.analyze(u"프로그레믈", Match::allWithNormalizing).first;
|
||||
EXPECT_EQ(res.size(), 2);
|
||||
EXPECT_EQ(res[0].str, u"프로그램");
|
||||
EXPECT_EQ(res[1].str, u"을");
|
||||
|
||||
res = typoKiwi.analyze(u"오늘사므시레서", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"오늘사므시레서", option, {}, config).first;
|
||||
EXPECT_EQ(res.size(), 3);
|
||||
EXPECT_EQ(res[1].str, u"사무실");
|
||||
EXPECT_EQ(res[2].str, u"에서");
|
||||
if (res.size() > 1) EXPECT_EQ(res[1].str, u"사무실");
|
||||
if (res.size() > 2) EXPECT_EQ(res[2].str, u"에서");
|
||||
|
||||
res = typoKiwi.analyze(u"버붠이 기가캤다.", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"버붠이 기가캤다.", option, {}, config).first;
|
||||
EXPECT_EQ(res.size(), 7);
|
||||
EXPECT_EQ(res[2].str, u"기각");
|
||||
EXPECT_EQ(res[3].str, u"하");
|
||||
if (res.size() > 2) EXPECT_EQ(res[2].str, u"기각");
|
||||
if (res.size() > 3) EXPECT_EQ(res[3].str, u"하");
|
||||
|
||||
res = typoKiwi.analyze(u"하나도 업써.", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"하나도 업써.", option, {}, config).first;
|
||||
EXPECT_EQ(res.size(), 5);
|
||||
EXPECT_EQ(res[2].str, u"없");
|
||||
EXPECT_EQ(res[3].str, u"어");
|
||||
if (res.size() > 2) EXPECT_EQ(res[2].str, u"없");
|
||||
if (res.size() > 3) EXPECT_EQ(res[3].str, u"어");
|
||||
|
||||
res = typoKiwi.analyze(u"말근 하늘", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"말근 하늘", option, {}, config).first;
|
||||
EXPECT_EQ(res.size(), 3);
|
||||
EXPECT_EQ(res[0].str, u"맑");
|
||||
EXPECT_EQ(res[1].str, u"은");
|
||||
if (res.size() > 1) EXPECT_EQ(res[1].str, u"은");
|
||||
|
||||
res = typoKiwi.analyze(u"아주 만타.", Match::allWithNormalizing).first;
|
||||
res = kiwi.analyze(u"아주 만타.", option, {}, config).first;
|
||||
EXPECT_EQ(res.size(), 4);
|
||||
EXPECT_EQ(res[1].str, u"많");
|
||||
EXPECT_EQ(res[2].str, u"다");
|
||||
if (res.size() > 1) EXPECT_EQ(res[1].str, u"많");
|
||||
if (res.size() > 2) EXPECT_EQ(res[2].str, u"다");
|
||||
}
|
||||
|
||||
TEST(KiwiTypo, LengtheningTypoSet)
|
||||
{
|
||||
KiwiBuilder builder{ MODEL_PATH, 0, BuildOption::default_, };
|
||||
Kiwi typoKiwi = builder.build(DefaultTypoSet::lengtheningTypoSet);
|
||||
const float typoCost = typoKiwi.getGlobalConfig().typoCostWeight * 0.25f;
|
||||
Kiwi kiwi = builder.build();
|
||||
|
||||
auto ref = typoKiwi.analyze(u"진짜?", Match::allWithNormalizing);
|
||||
auto res = typoKiwi.analyze(u"지인짜?", Match::allWithNormalizing);
|
||||
auto ptt = getDefaultTypoSet(DefaultTypoSet::lengtheningTypoSet).prepare(true);
|
||||
|
||||
AnalyzeOption option;
|
||||
option.match = Match::allWithNormalizing;
|
||||
option.typoTransformer = &ptt;
|
||||
|
||||
const float typoCost = kiwi.getGlobalConfig().typoCostWeight * 0.25f;
|
||||
|
||||
auto ref = kiwi.analyze(u"진짜?", option);
|
||||
auto res = kiwi.analyze(u"지인짜?", option);
|
||||
EXPECT_FLOAT_EQ(ref.second - 4 * typoCost, res.second);
|
||||
EXPECT_EQ(res.first.size(), 2);
|
||||
EXPECT_EQ(res.first[0].str, u"진짜");
|
||||
EXPECT_EQ(res.first[1].str, u"?");
|
||||
|
||||
res = typoKiwi.analyze(u"지인짜아?", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"지인짜아?", option);
|
||||
EXPECT_FLOAT_EQ(ref.second - 5 * typoCost, res.second);
|
||||
EXPECT_EQ(res.first.size(), 2);
|
||||
EXPECT_EQ(res.first[0].str, u"진짜");
|
||||
EXPECT_EQ(res.first[1].str, u"?");
|
||||
|
||||
res = typoKiwi.analyze(u"그으으래?", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"그으으래?", option);
|
||||
EXPECT_EQ(res.first.size(), 2);
|
||||
EXPECT_EQ(res.first[0].str, u"그래");
|
||||
EXPECT_EQ(res.first[1].str, u"?");
|
||||
|
||||
res = typoKiwi.analyze(u"그으으으으래?", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"그으으으으래?", option);
|
||||
EXPECT_EQ(res.first.size(), 2);
|
||||
EXPECT_EQ(res.first[0].str, u"그래");
|
||||
EXPECT_EQ(res.first[1].str, u"?");
|
||||
|
||||
res = typoKiwi.analyze(u"학교오를 가야아해", Match::allWithNormalizing);
|
||||
res = kiwi.analyze(u"학교오를 가야아해", option);
|
||||
EXPECT_EQ(res.first.size(), 6);
|
||||
EXPECT_EQ(res.first[0].str, u"학교");
|
||||
EXPECT_EQ(res.first[1].str, u"를");
|
||||
|
|
|
|||
|
|
@ -1,4 +1,6 @@
|
|||
#include <fstream>
|
||||
#include <string>
|
||||
#include <regex>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include <kiwi/Utils.h>
|
||||
|
|
@ -14,6 +16,7 @@ unique_ptr<Evaluator> Evaluator::create(const std::string& evalType)
|
|||
{
|
||||
if (evalType == "morph") return std::make_unique<MorphEvaluator>();
|
||||
if (evalType == "disamb") return std::make_unique<DisambEvaluator>();
|
||||
if (evalType == "noun") return std::make_unique<NounEvaluator>();
|
||||
throw runtime_error{ "Unknown Evaluator Type" };
|
||||
}
|
||||
|
||||
|
|
@ -64,9 +67,12 @@ inline TokenInfo parseWordPOS(const u16string& str)
|
|||
int Evaluator::operator()(const string& modelPath,
|
||||
const string& output,
|
||||
const vector<string>& input,
|
||||
bool normCoda, bool zCoda, bool multiDict, ModelType modelType,
|
||||
bool normCoda, bool zCoda, bool defaultDict, bool multiDict, ModelType modelType,
|
||||
float typoCostWeight, bool bTypo, bool cTypo, bool lTypo,
|
||||
Dialect allowedDialect,
|
||||
Match oovScoringType,
|
||||
float unkFormScoreScale, float unkFormScoreBias,
|
||||
bool oldSplitter,
|
||||
int repeat)
|
||||
{
|
||||
try
|
||||
|
|
@ -83,27 +89,53 @@ int Evaluator::operator()(const string& modelPath,
|
|||
}
|
||||
|
||||
tutils::Timer timer;
|
||||
auto option = (BuildOption::default_ & ~BuildOption::loadMultiDict) | (multiDict ? BuildOption::loadMultiDict : BuildOption::none);
|
||||
auto option = (BuildOption::default_ & ~BuildOption::loadDefaultDict & ~BuildOption::loadMultiDict)
|
||||
| (defaultDict ? BuildOption::loadDefaultDict : BuildOption::none)
|
||||
| (multiDict ? BuildOption::loadMultiDict : BuildOption::none);
|
||||
PreparedTypoTransformer ptt;
|
||||
auto typo = getDefaultTypoSet(DefaultTypoSet::withoutTypo);
|
||||
|
||||
string typoStr = "";
|
||||
if (bTypo)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::basicTypoSet);
|
||||
typoStr += "basic";
|
||||
}
|
||||
|
||||
if (cTypo)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::continualTypoSet);
|
||||
if (!typoStr.empty()) typoStr += "+";
|
||||
typoStr += "continual";
|
||||
}
|
||||
|
||||
if (lTypo)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::lengtheningTypoSet);
|
||||
if (!typoStr.empty()) typoStr += "+";
|
||||
typoStr += "lengthening";
|
||||
}
|
||||
|
||||
if (allowedDialect != Dialect::standard)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::dialect);
|
||||
if (!typoStr.empty()) typoStr += "+";
|
||||
typoStr += "dialect";
|
||||
}
|
||||
Kiwi kw;
|
||||
|
||||
if (oldSplitter)
|
||||
{
|
||||
kw = KiwiBuilder{ modelPath, 1, option, modelType, allowedDialect }.build(
|
||||
typo
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
kw = KiwiBuilder{ modelPath, 1, option, modelType, allowedDialect }.build();
|
||||
ptt = typo.prepare(true);
|
||||
}
|
||||
|
||||
Kiwi kw = KiwiBuilder{ modelPath, 1, option, modelType, allowedDialect }.build(
|
||||
typo
|
||||
);
|
||||
if (typoCostWeight > 0)
|
||||
{
|
||||
auto config = kw.getGlobalConfig();
|
||||
|
|
@ -111,6 +143,26 @@ int Evaluator::operator()(const string& modelPath,
|
|||
kw.setGlobalConfig(config);
|
||||
}
|
||||
|
||||
if (isfinite(unkFormScoreScale))
|
||||
{
|
||||
auto config = kw.getGlobalConfig();
|
||||
config.oovRuleScale = unkFormScoreScale;
|
||||
kw.setGlobalConfig(config);
|
||||
}
|
||||
if (isfinite(unkFormScoreBias))
|
||||
{
|
||||
auto config = kw.getGlobalConfig();
|
||||
if (oovScoringType == Match::oovRuleOnly)
|
||||
{
|
||||
config.oovRuleBias = unkFormScoreBias;
|
||||
}
|
||||
else
|
||||
{
|
||||
config.oovChrBias = unkFormScoreBias;
|
||||
}
|
||||
kw.setGlobalConfig(config);
|
||||
}
|
||||
|
||||
cout << "Loading Time : " << timer.getElapsed() << " ms" << endl;
|
||||
cout << "ArchType : " << archToStr(kw.archType()) << endl;
|
||||
cout << "Model Type : " << modelTypeToStr(kw.modelType()) << endl;
|
||||
|
|
@ -118,16 +170,31 @@ int Evaluator::operator()(const string& modelPath,
|
|||
{
|
||||
cout << "LM Size : " << (kw.getLangModel()->getMemorySize() / 1024. / 1024.) << " MB" << endl;
|
||||
}
|
||||
cout << "OOV Scoring : " << tutils::oovScoringTypeToStr(oovScoringType) << endl;
|
||||
cout << "Typo Correction: " << (typoStr.empty() ? "none" : typoStr) << endl;
|
||||
cout << "Mem Usage : " << (tutils::getCurrentPhysicalMemoryUsage() / 1024.) << " MB\n" << endl;
|
||||
|
||||
double avgMicro = 0, avgMacro = 0;
|
||||
double cnt = 0;
|
||||
AnalyzeOption analyzeOption;
|
||||
analyzeOption.match = (normCoda ? Match::allWithNormalizing : Match::all) & ~(zCoda ? Match::none : Match::zCoda);
|
||||
analyzeOption.match |= oovScoringType;
|
||||
analyzeOption.allowedDialects = allowedDialect;
|
||||
if (oldSplitter)
|
||||
{
|
||||
analyzeOption.match |= Match::useOldSplitter;
|
||||
}
|
||||
else
|
||||
{
|
||||
analyzeOption.typoTransformer = &ptt;
|
||||
}
|
||||
|
||||
for (auto& tf : input)
|
||||
{
|
||||
cout << "Test file: " << tf << endl;
|
||||
try
|
||||
{
|
||||
auto result = eval(output, tf, kw, normCoda, zCoda, allowedDialect, repeat);
|
||||
auto result = eval(output, tf, kw, analyzeOption, repeat);
|
||||
avgMicro += result.first;
|
||||
avgMacro += result.second;
|
||||
++cnt;
|
||||
|
|
@ -229,30 +296,6 @@ auto MorphEvaluator::computeScore(vector<TestResult>& preds, vector<TestResult>&
|
|||
return ret;
|
||||
}
|
||||
|
||||
// Computes disambiguation accuracy over `preds`.
// A prediction is correct when any token of its analysis has the same surface
// form as the target and the same tag after clearIrregular() is applied to
// both. Incorrect predictions are copied into `errors` (cleared first).
// Returns a Score whose `acc` is the fraction of correct predictions and whose
// `totalCount` is the number of predictions seen.
auto DisambEvaluator::computeScore(vector<TestResult>& preds, vector<TestResult>& errors) const -> Score
{
	errors.clear();
	Score score; // NOTE(review): relies on Score's own default initialization of acc/totalCount - confirm
	for (auto& tr : preds)
	{
		bool correct = false;
		for (auto& token : tr.result.first)
		{
			if (token.str == tr.target.str &&
				clearIrregular(token.tag) == clearIrregular(tr.target.tag))
			{
				correct = true;
				break;
			}
		}
		if (correct) score.acc += 1;
		else errors.emplace_back(tr);
		score.totalCount++;
	}
	// Avoid dividing by zero when the test set is empty; acc then keeps its
	// initial value instead of becoming NaN.
	if (score.totalCount > 0) score.acc /= score.totalCount;
	return score;
}
|
||||
|
||||
void MorphEvaluator::TestResult::writeResult(ostream& out) const
|
||||
{
|
||||
out << utf16To8(q) << '\t' << score << endl;
|
||||
|
|
@ -269,12 +312,9 @@ void MorphEvaluator::TestResult::writeResult(ostream& out) const
|
|||
out << endl;
|
||||
}
|
||||
|
||||
pair<double, double> MorphEvaluator::eval(const string& output, const string& file, kiwi::Kiwi& kiwi, bool normCoda, bool zCoda, Dialect allowedDialect, int repeat)
|
||||
pair<double, double> MorphEvaluator::eval(const string& output, const string& file, Kiwi& kiwi, AnalyzeOption option, int repeat)
|
||||
{
|
||||
const size_t topN = 1;
|
||||
AnalyzeOption option;
|
||||
option.match = (normCoda ? Match::allWithNormalizing : Match::all) & ~(zCoda ? Match::none : Match::zCoda);
|
||||
option.allowedDialects = allowedDialect;
|
||||
vector<TestResult> testsets = loadTestset(file), errors;
|
||||
tutils::Timer total;
|
||||
for (int i = 0; i < repeat; ++i)
|
||||
|
|
@ -331,6 +371,30 @@ auto DisambEvaluator::loadTestset(const string& testSetFile) const -> vector<Tes
|
|||
return ret;
|
||||
}
|
||||
|
||||
// Computes disambiguation accuracy over `preds`.
// A prediction is correct when any token of its analysis has the same surface
// form as the target and the same tag after clearIrregular() is applied to
// both. Incorrect predictions are copied into `errors` (cleared first).
// Returns a Score whose `acc` is the fraction of correct predictions and whose
// `totalCount` is the number of predictions seen.
auto DisambEvaluator::computeScore(vector<TestResult>& preds, vector<TestResult>& errors) const -> Score
{
	errors.clear();
	Score score; // NOTE(review): relies on Score's own default initialization of acc/totalCount - confirm
	for (auto& tr : preds)
	{
		bool correct = false;
		for (auto& token : tr.result.first)
		{
			if (token.str == tr.target.str &&
				clearIrregular(token.tag) == clearIrregular(tr.target.tag))
			{
				correct = true;
				break;
			}
		}
		if (correct) score.acc += 1;
		else errors.emplace_back(tr);
		score.totalCount++;
	}
	// Avoid dividing by zero when the test set is empty; acc then keeps its
	// initial value instead of becoming NaN.
	if (score.totalCount > 0) score.acc /= score.totalCount;
	return score;
}
|
||||
|
||||
void DisambEvaluator::TestResult::writeResult(ostream& out) const
|
||||
{
|
||||
out << target << '\t' << utf16To8(text) << '\t' << score << endl;
|
||||
|
|
@ -342,12 +406,9 @@ void DisambEvaluator::TestResult::writeResult(ostream& out) const
|
|||
out << endl;
|
||||
}
|
||||
|
||||
pair<double, double> DisambEvaluator::eval(const string& output, const string& file, kiwi::Kiwi& kiwi, bool normCoda, bool zCoda, Dialect allowedDialect, int repeat)
|
||||
pair<double, double> DisambEvaluator::eval(const string& output, const string& file, Kiwi& kiwi, AnalyzeOption option, int repeat)
|
||||
{
|
||||
const size_t topN = 1;
|
||||
AnalyzeOption option;
|
||||
option.match = (normCoda ? Match::allWithNormalizing : Match::all) & ~(zCoda ? Match::none : Match::zCoda);
|
||||
option.allowedDialects = allowedDialect;
|
||||
vector<TestResult> testsets = loadTestset(file), errors;
|
||||
tutils::Timer total;
|
||||
for (int i = 0; i < repeat; ++i)
|
||||
|
|
@ -383,3 +444,169 @@ pair<double, double> DisambEvaluator::eval(const string& output, const string& f
|
|||
}
|
||||
return make_pair(score.acc, score.acc);
|
||||
}
|
||||
|
||||
// Loads a noun-extraction test set, one test case per line.
// Gold nouns are marked inline as <n>noun</n> or <n e="label">noun</n>. The
// markup is stripped to rebuild the plain input text, and each gold noun is
// recorded in TestResult::golds as (occurrence count, label).
// Throws std::ios_base::failure when the file cannot be opened.
auto NounEvaluator::loadTestset(const string& testSetFile) const -> vector<TestResult>
{
	vector<TestResult> ret;
	ifstream f{ testSetFile };
	if (!f) throw std::ios_base::failure{ "Cannot open '" + testSetFile + "'" };
	string line;

	// Group 1: optional label from the e="..." attribute, group 2: noun text.
	regex nounTagPattern{ "<n(?:\\s+e=\"([^\"]+)\")?>(.+?)</n>" };

	while (getline(f, line))
	{
		// Strip trailing line terminators. The emptiness check is required:
		// back() on an empty string (blank line, or a line holding only
		// "\r") is undefined behaviour.
		while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) line.pop_back();
		TestResult tr;
		smatch matches;
		auto searchStart = line.cbegin();
		string inputText;
		while (regex_search(searchStart, line.cend(), matches, nounTagPattern))
		{
			// Copy the untagged text preceding this match.
			inputText.insert(inputText.end(), searchStart, matches[0].first);
			const u16string nounStr = utf8To16(matches[2].str());
			const string labelStr = matches[1].str();
			// Single lookup; the label of the last occurrence wins.
			auto& gold = tr.golds[nounStr];
			++gold.first;
			gold.second = labelStr;
			// Keep only the noun itself, dropping the surrounding markup.
			inputText.insert(inputText.end(), matches[2].first, matches[2].second);
			searchStart = matches[0].second;
		}
		// Copy whatever follows the last tag (or the whole line if untagged).
		inputText.insert(inputText.end(), searchStart, line.cend());
		tr.text = utf8To16(inputText);
		ret.emplace_back(std::move(tr));
	}
	return ret;
}
|
||||
|
||||
// Scores noun extraction over `preds`.
// Tokens tagged nng/nnp/nnb are the predicted nouns; they are matched against
// each test case's gold nouns by surface form, counting at most
// min(predicted count, gold count) matches per noun. Per-case counters are
// accumulated into the TestResult fields, and cases that miss any labeled gold
// noun are copied into `errors` (cleared first).
// Returns morpheme-level and character-level precision/recall/F1, the recall
// restricted to labeled gold nouns, and the number of test cases.
auto NounEvaluator::computeScore(vector<TestResult>& preds, vector<TestResult>& errors) const -> Score
{
	// num / den, defined as 0 when there is nothing to divide by.
	const auto ratio = [](size_t num, size_t den)
	{
		return (den == 0) ? 0. : static_cast<double>(num) / den;
	};
	// Harmonic mean of precision and recall, defined as 0 when both are 0.
	// The previous code clamped the denominator with max(p + r, 1.), which
	// under-reported F1 whenever p + r < 1.
	const auto f1Of = [](double p, double r)
	{
		const double sum = p + r;
		return (sum > 0) ? 2 * p * r / sum : 0.;
	};

	errors.clear();
	size_t totalCorrect = 0, totalLabeledCorrect = 0, totalGolds = 0, totalLabeledGolds = 0, totalPreds = 0;
	size_t totalCorrectChr = 0, totalPredsChr = 0, totalGoldsChr = 0;
	for (auto& tr : preds)
	{
		// Count predicted nouns by surface form.
		// NOTE(review): the tr.* counters are only added to here, so this
		// presumably expects them to start at zero - confirm.
		std::unordered_map<u16string, size_t> predCnt;
		for (auto& token : tr.result.first)
		{
			if (token.tag == POSTag::nng || token.tag == POSTag::nnp || token.tag == POSTag::nnb)
			{
				++predCnt[token.str];
				tr.numPredsChr += token.str.size();
				++tr.numPreds;
			}
		}
		size_t numCurrentGoldLabels = 0;
		for (auto& [g, info] : tr.golds)
		{
			// Bind by reference to avoid copying the label string per entry.
			const auto& [cnt, label] = info;
			auto it = predCnt.find(g);
			if (it != predCnt.end())
			{
				size_t matchCnt = min(it->second, cnt);
				tr.correct += matchCnt;
				tr.correctChr += g.size() * matchCnt;
				if (!label.empty())
				{
					tr.labeledCorrect += matchCnt;
				}
			}
			if (!label.empty())
			{
				totalLabeledGolds += cnt;
				numCurrentGoldLabels += cnt;
			}
			totalGolds += cnt;
			totalGoldsChr += g.size() * cnt;
		}
		totalPreds += tr.numPreds;
		totalCorrect += tr.correct;
		totalLabeledCorrect += tr.labeledCorrect;
		totalPredsChr += tr.numPredsChr;
		totalCorrectChr += tr.correctChr;
		// A case is an error when at least one labeled gold noun was missed.
		if (tr.labeledCorrect < numCurrentGoldLabels) errors.emplace_back(tr);
	}
	Score score;
	score.precision = ratio(totalCorrect, totalPreds);
	score.recall = ratio(totalCorrect, totalGolds);
	score.labeledRecall = ratio(totalLabeledCorrect, totalLabeledGolds);
	score.f1 = f1Of(score.precision, score.recall);

	score.precisionChr = ratio(totalCorrectChr, totalPredsChr);
	score.recallChr = ratio(totalCorrectChr, totalGoldsChr);
	score.f1Chr = f1Of(score.precisionChr, score.recallChr);
	score.totalCount = preds.size();
	return score;
}
|
||||
|
||||
// Writes one test case to `out`: a tab-separated summary line (text, labeled
// recall, precision, recall, F1), a "Golds:" line listing every gold noun with
// its optional "/label", a line with the analyzed tokens, and a blank line.
void NounEvaluator::TestResult::writeResult(ostream& out) const
{
	// Tally gold occurrences, overall and restricted to labeled nouns.
	size_t goldTotal = 0;
	size_t goldLabeled = 0;
	for (const auto& entry : golds)
	{
		const auto& info = entry.second;
		goldTotal += info.first;
		if (!info.second.empty()) goldLabeled += info.first;
	}

	float precision = 0;
	if (numPreds != 0) precision = (double)correct / numPreds;
	float recall = 0;
	if (goldTotal != 0) recall = (double)correct / goldTotal;
	// The denominator is clamped so that precision + recall == 0 yields 0.
	const float f1 = 2 * precision * recall / max(precision + recall, 1e-10f);
	float labeledRecall = 0;
	if (goldLabeled != 0) labeledRecall = (double)labeledCorrect / labeledGolds_placeholder_guard(goldLabeled);

	out << utf16To8(text) << '\t' << labeledRecall << '\t' << precision << '\t' << recall << '\t' << f1 << endl;

	out << "Golds:" << '\t';
	for (const auto& entry : golds)
	{
		const auto& label = entry.second.second;
		out << utf16To8(entry.first);
		if (!label.empty()) out << ("/" + label);
		else out << "";
		out << '\t';
	}
	out << endl;

	for (auto& r : result.first)
	{
		out << r << '\t';
	}
	out << endl;
	out << endl;
}
|
||||
|
||||
std::pair<double, double> NounEvaluator::eval(const std::string& output, const std::string& file, kiwi::Kiwi& kiwi, kiwi::AnalyzeOption option, int repeat)
|
||||
{
|
||||
vector<TestResult> testsets = loadTestset(file), errors;
|
||||
tutils::Timer total;
|
||||
for (int i = 0; i < repeat; ++i)
|
||||
{
|
||||
for (auto& tr : testsets)
|
||||
{
|
||||
auto cands = kiwi.analyze(tr.text, option);
|
||||
tr.result = cands;
|
||||
}
|
||||
}
|
||||
double tm = total.getElapsed() / repeat;
|
||||
auto score = computeScore(testsets, errors);
|
||||
|
||||
cout << "Labeled Recall: " << score.labeledRecall << endl;
|
||||
cout << "(Morph Level) Precision: " << score.precision << ", Recall: " << score.recall << ", F1: " << score.f1 << endl;
|
||||
cout << "(Chr Level) Precision: " << score.precisionChr << ", Recall: " << score.recallChr << ", F1: " << score.f1Chr << endl;
|
||||
cout << "Total (" << score.totalCount << " lines) Time : " << tm << " ms" << endl;
|
||||
cout << "Time per Line : " << tm / score.totalCount << " ms" << endl;
|
||||
|
||||
if (!output.empty())
|
||||
{
|
||||
const size_t last_slash_idx = file.find_last_of("\\/");
|
||||
string name;
|
||||
if (last_slash_idx != file.npos) name = file.substr(last_slash_idx + 1);
|
||||
else name = file;
|
||||
|
||||
ofstream out{ output + "/" + name };
|
||||
out << "Labeled Recall: " << score.labeledRecall << endl;
|
||||
out << "(Morph Level) Precision: " << score.precision << ", Recall: " << score.recall << ", F1: " << score.f1 << endl;
|
||||
out << "(Chr Level) Precision: " << score.precisionChr << ", Recall: " << score.recallChr << ", F1: " << score.f1Chr << endl;
|
||||
out << "Total (" << score.totalCount << ") Time : " << tm << " ms" << endl;
|
||||
out << "Time per Unit : " << tm / score.totalCount << " ms" << endl;
|
||||
for (auto t : errors)
|
||||
{
|
||||
t.writeResult(out);
|
||||
}
|
||||
}
|
||||
return make_pair(score.labeledRecall, score.f1Chr);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@
|
|||
class Evaluator
|
||||
{
|
||||
virtual std::pair<double, double> eval(const std::string& output, const std::string& file, kiwi::Kiwi& kiwi,
|
||||
bool normCoda, bool zCoda, kiwi::Dialect allowedDialect, int repeat) = 0;
|
||||
kiwi::AnalyzeOption option, int repeat) = 0;
|
||||
public:
|
||||
|
||||
virtual ~Evaluator() = default;
|
||||
|
|
@ -14,9 +14,12 @@ public:
|
|||
int operator()(const std::string& modelPath,
|
||||
const std::string& output,
|
||||
const std::vector<std::string>& input,
|
||||
bool normCoda, bool zCoda, bool multiDict, kiwi::ModelType modelType,
|
||||
bool normCoda, bool zCoda, bool defaultDict, bool multiDict, kiwi::ModelType modelType,
|
||||
float typoCostWeight, bool bTypo, bool cTypo, bool lTypo,
|
||||
kiwi::Dialect allowedDialect,
|
||||
kiwi::Match oovScoringType,
|
||||
float unkFormScoreScale, float unkFormScoreBias,
|
||||
bool oldSplitter,
|
||||
int repeat);
|
||||
};
|
||||
|
||||
|
|
@ -42,7 +45,7 @@ class MorphEvaluator : public Evaluator
|
|||
};
|
||||
|
||||
std::pair<double, double> eval(const std::string& output, const std::string& file, kiwi::Kiwi& kiwi,
|
||||
bool normCoda, bool zCoda, kiwi::Dialect allowedDialect, int repeat) override;
|
||||
kiwi::AnalyzeOption option, int repeat) override;
|
||||
|
||||
std::vector<TestResult> loadTestset(const std::string& file) const;
|
||||
Score computeScore(std::vector<TestResult>& preds, std::vector<TestResult>& errors) const;
|
||||
|
|
@ -66,7 +69,33 @@ class DisambEvaluator : public Evaluator
|
|||
};
|
||||
|
||||
std::pair<double, double> eval(const std::string& output, const std::string& file, kiwi::Kiwi& kiwi,
|
||||
bool normCoda, bool zCoda, kiwi::Dialect allowedDialect, int repeat) override;
|
||||
kiwi::AnalyzeOption option, int repeat) override;
|
||||
|
||||
std::vector<TestResult> loadTestset(const std::string& file) const;
|
||||
Score computeScore(std::vector<TestResult>& preds, std::vector<TestResult>& errors) const;
|
||||
};
|
||||
|
||||
class NounEvaluator : public Evaluator
|
||||
{
|
||||
struct TestResult
|
||||
{
|
||||
std::u16string text;
|
||||
std::unordered_map<std::u16string, std::pair<size_t, std::string>> golds;
|
||||
kiwi::TokenResult result;
|
||||
size_t correct = 0, labeledCorrect = 0, numPreds = 0;
|
||||
size_t correctChr = 0, numPredsChr = 0;
|
||||
void writeResult(std::ostream& out) const;
|
||||
};
|
||||
|
||||
struct Score
|
||||
{
|
||||
double precision = 0, recall = 0, f1 = 0, labeledRecall = 0;
|
||||
double precisionChr = 0, recallChr = 0, f1Chr = 0;
|
||||
size_t totalCount = 0;
|
||||
};
|
||||
|
||||
std::pair<double, double> eval(const std::string& output, const std::string& file, kiwi::Kiwi& kiwi,
|
||||
kiwi::AnalyzeOption option, int repeat) override;
|
||||
|
||||
std::vector<TestResult> loadTestset(const std::string& file) const;
|
||||
Score computeScore(std::vector<TestResult>& preds, std::vector<TestResult>& errors) const;
|
||||
|
|
|
|||
61
tools/chr_model_builder.cpp
Normal file
61
tools/chr_model_builder.cpp
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <kiwi/Kiwi.h>
|
||||
#include <kiwi/CoNgramModel.h>
|
||||
#include <tclap/CmdLine.h>
|
||||
#include "toolUtils.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace kiwi;
|
||||
|
||||
int run(const std::string& contextDef,
|
||||
const std::string& embedding,
|
||||
size_t maxLength, const std::string& output)
|
||||
{
|
||||
try
|
||||
{
|
||||
tutils::Timer timer;
|
||||
auto ret = lm::CoNgramModelBase::buildChrModel(contextDef, embedding, maxLength);
|
||||
ret.writeToFile(output + "/nounchr.mdl");
|
||||
double tm = timer.getElapsed();
|
||||
cout << "Total: " << tm << " ms " << endl;
|
||||
return 0;
|
||||
}
|
||||
catch (const exception& e)
|
||||
{
|
||||
cerr << e.what() << endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
using namespace TCLAP;
|
||||
|
||||
int main(int argc, const char* argv[])
|
||||
{
|
||||
tutils::setUTF8Output();
|
||||
|
||||
CmdLine cmd{ "Kiwi CoNgram Builder", ' ', KIWI_VERSION_STRING };
|
||||
|
||||
ValueArg<string> cdef{ "c", "context-def", "context definition", true, "", "string" };
|
||||
ValueArg<string> emb{ "e", "emb", "embedding file", true, "", "string" };
|
||||
ValueArg<size_t> maxLength{ "l", "max-length", "max length of n-grams", false, (size_t)-1, "int"};
|
||||
ValueArg<string> output{ "o", "output", "", true, "", "string" };
|
||||
|
||||
cmd.add(cdef);
|
||||
cmd.add(emb);
|
||||
cmd.add(maxLength);
|
||||
cmd.add(output);
|
||||
|
||||
try
|
||||
{
|
||||
cmd.parse(argc, argv);
|
||||
}
|
||||
catch (const ArgException& e)
|
||||
{
|
||||
cerr << "error: " << e.error() << " for arg " << e.argId() << endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return run(cdef, emb, maxLength, output);
|
||||
}
|
||||
|
|
@ -22,6 +22,7 @@ int main(int argc, const char* argv[])
|
|||
ValueArg<string> output{ "o", "output", "output dir for evaluation errors", false, "", "string" };
|
||||
SwitchArg noNormCoda{ "", "no-normcoda", "without normalizing coda", false };
|
||||
SwitchArg noZCoda{ "", "no-zcoda", "without z-coda", false };
|
||||
SwitchArg noDefault{ "", "no-default", "turn off default dict", false };
|
||||
SwitchArg noMulti{ "", "no-multi", "turn off multi dict", false };
|
||||
ValueArg<string> modelType{ "t", "type", "model type", false, "none", "string" };
|
||||
ValueArg<float> typoWeight{ "", "typo", "typo weight", false, 0.f, "float"};
|
||||
|
|
@ -30,12 +31,17 @@ int main(int argc, const char* argv[])
|
|||
SwitchArg lTypo{ "", "ltypo", "make lengthening-typo-tolerant model", false };
|
||||
ValueArg<string> dialect{ "d", "dialect", "allowed dialect", false, "standard", "string" };
|
||||
ValueArg<int> repeat{ "", "repeat", "repeat evaluation for benchmark", false, 1, "int" };
|
||||
UnlabeledMultiArg<string> inputs{ "inputs", "evaluation set (--morph, --disamb)", false, "string" };
|
||||
ValueArg<string> oovScoring{ "x", "oov-scoring", "OOV scoring method (none, rule, chr, chrfreq, chrfreqbranch)", false, "rule", "string" };
|
||||
ValueArg<float> unkFormScoreScale{ "", "unk-form-scale", "unknown form score scaling factor (NaN for default)", false, std::numeric_limits<float>::quiet_NaN(), "float" };
|
||||
ValueArg<float> unkFormScoreBias{ "", "unk-form-bias", "unknown form score bias (NaN for default)", false, std::numeric_limits<float>::quiet_NaN(), "float" };
|
||||
SwitchArg oldSplitter{ "", "old-splitter", "use old splitter (for ablation)", false };
|
||||
UnlabeledMultiArg<string> inputs{ "inputs", "evaluation set (--morph, --disamb, --noun)", false, "string" };
|
||||
|
||||
cmd.add(model);
|
||||
cmd.add(output);
|
||||
cmd.add(noNormCoda);
|
||||
cmd.add(noZCoda);
|
||||
cmd.add(noDefault);
|
||||
cmd.add(noMulti);
|
||||
cmd.add(modelType);
|
||||
cmd.add(typoWeight);
|
||||
|
|
@ -44,6 +50,10 @@ int main(int argc, const char* argv[])
|
|||
cmd.add(lTypo);
|
||||
cmd.add(dialect);
|
||||
cmd.add(repeat);
|
||||
cmd.add(oovScoring);
|
||||
cmd.add(unkFormScoreScale);
|
||||
cmd.add(unkFormScoreBias);
|
||||
cmd.add(oldSplitter);
|
||||
cmd.add(inputs);
|
||||
|
||||
try
|
||||
|
|
@ -66,7 +76,18 @@ int main(int argc, const char* argv[])
|
|||
return -1;
|
||||
}
|
||||
|
||||
vector<string> morphInputs, disambInputs;
|
||||
Match oovScoringType = Match::oovRuleOnly;
|
||||
try
|
||||
{
|
||||
oovScoringType = tutils::parseOOVScoring(oovScoring.getValue());
|
||||
}
|
||||
catch (const exception& e)
|
||||
{
|
||||
cerr << e.what() << endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
vector<string> morphInputs, disambInputs, nounInputs;
|
||||
|
||||
string currentType = "";
|
||||
for (auto& input : inputs.getValue())
|
||||
|
|
@ -85,6 +106,10 @@ int main(int argc, const char* argv[])
|
|||
{
|
||||
disambInputs.emplace_back(input);
|
||||
}
|
||||
else if (currentType == "--noun")
|
||||
{
|
||||
nounInputs.emplace_back(input);
|
||||
}
|
||||
else
|
||||
{
|
||||
cerr << "Unknown argument: " << input << endl;
|
||||
|
|
@ -99,10 +124,13 @@ int main(int argc, const char* argv[])
|
|||
{
|
||||
auto evaluator = Evaluator::create("morph");
|
||||
(*evaluator)(model, output, morphInputs,
|
||||
!noNormCoda, !noZCoda, !noMulti,
|
||||
!noNormCoda, !noZCoda, !noDefault, !noMulti,
|
||||
kiwiModelType,
|
||||
typoWeight, bTypo, cTypo, lTypo,
|
||||
allowedDialect,
|
||||
oovScoringType,
|
||||
unkFormScoreScale, unkFormScoreBias,
|
||||
oldSplitter,
|
||||
repeat);
|
||||
cout << endl;
|
||||
}
|
||||
|
|
@ -111,10 +139,28 @@ int main(int argc, const char* argv[])
|
|||
{
|
||||
auto evaluator = Evaluator::create("disamb");
|
||||
(*evaluator)(model, output, disambInputs,
|
||||
!noNormCoda, !noZCoda, !noMulti,
|
||||
!noNormCoda, !noZCoda, !noDefault, !noMulti,
|
||||
kiwiModelType,
|
||||
typoWeight, bTypo, cTypo, lTypo,
|
||||
allowedDialect,
|
||||
oovScoringType,
|
||||
unkFormScoreScale, unkFormScoreBias,
|
||||
oldSplitter,
|
||||
repeat);
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
if (nounInputs.size())
|
||||
{
|
||||
auto evaluator = Evaluator::create("noun");
|
||||
(*evaluator)(model, output, nounInputs,
|
||||
!noNormCoda, !noZCoda, !noDefault, !noMulti,
|
||||
kiwiModelType,
|
||||
typoWeight, bTypo, cTypo, lTypo,
|
||||
allowedDialect,
|
||||
oovScoringType,
|
||||
unkFormScoreScale, unkFormScoreBias,
|
||||
oldSplitter,
|
||||
repeat);
|
||||
cout << endl;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@ int run(const string& modelPath, bool benchmark, const string& output, const str
|
|||
float dialectCost,
|
||||
bool score,
|
||||
ModelType modelType,
|
||||
Match oovScoringType,
|
||||
const vector<string>& input)
|
||||
{
|
||||
try
|
||||
|
|
@ -56,25 +57,33 @@ int run(const string& modelPath, bool benchmark, const string& output, const str
|
|||
|
||||
auto typo = getDefaultTypoSet(DefaultTypoSet::withoutTypo);
|
||||
|
||||
string typoStr = "";
|
||||
if (bTypo)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::basicTypoSet);
|
||||
typoStr += "basic";
|
||||
}
|
||||
|
||||
if (cTypo)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::continualTypoSet);
|
||||
if (!typoStr.empty()) typoStr += "+";
|
||||
typoStr += "continual";
|
||||
}
|
||||
|
||||
if (lTypo)
|
||||
{
|
||||
typo |= getDefaultTypoSet(DefaultTypoSet::lengtheningTypoSet);
|
||||
if (!typoStr.empty()) typoStr += "+";
|
||||
typoStr += "lengthening";
|
||||
}
|
||||
|
||||
|
||||
Kiwi kw = KiwiBuilder{ modelPath, 1, BuildOption::default_, modelType }.build(typo);
|
||||
Kiwi kw = KiwiBuilder{ modelPath, 1, BuildOption::default_, modelType }.build();
|
||||
auto ptt = typo.prepare(true);
|
||||
|
||||
AnalyzeOption option{ Match::allWithNormalizing, nullptr, false, allowedDialects, dialectCost };
|
||||
AnalyzeOption option{ Match::allWithNormalizing | oovScoringType, nullptr, false, allowedDialects, dialectCost };
|
||||
option.typoTransformer = &ptt;
|
||||
|
||||
cout << "Kiwi v" << KIWI_VERSION_STRING << endl;
|
||||
if (tolerance)
|
||||
|
|
@ -99,6 +108,8 @@ int run(const string& modelPath, bool benchmark, const string& output, const str
|
|||
cout << "LM Size : " << (kw.getLangModel()->getMemorySize() / 1024. / 1024.) << " MB" << endl;
|
||||
cout << "Mem Usage : " << (tutils::getCurrentPhysicalMemoryUsage() / 1024.) << " MB" << endl;
|
||||
cout << "ModelType : " << modelTypeToStr(kw.getLangModel()->getType()) << endl;
|
||||
cout << "OOV Scoring : " << tutils::oovScoringTypeToStr(oovScoringType) << endl;
|
||||
cout << "Typo Correction: " << (typoStr.empty() ? "none" : typoStr) << endl;
|
||||
}
|
||||
|
||||
ostream* out = &cout;
|
||||
|
|
@ -187,6 +198,7 @@ int main(int argc, const char* argv[])
|
|||
SwitchArg lTypo{ "", "ltypo", "make lengthening-typo-tolerant model", false };
|
||||
ValueArg<string> dialect{ "d", "dialect", "allowed dialect", false, "standard", "string" };
|
||||
ValueArg<float> dialectCost{ "", "dialect-cost", "dialect cost", false, 6.f, "float" };
|
||||
ValueArg<string> oovScoring{ "x", "oov-scoring", "OOV scoring method (none, rule, chr, chrfreq, chrfreqbranch)", false, "rule", "string" };
|
||||
SwitchArg score{ "s", "score", "print score together" };
|
||||
UnlabeledMultiArg<string> files{ "inputs", "input files", false, "string" };
|
||||
|
||||
|
|
@ -203,6 +215,7 @@ int main(int argc, const char* argv[])
|
|||
cmd.add(lTypo);
|
||||
cmd.add(dialect);
|
||||
cmd.add(dialectCost);
|
||||
cmd.add(oovScoring);
|
||||
cmd.add(score);
|
||||
cmd.add(files);
|
||||
|
||||
|
|
@ -217,7 +230,27 @@ int main(int argc, const char* argv[])
|
|||
}
|
||||
|
||||
Dialect parsedDialect = parseDialects(dialect.getValue());
|
||||
ModelType kiwiModelType = tutils::parseModelType(modelType);
|
||||
ModelType kiwiModelType = ModelType::none;
|
||||
try
|
||||
{
|
||||
kiwiModelType = tutils::parseModelType(modelType);
|
||||
}
|
||||
catch (const exception& e)
|
||||
{
|
||||
cerr << e.what() << endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
Match oovScoringType = Match::oovRuleOnly;
|
||||
try
|
||||
{
|
||||
oovScoringType = tutils::parseOOVScoring(oovScoring.getValue());
|
||||
}
|
||||
catch (const exception& e)
|
||||
{
|
||||
cerr << e.what() << endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
return run(model, benchmark, output, user, topn, tolerance,
|
||||
typoWeight,
|
||||
|
|
@ -228,6 +261,7 @@ int main(int argc, const char* argv[])
|
|||
dialectCost,
|
||||
score,
|
||||
kiwiModelType,
|
||||
oovScoringType,
|
||||
files.getValue());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -150,6 +150,50 @@ namespace tutils
|
|||
}
|
||||
}
|
||||
|
||||
// Converts the command-line name of an OOV scoring method into the
// corresponding kiwi::Match value.
//
// Accepted names: "none", "rule", "chr", "chrfreq", "chrfreqbranch"
// (exact, case-sensitive match).
// Throws std::runtime_error for any other name.
inline kiwi::Match parseOOVScoring(const std::string& str)
{
	// NOTE(review): "none" yields the same value as "rule", while
	// oovScoringTypeToStr() reports "none" only for values outside its four
	// named cases - confirm whether this aliasing is intended.
	if (str == "none" || str == "rule") return kiwi::Match::oovRuleOnly;
	if (str == "chr") return kiwi::Match::oovChrModel;
	if (str == "chrfreq") return kiwi::Match::oovChrFreqModel;
	if (str == "chrfreqbranch") return kiwi::Match::oovChrFreqBranchModel;

	throw std::runtime_error{ "Unknown OOV scoring method: " + str };
}
|
||||
|
||||
inline std::string oovScoringTypeToStr(kiwi::Match t)
|
||||
{
|
||||
switch (t)
|
||||
{
|
||||
case kiwi::Match::oovRuleOnly:
|
||||
return "rule";
|
||||
case kiwi::Match::oovChrModel:
|
||||
return "chr";
|
||||
case kiwi::Match::oovChrFreqModel:
|
||||
return "chrFreq";
|
||||
case kiwi::Match::oovChrFreqBranchModel:
|
||||
return "chrFreqBranch";
|
||||
default:
|
||||
return "none";
|
||||
}
|
||||
}
|
||||
|
||||
template<class BaseStr, class BaseChr, class OutIterator>
|
||||
OutIterator split(BaseStr&& s, BaseChr delim, OutIterator result, size_t maxSplit = -1, BaseChr delimEscape = 0)
|
||||
|
|
|
|||
230
vsproj/build_chr_model.vcxproj
Normal file
230
vsproj/build_chr_model.vcxproj
Normal file
|
|
@ -0,0 +1,230 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup Label="ProjectConfigurations">
|
||||
<ProjectConfiguration Include="Debug|ARM64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>ARM64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|Win32">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|ARM64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>ARM64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|Win32">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>Win32</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Debug|x64">
|
||||
<Configuration>Debug</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
<ProjectConfiguration Include="Release|x64">
|
||||
<Configuration>Release</Configuration>
|
||||
<Platform>x64</Platform>
|
||||
</ProjectConfiguration>
|
||||
</ItemGroup>
|
||||
<PropertyGroup Label="Globals">
|
||||
<VCProjectVersion>15.0</VCProjectVersion>
|
||||
<ProjectGuid>{C3592484-4AF8-4F2A-ACD0-17C8A4A549C3}</ProjectGuid>
|
||||
<Keyword>Win32Proj</Keyword>
|
||||
<RootNamespace>KiwiRun</RootNamespace>
|
||||
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
|
||||
<ProjectName>build_chr_model</ProjectName>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v143</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v143</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v143</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>true</UseDebugLibraries>
|
||||
<PlatformToolset>v143</PlatformToolset>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v143</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'" Label="Configuration">
|
||||
<ConfigurationType>Application</ConfigurationType>
|
||||
<UseDebugLibraries>false</UseDebugLibraries>
|
||||
<PlatformToolset>v143</PlatformToolset>
|
||||
<WholeProgramOptimization>true</WholeProgramOptimization>
|
||||
<CharacterSet>Unicode</CharacterSet>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||
<ImportGroup Label="ExtensionSettings">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="Shared">
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'" Label="PropertySheets">
|
||||
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
|
||||
</ImportGroup>
|
||||
<PropertyGroup Label="UserMacros" />
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<IncludePath>$(SolutionDir)third_party/mimalloc/include;$(SolutionDir)third_party/tclap/include;$(SolutionDir)include\;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<IncludePath>$(SolutionDir)third_party/mimalloc/include;$(SolutionDir)third_party/tclap/include;$(SolutionDir)include\;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<IncludePath>$(SolutionDir)third_party/mimalloc/include;$(SolutionDir)third_party/tclap/include;$(SolutionDir)include\;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<IncludePath>$(SolutionDir)third_party/mimalloc/include;$(SolutionDir)third_party/tclap/include;$(SolutionDir)include\;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
|
||||
<LinkIncremental>true</LinkIncremental>
|
||||
<IncludePath>$(SolutionDir)third_party/mimalloc/include;$(SolutionDir)third_party/tclap/include;$(SolutionDir)include\;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<LinkIncremental>false</LinkIncremental>
|
||||
<IncludePath>$(SolutionDir)third_party/mimalloc/include;$(SolutionDir)third_party/tclap/include;$(SolutionDir)include\;$(VC_IncludePath);$(WindowsSDK_IncludePath);</IncludePath>
|
||||
</PropertyGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
|
||||
<LanguageStandard>stdcpp17</LanguageStandard>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|ARM64'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
|
||||
<LanguageStandard>stdcpp17</LanguageStandard>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
|
||||
<LanguageStandard>stdcpp17</LanguageStandard>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
|
||||
<ClCompile>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<Optimization>Disabled</Optimization>
|
||||
<PreprocessorDefinitions>_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
|
||||
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
|
||||
<ClCompile>
|
||||
<WarningLevel>Level3</WarningLevel>
|
||||
<PrecompiledHeader>NotUsing</PrecompiledHeader>
|
||||
<Optimization>MaxSpeed</Optimization>
|
||||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
<IntrinsicFunctions>true</IntrinsicFunctions>
|
||||
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
|
||||
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
|
||||
<AdditionalOptions>/utf-8 %(AdditionalOptions)</AdditionalOptions>
|
||||
<LanguageStandard>stdcpp17</LanguageStandard>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<SubSystem>Console</SubSystem>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<OptimizeReferences>true</OptimizeReferences>
|
||||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="kiwi_shared_library.vcxproj">
|
||||
<Project>{f790bc37-2732-4ed1-9ca5-7248bed3588e}</Project>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\tools\chr_model_builder.cpp" />
|
||||
</ItemGroup>
|
||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
|
|
@ -149,6 +149,7 @@
|
|||
<ClCompile Include="..\src\SwTokenizer.cpp" />
|
||||
<ClCompile Include="..\src\TypoTransformer.cpp" />
|
||||
<ClCompile Include="..\src\UnicodeCase.cpp" />
|
||||
<ClCompile Include="..\src\UnkFormScorer.cpp" />
|
||||
<ClCompile Include="..\third_party\mimalloc\src\static.c" />
|
||||
<ClCompile Include="..\src\Form.cpp" />
|
||||
<ClCompile Include="..\src\FeatureTestor.cpp" />
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue