mirror of
https://github.com/bab2min/Kiwi.git
synced 2026-06-17 01:54:27 +00:00
added tokenizer samples
This commit is contained in:
parent
e3021519f6
commit
73a4b1032c
5 changed files with 160131 additions and 1 deletions
|
|
@ -1,4 +1,4 @@
|
|||
#include "gtest/gtest.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include <fstream>
|
||||
#include <kiwi/Kiwi.h>
|
||||
#include <kiwi/SwTokenizer.h>
|
||||
|
|
@ -165,6 +165,24 @@ TEST(KiwiSwTokenizer, Builder)
|
|||
}
|
||||
}
|
||||
|
||||
// Exercises SwTokenizer::encode / SwTokenizer::decode on text containing
// U+211AE, a CJK ideograph outside the Basic Multilingual Plane.
// The test has no value assertions on the result; presumably it only guards
// against a crash or exception on such input — confirm against the commit
// history before tightening it.
TEST(KiwiSwTokenizer, EncodeError)
{
	SwTokenizer tokenizer;
	{
		std::ifstream ifs{ "tokenizers/kor.16k.json" };
		// Fail fast with a readable message when the fixture is missing
		// (e.g. the test is run from the wrong working directory) instead of
		// handing an unopened stream to the loader.
		ASSERT_TRUE(ifs.is_open()) << "cannot open tokenizers/kor.16k.json";
		tokenizer = SwTokenizer::load(reuseKiwiInstance(), ifs);
	}

	for (auto c : {
		u8"또는 “𡆮”으로 새겨야 하는 것",
	})
	{
		auto encoded = tokenizer.encode(c);
		auto decoded = tokenizer.decode(encoded);
		// The round-trip check below was disabled in the original test.
		// NOTE(review): presumably decode() does not reproduce this input
		// exactly yet — verify before re-enabling.
		//EXPECT_EQ(decoded, c);
		(void)decoded; // decode() is still called to exercise it; silence the unused-variable warning
	}
}
|
||||
|
||||
TEST(KiwiSwTokenizer, BasicEncodeAndDecode)
|
||||
{
|
||||
SwTokenizer tokenizer;
|
||||
|
|
|
|||
16028
tokenizers/kor.16k.json
Normal file
16028
tokenizers/kor.16k.json
Normal file
File diff suppressed because it is too large
Load diff
32028
tokenizers/kor.32k.json
Normal file
32028
tokenizers/kor.32k.json
Normal file
File diff suppressed because it is too large
Load diff
48028
tokenizers/kor.48k.json
Normal file
48028
tokenizers/kor.48k.json
Normal file
File diff suppressed because it is too large
Load diff
64028
tokenizers/kor.64k.json
Normal file
64028
tokenizers/kor.64k.json
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue