added tokenizer samples

This commit is contained in:
bab2min 2023-05-07 16:02:29 +09:00
commit 73a4b1032c
5 changed files with 160131 additions and 1 deletions

View file

@ -1,4 +1,4 @@
#include "gtest/gtest.h"
#include "gtest/gtest.h"
#include <fstream>
#include <kiwi/Kiwi.h>
#include <kiwi/SwTokenizer.h>
@ -165,6 +165,24 @@ TEST(KiwiSwTokenizer, Builder)
}
}
// Smoke test: loads the 16k tokenizer and pushes a sentence containing a CJK
// ideograph from outside the Basic Multilingual Plane through encode() and
// then decode(). No expectation is checked on the result; the test passes as
// long as both calls complete.
TEST(KiwiSwTokenizer, EncodeError)
{
	SwTokenizer tokenizer;
	{
		// Scoped so the file stream is released as soon as the tokenizer is loaded.
		std::ifstream modelFile("tokenizers/kor.16k.json");
		tokenizer = SwTokenizer::load(reuseKiwiInstance(), modelFile);
	}

	// Inputs to exercise; further samples can be appended to this list.
	const auto samples = {
		u8"또는 “𡆮”으로 새겨야 하는 것",
	};

	for (auto sample : samples)
	{
		auto tokenIds = tokenizer.encode(sample);
		auto roundTrip = tokenizer.decode(tokenIds);
		// NOTE(review): the round-trip comparison is disabled in the original;
		// the reason is not visible from this file.
		//EXPECT_EQ(roundTrip, sample);
	}
}
TEST(KiwiSwTokenizer, BasicEncodeAndDecode)
{
SwTokenizer tokenizer;

16028
tokenizers/kor.16k.json Normal file

File diff suppressed because it is too large Load diff

32028
tokenizers/kor.32k.json Normal file

File diff suppressed because it is too large Load diff

48028
tokenizers/kor.48k.json Normal file

File diff suppressed because it is too large Load diff

64028
tokenizers/kor.64k.json Normal file

File diff suppressed because it is too large Load diff