mirror of
https://github.com/bab2min/Kiwi.git
synced 2026-06-17 01:54:27 +00:00
- Changed kiwi_builder_init_stream() to return kiwi_stream_object_t by value instead of pointer - Eliminates memory management ambiguity and potential leaks - Simpler and safer API following standard C callback patterns - Updated CStreamAdapter to work with value semantics - Added stddef.h include for C compatibility - Updated documentation with corrected examples - Backward incompatible but safer API design Co-authored-by: bab2min <19266222+bab2min@users.noreply.github.com>
9 KiB
9 KiB
StreamProvider API Documentation
Overview
The StreamProvider interface allows KiwiBuilder to read model files from sources other than the traditional filesystem. This enables embedding models in applications, loading from databases, network sources, or any custom data source.
C++ API
Basic Usage
#include <kiwi/Kiwi.h>
// Define a StreamProvider function.
// It is called with the name of the model file being requested and returns a
// stream over that file's contents, or nullptr when the file is not available.
kiwi::KiwiBuilder::StreamProvider myProvider = [](const std::string& filename) -> std::unique_ptr<std::istream> {
    // Return stream for the requested file
    // Return nullptr if file not found
    (void)filename;  // placeholder: a real provider looks the file up by name
    // A lambda with a non-void return type must return on every path;
    // flowing off the end would be undefined behavior.
    return nullptr;
};
// Create KiwiBuilder with StreamProvider
// (numThreads, options and modelType are placeholders for the caller's values.)
kiwi::KiwiBuilder builder(myProvider, numThreads, options, modelType);
auto kiwi = builder.build();
Memory-based Example
#include <sstream>
#include <map>
// In-memory stand-in for the model directory: file name -> file contents.
std::map<std::string, std::string> modelFiles = {
{"config.txt", "model_type=base\nversion=1.0"},
{"basic.mdl", "binary_model_data"},
{"combinedRule.txt", "rule1\nrule2"}
};
// Looks the requested name up in modelFiles and wraps the stored string in an
// istringstream; names that are not in the map yield nullptr.
// NOTE(review): modelFiles is captured by reference, so it presumably has to
// outlive every call to the provider — confirm how long KiwiBuilder keeps it.
auto streamProvider = [&modelFiles](const std::string& filename) -> std::unique_ptr<std::istream> {
auto it = modelFiles.find(filename);
if (it != modelFiles.end()) {
// The istringstream holds its own copy of the string.
return std::make_unique<std::istringstream>(it->second);
}
return nullptr;
};
kiwi::KiwiBuilder builder(streamProvider, 4, kiwi::BuildOption::default_);
C API
Function Signature
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <kiwi/capi.h>
// Read callback: receives the stream's user_data, a destination buffer and the
// requested length. The usage example in this document returns the number of
// bytes it actually copied.
typedef size_t(*kiwi_stream_read_func)(void* user_data, char* buffer, size_t length);
// Seek callback: the usage example in this document returns the new position.
// NOTE(review): `whence` presumably follows the SEEK_SET/SEEK_CUR/SEEK_END
// convention — confirm against capi.h.
typedef long long(*kiwi_stream_seek_func)(void* user_data, long long offset, int whence);
// Close callback: the usage example in this document frees user_data here.
typedef void(*kiwi_stream_close_func)(void* user_data);
// A stream described by three callbacks plus the opaque pointer passed to each of them.
typedef struct {
kiwi_stream_read_func read;
kiwi_stream_seek_func seek;
kiwi_stream_close_func close;
void* user_data;
} kiwi_stream_object_t;
// Creates a builder that obtains each model file through stream_object_factory,
// which is called with a file name and returns the stream object BY VALUE.
// Per the error-handling notes in this document, the result should be checked for NULL.
kiwi_builder_h kiwi_builder_init_stream(
kiwi_stream_object_t (*stream_object_factory)(const char* filename),
int num_threads,
int options
);
Usage Example
// Stream implementation: a read-only, seekable view over a memory buffer.
typedef struct {
    const char* data;   // start of the buffer; memory_close() does not free it
    size_t size;        // total number of bytes available in `data`
    size_t position;    // current read offset, kept within [0, size]
} memory_stream_t;

// Copies up to `length` bytes from the current position into `buffer` and
// advances the position. Returns the number of bytes copied; 0 means end of
// data (or a NULL stream/buffer).
size_t memory_read(void* user_data, char* buffer, size_t length) {
    memory_stream_t* stream = (memory_stream_t*)user_data;
    // Guard first: `size - position` is unsigned and would wrap around if the
    // position were ever past the end.
    if (stream == NULL || buffer == NULL || stream->position >= stream->size) {
        return 0;
    }
    size_t available = stream->size - stream->position;
    size_t to_read = (length < available) ? length : available;
    memcpy(buffer, stream->data + stream->position, to_read);
    stream->position += to_read;
    return to_read;
}

// Moves the read position and returns the new absolute position, or -1 if the
// request is invalid (unknown `whence`, or a target outside [0, size]); the
// position is left unchanged on failure.
// NOTE(review): assumes `whence` uses the stdio SEEK_SET/SEEK_CUR/SEEK_END
// values — confirm against capi.h.
long long memory_seek(void* user_data, long long offset, int whence) {
    memory_stream_t* stream = (memory_stream_t*)user_data;
    if (stream == NULL) {
        return -1;
    }

    long long base;
    switch (whence) {
    case SEEK_SET: base = 0; break;
    case SEEK_CUR: base = (long long)stream->position; break;
    case SEEK_END: base = (long long)stream->size; break;
    default: return -1;
    }

    // Range-check the offset before adding so base + offset cannot overflow.
    if (offset < -base || offset > (long long)stream->size - base) {
        return -1;
    }

    long long new_position = base + offset;
    stream->position = (size_t)new_position;
    return new_position;
}

// Releases the stream state itself (free(NULL) is a no-op); the underlying
// data buffer is left untouched.
void memory_close(void* user_data) {
    free(user_data);
}
// Factory handed to kiwi_builder_init_stream(): builds the stream object for
// `filename` and returns it by value.
kiwi_stream_object_t create_stream(const char* filename) {
// Load file data (implementation specific)
memory_stream_t* mem_stream = load_file_data(filename);
// Zero-initialize so every member starts out NULL before the callbacks are assigned.
kiwi_stream_object_t stream_obj = {0};
stream_obj.read = memory_read;
stream_obj.seek = memory_seek;
stream_obj.close = memory_close;
// NOTE(review): if load_file_data() can return NULL for a missing file, the
// callbacks would be handed a NULL user_data — confirm how a "file not found"
// result is supposed to be reported to the library.
stream_obj.user_data = mem_stream;
return stream_obj;
}
// Create KiwiBuilder
kiwi_builder_h builder = kiwi_builder_init_stream(
create_stream,
4, // num_threads
KIWI_BUILD_DEFAULT
);
// Per the error-handling notes in this document, a NULL handle signals failure.
if (builder) {
kiwi_h kiwi = kiwi_builder_build(builder, NULL, 0);
// Use kiwi...
// NOTE(review): kiwi is used and closed without a NULL check — confirm
// whether kiwi_builder_build() can fail here.
kiwi_close(kiwi);
kiwi_builder_close(builder);
}
Java API
Interface Definition
/**
 * Supplies the contents of a model file by name.
 */
@FunctionalInterface
public interface StreamProvider {
/**
 * @param filename name of the requested model file
 * @return a stream over the file's contents; per the error-handling notes in
 *         this document, null is returned for a missing file
 */
InputStream provide(String filename);
}
Usage Example
import kr.pe.bab2min.KiwiBuilder;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.Map;
// Create model file map
// NOTE(review): getBytes() with no argument encodes using the platform default
// charset — confirm that is what these text files should use.
Map<String, byte[]> modelFiles = Map.of(
"config.txt", "model_type=base\nversion=1.0".getBytes(),
"basic.mdl", getBinaryModelData(),
"combinedRule.txt", "rule1\nrule2".getBytes()
);
// StreamProvider implementation
// Wraps the stored bytes in a ByteArrayInputStream; names missing from the map yield null.
KiwiBuilder.StreamProvider provider = filename -> {
byte[] data = modelFiles.get(filename);
return data != null ? new ByteArrayInputStream(data) : null;
};
// Create KiwiBuilder
KiwiBuilder builder = new KiwiBuilder(
provider,
4, // numWorkers
KiwiBuilder.BuildOption.default_,
KiwiBuilder.ModelType.none
);
Kiwi kiwi = builder.build();
Lambda Expression
// Concise lambda syntax
// The provider is passed inline instead of through a named variable.
// getModelStream() is not defined in this document; presumably it is
// application code that returns an InputStream (or null) for the given name.
KiwiBuilder builder = new KiwiBuilder(
filename -> getModelStream(filename),
4,
KiwiBuilder.BuildOption.default_
);
WASM/JavaScript API
Function Definition
// Define global StreamProvider function
// Called with the name of the requested model file.
function myStreamProviderFunction(filename) {
// Return Uint8Array, ArrayBuffer, or null
if (filename === 'config.txt') {
// TextEncoder.encode() yields a Uint8Array of UTF-8 bytes.
return new TextEncoder().encode('model_type=base\nversion=1.0');
}
return null; // File not found
}
// Make globally available
// NOTE(review): `window` only exists on a browser main thread — confirm how the
// callback is looked up when this runs in a worker or in Node.
window.myStreamProviderFunction = myStreamProviderFunction;
Usage Example
// Build request with StreamProvider
// streamProviderCallback carries the NAME of the global function as a string,
// not the function object itself.
const buildRequest = {
method: 'buildWithStreamProvider',
args: [{
streamProviderCallback: 'myStreamProviderFunction',
modelType: 'none',
integrateAllomorph: true,
loadDefaultDict: false,
userWords: [
{ word: '테스트', tag: 'NNG', score: 0.0 }
]
}]
};
// Create KiwiBuilder
// The request is sent as a JSON string, and the value that comes back is
// itself parsed as JSON on the next line.
const kiwiId = kiwi.api(JSON.stringify(buildRequest));
const result = JSON.parse(kiwiId);
Advanced Example with Fetch
// Cache of pre-fetched model files: file name -> ArrayBuffer.
// A Map is used so that names such as "constructor" cannot collide with
// properties inherited by a plain object.
const modelCache = new Map();

// Synchronous StreamProvider: serves a file from modelCache, or null if it has
// not been pre-fetched.
function networkStreamProvider(filename) {
    // Note: This is pseudo-code as WASM StreamProvider must be synchronous
    // In practice, you'd pre-fetch and cache the data
    const cachedData = modelCache.get(filename);
    if (cachedData) {
        return new Uint8Array(cachedData);
    }
    return null;
}

// Pre-cache model files
// Must complete before the builder is created, since the provider itself
// cannot wait on the network. Rejects if any file cannot be downloaded.
async function loadModels() {
    const files = ['config.txt', 'basic.mdl', 'combinedRule.txt'];
    for (const file of files) {
        const response = await fetch(`/models/${file}`);
        // fetch() resolves even for 4xx/5xx responses; without this check an
        // error page would be cached and later parsed as model data.
        if (!response.ok) {
            throw new Error(`Failed to fetch model file ${file}: HTTP ${response.status}`);
        }
        modelCache.set(file, await response.arrayBuffer());
    }
}
Use Cases
1. Embedded Applications
Package model files as binary resources in your application:
// Embed model files as byte arrays
// (defined elsewhere, e.g. by a resource-embedding step of the build)
extern const char config_txt[];
extern const size_t config_txt_len;
// Serves "config.txt" from the embedded array; every other name yields nullptr.
auto provider = [](const std::string& filename) -> std::unique_ptr<std::istream> {
if (filename == "config.txt") {
return std::make_unique<std::istringstream>(
// The (pointer, length) constructor copies exactly config_txt_len bytes,
// so embedded NUL bytes are preserved.
std::string(config_txt, config_txt_len)
);
}
return nullptr;
};
2. Network Loading
Load models from a CDN or server:
// Opens each requested file as a stream from the CDN base URL.
// NOTE(review): filename is appended to the URL without any encoding — confirm
// that model file names never contain characters that need escaping.
KiwiBuilder.StreamProvider networkProvider = filename -> {
try {
URL url = new URL("https://cdn.example.com/models/" + filename);
return url.openStream();
} catch (IOException e) {
// Any I/O failure is reported the same way as a missing file, and the
// exception details are discarded.
return null;
}
};
3. Database Storage
Store models in a database:
// Fetches the file contents from `database` and serves them from memory.
// NOTE(review): `database` is captured by reference, so it presumably has to
// outlive every call to the provider — confirm against how the builder uses it.
auto dbProvider = [&database](const std::string& filename) -> std::unique_ptr<std::istream> {
auto data = database.getModelFile(filename);
// An empty result is treated as "file not found".
if (!data.empty()) {
return std::make_unique<std::istringstream>(data);
}
return nullptr;
};
4. Encrypted/Compressed Models
Decrypt or decompress models on-the-fly:
// NOTE(review): this callback shape (filename, buffer, user_data) -> int does
// not match the kiwi_stream_object_t factory that kiwi_builder_init_stream() is
// declared to take in the C API section of this document; it looks like a
// leftover from an earlier API — confirm and update.
// With a NULL buffer the function returns get_encrypted_file_size(filename);
// otherwise it returns the result of decrypting into buffer.
int encrypted_provider(const char* filename, char* buffer, void* user_data) {
if (buffer == NULL) {
return get_encrypted_file_size(filename);
}
// Decrypt data into buffer
return decrypt_file(filename, buffer);
}
Error Handling
C++
- Return `nullptr` from StreamProvider for missing files
- Exceptions thrown by StreamProvider are caught and handled
C API
- Return `-1` from callback function to indicate error
- Check return value of `kiwi_builder_init_stream` for `NULL`
Java
- Return `null` from `StreamProvider.provide()` for missing files
- IOException and other exceptions are handled gracefully
WASM
- Return `null` or `undefined` from JavaScript function for missing files
- JavaScript exceptions are caught and handled
Performance Considerations
- Caching: Cache frequently accessed files in memory
- Lazy Loading: Only load files when requested
- Compression: Consider compressing model files and decompressing in StreamProvider
- Threading: StreamProvider may be called from multiple threads in C++
Limitations
- WordDetector: When using StreamProvider, `extractWords()` and `extractAddWords()` methods are not available
- Synchronous: All StreamProvider implementations must be synchronous
- WASM: JavaScript callback function must be globally accessible
- Memory: Large models loaded into memory may increase RAM usage
Migration Guide
From Filesystem to StreamProvider
Before:
KiwiBuilder builder("/path/to/models", numThreads, options);
After:
// Filesystem-backed provider: opens the requested file under the model
// directory in binary mode. Returns nullptr when the file cannot be opened, so
// a missing file is reported the same way as in every other provider.
auto provider = [](const std::string& filename) -> std::unique_ptr<std::istream> {
    auto file = std::make_unique<std::ifstream>("/path/to/models/" + filename, std::ios::binary);
    // Constructing an ifstream never fails by itself; a failed open has to be
    // detected explicitly, otherwise the caller receives an unusable stream.
    if (!file->is_open()) {
        return nullptr;
    }
    return file;
};
KiwiBuilder builder(provider, numThreads, options);
This allows gradual migration and testing of the StreamProvider interface while maintaining the same functionality.