#include #include #include #include #include "toolUtils.h" using namespace std; using namespace kiwi; vector splitMultipleInts(const string& s, const char delim = ',') { vector ret; size_t p = 0, e = 0; while (1) { size_t t = s.find(delim, p); if (t == s.npos) { ret.emplace_back(atoi(&s[e])); return ret; } else { ret.emplace_back(atoi(&s[e])); p = t + 1; e = t + 1; } } } int run(const KiwiBuilder::ModelBuildArgs& args, const string& output, bool skipBigram) { try { tutils::Timer timer; if (skipBigram) { cout << "Build SkipBigram model based on KnLM: " << output << endl; KiwiBuilder kb{ output, args }; } else { KiwiBuilder{ args }.saveModel(output); } double tm = timer.getElapsed(); cout << "Total: " << tm << " ms " << endl; return 0; } catch (const exception& e) { cerr << e.what() << endl; return -1; } } using namespace TCLAP; int main(int argc, const char* argv[]) { tutils::setUTF8Output(); CmdLine cmd{ "Kiwi ModelBuilder", ' ', "0.11.0" }; ValueArg morpheme{ "", "morpheme", "morpheme files", true, "", "string" }; SwitchArg compress{ "", "compress", "compress LM" }; SwitchArg quantize{ "", "quantize", "quantize LM" }; SwitchArg tagHistory{ "", "history", "use tag history of LM" }; SwitchArg skipBigram{ "", "skipbigram", "build skipbigram model" }; ValueArg workers{ "w", "workers", "number of workers", false, 1, "int" }; ValueArg morMinCnt{ "", "morpheme_min_cnt", "min count of morpheme", false, 10, "int" }; ValueArg lmOrder{ "", "order", "order of LM", false, 4, "int" }; ValueArg lmMinCnt{ "", "min_cnt", "min count of LM", false, "1", "multiple ints with comma"}; ValueArg lmLastOrderMinCnt{ "", "last_min_cnt", "min count of the last order of LM", false, 2, "int" }; ValueArg output{ "o", "output", "output model path", true, "", "string" }; ValueArg sbgSize{ "", "sbg_size", "sbg size", false, 1000000, "int" }; ValueArg sbgEpochs{ "", "sbg_epochs", "sbg epochs", false, 10, "double" }; ValueArg sbgEvalSetRatio{ "", "sbg_eval_ratio", "", false, 20, "int" }; ValueArg sbgMinCnt{ "", "sbg_min_cnt", "", false, 150, "int" }; ValueArg sbgMinCoCnt{ "", "sbg_min_co_cnt", "", false, 20, "int" }; UnlabeledMultiArg inputs{ "inputs", "input copora", true, "string" }; cmd.add(output); cmd.add(inputs); cmd.add(morpheme); cmd.add(compress); cmd.add(quantize); cmd.add(tagHistory); cmd.add(skipBigram); cmd.add(morMinCnt); cmd.add(lmOrder); cmd.add(lmMinCnt); cmd.add(lmLastOrderMinCnt); cmd.add(workers); cmd.add(sbgSize); cmd.add(sbgEpochs); cmd.add(sbgEvalSetRatio); cmd.add(sbgMinCnt); cmd.add(sbgMinCoCnt); try { cmd.parse(argc, argv); } catch (const ArgException& e) { cerr << "error: " << e.error() << " for arg " << e.argId() << endl; return -1; } KiwiBuilder::ModelBuildArgs args; args.morphemeDef = morpheme; args.corpora = inputs.getValue(); args.compressLm = compress; args.quantizeLm = quantize; args.useLmTagHistory = tagHistory; args.minMorphCnt = morMinCnt; args.lmOrder = lmOrder; args.numWorkers = workers; args.sbgSize = sbgSize; args.sbgEpochs = sbgEpochs; args.sbgEvalSetRatio = sbgEvalSetRatio; args.sbgMinCount = sbgMinCnt; args.sbgMinCoCount = sbgMinCoCnt; auto v = splitMultipleInts(lmMinCnt.getValue()); if (v.empty()) { args.lmMinCnts.resize(1, 1); } else if (v.size() == 1 || v.size() == lmOrder) { args.lmMinCnts = v; } else { cerr << "error: min_cnt size should be 1 or equal to order" << endl; return -1; } return run(args, output, skipBigram); }