Vaporetto is a fast and lightweight tokenizer based on pointwise prediction. `vaporetto_rules` provides rule-based filters for Vaporetto.
```rust
use std::fs::File;
use std::io::BufReader;

use vaporetto::{CharacterType, Model, Predictor, Sentence};
use vaporetto_rules::{
    SentenceFilter, StringFilter,
    sentence_filters::{ConcatGraphemeClustersFilter, KyteaWsConstFilter},
    string_filters::KyteaFullwidthFilter,
};

let mut f = BufReader::new(File::open("model.bin").unwrap());
let model = Model::read(&mut f).unwrap();
let mut predictor = Predictor::new(model);

let pre_filters: Vec<Box<dyn StringFilter>> = vec![
    Box::new(KyteaFullwidthFilter::new()),
];
let post_filters: Vec<Box<dyn SentenceFilter>> = vec![
    Box::new(ConcatGraphemeClustersFilter::new()),
    Box::new(KyteaWsConstFilter::new(CharacterType::Digit)),
];

let input = "Vaporettoは仲良し家族👨‍👨‍👧‍👦を離れ離れにさせません。"
    .to_string();

let preproc_input = pre_filters.iter().fold(input, |s, filter| filter.filter(s));

let sentence = Sentence::from_raw(preproc_input).unwrap();
let sentence = predictor.predict(sentence);

let postproc_result = post_filters.iter().fold(sentence, |s, filter| filter.filter(s));

assert_eq!(
    "Vaporetto は 仲良 し 家族 👨‍👨‍👧‍👦 を 離れ離れ に さ せ ま せ ん 。",
    postproc_result.to_tokenized_string().unwrap(),
);
```
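Licensed under either of

* Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
* MIT license (http://opensource.org/licenses/MIT)

at your option.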
Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.