# vaporetto_tantivy

Vaporetto is a fast and lightweight tokenizer based on pointwise prediction. `vaporetto_tantivy` is a crate for using Vaporetto with Tantivy.

## Example

```rust
use std::fs::File;
use std::io::{Read, BufReader};

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};
use tantivy::Index;
use vaporetto::Model;
use vaporetto_tantivy::VaporettoTokenizer;

let mut schema_builder = Schema::builder();
let text_field_indexing = TextFieldIndexing::default()
    .set_tokenizer("ja_vaporetto")
    .set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
    .set_indexing_options(text_field_indexing)
    .set_stored();
schema_builder.add_text_field("title", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

// Loads a model with decompression.
let mut f = BufReader::new(File::open("bccwj-suw+unidic.model.zst").unwrap());
let mut decoder = ruzstd::StreamingDecoder::new(&mut f).unwrap();
let mut buff = vec![];
decoder.read_to_end(&mut buff).unwrap();
let model = Model::read(&mut buff.as_slice()).unwrap();

// Creates VaporettoTokenizer with wsconst=DGR.
let tokenizer = VaporettoTokenizer::new(model, "DGR").unwrap();
index
    .tokenizers()
    .register("ja_vaporetto", tokenizer);
```