
Tantivy analysis

This is a collection of Tokenizers and TokenFilters for Tantivy that aims to replicate features available in Lucene.

It relies on Google's rust_icu bindings to the ICU library.

The word-break rules come from Lucene.

Features

By default, all features are included.

Example

```rust
use tantivy::{doc, Index, ReloadPolicy};
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions};
use tantivy::tokenizer::TextAnalyzer;
use tantivy_analysis_contrib::icu::{Direction, ICUTokenizer, ICUTransformTokenFilter};

const ANALYSIS_NAME: &str = "test";

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer(ANALYSIS_NAME)
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        )
        .set_stored();
    let mut schema = SchemaBuilder::new();
    schema.add_text_field("field", options);
    let schema = schema.build();

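    // Transliterate any script to Latin, decompose (NFD), drop non-spacing marks, lowercase, then recompose (NFC).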
    let transform = ICUTransformTokenFilter {
        compound_id: "Any-Latin; NFD; [:Nonspacing Mark:] Remove; Lower;  NFC".to_string(),
        rules: None,
        direction: Direction::Forward,
    };
    let icu_analyzer = TextAnalyzer::from(ICUTokenizer).filter(transform);

    let field = schema.get_field("field").expect("Can't get field.");

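    // Create an in-memory index and register the analyzer under the name used in the schema.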
    let index = Index::create_in_ram(schema);
    index.tokenizers().register(ANALYSIS_NAME, icu_analyzer);

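    // Index one CJK document and one plain Latin document.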
    let mut index_writer = index.writer(3_000_000)?;

    index_writer.add_document(doc!(
        field => "中国"
    ))?;
    index_writer.add_document(doc!(
        field => "Another Document"
    ))?;

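    // Commit so the documents become visible to searchers.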
    index_writer.commit()?;

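    // The reader reloads on commit, so the freshly committed documents are searchable right away.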
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

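    // The query parser analyzes query text with the analyzer registered for the field.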
    let query_parser = QueryParser::for_index(&index, vec![field]);

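    // "zhong" matches the Latin transliteration of 中国 produced at index time.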
let query = query_parser.parse_query("zhong")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut result: Vec<String> = Vec::new();
for (_, doc_address) in top_docs {
    let retrieved_doc = searcher.doc(doc_address)?;
    let values: Vec<&str> = retrieved_doc.get_all(field).map(|v| v.as_text().unwrap()).collect();
    for v in values {
        result.push(v.to_string());
    }
}
let expected: Vec<String> = vec!["中国".to_string()];
assert_eq!(expected, result);

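    // A single character of the original CJK text matches as well.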
    let query = query_parser.parse_query("国")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    let mut result: Vec<String> = Vec::new();
    for (_, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        let values: Vec<&str> = retrieved_doc.get_all(field).map(|v| v.as_text().unwrap()).collect();
        for v in values {
            result.push(v.to_string());
        }
    }
    let expected: Vec<String> = vec!["中国".to_string()];
    assert_eq!(expected, result);
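
    // Plain Latin text is simply lowercased by the transform, so "document" matches "Another Document".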
    let query = query_parser.parse_query("document")?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    let mut result: Vec<String> = Vec::new();
    for (_, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        let values: Vec<&str> = retrieved_doc.get_all(field).map(|v| v.as_text().unwrap()).collect();
        for v in values {
            result.push(v.to_string());
        }
    }
    let expected: Vec<String> = vec!["Another Document".to_string()];
    assert_eq!(expected, result);

    Ok(())
}
```

License

Licensed under either of

* Apache License, Version 2.0
* MIT License

at your option.

Contribution

Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions.