Commit 5488110b authored by Emmanuel Raviart

Add french stemmer & stop words, unicode compatible ngram tokenizer & slugifier.
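The new pieces are composed into two analyzers in main.rs; condensed, the registrations shown in the diff below amount to:

// "french_autocompletion_tokenizer": unicode-aware 3-grams, then slugified
let french_autocompletion_tokenizer = NgramTokenizer::new(3, 3, false).filter(Slugifier);

// "french_tokenizer": simple tokens, long tokens and stop words dropped, slugified, then stemmed
let french_tokenizer = SimpleTokenizer
    .filter(RemoveLongFilter::limit(40))
    .filter(StopWordFilter::remove(FRENCH_STOP_WORDS.iter().map(slugify).collect()))
    .filter(Slugifier)
    .filter(FrenchStemmer::new());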

parent f0d5357c
@@ -193,11 +193,18 @@ name = "crunchy"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "deunicode"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "dfih-search"
version = "0.1.0"
dependencies = [
"json 0.11.13 (registry+https://github.com/rust-lang/crates.io-index)",
"rust-stemmers 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"slug 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"tantivy 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -660,6 +667,14 @@ dependencies = [
"serde 1.0.79 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "slug"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"deunicode 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "smallvec"
version = "0.6.5"
@@ -878,6 +893,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum crossbeam-epoch 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9c90f1474584f38e270b5b613e898c8c328aa4f3dea85e0a27ac2e642f009416"
"checksum crossbeam-utils 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "677d453a17e8bd2b913fa38e8b9cf04bcdbb5be790aa294f2389661d72036015"
"checksum crunchy 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "a2f4a431c5c9f662e1200b7c7f02c34e91361150e382089a8f2dec3ba680cbda"
"checksum deunicode 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "850878694b7933ca4c9569d30a34b55031b9b139ee1fc7b94a527c4ef960d690"
"checksum downcast 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6c6fe31318b6ef21166c8e839e680238eb16f875849d597544eead7ec882eed3"
"checksum either 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3be565ca5c557d7f59e7cfcf1844f9e3033650c929c6566f511e8005f205c1d0"
"checksum fail 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd2e1a22c616c8c8c96b6e07c243014551f3ba77291d24c22e0bfea6830c0b4e"
@@ -938,6 +954,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum serde 1.0.79 (registry+https://github.com/rust-lang/crates.io-index)" = "84257ccd054dc351472528c8587b4de2dbf0dc0fe2e634030c1a90bfdacebaa9"
"checksum serde_derive 1.0.79 (registry+https://github.com/rust-lang/crates.io-index)" = "31569d901045afbff7a9479f793177fe9259819aff10ab4f89ef69bbc5f567fe"
"checksum serde_json 1.0.32 (registry+https://github.com/rust-lang/crates.io-index)" = "43344e7ce05d0d8280c5940cabb4964bea626aa58b1ec0e8c73fa2a8512a38ce"
"checksum slug 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b3bc762e6a4b6c6fcaade73e77f9ebc6991b676f88bb2358bddb56560f073373"
"checksum smallvec 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "153ffa32fd170e9944f7e0838edf824a754ec4c1fc64746fcc9fe1f8fa602e5d"
"checksum snap 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "95d697d63d44ad8b78b8d235bf85b34022a78af292c8918527c5f0cffdde7f43"
"checksum stable_deref_trait 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dba1a27d3efae4351c8051072d619e3ade2820635c3958d826bfea39d59b54c8"
@@ -5,4 +5,6 @@ authors = ["Emmanuel Raviart <emmanuel@raviart.com>"]
[dependencies]
json = "0.11"
rust-stemmers = "1"
slug = "0.1"
tantivy = "0.7"
// #![cfg_attr(feature = "cargo-clippy", allow(clippy::new_without_default))]
// Module adapted from https://github.com/tantivy-search/tantivy/blob/master/src/tokenizer/stemmer.rs
use rust_stemmers::{self, Algorithm};
use std::sync::Arc;
use tantivy::tokenizer::{Token, TokenFilter, TokenStream};
/// `FrenchStemmer` token filter.
/// Tokens are expected to be slugified beforehand (see the test sketch at the end of this module).
#[derive(Clone)]
pub struct FrenchStemmer {
stemmer_algorithm: Arc<Algorithm>,
}
impl FrenchStemmer {
/// Creates a new FrenchStemmer `TokenFilter`.
pub fn new() -> FrenchStemmer {
FrenchStemmer {
stemmer_algorithm: Arc::new(Algorithm::French),
}
}
}
impl<TailTokenStream> TokenFilter<TailTokenStream> for FrenchStemmer
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = FrenchStemmerTokenStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::French);
FrenchStemmerTokenStream::wrap(inner_stemmer, token_stream)
}
}
pub struct FrenchStemmerTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
tail: TailTokenStream,
stemmer: rust_stemmers::Stemmer,
}
impl<TailTokenStream> TokenStream for FrenchStemmerTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
fn advance(&mut self) -> bool {
if self.tail.advance() {
// TODO remove allocation.
let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned();
self.token_mut().text.clear();
self.token_mut().text.push_str(&stemmed_str);
true
} else {
false
}
}
}
impl<TailTokenStream> FrenchStemmerTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
fn wrap(
stemmer: rust_stemmers::Stemmer,
tail: TailTokenStream,
) -> FrenchStemmerTokenStream<TailTokenStream> {
FrenchStemmerTokenStream { tail, stemmer }
}
}
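// Test sketch (added for illustration, not part of the original module); it assumes the
// Snowball French algorithm reduces "maisons" to "maison". In the indexing pipeline the
// stemmer runs after the Slugifier, as registered in main.rs.
#[cfg(test)]
mod tests {
    use super::FrenchStemmer;
    use tantivy::tokenizer::{SimpleTokenizer, TokenStream, Tokenizer};

    #[test]
    fn test_french_stemmer() {
        let mut stream = SimpleTokenizer
            .filter(FrenchStemmer::new())
            .token_stream("maisons");
        assert!(stream.advance());
        assert_eq!(stream.token().text, "maison");
        assert!(!stream.advance());
    }
}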
// French stop words, taken from http://snowball.tartarus.org/algorithms/french/stop.txt (a test sketch follows the list).
pub const FRENCH_STOP_WORDS: [&str; 164] = [
"au", // a + le
"aux", // a + les
"avec", // with
"ce", // this
"ces", // these
"dans", // with
"de", // of
"des", // de + les
"du", // de + le
"elle", // she
"en", // `of them' etc
"et", // and
"eux", // them
"il", // he
"je", // I
"la", // the
"le", // the
"leur", // their
"lui", // him
"ma", // my (fem)
"mais", // but
"me", // me
"même", // same; as in moi-même (myself) etc
"mes", // me (pl)
"moi", // me
"mon", // my (masc)
"ne", // not
"nos", // our (pl)
"notre", // our
"nous", // we
"on", // one
"ou", // where
"par", // by
"pas", // not
"pour", // for
"qu", // que before vowel
"que", // that
"qui", // who
"sa", // his, her (fem)
"se", // oneself
"ses", // his (pl)
"son", // his, her (masc)
"sur", // on
"ta", // thy (fem)
"te", // thee
"tes", // thy (pl)
"toi", // thee
"ton", // thy (masc)
"tu", // thou
"un", // a
"une", // a
"vos", // your (pl)
"votre", // your
"vous", // you
// single letter forms
"c", // c'
"d", // d'
"j", // j'
"l", // l'
"à", // to, at
"m", // m'
"n", // n'
"s", // s'
"t", // t'
"y", // there
// forms of être (not including the infinitive):
"été", "étée", "étées", "étés", "étant", "suis", "es", "est", "sommes", "êtes",
"sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions",
"seriez", "seraient", "étais", "était", "étions", "étiez", "étaient", "fus", "fut",
"fûmes", "fûtes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses",
"fût", "fussions", "fussiez", "fussent",
// forms of avoir (not including the infinitive):
"ayant", "eu", "eue", "eues", "eus", "ai", "as", "avons", "avez", "ont", "aurai", "auras",
"aura", "aurons", "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient",
"avais", "avait", "avions", "aviez", "avaient", "eut", "eûmes", "eûtes", "eurent", "aie",
"aies", "ait", "ayons", "ayez", "aient", "eusse", "eusses", "eût", "eussions", "eussiez",
"eussent", // Later additions (from Jean-Christophe Deschamps)
"ceci", // this
"cela", // that (added 11 Apr 2012. Omission reported by Adrien Grand)
"celà", // that (incorrect, though common)
"cet", // this
"cette", // this
"ici", // here
"ils", // they
"les", // the (pl)
"leurs", // their (pl)
"quel", // which
"quels", // which
"quelle", // which
"quelles", // which
"sans", // without
"soi", // oneself
];
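// Usage sketch (added for illustration, not part of the original file): tantivy's
// `StopWordFilter::remove` takes a `Vec<String>`, so the `&str` array is converted before
// use; main.rs below additionally maps each word through `slug::slugify` first.
#[cfg(test)]
mod tests {
    use super::FRENCH_STOP_WORDS;
    use tantivy::tokenizer::{SimpleTokenizer, StopWordFilter, TokenStream, Tokenizer};

    #[test]
    fn test_stop_words_are_removed() {
        let stop_words: Vec<String> = FRENCH_STOP_WORDS.iter().map(|w| w.to_string()).collect();
        let mut stream = SimpleTokenizer
            .filter(StopWordFilter::remove(stop_words))
            .token_stream("la maison de pierre");
        let mut tokens = vec![];
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        assert_eq!(tokens, vec!["maison", "pierre"]);
    }
}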
// #[macro_use]
extern crate json;
extern crate rust_stemmers;
extern crate slug;
extern crate tantivy;
mod french_stemmer;
mod french_stop_words;
mod ngram_tokenizer;
mod slugifier;
use french_stemmer::FrenchStemmer;
use french_stop_words::FRENCH_STOP_WORDS;
use ngram_tokenizer::NgramTokenizer;
use slug::slugify;
use slugifier::Slugifier;
use std::fs::{self, File};
use std::io::prelude::*;
use std::path::Path;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::tokenizer::{RemoveLongFilter, SimpleTokenizer, StopWordFilter, Tokenizer};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
@@ -21,7 +30,8 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("french_tokenizer")
.set_tokenizer("french_autocompletion_tokenizer")
// .set_tokenizer("french_tokenizer")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
@@ -31,15 +41,22 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_dir(&index_dir, schema.clone())?;
let tokenizer = SimpleTokenizer
.filter(LowerCaser)
let french_autocompletion_tokenizer = NgramTokenizer::new(3, 3, false).filter(Slugifier);
index.tokenizers().register(
"french_autocompletion_tokenizer",
french_autocompletion_tokenizer,
);
let french_tokenizer = SimpleTokenizer
.filter(RemoveLongFilter::limit(40))
.filter(StopWordFilter::remove(
FRENCH_STOP_WORDS
.iter()
.map(|word| word.to_string())
.collect(),
));
index.tokenizers().register("french_tokenizer", tokenizer);
FRENCH_STOP_WORDS.iter().map(slugify).collect(),
))
.filter(Slugifier)
.filter(FrenchStemmer::new());
index
.tokenizers()
.register("french_tokenizer", french_tokenizer);
let mut index_writer = index.writer(50_000_000)?;
let issuers_dir = data_dir.join("issuers");
@@ -50,7 +67,7 @@ fn main() -> tantivy::Result<()> {
let mut json_string = String::new();
json_file
.read_to_string(&mut json_string)
.expect("Something went wrong reading the file");
.expect("An error occurred while reading file");
let issuer = json::parse(&json_string).expect("Invalid JSON");
let mut issuer_doc = Document::default();
for name in issuer["names"].members() {
@@ -63,7 +80,7 @@ fn main() -> tantivy::Result<()> {
index.load_searchers()?;
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![name_field]);
let query = query_parser.parse_query("forges")?;
let query = query_parser.parse_query("lyonnais")?;
let mut top_collector = TopCollector::with_limit(10);
searcher.search(&*query, &mut top_collector)?;
let doc_addresses = top_collector.docs();
use std::str::Chars;
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
// Module taken from https://github.com/tantivy-search/tantivy/pull/430/
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
/// With this tokenizer, the `position` field of each token holds the character index at
/// which the n-gram starts, rather than a word-level token index.
///
/// Example 1: `hello` would be tokenized as (min_gram: 2, max_gram: 3, prefix_only: false)
///
/// | Term | he | hel | el | ell | ll | llo | lo |
/// |----------|-----|-----|-----|-----|-----|-----|----|
/// | Position | 0 | 0 | 1 | 1 | 2 | 2 | 3 |
/// | Offsets | 0,2 | 0,3 | 1,3 | 1,4 | 2,4 | 2,5 | 3,5|
///
/// Example 2: `hello` would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term | he | hel | hell | hello |
/// |----------|-----|-----|-------|-------|
/// | Position | 0 | 0 | 0 | 0 |
/// | Offsets | 0,2 | 0,3 | 0,4 | 0,5 |
///
/// Example 3: `hεllo` (non-ascii) would be tokenized as (min_gram: 2, max_gram: 5, prefix_only: **true**)
///
/// | Term | hε | hεl | hεll | hεllo |
/// |----------|-----|-----|-------|-------|
/// | Position | 0 | 0 | 0 | 0 |
/// | Offsets | 0,3 | 0,4 | 0,5 | 0,6 |
///
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
/// use tantivy::tokenizer::assert_token;
///
/// # fn main() {
/// let tokenizer = NgramTokenizer::new(2, 3, false);
/// let mut stream = tokenizer.token_stream("hello");
///
/// assert_token(stream.next().unwrap(), 0, "he", 0, 2);
/// assert_token(stream.next().unwrap(), 0, "hel", 0, 3);
/// assert_token(stream.next().unwrap(), 1, "el", 1, 3);
/// assert_token(stream.next().unwrap(), 1, "ell", 1, 4);
/// assert_token(stream.next().unwrap(), 2, "ll", 2, 4);
/// assert_token(stream.next().unwrap(), 2, "llo", 2, 5);
/// assert_token(stream.next().unwrap(), 3, "lo", 3, 5);
/// assert!(stream.next().is_none());
/// # }
/// ```
#[derive(Clone)]
pub struct NgramTokenizer {
/// min size of the n-gram
min_gram: usize,
/// max size of the n-gram
max_gram: usize,
/// if true, will only parse the leading edge of the input
prefix_only: bool,
}
impl NgramTokenizer {
/// Configures a new Ngram tokenizer
pub fn new(min_gram: usize, max_gram: usize, prefix_only: bool) -> NgramTokenizer {
assert!(min_gram > 0, "min_gram must be greater than 0");
assert!(
min_gram <= max_gram,
"min_gram must not be greater than max_gram"
);
NgramTokenizer {
min_gram,
max_gram,
prefix_only,
}
}
}
pub struct NgramTokenStream<'a> {
text: &'a str,
chars: Chars<'a>,
position: usize,
char_count: usize,
token: Token,
min_gram: usize,
max_gram: usize,
gram_size: usize,
prefix_only: bool,
}
impl<'a> Tokenizer<'a> for NgramTokenizer {
type TokenStreamImpl = NgramTokenStream<'a>;
fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl {
let chars = text.chars();
let char_count = text.chars().count();
NgramTokenStream {
text,
chars,
char_count,
position: 0,
token: Token::default(),
min_gram: self.min_gram,
max_gram: self.max_gram,
prefix_only: self.prefix_only,
gram_size: self.min_gram,
}
}
}
impl<'a> NgramTokenStream<'a> {
/// Computes the byte offset and byte length of the next n-gram, cycling the gram size
/// through min_gram..=max_gram before advancing the start position; returns None when
/// processing should stop.
fn chomp(&mut self) -> Option<(usize, usize)> {
// Have we exceeded the bounds of the text we are indexing?
if self.gram_size > self.max_gram {
if self.prefix_only {
return None;
}
// since we aren't just processing edges
// we need to reset the gram size
self.gram_size = self.min_gram;
// and move down the chain of letters
self.position += 1;
}
let result = if (self.position + self.gram_size) <= self.char_count {
// map from logical (character) positions to physical (byte) offsets
let chars = self.chars.clone();
let raw_position = chars.take(self.position).map(|c| c.len_utf8()).sum();
let chars = self.chars.clone();
let raw_size = chars
.skip(self.position)
.take(self.gram_size)
.map(|c| c.len_utf8())
.sum();
Some((raw_position, raw_size))
} else {
None
};
// increase the gram size for the next pass
self.gram_size += 1;
result
}
}
impl<'a> TokenStream for NgramTokenStream<'a> {
fn advance(&mut self) -> bool {
// clear out working token text
self.token.text.clear();
if let Some((position, size)) = self.chomp() {
self.token.position = self.position;
let offset_from = position;
let offset_to = offset_from + size;
self.token.offset_from = offset_from;
self.token.offset_to = offset_to;
self.token.text.push_str(&self.text[offset_from..offset_to]);
true
} else {
false
}
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
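// Test sketch (added for illustration, not part of the original module); the expected
// grams come from the tables in the doc comment above.
#[cfg(test)]
mod tests {
    use super::NgramTokenizer;
    use tantivy::tokenizer::{TokenStream, Tokenizer};

    fn ngrams(text: &str, min_gram: usize, max_gram: usize, prefix_only: bool) -> Vec<String> {
        let mut stream = NgramTokenizer::new(min_gram, max_gram, prefix_only).token_stream(text);
        let mut tokens = vec![];
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        tokens
    }

    #[test]
    fn test_ngrams() {
        assert_eq!(
            ngrams("hello", 2, 3, false),
            vec!["he", "hel", "el", "ell", "ll", "llo", "lo"]
        );
        assert_eq!(ngrams("hello", 2, 5, true), vec!["he", "hel", "hell", "hello"]);
    }
}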
// Module adapted from https://github.com/tantivy-search/tantivy/blob/master/src/tokenizer/lower_caser.rs
use slug::slugify;
use std::mem;
use tantivy::tokenizer::{Token, TokenFilter, TokenStream};
/// Token filter that replaces terms with their slugs.
#[derive(Clone)]
pub struct Slugifier;
impl<TailTokenStream> TokenFilter<TailTokenStream> for Slugifier
where
TailTokenStream: TokenStream,
{
type ResultTokenStream = SlugifierTokenStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
SlugifierTokenStream::wrap(token_stream)
}
}
pub struct SlugifierTokenStream<TailTokenStream> {
buffer: String,
tail: TailTokenStream,
}
// Writes a slugified version of `text` into `output`; `slugify` joins word segments
// with '-', which is converted back to spaces here.
fn to_slug(text: &mut String, output: &mut String) {
output.clear();
output.push_str(&slugify(text).replace("-", " "));
}
impl<TailTokenStream> TokenStream for SlugifierTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
fn advance(&mut self) -> bool {
if self.tail.advance() {
to_slug(&mut self.tail.token_mut().text, &mut self.buffer);
mem::swap(&mut self.tail.token_mut().text, &mut self.buffer);
true
} else {
false
}
}
}
impl<TailTokenStream> SlugifierTokenStream<TailTokenStream>
where
TailTokenStream: TokenStream,
{
fn wrap(tail: TailTokenStream) -> SlugifierTokenStream<TailTokenStream> {
SlugifierTokenStream {
tail,
buffer: String::with_capacity(100),
}
}
}
#[cfg(test)]
mod tests {
use super::Slugifier;
use tantivy::tokenizer::SimpleTokenizer;
use tantivy::tokenizer::TokenStream;
use tantivy::tokenizer::Tokenizer;
#[test]
fn test_to_slug() {
assert_eq!(
slug_helper("Русский текст"),
vec!["русский".to_string(), "текст".to_string()]
);
}
fn slug_helper(text: &str) -> Vec<String> {
let mut tokens = vec![];
let mut token_stream = SimpleTokenizer.filter(Slugifier).token_stream(text);
while token_stream.advance() {
let token_text = token_stream.token().text.clone();
tokens.push(token_text);
}
tokens
}
#[test]
fn test_slugifier() {
assert_eq!(slug_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(
slug_helper("Русский"),
vec!["русский".to_string()]
);
}
}