Commit 8f20268a authored by David Smadja

Update dependencies, adapt code, drop autocomplete

parent 3232a374
[package]
name = "dfih-search"
version = "0.1.0"
authors = ["Emmanuel Raviart <emmanuel@raviart.com>"]
authors = ["Emmanuel Raviart <emmanuel@raviart.com>", "David Smadja <david.smadja@altatech.fr>"]
[dependencies]
clap = "2"
json = "0.11"
json = "0.12"
rust-stemmers = "1"
serde = "1"
serde_derive = "1"
serde_json = "1"
serde_urlencoded = "0.6"
slug = "0.1"
tantivy = "0.7"
tower-web = "0.3"
tantivy = "0.10"
tower-web = "0.3"
\ No newline at end of file
@@ -15,7 +15,6 @@ fn run_index(data_dir: &Path) -> tantivy::Result<()> {
let id_field = schema.get_field("id").unwrap();
let main_name_field = schema.get_field("main_name").unwrap();
let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
let name_field = schema.get_field("name").unwrap();
let prices_count_field = schema.get_field("prices_count").unwrap();
let public_status_field = schema.get_field("public_status").unwrap();
@@ -32,7 +31,6 @@ fn run_index(data_dir: &Path) -> tantivy::Result<()> {
issuer_doc.add_u64(id_field, issuer["id"].as_u64().unwrap());
issuer_doc.add_text(main_name_field, issuer["name"].as_str().unwrap());
for name in issuer["names"].members() {
issuer_doc.add_text(name_autocomplete_field, &name.to_string());
issuer_doc.add_text(name_field, &name.to_string());
}
let prices_count: u64 = issuer["securities"]
@@ -50,6 +48,7 @@ fn run_index(data_dir: &Path) -> tantivy::Result<()> {
index_writer.add_document(issuer_doc);
}
index_writer.commit()?;
}
@@ -61,7 +60,6 @@ fn run_index(data_dir: &Path) -> tantivy::Result<()> {
let id_field = schema.get_field("id").unwrap();
let main_name_field = schema.get_field("main_name").unwrap();
let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
let name_field = schema.get_field("name").unwrap();
let functions_count_field = schema.get_field("functions_count").unwrap();
@@ -77,7 +75,6 @@ fn run_index(data_dir: &Path) -> tantivy::Result<()> {
person_doc.add_u64(id_field, person["id"].as_u64().unwrap());
person_doc.add_text(main_name_field, person["name"].as_str().unwrap());
for name in person["names"].members() {
person_doc.add_text(name_autocomplete_field, &name.to_string());
person_doc.add_text(name_field, &name.to_string());
}
person_doc.add_u64(functions_count_field, person["functions"].len() as u64);
......
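Note: both indexing loops above now write only the plain `name` field; the `name_autocomplete` copy is gone. Below is a minimal, hypothetical sketch (not part of the commit) of the resulting issuer-document construction, assuming tantivy 0.10 and the json 0.12 crate already used in this file.

```rust
use tantivy::schema::Schema;
use tantivy::Document;

// Hypothetical helper, not in the commit: builds one issuer document the way
// the indexing loop above now does, without the dropped autocomplete field.
fn build_issuer_doc(schema: &Schema, issuer: &json::JsonValue) -> Document {
    let id_field = schema.get_field("id").unwrap();
    let main_name_field = schema.get_field("main_name").unwrap();
    let name_field = schema.get_field("name").unwrap();

    let mut issuer_doc = Document::default();
    issuer_doc.add_u64(id_field, issuer["id"].as_u64().unwrap());
    issuer_doc.add_text(main_name_field, issuer["name"].as_str().unwrap());
    // Every alternate name now goes only into the regular "name" field.
    for name in issuer["names"].members() {
        issuer_doc.add_text(name_field, &name.to_string());
    }
    issuer_doc
}
```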
@@ -4,22 +4,16 @@ use persons;
use std::convert::From;
use std::net::SocketAddr;
use std::path::Path;
use tantivy::collector::{self, CountCollector, TopFieldCollector, TopScoreCollector};
use tantivy::query::{AllQuery, BooleanQuery, Occur, Query, QueryParser, RangeQuery};
use tantivy::schema::{NamedFieldDocument, Schema};
use tantivy;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::{BooleanQuery, Occur, QueryParser, RangeQuery};
use tantivy::schema::*;
use tantivy::DocAddress;
use tantivy::Document;
use tantivy::Index;
use tantivy::{doc, Index, ReloadPolicy};
use tower_web::middleware::cors::{AllowedOrigins, CorsBuilder};
use tower_web::{Response, ServiceBuilder};
#[derive(Response)]
struct AutocompleteResponse {
q: String,
count: usize,
hits: Vec<Hit>,
}
#[derive(Serialize)]
struct Hit {
doc: NamedFieldDocument,
@@ -27,7 +21,6 @@ struct Hit {
}
struct IssuersResource {
autocomplete_query_parser: QueryParser,
index: Index,
list_query_parser: QueryParser,
schema: Schema,
@@ -55,7 +48,6 @@ struct IssuersSearchQuery {
}
struct PersonsResource {
autocomplete_query_parser: QueryParser,
index: Index,
list_query_parser: QueryParser,
schema: Schema,
@@ -95,127 +87,86 @@ fn public_status_default() -> usize {
2
}
impl_web!{
impl_web! {
impl IssuersResource {
#[get("/issuers")]
#[content_type("application/json")]
fn list(&self, query_string: IssuersSearchQuery) -> Result<ListResponse, ()> {
let reader = self.index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
let searcher = reader.searcher();
let schema = self.index.schema();
let prices_count_field = schema.get_field("prices_count").unwrap();
let public_status = query_string.public_status;
let q = query_string.q.replace("'", " ");
let q = q.replace("\"", " ");
let q = q.replace(":" , " ");
let q = q.replace("-" , " ");
let q = q.replace("+" , " ");
let q = q.trim();
let q = if q.is_empty() {
"*"
} else {
q
};
let searcher = self.index.searcher();
let mut count_collector = CountCollector::default();
let hits: Vec<Hit> = if q.is_empty() {
let mut top_collector = TopFieldCollector::<u64>::with_limit(self.schema.get_field("prices_count").unwrap(), query_string.offset + query_string.limit);
{
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
if public_status >= 2 {
let query = Box::new(AllQuery);
query.search(&searcher, &mut chained_collector).unwrap();
} else {
let query = RangeQuery::new_u64(self.schema.get_field(
"public_status").unwrap(),
public_status as u64..public_status as u64 + 1,
);
query.search(&searcher, &mut chained_collector).unwrap();
}
}
top_collector
.docs()
.iter()
.skip(query_string.offset)
.map(|doc_address| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
let term_query = self.list_query_parser
.parse_query(&q)
.expect("Parsing the query failed");
let range = if public_status < 2 {
public_status as u64..public_status as u64 + 1
} else {
let mut top_collector = TopScoreCollector::with_limit(query_string.offset + query_string.limit);
{
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
let term_query = self.list_query_parser
.parse_query(q)
.expect("Parsing the query failed");
if public_status >= 2 {
term_query.search(&searcher, &mut chained_collector).unwrap();
} else {
let public_status_query = RangeQuery::new_u64(self.schema.get_field(
0..2
};
let public_status_query = RangeQuery::new_u64(self.schema.get_field(
"public_status").unwrap(),
public_status as u64..public_status as u64 + 1,
range,
);
let query = BooleanQuery::from(vec![
let query = BooleanQuery::from(vec![
(Occur::Must, term_query),
(Occur::Must, Box::new(public_status_query)),
]);
query.search(&searcher, &mut chained_collector).unwrap();
}
}
top_collector
.docs()
.iter()
.skip(query_string.offset)
.map(|doc_address| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
};
Ok(ListResponse {
q: query_string.q.clone(),
count: count_collector.count(),
hits,
})
}
#[get("/issuers/autocomplete")]
#[content_type("application/json")]
fn autocomplete(&self, query_string: SearchQuery) -> Result<AutocompleteResponse, ()> {
let q = query_string.q.replace("'", " ");
let q = q.replace("\"", " ");
let q = q.replace(":" , " ");
let q = q.replace("-" , " ");
let q = q.replace("+" , " ");
let q = q.trim();
let query = if q.is_empty() {
Box::new(AllQuery)
} else {
self.autocomplete_query_parser
.parse_query(q)
.expect("Parsing the query failed")
};
let searcher = self.index.searcher();
let mut count_collector = CountCollector::default();
let mut top_collector = TopScoreCollector::with_limit(query_string.offset + query_string.limit);
{
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
query.search(&searcher, &mut chained_collector).unwrap();
let (top_docs, doc_count) = if q == "*" {
let (top_docs_u64, doc_count) = searcher.search
(
&query,
&(TopDocs
::with_limit(query_string.offset + query_string.limit)
.order_by_u64_field(prices_count_field), Count)
)
.unwrap();
(
top_docs_u64
.iter()
.map(|(score, doc_addr)| (*score as f32, *doc_addr))
.collect(),
doc_count
)
}
else {searcher.search(&query, &(TopDocs::with_limit(query_string.offset + query_string.limit), Count)).unwrap()};
let hits: Vec<Hit> = {
top_collector
.docs()
.iter()
.skip(query_string.offset)
.map(|doc_address| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
top_docs
.iter()
.skip(query_string.offset)
.map(|(_score, doc_address)| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
};
Ok(AutocompleteResponse {
Ok(ListResponse {
q: query_string.q.clone(),
count: count_collector.count(),
count: doc_count,
hits,
})
}
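Note: the hunk above is the core of the tantivy 0.7 → 0.10 migration: `index.searcher()` plus the chained `CountCollector`/`TopScoreCollector` is replaced by an `IndexReader` and a single `search()` call that takes a tuple of collectors. The following is an illustrative sketch of that pattern only; the field name is an example and errors are unwrapped the same way the real code does.

```rust
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::{Document, Index, ReloadPolicy};

// Sketch of the tantivy 0.10 search flow used above (not the commit's exact code).
fn search_names(index: &Index, q: &str, offset: usize, limit: usize) -> (Vec<Document>, usize) {
    // 0.10: a reusable IndexReader replaces the old index.searcher() call.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()
        .unwrap();
    let searcher = reader.searcher();

    let name_field = index.schema().get_field("name").unwrap();
    let query = QueryParser::for_index(index, vec![name_field])
        .parse_query(q)
        .expect("Parsing the query failed");

    // 0.10: one search() call with a (TopDocs, Count) tuple returns the ranked
    // hits and the total match count together, replacing the chained collectors.
    let (top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(offset + limit), Count))
        .unwrap();

    // Pagination keeps the old idiom: collect offset + limit hits, then skip offset.
    let docs: Vec<Document> = top_docs
        .iter()
        .skip(offset)
        .map(|(_score, doc_address)| searcher.doc(*doc_address).unwrap())
        .collect();
    (docs, count)
}
```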
@@ -225,101 +176,64 @@ impl_web!{
#[get("/persons")]
#[content_type("application/json")]
fn list(&self, query_string: SearchQuery) -> Result<ListResponse, ()> {
let q = query_string.q.replace("'", " ");
let q = q.replace("\"", " ");
let q = q.replace(":" , " ");
let q = q.replace("-" , " ");
let q = q.replace("+" , " ");
let q = q.trim();
let reader = self.index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
let searcher = self.index.searcher();
let mut count_collector = CountCollector::default();
let hits: Vec<Hit> = if q.is_empty() {
let mut top_collector = TopFieldCollector::<u64>::with_limit(self.schema.get_field("functions_count").unwrap(), query_string.offset + query_string.limit);
{
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
let query = Box::new(AllQuery);
query.search(&searcher, &mut chained_collector).unwrap();
}
top_collector
.docs()
.iter()
.skip(query_string.offset)
.map(|doc_address| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
} else {
let mut top_collector = TopScoreCollector::with_limit(query_string.offset + query_string.limit);
{
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
let query = self.list_query_parser
.parse_query(q)
.expect("Parsing the query failed");
query.search(&searcher, &mut chained_collector).unwrap();
}
top_collector
.docs()
.iter()
.skip(query_string.offset)
.map(|doc_address| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
};
Ok(ListResponse {
q: query_string.q.clone(),
count: count_collector.count(),
hits,
})
}
let searcher = reader.searcher();
let schema = self.index.schema();
let functions_count_field = schema.get_field("functions_count").unwrap();
#[get("/persons/autocomplete")]
#[content_type("application/json")]
fn autocomplete(&self, query_string: SearchQuery) -> Result<AutocompleteResponse, ()> {
let q = query_string.q.replace("'", " ");
let q = q.replace("\"", " ");
let q = q.replace(":" , " ");
let q = q.replace("-" , " ");
let q = q.replace("+" , " ");
let q = q.trim();
let query = if q.is_empty() {
Box::new(AllQuery)
let q = if q.is_empty() {
"*"
} else {
self.autocomplete_query_parser
.parse_query(q)
.expect("Parsing the query failed")
q
};
let searcher = self.index.searcher();
let mut count_collector = CountCollector::default();
let mut top_collector = TopScoreCollector::with_limit(query_string.offset + query_string.limit);
{
let mut chained_collector = collector::chain()
.push(&mut top_collector)
.push(&mut count_collector);
query.search(&searcher, &mut chained_collector).unwrap();
let query = self.list_query_parser
.parse_query(&q)
.expect("Parsing the query failed");
let (top_docs, doc_count) = if q == "*" {
let (top_docs_u64, doc_count) = searcher.search
(
&query,
&(TopDocs
::with_limit(query_string.offset + query_string.limit)
.order_by_u64_field(functions_count_field), Count)
)
.unwrap();
(
top_docs_u64
.iter()
.map(|(score, doc_addr)| (*score as f32, *doc_addr))
.collect(),
doc_count
)
}
else {searcher.search(&query, &(TopDocs::with_limit(query_string.offset + query_string.limit), Count)).unwrap()};
let hits: Vec<Hit> = {
top_collector
.docs()
.iter()
.skip(query_string.offset)
.map(|doc_address| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
top_docs
.iter()
.skip(query_string.offset)
.map(|(_score, doc_address)| {
let doc: Document = searcher.doc(*doc_address).unwrap();
self.create_hit(&doc, doc_address)
})
.collect()
};
Ok(AutocompleteResponse {
Ok(ListResponse {
q: query_string.q.clone(),
count: count_collector.count(),
count: doc_count,
hits,
})
}
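Note: when the query string is empty it is rewritten to "*", and both resources now rank by a FAST u64 field via `TopDocs::order_by_u64_field` instead of by relevance, casting the u64 sort key to f32 so the same hit-mapping code can be reused. A standalone sketch of that branch follows; it uses `AllQuery` directly, whereas the real code routes "*" through the query parser, and pagination plus the issuers' public_status filter are omitted.

```rust
use tantivy::collector::{Count, TopDocs};
use tantivy::query::AllQuery;
use tantivy::{DocAddress, Index, ReloadPolicy};

// Sketch only: lists documents ordered by the functions_count fast field,
// mirroring the q == "*" branch above.
fn list_all_by_functions_count(index: &Index, limit: usize) -> (Vec<(f32, DocAddress)>, usize) {
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()
        .unwrap();
    let searcher = reader.searcher();
    let functions_count_field = index.schema().get_field("functions_count").unwrap();

    // order_by_u64_field yields (u64, DocAddress) pairs instead of scored hits.
    let (top_docs, count) = searcher
        .search(
            &AllQuery,
            &(
                TopDocs::with_limit(limit).order_by_u64_field(functions_count_field),
                Count,
            ),
        )
        .unwrap();

    // Cast the u64 sort key to f32 so callers can treat it like a score.
    let hits: Vec<(f32, DocAddress)> = top_docs
        .iter()
        .map(|(value, doc_address)| (*value as f32, *doc_address))
        .collect();
    (hits, count)
}
```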
@@ -330,13 +244,9 @@ fn run_serve(data_dir: &Path, addr: &SocketAddr) -> tantivy::Result<()> {
let issuers_resource = {
let index = issuers::load_index(data_dir)?;
let schema = index.schema();
let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
let name_field = schema.get_field("name").unwrap();
let autocomplete_query_parser =
QueryParser::for_index(&index, vec![name_autocomplete_field]);
let list_query_parser = QueryParser::for_index(&index, vec![name_field]);
IssuersResource {
autocomplete_query_parser,
index,
list_query_parser,
schema,
@@ -346,13 +256,9 @@ fn run_serve(data_dir: &Path, addr: &SocketAddr) -> tantivy::Result<()> {
let persons_resource = {
let index = persons::load_index(data_dir)?;
let schema = index.schema();
let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
let name_field = schema.get_field("name").unwrap();
let autocomplete_query_parser =
QueryParser::for_index(&index, vec![name_autocomplete_field]);
let list_query_parser = QueryParser::for_index(&index, vec![name_field]);
PersonsResource {
autocomplete_query_parser,
index,
list_query_parser,
schema,
......
@@ -11,8 +11,7 @@ pub fn build_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
fs::create_dir_all(&index_dir).expect("Directory creation failed");
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u64_field("id", INT_STORED);
schema_builder.add_i64_field("id", STORED);
schema_builder.add_text_field("main_name", STORED);
{
@@ -25,22 +24,13 @@ pub fn build_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
schema_builder.add_text_field("name", text_options);
}
{
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("french_autocomplete_tokenizer")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("name_autocomplete", text_options);
}
schema_builder.add_u64_field("prices_count", FAST);
schema_builder.add_u64_field("public_status", INT_INDEXED);
schema_builder.add_u64_field("public_status", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_dir(&index_dir, schema.clone()).unwrap();
register_tokenizers(&index)?;
Ok(index)
......
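Note: tantivy 0.10 drops the INT_ prefix from the integer field flags (INT_STORED → STORED, INT_INDEXED → INDEXED), and the whole name_autocomplete field block disappears with the feature. A rough, hedged sketch of the resulting schema-builder sequence follows; the tokenizer name on the kept "name" field is not visible in this diff and is assumed.

```rust
use tantivy::schema::*;

// Rough sketch of the schema built above after the upgrade; "french_tokenizer"
// is an assumed name (only the removed autocomplete tokenizer appears in the
// diff), the other fields follow the flags visible in this hunk.
fn issuers_schema_sketch() -> Schema {
    let mut schema_builder = SchemaBuilder::default();
    schema_builder.add_i64_field("id", STORED);
    schema_builder.add_text_field("main_name", STORED);

    // The kept "name" field: stored text with positional indexing.
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("french_tokenizer")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();
    schema_builder.add_text_field("name", text_options);

    // tantivy 0.10 renames INT_STORED/INT_INDEXED to plain STORED/INDEXED.
    schema_builder.add_u64_field("prices_count", FAST);
    schema_builder.add_u64_field("public_status", INDEXED);
    schema_builder.build()
}
```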
@@ -20,8 +20,6 @@ use commands::*;
use std::io::Write;
fn main() {
// env_logger::init().unwrap();
let data_dir_arg = Arg::with_name("data_dir")
.short("d")
.long("data_dir")
@@ -32,7 +30,7 @@ fn main() {
let cli_options = App::new("DFIH-Search")
.setting(AppSettings::SubcommandRequiredElseHelp)
.version(env!("CARGO_PKG_VERSION"))
.author("Emmanuel Raviart <emmanuel@raviart.com>")
.author(crate_authors!())
.about(r#"Search web service for the "Données financières historiques (DFIH)" project"#)
.subcommand(
SubCommand::with_name("index")
......
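Note: main.rs now pulls the author list from Cargo.toml via clap 2's `crate_authors!` macro instead of hard-coding one name, so the new `[package].authors` entry above shows up in `--help` automatically. A minimal usage sketch, with an abbreviated about text:

```rust
// Sketch of the crate_authors! usage introduced above; clap 2 reads
// CARGO_PKG_AUTHORS at compile time, with an optional separator argument.
#[macro_use]
extern crate clap;

use clap::App;

fn main() {
    let _matches = App::new("DFIH-Search")
        .version(env!("CARGO_PKG_VERSION"))
        .author(crate_authors!(", "))
        .about("Search web service for the DFIH project")
        .get_matches();
}
```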
@@ -10,7 +10,7 @@ pub fn build_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
fs::create_dir_all(&index_dir).expect("Directory creation failed");
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u64_field("id", INT_STORED);
schema_builder.add_u64_field("id", STORED);
schema_builder.add_text_field("main_name", STORED);
......
@@ -66,12 +66,11 @@ pub const FRENCH_STOP_WORDS: [&str; 164] = [
"t", // t'
"y", // there
// forms of être (not including the infinitive):
"été", "étée", "étées", "étés", "étant", "suis", "es", "est", "sommes", "êtes",
"sont", "serai", "seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions",
"seriez", "seraient", "étais", "était", "étions", "étiez", "étaient", "fus", "fut",
"fûmes", "fûtes", "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses",
"fût", "fussions", "fussiez", "fussent",
// forms of avoir (not including the infinitive):
"été", "étée", "étées", "étés", "étant", "suis", "es", "est", "sommes", "êtes", "sont", "serai",
"seras", "sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez",
"seraient", "étais", "était", "étions", "étiez", "étaient", "fus", "fut", "fûmes", "fûtes",
"furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "fût", "fussions",
"fussiez", "fussent", // forms of avoir (not including the infinitive):
"ayant", "eu", "eue", "eues", "eus", "ai", "as", "avons", "avez", "ont", "aurai", "auras",
"aura", "aurons", "aurez", "auront", "aurais", "aurait", "aurions", "auriez", "auraient",
"avais", "avait", "avions", "aviez", "avaient", "eut", "eûmes", "eûtes", "eurent", "aie",
......
@@ -93,10 +93,6 @@ mod tests {
#[test]
fn test_slugr() {
assert_eq!(slug_helper("Tree"), vec!["tree".to_string()]);
assert_eq!(
slug_helper("Русский"),
vec!["русский".to_string()]
);
assert_eq!(slug_helper("Русский"), vec!["русский".to_string()]);
}
}