Commit 5e418342 authored by Emmanuel Raviart

Add indexes for persons.

parent d2fb67f5
use clap::ArgMatches;
use issuers;
use persons;
use std::fs::{self, File};
use std::io::prelude::*;
use std::path::Path;
use tantivy::schema::Document;
fn run_index(data_dir: &Path) -> tantivy::Result<()> {
    {
        let index = issuers::build_index(data_dir)?;
        let mut index_writer = index.writer(50_000_000)?;
        let issuers_dir = data_dir.join("issuers");
        let schema = index.schema();
        let id_field = schema.get_field("id").unwrap();
        let main_name_field = schema.get_field("main_name").unwrap();
        let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
        let name_field = schema.get_field("name").unwrap();
        let securities_count_field = schema.get_field("securities_count").unwrap();
        for entry in fs::read_dir(issuers_dir).unwrap() {
            let json_file_path = entry.unwrap().path();
            let mut json_file = File::open(json_file_path).expect("JSON file not found");
            let mut json_string = String::new();
            json_file
                .read_to_string(&mut json_string)
                .expect("An error occurred while reading file");
            let issuer = json::parse(&json_string).expect("Invalid JSON");
            let mut issuer_doc = Document::default();
            issuer_doc.add_u64(id_field, issuer["id"].as_u64().unwrap());
            issuer_doc.add_text(main_name_field, issuer["name"].as_str().unwrap());
            for name in issuer["names"].members() {
                issuer_doc.add_text(name_autocomplete_field, &name.to_string());
                issuer_doc.add_text(name_field, &name.to_string());
            }
            issuer_doc.add_u64(securities_count_field, issuer["securities"].len() as u64);
            index_writer.add_document(issuer_doc);
        }
        index_writer.commit()?;
    }
    {
        let index = persons::build_index(data_dir)?;
        let mut index_writer = index.writer(50_000_000)?;
        let persons_dir = data_dir.join("persons");
        let schema = index.schema();
        let id_field = schema.get_field("id").unwrap();
        let main_name_field = schema.get_field("main_name").unwrap();
        let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
        let name_field = schema.get_field("name").unwrap();
        // let securities_count_field = schema.get_field("securities_count").unwrap();
        for entry in fs::read_dir(persons_dir).unwrap() {
            let json_file_path = entry.unwrap().path();
            let mut json_file = File::open(json_file_path).expect("JSON file not found");
            let mut json_string = String::new();
            json_file
                .read_to_string(&mut json_string)
                .expect("An error occurred while reading file");
            let person = json::parse(&json_string).expect("Invalid JSON");
            let mut person_doc = Document::default();
            person_doc.add_u64(id_field, person["id"].as_u64().unwrap());
            person_doc.add_text(main_name_field, person["name"].as_str().unwrap());
            for name in person["names"].members() {
                person_doc.add_text(name_autocomplete_field, &name.to_string());
                person_doc.add_text(name_field, &name.to_string());
            }
            // person_doc.add_u64(securities_count_field, person["securities"].len() as u64);
            index_writer.add_document(person_doc);
        }
        index_writer.commit()?;
    }
    Ok(())
}
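Not shown in this hunk is the dispatch that calls run_index; given the clap imports above and the subcommand setup in main.rs further down, a minimal sketch of the wiring might look like this (binary, subcommand, and argument names are assumptions, not taken from the diff):

use clap::{App, AppSettings, Arg, SubCommand};
use std::path::Path;

fn main() {
    // Hypothetical CLI layout; the real setup lives outside this hunk.
    let matches = App::new("server")
        .setting(AppSettings::SubcommandRequiredElseHelp)
        .subcommand(
            SubCommand::with_name("index")
                .arg(Arg::with_name("data_dir").required(true)),
        )
        .get_matches();
    if let Some(sub_matches) = matches.subcommand_matches("index") {
        let data_dir = Path::new(sub_matches.value_of("data_dir").unwrap());
        run_index(data_dir).expect("Indexing failed");
    }
}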
......
use clap::ArgMatches;
use issuers;
use persons;
use std::convert::From;
use std::net::SocketAddr;
use std::path::Path;
@@ -41,6 +42,22 @@ impl IssuersResource {
    }
}
struct PersonsResource {
    autocomplete_query_parser: QueryParser,
    index: Index,
    list_query_parser: QueryParser,
    schema: Schema,
}

impl PersonsResource {
    fn create_hit(&self, doc: &Document, doc_address: &DocAddress) -> Hit {
        Hit {
            doc: self.schema.to_named_doc(&doc),
            id: doc_address.doc(),
        }
    }
}
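The Hit type referenced by create_hit is defined in a part of the file this diff does not show. Judging from the body, where self.schema.to_named_doc(&doc) yields a tantivy NamedFieldDocument and doc_address.doc() yields a tantivy DocId, a plausible sketch is:

use tantivy::schema::NamedFieldDocument;
use tantivy::DocId;

// Sketch only; the serde derive is an assumption, made so the type can be
// embedded in the #[derive(Response)] structs below.
#[derive(Serialize)]
struct Hit {
    doc: NamedFieldDocument,
    id: DocId, // segment-local document id, not a stable identifier
}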
#[derive(Response)]
struct ListResponse {
    q: String,
@@ -160,6 +177,105 @@ impl_web!{
            })
        }
    }

    impl PersonsResource {
        #[get("/persons")]
        #[content_type("application/json")]
        fn list(&self, query_string: SearchQuery) -> Result<ListResponse, ()> {
            let q = query_string.q.replace("'", " ");
            let q = q.replace("\"", " ");
            let q = q.trim();
            let searcher = self.index.searcher();
            let mut count_collector = CountCollector::default();
            let hits: Vec<Hit> = if q.is_empty() {
                // let mut top_collector = TopFieldCollector::<u64>::with_limit(self.schema.get_field("securities_count").unwrap(), query_string.offset + query_string.limit);
                let mut top_collector =
                    TopScoreCollector::with_limit(query_string.offset + query_string.limit);
                {
                    let mut chained_collector = collector::chain()
                        .push(&mut top_collector)
                        .push(&mut count_collector);
                    let query = Box::new(AllQuery);
                    query.search(&searcher, &mut chained_collector).unwrap();
                }
                top_collector
                    .docs()
                    .iter()
                    .skip(query_string.offset)
                    .map(|doc_address| {
                        let doc: Document = searcher.doc(*doc_address).unwrap();
                        self.create_hit(&doc, doc_address)
                    })
                    .collect()
            } else {
                let mut top_collector =
                    TopScoreCollector::with_limit(query_string.offset + query_string.limit);
                {
                    let mut chained_collector = collector::chain()
                        .push(&mut top_collector)
                        .push(&mut count_collector);
                    let query = self.list_query_parser
                        .parse_query(q)
                        .expect("Parsing the query failed");
                    query.search(&searcher, &mut chained_collector).unwrap();
                }
                top_collector
                    .docs()
                    .iter()
                    .skip(query_string.offset)
                    .map(|doc_address| {
                        let doc: Document = searcher.doc(*doc_address).unwrap();
                        self.create_hit(&doc, doc_address)
                    })
                    .collect()
            };
            Ok(ListResponse {
                q: query_string.q.clone(),
                count: count_collector.count(),
                hits,
            })
        }

        #[get("/persons/autocomplete")]
        #[content_type("application/json")]
        fn autocomplete(&self, query_string: SearchQuery) -> Result<AutocompleteResponse, ()> {
            let q = query_string.q.replace("'", " ");
            let q = q.replace("\"", " ");
            let q = q.trim();
            let query = if q.is_empty() {
                Box::new(AllQuery)
            } else {
                self.autocomplete_query_parser
                    .parse_query(q)
                    .expect("Parsing the query failed")
            };
            let searcher = self.index.searcher();
            let mut count_collector = CountCollector::default();
            let mut top_collector =
                TopScoreCollector::with_limit(query_string.offset + query_string.limit);
            {
                let mut chained_collector = collector::chain()
                    .push(&mut top_collector)
                    .push(&mut count_collector);
                query.search(&searcher, &mut chained_collector).unwrap();
            }
            let hits: Vec<Hit> = {
                top_collector
                    .docs()
                    .iter()
                    .skip(query_string.offset)
                    .map(|doc_address| {
                        let doc: Document = searcher.doc(*doc_address).unwrap();
                        self.create_hit(&doc, doc_address)
                    })
                    .collect()
            };
            Ok(AutocompleteResponse {
                q: query_string.q.clone(),
                count: count_collector.count(),
                hits,
            })
        }
    }
}
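SearchQuery, shared by both handlers, is also defined outside this hunk; from the fields it exposes (q, offset, limit) and tower-web's query-string extraction, it presumably resembles the following sketch (field types are assumptions):

#[derive(Extract)]
struct SearchQuery {
    q: String,
    offset: usize,
    limit: usize,
}

Note that both endpoints collect the top offset + limit documents and then skip(offset), so the cost of a page grows with how deep it is; a common and acceptable trade-off for autocomplete-sized offsets.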
fn run_serve(data_dir: &Path, addr: &SocketAddr) -> tantivy::Result<()> {
@@ -179,6 +295,22 @@ fn run_serve(data_dir: &Path, addr: &SocketAddr) -> tantivy::Result<()> {
        }
    };

    let persons_resource = {
        let index = persons::load_index(data_dir)?;
        let schema = index.schema();
        let name_autocomplete_field = schema.get_field("name_autocomplete").unwrap();
        let name_field = schema.get_field("name").unwrap();
        let autocomplete_query_parser =
            QueryParser::for_index(&index, vec![name_autocomplete_field]);
        let list_query_parser = QueryParser::for_index(&index, vec![name_field]);
        PersonsResource {
            autocomplete_query_parser,
            index,
            list_query_parser,
            schema,
        }
    };

    let cors = CorsBuilder::new()
        .allow_origins(AllowedOrigins::Any { allow_null: true })
        .build();
@@ -186,6 +318,7 @@
    println!("Listening on http://{}", addr);

    ServiceBuilder::new()
        .resource(issuers_resource)
        .resource(persons_resource)
        .middleware(cors)
        .run(addr)
        .unwrap();
......
@@ -12,6 +12,7 @@ extern crate tower_web;
mod commands;
mod issuers;
mod persons;
mod tokenizers;
use clap::{App, AppSettings, Arg, SubCommand};
......
use std::fs;
use std::path::Path;
use tantivy::schema::*;
use tantivy::tokenizer::{SimpleTokenizer, Tokenizer};
use tantivy::Index;
use tokenizers::{NgramTokenizer, Slugifier};
pub fn build_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
    let index_dir = data_dir.join("indexes").join("persons");
    fs::create_dir_all(&index_dir).expect("Directory creation failed");
    let mut schema_builder = SchemaBuilder::default();
    schema_builder.add_u64_field("id", INT_STORED);
    schema_builder.add_text_field("main_name", STORED);
    {
        let text_field_indexing = TextFieldIndexing::default()
            .set_tokenizer("french_tokenizer")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_options = TextOptions::default()
            .set_indexing_options(text_field_indexing)
            .set_stored();
        schema_builder.add_text_field("name", text_options);
    }
    {
        let text_field_indexing = TextFieldIndexing::default()
            .set_tokenizer("french_autocomplete_tokenizer")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_options = TextOptions::default()
            .set_indexing_options(text_field_indexing)
            .set_stored();
        schema_builder.add_text_field("name_autocomplete", text_options);
    }
    schema_builder.add_u64_field("securities_count", FAST);
    let schema = schema_builder.build();
    let index = Index::create_in_dir(&index_dir, schema.clone()).unwrap();
    register_tokenizers(&index)?;
    Ok(index)
}

pub fn load_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
    let index_dir = data_dir.join("indexes").join("persons");
    let index = Index::open_in_dir(index_dir)?;
    register_tokenizers(&index)?;
    Ok(index)
}

fn register_tokenizers(index: &tantivy::Index) -> tantivy::Result<()> {
    let tokenizers = index.tokenizers();
    let french_autocomplete_tokenizer = NgramTokenizer::new(1, 3, false).filter(Slugifier);
    tokenizers.register("french_autocomplete_tokenizer", french_autocomplete_tokenizer);
    let french_tokenizer = SimpleTokenizer.filter(Slugifier);
    tokenizers.register("french_tokenizer", french_tokenizer);
    Ok(())
}
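To make the two analyzers concrete, assuming Slugifier lowercases and strips accents and that NgramTokenizer::new(1, 3, false) emits every 1- to 3-character ngram (the false presumably meaning "not prefix-only"; both types live in the local tokenizers module, whose code is not part of this diff):

// Hedged illustration only.
// "Émile" -> french_tokenizer              -> ["emile"]
// "Émile" -> french_autocomplete_tokenizer -> ["e", "em", "emi", "m", "mi",
//                                              "mil", "i", "il", "ile", "l",
//                                              "le", "e"]

The ngram field is what lets /persons/autocomplete match partial, accent-insensitive input such as "emi", at the cost of a considerably larger index than the plain "name" field.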