Commit ed6567be authored by Emmanuel Raviart

Add web server.

parent 5488110b

Cargo.toml
@@ -4,7 +4,13 @@
version = "0.1.0"
authors = ["Emmanuel Raviart <emmanuel@raviart.com>"]
[dependencies]
clap = "2"
json = "0.11"
rust-stemmers = "1"
serde = "1"
serde_derive = "1"
serde_json = "1"
serde_urlencoded = "0.5"
slug = "0.1"
tantivy = "0.7"
tower-web = "0.3"

src/commands/index.rs (new file; file paths inferred from the module declarations)

use clap::ArgMatches;
use issuers;
use std::fs::{self, File};
use std::io::prelude::*;
use std::path::Path;
use tantivy::schema::Document;
fn run_index(data_dir: &Path) -> tantivy::Result<()> {
    // Build a fresh index, then fill it from the issuer JSON files.
    let index = issuers::build_autocompletion_index(data_dir)?;
    let mut index_writer = index.writer(50_000_000)?; // 50 MB indexing buffer
    let issuers_dir = data_dir.join("issuers");
    let schema = index.schema();
    let name_field = schema.get_field("name").unwrap();
    for entry in fs::read_dir(issuers_dir).unwrap() {
        let json_file_path = entry.unwrap().path();
        let mut json_file = File::open(json_file_path).expect("JSON file not found");
        let mut json_string = String::new();
        json_file
            .read_to_string(&mut json_string)
            .expect("An error occurred while reading the file");
        let issuer = json::parse(&json_string).expect("Invalid JSON");
        // One document per issuer, carrying all of its known names.
        let mut issuer_doc = Document::default();
        for name in issuer["names"].members() {
            issuer_doc.add_text(name_field, &name.to_string());
        }
        index_writer.add_document(issuer_doc);
    }
    index_writer.commit()?;
    Ok(())
}
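
Note that the loop above reads only the "names" array of each file in <data_dir>/issuers, so a minimal input document would look like this (hypothetical sample; real DFIH files presumably carry more fields, which the indexer ignores):

{ "names": ["Crédit Lyonnais", "Le Crédit Lyonnais"] }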
pub fn run_index_cli(args: &ArgMatches) -> Result<(), String> {
    let data_dir = args
        .value_of("data_dir")
        .map(Path::new)
        .unwrap();
    run_index(data_dir).map_err(|e| format!("Indexing failed: {:?}", e))
}

src/commands/mod.rs (new file)

mod index;
mod serve;
pub use self::index::run_index_cli;
pub use self::serve::run_serve_cli;

src/commands/serve.rs (new file)

use clap::ArgMatches;
use issuers;
use std::convert::From;
use std::net::SocketAddr;
use std::path::Path;
use tantivy::collector;
use tantivy::collector::CountCollector;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::NamedFieldDocument;
use tantivy::schema::Schema;
use tantivy::DocAddress;
use tantivy::Document;
use tantivy::Index;
use tower_web::{Response, ServiceBuilder};
/// JSON body returned by the autocomplete endpoint.
#[derive(Response)]
struct Autocomplete {
    q: String,
    count: usize,
    hits: Vec<Hit>,
}

#[derive(Serialize)]
struct Hit {
    doc: NamedFieldDocument,
    id: u32,
}

/// Shared state of the web service: the index plus what is needed to query it.
struct IssuersResource {
    index: Index,
    query_parser: QueryParser,
    schema: Schema,
}

impl IssuersResource {
    fn create_hit(&self, doc: &Document, doc_address: &DocAddress) -> Hit {
        Hit {
            doc: self.schema.to_named_doc(doc),
            id: doc_address.doc(),
        }
    }
}
/// Query-string parameters accepted by the endpoint.
#[derive(Debug, Deserialize)]
struct SearchQuery {
    #[serde(default)]
    q: String,
    #[serde(default = "count_default")]
    count: usize,
}

fn count_default() -> usize {
    10
}
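
With these serde defaults both parameters are optional: an empty query string deserializes to q = "" and count = 10, while a request such as ?q=lyonnais&count=5 overrides both fields.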
impl_web! {
    impl IssuersResource {
        #[get("/issuers/autocomplete")]
        #[content_type("application/json")]
        fn autocomplete(&self, query_string: String) -> Result<Autocomplete, ()> {
            let url_query: SearchQuery = serde_urlencoded::from_str(&query_string).unwrap();
            let query = self
                .query_parser
                .parse_query(&url_query.q)
                .expect("Parsing the query failed");
            let searcher = self.index.searcher();
            // Collect the total hit count and the top documents in a single pass.
            let mut count_collector = CountCollector::default();
            let mut top_collector = TopCollector::with_limit(url_query.count);
            {
                let mut chained_collector = collector::chain()
                    .push(&mut top_collector)
                    .push(&mut count_collector);
                query.search(&searcher, &mut chained_collector).unwrap();
            }
            let hits: Vec<Hit> = top_collector
                .docs()
                .iter()
                .map(|doc_address| {
                    let doc: Document = searcher.doc(*doc_address).unwrap();
                    self.create_hit(&doc, doc_address)
                })
                .collect();
            Ok(Autocomplete {
                q: url_query.q.clone(),
                count: count_collector.count(),
                hits,
            })
        }
    }
}
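
For illustration, the route above would answer a request along these lines (hypothetical data; each hit mirrors the Hit struct, with the NamedFieldDocument serialized as a map from field name to stored values):

GET /issuers/autocomplete?q=lyonnais&count=2
{"q": "lyonnais", "count": 42, "hits": [{"doc": {"name": ["Crédit Lyonnais"]}, "id": 0}, ...]}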
fn run_serve(data_dir: &Path, addr: &SocketAddr) -> tantivy::Result<()> {
    let index = issuers::load_autocompletion_index(data_dir)?;
    let schema = index.schema();
    let name_field = schema.get_field("name").unwrap();
    // Queries are parsed against the single indexed field.
    let query_parser = QueryParser::for_index(&index, vec![name_field]);
    println!("Listening on http://{}", addr);
    ServiceBuilder::new()
        .resource(IssuersResource {
            index,
            query_parser,
            schema,
        })
        .run(addr)
        .unwrap();
    Ok(())
}
pub fn run_serve_cli(args: &ArgMatches) -> Result<(), String> {
    let data_dir = args
        .value_of("data_dir")
        .map(Path::new)
        .expect("Invalid data directory");
    let port = value_t!(args, "port", u16).unwrap();
    let host = args.value_of("host").unwrap();
    let addr = format!("{}:{}", host, port)
        .parse()
        .expect("Invalid address");
    run_serve(data_dir, &addr).map_err(|e| format!("{:?}", e))
}

src/issuers.rs (new file)

use slug::slugify;
use std::path::Path;
use tantivy::schema::*;
use tantivy::tokenizer::{RemoveLongFilter, SimpleTokenizer, StopWordFilter, Tokenizer};
use tantivy::Index;
use tokenizers::{FrenchStemmer, NgramTokenizer, Slugifier, FRENCH_STOP_WORDS};
pub fn build_autocompletion_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
    let index_dir = data_dir.join("index");
    // Single stored text field, analyzed by the n-gram autocompletion tokenizer.
    let mut schema_builder = SchemaBuilder::default();
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("french_autocompletion_tokenizer")
        // .set_tokenizer("french_tokenizer")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored();
    schema_builder.add_text_field("name", text_options);
    let schema = schema_builder.build();
    let index = Index::create_in_dir(&index_dir, schema)?;
    let french_autocompletion_tokenizer = NgramTokenizer::new(3, 3, false).filter(Slugifier);
    index.tokenizers().register(
        "french_autocompletion_tokenizer",
        french_autocompletion_tokenizer,
    );
    // let french_tokenizer = SimpleTokenizer
    //     .filter(RemoveLongFilter::limit(40))
    //     .filter(StopWordFilter::remove(
    //         FRENCH_STOP_WORDS.iter().map(slugify).collect(),
    //     ))
    //     .filter(Slugifier)
    //     .filter(FrenchStemmer::new());
    // index
    //     .tokenizers()
    //     .register("french_tokenizer", french_tokenizer);
    Ok(index)
}
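
To make the tokenizer concrete: assuming the project's NgramTokenizer::new(3, 3, false) emits every 3-character gram of a name and the Slugifier then folds each gram to lowercase ASCII (both are local modules not shown in this diff), indexing "Crédit" would produce tokens along the lines of cre, red, edi, dit, which is what lets short, accent-free query fragments match issuer names.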
pub fn load_autocompletion_index(data_dir: &Path) -> tantivy::Result<tantivy::Index> {
    let index_dir = data_dir.join("index");
    let index = Index::open_in_dir(index_dir)?;
    // Tokenizers are not persisted with the index, so the autocompletion
    // tokenizer must be registered again every time the index is opened.
    let french_autocompletion_tokenizer = NgramTokenizer::new(3, 3, false).filter(Slugifier);
    index.tokenizers().register(
        "french_autocompletion_tokenizer",
        french_autocompletion_tokenizer,
    );
    Ok(index)
}

src/main.rs (rewritten; the inline schema building, indexing loop, and demo search it previously contained now live in, or were replaced by, the issuers and commands modules)

#[macro_use]
extern crate clap;
extern crate json;
extern crate rust_stemmers;
extern crate serde;
#[macro_use]
extern crate serde_derive;
extern crate serde_json;
extern crate serde_urlencoded;
extern crate slug;
extern crate tantivy;
#[macro_use]
extern crate tower_web;

mod commands;
mod issuers;
mod tokenizers;

use clap::{App, AppSettings, Arg, SubCommand};
use commands::*;
use std::io::Write;

fn main() {
    // env_logger::init().unwrap();
    let data_dir_arg = Arg::with_name("data_dir")
        .short("d")
        .long("data_dir")
        .value_name("data_dir")
        .help("Directory containing the DFIH data")
        .default_value("../dfih-ui/data");
    let cli_options = App::new("Tantivy")
        .setting(AppSettings::SubcommandRequiredElseHelp)
        .version(env!("CARGO_PKG_VERSION"))
        .author("Emmanuel Raviart <emmanuel@raviart.com>")
        .about(r#"Search web service for the "Données financières historiques (DFIH)" project"#)
        .subcommand(
            SubCommand::with_name("index")
                .about("Index DFIH JSON files")
                .arg(data_dir_arg.clone()),
        )
        .subcommand(
            SubCommand::with_name("serve")
                .about("Start web server")
                .arg(data_dir_arg.clone())
                .arg(
                    Arg::with_name("host")
                        .long("host")
                        .value_name("host")
                        .help("Host to listen on")
                        .default_value("127.0.0.1"),
                )
                .arg(
                    Arg::with_name("port")
                        .short("p")
                        .long("port")
                        .value_name("port")
                        .help("Port")
                        .default_value("2999"),
                ),
        )
        .get_matches();
    // Dispatch to the matching subcommand handler; SubcommandRequiredElseHelp
    // guarantees that a subcommand is present.
    let (subcommand, some_options) = cli_options.subcommand();
    let options = some_options.unwrap();
    let run_cli = match subcommand {
        "index" => run_index_cli,
        "serve" => run_serve_cli,
        _ => panic!("Subcommand {} is unknown", subcommand),
    };
    if let Err(ref e) = run_cli(options) {
        let stderr = &mut std::io::stderr();
        let errmsg = "Error writing to stderr";
        writeln!(stderr, "{}", e).expect(errmsg);
        std::process::exit(1);
    }
}
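
For reference, the two subcommands defined above would be invoked roughly like this (relying on the default data directory baked into data_dir_arg):

cargo run -- index
cargo run -- serve --host 127.0.0.1 --port 2999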

src/tokenizers/mod.rs (new file)

mod french_stemmer;
mod french_stop_words;
mod ngram_tokenizer;
mod slugifier;
pub use self::french_stemmer::FrenchStemmer;
pub use self::french_stop_words::FRENCH_STOP_WORDS;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::slugifier::Slugifier;