Skip to content

Commit 8a630c0

Browse files
authored
feat(*): restore standalone split feature (#1390)
1 parent f472f33 commit 8a630c0

23 files changed

Lines changed: 1165 additions & 443 deletions

File tree

Cargo.lock

Lines changed: 429 additions & 400 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/cli/src/cli.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use clap::{crate_version, Parser, Subcommand};
33
use crate::alias::AliasCommands;
44
use crate::{
55
CompileArgs, DownloadArgs, DumpArgs, IndexArgs, InfoArgs, LexiconArgs, LookupArgs, MergeArgs,
6-
NewArgs, SearchArgs, ServeArgs, TokenizeArgs,
6+
NewArgs, SearchArgs, ServeArgs, SplitArgs, TokenizeArgs,
77
};
88

99
#[derive(Debug, Parser)]
@@ -71,6 +71,10 @@ pub enum Commands {
7171
#[command(arg_required_else_help = true)]
7272
Serve(ServeArgs),
7373

74+
/// Splits text into component dictionary words without attempting a whole-word lookup first
75+
#[command(arg_required_else_help = true)]
76+
Split(SplitArgs),
77+
7478
/// Tokenize text and find dictionary entries for each token
7579
#[command(arg_required_else_help = true)]
7680
Tokenize(TokenizeArgs),

crates/cli/src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ mod new;
1515
mod print;
1616
mod search;
1717
mod serve;
18+
mod split;
1819
mod tokenize;
1920
mod utils;
2021

@@ -34,5 +35,6 @@ pub use new::*;
3435
pub use print::*;
3536
pub use search::*;
3637
pub use serve::*;
38+
pub use split::*;
3739
pub use tokenize::*;
3840
pub use utils::*;

crates/cli/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use clap::Parser;
22
use console::style;
33
use odict_cli::{
4-
alias, compile, download, dump, index, info, lexicon, lookup, merge, new, search, serve,
4+
alias, compile, download, dump, index, info, lexicon, lookup, merge, new, search, serve, split,
55
tokenize, CLIContext, Commands, CLI,
66
};
77

@@ -23,6 +23,7 @@ async fn main() {
2323
Commands::Search(ref args) => search(&mut ctx, args).await,
2424
Commands::Serve(ref args) => serve(&mut ctx, args).await,
2525
Commands::Info(ref args) => info(&mut ctx, args).await,
26+
Commands::Split(ref args) => split(&mut ctx, args).await,
2627
Commands::Tokenize(ref args) => tokenize(&mut ctx, args).await,
2728
};
2829

crates/cli/src/serve/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ use crate::CLIContext;
1818

1919
mod lookup;
2020
mod search;
21+
mod split;
2122
mod tokenize;
2223

2324
#[derive(Debug, Clone, ValueEnum)]
@@ -174,6 +175,7 @@ pub async fn serve<'a>(ctx: &mut CLIContext<'a>, args: &ServeArgs) -> anyhow::Re
174175
.app_data(Data::clone(&data))
175176
.service(lookup::handle_lookup)
176177
.service(search::handle_search)
178+
.service(split::handle_split)
177179
.service(tokenize::handle_tokenize)
178180
})
179181
.bind(("0.0.0.0", *port))?

crates/cli/src/serve/split.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
use actix_web::{
2+
get,
3+
http::{header::ContentType, StatusCode},
4+
web::{Data, Path, Query},
5+
HttpResponse, Responder, ResponseError,
6+
};
7+
use derive_more::{Display, Error};
8+
use odict::{format::json::ToJSON, split::SplitOptions};
9+
use serde::Deserialize;
10+
11+
use crate::get_lookup_entries;
12+
13+
#[derive(Debug, Deserialize)]
14+
pub struct SplitRequest {
15+
q: String,
16+
follow: Option<bool>,
17+
min_length: Option<usize>,
18+
}
19+
20+
#[derive(Debug, Display, Error)]
21+
enum SplitError {
22+
#[display("Dictionary not found: {}", name)]
23+
DictionaryNotFound { name: String },
24+
25+
#[display("Failed to read dictionary: {}", name)]
26+
DictionaryReadError { name: String },
27+
28+
#[display("Split error: {}", message)]
29+
SplitError { message: String },
30+
31+
#[display("Failed to serialize response")]
32+
SerializeError,
33+
}
34+
35+
impl ResponseError for SplitError {
36+
fn error_response(&self) -> HttpResponse {
37+
HttpResponse::build(self.status_code())
38+
.insert_header(ContentType::html())
39+
.body(self.to_string())
40+
}
41+
42+
fn status_code(&self) -> StatusCode {
43+
match *self {
44+
SplitError::DictionaryNotFound { .. } => StatusCode::NOT_FOUND,
45+
SplitError::DictionaryReadError { .. } => StatusCode::INTERNAL_SERVER_ERROR,
46+
SplitError::SplitError { .. } => StatusCode::INTERNAL_SERVER_ERROR,
47+
SplitError::SerializeError => StatusCode::INTERNAL_SERVER_ERROR,
48+
}
49+
}
50+
}
51+
52+
#[get("/{name}/split")]
53+
async fn handle_split(
54+
params: Query<SplitRequest>,
55+
dict: Path<String>,
56+
dictionary_cache: Data<crate::serve::DictionaryCache>,
57+
) -> Result<impl Responder, SplitError> {
58+
let SplitRequest {
59+
q: raw_queries,
60+
follow,
61+
min_length,
62+
} = params.0;
63+
64+
let queries = raw_queries
65+
.split(',')
66+
.map(|s| s.to_string())
67+
.collect::<Vec<_>>();
68+
69+
let dictionary_name = dict.into_inner();
70+
71+
let file = dictionary_cache
72+
.get(&dictionary_name)
73+
.await
74+
.map_err(|_e| SplitError::DictionaryReadError {
75+
name: dictionary_name.to_string(),
76+
})?
77+
.ok_or(SplitError::DictionaryNotFound {
78+
name: dictionary_name.to_string(),
79+
})?;
80+
81+
let dictionary = file
82+
.contents()
83+
.map_err(|_e| SplitError::DictionaryReadError {
84+
name: dictionary_name.to_string(),
85+
})?;
86+
87+
let opts = SplitOptions::default()
88+
.threshold(min_length.unwrap_or(1))
89+
.follow(follow.unwrap_or(false));
90+
91+
let entries = dictionary
92+
.split(&queries, &opts)
93+
.map_err(|e| SplitError::SplitError {
94+
message: e.to_string(),
95+
})?;
96+
97+
let json = get_lookup_entries(entries)
98+
.to_json(true)
99+
.map_err(|_e| SplitError::SerializeError)?;
100+
101+
Ok(HttpResponse::Ok()
102+
.content_type("application/json")
103+
.body(json))
104+
}

crates/cli/src/split.rs

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
use std::time::Duration;
2+
3+
use crate::enums::PrintFormat;
4+
use crate::get_lookup_entries;
5+
use crate::{context::CLIContext, print_entries};
6+
use clap::Args;
7+
use odict::{download::DictionaryDownloader, split::SplitOptions, LoadOptions, OpenDictionary};
8+
9+
#[derive(Debug, Args)]
10+
#[command(args_conflicts_with_subcommands = true)]
11+
#[command(flatten_help = true)]
12+
pub struct SplitArgs {
13+
#[arg(required = true, help = "Path to a compiled dictionary")]
14+
dictionary_path: String,
15+
16+
#[arg(required = true, help = "Text to split into dictionary words")]
17+
queries: Vec<String>,
18+
19+
#[arg(
20+
short,
21+
long,
22+
value_enum,
23+
default_value_t = PrintFormat::Print,
24+
help = "Output format of the entries"
25+
)]
26+
format: PrintFormat,
27+
28+
#[arg(
29+
short = 'F',
30+
long,
31+
help = "Follow see_also redirects until finding an entry with etymologies"
32+
)]
33+
follow: bool,
34+
35+
#[arg(
36+
short = 'm',
37+
long,
38+
default_value_t = 1,
39+
help = "Minimum character length of each split segment"
40+
)]
41+
min_length: usize,
42+
43+
#[arg(
44+
short = 'i',
45+
long,
46+
default_value_t = false,
47+
help = "Perform case-insensitive lookups"
48+
)]
49+
insensitive: bool,
50+
51+
#[arg(
52+
short = 'r',
53+
long,
54+
default_value_t = crate::DEFAULT_RETRIES,
55+
help = "Number of times to retry loading the dictionary (remote-only)"
56+
)]
57+
retries: u32,
58+
}
59+
60+
pub async fn split<'a>(ctx: &mut CLIContext<'a>, args: &SplitArgs) -> anyhow::Result<()> {
61+
let SplitArgs {
62+
dictionary_path: path,
63+
queries,
64+
format,
65+
follow,
66+
min_length,
67+
insensitive,
68+
retries,
69+
} = args;
70+
71+
let spinner = indicatif::ProgressBar::new_spinner();
72+
73+
spinner.enable_steady_tick(Duration::from_millis(100));
74+
75+
let file = OpenDictionary::load_with_options(
76+
path,
77+
LoadOptions::default()
78+
.with_downloader(DictionaryDownloader::default().with_retries(*retries)),
79+
)
80+
.await?;
81+
82+
let opts = SplitOptions::default()
83+
.threshold(*min_length)
84+
.follow(*follow)
85+
.insensitive(*insensitive);
86+
87+
let result = file.contents()?.split(queries, opts);
88+
89+
spinner.finish_and_clear();
90+
91+
match result {
92+
Ok(entries) => {
93+
print_entries(ctx, get_lookup_entries(entries), format)?;
94+
Ok(())
95+
}
96+
Err(err) => Err(anyhow::Error::from(err)),
97+
}
98+
}

crates/lib/src/core/lookup.rs

Lines changed: 63 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,47 @@ macro_rules! lookup {
132132
Ok($opt::None)
133133
}
134134

135+
fn perform_split<'a>(
136+
&'a self,
137+
query: &str,
138+
options: &crate::split::SplitOptions,
139+
) -> crate::Result<Vec<LookupResult<&'a $ret>>> {
140+
let crate::split::SplitOptions {
141+
threshold,
142+
follow,
143+
insensitive,
144+
} = options;
145+
146+
let chars: Vec<_> = query.chars().collect();
147+
let mut results: Vec<LookupResult<&'a $ret>> = Vec::new();
148+
let mut start = 0;
149+
let mut end = chars.len();
150+
151+
while start < end {
152+
let substr: String = chars[start..end].iter().collect();
153+
let mut path = Vec::new();
154+
155+
match self.find_entry(follow, insensitive, substr.as_str(), None, &mut path) {
156+
Ok($opt::Some(result)) => {
157+
results.push(result);
158+
start = end;
159+
end = chars.len();
160+
}
161+
Ok($opt::None) => {
162+
if end - start <= *threshold {
163+
start = end;
164+
end = chars.len();
165+
} else {
166+
end -= 1;
167+
}
168+
}
169+
Err(e) => return Err(e),
170+
}
171+
}
172+
173+
Ok(results)
174+
}
175+
135176
fn perform_lookup<'a, Options>(
136177
&'a self,
137178
query: &str,
@@ -154,46 +195,32 @@ macro_rules! lookup {
154195
return Ok(vec![result]);
155196
}
156197

157-
let mut results: Vec<LookupResult<&$ret>> = Vec::new();
158-
159198
if let LookupStrategy::Split(min_length) = strategy {
160-
let chars: Vec<_> = query.chars().collect();
161-
let mut start = 0;
162-
let mut end = chars.len();
163-
164-
while start < end {
165-
let substr: String = chars[start..end].iter().collect();
166-
let mut substr_path = Vec::new();
167-
let maybe_entry = self.find_entry(
168-
follow,
169-
insensitive,
170-
substr.as_str(),
171-
None,
172-
&mut substr_path,
173-
);
174-
175-
match maybe_entry {
176-
Ok($opt::Some(result)) => {
177-
results.push(result);
178-
start = end;
179-
end = chars.len();
180-
continue;
181-
}
182-
Ok($opt::None) => {
183-
if substr.len() <= *min_length {
184-
start = end;
185-
end = chars.len();
186-
continue;
187-
}
188-
}
189-
Err(e) => return Err(e),
190-
}
199+
let split_opts = crate::split::SplitOptions::default()
200+
.threshold(*min_length)
201+
.follow(*follow)
202+
.insensitive(*insensitive);
191203

192-
end -= 1;
193-
}
204+
return self.perform_split(query, &split_opts);
194205
}
195206

196-
Ok(results)
207+
Ok(vec![])
208+
}
209+
210+
pub fn split<'a, Query, Options>(
211+
&'a self,
212+
queries: &Vec<Query>,
213+
options: Options,
214+
) -> crate::Result<Vec<LookupResult<&'a $ret>>>
215+
where
216+
Query: AsRef<str> + Send + Sync,
217+
Options: AsRef<crate::split::SplitOptions> + Send + Sync,
218+
{
219+
queries
220+
.par_iter()
221+
.map(|q| self.perform_split(q.as_ref(), options.as_ref()))
222+
.collect::<crate::Result<Vec<_>>>()
223+
.map(|v| v.into_iter().flatten().collect())
197224
}
198225

199226
pub fn lookup<'a, 'b, Query, Options>(

crates/lib/src/core/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ pub mod compile;
44
pub mod lexicon;
55
pub mod lookup;
66
pub mod merge;
7+
pub mod split;
78
pub mod preview;
89
pub mod rank;
910
pub mod read;

0 commit comments

Comments
 (0)