Skip to content

Commit ee917db

Browse files
committed
Implement size_hint for ngrams and optimize unknown lookup.
For BucketIndexer, the number of ngrams (and thus subword indices) can be precomputed. Allocating a correctly sized buffer for the subwords yields large boosts in benches.
1 parent 8218cf4 commit ee917db

File tree

3 files changed

+36
-4
lines changed

3 files changed

+36
-4
lines changed

src/chunks/vocab/subword.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use crate::subword::{
1313
BucketIndexer, ExplicitIndexer, FinalfusionHashIndexer, Indexer,
1414
SubwordIndices as StrSubwordIndices,
1515
};
16+
use crate::util::CollectWithCapacity;
1617

1718
/// fastText vocabulary with hashed n-grams.
1819
pub type FastTextSubwordVocab = SubwordVocab<FastTextIndexer>;
@@ -81,7 +82,7 @@ where
8182
}
8283

8384
fn bracket(word: impl AsRef<str>) -> String {
84-
let mut bracketed = String::new();
85+
let mut bracketed = String::with_capacity(word.as_ref().len() + 2);
8586
bracketed.push(Self::BOW);
8687
bracketed.push_str(word.as_ref());
8788
bracketed.push(Self::EOW);
@@ -164,11 +165,16 @@ where
164165
I: Indexer,
165166
{
166167
fn subword_indices(&self, word: &str) -> Option<Vec<usize>> {
167-
let indices = Self::bracket(word)
168+
let word = Self::bracket(word);
169+
let indices = word
168170
.as_str()
169171
.subword_indices(self.min_n as usize, self.max_n as usize, &self.indexer)
170-
.map(|idx| idx as usize + self.words_len())
171-
.collect::<Vec<_>>();
172+
.map(|idx| idx as usize + self.words_len());
173+
if I::infallible() {
174+
let size = indices.size_hint().1.unwrap();
175+
return Some(indices.collect_with_capacity(size));
176+
}
177+
let indices = indices.collect::<Vec<_>>();
172178
if indices.is_empty() {
173179
None
174180
} else {

src/compat/fasttext/indexer.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@ impl Indexer for FastTextIndexer {
4848
fn upper_bound(&self) -> u64 {
4949
u64::from(self.buckets)
5050
}
51+
52+
fn infallible() -> bool {
53+
true
54+
}
5155
}
5256

5357
/// fastText FNV-1a implementation.

src/subword.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ pub trait Indexer {
2121

2222
/// Return the (exclusive) upper bound of this indexer.
2323
fn upper_bound(&self) -> u64;
24+
25+
/// Indicates whether this Indexer never fails to produce an index.
26+
fn infallible() -> bool;
2427
}
2528

2629
/// N-Gram indexer with bucketing.
@@ -116,6 +119,10 @@ where
116119
// max val is <= 64
117120
2u64.pow(self.buckets_exp as u32)
118121
}
122+
123+
fn infallible() -> bool {
124+
true
125+
}
119126
}
120127

121128
impl<H> PartialEq for HashIndexer<H> {
@@ -209,6 +216,10 @@ impl Indexer for ExplicitIndexer {
209216
fn upper_bound(&self) -> u64 {
210217
self.bound as u64
211218
}
219+
220+
fn infallible() -> bool {
221+
false
222+
}
212223
}
213224

214225
/// A string reference with its length in characters.
@@ -341,6 +352,12 @@ impl<'a> Iterator for NGrams<'a> {
341352

342353
Some(ngram_with_len)
343354
}
355+
356+
#[inline]
357+
fn size_hint(&self) -> (usize, Option<usize>) {
358+
let cap_approx = (self.max_n - self.min_n + 1) * self.char_offsets.len();
359+
(cap_approx, Some(cap_approx))
360+
}
344361
}
345362

346363
/// Trait returning iterators over subwords and indices.
@@ -435,6 +452,11 @@ where
435452
.next()
436453
.map(|ngram| (ngram.inner, self.indexer.index_ngram(&ngram)))
437454
}
455+
456+
#[inline]
457+
fn size_hint(&self) -> (usize, Option<usize>) {
458+
self.ngrams.size_hint()
459+
}
438460
}
439461

440462
#[cfg(test)]

0 commit comments

Comments
 (0)