diff --git a/lib/shared/search-index.js b/lib/shared/search-index.js
index 026eca102..626e94146 100644
--- a/lib/shared/search-index.js
+++ b/lib/shared/search-index.js
@@ -1,89 +1,76 @@
 // @flow
 
 import Tokenizer from 'tokenize-text';
 
+import RadixTree from './radix-tree.js';
+
 type Token = {
   index: number,
   match: {
     [i: number]: string,
     index: number,
     input: string,
     ...
   },
   offset: number,
   value: string,
   ...
 };
 type TokenizeFunc = (str: string) => Token[];
 
 const defaultTokenize: TokenizeFunc = new Tokenizer().words();
 
 class SearchIndex {
   tokenize: (str: string) => Token[];
-  fullTextIndex: { [token: string]: Set<string> };
-  partialTextIndex: { [token: string]: Set<string> };
+  radixTree: RadixTree<string> = new RadixTree<string>();
 
   constructor(inputTokenize?: TokenizeFunc) {
     this.tokenize = inputTokenize ?? defaultTokenize;
-    this.fullTextIndex = {};
-    this.partialTextIndex = {};
   }
 
   addAllPrefixes(id: string, value: string): void {
-    if (this.fullTextIndex[value] === undefined) {
-      this.fullTextIndex[value] = new Set();
-    }
-    this.fullTextIndex[value].add(id);
-    let partialString = '';
-    for (let i = 0; i < value.length; i++) {
-      const char = value[i];
-      partialString += char;
-      // TODO probably should do some stopwords here
-      if (this.partialTextIndex[partialString] === undefined) {
-        this.partialTextIndex[partialString] = new Set();
-      }
-      this.partialTextIndex[partialString].add(id);
-    }
+    this.radixTree.add(value, id);
   }
 
   addEntry(id: string, rawText: string) {
     const keywords = this.tokenize(rawText);
     for (const keyword of keywords) {
       const value = keyword.value.toLowerCase();
       this.addAllPrefixes(id, value);
     }
   }
 
   getSearchResults(query: string): string[] {
     const keywords = this.tokenize(query);
     if (keywords.length === 0) {
       return [];
     }
 
     const lastKeyword = keywords[keywords.length - 1];
     const lastKeywordValue = lastKeyword.value.toLowerCase();
     const lastMatchSet = lastKeyword.match.input.match(/\S$/)
-      ? this.partialTextIndex[lastKeywordValue]
-      : this.fullTextIndex[lastKeywordValue];
-    if (!lastMatchSet) {
+      ? this.radixTree.getAllMatchingPrefix(lastKeywordValue)
+      : this.radixTree.getAllMatchingExactly(lastKeywordValue);
+    if (lastMatchSet.length === 0) {
       return [];
     }
     const fullKeywords = keywords.slice(0, -1).map(k => k.value.toLowerCase());
 
-    let possibleMatches: string[] = Array.from(lastMatchSet);
+    let possibleMatches = lastMatchSet;
     for (const keyword of fullKeywords) {
-      const fullMatches = this.fullTextIndex[keyword];
-      if (!fullMatches) {
+      const fullMatches = this.radixTree.getAllMatchingExactly(keyword);
+      if (fullMatches.length === 0) {
         return [];
       }
-      possibleMatches = possibleMatches.filter(id => fullMatches.has(id));
+      const fullMatchesSet = new Set(fullMatches);
+      possibleMatches = possibleMatches.filter(id => fullMatchesSet.has(id));
       if (possibleMatches.length === 0) {
         return [];
       }
     }
 
     return possibleMatches;
   }
 }
 
 export default SearchIndex;
diff --git a/lib/shared/sentence-prefix-search-index.js b/lib/shared/sentence-prefix-search-index.js
index 928c025fa..bc85bd7cf 100644
--- a/lib/shared/sentence-prefix-search-index.js
+++ b/lib/shared/sentence-prefix-search-index.js
@@ -1,37 +1,33 @@
 // @flow
 
 import Tokenizer from 'tokenize-text';
 
 import SearchIndex from './search-index.js';
 
 // defaultTokenize used in SearchIndex splits on punctuation
 // We use this alternative because we only want to split on whitespace
 const tokenize = new Tokenizer().re(/\S+/);
 
 class SentencePrefixSearchIndex extends SearchIndex {
   entries: Set<string>;
 
   constructor() {
     super(tokenize);
     this.entries = new Set();
   }
 
   addEntry(id: string, rawText: string) {
     const keywords = this.tokenize(rawText);
     for (const keyword of keywords) {
       const value = rawText.slice(keyword.index).toLowerCase();
       this.addAllPrefixes(id, value);
     }
     this.entries.add(id);
   }
 
   getSearchResults(query: string): string[] {
-    const transformedQuery = query.toLowerCase();
-    if (this.partialTextIndex[transformedQuery]) {
-      return Array.from(this.partialTextIndex[transformedQuery]);
-    }
-    return [];
+    return this.radixTree.getAllMatchingPrefix(query.toLowerCase());
   }
 }
 
 export default SentencePrefixSearchIndex;