diff --git a/lib/shared/search-index.js b/lib/shared/search-index.js
index 432b2cadb..026eca102 100644
--- a/lib/shared/search-index.js
+++ b/lib/shared/search-index.js
@@ -1,86 +1,89 @@
 // @flow

 import Tokenizer from 'tokenize-text';

 type Token = {
   index: number,
   match: {
     [i: number]: string,
     index: number,
     input: string,
     ...
   },
   offset: number,
   value: string,
   ...
 };

+type TokenizeFunc = (str: string) => Token[];
+const defaultTokenize: TokenizeFunc = new Tokenizer().words();
+
 class SearchIndex {
   tokenize: (str: string) => Token[];
   fullTextIndex: { [token: string]: Set<string> };
   partialTextIndex: { [token: string]: Set<string> };

-  constructor() {
-    this.tokenize = new Tokenizer().words();
+  constructor(inputTokenize?: TokenizeFunc) {
+    this.tokenize = inputTokenize ?? defaultTokenize;
     this.fullTextIndex = {};
     this.partialTextIndex = {};
   }

   addAllPrefixes(id: string, value: string): void {
     if (this.fullTextIndex[value] === undefined) {
       this.fullTextIndex[value] = new Set();
     }
     this.fullTextIndex[value].add(id);
     let partialString = '';
     for (let i = 0; i < value.length; i++) {
       const char = value[i];
       partialString += char;
       // TODO probably should do some stopwords here
       if (this.partialTextIndex[partialString] === undefined) {
         this.partialTextIndex[partialString] = new Set();
       }
       this.partialTextIndex[partialString].add(id);
     }
   }

   addEntry(id: string, rawText: string) {
     const keywords = this.tokenize(rawText);
     for (const keyword of keywords) {
       const value = keyword.value.toLowerCase();
       this.addAllPrefixes(id, value);
     }
   }

   getSearchResults(query: string): string[] {
     const keywords = this.tokenize(query);
     if (keywords.length === 0) {
       return [];
     }

     const lastKeyword = keywords[keywords.length - 1];
     const lastKeywordValue = lastKeyword.value.toLowerCase();
     const lastMatchSet = lastKeyword.match.input.match(/\S$/)
       ? this.partialTextIndex[lastKeywordValue]
       : this.fullTextIndex[lastKeywordValue];
     if (!lastMatchSet) {
       return [];
     }
     const fullKeywords = keywords.slice(0, -1).map(k => k.value.toLowerCase());

     let possibleMatches: string[] = Array.from(lastMatchSet);
     for (const keyword of fullKeywords) {
       const fullMatches = this.fullTextIndex[keyword];
       if (!fullMatches) {
         return [];
       }
       possibleMatches = possibleMatches.filter(id => fullMatches.has(id));
       if (possibleMatches.length === 0) {
         return [];
       }
     }

     return possibleMatches;
   }
 }

 export default SearchIndex;
diff --git a/lib/shared/sentence-prefix-search-index.js b/lib/shared/sentence-prefix-search-index.js
index 4848f2f0b..928c025fa 100644
--- a/lib/shared/sentence-prefix-search-index.js
+++ b/lib/shared/sentence-prefix-search-index.js
@@ -1,34 +1,37 @@
 // @flow

 import Tokenizer from 'tokenize-text';

 import SearchIndex from './search-index.js';

+// defaultTokenize used in SearchIndex splits on punctuation
+// We use this alternative because we only want to split on whitespace
+const tokenize = new Tokenizer().re(/\S+/);
+
 class SentencePrefixSearchIndex extends SearchIndex {
   entries: Set<string>;

   constructor() {
-    super();
-    this.tokenize = new Tokenizer().re(/\S+/);
+    super(tokenize);
     this.entries = new Set();
   }

   addEntry(id: string, rawText: string) {
     const keywords = this.tokenize(rawText);
     for (const keyword of keywords) {
       const value = rawText.slice(keyword.index).toLowerCase();
       this.addAllPrefixes(id, value);
     }
     this.entries.add(id);
   }

   getSearchResults(query: string): string[] {
     const transformedQuery = query.toLowerCase();
     if (this.partialTextIndex[transformedQuery]) {
       return Array.from(this.partialTextIndex[transformedQuery]);
     }
     return [];
   }
 }

 export default SentencePrefixSearchIndex;
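
A minimal usage sketch of how the two indexes behave after this change. This is not part of the patch; the ids and strings below are made up, and the relative import paths assume both files sit in lib/shared/ as above.

// @flow
import SearchIndex from './search-index.js';
import SentencePrefixSearchIndex from './sentence-prefix-search-index.js';

// SearchIndex keeps the default Tokenizer().words() tokenizer, so entries
// are indexed word by word and the last query keyword matches by prefix.
const wordIndex = new SearchIndex();
wordIndex.addEntry('msg1', 'hello world');
wordIndex.getSearchResults('wor'); // ['msg1'] via the partial (prefix) index
wordIndex.getSearchResults('hello '); // ['msg1']; trailing space forces a full-word match

// SentencePrefixSearchIndex passes the whitespace tokenizer to super() and
// indexes every token-start suffix of the raw text, so multi-word prefixes
// (spaces included) can match.
const nameIndex = new SentencePrefixSearchIndex();
nameIndex.addEntry('user1', 'John Doe');
nameIndex.getSearchResults('john d'); // ['user1'], since "john d" is a prefix of "john doe"
nameIndex.getSearchResults('doe'); // ['user1'], since the suffix starting at "Doe" is indexed too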