diff --git a/keyserver/src/creators/message-creator.js b/keyserver/src/creators/message-creator.js
--- a/keyserver/src/creators/message-creator.js
+++ b/keyserver/src/creators/message-creator.js
@@ -32,6 +32,7 @@
   appendSQLArray,
   mergeOrConditions,
 } from '../database/database.js';
+import { processMessagesForSearch } from '../database/search-utils.js';
 import {
   fetchMessageInfoForLocalID,
   fetchMessageInfoByID,
@@ -288,6 +289,7 @@
 // (1) Sending push notifs
 // (2) Setting threads to unread and generating corresponding UpdateInfos
 // (3) Publishing to Redis so that active sockets pass on new messages
+// (4) Processing messages for search
 async function postMessageSend(
   viewer: Viewer,
   threadsToMessageIndices: Map,
@@ -296,6 +298,8 @@
   messageDatas: MessageData[],
   updatesForCurrentSession: UpdatesForCurrentSession,
 ) {
+  const processForSearch = processMessagesForSearch(messageInfos);
+
   let joinIndex = 0;
   let subthreadSelects = '';
   const subthreadJoins = [];
@@ -498,6 +502,7 @@
     createReadStatusUpdates(latestMessages),
     redisPublish(viewer, messageInfosPerUser, updatesForCurrentSession),
     updateLatestMessages(latestMessages),
+    processForSearch,
   ]);
 
   await Promise.all([
diff --git a/keyserver/src/database/search-utils.js b/keyserver/src/database/search-utils.js
new file mode 100644
--- /dev/null
+++ b/keyserver/src/database/search-utils.js
@@ -0,0 +1,81 @@
+// @flow
+
+import natural from 'natural';
+
+import type { RawMessageInfo } from 'lib/types/message-types';
+import { messageTypes } from 'lib/types/message-types.js';
+
+import { dbQuery, SQL } from '../database/database.js';
+import { getSegmenter } from '../utils/segmenter.js';
+
+// Matches a string that is empty or made up solely of whitespace.
+const whiteSpacesRegex = /^[\s]*$/;
+// Matches every character in a Unicode punctuation category.
+const punctuationRegex: RegExp = /\p{General_Category=Punctuation}/gu;
+
+const segmenter = getSegmenter();
+const { stopwords } = natural;
+// Set copy of the stopword list, so a lookup does not scan the whole array.
+const stopwordSet = new Set(stopwords);
+
+// Normalizes message text for search. The text is lowercased and segmented;
+// whitespace-only segments and stopwords are dropped; every remaining segment
+// is stemmed and stripped of punctuation. Returns the non-empty results
+// joined by single spaces, or '' when nothing is left.
+function segmentAndStem(message: string): string {
+  const stemmedSegments = [];
+  for (const { segment } of segmenter.segment(message.toLowerCase())) {
+    if (whiteSpacesRegex.test(segment) || stopwordSet.has(segment)) {
+      continue;
+    }
+    const stemmedSegment = natural.PorterStemmer.stem(segment).replaceAll(
+      punctuationRegex,
+      '',
+    );
+    // A punctuation-only segment is empty by now; skipping it keeps doubled
+    // spaces out of the joined result.
+    if (!whiteSpacesRegex.test(stemmedSegment)) {
+      stemmedSegments.push(stemmedSegment);
+    }
+  }
+  return stemmedSegments.join(' ');
+}
+
+// Indexes the TEXT and EDIT_MESSAGE entries of `messages` for search; every
+// other message type is ignored. Each entry becomes one message_search row
+// whose processed_content is its text run through segmentAndStem. A TEXT
+// message uses its own ID as original_message_id, while an EDIT_MESSAGE uses
+// the ID of the message it targets. ON DUPLICATE KEY UPDATE overwrites
+// message_id and processed_content when the insert hits an existing key.
+// No query is issued when there is nothing to index.
+async function processMessagesForSearch(
+  messages: $ReadOnlyArray<RawMessageInfo>,
+): Promise<void> {
+  const processedMessages = [];
+
+  for (const msg of messages) {
+    if (msg.type === messageTypes.TEXT) {
+      processedMessages.push([msg.id, msg.id, segmentAndStem(msg.text)]);
+    } else if (msg.type === messageTypes.EDIT_MESSAGE) {
+      processedMessages.push([
+        msg.targetMessageID,
+        msg.id,
+        segmentAndStem(msg.text),
+      ]);
+    }
+  }
+
+  if (processedMessages.length === 0) {
+    return;
+  }
+
+  await dbQuery(SQL`
+    INSERT INTO message_search (original_message_id, message_id, processed_content)
+    VALUES ${processedMessages}
+    ON DUPLICATE KEY UPDATE
+      message_id = VALUE(message_id),
+      processed_content = VALUE(processed_content);
+  `);
+}
+
+export { processMessagesForSearch, segmentAndStem, stopwords };
diff --git a/keyserver/src/database/search-utils.test.js b/keyserver/src/database/search-utils.test.js
new file mode 100644
--- /dev/null
+++ b/keyserver/src/database/search-utils.test.js
@@ -0,0 +1,52 @@
+// @flow
+import { segmentAndStem, stopwords } from './search-utils.js';
+
+const alphaNumericRegex = /^[A-Za-z0-9 ]*$/;
+const lowerCaseRegex = /^[a-z ]*$/;
+
+describe('segmentAndStem(message: string)', () => {
+  it('should remove punctuation', () => {
+    expect(segmentAndStem("o'clock")).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('test@example')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('100,000')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('100,000,000')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('hello, bye')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('hello []!"#$%&\'()*,./:;?@\\_{}- bye')).toMatch(
+      alphaNumericRegex,
+    );
+  });
+
+  it('should remove uppercase', () => {
+    expect(segmentAndStem('Hi Comm')).toMatch(lowerCaseRegex);
+    expect(segmentAndStem('HELLO')).toMatch(lowerCaseRegex);
+  });
+
+  it('should remove stopwords', () => {
+    const [stopWord1, stopWord2, stopWord3, stopWord4, stopWord5] = stopwords;
+    expect(segmentAndStem(`hello ${stopWord1}`)).toBe('hello');
+    expect(segmentAndStem(`${stopWord2} ${stopWord3} ${stopWord4}`)).toBe('');
+    expect(segmentAndStem(`${stopWord5} bye`)).toBe('bye');
+  });
+
+  it('should remove excess whitespace', () => {
+    expect(segmentAndStem('hello    bye')).not.toMatch(/[\s]{2}/);
+  });
+
+  it('should remove unicode punctuation', () => {
+    expect(segmentAndStem('︴﹍⸺〰༻༽»⸃«⸠%¿【﹃')).toBe('');
+  });
+
+  it('should not remove emojis', () => {
+    const emojiThumbsUp = '👍';
+    const emojiFace = '🫡';
+    expect(segmentAndStem(emojiThumbsUp)).toBe(emojiThumbsUp);
+    expect(segmentAndStem(emojiFace)).toBe(emojiFace);
+  });
+
+  it('should leave + < = > ^ ` | ~', () => {
+    const notRemovedASCIIPunctuation = '+ < = > ^ ` | ~';
+    expect(segmentAndStem(notRemovedASCIIPunctuation)).toBe(
+      notRemovedASCIIPunctuation,
+    );
+  });
+});