Page Menu | Home | Phorge

D7077.1765356661.diff
No One | Temporary

Size
5 KB
Referenced Files
None
Subscribers
None

D7077.1765356661.diff

diff --git a/keyserver/src/creators/message-creator.js b/keyserver/src/creators/message-creator.js
--- a/keyserver/src/creators/message-creator.js
+++ b/keyserver/src/creators/message-creator.js
@@ -32,6 +32,7 @@
appendSQLArray,
mergeOrConditions,
} from '../database/database.js';
+import { processMessagesForSearch } from '../database/search-utils.js';
import {
fetchMessageInfoForLocalID,
fetchMessageInfoByID,
@@ -288,6 +289,7 @@
// (1) Sending push notifs
// (2) Setting threads to unread and generating corresponding UpdateInfos
// (3) Publishing to Redis so that active sockets pass on new messages
+// (4) Processing messages for search
async function postMessageSend(
viewer: Viewer,
threadsToMessageIndices: Map<string, number[]>,
@@ -296,6 +298,8 @@
messageDatas: MessageData[],
updatesForCurrentSession: UpdatesForCurrentSession,
) {
+ const processForSearch = processMessagesForSearch(messageInfos);
+
let joinIndex = 0;
let subthreadSelects = '';
const subthreadJoins = [];
@@ -498,6 +502,7 @@
createReadStatusUpdates(latestMessages),
redisPublish(viewer, messageInfosPerUser, updatesForCurrentSession),
updateLatestMessages(latestMessages),
+ processForSearch,
]);
await Promise.all([
diff --git a/keyserver/src/database/search-utils.js b/keyserver/src/database/search-utils.js
new file mode 100644
--- /dev/null
+++ b/keyserver/src/database/search-utils.js
@@ -0,0 +1,84 @@
+// @flow
+
+import natural from 'natural';
+
+import type { RawMessageInfo } from 'lib/types/message-types.js';
+import { messageTypes } from 'lib/types/message-types.js';
+
+import { dbQuery, SQL } from '../database/database.js';
+import { getSegmenter } from '../utils/segmenter.js';
+
+// Matches strings that are empty or contain only whitespace.
+const whiteSpacesRegex = /^[\s]*$/;
+// Matches any character in the Unicode "Punctuation" general category.
+const punctuationRegex: RegExp = /\p{General_Category=Punctuation}/gu;
+
+const segmenter = getSegmenter();
+const { stopwords } = natural;
+// Set built from the stopword list so that each membership check is a hash
+// lookup rather than a scan of the whole array.
+const stopwordsSet = new Set(stopwords);
+
+// Converts a message into the normalized string stored for search: the text is
+// lowercased and split into segments, whitespace-only segments and stopwords
+// are dropped, and each remaining segment is stemmed with the Porter stemmer
+// and stripped of punctuation. The surviving segments are joined with single
+// spaces; the result is '' when nothing survives.
+function segmentAndStem(message: string): string {
+  const stemmedSegments = [];
+  for (const { segment } of segmenter.segment(message.toLowerCase())) {
+    if (whiteSpacesRegex.test(segment) || stopwordsSet.has(segment)) {
+      continue;
+    }
+    const stemmedSegment = natural.PorterStemmer.stem(segment).replaceAll(
+      punctuationRegex,
+      '',
+    );
+    // A segment made up entirely of punctuation is empty at this point and
+    // must not contribute a token (or a stray space) to the output.
+    if (!whiteSpacesRegex.test(stemmedSegment)) {
+      stemmedSegments.push(stemmedSegment);
+    }
+  }
+  return stemmedSegments.join(' ');
+}
+
+// Writes the searchable form of each TEXT and EDIT_MESSAGE message to the
+// message_search table; every other message type is skipped. A TEXT message
+// is stored under its own ID. An EDIT_MESSAGE is stored under the ID of the
+// message it targets, so on a duplicate key the existing row's message_id and
+// processed_content are overwritten with the edit's values.
+// NOTE(review): this relies on original_message_id being a unique key of
+// message_search - confirm against the table definition.
+async function processMessagesForSearch(
+  messages: $ReadOnlyArray<RawMessageInfo>,
+): Promise<void> {
+  const processedMessages = [];
+
+  for (const msg of messages) {
+    if (msg.type === messageTypes.TEXT) {
+      processedMessages.push([msg.id, msg.id, segmentAndStem(msg.text)]);
+    } else if (msg.type === messageTypes.EDIT_MESSAGE) {
+      processedMessages.push([
+        msg.targetMessageID,
+        msg.id,
+        segmentAndStem(msg.text),
+      ]);
+    }
+  }
+
+  // Nothing to index; skip the query rather than send an empty VALUES list.
+  if (processedMessages.length === 0) {
+    return;
+  }
+
+  await dbQuery(SQL`
+    INSERT INTO message_search (original_message_id, message_id, processed_content)
+    VALUES ${processedMessages}
+    ON DUPLICATE KEY UPDATE
+      message_id = VALUE(message_id),
+      processed_content = VALUE(processed_content);
+  `);
+}
+
+export { processMessagesForSearch, segmentAndStem, stopwords };
diff --git a/keyserver/src/database/search-utils.test.js b/keyserver/src/database/search-utils.test.js
new file mode 100644
--- /dev/null
+++ b/keyserver/src/database/search-utils.test.js
@@ -0,0 +1,58 @@
+// @flow
+import { segmentAndStem, stopwords } from './search-utils.js';
+
+// Outputs that contain only ASCII letters, digits and spaces.
+const alphaNumericRegex = /^[A-Za-z0-9 ]*$/;
+// Outputs that contain only lowercase ASCII letters and spaces.
+const lowerCaseRegex = /^[a-z ]*$/;
+
+describe('segmentAndStem(message: string)', () => {
+  it('should remove punctuation', () => {
+    expect(segmentAndStem("o'clock")).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('test@example')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('100,000')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('100,000,000')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('hello, bye')).toMatch(alphaNumericRegex);
+    expect(segmentAndStem('hello []!"#$%&\'()*,./:;?@\\_{}- bye')).toMatch(
+      alphaNumericRegex,
+    );
+  });
+
+  it('should remove uppercase', () => {
+    expect(segmentAndStem('Hi Comm')).toMatch(lowerCaseRegex);
+    expect(segmentAndStem('HELLO')).toMatch(lowerCaseRegex);
+  });
+
+  it('should remove stopwords', () => {
+    const [stopWord1, stopWord2, stopWord3, stopWord4, stopWord5] = stopwords;
+    expect(segmentAndStem(`hello ${stopWord1}`)).toBe('hello');
+    expect(segmentAndStem(`${stopWord2} ${stopWord3} ${stopWord4}`)).toBe('');
+    expect(segmentAndStem(`${stopWord5} bye`)).toBe('bye');
+  });
+
+  it('should remove excess whitespace', () => {
+    // The input needs a run of several spaces; with single spaces the
+    // assertion below could never fail.
+    expect(segmentAndStem('hello    bye')).not.toMatch(/[\s]{2}/);
+  });
+
+  it('should remove unicode punctuation', () => {
+    expect(segmentAndStem('︴﹍⸺〰༻༽»⸃«⸠%¿【﹃')).toBe('');
+  });
+
+  it('should not remove emojis', () => {
+    const emojiThumbsUp = '👍';
+    const emojiFace = '🫡';
+    expect(segmentAndStem(emojiThumbsUp)).toBe(emojiThumbsUp);
+    expect(segmentAndStem(emojiFace)).toBe(emojiFace);
+  });
+
+  // These ASCII characters are in the Unicode Symbol categories rather than
+  // Punctuation, so they are expected to pass through unchanged.
+  it('should leave + < = > ^ ` | ~', () => {
+    const notRemovedASCIIPunctuation = '+ < = > ^ ` | ~';
+    expect(segmentAndStem(notRemovedASCIIPunctuation)).toBe(
+      notRemovedASCIIPunctuation,
+    );
+  });
+});

File Metadata

Mime Type
text/plain
Expires
Wed, Dec 10, 8:51 AM (14 h, 53 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5858948
Default Alt Text
D7077.1765356661.diff (5 KB)

Event Timeline