diff --git a/convertors/move-timestamp-out-of-payload.js b/convertors/move-timestamp-out-of-payload.js index 97f6b369..225b4ad8 100644 --- a/convertors/move-timestamp-out-of-payload.js +++ b/convertors/move-timestamp-out-of-payload.js @@ -16,7 +16,12 @@ async function movePayloadTimestampToEventLevel(db, collectionName) { const docsToUpdate = collection.find( { timestamp: { $exists: false } }, - { projection: { _id: 1, 'payload.timestamp': 1 } } + { + projection: { + _id: 1, + 'payload.timestamp': 1, + }, + } ).limit(documentsSelectionLimit); const batchedOps = []; @@ -34,11 +39,11 @@ async function movePayloadTimestampToEventLevel(db, collectionName) { updateOne: { filter: { _id: doc._id }, update: { - $set: { timestamp: Number(doc.payload.timestamp)}, - $unset: {'payload.timestamp': ''}, - } - } - }) + $set: { timestamp: Number(doc.payload.timestamp) }, + $unset: { 'payload.timestamp': '' }, + }, + }, + }); currentCount++; } @@ -47,7 +52,7 @@ async function movePayloadTimestampToEventLevel(db, collectionName) { await collection.bulkWrite(batchedOps); } - return currentCount + return currentCount; } /** * @param db - mongo db instance @@ -58,15 +63,21 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec const repetitions = db.collection(repetitionCollectionName); const events = db.collection(`events:${projectId}`); - let bulkOps = []; + const bulkOps = []; let repetitionCount = 1; const repetitionsList = await repetitions.find( { timestamp: { $exists: false }, }, - { projection: { _id: 1, groupHash: 1 } } - ).limit(documentsSelectionLimit).toArray(); + { + projection: { + _id: 1, + groupHash: 1, + }, + } + ).limit(documentsSelectionLimit) + .toArray(); const groupHashList = []; @@ -78,14 +89,19 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec const relatedEvents = await events.find( { groupHash: { $in: groupHashList } }, - { projection: { timestamp: 1, groupHash: 1 } } + { + projection: { + timestamp: 1, + groupHash: 1, + }, + } ).toArray(); - const relatedEventsMap = new Map() + const relatedEventsMap = new Map(); relatedEvents.forEach(e => { relatedEventsMap.set(e.groupHash, e); - }) + }); for (const repetition of repetitionsList) { const relatedEvent = relatedEventsMap.get(repetition.groupHash); @@ -93,9 +109,9 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec if (!relatedEvent) { bulkOps.push({ deleteOne: { - filter: { _id: repetition._id } - } - }) + filter: { _id: repetition._id }, + }, + }); } else if (relatedEvent?.timestamp !== null) { bulkOps.push({ updateOne: { @@ -112,11 +128,12 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec const result = await repetitions.bulkWrite(bulkOps); const updated = result.modifiedCount; const deleted = result.deletedCount; + processed = bulkOps.length; console.log(` updates (${processed} processed, ${updated} updated, ${deleted} deleted)`); if (updated + deleted === 0) { - repetitionCollectionsToCheck.filter(collection => collection !== repetition) + repetitionCollectionsToCheck.filter(collection => collection !== repetition); } } @@ -175,13 +192,13 @@ async function run() { // Convert events let i = 1; - let documentsUpdatedCount = 1 + let documentsUpdatedCount = 1; while (documentsUpdatedCount != 0) { documentsUpdatedCount = 0; i = 1; const collectionsToUpdateCount = eventCollectionsToCheck.length; - + for (const collectionName of eventCollectionsToCheck) { console.log(`[${i}/${collectionsToUpdateCount}] Processing ${collectionName}`); const updated = await movePayloadTimestampToEventLevel(db, collectionName); @@ -190,10 +207,10 @@ async function run() { eventCollectionsToCheck = eventCollectionsToCheck.filter(collection => collection !== collectionName); } - documentsUpdatedCount += updated + documentsUpdatedCount += updated; i++; } - } + } // Convert repetitions + backfill from events documentsUpdatedCount = 1; diff --git a/workers/grouper/src/index.ts b/workers/grouper/src/index.ts index d6a01caf..4a2809aa 100644 --- a/workers/grouper/src/index.ts +++ b/workers/grouper/src/index.ts @@ -23,7 +23,7 @@ import HawkCatcher from '@hawk.so/nodejs'; import { MS_IN_SEC } from '../../../lib/utils/consts'; import DataFilter from './data-filter'; import RedisHelper from './redisHelper'; -import levenshtein from 'js-levenshtein'; +// import levenshtein from 'js-levenshtein'; import { computeDelta } from './utils/repetitionDiff'; import TimeMs from '../../../lib/utils/time'; import { rightTrim } from '../../../lib/utils/string'; @@ -109,7 +109,7 @@ export default class GrouperWorker extends Worker { let existedEvent = await this.getEvent(task.projectId, uniqueEventHash); /** - * If we couldn't group by group hash (title), try grouping by Levenshtein distance or patterns + * If we couldn't group by group hash (title), try grouping by patterns */ if (!existedEvent) { const similarEvent = await this.findSimilarEvent(task.projectId, task.payload); @@ -287,35 +287,35 @@ export default class GrouperWorker extends Worker { * @param event - event to compare */ private async findSimilarEvent(projectId: string, event: EventData): Promise { - const eventsCountToCompare = 60; - const diffTreshold = 0.35; + // const eventsCountToCompare = 60; + // const diffTreshold = 0.35; - const lastUniqueEvents = await this.findLastEvents(projectId, eventsCountToCompare); + // const lastUniqueEvents = await this.findLastEvents(projectId, eventsCountToCompare); /** * Trim titles to reduce CPU usage for Levenshtein comparison */ - const trimmedEventTitle = hasValue(event.title) ? rightTrim(event.title, MAX_CODE_LINE_LENGTH) : ''; + // const trimmedEventTitle = hasValue(event.title) ? rightTrim(event.title, MAX_CODE_LINE_LENGTH) : ''; /** * First try to find by Levenshtein distance */ - const similarByLevenshtein = lastUniqueEvents.filter(prevEvent => { - const trimmedPrevTitle = hasValue(prevEvent.payload.title) ? rightTrim(prevEvent.payload.title, MAX_CODE_LINE_LENGTH) : ''; + // const similarByLevenshtein = lastUniqueEvents.filter(prevEvent => { + // const trimmedPrevTitle = hasValue(prevEvent.payload.title) ? rightTrim(prevEvent.payload.title, MAX_CODE_LINE_LENGTH) : ''; - if (trimmedEventTitle === '' || trimmedPrevTitle === '') { - return false; - } + // if (trimmedEventTitle === '' || trimmedPrevTitle === '') { + // return false; + // } - const distance = levenshtein(trimmedEventTitle, trimmedPrevTitle); - const threshold = trimmedEventTitle.length * diffTreshold; + // const distance = levenshtein(trimmedEventTitle, trimmedPrevTitle); + // const threshold = trimmedEventTitle.length * diffTreshold; - return distance < threshold; - }).pop(); + // return distance < threshold; + // }).pop(); - if (similarByLevenshtein) { - return similarByLevenshtein; - } + // if (similarByLevenshtein) { + // return similarByLevenshtein; + // } /** * If no match by Levenshtein, try matching by patterns @@ -402,23 +402,23 @@ export default class GrouperWorker extends Worker { * @param count - how many events to return * @returns {GroupedEventDBScheme[]} list of the last N unique events */ - private findLastEvents(projectId: string, count: number): Promise { - return this.cache.get(`last:${count}:eventsOf:${projectId}`, async () => { - return this.eventsDb.getConnection() - .collection(`events:${projectId}`) - .find() - .sort({ - _id: 1, - }) - .limit(count) - .toArray(); - }, - /** - * TimeMs class stores time intervals in milliseconds, however NodeCache ttl needs to be specified in seconds - */ - /* eslint-disable-next-line @typescript-eslint/no-magic-numbers */ - TimeMs.MINUTE / 1000); - } + // private findLastEvents(projectId: string, count: number): Promise { + // return this.cache.get(`last:${count}:eventsOf:${projectId}`, async () => { + // return this.eventsDb.getConnection() + // .collection(`events:${projectId}`) + // .find() + // .sort({ + // _id: 1, + // }) + // .limit(count) + // .toArray(); + // }, + // /** + // * TimeMs class stores time intervals in milliseconds, however NodeCache ttl needs to be specified in seconds + // */ + // /* eslint-disable-next-line @typescript-eslint/no-magic-numbers */ + // TimeMs.MINUTE / 1000); + // } /** * Decides whether to increase the number of affected users for the repetition and the daily aggregation diff --git a/workers/grouper/tests/index.test.ts b/workers/grouper/tests/index.test.ts index bf3ea84e..d3f5649e 100644 --- a/workers/grouper/tests/index.test.ts +++ b/workers/grouper/tests/index.test.ts @@ -457,17 +457,17 @@ describe('GrouperWorker', () => { }); describe('Grouping', () => { - test('should group events with partially different titles', async () => { - await worker.handle(generateTask({ title: 'Some error (but not filly identical) example' })); - await worker.handle(generateTask({ title: 'Some error (yes, it is not the identical) example' })); - await worker.handle(generateTask({ title: 'Some error (and it is not identical) example' })); + // test('should group events with partially different titles', async () => { + // await worker.handle(generateTask({ title: 'Some error (but not filly identical) example' })); + // await worker.handle(generateTask({ title: 'Some error (yes, it is not the identical) example' })); + // await worker.handle(generateTask({ title: 'Some error (and it is not identical) example' })); - const originalEvent = await eventsCollection.findOne({}); + // const originalEvent = await eventsCollection.findOne({}); - expect((await repetitionsCollection.find({ - groupHash: originalEvent.groupHash, - }).toArray()).length).toBe(2); - }); + // expect((await repetitionsCollection.find({ + // groupHash: originalEvent.groupHash, + // }).toArray()).length).toBe(2); + // }); describe('Pattern matching', () => { beforeEach(() => {