Skip to content

Commit c98b769

Browse files
authored
chore(): disable grouping by levenshtein (#433)
* chore(): disable grouping by levenshtein * chore(): remove redundant test * chore: lint fix
1 parent 6947129 commit c98b769

3 files changed

Lines changed: 82 additions & 65 deletions

File tree

convertors/move-timestamp-out-of-payload.js

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@ async function movePayloadTimestampToEventLevel(db, collectionName) {
1616

1717
const docsToUpdate = collection.find(
1818
{ timestamp: { $exists: false } },
19-
{ projection: { _id: 1, 'payload.timestamp': 1 } }
19+
{
20+
projection: {
21+
_id: 1,
22+
'payload.timestamp': 1,
23+
},
24+
}
2025
).limit(documentsSelectionLimit);
2126

2227
const batchedOps = [];
@@ -34,11 +39,11 @@ async function movePayloadTimestampToEventLevel(db, collectionName) {
3439
updateOne: {
3540
filter: { _id: doc._id },
3641
update: {
37-
$set: { timestamp: Number(doc.payload.timestamp)},
38-
$unset: {'payload.timestamp': ''},
39-
}
40-
}
41-
})
42+
$set: { timestamp: Number(doc.payload.timestamp) },
43+
$unset: { 'payload.timestamp': '' },
44+
},
45+
},
46+
});
4247

4348
currentCount++;
4449
}
@@ -47,7 +52,7 @@ async function movePayloadTimestampToEventLevel(db, collectionName) {
4752
await collection.bulkWrite(batchedOps);
4853
}
4954

50-
return currentCount
55+
return currentCount;
5156
}
5257
/**
5358
* @param db - mongo db instance
@@ -58,15 +63,21 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec
5863
const repetitions = db.collection(repetitionCollectionName);
5964
const events = db.collection(`events:${projectId}`);
6065

61-
let bulkOps = [];
66+
const bulkOps = [];
6267
let repetitionCount = 1;
6368

6469
const repetitionsList = await repetitions.find(
6570
{
6671
timestamp: { $exists: false },
6772
},
68-
{ projection: { _id: 1, groupHash: 1 } }
69-
).limit(documentsSelectionLimit).toArray();
73+
{
74+
projection: {
75+
_id: 1,
76+
groupHash: 1,
77+
},
78+
}
79+
).limit(documentsSelectionLimit)
80+
.toArray();
7081

7182
const groupHashList = [];
7283

@@ -78,24 +89,29 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec
7889

7990
const relatedEvents = await events.find(
8091
{ groupHash: { $in: groupHashList } },
81-
{ projection: { timestamp: 1, groupHash: 1 } }
92+
{
93+
projection: {
94+
timestamp: 1,
95+
groupHash: 1,
96+
},
97+
}
8298
).toArray();
8399

84-
const relatedEventsMap = new Map()
100+
const relatedEventsMap = new Map();
85101

86102
relatedEvents.forEach(e => {
87103
relatedEventsMap.set(e.groupHash, e);
88-
})
104+
});
89105

90106
for (const repetition of repetitionsList) {
91107
const relatedEvent = relatedEventsMap.get(repetition.groupHash);
92108

93109
if (!relatedEvent) {
94110
bulkOps.push({
95111
deleteOne: {
96-
filter: { _id: repetition._id }
97-
}
98-
})
112+
filter: { _id: repetition._id },
113+
},
114+
});
99115
} else if (relatedEvent?.timestamp !== null) {
100116
bulkOps.push({
101117
updateOne: {
@@ -112,11 +128,12 @@ async function backfillTimestampsFromEvents(db, repetitionCollectionName, projec
112128
const result = await repetitions.bulkWrite(bulkOps);
113129
const updated = result.modifiedCount;
114130
const deleted = result.deletedCount;
131+
115132
processed = bulkOps.length;
116133
console.log(` updates (${processed} processed, ${updated} updated, ${deleted} deleted)`);
117134

118135
if (updated + deleted === 0) {
119-
repetitionCollectionsToCheck.filter(collection => collection !== repetition)
136+
repetitionCollectionsToCheck.filter(collection => collection !== repetition);
120137
}
121138
}
122139

@@ -175,13 +192,13 @@ async function run() {
175192

176193
// Convert events
177194
let i = 1;
178-
let documentsUpdatedCount = 1
195+
let documentsUpdatedCount = 1;
179196

180197
while (documentsUpdatedCount != 0) {
181198
documentsUpdatedCount = 0;
182199
i = 1;
183200
const collectionsToUpdateCount = eventCollectionsToCheck.length;
184-
201+
185202
for (const collectionName of eventCollectionsToCheck) {
186203
console.log(`[${i}/${collectionsToUpdateCount}] Processing ${collectionName}`);
187204
const updated = await movePayloadTimestampToEventLevel(db, collectionName);
@@ -190,10 +207,10 @@ async function run() {
190207
eventCollectionsToCheck = eventCollectionsToCheck.filter(collection => collection !== collectionName);
191208
}
192209

193-
documentsUpdatedCount += updated
210+
documentsUpdatedCount += updated;
194211
i++;
195212
}
196-
}
213+
}
197214

198215
// Convert repetitions + backfill from events
199216
documentsUpdatedCount = 1;

workers/grouper/src/index.ts

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ import HawkCatcher from '@hawk.so/nodejs';
2323
import { MS_IN_SEC } from '../../../lib/utils/consts';
2424
import DataFilter from './data-filter';
2525
import RedisHelper from './redisHelper';
26-
import levenshtein from 'js-levenshtein';
26+
// import levenshtein from 'js-levenshtein';
2727
import { computeDelta } from './utils/repetitionDiff';
2828
import TimeMs from '../../../lib/utils/time';
2929
import { rightTrim } from '../../../lib/utils/string';
@@ -109,7 +109,7 @@ export default class GrouperWorker extends Worker {
109109
let existedEvent = await this.getEvent(task.projectId, uniqueEventHash);
110110

111111
/**
112-
* If we couldn't group by group hash (title), try grouping by Levenshtein distance or patterns
112+
* If we couldn't group by group hash (title), try grouping by patterns
113113
*/
114114
if (!existedEvent) {
115115
const similarEvent = await this.findSimilarEvent(task.projectId, task.payload);
@@ -287,35 +287,35 @@ export default class GrouperWorker extends Worker {
287287
* @param event - event to compare
288288
*/
289289
private async findSimilarEvent(projectId: string, event: EventData<EventAddons>): Promise<GroupedEventDBScheme | undefined> {
290-
const eventsCountToCompare = 60;
291-
const diffTreshold = 0.35;
290+
// const eventsCountToCompare = 60;
291+
// const diffTreshold = 0.35;
292292

293-
const lastUniqueEvents = await this.findLastEvents(projectId, eventsCountToCompare);
293+
// const lastUniqueEvents = await this.findLastEvents(projectId, eventsCountToCompare);
294294

295295
/**
296296
* Trim titles to reduce CPU usage for Levenshtein comparison
297297
*/
298-
const trimmedEventTitle = hasValue(event.title) ? rightTrim(event.title, MAX_CODE_LINE_LENGTH) : '';
298+
// const trimmedEventTitle = hasValue(event.title) ? rightTrim(event.title, MAX_CODE_LINE_LENGTH) : '';
299299

300300
/**
301301
* First try to find by Levenshtein distance
302302
*/
303-
const similarByLevenshtein = lastUniqueEvents.filter(prevEvent => {
304-
const trimmedPrevTitle = hasValue(prevEvent.payload.title) ? rightTrim(prevEvent.payload.title, MAX_CODE_LINE_LENGTH) : '';
303+
// const similarByLevenshtein = lastUniqueEvents.filter(prevEvent => {
304+
// const trimmedPrevTitle = hasValue(prevEvent.payload.title) ? rightTrim(prevEvent.payload.title, MAX_CODE_LINE_LENGTH) : '';
305305

306-
if (trimmedEventTitle === '' || trimmedPrevTitle === '') {
307-
return false;
308-
}
306+
// if (trimmedEventTitle === '' || trimmedPrevTitle === '') {
307+
// return false;
308+
// }
309309

310-
const distance = levenshtein(trimmedEventTitle, trimmedPrevTitle);
311-
const threshold = trimmedEventTitle.length * diffTreshold;
310+
// const distance = levenshtein(trimmedEventTitle, trimmedPrevTitle);
311+
// const threshold = trimmedEventTitle.length * diffTreshold;
312312

313-
return distance < threshold;
314-
}).pop();
313+
// return distance < threshold;
314+
// }).pop();
315315

316-
if (similarByLevenshtein) {
317-
return similarByLevenshtein;
318-
}
316+
// if (similarByLevenshtein) {
317+
// return similarByLevenshtein;
318+
// }
319319

320320
/**
321321
* If no match by Levenshtein, try matching by patterns
@@ -402,23 +402,23 @@ export default class GrouperWorker extends Worker {
402402
* @param count - how many events to return
403403
* @returns {GroupedEventDBScheme[]} list of the last N unique events
404404
*/
405-
private findLastEvents(projectId: string, count: number): Promise<GroupedEventDBScheme[]> {
406-
return this.cache.get(`last:${count}:eventsOf:${projectId}`, async () => {
407-
return this.eventsDb.getConnection()
408-
.collection(`events:${projectId}`)
409-
.find()
410-
.sort({
411-
_id: 1,
412-
})
413-
.limit(count)
414-
.toArray();
415-
},
416-
/**
417-
* TimeMs class stores time intervals in milliseconds, however NodeCache ttl needs to be specified in seconds
418-
*/
419-
/* eslint-disable-next-line @typescript-eslint/no-magic-numbers */
420-
TimeMs.MINUTE / 1000);
421-
}
405+
// private findLastEvents(projectId: string, count: number): Promise<GroupedEventDBScheme[]> {
406+
// return this.cache.get(`last:${count}:eventsOf:${projectId}`, async () => {
407+
// return this.eventsDb.getConnection()
408+
// .collection(`events:${projectId}`)
409+
// .find()
410+
// .sort({
411+
// _id: 1,
412+
// })
413+
// .limit(count)
414+
// .toArray();
415+
// },
416+
// /**
417+
// * TimeMs class stores time intervals in milliseconds, however NodeCache ttl needs to be specified in seconds
418+
// */
419+
// /* eslint-disable-next-line @typescript-eslint/no-magic-numbers */
420+
// TimeMs.MINUTE / 1000);
421+
// }
422422

423423
/**
424424
* Decides whether to increase the number of affected users for the repetition and the daily aggregation

workers/grouper/tests/index.test.ts

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -457,17 +457,17 @@ describe('GrouperWorker', () => {
457457
});
458458

459459
describe('Grouping', () => {
460-
test('should group events with partially different titles', async () => {
461-
await worker.handle(generateTask({ title: 'Some error (but not filly identical) example' }));
462-
await worker.handle(generateTask({ title: 'Some error (yes, it is not the identical) example' }));
463-
await worker.handle(generateTask({ title: 'Some error (and it is not identical) example' }));
460+
// test('should group events with partially different titles', async () => {
461+
// await worker.handle(generateTask({ title: 'Some error (but not filly identical) example' }));
462+
// await worker.handle(generateTask({ title: 'Some error (yes, it is not the identical) example' }));
463+
// await worker.handle(generateTask({ title: 'Some error (and it is not identical) example' }));
464464

465-
const originalEvent = await eventsCollection.findOne({});
465+
// const originalEvent = await eventsCollection.findOne({});
466466

467-
expect((await repetitionsCollection.find({
468-
groupHash: originalEvent.groupHash,
469-
}).toArray()).length).toBe(2);
470-
});
467+
// expect((await repetitionsCollection.find({
468+
// groupHash: originalEvent.groupHash,
469+
// }).toArray()).length).toBe(2);
470+
// });
471471

472472
describe('Pattern matching', () => {
473473
beforeEach(() => {

0 commit comments

Comments
 (0)