Skip to content

Commit 33411ae

Browse files
committed
fix: improve member merge suggestions for split git email profiles (CM-1137)
Signed-off-by: Yeganathan S <63534555+skwowet@users.noreply.github.com>
1 parent 19cae0e commit 33411ae

6 files changed

Lines changed: 242 additions & 35 deletions

File tree

services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
/* eslint-disable @typescript-eslint/no-explicit-any */
22
import uniqBy from 'lodash.uniqby'
33

4-
import { parseGitHubNoreplyEmail } from '@crowd/common'
4+
import { parseGitHubNoreplyEmail, parseGitLabNoreplyEmail } from '@crowd/common'
55
import { addMemberNoMerge } from '@crowd/data-access-layer/src/member_merge'
66
import { MemberField, queryMembers } from '@crowd/data-access-layer/src/members'
77
import MemberMergeSuggestionsRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo'
@@ -119,6 +119,10 @@ export async function getMemberMergeSuggestions(
119119
if (ghUsername) {
120120
noreplyEmailUsernameMatches.push({ value: ghUsername, platform: PlatformType.GITHUB })
121121
}
122+
const glUsername = parseGitLabNoreplyEmail(value)
123+
if (glUsername) {
124+
noreplyEmailUsernameMatches.push({ value: glUsername, platform: PlatformType.GITLAB })
125+
}
122126
}
123127

124128
// Fuzzy matches (only for verified & non-numeric)
@@ -213,8 +217,8 @@ export async function getMemberMergeSuggestions(
213217
}),
214218
},
215219
{
216-
// Query 8: Noreply/private email -> username (verified or unverified)
217-
matches: uniqBy(noreplyEmailUsernameMatches, 'value'),
220+
// Query 8: Noreply email -> platform username (verified identities only)
221+
matches: uniqBy(noreplyEmailUsernameMatches, (m) => `${m.platform}:${m.value}`),
218222
builder: ({ value, platform }) => ({
219223
bool: {
220224
must: [

services/apps/merge_suggestions_worker/src/memberSimilarityCalculator.ts

Lines changed: 157 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,12 @@
11
import { get as getLevenshteinDistance } from 'fast-levenshtein'
22

3-
import { parseGitHubNoreplyEmail } from '@crowd/common'
3+
import {
4+
getEmailLocalPart,
5+
isKnownBot,
6+
isLocalMachineEmail,
7+
parseGitHubNoreplyEmail,
8+
parseGitLabNoreplyEmail,
9+
} from '@crowd/common'
410
import {
511
IMemberIdentity,
612
IMemberOpensearch,
@@ -14,6 +20,7 @@ import {
1420

1521
import { EMAIL_AS_USERNAME_PLATFORMS } from './enums'
1622
import { MemberAttributeOpensearch } from './enums'
23+
import { isOsReservedName } from './utils'
1724

1825
class MemberSimilarityCalculator {
1926
static HIGH_CONFIDENCE_SCORE = 0.9
@@ -194,28 +201,84 @@ class MemberSimilarityCalculator {
194201
): boolean {
195202
if (member.identities && member.identities.length > 0) {
196203
for (const identity of member.identities.filter(
197-
(i) => i.type === MemberIdentityType.USERNAME,
204+
(i) => i.type === MemberIdentityType.USERNAME && i.verified,
198205
)) {
206+
const clashingIdentities = similarMember.nested_identities.filter(
207+
(i) =>
208+
i.keyword_type === MemberIdentityType.USERNAME &&
209+
i.bool_verified === true &&
210+
i.string_platform === identity.platform &&
211+
i.keyword_value !== identity.value,
212+
)
213+
214+
if (clashingIdentities.length === 0) continue
215+
216+
// git "usernames" are commit-author emails — a single person commits from many addresses
217+
// (work, personal, noreply, machine-local). Different values don't mean different people.
218+
// Relax only when ALL clashes are email-like and the displayName is a real identity.
199219
if (
200-
similarMember.nested_identities.some(
201-
(i) =>
202-
i.keyword_type === MemberIdentityType.USERNAME &&
203-
i.string_platform === identity.platform &&
204-
i.keyword_value !== identity.value,
205-
)
220+
identity.platform === PlatformType.GIT &&
221+
identity.value.includes('@') &&
222+
clashingIdentities.every((c) => c.keyword_value?.includes('@')) &&
223+
!this.isNonIdentityBearingDisplayName(member, similarMember)
206224
) {
207-
return true
225+
continue
208226
}
227+
228+
return true
209229
}
210230
}
211231

212232
return false
213233
}
214234

215-
/**
216-
* Checks bidirectionally if a noreply address on one member resolves to a platform username on the other.
217-
* No identity type filter — git ingest stores noreply addresses as type=username, not type=email.
218-
*/
235+
// Returns true when either member's displayName is a bot or a generic placeholder rather than
236+
// a real person's name — we keep conservative clash detection for these to avoid merging
237+
// distinct entities that share the same non-identity string.
238+
private static isNonIdentityBearingDisplayName(
239+
member: IMemberWithAggregatesForMergeSuggestions,
240+
similarMember: IMemberOpensearch,
241+
): boolean {
242+
if (member.attributes?.[MemberAttributeName.IS_BOT]?.default === true) return true
243+
// obj_isBot is a boolean attribute indexed under obj_attributes but not in the typed enum
244+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
245+
if ((similarMember.obj_attributes as any)?.obj_isBot?.bool_default === true) return true
246+
if (isKnownBot(member.displayName ?? '') || isKnownBot(similarMember.keyword_displayName ?? ''))
247+
return true
248+
249+
const primaryDn = (member.displayName ?? '').toLowerCase().trim()
250+
const similarDn = (similarMember.keyword_displayName ?? '').toLowerCase().trim()
251+
252+
if (isOsReservedName(primaryDn) || isOsReservedName(similarDn)) return true
253+
254+
// When a git/username local-part equals the displayName, the name is just the commit-time
255+
// default (e.g. user.name="user", user.email="user@laptop.local").
256+
for (const id of member.identities) {
257+
if (
258+
id.platform === PlatformType.GIT &&
259+
id.type === MemberIdentityType.USERNAME &&
260+
id.value.includes('@')
261+
) {
262+
const lp = getEmailLocalPart(id.value)
263+
if (lp === primaryDn || lp === similarDn) return true
264+
}
265+
}
266+
for (const id of similarMember.nested_identities) {
267+
if (
268+
id.string_platform === PlatformType.GIT &&
269+
id.keyword_type === MemberIdentityType.USERNAME &&
270+
id.string_value?.includes('@')
271+
) {
272+
const lp = getEmailLocalPart(id.string_value)
273+
if (lp === primaryDn || lp === similarDn) return true
274+
}
275+
}
276+
277+
return false
278+
}
279+
280+
// Checks if a noreply address on either member resolves to the other's platform username.
281+
// Noreply identities can be ingested as type=email or type=username depending on the source.
219282
static hasMatchingUsernameFromNoreplyEmail(
220283
primaryMember: IMemberWithAggregatesForMergeSuggestions,
221284
similarMember: IMemberOpensearch,
@@ -231,6 +294,15 @@ class MemberSimilarityCalculator {
231294
)
232295
.map((i) => i.string_value?.toLowerCase()),
233296
),
297+
[PlatformType.GITLAB]: new Set(
298+
similarMember.nested_identities
299+
.filter(
300+
(i) =>
301+
i.string_platform === PlatformType.GITLAB &&
302+
i.keyword_type === MemberIdentityType.USERNAME,
303+
)
304+
.map((i) => i.string_value?.toLowerCase()),
305+
),
234306
}
235307

236308
for (const identity of primaryMember.identities) {
@@ -240,6 +312,11 @@ class MemberSimilarityCalculator {
240312
if (ghUsername && similarUsernamesByPlatform[PlatformType.GITHUB].has(ghUsername)) {
241313
return true
242314
}
315+
316+
const glUsername = parseGitLabNoreplyEmail(identity.value)
317+
if (glUsername && similarUsernamesByPlatform[PlatformType.GITLAB].has(glUsername)) {
318+
return true
319+
}
243320
}
244321

245322
// Similar member's noreply -> primary member's platform usernames
@@ -251,6 +328,13 @@ class MemberSimilarityCalculator {
251328
)
252329
.map((i) => i.value?.toLowerCase()),
253330
),
331+
[PlatformType.GITLAB]: new Set(
332+
primaryMember.identities
333+
.filter(
334+
(i) => i.platform === PlatformType.GITLAB && i.type === MemberIdentityType.USERNAME,
335+
)
336+
.map((i) => i.value?.toLowerCase()),
337+
),
254338
}
255339

256340
for (const identity of similarMember.nested_identities) {
@@ -260,6 +344,11 @@ class MemberSimilarityCalculator {
260344
if (ghUsername && primaryUsernamesByPlatform[PlatformType.GITHUB].has(ghUsername)) {
261345
return true
262346
}
347+
348+
const glUsername = parseGitLabNoreplyEmail(identity.string_value)
349+
if (glUsername && primaryUsernamesByPlatform[PlatformType.GITLAB].has(glUsername)) {
350+
return true
351+
}
263352
}
264353

265354
return false
@@ -379,10 +468,15 @@ class MemberSimilarityCalculator {
379468
similarMember: IMemberOpensearch,
380469
startingScore?: number,
381470
): number {
382-
let isHighConfidence = false
383-
let confidenceScore = startingScore || this.HIGH_CONFIDENCE_SCORE
471+
// displayName equality (startingScore omitted) is itself a high-confidence signal — don't
472+
// gate on metadata. Edit-distance callers pass a score and must have at least one signal.
473+
let isHighConfidence = startingScore === undefined
474+
let confidenceScore =
475+
startingScore != null && Number.isFinite(startingScore)
476+
? startingScore
477+
: this.HIGH_CONFIDENCE_SCORE
384478

385-
const bumpFactor = Math.floor((1 - confidenceScore) / 5)
479+
const bumpFactor = (1 - confidenceScore) / 6
386480

387481
if (this.hasSameLocation(member, similarMember)) {
388482
isHighConfidence = true
@@ -409,13 +503,60 @@ class MemberSimilarityCalculator {
409503
confidenceScore = this.bumpConfidenceScore(confidenceScore, bumpFactor)
410504
}
411505

506+
// Catches contributors whose only identity is commits from an unconfigured machine —
507+
// same local-part across *.local / *.lan addresses is a strong hint they're the same person.
508+
if (this.hasSameLocalPartInGitUsername(member, similarMember)) {
509+
isHighConfidence = true
510+
confidenceScore = this.bumpConfidenceScore(confidenceScore, bumpFactor)
511+
}
512+
412513
if (!isHighConfidence) {
413514
return this.LOW_CONFIDENCE_SCORE
414515
}
415516

416517
return confidenceScore
417518
}
418519

520+
static hasSameLocalPartInGitUsername(
521+
member: IMemberWithAggregatesForMergeSuggestions,
522+
similarMember: IMemberOpensearch,
523+
): boolean {
524+
const primaryDn = (member.displayName ?? '').toLowerCase().trim()
525+
const similarDn = (similarMember.keyword_displayName ?? '').toLowerCase().trim()
526+
const isUsable = (lp: string) =>
527+
lp.length > 0 &&
528+
lp !== primaryDn &&
529+
lp !== similarDn &&
530+
!isKnownBot(lp) &&
531+
!isOsReservedName(lp)
532+
533+
const primaryLocalParts = member.identities
534+
.filter(
535+
(i) =>
536+
i.platform === PlatformType.GIT &&
537+
i.type === MemberIdentityType.USERNAME &&
538+
isLocalMachineEmail(i.value),
539+
)
540+
.map((i) => getEmailLocalPart(i.value))
541+
.filter(isUsable)
542+
543+
if (primaryLocalParts.length === 0) return false
544+
545+
const similarLocalParts = new Set(
546+
similarMember.nested_identities
547+
.filter(
548+
(i) =>
549+
i.string_platform === PlatformType.GIT &&
550+
i.keyword_type === MemberIdentityType.USERNAME &&
551+
isLocalMachineEmail(i.string_value ?? ''),
552+
)
553+
.map((i) => getEmailLocalPart(i.string_value ?? ''))
554+
.filter(isUsable),
555+
)
556+
557+
return primaryLocalParts.some((lp) => similarLocalParts.has(lp))
558+
}
559+
419560
static bumpConfidenceScore(confidenceScore: number, bump: number): number {
420561
return Math.min(1, confidenceScore + bump)
421562
}

services/apps/merge_suggestions_worker/src/utils.ts

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,13 @@ import { ILLMConsumableMember, PlatformType } from '@crowd/types'
22

33
import { EMAIL_AS_USERNAME_PLATFORMS } from './enums'
44

5+
const NOREPLY_SUFFIXES = ['@users.noreply.github.com', '@users.noreply.gitlab.com']
6+
7+
function isNoreplyEmail(value: string): boolean {
8+
const lower = value.toLowerCase()
9+
return NOREPLY_SUFFIXES.some((s) => lower.endsWith(s))
10+
}
11+
512
export const prefixLength = (string: string) => {
613
if (string.length > 5 && string.length < 8) {
714
return 6
@@ -21,15 +28,12 @@ export function chunkArray<T>(array: T[], chunkSize: number): T[][] {
2128
export const removeEmailLikeIdentitiesFromMember = (
2229
member: ILLMConsumableMember,
2330
): ILLMConsumableMember => {
24-
const nonEmailIdentities: { platform: string; value: string }[] = []
25-
for (const identity of member.identities) {
26-
if (identity.value.indexOf('@') === -1) {
27-
// remove found identity from member.identities
28-
nonEmailIdentities.push(identity)
29-
}
30-
}
31-
32-
member.identities = nonEmailIdentities
31+
// Strip plain emails to avoid the LLM making false matches via shared domain suffixes.
32+
// Noreply addresses are kept — their local-part is a provider-authoritative username and
33+
// gives the LLM a meaningful identity signal.
34+
member.identities = member.identities.filter(
35+
(identity) => !identity.value.includes('@') || isNoreplyEmail(identity.value),
36+
)
3337

3438
return member
3539
}
@@ -45,3 +49,11 @@ export function isEmailAsUsernamePlatform(platform: PlatformType) {
4549
export function stripProtocol(value: string) {
4650
return value.replace(/^https?:\/\//, '')
4751
}
52+
53+
// Generic git user.name placeholders and OS account names that are never a real human
54+
// identity — shared across unrelated machines and meaningless as a merge signal.
55+
const OS_RESERVED_NAMES = new Set(['unknown', 'root', 'ubuntu', 'admin', 'user', 'guest'])
56+
57+
export function isOsReservedName(name: string): boolean {
58+
return OS_RESERVED_NAMES.has(name.trim().toLowerCase())
59+
}

services/apps/merge_suggestions_worker/src/workflows/mergeMembersWithLLM.ts

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,9 +42,10 @@ export async function mergeMembersWithLLM(
4242
3. Attributes and other fields: If one member have a specific field and other member doesn't, skip that field when deciding similarity.
4343
Checking semantically instead of literally is important for such fields. Important fields here are: location, timezone, languages, programming languages.
4444
For example one member might have Berlin in location, while other can have Germany - consider such members have same location.
45-
4. Display Name: Tokenize using both character and word tokenization. When the display name is more than one word, and the difference is a few edit distances consider it a strong indication of similarity.
46-
When one display name is contained by the other, check other fields for the final decision. The same members on different platforms might have different display names.
47-
Display names can be multiple words and might be sorted in different order in different platforms for the same member.
45+
4. Display Name: Tokenize using both character and word tokenization. When the display name is more than one word, and the difference is a few edit distances consider it a strong indication of similarity.
46+
When one display name is contained by the other, check other fields for the final decision. The same members on different platforms might have different display names.
47+
Display names can be multiple words and might be sorted in different order in different platforms for the same member. Display name is a supporting signal only — it is never sufficient on its own.
48+
If display name is the only thing that matches and there are no corroborating signals from identities, organizations, or attributes, return 'false'.
4849
CRITICAL RULE - NEVER MERGE IF SAME PLATFORM WITH DIFFERENT VALUES:
4950
Before making any decision, you MUST check if both members have identities on the same platform.
5051
If member1.identities[x].platform === member2.identities[y].platform (they share a platform), then:

services/libs/common/src/constants/email-providers.ts

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,10 @@
1+
export const noreplyEmailProviders = {
2+
github: '@users.noreply.github.com',
3+
gitlab: '@users.noreply.gitlab.com',
4+
} as const
5+
16
export const emailProviders = new Set([
7+
'users.noreply.gitlab.com',
28
'gmail.com',
39
'gmail.co.uk',
410
'gmail.com.au',

0 commit comments

Comments
 (0)