Skip to content

Commit b8cf307

Browse files
skwowet and claude committed
fix: improve member merge suggestions for split git email profiles (CM-1137)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 0af8dc1 commit b8cf307

5 files changed

Lines changed: 222 additions & 36 deletions

File tree

services/apps/merge_suggestions_worker/src/activities/memberMergeSuggestions.ts

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
/* eslint-disable @typescript-eslint/no-explicit-any */
22
import uniqBy from 'lodash.uniqby'
33

4-
import { parseGitHubNoreplyEmail } from '@crowd/common'
4+
import { parseGitHubNoreplyEmail, parseGitLabNoreplyEmail } from '@crowd/common'
55
import { addMemberNoMerge } from '@crowd/data-access-layer/src/member_merge'
66
import { MemberField, queryMembers } from '@crowd/data-access-layer/src/members'
77
import MemberMergeSuggestionsRepository from '@crowd/data-access-layer/src/old/apps/merge_suggestions_worker/memberMergeSuggestions.repo'
@@ -113,12 +113,18 @@ export async function getMemberMergeSuggestions(
113113
targetLists.usernameEmail.push({ value })
114114
}
115115

116-
// Noreply email -> platform username extraction
117-
if (isEmail && verified) {
116+
// Noreply identity -> platform username extraction.
117+
// Noreply identities may be ingested as type=email or type=username depending on the source,
118+
// so we check both.
119+
if ((isEmail || isUsername) && verified) {
118120
const ghUsername = parseGitHubNoreplyEmail(value)
119121
if (ghUsername) {
120122
noreplyEmailUsernameMatches.push({ value: ghUsername, platform: PlatformType.GITHUB })
121123
}
124+
const glUsername = parseGitLabNoreplyEmail(value)
125+
if (glUsername) {
126+
noreplyEmailUsernameMatches.push({ value: glUsername, platform: PlatformType.GITLAB })
127+
}
122128
}
123129

124130
// Fuzzy matches (only for verified & non-numeric)
@@ -214,7 +220,7 @@ export async function getMemberMergeSuggestions(
214220
},
215221
{
216222
// Query 8: Noreply/private email -> username (verified or unverified)
217-
matches: uniqBy(noreplyEmailUsernameMatches, 'value'),
223+
matches: uniqBy(noreplyEmailUsernameMatches, (m) => `${m.platform}:${m.value}`),
218224
builder: ({ value, platform }) => ({
219225
bool: {
220226
must: [

services/apps/merge_suggestions_worker/src/memberSimilarityCalculator.ts

Lines changed: 147 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,12 @@
11
import { get as getLevenshteinDistance } from 'fast-levenshtein'
22

3-
import { parseGitHubNoreplyEmail } from '@crowd/common'
3+
import {
4+
getEmailLocalPart,
5+
isKnownBot,
6+
isLocalMachineEmail,
7+
parseGitHubNoreplyEmail,
8+
parseGitLabNoreplyEmail,
9+
} from '@crowd/common'
410
import {
511
IMemberIdentity,
612
IMemberOpensearch,
@@ -196,32 +202,87 @@ class MemberSimilarityCalculator {
196202
for (const identity of member.identities.filter(
197203
(i) => i.type === MemberIdentityType.USERNAME,
198204
)) {
205+
const clashingIdentity = similarMember.nested_identities.find(
206+
(i) =>
207+
i.keyword_type === MemberIdentityType.USERNAME &&
208+
i.string_platform === identity.platform &&
209+
i.keyword_value !== identity.value,
210+
)
211+
212+
if (!clashingIdentity) continue
213+
214+
// git "usernames" are commit-author emails — a single person commits from many addresses
215+
// (work, personal, noreply, machine-local). Different values don't mean different people.
216+
// Skip the clash unless the displayName is a bot or placeholder rather than a real identity.
199217
if (
200-
similarMember.nested_identities.some(
201-
(i) =>
202-
i.keyword_type === MemberIdentityType.USERNAME &&
203-
i.string_platform === identity.platform &&
204-
i.keyword_value !== identity.value,
205-
)
218+
identity.platform === PlatformType.GIT &&
219+
identity.value.includes('@') &&
220+
clashingIdentity.keyword_value?.includes('@') &&
221+
!this.isNonIdentityBearingDisplayName(member, similarMember)
206222
) {
207-
return true
223+
continue
208224
}
225+
226+
return true
209227
}
210228
}
211229

212230
return false
213231
}
214232

215-
/**
216-
* Checks if a noreply email in one member matches a username in the other (e.g. GitHub noreply email -> GitHub username)
217-
* Works bidirectionally: primary email -> similar username, and similar email -> primary username.
218-
*/
233+
// Decides whether the display names involved are unsuitable as evidence of a real person.
// Yields true when the primary member carries the IS_BOT attribute, when either display name
// is recognised by isKnownBot, when either normalised display name is one of the OS-default
// account names below, or when a git username identity containing '@' on either member has a
// local-part equal to one of the two normalised display names. Otherwise yields false.
private static isNonIdentityBearingDisplayName(
  member: IMemberWithAggregatesForMergeSuggestions,
  similarMember: IMemberOpensearch,
): boolean {
  const rawPrimaryName = member.displayName ?? ''
  const rawSimilarName = similarMember.keyword_displayName ?? ''

  const flaggedAsBot = member.attributes?.[MemberAttributeName.IS_BOT]?.default === true
  if (flaggedAsBot || isKnownBot(rawPrimaryName) || isKnownBot(rawSimilarName)) {
    return true
  }

  // Lower-cased, trimmed display names: [primary, similar].
  const normalizedNames = [rawPrimaryName, rawSimilarName].map((name) =>
    name.toLowerCase().trim(),
  )

  // Default account names that show up on many unrelated machines.
  const placeholderNames = ['unknown', 'root', 'ubuntu', 'admin']
  if (normalizedNames.some((name) => placeholderNames.includes(name))) {
    return true
  }

  // A local-part identical to a display name suggests the name is only the commit-time
  // default (e.g. user.name="user", user.email="user@laptop.local").
  const echoesDisplayName = (address: string): boolean =>
    normalizedNames.includes(getEmailLocalPart(address))

  const primaryEchoes = member.identities.some(
    (id) =>
      id.platform === PlatformType.GIT &&
      id.type === MemberIdentityType.USERNAME &&
      id.value.includes('@') &&
      echoesDisplayName(id.value),
  )
  if (primaryEchoes) {
    return true
  }

  return similarMember.nested_identities.some((id) => {
    if (
      id.string_platform !== PlatformType.GIT ||
      id.keyword_type !== MemberIdentityType.USERNAME
    ) {
      return false
    }
    const address = id.string_value
    return address?.includes('@') ? echoesDisplayName(address) : false
  })
}
277+
278+
// Checks if a noreply address on either member resolves to the other's platform username.
279+
// Noreply identities can be ingested as type=email or type=username depending on the source.
219280
static hasMatchingUsernameFromNoreplyEmail(
220281
primaryMember: IMemberWithAggregatesForMergeSuggestions,
221282
similarMember: IMemberOpensearch,
222283
): boolean {
223-
// Primary member's noreply emails -> similar member's platform usernames
224-
const similarUsernamesByPlatform = {
284+
// Primary member's noreply identities -> similar member's platform usernames
285+
const similarUsernamesByPlatform: Record<string, Set<string>> = {
225286
[PlatformType.GITHUB]: new Set(
226287
similarMember.nested_identities
227288
.filter(
@@ -231,35 +292,61 @@ class MemberSimilarityCalculator {
231292
)
232293
.map((i) => i.string_value?.toLowerCase()),
233294
),
295+
[PlatformType.GITLAB]: new Set(
296+
similarMember.nested_identities
297+
.filter(
298+
(i) =>
299+
i.string_platform === PlatformType.GITLAB &&
300+
i.keyword_type === MemberIdentityType.USERNAME,
301+
)
302+
.map((i) => i.string_value?.toLowerCase()),
303+
),
234304
}
235305

236306
for (const identity of primaryMember.identities) {
237-
if (!identity.verified || identity.type !== MemberIdentityType.EMAIL) continue
307+
if (!identity.verified) continue
238308

239309
const ghUsername = parseGitHubNoreplyEmail(identity.value)
240310
if (ghUsername && similarUsernamesByPlatform[PlatformType.GITHUB].has(ghUsername)) {
241311
return true
242312
}
313+
314+
const glUsername = parseGitLabNoreplyEmail(identity.value)
315+
if (glUsername && similarUsernamesByPlatform[PlatformType.GITLAB].has(glUsername)) {
316+
return true
317+
}
243318
}
244319

245-
// Similar member's noreply emails -> primary member's platform usernames
246-
const primaryUsernamesByPlatform = {
320+
// Similar member's noreply identities -> primary member's platform usernames
321+
const primaryUsernamesByPlatform: Record<string, Set<string>> = {
247322
[PlatformType.GITHUB]: new Set(
248323
primaryMember.identities
249324
.filter(
250325
(i) => i.platform === PlatformType.GITHUB && i.type === MemberIdentityType.USERNAME,
251326
)
252327
.map((i) => i.value?.toLowerCase()),
253328
),
329+
[PlatformType.GITLAB]: new Set(
330+
primaryMember.identities
331+
.filter(
332+
(i) => i.platform === PlatformType.GITLAB && i.type === MemberIdentityType.USERNAME,
333+
)
334+
.map((i) => i.value?.toLowerCase()),
335+
),
254336
}
255337

256338
for (const identity of similarMember.nested_identities) {
257-
if (!identity.bool_verified || identity.keyword_type !== MemberIdentityType.EMAIL) continue
339+
if (!identity.bool_verified) continue
258340

259341
const ghUsername = parseGitHubNoreplyEmail(identity.string_value)
260342
if (ghUsername && primaryUsernamesByPlatform[PlatformType.GITHUB].has(ghUsername)) {
261343
return true
262344
}
345+
346+
const glUsername = parseGitLabNoreplyEmail(identity.string_value)
347+
if (glUsername && primaryUsernamesByPlatform[PlatformType.GITLAB].has(glUsername)) {
348+
return true
349+
}
263350
}
264351

265352
return false
@@ -382,7 +469,7 @@ class MemberSimilarityCalculator {
382469
let isHighConfidence = false
383470
let confidenceScore = startingScore || this.HIGH_CONFIDENCE_SCORE
384471

385-
const bumpFactor = Math.floor((1 - confidenceScore) / 5)
472+
const bumpFactor = (1 - confidenceScore) / 5
386473

387474
if (this.hasSameLocation(member, similarMember)) {
388475
isHighConfidence = true
@@ -409,13 +496,54 @@ class MemberSimilarityCalculator {
409496
confidenceScore = this.bumpConfidenceScore(confidenceScore, bumpFactor)
410497
}
411498

499+
// Catches contributors whose only identity is commits from an unconfigured machine —
500+
// same local-part across *.local / *.lan addresses is a strong hint they're the same person.
501+
if (this.hasSameLocalPartInGitUsername(member, similarMember)) {
502+
isHighConfidence = true
503+
confidenceScore = this.bumpConfidenceScore(confidenceScore, bumpFactor)
504+
}
505+
412506
if (!isHighConfidence) {
413507
return this.LOW_CONFIDENCE_SCORE
414508
}
415509

416510
return confidenceScore
417511
}
418512

513+
/**
 * Checks whether both members have a git username identity that is a local-machine address
 * (as decided by isLocalMachineEmail) with the same local-part.
 *
 * Local-parts that carry no identity signal are ignored on both sides:
 * - empty strings,
 * - values recognised by isKnownBot,
 * - either member's normalised display name (the local-part is then just the commit-time
 *   default), so the result does not depend on which member is treated as primary,
 * - OS-default account names such as "root", which appear on many unrelated machines.
 *
 * @param member primary member whose `identities` are inspected
 * @param similarMember candidate member whose `nested_identities` are inspected
 * @returns true when at least one usable local-part is present on both members
 */
static hasSameLocalPartInGitUsername(
  member: IMemberWithAggregatesForMergeSuggestions,
  similarMember: IMemberOpensearch,
): boolean {
  // Local-parts that must never count as a match between two members.
  const nonDistinctive = new Set<string>([
    'unknown',
    'root',
    'ubuntu',
    'admin',
    (member.displayName ?? '').toLowerCase().trim(),
    (similarMember.keyword_displayName ?? '').toLowerCase().trim(),
  ])

  const isUsable = (lp: string): boolean =>
    lp.length > 0 && !nonDistinctive.has(lp) && !isKnownBot(lp)

  const primaryLocalParts = member.identities
    .filter(
      (i) =>
        i.platform === PlatformType.GIT &&
        i.type === MemberIdentityType.USERNAME &&
        isLocalMachineEmail(i.value),
    )
    .map((i) => getEmailLocalPart(i.value))
    .filter(isUsable)

  // Nothing to compare against: skip building the second set.
  if (primaryLocalParts.length === 0) return false

  const similarLocalParts = new Set(
    similarMember.nested_identities
      .filter(
        (i) =>
          i.string_platform === PlatformType.GIT &&
          i.keyword_type === MemberIdentityType.USERNAME &&
          isLocalMachineEmail(i.string_value ?? ''),
      )
      .map((i) => getEmailLocalPart(i.string_value ?? ''))
      .filter(isUsable),
  )

  return primaryLocalParts.some((lp) => similarLocalParts.has(lp))
}
546+
419547
/** Adds `bump` to `confidenceScore` and caps the result at 1. */
static bumpConfidenceScore(confidenceScore: number, bump: number): number {
  const bumped = confidenceScore + bump
  return Math.min(bumped, 1)
}

services/apps/merge_suggestions_worker/src/utils.ts

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import { isNoreplyEmail } from '@crowd/common'
12
import { ILLMConsumableMember, PlatformType } from '@crowd/types'
23

34
import { EMAIL_AS_USERNAME_PLATFORMS } from './enums'
@@ -21,15 +22,12 @@ export function chunkArray<T>(array: T[], chunkSize: number): T[][] {
2122
export const removeEmailLikeIdentitiesFromMember = (
2223
member: ILLMConsumableMember,
2324
): ILLMConsumableMember => {
24-
const nonEmailIdentities: { platform: string; value: string }[] = []
25-
for (const identity of member.identities) {
26-
if (identity.value.indexOf('@') === -1) {
27-
// remove found identity from member.identities
28-
nonEmailIdentities.push(identity)
29-
}
30-
}
31-
32-
member.identities = nonEmailIdentities
25+
// Strip plain emails to avoid the LLM making false matches via shared domain suffixes.
26+
// Noreply addresses are kept — their local-part is a provider-authoritative username and
27+
// gives the LLM a meaningful identity signal.
28+
member.identities = member.identities.filter(
29+
(identity) => !identity.value.includes('@') || isNoreplyEmail(identity.value),
30+
)
3331

3432
return member
3533
}

services/libs/common/src/constants/email-providers.ts

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,8 @@
1+
/**
 * Address suffixes (including the leading '@') of provider-generated noreply emails,
 * keyed by provider name.
 */
export const noreplyEmailProviders = {
  github: '@users.noreply.github.com',
  gitlab: '@users.noreply.gitlab.com',
} as const
5+
16
export const emailProviders = new Set([
27
'gmail.com',
38
'gmail.co.uk',

services/libs/common/src/email.ts

Lines changed: 53 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,11 @@
11
import validator from 'validator'
22

3+
import { noreplyEmailProviders } from './constants'
4+
35
/** Reports whether `value` is accepted by the `validator` package's `isEmail` check. */
export const isValidEmail = (value: string): boolean => validator.isEmail(value)
68

7-
const GITHUB_NOREPLY_EMAIL_SUFFIX = '@users.noreply.github.com'
8-
99
/**
1010
* Extracts username from a GitHub noreply email.
1111
* @see https://docs.github.com/en/account-and-profile/reference/email-addresses-reference#your-noreply-email-address
@@ -14,13 +14,62 @@ export const parseGitHubNoreplyEmail = (email?: string | null): string | null =>
1414
if (!email) return null
1515

1616
const lower = email.toLowerCase()
17-
if (!lower.endsWith(GITHUB_NOREPLY_EMAIL_SUFFIX)) return null
17+
if (!lower.endsWith(noreplyEmailProviders.github)) return null
1818

19-
const local = lower.slice(0, -GITHUB_NOREPLY_EMAIL_SUFFIX.length)
19+
const local = lower.slice(0, -noreplyEmailProviders.github.length)
2020
if (!local) return null
2121

2222
const plusIndex = local.indexOf('+')
2323
const username = plusIndex >= 0 ? local.slice(plusIndex + 1) : local
2424

2525
return username || null
2626
}
27+
28+
/**
 * Derives the username from a GitLab noreply address.
 * Returns null for empty input, for addresses that do not end with the GitLab noreply
 * suffix, and when nothing remains once the suffix and numeric ID prefix are removed.
 * @see https://docs.gitlab.com/user/profile/#use-an-automatically-generated-private-commit-email
 */
export const parseGitLabNoreplyEmail = (email?: string | null): string | null => {
  const suffix = noreplyEmailProviders.gitlab
  const normalized = email ? email.toLowerCase() : ''
  if (!normalized.endsWith(suffix)) return null

  const localPart = normalized.slice(0, normalized.length - suffix.length)
  if (localPart === '') return null

  // Drop a leading numeric ID and dash, e.g. "12345-username" becomes "username".
  const username = localPart.replace(/^\d+-/, '')
  return username === '' ? null : username
}
45+
46+
/** Whether `value` ends, case-insensitively, with one of the known noreply suffixes. */
export const isNoreplyEmail = (value: string): boolean => {
  const candidate = value.toLowerCase()
  for (const suffix of Object.values(noreplyEmailProviders)) {
    if (candidate.endsWith(suffix)) return true
  }
  return false
}
51+
52+
/**
 * Returns true if the email host indicates a local machine (not a real mail server).
 * Git commits from unconfigured machines often produce addresses like user@hostname.local.
 *
 * The host is taken after the LAST '@': a domain cannot contain '@', while a quoted or
 * malformed local-part can, so splitting on the first '@' would misread the host.
 *
 * @param value address-like string to inspect
 * @returns true when the host is "localhost" or ends with ".local", ".lan" or
 *   ".localdomain" (case-insensitive); false otherwise, including when there is no '@'
 */
export const isLocalMachineEmail = (value: string): boolean => {
  const atIndex = value.lastIndexOf('@')
  if (atIndex < 0) return false

  const host = value.slice(atIndex + 1).toLowerCase()
  return (
    host === 'localhost' ||
    host.endsWith('.local') ||
    host.endsWith('.lan') ||
    host.endsWith('.localdomain')
  )
}
67+
68+
/**
 * Returns the local-part of an email, lower-cased.
 *
 * The local-part is everything before the LAST '@': a domain cannot contain '@', while a
 * quoted or malformed local-part can, so splitting on the first '@' would truncate it.
 * Returns the full value lower-cased when there is no '@'.
 *
 * @param value address-like string
 * @returns the lower-cased local-part (possibly empty, e.g. for "@host")
 */
export const getEmailLocalPart = (value: string): string => {
  const atIndex = value.lastIndexOf('@')
  return (atIndex >= 0 ? value.slice(0, atIndex) : value).toLowerCase()
}

0 commit comments

Comments
 (0)