11import { get as getLevenshteinDistance } from 'fast-levenshtein'
22
3- import { parseGitHubNoreplyEmail } from '@crowd/common'
3+ import {
4+ getEmailLocalPart ,
5+ isKnownBot ,
6+ isLocalMachineEmail ,
7+ parseGitHubNoreplyEmail ,
8+ parseGitLabNoreplyEmail ,
9+ } from '@crowd/common'
410import {
511 IMemberIdentity ,
612 IMemberOpensearch ,
@@ -196,32 +202,87 @@ class MemberSimilarityCalculator {
196202 for ( const identity of member . identities . filter (
197203 ( i ) => i . type === MemberIdentityType . USERNAME ,
198204 ) ) {
205+ const clashingIdentity = similarMember . nested_identities . find (
206+ ( i ) =>
207+ i . keyword_type === MemberIdentityType . USERNAME &&
208+ i . string_platform === identity . platform &&
209+ i . keyword_value !== identity . value ,
210+ )
211+
212+ if ( ! clashingIdentity ) continue
213+
214+ // git "usernames" are commit-author emails — a single person commits from many addresses
215+ // (work, personal, noreply, machine-local). Different values don't mean different people.
216+ // Skip the clash unless the displayName is a bot or placeholder rather than a real identity.
199217 if (
200- similarMember . nested_identities . some (
201- ( i ) =>
202- i . keyword_type === MemberIdentityType . USERNAME &&
203- i . string_platform === identity . platform &&
204- i . keyword_value !== identity . value ,
205- )
218+ identity . platform === PlatformType . GIT &&
219+ identity . value . includes ( '@' ) &&
220+ clashingIdentity . keyword_value ?. includes ( '@' ) &&
221+ ! this . isNonIdentityBearingDisplayName ( member , similarMember )
206222 ) {
207- return true
223+ continue
208224 }
225+
226+ return true
209227 }
210228 }
211229
212230 return false
213231 }
214232
215- /**
216- * Checks if a noreply email in one member matches a username in the other (e.g. GitHub noreply email -> GitHub username)
217- * Works bidirectionally: primary email -> similar username, and similar email -> primary username.
218- */
/**
 * Tells whether the display name of either member is a bot or a generic placeholder
 * instead of a real person's name.
 *
 * Returns true when any of the following holds:
 *  - the primary member's IS_BOT attribute has `default === true`;
 *  - `isKnownBot` accepts either display name;
 *  - either display name (lower-cased, trimmed) is one of the OS-level reserved names;
 *  - the local part of a git USERNAME identity containing '@' (on either member) equals
 *    either normalised display name, i.e. the name is just the commit-time default
 *    (e.g. user.name="user", user.email="user@laptop.local").
 *
 * @param member - primary member of the candidate pair
 * @param similarMember - OpenSearch document of the candidate member
 * @returns true when the display name carries no identity signal, false otherwise
 */
private static isNonIdentityBearingDisplayName(
  member: IMemberWithAggregatesForMergeSuggestions,
  similarMember: IMemberOpensearch,
): boolean {
  const flaggedAsBot = member.attributes?.[MemberAttributeName.IS_BOT]?.default === true
  if (flaggedAsBot) {
    return true
  }

  const rawPrimaryName = member.displayName ?? ''
  const rawSimilarName = similarMember.keyword_displayName ?? ''
  if (isKnownBot(rawPrimaryName) || isKnownBot(rawSimilarName)) {
    return true
  }

  const normalizedNames = [rawPrimaryName, rawSimilarName].map((name) =>
    name.toLowerCase().trim(),
  )

  // OS-level reserved names are shared across countless unrelated machines, so they
  // say nothing about who the person actually is.
  const reservedNames = ['unknown', 'root', 'ubuntu', 'admin']
  if (normalizedNames.some((name) => reservedNames.includes(name))) {
    return true
  }

  // A git address whose local part equals a display name means the name is only the
  // commit-time default.
  const localPartIsDisplayName = (address: string): boolean =>
    normalizedNames.includes(getEmailLocalPart(address))

  const primaryHit = member.identities.some(
    (identity) =>
      identity.platform === PlatformType.GIT &&
      identity.type === MemberIdentityType.USERNAME &&
      identity.value.includes('@') &&
      localPartIsDisplayName(identity.value),
  )
  if (primaryHit) {
    return true
  }

  // Only inspected when the primary member produced no hit.
  return similarMember.nested_identities.some(
    (identity) =>
      identity.string_platform === PlatformType.GIT &&
      identity.keyword_type === MemberIdentityType.USERNAME &&
      identity.string_value?.includes('@') &&
      localPartIsDisplayName(identity.string_value),
  )
}
277+
278+ // Checks if a noreply address on either member resolves to the other's platform username.
279+ // Noreply identities can be ingested as type=email or type=username depending on the source.
219280 static hasMatchingUsernameFromNoreplyEmail (
220281 primaryMember : IMemberWithAggregatesForMergeSuggestions ,
221282 similarMember : IMemberOpensearch ,
222283 ) : boolean {
223- // Primary member's noreply emails -> similar member's platform usernames
224- const similarUsernamesByPlatform = {
284+ // Primary member's noreply identities -> similar member's platform usernames
285+ const similarUsernamesByPlatform : Record < string , Set < string > > = {
225286 [ PlatformType . GITHUB ] : new Set (
226287 similarMember . nested_identities
227288 . filter (
@@ -231,35 +292,61 @@ class MemberSimilarityCalculator {
231292 )
232293 . map ( ( i ) => i . string_value ?. toLowerCase ( ) ) ,
233294 ) ,
295+ [ PlatformType . GITLAB ] : new Set (
296+ similarMember . nested_identities
297+ . filter (
298+ ( i ) =>
299+ i . string_platform === PlatformType . GITLAB &&
300+ i . keyword_type === MemberIdentityType . USERNAME ,
301+ )
302+ . map ( ( i ) => i . string_value ?. toLowerCase ( ) ) ,
303+ ) ,
234304 }
235305
236306 for ( const identity of primaryMember . identities ) {
237- if ( ! identity . verified || identity . type !== MemberIdentityType . EMAIL ) continue
307+ if ( ! identity . verified ) continue
238308
239309 const ghUsername = parseGitHubNoreplyEmail ( identity . value )
240310 if ( ghUsername && similarUsernamesByPlatform [ PlatformType . GITHUB ] . has ( ghUsername ) ) {
241311 return true
242312 }
313+
314+ const glUsername = parseGitLabNoreplyEmail ( identity . value )
315+ if ( glUsername && similarUsernamesByPlatform [ PlatformType . GITLAB ] . has ( glUsername ) ) {
316+ return true
317+ }
243318 }
244319
245- // Similar member's noreply emails -> primary member's platform usernames
246- const primaryUsernamesByPlatform = {
320+ // Similar member's noreply identities -> primary member's platform usernames
321+ const primaryUsernamesByPlatform : Record < string , Set < string > > = {
247322 [ PlatformType . GITHUB ] : new Set (
248323 primaryMember . identities
249324 . filter (
250325 ( i ) => i . platform === PlatformType . GITHUB && i . type === MemberIdentityType . USERNAME ,
251326 )
252327 . map ( ( i ) => i . value ?. toLowerCase ( ) ) ,
253328 ) ,
329+ [ PlatformType . GITLAB ] : new Set (
330+ primaryMember . identities
331+ . filter (
332+ ( i ) => i . platform === PlatformType . GITLAB && i . type === MemberIdentityType . USERNAME ,
333+ )
334+ . map ( ( i ) => i . value ?. toLowerCase ( ) ) ,
335+ ) ,
254336 }
255337
256338 for ( const identity of similarMember . nested_identities ) {
257- if ( ! identity . bool_verified || identity . keyword_type !== MemberIdentityType . EMAIL ) continue
339+ if ( ! identity . bool_verified ) continue
258340
259341 const ghUsername = parseGitHubNoreplyEmail ( identity . string_value )
260342 if ( ghUsername && primaryUsernamesByPlatform [ PlatformType . GITHUB ] . has ( ghUsername ) ) {
261343 return true
262344 }
345+
346+ const glUsername = parseGitLabNoreplyEmail ( identity . string_value )
347+ if ( glUsername && primaryUsernamesByPlatform [ PlatformType . GITLAB ] . has ( glUsername ) ) {
348+ return true
349+ }
263350 }
264351
265352 return false
@@ -382,7 +469,7 @@ class MemberSimilarityCalculator {
382469 let isHighConfidence = false
383470 let confidenceScore = startingScore || this . HIGH_CONFIDENCE_SCORE
384471
385- const bumpFactor = Math . floor ( ( 1 - confidenceScore ) / 5 )
472+ const bumpFactor = ( 1 - confidenceScore ) / 5
386473
387474 if ( this . hasSameLocation ( member , similarMember ) ) {
388475 isHighConfidence = true
@@ -409,13 +496,54 @@ class MemberSimilarityCalculator {
409496 confidenceScore = this . bumpConfidenceScore ( confidenceScore , bumpFactor )
410497 }
411498
499+ // Catches contributors whose only identity is commits from an unconfigured machine —
500+ // same local-part across *.local / *.lan addresses is a strong hint they're the same person.
501+ if ( this . hasSameLocalPartInGitUsername ( member , similarMember ) ) {
502+ isHighConfidence = true
503+ confidenceScore = this . bumpConfidenceScore ( confidenceScore , bumpFactor )
504+ }
505+
412506 if ( ! isHighConfidence ) {
413507 return this . LOW_CONFIDENCE_SCORE
414508 }
415509
416510 return confidenceScore
417511 }
418512
/**
 * Checks whether both members own a git USERNAME identity that `isLocalMachineEmail`
 * accepts and whose local parts (as returned by `getEmailLocalPart`) are equal.
 *
 * A local part is ignored when it is empty, when `isKnownBot` accepts it, or when it
 * equals the lower-cased, trimmed display name of EITHER member. A local part that merely
 * repeats a display name is the commit-time default (user.name="user",
 * user.email="user@laptop.local") and says nothing about who the person is, regardless of
 * which side of the pair carries that name.
 *
 * @param member - primary member of the candidate pair
 * @param similarMember - OpenSearch document of the candidate member
 * @returns true when at least one usable local part is present on both members
 */
static hasSameLocalPartInGitUsername(
  member: IMemberWithAggregatesForMergeSuggestions,
  similarMember: IMemberOpensearch,
): boolean {
  const primaryDisplayName = (member.displayName ?? '').toLowerCase().trim()
  const similarDisplayName = (similarMember.keyword_displayName ?? '').toLowerCase().trim()

  // Compare against both display names so the filter is symmetric: a shared local part
  // must not be the default name of the primary member nor of the similar member.
  const isUsable = (lp: string): boolean =>
    lp.length > 0 &&
    lp !== primaryDisplayName &&
    lp !== similarDisplayName &&
    !isKnownBot(lp)

  const primaryLocalParts = member.identities
    .filter(
      (i) =>
        i.platform === PlatformType.GIT &&
        i.type === MemberIdentityType.USERNAME &&
        isLocalMachineEmail(i.value),
    )
    .map((i) => getEmailLocalPart(i.value))
    .filter(isUsable)

  // Nothing to compare: skip scanning the similar member entirely.
  if (primaryLocalParts.length === 0) return false

  const similarLocalParts = new Set(
    similarMember.nested_identities
      .filter(
        (i) =>
          i.string_platform === PlatformType.GIT &&
          i.keyword_type === MemberIdentityType.USERNAME &&
          isLocalMachineEmail(i.string_value ?? ''),
      )
      .map((i) => getEmailLocalPart(i.string_value ?? ''))
      .filter(isUsable),
  )

  return primaryLocalParts.some((lp) => similarLocalParts.has(lp))
}
546+
/**
 * Raises a confidence score by `bump`, never letting the result exceed 1.
 *
 * @param confidenceScore - current score
 * @param bump - amount to add
 * @returns the smaller of 1 and `confidenceScore + bump`
 */
static bumpConfidenceScore(confidenceScore: number, bump: number): number {
  const bumped = confidenceScore + bump
  return Math.min(bumped, 1)
}
0 commit comments