Skip to content

Commit b5805d5

Browse files
JAORMXclaude
andcommitted
fix(github): create domain accounts for non-committer authors (#8886)
ConvertAccounts sourced the domain `accounts` table FROM _tool_github_accounts, which is only populated for users we collected full profiles for (effectively, committers). Issue and PR authors who never committed were written into _tool_github_repo_accounts but never converted, so issues.creator_id and pull_requests.author_id pointed at accounts rows that didn't exist. Source ConvertAccounts FROM _tool_github_repo_accounts LEFT JOIN _tool_github_accounts instead, so every user the repo references gets a domain account, enriched with profile detail when we have it and login-only otherwise. The domain id uses the same generator the issue/PR convertors use, so the FKs line up. Also emit a repo_account for a PR's merged_by user so pull_requests.merged_by_id resolves too. The query stays MySQL/PostgreSQL-agnostic (COALESCE, no backtick quoting, parameterized via the dal) and mirrors the join already in account_org_collector.go. Adds the non-committer orphan case to the e2e fixture plus a referential- integrity assertion in TestAccountDataFlow. Verified on both MySQL and PostgreSQL. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 4ac2c42 commit b5805d5

5 files changed

Lines changed: 102 additions & 21 deletions

File tree

backend/plugins/github/e2e/account_test.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,15 @@ package e2e
2020
import (
2121
"testing"
2222

23+
"github.com/apache/incubator-devlake/core/dal"
2324
"github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain"
25+
"github.com/apache/incubator-devlake/core/models/domainlayer/didgen"
2426
"github.com/apache/incubator-devlake/helpers/e2ehelper"
2527
"github.com/apache/incubator-devlake/plugins/github/impl"
2628
"github.com/apache/incubator-devlake/plugins/github/models"
2729
"github.com/apache/incubator-devlake/plugins/github/tasks"
30+
"github.com/stretchr/testify/assert"
31+
"github.com/stretchr/testify/require"
2832
)
2933

3034
func TestAccountDataFlow(t *testing.T) {
@@ -108,4 +112,33 @@ func TestAccountDataFlow(t *testing.T) {
108112
"_raw_data_remark",
109113
},
110114
)
115+
116+
// Referential-integrity invariant (#8886): every account the repo references in
117+
// _tool_github_repo_accounts must have a domain `accounts` row, so issues.creator_id /
118+
// pull_requests.author_id / merged_by_id never point at a missing account. We generate
119+
// the domain id with the SAME generator the issue/PR convertors use, so this is a
120+
// faithful proxy for the FK join the issue reported as broken. It also fails loudly if a
121+
// future change shrinks ConvertAccounts' coverage or diverges the id generation.
122+
accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{})
123+
var repoAccounts []models.GithubRepoAccount
124+
require.NoError(t, dataflowTester.Dal.All(&repoAccounts,
125+
dal.Where("repo_github_id = ? AND connection_id = ? AND account_id > 0",
126+
taskData.Options.GithubId, taskData.Options.ConnectionId),
127+
))
128+
require.NotEmpty(t, repoAccounts, "fixture must reference at least one account")
129+
sawOrphanCase := false
130+
for _, ra := range repoAccounts {
131+
if ra.Login == "milichev" {
132+
sawOrphanCase = true // the non-committer author from the issue repro
133+
}
134+
domainId := accountIdGen.Generate(taskData.Options.ConnectionId, ra.AccountId)
135+
count, err := dataflowTester.Dal.Count(
136+
dal.From(&crossdomain.Account{}),
137+
dal.Where("id = ?", domainId),
138+
)
139+
require.NoError(t, err)
140+
assert.Equalf(t, int64(1), count,
141+
"orphan FK: repo account %q (id=%d) has no domain accounts row %q", ra.Login, ra.AccountId, domainId)
142+
}
143+
assert.True(t, sawOrphanCase, "fixture should include the non-committer orphan case (milichev)")
111144
}

backend/plugins/github/e2e/raw_tables/_tool_github_repo_accounts.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ connection_id,account_id,repo_github_id,login
66
1,3971390,134018330,ppmoon
77
1,7496278,134018330,panjf2000
88
1,8518239,134018330,gitter-badger
9+
1,145564,134018330,milichev
910
1,11763614,2,Moonlight-Zhao
1011
1,12420699,2,shanghai-Jerry
1112
1,14950473,2,zqkgo
Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
id,email,full_name,user_name,avatar_url,organization,_raw_data_params,_raw_data_table,_raw_data_id,_raw_data_remark
2-
github:GithubAccount:1:1052632,runner.mei@,runner,runner-mei,https://avatars.githubusercontent.com/u/1052632?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,13,
3-
github:GithubAccount:1:21979,appleboy.tw@gmail.com,Bo-Yi Wu,appleboy,https://avatars.githubusercontent.com/u/21979?v=4,"COSCUP,nodejs-tw,moztw,h5bp,CodeIgniter-TW,drone,Getmore,golangtw,laravel-taiwan,go-xorm,gin-gonic,PHPConf-TW,Mediatek-Cloud,SJFinder,go-gitea,laradock,gin-contrib,tagfans,maintainers,go-training,go-ggz,the-benchmarker,golang-queue","{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,8,
4-
github:GithubAccount:1:3794113,shanhu5739@gmail.com,Derek,shanhuhai5739,https://avatars.githubusercontent.com/u/3794113?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,2,
5-
github:GithubAccount:1:3971390,cnliuyunpeng@gmail.com,ppmoon,ppmoon,https://avatars.githubusercontent.com/u/3971390?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,14,
6-
github:GithubAccount:1:7496278,i@andypan.me,Andy Pan,panjf2000,https://avatars.githubusercontent.com/u/7496278?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,5,
7-
github:GithubAccount:1:8518239,badger@gitter.im,The Gitter Badger,gitter-badger,https://avatars.githubusercontent.com/u/8518239?v=4,,"{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,9,
8-
github:GithubAccount:1:964542,sarath.sp06@gmail.com,Sarath Sadasivan Pillai,sarathsp06,https://avatars.githubusercontent.com/u/964542?v=4,"exotel,leadmrktr,shellagilehub,odysseyhack,boodltech","{""ConnectionId"":1,""Name"":""panjf2000/ants""}",_raw_github_api_accounts,1,
2+
github:GithubAccount:1:1052632,runner.mei@,runner,runner-mei,https://avatars.githubusercontent.com/u/1052632?v=4,,,,0,
3+
github:GithubAccount:1:145564,,,milichev,,,,,0,
4+
github:GithubAccount:1:21979,appleboy.tw@gmail.com,Bo-Yi Wu,appleboy,https://avatars.githubusercontent.com/u/21979?v=4,"COSCUP,nodejs-tw,moztw,h5bp,CodeIgniter-TW,drone,Getmore,golangtw,laravel-taiwan,go-xorm,gin-gonic,PHPConf-TW,Mediatek-Cloud,SJFinder,go-gitea,laradock,gin-contrib,tagfans,maintainers,go-training,go-ggz,the-benchmarker,golang-queue",,,0,
5+
github:GithubAccount:1:3794113,shanhu5739@gmail.com,Derek,shanhuhai5739,https://avatars.githubusercontent.com/u/3794113?v=4,,,,0,
6+
github:GithubAccount:1:3971390,cnliuyunpeng@gmail.com,ppmoon,ppmoon,https://avatars.githubusercontent.com/u/3971390?v=4,,,,0,
7+
github:GithubAccount:1:7496278,i@andypan.me,Andy Pan,panjf2000,https://avatars.githubusercontent.com/u/7496278?v=4,,,,0,
8+
github:GithubAccount:1:8518239,badger@gitter.im,The Gitter Badger,gitter-badger,https://avatars.githubusercontent.com/u/8518239?v=4,,,,0,
9+
github:GithubAccount:1:964542,sarath.sp06@gmail.com,Sarath Sadasivan Pillai,sarathsp06,https://avatars.githubusercontent.com/u/964542?v=4,"exotel,leadmrktr,shellagilehub,odysseyhack,boodltech",,,0,

backend/plugins/github/tasks/account_convertor.go

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222

2323
"github.com/apache/incubator-devlake/core/dal"
2424
"github.com/apache/incubator-devlake/core/errors"
25+
"github.com/apache/incubator-devlake/core/models/common"
2526
"github.com/apache/incubator-devlake/core/models/domainlayer"
2627
"github.com/apache/incubator-devlake/core/models/domainlayer/crossdomain"
2728
"github.com/apache/incubator-devlake/core/models/domainlayer/didgen"
@@ -30,6 +31,19 @@ import (
3031
"github.com/apache/incubator-devlake/plugins/github/models"
3132
)
3233

34+
// repoAccountForConvert is the row projected by ConvertAccounts' query: every
35+
// account referenced by the repo (from _tool_github_repo_accounts), enriched
36+
// with profile detail from _tool_github_accounts when it was collected. The
37+
// embedded NoPKModel carries the RawDataOrigin across to the domain row.
38+
type repoAccountForConvert struct {
39+
Id int
40+
Login string
41+
Name string
42+
Email string
43+
AvatarUrl string
44+
common.NoPKModel
45+
}
46+
3347
func init() {
3448
RegisterSubtaskMeta(&ConvertAccountsMeta)
3549
}
@@ -38,12 +52,12 @@ var ConvertAccountsMeta = plugin.SubTaskMeta{
3852
Name: "Convert Users",
3953
EntryPoint: ConvertAccounts,
4054
EnabledByDefault: true,
41-
Description: "Convert tool layer table github_accounts into domain layer table accounts",
55+
Description: "Convert every account referenced by the repo (tool layer repo_accounts, enriched by github_accounts) into domain layer table accounts",
4256
DomainTypes: []string{plugin.DOMAIN_TYPE_CROSS},
4357
DependencyTables: []string{
44-
models.GithubAccount{}.TableName(), // cursor
45-
models.GithubRepoAccount{}.TableName(), // cursor
46-
models.GithubAccountOrg{}.TableName()}, // account id gen
58+
models.GithubRepoAccount{}.TableName(), // cursor (every user referenced by the repo)
59+
models.GithubAccount{}.TableName(), // left-join enrichment (profile detail, optional)
60+
models.GithubAccountOrg{}.TableName()}, // org pluck
4761
ProductTables: []string{crossdomain.Account{}.TableName()},
4862
}
4963

@@ -53,7 +67,7 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext) errors.Error {
5367

5468
accountIdGen := didgen.NewDomainIdGenerator(&models.GithubAccount{})
5569

56-
converter, err := api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[models.GithubAccount]{
70+
converter, err := api.NewStatefulDataConverter(&api.StatefulDataConverterArgs[repoAccountForConvert]{
5771
SubtaskCommonArgs: &api.SubtaskCommonArgs{
5872
SubTaskContext: taskCtx,
5973
Table: RAW_ACCOUNT_TABLE,
@@ -62,29 +76,51 @@ func ConvertAccounts(taskCtx plugin.SubTaskContext) errors.Error {
6276
Name: data.Options.Name,
6377
},
6478
},
79+
// Source every account referenced by this repo from _tool_github_repo_accounts
80+
// (which the issue/PR/commit extractors populate for any author, assignee, or
81+
// merged-by user), and LEFT JOIN _tool_github_accounts for profile detail when it
82+
// was collected. This guarantees a domain `accounts` row for every CreatorId /
83+
// AuthorId the other convertors emit, instead of only for users who committed.
84+
// SQL is kept DB-agnostic (no backtick quoting, COALESCE not IFNULL) so it runs on
85+
// both MySQL and PostgreSQL.
6586
Input: func(stateManager *api.SubtaskStateManager) (dal.Rows, errors.Error) {
6687
clauses := []dal.Clause{
67-
dal.Select("_tool_github_accounts.*"),
68-
dal.From(&models.GithubAccount{}),
88+
dal.Select(`_tool_github_repo_accounts.account_id AS id,
89+
_tool_github_repo_accounts.login AS login,
90+
COALESCE(ga.name, '') AS name,
91+
COALESCE(ga.email, '') AS email,
92+
COALESCE(ga.avatar_url, '') AS avatar_url,
93+
_tool_github_repo_accounts._raw_data_params AS _raw_data_params,
94+
_tool_github_repo_accounts._raw_data_table AS _raw_data_table,
95+
_tool_github_repo_accounts._raw_data_id AS _raw_data_id,
96+
_tool_github_repo_accounts._raw_data_remark AS _raw_data_remark`),
97+
dal.From(&models.GithubRepoAccount{}),
98+
dal.Join(`left join _tool_github_accounts ga on (
99+
ga.connection_id = _tool_github_repo_accounts.connection_id
100+
AND ga.id = _tool_github_repo_accounts.account_id
101+
)`),
69102
dal.Where(
70-
"repo_github_id = ? and _tool_github_accounts.connection_id=?",
103+
`_tool_github_repo_accounts.repo_github_id = ?
104+
AND _tool_github_repo_accounts.connection_id = ?
105+
AND _tool_github_repo_accounts.account_id > 0`,
71106
data.Options.GithubId,
72107
data.Options.ConnectionId,
73108
),
74-
dal.Join(`left join _tool_github_repo_accounts gra on (
75-
_tool_github_accounts.connection_id = gra.connection_id
76-
AND _tool_github_accounts.id = gra.account_id
77-
)`),
78109
}
79110
if stateManager.IsIncremental() {
80111
since := stateManager.GetSince()
81112
if since != nil {
82-
clauses = append(clauses, dal.Where("_tool_github_accounts.updated_at >= ?", since))
113+
// Incremental cursor intentionally tracks _tool_github_repo_accounts.updated_at
114+
// (repo membership), not _tool_github_accounts.updated_at (profile freshness):
115+
// account-detail re-enrichment is reconciled on the next full sync. Do not switch
116+
// this back to _tool_github_accounts — that is what left issue/PR-only authors
117+
// orphaned (#8886).
118+
clauses = append(clauses, dal.Where("_tool_github_repo_accounts.updated_at >= ?", since))
83119
}
84120
}
85121
return db.Cursor(clauses...)
86122
},
87-
Convert: func(githubUser *models.GithubAccount) ([]interface{}, errors.Error) {
123+
Convert: func(githubUser *repoAccountForConvert) ([]interface{}, errors.Error) {
88124
// query related orgs
89125
var orgs []string
90126
err := db.Pluck(`org_login`, &orgs,

backend/plugins/github/tasks/pr_extractor.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,16 @@ func ExtractApiPullRequests(taskCtx plugin.SubTaskContext) errors.Error {
151151
githubPr.AuthorName = githubUser.Login
152152
githubPr.AuthorId = githubUser.AccountId
153153
}
154+
// Emit a repo_account for the merged-by user too, so pull_requests.merged_by_id
155+
// resolves to a domain account instead of an orphan FK (ConvertAccounts sources
156+
// every referenced user from _tool_github_repo_accounts).
157+
if body.MergedBy != nil {
158+
mergedByUser, err := convertAccount(body.MergedBy, data.Options.GithubId, data.Options.ConnectionId)
159+
if err != nil {
160+
return nil, err
161+
}
162+
results = append(results, mergedByUser)
163+
}
154164
for _, label := range body.Labels {
155165
results = append(results, &models.GithubPrLabel{
156166
ConnectionId: data.Options.ConnectionId,

0 commit comments

Comments
 (0)