From 796704299955fe285aa17a80ebd46d7ce31d387e Mon Sep 17 00:00:00 2001 From: Schuyler Bishop Date: Thu, 21 May 2026 13:51:39 -0500 Subject: [PATCH 1/8] adding feature that allows deduplication across methods per-file --- cmd/vault-csv-normalizer/main.go | 57 ++++++++++- internal/normalizer/normalizer.go | 148 ++++++++++++++++++++++++++--- internal/parser/parser.go | 51 +++++----- internal/renderer/renderer.go | 36 ++++--- internal/renderer/renderer_test.go | 7 +- 5 files changed, 240 insertions(+), 59 deletions(-) diff --git a/cmd/vault-csv-normalizer/main.go b/cmd/vault-csv-normalizer/main.go index 1bcdb30..97b64bb 100644 --- a/cmd/vault-csv-normalizer/main.go +++ b/cmd/vault-csv-normalizer/main.go @@ -40,6 +40,7 @@ func (f fileDateFlag) Set(v string) error { func main() { var inputFiles multiFlag var dedupMethods multiFlag + var dedupMethodsPerFile multiFlag var sortBy string var filterNS string var filterType string @@ -64,6 +65,7 @@ func main() { flag.BoolVar(&dedup, "d", false, "Deduplicate records by client_id across all input files") flag.BoolVar(&dedupAlias, "dedup-alias", false, "Deduplicate by entity_alias_name (strips domain and -t0/-t1/-t2 tier suffixes; records without an alias are always kept; may be combined with -d)") flag.Var(&dedupMethods, "dedup-methods", "Deduplicate by alias for the specified comma-separated auth methods, treating them as one identity group. Repeatable to define multiple groups (e.g. -dedup-methods ldap,oidc -dedup-methods jwt,saml).") + flag.Var(&dedupMethodsPerFile, "dedup-methods-per-file", "Like --dedup-methods but scoped to each input file independently. Records in different files are never collapsed against each other. 
Repeatable to define multiple groups.") flag.BoolVar(&dedupJWT, "dedup-jwt", false, "Drop JWT records whose normalized alias matches a non-JWT record in the same file (prevents counting the same person via both LDAP/OIDC and JWT)") flag.BoolVar(&listMethods, "list-methods", false, "Print every distinct auth method found in the input files (with record counts and alias coverage), then exit. Useful for deciding --dedup-methods groups.") flag.BoolVar(&debugMode, "debug", false, "Print all records grouped by mount path") @@ -131,6 +133,20 @@ func main() { } } + var methodGroupsPerFile [][]string + for _, val := range dedupMethodsPerFile { + var group []string + for _, m := range strings.Split(val, ",") { + m = strings.TrimSpace(strings.ToLower(m)) + if m != "" { + group = append(group, m) + } + } + if len(group) > 0 { + methodGroupsPerFile = append(methodGroupsPerFile, group) + } + } + // Snapshot pre-dedup records so debug mode can show alias groups from the // original data regardless of which dedup flags are active. 
preDedup := normalized @@ -164,6 +180,21 @@ func main() { } normalized = normalizer.DeduplicateByAliasForMethods(normalized, methodGroups) } + if len(methodGroupsPerFile) > 0 { + groups := normalizer.FindAliasDuplicatesForMethodsPerFile(preDedup, methodGroupsPerFile) + if len(groups) > 0 { + fmt.Fprintf(os.Stdout, "Per-file method-scoped alias duplicates found (%d group(s))\n", len(groups)) + fmt.Fprintln(os.Stdout, "=====================================================") + for _, group := range groups { + r0 := group[0] + fmt.Fprintf(os.Stdout, "\nAlias group: %q file: %s\n", + normalizer.StripTierSuffix(normalizer.BaseAlias(r0.EntityAliasName)), filepath.Base(r0.Source)) + renderer.PrintTable(os.Stdout, group) + } + fmt.Fprintln(os.Stdout) + } + normalized = normalizer.DeduplicateByAliasForMethodsPerFile(normalized, methodGroupsPerFile) + } // Collect -d dedup statistics before running so debug mode can report // exactly which client_ids were (or weren't) collapsed. @@ -307,7 +338,7 @@ func main() { fmt.Fprintln(os.Stdout) } - if perFile { + if (perFile || len(methodGroupsPerFile) > 0) && len(inputFiles) > 1 { bySource := make(map[string][]normalizer.Record, len(inputFiles)) for _, r := range normalized { bySource[r.Source] = append(bySource[r.Source], r) @@ -466,5 +497,27 @@ CSV FORMAT (Vault activity export): record and a JWT record for the same person are not collapsed (unless both groups are merged into one). - Can be combined with --dedup-alias, --dedup-jwt, and/or -d.`) + Can be combined with --dedup-alias, --dedup-jwt, and/or -d. + + --dedup-methods-per-file + Like --dedup-methods but deduplication is scoped to each input file + independently. Records in different files with the same normalized alias + are NOT collapsed against each other — only within-file duplicates are + removed. Useful when files represent different billing periods and you + want to count a returning user once per file rather than once globally. 
+ + Uses the same alias normalization and method-grouping syntax as + --dedup-methods (repeatable, comma-separated groups). + + --dedup-methods-per-file ldap,oidc + Within each file, collapse LDAP and OIDC records that share the + same alias (exact match; tier suffixes like -t0/-t1 are distinct). + A user in jan.csv (LDAP) and feb.csv (OIDC) is NOT collapsed — + they appear once per file. + + --dedup-methods-per-file ldap,oidc --dedup-methods-per-file jwt,saml + Two independent per-file groups. Same alias collapsing rules as + --dedup-methods but strictly within each source file. + + Can be combined with --dedup-methods, --dedup-alias, --dedup-jwt, and/or -d.`) } diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index 5b3e543..efd20fd 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -23,9 +23,10 @@ type Record struct { MountType string AuthMethod string ClientType string // normalized: entity | non-entity | acme | secret-sync | unknown - TokenCreationTime time.Time - ClientFirstUsageTime time.Time - EntityAliasName string + TokenCreationTime time.Time + ClientFirstUsageTime time.Time + EntityAliasName string + EntityAliasMetadataUsername string } // supportedSortKeys lists columns accepted by Sort. 
@@ -51,18 +52,19 @@ func Normalize(raw []parser.RawRecord) []Record { func normalizeOne(r parser.RawRecord) Record { return Record{ - Source: r.Source, - ClientID: r.ClientID, - NamespaceID: normalizeNamespaceID(r.NamespaceID), - NamespacePath: normalizeNamespacePath(r.NamespacePath), - MountAccessor: strings.TrimSpace(r.MountAccessor), - MountPath: normalizeMountPath(r.MountPath), - MountType: strings.ToLower(strings.TrimSpace(r.MountType)), - AuthMethod: strings.ToLower(strings.TrimSpace(r.AuthMethod)), - ClientType: normalizeClientType(r.ClientType), - TokenCreationTime: ParseTime(r.TokenCreationTime), - ClientFirstUsageTime: ParseTime(r.ClientFirstUsageTime), - EntityAliasName: strings.TrimSpace(r.EntityAliasName), + Source: r.Source, + ClientID: r.ClientID, + NamespaceID: normalizeNamespaceID(r.NamespaceID), + NamespacePath: normalizeNamespacePath(r.NamespacePath), + MountAccessor: strings.TrimSpace(r.MountAccessor), + MountPath: normalizeMountPath(r.MountPath), + MountType: strings.ToLower(strings.TrimSpace(r.MountType)), + AuthMethod: strings.ToLower(strings.TrimSpace(r.AuthMethod)), + ClientType: normalizeClientType(r.ClientType), + TokenCreationTime: ParseTime(r.TokenCreationTime), + ClientFirstUsageTime: ParseTime(r.ClientFirstUsageTime), + EntityAliasName: strings.TrimSpace(r.EntityAliasName), + EntityAliasMetadataUsername: strings.TrimSpace(r.EntityAliasMetadataUsername), } } @@ -400,6 +402,122 @@ func DeduplicateByAliasForMethods(records []Record, groups [][]string) []Record return out } +// aliasKeyInFile is the deduplication key for per-file alias dedup. It includes +// the source file so records from different files are never collapsed together. +type aliasKeyInFile struct { + base string + mountType string + source string +} + +// isOIDC reports whether r was authenticated via OIDC. +func isOIDC(r Record) bool { + return r.MountType == "oidc" || r.AuthMethod == "oidc" +} + +// effectiveAliasInFile returns the alias to use for per-file dedup. 
For OIDC +// records, entity_alias_metadata.username holds the human-readable username; +// entity_alias_name may be a subject identifier (UUID or email) that doesn't +// match other methods. All other methods use entity_alias_name directly. +func effectiveAliasInFile(r Record) string { + if isOIDC(r) && r.EntityAliasMetadataUsername != "" { + return r.EntityAliasMetadataUsername + } + return r.EntityAliasName +} + +// aliasKeyInFileFor computes the per-file dedup key for a record. It applies +// BaseAlias (strips everything after '@' if present) but not StripTierSuffix, +// so "alice-t0" and "alice-t1" are treated as distinct identities. The '@' +// strip is needed for JWT, which uses full email addresses ("alice@corp.com"); +// LDAP uses bare usernames ("alice"); OIDC uses entity_alias_metadata.username. +// Returns false if the record's mount type is not in any provided group. +func aliasKeyInFileFor(r Record, groupMap map[string]string) (aliasKeyInFile, bool) { + mt := r.MountType + if mt == "" { + mt = r.AuthMethod + } + canonical, ok := groupMap[mt] + if !ok { + return aliasKeyInFile{}, false + } + return aliasKeyInFile{ + base: BaseAlias(effectiveAliasInFile(r)), + mountType: canonical, + source: r.Source, + }, true +} + +// FindAliasDuplicatesForMethodsPerFile is like FindAliasDuplicatesForMethods +// but only collapses records within the same source file. Records in different +// files with the same alias are not reported as duplicates. Matching uses only +// the portion of the alias left of '@'; tier suffixes (-t0/-t1/-t2) are not +// stripped and must match exactly. 
+func FindAliasDuplicatesForMethodsPerFile(records []Record, groups [][]string) [][]Record { + groupMap := buildMethodGroupMap(groups) + + type entry struct { + key aliasKeyInFile + members []Record + } + index := make(map[aliasKeyInFile]int) + var entries []entry + + for _, r := range records { + if effectiveAliasInFile(r) == "" || IsPKIClient(r) { + continue + } + kf, ok := aliasKeyInFileFor(r, groupMap) + if !ok { + continue + } + if idx, exists := index[kf]; exists { + entries[idx].members = append(entries[idx].members, r) + } else { + index[kf] = len(entries) + entries = append(entries, entry{key: kf, members: []Record{r}}) + } + } + + var out [][]Record + for _, e := range entries { + if len(e.members) > 1 { + out = append(out, e.members) + } + } + return out +} + +// DeduplicateByAliasForMethodsPerFile applies alias dedup like +// DeduplicateByAliasForMethods but scoped to each source file independently. +// Records in different files are never collapsed; only records from the same +// file with the same normalized alias and method group are deduplicated. +// Matching uses only the portion of the alias left of '@'; tier suffixes +// (-t0/-t1/-t2) are not stripped and must match exactly. +// Records with a blank effective alias (effectiveAliasInFile) or that are PKI clients are always kept. +func DeduplicateByAliasForMethodsPerFile(records []Record, groups [][]string) []Record { + groupMap := buildMethodGroupMap(groups) + seen := make(map[aliasKeyInFile]struct{}, len(records)) + out := make([]Record, 0, len(records)) + for _, r := range records { + if effectiveAliasInFile(r) == "" || IsPKIClient(r) { + out = append(out, r) + continue + } + kf, ok := aliasKeyInFileFor(r, groupMap) + if !ok { + out = append(out, r) + continue + } + if _, dup := seen[kf]; dup { + continue + } + seen[kf] = struct{}{} + out = append(out, r) + } + return out +} + // isJWT reports whether r was authenticated via JWT. 
func isJWT(r Record) bool { return r.MountType == "jwt" || r.AuthMethod == "jwt" diff --git a/internal/parser/parser.go b/internal/parser/parser.go index 98a1199..aaa29f9 100644 --- a/internal/parser/parser.go +++ b/internal/parser/parser.go @@ -17,17 +17,18 @@ type RawRecord struct { // Source tracks which file this record came from. Source string - ClientID string - NamespaceID string - NamespacePath string - MountAccessor string - MountPath string - MountType string - AuthMethod string - ClientType string - TokenCreationTime string // may be populated from legacy "timestamp" column - ClientFirstUsageTime string - EntityAliasName string + ClientID string + NamespaceID string + NamespacePath string + MountAccessor string + MountPath string + MountType string + AuthMethod string + ClientType string + TokenCreationTime string // may be populated from legacy "timestamp" column + ClientFirstUsageTime string + EntityAliasName string + EntityAliasMetadataUsername string } // knownColumns maps all recognised (lowercased, trimmed) header variants to @@ -43,7 +44,8 @@ var knownColumns = map[string]string{ "client_type": "client_type", "token_creation_time": "token_creation_time", "client_first_usage_time": "client_first_usage_time", - "entity_alias_name": "entity_alias_name", + "entity_alias_name": "entity_alias_name", + "entity_alias_metadata.username": "entity_alias_metadata_username", // Legacy / alternative column names: "timestamp": "token_creation_time", // Vault < 1.17 "first_seen": "client_first_usage_time", @@ -123,18 +125,19 @@ func parseReader(r io.Reader, source string) ([]RawRecord, error) { } records = append(records, RawRecord{ - Source: source, - ClientID: clientID, - NamespaceID: get(row, "namespace_id"), - NamespacePath: get(row, "namespace_path"), - MountAccessor: get(row, "mount_accessor"), - MountPath: get(row, "mount_path"), - MountType: get(row, "mount_type"), - AuthMethod: get(row, "auth_method"), - ClientType: get(row, "client_type"), - 
TokenCreationTime: get(row, "token_creation_time"), - ClientFirstUsageTime: get(row, "client_first_usage_time"), - EntityAliasName: get(row, "entity_alias_name"), + Source: source, + ClientID: clientID, + NamespaceID: get(row, "namespace_id"), + NamespacePath: get(row, "namespace_path"), + MountAccessor: get(row, "mount_accessor"), + MountPath: get(row, "mount_path"), + MountType: get(row, "mount_type"), + AuthMethod: get(row, "auth_method"), + ClientType: get(row, "client_type"), + TokenCreationTime: get(row, "token_creation_time"), + ClientFirstUsageTime: get(row, "client_first_usage_time"), + EntityAliasName: get(row, "entity_alias_name"), + EntityAliasMetadataUsername: get(row, "entity_alias_metadata_username"), }) } diff --git a/internal/renderer/renderer.go b/internal/renderer/renderer.go index 2d788ff..b1e2659 100644 --- a/internal/renderer/renderer.go +++ b/internal/renderer/renderer.go @@ -27,16 +27,6 @@ var columns = []column{ width: 16, get: func(r normalizer.Record) string { return r.NamespacePath }, }, - { - header: "Client Type", - width: 12, - get: func(r normalizer.Record) string { return r.ClientType }, - }, - { - header: "Auth Method", - width: 12, - get: func(r normalizer.Record) string { return r.AuthMethod }, - }, { header: "Mount Path", width: 12, @@ -74,24 +64,44 @@ var aliasColumn = column{ get: func(r normalizer.Record) string { return r.EntityAliasName }, } +var oidcUsernameColumn = column{ + header: "OIDC Username", + width: 13, + get: func(r normalizer.Record) string { return r.EntityAliasMetadataUsername }, +} + // PrintTable writes the records as a plain-text table to w. If any record has // a non-empty EntityAliasName, an Entity Alias column is appended so the -// original alias values are visible in alias deduplication output. +// original alias values are visible in alias deduplication output. If any +// record has a non-empty EntityAliasMetadataUsername, an OIDC Username column +// is also appended. 
func PrintTable(w io.Writer, records []normalizer.Record) { if len(records) == 0 { fmt.Fprintln(w, "(no records to display)") return } - // Build column list, appending the alias column only when the data has it. + // Build column list, appending extra columns only when the data has them. cols := make([]column, len(columns)) copy(cols, columns) + var hasAlias, hasOIDCUsername bool for _, r := range records { if r.EntityAliasName != "" { - cols = append(cols, aliasColumn) + hasAlias = true + } + if r.EntityAliasMetadataUsername != "" { + hasOIDCUsername = true + } + if hasAlias && hasOIDCUsername { break } } + if hasAlias { + cols = append(cols, aliasColumn) + } + if hasOIDCUsername { + cols = append(cols, oidcUsernameColumn) + } for _, r := range records { for i, c := range cols { diff --git a/internal/renderer/renderer_test.go b/internal/renderer/renderer_test.go index 383b91c..f775f60 100644 --- a/internal/renderer/renderer_test.go +++ b/internal/renderer/renderer_test.go @@ -46,8 +46,8 @@ func TestPrintTable_RendersRows(t *testing.T) { if !strings.Contains(out, "Namespace Path") { t.Error("expected header 'Namespace Path'") } - if !strings.Contains(out, "Client Type") { - t.Error("expected header 'Client Type'") + if strings.Contains(out, "Client Type") { + t.Error("unexpected header 'Client Type' — removed from table output") } // Data rows present @@ -57,9 +57,6 @@ func TestPrintTable_RendersRows(t *testing.T) { if !strings.Contains(out, "education/") { t.Error("expected namespace 'education/'") } - if !strings.Contains(out, "non-entity") { - t.Error("expected client type 'non-entity'") - } } func TestPrintTable_ZeroTimeFmtDash(t *testing.T) { From 0de034f567c5aa0fe7b366a9a22552bf108e51af Mon Sep 17 00:00:00 2001 From: Andrew Thielen Date: Thu, 21 May 2026 16:21:21 -0500 Subject: [PATCH 2/8] Add abandoned record filtering --- README.md | 14 ++++ cmd/vault-csv-normalizer/main.go | 15 ++++ internal/normalizer/normalizer.go | 96 ++++++++++++++++++-------- 
internal/normalizer/normalizer_test.go | 88 ++++++++++++++++------- internal/parser/parser.go | 3 + internal/parser/parser_test.go | 8 ++- 6 files changed, 166 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index ff8f84c..66c0b46 100644 --- a/README.md +++ b/README.md @@ -111,11 +111,19 @@ OPTIONS: counted twice when they authenticate via both LDAP/OIDC and JWT. Records without an alias are always kept. May be combined with --dedup-alias, --dedup-methods, and/or -d. + -remove-abandoned-clients + Remove abandoned clients where entity_name and entity_alias_name are + both blank. This includes records with no auth mount (mount_path + empty) and merged/deleted entities (mount_path present). Applied after + all deduplication steps. -per-file Print a summary for each input file before the combined summary -debug Print all records grouped by mount path, with a full record table under each mount. Records with no mount path are grouped as "(no mount)". + Also prints how many records were removed by + --remove-abandoned-clients when that flag is enabled, split into + no-mount and merged/deleted buckets. 
-help Show usage information ``` @@ -178,6 +186,12 @@ vault-csv-normalizer -f export.csv --dedup-jwt # Full dedup: collapse tiers, dedup client_ids, then drop redundant JWT records vault-csv-normalizer -f jan.csv feb.csv --dedup-alias -d --dedup-jwt +# Remove abandoned clients from final totals +vault-csv-normalizer -f export.csv --remove-abandoned-clients + +# Same as above, with debug count output for removed rows +vault-csv-normalizer -f export.csv --remove-abandoned-clients --debug + # Deduplicate LDAP and OIDC as one identity group — same person via either # method is counted once; other auth methods are unaffected vault-csv-normalizer -f export.csv --dedup-methods ldap,oidc diff --git a/cmd/vault-csv-normalizer/main.go b/cmd/vault-csv-normalizer/main.go index 97b64bb..75ebddd 100644 --- a/cmd/vault-csv-normalizer/main.go +++ b/cmd/vault-csv-normalizer/main.go @@ -50,6 +50,7 @@ func main() { var dedup bool var dedupAlias bool var dedupJWT bool + var removeAbandonedClients bool var listMethods bool var debugMode bool var perFile bool @@ -67,6 +68,7 @@ func main() { flag.Var(&dedupMethods, "dedup-methods", "Deduplicate by alias for the specified comma-separated auth methods, treating them as one identity group. Repeatable to define multiple groups (e.g. -dedup-methods ldap,oidc -dedup-methods jwt,saml).") flag.Var(&dedupMethodsPerFile, "dedup-methods-per-file", "Like --dedup-methods but scoped to each input file independently. Records in different files are never collapsed against each other. Repeatable to define multiple groups.") flag.BoolVar(&dedupJWT, "dedup-jwt", false, "Drop JWT records whose normalized alias matches a non-JWT record in the same file (prevents counting the same person via both LDAP/OIDC and JWT)") + flag.BoolVar(&removeAbandonedClients, "remove-abandoned-clients", false, "Remove abandoned clients (blank entity_name and entity_alias_name) after deduplication. 
Includes records with no auth mount and merged/deleted entities.") flag.BoolVar(&listMethods, "list-methods", false, "Print every distinct auth method found in the input files (with record counts and alias coverage), then exit. Useful for deciding --dedup-methods groups.") flag.BoolVar(&debugMode, "debug", false, "Print all records grouped by mount path") flag.BoolVar(&perFile, "per-file", false, "Print a summary for each input file before the combined summary") @@ -224,6 +226,16 @@ func main() { normalized = normalizer.DeduplicateJWT(normalized) } + removedAbandonedCounts := normalizer.AbandonedClientCounts{} + if removeAbandonedClients { + normalized, removedAbandonedCounts = normalizer.FilterAbandonedClients(normalized) + + fmt.Fprintf(os.Stdout, "Removed abandoned clients (total): %d\n", removedAbandonedCounts.Total()) + fmt.Fprintf(os.Stdout, " no auth mount (mount path empty): %d\n", removedAbandonedCounts.NoMount) + fmt.Fprintf(os.Stdout, " merged/deleted (mount path present): %d\n", removedAbandonedCounts.MergedDeleted) + fmt.Fprintln(os.Stdout, strings.Repeat("-", 70)) + } + // Apply filters. if filterNS != "" { normalized = normalizer.FilterByNamespace(normalized, filterNS) @@ -445,6 +457,9 @@ EXAMPLES: # Per-file since filters on multiple files vault-csv-normalizer -f jan.csv feb.csv --since-file jan.csv=2024-01-15 --since-file feb.csv=2024-02-01 + # Remove abandoned clients (blank entity fields) + vault-csv-normalizer -f export.csv --remove-abandoned-clients + CSV FORMAT (Vault activity export): Expected columns (order-independent, case-insensitive): client_id, namespace_id, namespace_path, mount_accessor, mount_path, diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index efd20fd..8a6d573 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -14,15 +14,16 @@ import ( // Record is a fully normalized Vault client record. 
type Record struct { - Source string - ClientID string - NamespaceID string - NamespacePath string - MountAccessor string - MountPath string - MountType string - AuthMethod string - ClientType string // normalized: entity | non-entity | acme | secret-sync | unknown + Source string + ClientID string + EntityName string + NamespaceID string + NamespacePath string + MountAccessor string + MountPath string + MountType string + AuthMethod string + ClientType string // normalized: entity | non-entity | acme | secret-sync | unknown TokenCreationTime time.Time ClientFirstUsageTime time.Time EntityAliasName string @@ -31,14 +32,14 @@ type Record struct { // supportedSortKeys lists columns accepted by Sort. var supportedSortKeys = map[string]bool{ - "namespace_path": true, - "client_type": true, - "token_creation_time": true, + "namespace_path": true, + "client_type": true, + "token_creation_time": true, "client_first_usage_time": true, - "mount_accessor": true, - "mount_path": true, - "auth_method": true, - "source": true, + "mount_accessor": true, + "mount_path": true, + "auth_method": true, + "source": true, } // Normalize converts a slice of raw records into normalized records. @@ -54,6 +55,7 @@ func normalizeOne(r parser.RawRecord) Record { return Record{ Source: r.Source, ClientID: r.ClientID, + EntityName: strings.TrimSpace(r.EntityName), NamespaceID: normalizeNamespaceID(r.NamespaceID), NamespacePath: normalizeNamespacePath(r.NamespacePath), MountAccessor: strings.TrimSpace(r.MountAccessor), @@ -103,20 +105,20 @@ func normalizeMountPath(path string) string { // clientTypeAliases maps various raw strings to a canonical client type. 
var clientTypeAliases = map[string]string{ - "entity": "entity", - "entity client": "entity", - "non-entity": "non-entity", - "non_entity": "non-entity", - "non-entity client": "non-entity", - "non_entity_client": "non-entity", - "nonentity": "non-entity", - "acme": "acme", - "acme client": "acme", - "secret-sync": "secret-sync", - "secret_sync": "secret-sync", - "secretsync": "secret-sync", - "secrets sync": "secret-sync", - "secret sync": "secret-sync", + "entity": "entity", + "entity client": "entity", + "non-entity": "non-entity", + "non_entity": "non-entity", + "non-entity client": "non-entity", + "non_entity_client": "non-entity", + "nonentity": "non-entity", + "acme": "acme", + "acme client": "acme", + "secret-sync": "secret-sync", + "secret_sync": "secret-sync", + "secretsync": "secret-sync", + "secrets sync": "secret-sync", + "secret sync": "secret-sync", } func normalizeClientType(raw string) string { @@ -636,6 +638,40 @@ func FilterByClientType(records []Record, clientType string) []Record { return out } +// AbandonedClientCounts reports how many anonymous records were removed by +// FilterAbandonedClients, split by whether an auth mount is present. +type AbandonedClientCounts struct { + NoMount int + MergedDeleted int +} + +// Total returns the sum of removed abandoned-client records. 
+func (c AbandonedClientCounts) Total() int { + return c.NoMount + c.MergedDeleted +} + +// FilterAbandonedClients removes records with no entity identity (both +// entity_name and entity_alias_name are blank) and reports separate counts for +// two cases: +// - NoMount: mount_path is blank (auth mount no longer exists) +// - MergedDeleted: mount_path is present (entity was likely merged/deleted) +func FilterAbandonedClients(records []Record) ([]Record, AbandonedClientCounts) { + out := make([]Record, 0, len(records)) + counts := AbandonedClientCounts{} + for _, r := range records { + if r.EntityName == "" && r.EntityAliasName == "" { + if r.MountPath == "" { + counts.NoMount++ + continue + } + counts.MergedDeleted++ + continue + } + out = append(out, r) + } + return out, counts +} + // Sort sorts records in-place by the given column key. Returns an error if // the key is not recognized. func Sort(records []Record, by string) error { diff --git a/internal/normalizer/normalizer_test.go b/internal/normalizer/normalizer_test.go index 68adaa1..18a919b 100644 --- a/internal/normalizer/normalizer_test.go +++ b/internal/normalizer/normalizer_test.go @@ -1,6 +1,7 @@ package normalizer import ( + "strings" "testing" "time" @@ -73,15 +74,16 @@ func TestParseTime(t *testing.T) { func TestNormalize(t *testing.T) { raw := []parser.RawRecord{ { - Source: "jan.csv", - ClientID: "abc-123", - NamespaceID: "", - NamespacePath: "root", - MountPath: "auth/approle", - MountType: "APPROLE", - AuthMethod: "AppRole", - ClientType: "non_entity", - TokenCreationTime: "2024-01-01T00:00:00Z", + Source: "jan.csv", + ClientID: "abc-123", + EntityName: " Alice Smith ", + NamespaceID: "", + NamespacePath: "root", + MountPath: "auth/approle", + MountType: "APPROLE", + AuthMethod: "AppRole", + ClientType: "non_entity", + TokenCreationTime: "2024-01-01T00:00:00Z", }, } records := Normalize(raw) @@ -89,6 +91,9 @@ func TestNormalize(t *testing.T) { t.Fatalf("expected 1 record, got %d", len(records)) } 
r := records[0] + if r.EntityName != "Alice Smith" { + t.Errorf("EntityName: got %q, want Alice Smith", r.EntityName) + } if r.NamespacePath != "[root]" { t.Errorf("NamespacePath: got %q, want [root]", r.NamespacePath) } @@ -139,6 +144,40 @@ func TestFilterByClientType(t *testing.T) { } } +func TestFilterAbandonedClients(t *testing.T) { + records := []Record{ + // removed as merged/deleted: mount path present + {ClientID: "drop-merged-1", EntityName: "", EntityAliasName: "", MountPath: "auth/ldap/", MountType: "ldap"}, + // removed as merged/deleted: mount path present even if mount type is blank + {ClientID: "drop-merged-2", EntityName: "", EntityAliasName: "", MountPath: "auth/oidc/", MountType: ""}, + // removed as no mount: mount path missing + {ClientID: "drop-nomount-1", EntityName: "", EntityAliasName: "", MountPath: "", MountType: "ldap"}, + // keep: entity name present + {ClientID: "keep-3", EntityName: "Alice", EntityAliasName: "", MountPath: "auth/ldap/", MountType: "ldap"}, + // keep: entity alias present + {ClientID: "keep-4", EntityName: "", EntityAliasName: "alice", MountPath: "auth/ldap/", MountType: "ldap"}, + } + + out, counts := FilterAbandonedClients(records) + if counts.NoMount != 1 { + t.Fatalf("expected NoMount=1, got %d", counts.NoMount) + } + if counts.MergedDeleted != 2 { + t.Fatalf("expected MergedDeleted=2, got %d", counts.MergedDeleted) + } + if counts.Total() != 3 { + t.Fatalf("expected Total=3, got %d", counts.Total()) + } + if len(out) != 2 { + t.Fatalf("expected 2 records after filter, got %d", len(out)) + } + for _, r := range out { + if strings.HasPrefix(r.ClientID, "drop-") { + t.Fatal("drop-* records should have been removed") + } + } +} + func TestDeduplicate_PrefersNonEmptyMount(t *testing.T) { records := []Record{ {ClientID: "abc", MountPath: ""}, @@ -246,8 +285,8 @@ func TestStripTierSuffix(t *testing.T) { {"alice-t10", "alice-t10"}, {"alice-T0", "alice-T0"}, // case-sensitive {"alice", "alice"}, - {"-t0", ""}, // 
degenerate: only the suffix - {"t0", "t0"}, // no hyphen + {"-t0", ""}, // degenerate: only the suffix + {"t0", "t0"}, // no hyphen {"", ""}, } for _, c := range cases { @@ -285,7 +324,7 @@ func TestDeduplicateByAlias_CollapsesSameBaseAcrossAccessors(t *testing.T) { {ClientID: "3", EntityAliasName: "sbishop-t0", MountAccessor: "auth_ldap_abc123", Source: "jan.csv"}, // dup: tier stripped → "sbishop" {ClientID: "4", EntityAliasName: "sbishop-t1", MountAccessor: "auth_oidc_xyz789", Source: "jan.csv"}, // dup: tier stripped → "sbishop" {ClientID: "5", EntityAliasName: "sbishop", MountAccessor: "auth_ldap_abc123", Source: "feb.csv"}, // dup: same normalized alias across files - {ClientID: "6", EntityAliasName: ""}, // kept: blank always kept + {ClientID: "6", EntityAliasName: ""}, // kept: blank always kept } out := DeduplicateByAlias(records) if len(out) != 2 { @@ -324,7 +363,7 @@ func TestFindAliasDuplicates_SameBaseAcrossAccessors(t *testing.T) { {ClientID: "2", EntityAliasName: "sbishop@hashicorp.com", MountAccessor: "auth_jwt_def456", Source: "jan.csv"}, {ClientID: "3", EntityAliasName: "sbishop-t0", MountAccessor: "auth_ldap_abc123", Source: "jan.csv"}, {ClientID: "4", EntityAliasName: "sbishop", MountAccessor: "auth_ldap_abc123", Source: "feb.csv"}, // cross-file dup - {ClientID: "5", EntityAliasName: ""}, // ignored + {ClientID: "5", EntityAliasName: ""}, // ignored } groups := FindAliasDuplicates(records) if len(groups) != 1 { @@ -358,11 +397,11 @@ func TestDeduplicateByAlias_IgnoresPKIClients(t *testing.T) { // PKI clients are always kept regardless of alias duplication. // Non-PKI clients with the same base alias in the same file are deduplicated. 
records := []Record{ - {ClientID: "1", EntityAliasName: "abc-123", ClientType: "acme", Source: "jan.csv"}, // PKI, kept - {ClientID: "2", EntityAliasName: "abc-456", ClientType: "acme", Source: "jan.csv"}, // PKI, kept (not deduped) + {ClientID: "1", EntityAliasName: "abc-123", ClientType: "acme", Source: "jan.csv"}, // PKI, kept + {ClientID: "2", EntityAliasName: "abc-456", ClientType: "acme", Source: "jan.csv"}, // PKI, kept (not deduped) {ClientID: "3", EntityAliasName: "abc-789", MountAccessor: "auth_cert_xyz", Source: "jan.csv"}, // cert auth — PKI, kept - {ClientID: "4", EntityAliasName: "alice@corp", Source: "jan.csv"}, // non-PKI, first: kept - {ClientID: "5", EntityAliasName: "alice@example.com", Source: "jan.csv"}, // non-PKI dup: base "alice" already seen, dropped + {ClientID: "4", EntityAliasName: "alice@corp", Source: "jan.csv"}, // non-PKI, first: kept + {ClientID: "5", EntityAliasName: "alice@example.com", Source: "jan.csv"}, // non-PKI dup: base "alice" already seen, dropped } out := DeduplicateByAlias(records) if len(out) != 4 { @@ -547,7 +586,6 @@ func TestPartitionPKI_NoPKI(t *testing.T) { } } - func TestPartitionPKI_Empty(t *testing.T) { pki, nonPKI := PartitionPKI(nil, IsPKIClient) if pki != nil || nonPKI != nil { @@ -567,8 +605,8 @@ func TestFilterSincePerSource_FiltersTargetFileOnly(t *testing.T) { records := []Record{ // jan.csv: one record before cutoff, one after {ClientID: "j1", Source: "jan.csv", TokenCreationTime: jan15.Add(-24 * time.Hour)}, // before — excluded - {ClientID: "j2", Source: "jan.csv", TokenCreationTime: jan15}, // on cutoff — kept - {ClientID: "j3", Source: "jan.csv", TokenCreationTime: jan20}, // after — kept + {ClientID: "j2", Source: "jan.csv", TokenCreationTime: jan15}, // on cutoff — kept + {ClientID: "j3", Source: "jan.csv", TokenCreationTime: jan20}, // after — kept // feb.csv: not in filter map — all kept regardless of date {ClientID: "f1", Source: "feb.csv", TokenCreationTime: jan15.Add(-24 * time.Hour)}, // old 
but kept {ClientID: "f2", Source: "feb.csv", TokenCreationTime: feb01}, @@ -813,10 +851,10 @@ func TestDeduplicateByAlias_CollapseOIDCWithLDAP(t *testing.T) { // JWT remains a separate group and is not collapsed here. records := []Record{ {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dup: ldap/oidc group, normalizes to "alice" - {ClientID: "3", EntityAliasName: "alice-t0", MountType: "ldap", Source: "feb.csv"}, // dup: ldap/oidc group, tier stripped → "alice" - {ClientID: "4", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: jwt is a separate group - {ClientID: "5", EntityAliasName: "bob", MountType: "ldap", Source: "jan.csv"}, // kept: different alias + {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dup: ldap/oidc group, normalizes to "alice" + {ClientID: "3", EntityAliasName: "alice-t0", MountType: "ldap", Source: "feb.csv"}, // dup: ldap/oidc group, tier stripped → "alice" + {ClientID: "4", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: jwt is a separate group + {ClientID: "5", EntityAliasName: "bob", MountType: "ldap", Source: "jan.csv"}, // kept: different alias } out := DeduplicateByAlias(records) if len(out) != 3 { @@ -843,7 +881,7 @@ func TestDeduplicateByAlias_ScopedToMountType(t *testing.T) { // → they ARE collapsed. 
records := []Record{ {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice-t0", MountType: "ldap", Source: "jan.csv"}, // dup: same type + base + {ClientID: "2", EntityAliasName: "alice-t0", MountType: "ldap", Source: "jan.csv"}, // dup: same type + base {ClientID: "3", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: different mount type } out := DeduplicateByAlias(records) diff --git a/internal/parser/parser.go b/internal/parser/parser.go index aaa29f9..394b757 100644 --- a/internal/parser/parser.go +++ b/internal/parser/parser.go @@ -18,6 +18,7 @@ type RawRecord struct { Source string ClientID string + EntityName string NamespaceID string NamespacePath string MountAccessor string @@ -35,6 +36,7 @@ type RawRecord struct { // a canonical field name used by the column mapper below. var knownColumns = map[string]string{ "client_id": "client_id", + "entity_name": "entity_name", "namespace_id": "namespace_id", "namespace_path": "namespace_path", "mount_accessor": "mount_accessor", @@ -127,6 +129,7 @@ func parseReader(r io.Reader, source string) ([]RawRecord, error) { records = append(records, RawRecord{ Source: source, ClientID: clientID, + EntityName: get(row, "entity_name"), NamespaceID: get(row, "namespace_id"), NamespacePath: get(row, "namespace_path"), MountAccessor: get(row, "mount_accessor"), diff --git a/internal/parser/parser_test.go b/internal/parser/parser_test.go index e1a1f02..21d7e93 100644 --- a/internal/parser/parser_test.go +++ b/internal/parser/parser_test.go @@ -6,9 +6,9 @@ import ( ) func TestParseReader_StandardColumns(t *testing.T) { - csv := `client_id,namespace_id,namespace_path,mount_accessor,mount_path,mount_type,auth_method,client_type,token_creation_time,client_first_usage_time -abc-123,root,[root],auth_approle_abc,auth/approle/,approle,approle,entity,2024-01-15T10:00:00Z,2024-01-15T12:00:00Z 
-def-456,ns1,education/,auth_ldap_xyz,auth/ldap/,ldap,ldap,non-entity,2024-02-01T08:00:00Z, + csv := `client_id,entity_name,namespace_id,namespace_path,mount_accessor,mount_path,mount_type,auth_method,client_type,token_creation_time,client_first_usage_time +abc-123,Alice Smith,root,[root],auth_approle_abc,auth/approle/,approle,approle,entity,2024-01-15T10:00:00Z,2024-01-15T12:00:00Z +def-456,,ns1,education/,auth_ldap_xyz,auth/ldap/,ldap,ldap,non-entity,2024-02-01T08:00:00Z, ` records, err := parseReader(strings.NewReader(csv), "test.csv") if err != nil { @@ -20,6 +20,7 @@ def-456,ns1,education/,auth_ldap_xyz,auth/ldap/,ldap,ldap,non-entity,2024-02-01T r := records[0] assertEqual(t, "client_id", "abc-123", r.ClientID) + assertEqual(t, "entity_name", "Alice Smith", r.EntityName) assertEqual(t, "namespace_id", "root", r.NamespaceID) assertEqual(t, "namespace_path", "[root]", r.NamespacePath) assertEqual(t, "mount_accessor", "auth_approle_abc", r.MountAccessor) @@ -30,6 +31,7 @@ def-456,ns1,education/,auth_ldap_xyz,auth/ldap/,ldap,ldap,non-entity,2024-02-01T assertEqual(t, "client_first_usage_time", "2024-01-15T12:00:00Z", r.ClientFirstUsageTime) r2 := records[1] + assertEqual(t, "entity_name_empty", "", r2.EntityName) assertEqual(t, "client_first_usage_time_empty", "", r2.ClientFirstUsageTime) } From ee2570eab5ae8326a9f481c004c15244de830837 Mon Sep 17 00:00:00 2001 From: Andrew Thielen Date: Thu, 21 May 2026 16:35:32 -0500 Subject: [PATCH 3/8] Only filter entity clients --- internal/normalizer/normalizer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index 8a6d573..315fac5 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -659,7 +659,7 @@ func FilterAbandonedClients(records []Record) ([]Record, AbandonedClientCounts) out := make([]Record, 0, len(records)) counts := AbandonedClientCounts{} for _, r := range records { - if r.EntityName == 
"" && r.EntityAliasName == "" { + if r.EntityName == "" && r.EntityAliasName == "" && r.ClientType == "entity" { if r.MountPath == "" { counts.NoMount++ continue From 3a2e584ceb3a6471b6ae13512af0b32750e3efbe Mon Sep 17 00:00:00 2001 From: Andrew Thielen Date: Thu, 21 May 2026 17:14:54 -0500 Subject: [PATCH 4/8] Breakdown abandoned clients by category --- cmd/vault-csv-normalizer/main.go | 6 ++++-- internal/normalizer/normalizer.go | 16 +++++++++++++--- internal/normalizer/normalizer_test.go | 22 ++++++++++++++++------ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/cmd/vault-csv-normalizer/main.go b/cmd/vault-csv-normalizer/main.go index 75ebddd..93d9845 100644 --- a/cmd/vault-csv-normalizer/main.go +++ b/cmd/vault-csv-normalizer/main.go @@ -231,8 +231,10 @@ func main() { normalized, removedAbandonedCounts = normalizer.FilterAbandonedClients(normalized) fmt.Fprintf(os.Stdout, "Removed abandoned clients (total): %d\n", removedAbandonedCounts.Total()) - fmt.Fprintf(os.Stdout, " no auth mount (mount path empty): %d\n", removedAbandonedCounts.NoMount) - fmt.Fprintf(os.Stdout, " merged/deleted (mount path present): %d\n", removedAbandonedCounts.MergedDeleted) + fmt.Fprintf(os.Stdout, " no auth mount (mount path empty): %d (PKI: %d, non-PKI: %d)\n", + removedAbandonedCounts.NoMount, removedAbandonedCounts.NoMountPKI, removedAbandonedCounts.NoMount-removedAbandonedCounts.NoMountPKI) + fmt.Fprintf(os.Stdout, " merged/deleted (mount path present): %d (PKI: %d, non-PKI: %d)\n", + removedAbandonedCounts.MergedDeleted, removedAbandonedCounts.MergedDeletedPKI, removedAbandonedCounts.MergedDeleted-removedAbandonedCounts.MergedDeletedPKI) fmt.Fprintln(os.Stdout, strings.Repeat("-", 70)) } diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index 315fac5..df9a2d6 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -639,10 +639,13 @@ func FilterByClientType(records []Record, clientType string) 
[]Record { } // AbandonedClientCounts reports how many anonymous records were removed by -// FilterAbandonedClients, split by whether an auth mount is present. +// FilterAbandonedClients, split by whether an auth mount is present and +// whether the record is a PKI client. type AbandonedClientCounts struct { - NoMount int - MergedDeleted int + NoMount int + NoMountPKI int + MergedDeleted int + MergedDeletedPKI int } // Total returns the sum of removed abandoned-client records. @@ -660,11 +663,18 @@ func FilterAbandonedClients(records []Record) ([]Record, AbandonedClientCounts) counts := AbandonedClientCounts{} for _, r := range records { if r.EntityName == "" && r.EntityAliasName == "" && r.ClientType == "entity" { + pki := IsPKIClient(r) if r.MountPath == "" { counts.NoMount++ + if pki { + counts.NoMountPKI++ + } continue } counts.MergedDeleted++ + if pki { + counts.MergedDeletedPKI++ + } continue } out = append(out, r) diff --git a/internal/normalizer/normalizer_test.go b/internal/normalizer/normalizer_test.go index 18a919b..e25fd5a 100644 --- a/internal/normalizer/normalizer_test.go +++ b/internal/normalizer/normalizer_test.go @@ -152,6 +152,10 @@ func TestFilterAbandonedClients(t *testing.T) { {ClientID: "drop-merged-2", EntityName: "", EntityAliasName: "", MountPath: "auth/oidc/", MountType: ""}, // removed as no mount: mount path missing {ClientID: "drop-nomount-1", EntityName: "", EntityAliasName: "", MountPath: "", MountType: "ldap"}, + // removed as merged/deleted PKI (auth_cert accessor, mount present) + {ClientID: "drop-merged-pki-1", EntityName: "", EntityAliasName: "", MountPath: "auth/cert/", MountType: "cert", MountAccessor: "auth_cert_abc123"}, + // removed as no-mount PKI (auth_cert accessor, mount missing) + {ClientID: "drop-nomount-pki-1", EntityName: "", EntityAliasName: "", MountPath: "", MountType: "cert", MountAccessor: "auth_cert_xyz789"}, // keep: entity name present {ClientID: "keep-3", EntityName: "Alice", EntityAliasName: "", MountPath: 
"auth/ldap/", MountType: "ldap"}, // keep: entity alias present @@ -159,14 +163,20 @@ func TestFilterAbandonedClients(t *testing.T) { } out, counts := FilterAbandonedClients(records) - if counts.NoMount != 1 { - t.Fatalf("expected NoMount=1, got %d", counts.NoMount) + if counts.NoMount != 2 { + t.Fatalf("expected NoMount=2, got %d", counts.NoMount) } - if counts.MergedDeleted != 2 { - t.Fatalf("expected MergedDeleted=2, got %d", counts.MergedDeleted) + if counts.NoMountPKI != 1 { + t.Fatalf("expected NoMountPKI=1, got %d", counts.NoMountPKI) } - if counts.Total() != 3 { - t.Fatalf("expected Total=3, got %d", counts.Total()) + if counts.MergedDeleted != 3 { + t.Fatalf("expected MergedDeleted=3, got %d", counts.MergedDeleted) + } + if counts.MergedDeletedPKI != 1 { + t.Fatalf("expected MergedDeletedPKI=1, got %d", counts.MergedDeletedPKI) + } + if counts.Total() != 5 { + t.Fatalf("expected Total=5, got %d", counts.Total()) } if len(out) != 2 { t.Fatalf("expected 2 records after filter, got %d", len(out)) From ae5d444d9b48e064ecb89fe16d07f1f067e8b6b6 Mon Sep 17 00:00:00 2001 From: Schuyler Bishop Date: Tue, 26 May 2026 18:36:51 -0500 Subject: [PATCH 5/8] Cleaning out options we don't want to present to customers --- .gitignore | 1 + README.md | 139 ++---- cmd/vault-csv-normalizer/main.go | 210 ++------- internal/normalizer/normalizer.go | 248 +---------- internal/normalizer/normalizer_test.go | 565 +------------------------ 5 files changed, 73 insertions(+), 1090 deletions(-) diff --git a/.gitignore b/.gitignore index 7a00b69..f0f15bf 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ # Claude .claude/ CLAUDE.md +CLAUDE.local.md # Test data outputs testdata/out.csv diff --git a/README.md b/README.md index 66c0b46..606ce87 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ versions), and displays a summary of client counts by mount path and type. 
- Normalizes **namespace paths** (empty/`root` → `[root]`, ensures trailing `/`) - Normalizes **mount paths** (ensures trailing `/`) - Normalizes **timestamps** to UTC across all common Vault timestamp formats -- **Deduplicates** clients across files by `client_id` when `-d` is set, by normalized `entity_alias_name` (`--dedup-alias`), or by alias within explicit auth-method groups (`--dedup-methods ldap,oidc`); alias normalization strips domain suffixes (`@corp.com`) and tier suffixes (`-t0`/`-t1`/`-t2`) +- **Deduplicates** clients within each file by alias within explicit auth-method groups (`--dedup-methods-per-file ldap,oidc`); alias normalization strips domain suffixes (`@corp.com`) - **Filters** by namespace (substring) or client type - **Sorts** by any column - Prints a **summary** with counts broken down by mount path and client type @@ -66,51 +66,31 @@ OPTIONS: Apply a since filter to one specific file only. May be specified multiple times for different files. The filename is matched against the base name (e.g. jan.csv=2024-01-15). - -d Deduplicate records by client_id across all input files. - -dedup-alias - Deduplicate by entity_alias_name within the same identity group across - all input files. LDAP and OIDC are treated as one group (the same - person typically has the same username in both). Two records are - considered the same client if they share the same normalized alias AND - belong to the same identity group, regardless of mount accessor or - source file. Normalization strips the domain suffix (at '@') and any - trailing tier suffix (-t0, -t1, -t2), so "sbishop" (LDAP), "sbishop-t0" - (LDAP, another file), and "sbishop@corp.com" (OIDC) → one client. - JWT is a separate group and is not collapsed here; use --dedup-jwt for - JWT vs LDAP/OIDC dedup. - Duplicate groups are printed as a table before the summary. - Records without an alias are always kept. May be combined with -d. - -dedup-methods method1,method2,... 
- Apply alias deduplication (same normalization as --dedup-alias) but - only for records whose auth method appears in the specified - comma-separated group. Methods in the same group are treated as one - identity — a person authenticating via any of them is counted once. - Records whose auth method is not in any group pass through unchanged. + -dedup-methods-per-file method1,method2,... + Deduplicate by alias for records whose auth method appears in the + specified comma-separated group, scoped to each input file + independently. Records in different files with the same alias are NOT + collapsed — only within-file duplicates are removed. Normalization + strips domain suffixes (at '@') only; tier suffixes (-t0/-t1/-t2) are + kept. Records whose auth method is not in any group pass through + unchanged. The flag is repeatable; each use defines one independent group: - -dedup-methods ldap,oidc - Deduplicate LDAP and OIDC as one identity group. "alice" (LDAP), - "alice@corp.com" (OIDC), and "alice-t0" (LDAP) all normalize to - "alice" and are counted once. JWT records are unaffected. - - -dedup-methods ldap,oidc,jwt - Treat LDAP, OIDC, and JWT together as one group. - - -dedup-methods ldap,oidc -dedup-methods jwt,saml - Two independent groups: {ldap,oidc} and {jwt,saml}. Records in - different groups are never collapsed against each other. - - Duplicate groups are printed as a table before the summary (same - format as --dedup-alias). Records without an alias and PKI clients are - always kept. May be combined with --dedup-alias, --dedup-jwt, and/or -d. - -dedup-jwt - Drop JWT records whose normalized alias matches a non-JWT record across - any input file. Uses the same normalization as --dedup-alias (strips - '@domain' and '-t0'/'-t1'/'-t2'). Prevents the same person from being - counted twice when they authenticate via both LDAP/OIDC and JWT. - Records without an alias are always kept. May be combined with - --dedup-alias, --dedup-methods, and/or -d. 
+ -dedup-methods-per-file ldap,oidc + Within each file, collapse LDAP and OIDC records that share the + same alias. "alice" (LDAP) and "alice@corp.com" (OIDC) in the + same file normalize to "alice" and are counted once. A user in + jan.csv and feb.csv is NOT collapsed — counted once per file. + + -dedup-methods-per-file ldap,oidc,jwt + Treat LDAP, OIDC, and JWT as one group within each file. + + -dedup-methods-per-file ldap,oidc -dedup-methods-per-file jwt,saml + Two independent per-file groups. + + Duplicate groups are printed as a table before the summary. Records + without an alias and PKI clients are always kept. -remove-abandoned-clients Remove abandoned clients where entity_name and entity_alias_name are both blank. This includes records with no auth mount (mount_path @@ -169,41 +149,20 @@ vault-csv-normalizer -f jan.csv feb.csv --per-file # Debug: show all records grouped by mount path vault-csv-normalizer -f export.csv --debug -# Deduplicate client_ids across files -vault-csv-normalizer -f jan.csv feb.csv -d - -# Deduplicate by entity alias — strips domain (@corp.com) and tier (-t0/-t1/-t2) -# "alice", "alice-t0", "alice-t1", "alice@corp.com" → counted as one client per file -vault-csv-normalizer -f jan.csv feb.csv --dedup-alias - -# Combine both: alias dedup collapses tier/domain variants within each file, -# then -d deduplicates the same client_id appearing across multiple files -vault-csv-normalizer -f jan.csv feb.csv --dedup-alias -d - -# Drop JWT records where the same person already appears via LDAP or OIDC -vault-csv-normalizer -f export.csv --dedup-jwt - -# Full dedup: collapse tiers, dedup client_ids, then drop redundant JWT records -vault-csv-normalizer -f jan.csv feb.csv --dedup-alias -d --dedup-jwt - # Remove abandoned clients from final totals vault-csv-normalizer -f export.csv --remove-abandoned-clients # Same as above, with debug count output for removed rows vault-csv-normalizer -f export.csv --remove-abandoned-clients --debug -# 
Deduplicate LDAP and OIDC as one identity group — same person via either -# method is counted once; other auth methods are unaffected -vault-csv-normalizer -f export.csv --dedup-methods ldap,oidc +# Within each file, collapse LDAP and OIDC records with the same alias +vault-csv-normalizer -f jan.csv feb.csv --dedup-methods-per-file ldap,oidc -# Treat LDAP, OIDC, and JWT together as one human-identity group -vault-csv-normalizer -f export.csv --dedup-methods ldap,oidc,jwt +# Treat LDAP, OIDC, and JWT as one group within each file +vault-csv-normalizer -f jan.csv feb.csv --dedup-methods-per-file ldap,oidc,jwt -# Two independent groups: {ldap,oidc} and {jwt,saml} -vault-csv-normalizer -f export.csv -dedup-methods ldap,oidc --dedup-methods jwt,saml - -# Method-scoped dedup combined with client_id dedup -vault-csv-normalizer -f jan.csv feb.csv --dedup-methods ldap,oidc -d +# Two independent per-file groups: {ldap,oidc} and {jwt,saml} +vault-csv-normalizer -f jan.csv feb.csv --dedup-methods-per-file ldap,oidc --dedup-methods-per-file jwt,saml # Exclude records created before 2024-06-01 vault-csv-normalizer -f export.csv --since 2024-06-01 @@ -255,50 +214,26 @@ PKI Client Summary ## Alias-based deduplication Vault can record the same human as multiple clients when they authenticate via -different auth methods (e.g. LDAP in one session and OIDC in another) or as -tiered accounts (`alice`, `alice-t0`, `alice-t1`). The alias-based dedup flags -collapse these into a single count. +different auth methods (e.g. LDAP in one session and OIDC in another). +`--dedup-methods-per-file` collapses these into a single count within each file. ### Alias normalization -All alias-based dedup paths apply the same two-step normalization before -comparing: +`--dedup-methods-per-file` applies one normalization step before comparing: 1. **Strip domain suffix** — everything from `@` onward is removed. `alice@corp.com` → `alice` -2. 
**Strip tier suffix** — trailing `-t0`, `-t1`, or `-t2` is removed. - `alice-t0` → `alice` - -So `alice`, `alice-t0`, `alice-t1`, `alice@corp.com`, and `alice-t0@corp.com` -all normalize to `alice` and are treated as the same person. - -### Choosing a dedup flag - -| Flag | What it collapses | What it leaves separate | -|---|---|---| -| `--dedup-alias` | All auth methods, grouped so LDAP=OIDC; each other type is its own group | JWT vs LDAP/OIDC | -| `--dedup-methods ldap,oidc` | Only LDAP and OIDC, as one explicit group | Everything else untouched | -| `--dedup-methods ldap,oidc,jwt` | LDAP, OIDC, and JWT as one group | Everything else untouched | -| `--dedup-jwt` | JWT records that match an existing LDAP/OIDC alias | Non-JWT records | - -These flags are independent and can be combined. A common production workflow: -```bash -# Count human users once, across LDAP and OIDC, then remove JWT duplicates, -# then collapse the same client_id appearing across multiple monthly exports -vault-csv-normalizer -f jan.csv feb.csv mar.csv \ - --dedup-methods ldap,oidc \ - --dedup-jwt \ - -d -``` +Tier suffixes (`-t0`, `-t1`, `-t2`) are **not** stripped — `alice-t0` and +`alice-t1` are treated as distinct identities within a file. 
### Auth methods reference | `mount_type` / `auth_method` | Typical users | Notes | |---|---|---| | `ldap` | Humans | Aliases usually bare usernames (`alice`) or tiered (`alice-t0`) | -| `oidc` | Humans | Aliases usually `username@domain.com` — normalize to same base as LDAP | -| `jwt` | Humans or services | May share aliases with LDAP/OIDC; use `--dedup-jwt` or `--dedup-methods` | +| `oidc` | Humans | Aliases usually `username@domain.com` — strip domain to match LDAP | +| `jwt` | Humans or services | May share aliases with LDAP/OIDC; include in group to collapse | | `approle` | Service accounts | Not human; not typically alias-deduped | | `kubernetes` | Service accounts | Not human; not typically alias-deduped | | `aws` / `gcp` | Service accounts | Not human; not typically alias-deduped | @@ -329,7 +264,7 @@ The tool expects CSVs exported from the Vault activity export API | `client_type` | No | Type of client (entity, non-entity, acme, etc.) | | `token_creation_time` | No | RFC3339 timestamp of token creation | | `client_first_usage_time`| No | RFC3339 timestamp of first authenticated call | -| `entity_alias_name` | No | Human-readable alias for the entity (used by `--dedup-alias` and `--dedup-methods`; domain and tier suffixes are stripped during normalization) | +| `entity_alias_name` | No | Human-readable alias for the entity (used by `--dedup-methods-per-file`; domain suffix is stripped during normalization) | ### Supported Column Aliases diff --git a/cmd/vault-csv-normalizer/main.go b/cmd/vault-csv-normalizer/main.go index 93d9845..cd56b06 100644 --- a/cmd/vault-csv-normalizer/main.go +++ b/cmd/vault-csv-normalizer/main.go @@ -39,7 +39,6 @@ func (f fileDateFlag) Set(v string) error { func main() { var inputFiles multiFlag - var dedupMethods multiFlag var dedupMethodsPerFile multiFlag var sortBy string var filterNS string @@ -47,9 +46,6 @@ func main() { var filterSince string var filterSinceFile = make(fileDateFlag) var countPKI bool - var dedup bool - var 
dedupAlias bool - var dedupJWT bool var removeAbandonedClients bool var listMethods bool var debugMode bool @@ -63,13 +59,9 @@ func main() { flag.StringVar(&filterSince, "since", "", "Exclude records with a token_creation_time before this value (e.g. 2024-01-01 or 2024-01-01T00:00:00Z)") flag.Var(&filterSinceFile, "since-file", "Apply a since filter to one file only: filename=date. May be specified multiple times for different files.") flag.BoolVar(&countPKI, "p", false, "Partition and report PKI/cert clients (client_type=acme or mount_accessor prefix auth_cert) separately") - flag.BoolVar(&dedup, "d", false, "Deduplicate records by client_id across all input files") - flag.BoolVar(&dedupAlias, "dedup-alias", false, "Deduplicate by entity_alias_name (strips domain and -t0/-t1/-t2 tier suffixes; records without an alias are always kept; may be combined with -d)") - flag.Var(&dedupMethods, "dedup-methods", "Deduplicate by alias for the specified comma-separated auth methods, treating them as one identity group. Repeatable to define multiple groups (e.g. -dedup-methods ldap,oidc -dedup-methods jwt,saml).") - flag.Var(&dedupMethodsPerFile, "dedup-methods-per-file", "Like --dedup-methods but scoped to each input file independently. Records in different files are never collapsed against each other. Repeatable to define multiple groups.") - flag.BoolVar(&dedupJWT, "dedup-jwt", false, "Drop JWT records whose normalized alias matches a non-JWT record in the same file (prevents counting the same person via both LDAP/OIDC and JWT)") + flag.Var(&dedupMethodsPerFile, "dedup-methods-per-file", "Deduplicate by alias for the specified comma-separated auth methods, scoped to each input file independently. Records in different files are never collapsed against each other. Repeatable to define multiple groups.") flag.BoolVar(&removeAbandonedClients, "remove-abandoned-clients", false, "Remove abandoned clients (blank entity_name and entity_alias_name) after deduplication. 
Includes records with no auth mount and merged/deleted entities.") - flag.BoolVar(&listMethods, "list-methods", false, "Print every distinct auth method found in the input files (with record counts and alias coverage), then exit. Useful for deciding --dedup-methods groups.") + flag.BoolVar(&listMethods, "list-methods", false, "Print every distinct auth method found in the input files (with record counts and alias coverage), then exit. Useful for deciding --dedup-methods-per-file groups.") flag.BoolVar(&debugMode, "debug", false, "Print all records grouped by mount path") flag.BoolVar(&perFile, "per-file", false, "Print a summary for each input file before the combined summary") flag.BoolVar(&showHelp, "help", false, "Show usage information") @@ -119,22 +111,6 @@ func main() { os.Exit(0) } - // Parse --dedup-methods values into groups. Each flag value is a - // comma-separated list of mount types that form one identity group. - var methodGroups [][]string - for _, val := range dedupMethods { - var group []string - for _, m := range strings.Split(val, ",") { - m = strings.TrimSpace(strings.ToLower(m)) - if m != "" { - group = append(group, m) - } - } - if len(group) > 0 { - methodGroups = append(methodGroups, group) - } - } - var methodGroupsPerFile [][]string for _, val := range dedupMethodsPerFile { var group []string @@ -149,39 +125,7 @@ func main() { } } - // Snapshot pre-dedup records so debug mode can show alias groups from the - // original data regardless of which dedup flags are active. 
preDedup := normalized - if dedupAlias { - groups := normalizer.FindAliasDuplicates(preDedup) - if len(groups) > 0 { - fmt.Fprintf(os.Stdout, "Alias duplicates found (%d group(s))\n", len(groups)) - fmt.Fprintln(os.Stdout, "=====================================") - for _, group := range groups { - r0 := group[0] - fmt.Fprintf(os.Stdout, "\nAlias group: %q file: %s\n", - normalizer.StripTierSuffix(normalizer.BaseAlias(r0.EntityAliasName)), filepath.Base(r0.Source)) - renderer.PrintTable(os.Stdout, group) - } - fmt.Fprintln(os.Stdout) - } - normalized = normalizer.DeduplicateByAlias(normalized) - } - if len(methodGroups) > 0 { - groups := normalizer.FindAliasDuplicatesForMethods(preDedup, methodGroups) - if len(groups) > 0 { - fmt.Fprintf(os.Stdout, "Method-scoped alias duplicates found (%d group(s))\n", len(groups)) - fmt.Fprintln(os.Stdout, "================================================") - for _, group := range groups { - r0 := group[0] - fmt.Fprintf(os.Stdout, "\nAlias group: %q file: %s\n", - normalizer.StripTierSuffix(normalizer.BaseAlias(r0.EntityAliasName)), filepath.Base(r0.Source)) - renderer.PrintTable(os.Stdout, group) - } - fmt.Fprintln(os.Stdout) - } - normalized = normalizer.DeduplicateByAliasForMethods(normalized, methodGroups) - } if len(methodGroupsPerFile) > 0 { groups := normalizer.FindAliasDuplicatesForMethodsPerFile(preDedup, methodGroupsPerFile) if len(groups) > 0 { @@ -198,34 +142,6 @@ func main() { normalized = normalizer.DeduplicateByAliasForMethodsPerFile(normalized, methodGroupsPerFile) } - // Collect -d dedup statistics before running so debug mode can report - // exactly which client_ids were (or weren't) collapsed. 
- var clientIDDupsBefore int - var clientIDDupsAfter int - var clientIDDupMap map[string]int // client_id → count of input records - if dedup && debugMode { - clientIDDupsBefore = len(normalized) - idCount := make(map[string]int, len(normalized)) - for _, r := range normalized { - idCount[r.ClientID]++ - } - clientIDDupMap = make(map[string]int) - for id, n := range idCount { - if n > 1 { - clientIDDupMap[id] = n - } - } - } - if dedup { - normalized = normalizer.Deduplicate(normalized) - if debugMode { - clientIDDupsAfter = len(normalized) - } - } - if dedupJWT { - normalized = normalizer.DeduplicateJWT(normalized) - } - removedAbandonedCounts := normalizer.AbandonedClientCounts{} if removeAbandonedClients { normalized, removedAbandonedCounts = normalizer.FilterAbandonedClients(normalized) @@ -261,46 +177,6 @@ func main() { } if debugMode { - // Show -d dedup results so the user can see which client_ids were (or - // weren't) collapsed, and understand why records still appear after dedup. - if dedup { - collapsed := clientIDDupsBefore - clientIDDupsAfter - fmt.Fprintf(os.Stdout, "Debug: -d client_id dedup — before: %d after: %d collapsed: %d\n", - clientIDDupsBefore, clientIDDupsAfter, collapsed) - fmt.Fprintln(os.Stdout, strings.Repeat("-", 70)) - if len(clientIDDupMap) > 0 { - dupIDs := make([]string, 0, len(clientIDDupMap)) - for id := range clientIDDupMap { - dupIDs = append(dupIDs, id) - } - sort.Strings(dupIDs) - for _, id := range dupIDs { - fmt.Fprintf(os.Stdout, " %s (x%d → kept 1)\n", id, clientIDDupMap[id]) - } - } else { - fmt.Fprintln(os.Stdout, " (no duplicate client_ids found)") - } - fmt.Fprintln(os.Stdout) - } - - // Show alias groups from the original (pre-dedup) data so the user can - // see aliasing context regardless of which dedup flags are active. - // Skip when -dedup-alias is set because it already printed these above. 
- if !dedupAlias { - groups := normalizer.FindAliasDuplicates(preDedup) - if len(groups) > 0 { - fmt.Fprintf(os.Stdout, "Debug: alias groups in input data (%d group(s))\n", len(groups)) - fmt.Fprintln(os.Stdout, "===============================================") - for _, group := range groups { - r0 := group[0] - fmt.Fprintf(os.Stdout, "\nAlias group: %q file: %s\n", - normalizer.StripTierSuffix(normalizer.BaseAlias(r0.EntityAliasName)), filepath.Base(r0.Source)) - renderer.PrintTable(os.Stdout, group) - } - fmt.Fprintln(os.Stdout) - } - } - // Group final (post-dedup) records by mount path. var mountOrder []string byMount := make(map[string][]normalizer.Record) @@ -321,7 +197,7 @@ func main() { fmt.Fprintf(os.Stdout, "\nMount: %s (%d record(s))\n", mp, len(group)) renderer.PrintTable(os.Stdout, group) // Flag records within this mount that share an entity alias but have - // different client_ids — these are candidates for -dedup-alias. + // different client_ids — use --dedup-methods-per-file to collapse them. if len(group) > 1 { aliasToIDs := make(map[string][]string) for _, r := range group { @@ -335,7 +211,7 @@ func main() { if len(ids) < 2 { continue } - // Check that not all client_ids are the same (already handled by -d). + // Check that not all client_ids are the same. allSame := true for _, id := range ids[1:] { if id != ids[0] { @@ -344,7 +220,7 @@ func main() { } } if !allSame { - fmt.Fprintf(os.Stdout, " !! alias %q has %d records with different client_ids — use -dedup-alias to collapse\n", alias, len(ids)) + fmt.Fprintf(os.Stdout, " !! 
alias %q has %d records with different client_ids — use --dedup-methods-per-file to collapse\n", alias, len(ids)) } } } @@ -417,8 +293,8 @@ func printMethodList(records []normalizer.Record, files []string) { fmt.Fprintf(os.Stdout, " %-20s %8d %10d\n", mt, s.total, s.withAlias) } fmt.Fprintln(os.Stdout) - fmt.Fprintln(os.Stdout, "Tip: use --dedup-methods to group methods into human/machine identity sets.") - fmt.Fprintln(os.Stdout, " Example: --dedup-methods ldap,oidc,jwt --dedup-methods approle,kubernetes") + fmt.Fprintln(os.Stdout, "Tip: use --dedup-methods-per-file to group methods into human/machine identity sets.") + fmt.Fprintln(os.Stdout, " Example: --dedup-methods-per-file ldap,oidc,jwt --dedup-methods-per-file approle,kubernetes") } func printUsage() { @@ -476,65 +352,31 @@ CSV FORMAT (Vault activity export): Optional column: entity_alias_name (also accepted as: alias_name, entity_alias) - When present, --dedup-alias collapses records that share the same - normalized alias within the same identity group across all input files. - LDAP and OIDC are treated as one group. Normalization strips the domain - suffix (at '@') and any trailing tier suffix (-t0, -t1, -t2). - "sbishop" (LDAP, jan.csv), "sbishop-t0" (LDAP, feb.csv), and - "sbishop@corp.com" (OIDC) → one client. JWT is a separate group; - use --dedup-jwt to additionally collapse JWT against LDAP/OIDC. - - --dedup-jwt uses the same normalization to match JWT records against - non-JWT records in the same file. A JWT record is dropped if a non-JWT - record (e.g. LDAP or OIDC) shares the same normalized alias, preventing - the same person from being counted twice when they authenticate via both - methods. Can be combined with --dedup-alias and/or -d. - - --dedup-methods - Apply alias deduplication (same normalization as --dedup-alias) but only - for records whose auth method appears in the specified comma-separated - group. 
Methods in the same group are treated as one identity — a person - authenticating via any of them is counted once. Records whose auth method - is not in any group pass through unchanged. - - The flag is repeatable; each use defines one independent group: - - --dedup-methods ldap,oidc - Deduplicate LDAP and OIDC as one identity group. "alice" (LDAP), - "alice@corp.com" (OIDC), and "alice-t0" (LDAP) all normalize to - "alice" and are counted once. - - --dedup-methods ldap,oidc,jwt - Treat LDAP, OIDC, and JWT together as one group. - - --dedup-methods ldap,oidc --dedup-methods jwt,saml - Two independent groups: {ldap,oidc} and {jwt,saml}. A person - appearing in both LDAP and OIDC is counted once; a person - appearing in both JWT and SAML is counted once; but an LDAP - record and a JWT record for the same person are not collapsed - (unless both groups are merged into one). - - Can be combined with --dedup-alias, --dedup-jwt, and/or -d. + When present, --dedup-methods-per-file collapses records that share + the same normalized alias within each source file. Normalization strips + the domain suffix (at '@'). Tier suffixes (-t0, -t1, -t2) are NOT + stripped — "sbishop-t0" and "sbishop-t1" are treated as distinct + identities within a file. --dedup-methods-per-file - Like --dedup-methods but deduplication is scoped to each input file - independently. Records in different files with the same normalized alias - are NOT collapsed against each other — only within-file duplicates are - removed. Useful when files represent different billing periods and you - want to count a returning user once per file rather than once globally. + Deduplicate by alias for records whose auth method appears in the + specified comma-separated group, scoped to each input file + independently. Records in different files with the same normalized + alias are NOT collapsed — only within-file duplicates are removed. 
+ Useful when files represent different billing periods and you want to + count a returning user once per file rather than once globally. - Uses the same alias normalization and method-grouping syntax as - --dedup-methods (repeatable, comma-separated groups). + Normalization strips domain suffixes (at '@') only — tier suffixes + like -t0/-t1 are kept, so "alice-t0" and "alice-t1" are distinct. + + The flag is repeatable; each use defines one independent group: --dedup-methods-per-file ldap,oidc Within each file, collapse LDAP and OIDC records that share the - same alias (exact match; tier suffixes like -t0/-t1 are distinct). - A user in jan.csv (LDAP) and feb.csv (OIDC) is NOT collapsed — - they appear once per file. + same alias. A user in jan.csv (LDAP) and feb.csv (OIDC) is NOT + collapsed — they appear once per file. --dedup-methods-per-file ldap,oidc --dedup-methods-per-file jwt,saml - Two independent per-file groups. Same alias collapsing rules as - --dedup-methods but strictly within each source file. - - Can be combined with --dedup-methods, --dedup-alias, --dedup-jwt, and/or -d.`) + Two independent per-file groups. Records in different groups are + never collapsed against each other.`) } diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index df9a2d6..b6cb8d0 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -160,27 +160,6 @@ func ParseTime(raw string) time.Time { return time.Time{} // unparseable → zero value } -// Deduplicate removes records with duplicate ClientIDs. When duplicates exist, -// the record with a non-empty MountPath is preferred over one with an empty -// MountPath; otherwise the first occurrence is kept. 
-func Deduplicate(records []Record) []Record { - index := make(map[string]int, len(records)) // client_id → position in out - out := make([]Record, 0, len(records)) - for _, r := range records { - i, seen := index[r.ClientID] - if !seen { - index[r.ClientID] = len(out) - out = append(out, r) - continue - } - // Upgrade an empty-mount record if we now have a real mount path. - if out[i].MountPath == "" && r.MountPath != "" { - out[i] = r - } - } - return out -} - // BaseAlias returns the portion of an entity alias name before the first '@' // character. If no '@' is present the full name is returned. // Example: "alice@corp.com" → "alice", "sbishop@hashicorp.com" → "sbishop", @@ -206,96 +185,12 @@ func StripTierSuffix(name string) string { } // aliasKey is the deduplication key for alias-based dedup: one record is -// allowed per (normalized alias, mount type) pair across all input files. -// Including the mount type prevents --dedup-alias from collapsing records -// across different auth methods (e.g. LDAP vs JWT); use --dedup-jwt for that. +// allowed per (normalized alias, mount type) pair. type aliasKey struct { base string mountType string } -// dedupMountGroup maps mount types that represent the same identity provider -// to a single canonical value. OIDC and LDAP are treated as one group because -// the same person typically has the same username in both systems. -func dedupMountGroup(mt string) string { - if mt == "oidc" { - return "ldap" - } - return mt -} - -// aliasKeyFor computes the dedup key for a record. It strips the domain suffix -// (at '@') and any trailing tier suffix ("-t0"/"-t1"/"-t2"), and scopes the -// key to the mount group so that only records of the same identity type -// collapse. OIDC and LDAP share a group; JWT remains separate (use -// --dedup-jwt for JWT vs LDAP/OIDC dedup). 
-func aliasKeyFor(r Record) aliasKey { - mt := r.MountType - if mt == "" { - mt = r.AuthMethod - } - return aliasKey{ - base: StripTierSuffix(BaseAlias(r.EntityAliasName)), - mountType: dedupMountGroup(mt), - } -} - -// FindAliasDuplicates groups records by (BaseAlias, source file) and returns -// every group that contains more than one record. Records with a blank -// EntityAliasName or that are PKI clients are ignored. Groups are returned in -// the order the first member of each group appeared in records. -func FindAliasDuplicates(records []Record) [][]Record { - type entry struct { - key aliasKey - members []Record - } - index := make(map[aliasKey]int) - var entries []entry - - for _, r := range records { - if r.EntityAliasName == "" || IsPKIClient(r) { - continue - } - k := aliasKeyFor(r) - if idx, ok := index[k]; ok { - entries[idx].members = append(entries[idx].members, r) - } else { - index[k] = len(entries) - entries = append(entries, entry{key: k, members: []Record{r}}) - } - } - - var out [][]Record - for _, e := range entries { - if len(e.members) > 1 { - out = append(out, e.members) - } - } - return out -} - -// DeduplicateByAlias keeps at most one record per (BaseAlias, source file) -// combination. The same user authenticating via multiple mount accessors in -// the same file is collapsed to one record. Records with a blank -// EntityAliasName or that are PKI clients are always kept. 
-func DeduplicateByAlias(records []Record) []Record { - seen := make(map[aliasKey]struct{}, len(records)) - out := make([]Record, 0, len(records)) - for _, r := range records { - if r.EntityAliasName == "" || IsPKIClient(r) { - out = append(out, r) - continue - } - k := aliasKeyFor(r) - if _, dup := seen[k]; dup { - continue - } - seen[k] = struct{}{} - out = append(out, r) - } - return out -} - // buildMethodGroupMap converts a list of groups (each a slice of mount-type // strings) into a map from every member to the group's canonical value (the // first element of the group). Methods not present in any group are absent @@ -314,96 +209,6 @@ func buildMethodGroupMap(groups [][]string) map[string]string { return m } -// aliasKeyForMethods computes the dedup key for a record using a caller-supplied -// group map (from buildMethodGroupMap). If the record's mount type is not in the -// map the second return value is false, meaning the record should not participate -// in method-scoped dedup. -func aliasKeyForMethods(r Record, groupMap map[string]string) (aliasKey, bool) { - mt := r.MountType - if mt == "" { - mt = r.AuthMethod - } - canonical, ok := groupMap[mt] - if !ok { - return aliasKey{}, false - } - return aliasKey{ - base: StripTierSuffix(BaseAlias(r.EntityAliasName)), - mountType: canonical, - }, true -} - -// FindAliasDuplicatesForMethods is the same as FindAliasDuplicates but only -// considers records whose auth method (MountType or AuthMethod) appears in one -// of the provided groups. Each group is a slice of mount-type strings that -// should be treated as the same identity (e.g. ["ldap","oidc"]). Records whose -// method is not in any group are not reported. Groups are independent: records -// in different groups are never compared against each other. 
-func FindAliasDuplicatesForMethods(records []Record, groups [][]string) [][]Record { - groupMap := buildMethodGroupMap(groups) - - type entry struct { - key aliasKey - members []Record - } - index := make(map[aliasKey]int) - var entries []entry - - for _, r := range records { - if r.EntityAliasName == "" || IsPKIClient(r) { - continue - } - k, ok := aliasKeyForMethods(r, groupMap) - if !ok { - continue - } - if idx, exists := index[k]; exists { - entries[idx].members = append(entries[idx].members, r) - } else { - index[k] = len(entries) - entries = append(entries, entry{key: k, members: []Record{r}}) - } - } - - var out [][]Record - for _, e := range entries { - if len(e.members) > 1 { - out = append(out, e.members) - } - } - return out -} - -// DeduplicateByAliasForMethods applies the same alias dedup logic as -// DeduplicateByAlias but only for records whose auth method appears in one of -// the provided groups. Each group is a slice of mount-type strings treated as -// one identity (e.g. ["ldap","oidc"]). Records whose method is not in any group -// pass through unchanged. Records with a blank EntityAliasName or that are PKI -// clients are always kept. -func DeduplicateByAliasForMethods(records []Record, groups [][]string) []Record { - groupMap := buildMethodGroupMap(groups) - seen := make(map[aliasKey]struct{}, len(records)) - out := make([]Record, 0, len(records)) - for _, r := range records { - if r.EntityAliasName == "" || IsPKIClient(r) { - out = append(out, r) - continue - } - k, ok := aliasKeyForMethods(r, groupMap) - if !ok { - // Method not in any group — pass through untouched. - out = append(out, r) - continue - } - if _, dup := seen[k]; dup { - continue - } - seen[k] = struct{}{} - out = append(out, r) - } - return out -} - // aliasKeyInFile is the deduplication key for per-file alias dedup. It includes // the source file so records from different files are never collapsed together. 
type aliasKeyInFile struct { @@ -450,11 +255,10 @@ func aliasKeyInFileFor(r Record, groupMap map[string]string) (aliasKeyInFile, bo }, true } -// FindAliasDuplicatesForMethodsPerFile is like FindAliasDuplicatesForMethods -// but only collapses records within the same source file. Records in different -// files with the same alias are not reported as duplicates. Matching uses only -// the portion of the alias left of '@'; tier suffixes (-t0/-t1/-t2) are not -// stripped and must match exactly. +// FindAliasDuplicatesForMethodsPerFile groups records by normalized alias within +// each source file. Records in different files with the same alias are not +// reported as duplicates. Matching uses only the portion of the alias left of +// '@'; tier suffixes (-t0/-t1/-t2) are not stripped and must match exactly. func FindAliasDuplicatesForMethodsPerFile(records []Record, groups [][]string) [][]Record { groupMap := buildMethodGroupMap(groups) @@ -490,9 +294,9 @@ func FindAliasDuplicatesForMethodsPerFile(records []Record, groups [][]string) [ return out } -// DeduplicateByAliasForMethodsPerFile applies alias dedup like -// DeduplicateByAliasForMethods but scoped to each source file independently. -// Records in different files are never collapsed; only records from the same +// DeduplicateByAliasForMethodsPerFile deduplicates by alias scoped to each +// source file independently. Records in different files are never collapsed; +// only records from the same // file with the same normalized alias and method group are deduplicated. // Matching uses only the portion of the alias left of '@'; tier suffixes // (-t0/-t1/-t2) are not stripped and must match exactly. @@ -520,42 +324,6 @@ func DeduplicateByAliasForMethodsPerFile(records []Record, groups [][]string) [] return out } -// isJWT reports whether r was authenticated via JWT. 
-func isJWT(r Record) bool { - return r.MountType == "jwt" || r.AuthMethod == "jwt" -} - -// DeduplicateJWT drops JWT records whose normalized alias (StripTierSuffix + -// BaseAlias) matches a non-JWT record's normalized alias in the same source -// file. This prevents the same person from being counted once for their LDAP -// or OIDC identity and again for their JWT identity. Records without an alias -// are always kept. -func DeduplicateJWT(records []Record) []Record { - // Build global set of normalized aliases from all non-JWT records. - nonJWTAliases := make(map[string]struct{}) - for _, r := range records { - if isJWT(r) || r.EntityAliasName == "" { - continue - } - norm := StripTierSuffix(BaseAlias(r.EntityAliasName)) - if norm != "" { - nonJWTAliases[norm] = struct{}{} - } - } - - out := make([]Record, 0, len(records)) - for _, r := range records { - if isJWT(r) && r.EntityAliasName != "" { - norm := StripTierSuffix(BaseAlias(r.EntityAliasName)) - if _, match := nonJWTAliases[norm]; match { - continue - } - } - out = append(out, r) - } - return out -} - // IsPKIClient reports whether r is a PKI/cert client. 
It matches on either: // - client_type == "acme" (ACME protocol clients from the PKI secrets engine), or // - mount_accessor starting with "auth_cert" (cert auth method clients) diff --git a/internal/normalizer/normalizer_test.go b/internal/normalizer/normalizer_test.go index e25fd5a..6ebefa5 100644 --- a/internal/normalizer/normalizer_test.go +++ b/internal/normalizer/normalizer_test.go @@ -188,38 +188,6 @@ func TestFilterAbandonedClients(t *testing.T) { } } -func TestDeduplicate_PrefersNonEmptyMount(t *testing.T) { - records := []Record{ - {ClientID: "abc", MountPath: ""}, - {ClientID: "abc", MountPath: "auth/ldap/"}, - {ClientID: "xyz", MountPath: "auth/approle/"}, - {ClientID: "xyz", MountPath: ""}, - } - out := Deduplicate(records) - if len(out) != 2 { - t.Fatalf("expected 2 records after dedup, got %d", len(out)) - } - for _, r := range out { - if r.MountPath == "" { - t.Errorf("client %q kept empty-mount record when a non-empty mount was available", r.ClientID) - } - } -} - -func TestDeduplicate_KeepsFirstWhenBothEmpty(t *testing.T) { - records := []Record{ - {ClientID: "abc", MountPath: "", AuthMethod: "first"}, - {ClientID: "abc", MountPath: "", AuthMethod: "second"}, - } - out := Deduplicate(records) - if len(out) != 1 { - t.Fatalf("expected 1 record, got %d", len(out)) - } - if out[0].AuthMethod != "first" { - t.Errorf("expected first occurrence to be kept, got AuthMethod=%q", out[0].AuthMethod) - } -} - func TestFilterSince(t *testing.T) { records := []Record{ {ClientID: "old", TokenCreationTime: time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)}, @@ -308,7 +276,7 @@ func TestStripTierSuffix(t *testing.T) { } func TestStripTierSuffix_AfterBaseAlias(t *testing.T) { - // The combination used by aliasKeyFor: strip domain then tier. + // Strip domain then tier suffix. 
cases := []struct{ in, want string }{ {"alice-t0@corp.com", "alice"}, {"alice-t1@corp.com", "alice"}, @@ -324,122 +292,6 @@ func TestStripTierSuffix_AfterBaseAlias(t *testing.T) { } } -func TestDeduplicateByAlias_CollapsesSameBaseAcrossAccessors(t *testing.T) { - // "sbishop", "sbishop@hashicorp.com", "sbishop-t0", "sbishop-t1", and - // "sbishop" in a second file all normalize to "sbishop" → only the first - // occurrence across all files is kept. - records := []Record{ - {ClientID: "1", EntityAliasName: "sbishop", MountAccessor: "auth_ldap_abc123", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "sbishop@hashicorp.com", MountAccessor: "auth_jwt_def456", Source: "jan.csv"}, // dup: normalizes to "sbishop" - {ClientID: "3", EntityAliasName: "sbishop-t0", MountAccessor: "auth_ldap_abc123", Source: "jan.csv"}, // dup: tier stripped → "sbishop" - {ClientID: "4", EntityAliasName: "sbishop-t1", MountAccessor: "auth_oidc_xyz789", Source: "jan.csv"}, // dup: tier stripped → "sbishop" - {ClientID: "5", EntityAliasName: "sbishop", MountAccessor: "auth_ldap_abc123", Source: "feb.csv"}, // dup: same normalized alias across files - {ClientID: "6", EntityAliasName: ""}, // kept: blank always kept - } - out := DeduplicateByAlias(records) - if len(out) != 2 { - t.Fatalf("expected 2 records, got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "6"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - for _, id := range []string{"2", "3", "4", "5"} { - if kept[id] { - t.Errorf("expected ClientID=%s to be dropped", id) - } - } -} - -func TestDeduplicateByAlias_KeepsAllBlanks(t *testing.T) { - records := []Record{ - {ClientID: "1", EntityAliasName: ""}, - {ClientID: "2", EntityAliasName: ""}, - {ClientID: "3", EntityAliasName: "alice@corp.com", Source: "jan.csv"}, - } - out := DeduplicateByAlias(records) - if len(out) != 3 { - t.Fatalf("expected 3 records (2 blanks + 1 aliased), got %d", len(out)) - 
} -} - -func TestFindAliasDuplicates_SameBaseAcrossAccessors(t *testing.T) { - // "sbishop", "sbishop@hashicorp.com", "sbishop-t0", and "sbishop" in a - // second file all normalize to "sbishop" → one group with 4 members. - records := []Record{ - {ClientID: "1", EntityAliasName: "sbishop", MountAccessor: "auth_ldap_abc123", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "sbishop@hashicorp.com", MountAccessor: "auth_jwt_def456", Source: "jan.csv"}, - {ClientID: "3", EntityAliasName: "sbishop-t0", MountAccessor: "auth_ldap_abc123", Source: "jan.csv"}, - {ClientID: "4", EntityAliasName: "sbishop", MountAccessor: "auth_ldap_abc123", Source: "feb.csv"}, // cross-file dup - {ClientID: "5", EntityAliasName: ""}, // ignored - } - groups := FindAliasDuplicates(records) - if len(groups) != 1 { - t.Fatalf("expected 1 duplicate group, got %d", len(groups)) - } - if len(groups[0]) != 4 { - t.Errorf("expected 4 members in group, got %d", len(groups[0])) - } - for _, r := range groups[0] { - if StripTierSuffix(BaseAlias(r.EntityAliasName)) != "sbishop" { - t.Errorf("unexpected record in group: %+v", r) - } - } -} - -func TestFindAliasDuplicates_NoDuplicates(t *testing.T) { - // All different normalized aliases — no duplicates regardless of file. - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "bob", Source: "jan.csv"}, - {ClientID: "3", EntityAliasName: "carol", Source: "feb.csv"}, - {ClientID: "4", EntityAliasName: ""}, - } - groups := FindAliasDuplicates(records) - if len(groups) != 0 { - t.Errorf("expected no duplicate groups, got %d", len(groups)) - } -} - -func TestDeduplicateByAlias_IgnoresPKIClients(t *testing.T) { - // PKI clients are always kept regardless of alias duplication. - // Non-PKI clients with the same base alias in the same file are deduplicated. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "abc-123", ClientType: "acme", Source: "jan.csv"}, // PKI, kept - {ClientID: "2", EntityAliasName: "abc-456", ClientType: "acme", Source: "jan.csv"}, // PKI, kept (not deduped) - {ClientID: "3", EntityAliasName: "abc-789", MountAccessor: "auth_cert_xyz", Source: "jan.csv"}, // cert auth — PKI, kept - {ClientID: "4", EntityAliasName: "alice@corp", Source: "jan.csv"}, // non-PKI, first: kept - {ClientID: "5", EntityAliasName: "alice@example.com", Source: "jan.csv"}, // non-PKI dup: base "alice" already seen, dropped - } - out := DeduplicateByAlias(records) - if len(out) != 4 { - t.Fatalf("expected 4 records (3 PKI/cert + 1 non-PKI), got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "2", "3", "4"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - if kept["5"] { - t.Errorf("expected ClientID=5 (non-PKI dup) to be dropped") - } -} - -func TestFindAliasDuplicates_IgnoresPKIClients(t *testing.T) { - records := []Record{ - {ClientID: "1", EntityAliasName: "abc-123", ClientType: "acme", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "abc-456", ClientType: "acme", Source: "jan.csv"}, - {ClientID: "3", EntityAliasName: "abc-789", MountAccessor: "auth_cert_xyz", Source: "jan.csv"}, - } - groups := FindAliasDuplicates(records) - if len(groups) != 0 { - t.Errorf("expected no duplicate groups (all PKI/cert), got %d", len(groups)) - } -} - // helpers for alias dedup tests func clientIDs(records []Record) []string { ids := make([]string, len(records)) @@ -714,421 +566,6 @@ func TestFilterSincePerSource_EmptyMap(t *testing.T) { } } -// ── JWT deduplication ───────────────────────────────────────────────────────── - -func TestDeduplicateJWT_DropsJWTMatchingNonJWT(t *testing.T) { - // alice authenticates via LDAP (kept) and JWT (dropped — same normalized alias). - // bob has only a JWT record (kept — no non-JWT match). 
- // carol has a JWT record with no alias (always kept). - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // dropped: normalizes to "alice", matches LDAP - {ClientID: "3", EntityAliasName: "bob@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: no non-JWT match for "bob" - {ClientID: "4", EntityAliasName: "", MountType: "jwt", Source: "jan.csv"}, // kept: blank alias always kept - } - out := DeduplicateJWT(records) - if len(out) != 3 { - t.Fatalf("expected 3 records, got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "3", "4"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - if kept["2"] { - t.Error("expected ClientID=2 (JWT dup of LDAP alice) to be dropped") - } -} - -func TestDeduplicateJWT_TierNormalizationApplied(t *testing.T) { - // LDAP alias is "alice-t0" (normalizes to "alice"). - // JWT alias is "alice@corp.com" (normalizes to "alice"). - // They match → JWT dropped. - records := []Record{ - {ClientID: "1", EntityAliasName: "alice-t0", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, - } - out := DeduplicateJWT(records) - if len(out) != 1 { - t.Fatalf("expected 1 record, got %d: %v", len(out), clientIDs(out)) - } - if out[0].ClientID != "1" { - t.Errorf("expected LDAP record to be kept, got ClientID=%s", out[0].ClientID) - } -} - -func TestDeduplicateJWT_MatchesAcrossFiles(t *testing.T) { - // JWT record in feb.csv matches an LDAP alias in jan.csv — cross-file match - // is intentional, JWT record is dropped. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "feb.csv"}, - } - out := DeduplicateJWT(records) - if len(out) != 1 { - t.Fatalf("expected 1 record (cross-file JWT match dropped), got %d", len(out)) - } - if out[0].ClientID != "1" { - t.Errorf("expected LDAP record kept, got ClientID=%s", out[0].ClientID) - } -} - -func TestDeduplicateJWT_AuthMethodFallback(t *testing.T) { - // JWT identified via auth_method rather than mount_type. - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", AuthMethod: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", AuthMethod: "jwt", Source: "jan.csv"}, - } - out := DeduplicateJWT(records) - if len(out) != 1 { - t.Fatalf("expected 1 record, got %d: %v", len(out), clientIDs(out)) - } - if out[0].ClientID != "1" { - t.Errorf("expected LDAP record kept, got ClientID=%s", out[0].ClientID) - } -} - -func TestDeduplicateJWT_NonJWTRecordsUnaffected(t *testing.T) { - // No JWT records — nothing should be dropped. - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "bob", MountType: "oidc", Source: "jan.csv"}, - } - out := DeduplicateJWT(records) - if len(out) != 2 { - t.Fatalf("expected 2 records, got %d", len(out)) - } -} - -// ── combined alias + client_id deduplication ───────────────────────────────── - -func TestDeduplicateByAlias_ThenDeduplicate_CollapsesBothDimensions(t *testing.T) { - // --dedup-alias runs first (within-file tier/domain collapse), then -d - // (cross-file client_id collapse). Together they handle the case where the - // same person appears as different alias variants in the same file AND as the - // same client_id across multiple files. 
- // - // jan.csv: alice (id:1) and alice-t0 (id:2) → alias dedup keeps id:1, drops id:2 - // feb.csv: alice (id:1) → same client_id as jan.csv survivor → -d drops it - // jan.csv: bob (id:3) → distinct alias and id → kept throughout - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice-t0", Source: "jan.csv"}, // dropped by alias dedup (tier → "alice") - {ClientID: "1", EntityAliasName: "alice", Source: "feb.csv"}, // dropped by -d (same id as jan survivor) - {ClientID: "3", EntityAliasName: "bob", Source: "jan.csv"}, - } - - afterAlias := DeduplicateByAlias(records) - afterBoth := Deduplicate(afterAlias) - - if len(afterBoth) != 2 { - t.Fatalf("expected 2 records, got %d: %v", len(afterBoth), clientIDs(afterBoth)) - } - kept := clientIDSet(afterBoth) - if !kept["1"] { - t.Error("expected id:1 to be kept") - } - if !kept["3"] { - t.Error("expected id:3 to be kept") - } - if kept["2"] { - t.Error("expected id:2 to be dropped by alias dedup") - } -} - -func TestDeduplicateByAlias_CollapsesAcrossFiles(t *testing.T) { - // alice-t0 in jan.csv and alice-t1 in feb.csv both normalize to "alice" → - // alias dedup keeps only the first occurrence regardless of file. - records := []Record{ - {ClientID: "1", EntityAliasName: "alice-t0", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice-t1", Source: "feb.csv"}, - } - - out := DeduplicateByAlias(records) - - if len(out) != 1 { - t.Fatalf("expected 1 record (cross-file tier collapse), got %d", len(out)) - } - if out[0].ClientID != "1" { - t.Errorf("expected first occurrence (id:1) to be kept, got id:%s", out[0].ClientID) - } -} - -func TestDeduplicateByAlias_CollapseOIDCWithLDAP(t *testing.T) { - // LDAP and OIDC share the same identity group, so the same normalized alias - // across both auth methods is treated as one client. - // JWT remains a separate group and is not collapsed here. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dup: ldap/oidc group, normalizes to "alice" - {ClientID: "3", EntityAliasName: "alice-t0", MountType: "ldap", Source: "feb.csv"}, // dup: ldap/oidc group, tier stripped → "alice" - {ClientID: "4", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: jwt is a separate group - {ClientID: "5", EntityAliasName: "bob", MountType: "ldap", Source: "jan.csv"}, // kept: different alias - } - out := DeduplicateByAlias(records) - if len(out) != 3 { - t.Fatalf("expected 3 records, got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "4", "5"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - for _, id := range []string{"2", "3"} { - if kept[id] { - t.Errorf("expected ClientID=%s to be dropped (same ldap/oidc group)", id) - } - } -} - -func TestDeduplicateByAlias_ScopedToMountType(t *testing.T) { - // alice on LDAP and alice@corp.com on JWT share a normalized alias but have - // different mount types → --dedup-alias does NOT collapse them. Use - // --dedup-jwt to additionally collapse cross-auth-method duplicates. - // alice-t0 and alice on LDAP share both the normalized alias AND mount type - // → they ARE collapsed. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice-t0", MountType: "ldap", Source: "jan.csv"}, // dup: same type + base - {ClientID: "3", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: different mount type - } - out := DeduplicateByAlias(records) - if len(out) != 2 { - t.Fatalf("expected 2 records (alice/ldap + alice/jwt), got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - if !kept["1"] { - t.Error("expected id:1 (alice ldap) to be kept") - } - if !kept["3"] { - t.Error("expected id:3 (alice jwt) to be kept — different mount type, requires --dedup-jwt") - } - if kept["2"] { - t.Error("expected id:2 (alice-t0 ldap) to be dropped — same mount type and normalized alias") - } -} - -// Regression: tiered accounts across files must be collapsed. -// Before the fix, aliasKey included the source filename, so alice-t0 in -// jan.csv and alice in feb.csv hashed to different keys and were never -// compared — each was counted as a separate client. 
-func TestDeduplicateByAlias_TieredAccountsAcrossFiles(t *testing.T) { - records := []Record{ - {ClientID: "1", EntityAliasName: "alice-t0", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice", Source: "feb.csv"}, // same person, different tier label - {ClientID: "3", EntityAliasName: "alice-t1", Source: "mar.csv"}, // same person, third file - {ClientID: "4", EntityAliasName: "bob", Source: "jan.csv"}, // different person, kept - } - out := DeduplicateByAlias(records) - if len(out) != 2 { - t.Fatalf("expected 2 records (alice collapsed to 1, bob kept), got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - if !kept["1"] { - t.Error("expected first alice occurrence (id:1) to be kept") - } - if !kept["4"] { - t.Error("expected bob (id:4) to be kept") - } - for _, id := range []string{"2", "3"} { - if kept[id] { - t.Errorf("expected ClientID=%s (tier variant of alice) to be dropped", id) - } - } -} - -// ── method-scoped alias deduplication ──────────────────────────────────────── - -func TestDeduplicateByAliasForMethods_LDAPAndOIDCGroup(t *testing.T) { - // Same as -dedup-alias LDAP/OIDC behavior, but specified explicitly. - // alice via LDAP is kept; alice@corp.com via OIDC is dropped (same group). - // alice via JWT is kept (not in the group). 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dropped - {ClientID: "3", EntityAliasName: "alice-t0", MountType: "ldap", Source: "feb.csv"}, // dropped: tier stripped - {ClientID: "4", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept: jwt not in group - {ClientID: "5", EntityAliasName: "bob", MountType: "ldap", Source: "jan.csv"}, // kept: different alias - } - groups := [][]string{{"ldap", "oidc"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 3 { - t.Fatalf("expected 3 records, got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "4", "5"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - for _, id := range []string{"2", "3"} { - if kept[id] { - t.Errorf("expected ClientID=%s to be dropped", id) - } - } -} - -func TestDeduplicateByAliasForMethods_MethodsNotInGroupPassThrough(t *testing.T) { - // approle records are not in any group and must pass through untouched, - // even if two share the same alias. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "svc-account", MountType: "approle", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "svc-account", MountType: "approle", Source: "jan.csv"}, // NOT deduped - {ClientID: "3", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "4", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dropped - } - groups := [][]string{{"ldap", "oidc"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 3 { - t.Fatalf("expected 3 records (2 approle + 1 ldap), got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "2", "3"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - if kept["4"] { - t.Error("expected ClientID=4 (oidc dup) to be dropped") - } -} - -func TestDeduplicateByAliasForMethods_MultipleIndependentGroups(t *testing.T) { - // Group 1: {ldap, oidc}; Group 2: {jwt, saml} - // alice/ldap and alice/oidc collapse → 1 kept - // alice/jwt and alice/saml collapse → 1 kept - // The two groups don't interact with each other. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dropped (group 1) - {ClientID: "3", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // kept (group 2 first) - {ClientID: "4", EntityAliasName: "alice", MountType: "saml", Source: "jan.csv"}, // dropped (group 2) - {ClientID: "5", EntityAliasName: "bob", MountType: "ldap", Source: "jan.csv"}, // kept: different alias - } - groups := [][]string{{"ldap", "oidc"}, {"jwt", "saml"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 3 { - t.Fatalf("expected 3 records, got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "3", "5"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - for _, id := range []string{"2", "4"} { - if kept[id] { - t.Errorf("expected ClientID=%s to be dropped", id) - } - } -} - -func TestDeduplicateByAliasForMethods_ThreeMethodsOneGroup(t *testing.T) { - // ldap, oidc, jwt all in one group — alice across all three collapses to 1. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dropped - {ClientID: "3", EntityAliasName: "alice@corp.com", MountType: "jwt", Source: "jan.csv"}, // dropped - {ClientID: "4", EntityAliasName: "bob", MountType: "jwt", Source: "jan.csv"}, // kept: different alias - } - groups := [][]string{{"ldap", "oidc", "jwt"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 2 { - t.Fatalf("expected 2 records, got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - if !kept["1"] { - t.Error("expected id:1 (alice ldap, first occurrence) to be kept") - } - if !kept["4"] { - t.Error("expected id:4 (bob) to be kept") - } -} - -func TestDeduplicateByAliasForMethods_BlankAliasAlwaysKept(t *testing.T) { - records := []Record{ - {ClientID: "1", EntityAliasName: "", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "", MountType: "oidc", Source: "jan.csv"}, - {ClientID: "3", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - } - groups := [][]string{{"ldap", "oidc"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 3 { - t.Fatalf("expected 3 records (2 blank + 1 aliased), got %d", len(out)) - } -} - -func TestDeduplicateByAliasForMethods_PKIClientsAlwaysKept(t *testing.T) { - records := []Record{ - {ClientID: "1", EntityAliasName: "abc-123", ClientType: "acme", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "abc-123", ClientType: "acme", MountType: "oidc", Source: "jan.csv"}, - {ClientID: "3", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "4", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, // dropped - } - groups := [][]string{{"ldap", "oidc"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 3 { - t.Fatalf("expected 3 records (2 
PKI + 1 non-PKI), got %d: %v", len(out), clientIDs(out)) - } - kept := clientIDSet(out) - for _, id := range []string{"1", "2", "3"} { - if !kept[id] { - t.Errorf("expected ClientID=%s to be kept", id) - } - } - if kept["4"] { - t.Error("expected ClientID=4 to be dropped") - } -} - -func TestDeduplicateByAliasForMethods_AuthMethodFallback(t *testing.T) { - // MountType is blank; dedup should fall back to AuthMethod. - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", AuthMethod: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", AuthMethod: "oidc", Source: "jan.csv"}, // dropped - } - groups := [][]string{{"ldap", "oidc"}} - out := DeduplicateByAliasForMethods(records, groups) - if len(out) != 1 { - t.Fatalf("expected 1 record, got %d: %v", len(out), clientIDs(out)) - } - if out[0].ClientID != "1" { - t.Errorf("expected id:1 to be kept, got %s", out[0].ClientID) - } -} - -func TestFindAliasDuplicatesForMethods_ReportsGroupsOnly(t *testing.T) { - // Only ldap and oidc records should be reported as duplicates. - // approle records with the same alias are not in the group and not reported. 
- records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "alice@corp.com", MountType: "oidc", Source: "jan.csv"}, - {ClientID: "3", EntityAliasName: "alice", MountType: "approle", Source: "jan.csv"}, // not in group - } - groups := [][]string{{"ldap", "oidc"}} - dups := FindAliasDuplicatesForMethods(records, groups) - if len(dups) != 1 { - t.Fatalf("expected 1 duplicate group, got %d", len(dups)) - } - if len(dups[0]) != 2 { - t.Errorf("expected 2 members in group (ldap + oidc), got %d", len(dups[0])) - } -} - -func TestFindAliasDuplicatesForMethods_NoDuplicates(t *testing.T) { - records := []Record{ - {ClientID: "1", EntityAliasName: "alice", MountType: "ldap", Source: "jan.csv"}, - {ClientID: "2", EntityAliasName: "bob", MountType: "oidc", Source: "jan.csv"}, - } - groups := [][]string{{"ldap", "oidc"}} - dups := FindAliasDuplicatesForMethods(records, groups) - if len(dups) != 0 { - t.Errorf("expected no duplicate groups, got %d", len(dups)) - } -} - // ── input mutation safety ───────────────────────────────────────────────────── // These tests guard against the records[:0] pattern, which reuses the backing // array and silently corrupts the caller's slice. Each filter must not modify From a0e30213319b49bdb0df0eb60cd800506a411e85 Mon Sep 17 00:00:00 2001 From: Schuyler Bishop Date: Wed, 27 May 2026 08:44:04 -0500 Subject: [PATCH 6/8] Removed links to the git repo --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 606ce87..63dc853 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,6 @@ versions), and displays a summary of client counts by mount path and type. 
## Installation ```bash -git clone https://github.com/your-org/vault-csv-normalizer -cd vault-csv-normalizer make build # Binary is at ./bin/vault-csv-normalizer ``` From d05cd187a330d07a810790023a966bbee354350f Mon Sep 17 00:00:00 2001 From: Schuyler Bishop Date: Wed, 27 May 2026 12:24:13 -0500 Subject: [PATCH 7/8] Add Terraform stub generation for unaliased entity clients --- README.md | 57 +++++++- cmd/vault-csv-normalizer/main.go | 27 +++- internal/normalizer/normalizer.go | 14 +- internal/normalizer/normalizer_test.go | 40 +----- internal/tfgen/tfgen.go | 182 +++++++++++++++++++++++++ 5 files changed, 259 insertions(+), 61 deletions(-) create mode 100644 internal/tfgen/tfgen.go diff --git a/README.md b/README.md index 63dc853..cb8d1fe 100644 --- a/README.md +++ b/README.md @@ -94,6 +94,13 @@ OPTIONS: both blank. This includes records with no auth mount (mount_path empty) and merged/deleted entities (mount_path present). Applied after all deduplication steps. + -generate-tf + Generate Terraform HCL stubs for entity clients with no alias in the + export. Requires --dedup-methods-per-file. A client is targeted when + entity_alias_name is blank and mount_accessor is non-empty. For each + such client, vault_identity_entity and vault_identity_entity_alias + resources are written to vault-aliases.tf. Mount accessors are emitted + as Terraform variables. Does not affect counts or summary output. 
-per-file Print a summary for each input file before the combined summary -debug @@ -150,6 +157,9 @@ vault-csv-normalizer -f export.csv --debug # Remove abandoned clients from final totals vault-csv-normalizer -f export.csv --remove-abandoned-clients +# Generate Terraform stubs for unaliased LDAP/OIDC clients +vault-csv-normalizer -f export.csv --dedup-methods-per-file ldap,oidc --generate-tf + # Same as above, with debug count output for removed rows vault-csv-normalizer -f export.csv --remove-abandoned-clients --debug @@ -215,23 +225,56 @@ Vault can record the same human as multiple clients when they authenticate via different auth methods (e.g. LDAP in one session and OIDC in another). `--dedup-methods-per-file` collapses these into a single count within each file. +### How deduplication works + +Each auth method stores a different value as the entity alias in Vault: + +| Auth method | What Vault stores as `entity_alias_name` | +|---|---| +| `ldap` | Bare username: `alice` | +| `oidc` | Bare username (from `entity_alias_metadata.username`): `alice` | +| `jwt` | Full email address: `alice@corp.com` | + +The tool normalizes all three to a common base by stripping the domain suffix +(`alice@corp.com` → `alice`), then matches records within the same file that +share the same normalized alias and belong to the same method group. + +**This only works when the same string is used as the identity across all auth +methods.** If `alice` logs in via LDAP as `alice` and via JWT as +`alice@corp.com`, the normalization produces `alice` for both — they collapse. +If the LDAP username and the JWT email prefix do not match (e.g. `asmith` vs +`alice.smith@corp.com`), the records will not be collapsed. + +### Required conditions for cross-method dedup + +All of the following must be true for two records to be deduplicated: + +1. Both records are in the **same source file** — records across files are never collapsed. +2. 
Both records' auth methods appear in the **same comma-separated list** passed to `--dedup-methods-per-file`. With `--dedup-methods-per-file ldap,oidc,jwt`, an LDAP and a JWT record can collapse. With `--dedup-methods-per-file ldap,oidc --dedup-methods-per-file jwt,saml`, an LDAP and a JWT record will never collapse — they are in separate groups. +3. Both records have a **non-empty `entity_alias_name`** (or `entity_alias_metadata.username` for OIDC). +4. The **normalized alias matches** — after stripping the domain suffix, the alias strings are identical. +5. Neither record is a **PKI client** (`client_type=acme` or `mount_accessor` prefix `auth_cert`). + +If any condition is not met, both records pass through unchanged. + ### Alias normalization `--dedup-methods-per-file` applies one normalization step before comparing: -1. **Strip domain suffix** — everything from `@` onward is removed. - `alice@corp.com` → `alice` +**Strip domain suffix** — everything from `@` onward is removed. +`alice@corp.com` → `alice` -Tier suffixes (`-t0`, `-t1`, `-t2`) are **not** stripped — `alice-t0` and -`alice-t1` are treated as distinct identities within a file. +This lets JWT records (which use full email addresses) match LDAP/OIDC records +(which use bare usernames), provided the local part of the email is the same +as the LDAP/OIDC username. 
### Auth methods reference | `mount_type` / `auth_method` | Typical users | Notes | |---|---|---| -| `ldap` | Humans | Aliases usually bare usernames (`alice`) or tiered (`alice-t0`) | -| `oidc` | Humans | Aliases usually `username@domain.com` — strip domain to match LDAP | -| `jwt` | Humans or services | May share aliases with LDAP/OIDC; include in group to collapse | +| `ldap` | Humans | Aliases are bare usernames (`alice`) | +| `oidc` | Humans | Aliases are bare usernames from `entity_alias_metadata.username` (`alice`) | +| `jwt` | Humans or services | Aliases are full email addresses (`alice@corp.com`); domain is stripped to match LDAP/OIDC | | `approle` | Service accounts | Not human; not typically alias-deduped | | `kubernetes` | Service accounts | Not human; not typically alias-deduped | | `aws` / `gcp` | Service accounts | Not human; not typically alias-deduped | diff --git a/cmd/vault-csv-normalizer/main.go b/cmd/vault-csv-normalizer/main.go index cd56b06..30132c5 100644 --- a/cmd/vault-csv-normalizer/main.go +++ b/cmd/vault-csv-normalizer/main.go @@ -12,6 +12,7 @@ import ( "github.com/vault-csv-normalizer/internal/normalizer" "github.com/vault-csv-normalizer/internal/parser" "github.com/vault-csv-normalizer/internal/renderer" + "github.com/vault-csv-normalizer/internal/tfgen" ) // multiFlag allows a flag to be specified multiple times. @@ -47,6 +48,7 @@ func main() { var filterSinceFile = make(fileDateFlag) var countPKI bool var removeAbandonedClients bool + var generateTF bool var listMethods bool var debugMode bool var perFile bool @@ -61,6 +63,7 @@ func main() { flag.BoolVar(&countPKI, "p", false, "Partition and report PKI/cert clients (client_type=acme or mount_accessor prefix auth_cert) separately") flag.Var(&dedupMethodsPerFile, "dedup-methods-per-file", "Deduplicate by alias for the specified comma-separated auth methods, scoped to each input file independently. Records in different files are never collapsed against each other. 
Repeatable to define multiple groups.") flag.BoolVar(&removeAbandonedClients, "remove-abandoned-clients", false, "Remove abandoned clients (blank entity_name and entity_alias_name) after deduplication. Includes records with no auth mount and merged/deleted entities.") + flag.BoolVar(&generateTF, "generate-tf", false, "Generate Terraform HCL stubs for entity clients with no alias. Requires --dedup-methods-per-file. Output written to vault-aliases.tf.") flag.BoolVar(&listMethods, "list-methods", false, "Print every distinct auth method found in the input files (with record counts and alias coverage), then exit. Useful for deciding --dedup-methods-per-file groups.") flag.BoolVar(&debugMode, "debug", false, "Print all records grouped by mount path") flag.BoolVar(&perFile, "per-file", false, "Print a summary for each input file before the combined summary") @@ -134,7 +137,7 @@ func main() { for _, group := range groups { r0 := group[0] fmt.Fprintf(os.Stdout, "\nAlias group: %q file: %s\n", - normalizer.StripTierSuffix(normalizer.BaseAlias(r0.EntityAliasName)), filepath.Base(r0.Source)) + normalizer.BaseAlias(r0.EntityAliasName), filepath.Base(r0.Source)) renderer.PrintTable(os.Stdout, group) } fmt.Fprintln(os.Stdout) @@ -154,6 +157,9 @@ func main() { fmt.Fprintln(os.Stdout, strings.Repeat("-", 70)) } + // Snapshot post-dedup records before filters for --generate-tf. + preFilterRecords := normalized + // Apply filters. 
if filterNS != "" { normalized = normalizer.FilterByNamespace(normalized, filterNS) @@ -176,6 +182,23 @@ func main() { os.Exit(1) } + if generateTF { + if len(methodGroupsPerFile) == 0 { + fmt.Fprintln(os.Stderr, "warning: --generate-tf has no effect without --dedup-methods-per-file") + } else { + n, err := tfgen.GenerateTF(preFilterRecords, methodGroupsPerFile, "vault-aliases.tf") + if err != nil { + fmt.Fprintf(os.Stderr, "error: --generate-tf: %v\n", err) + os.Exit(1) + } + if n == 0 { + fmt.Fprintln(os.Stdout, "generate-tf: no unaliased clients found in the specified method groups") + } else { + fmt.Fprintf(os.Stdout, "generate-tf: wrote %d entity stub(s) to vault-aliases.tf\n", n) + } + } + } + if debugMode { // Group final (post-dedup) records by mount path. var mountOrder []string @@ -204,7 +227,7 @@ func main() { if r.EntityAliasName == "" { continue } - norm := normalizer.StripTierSuffix(normalizer.BaseAlias(r.EntityAliasName)) + norm := normalizer.BaseAlias(r.EntityAliasName) aliasToIDs[norm] = append(aliasToIDs[norm], r.ClientID) } for alias, ids := range aliasToIDs { diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index b6cb8d0..3b4b491 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -162,8 +162,7 @@ func ParseTime(raw string) time.Time { // BaseAlias returns the portion of an entity alias name before the first '@' // character. If no '@' is present the full name is returned. -// Example: "alice@corp.com" → "alice", "sbishop@hashicorp.com" → "sbishop", -// "sbishop-t0" → "sbishop-t0". +// Example: "alice@corp.com" → "alice", "sbishop@hashicorp.com" → "sbishop". func BaseAlias(name string) string { for i, ch := range name { if ch == '@' { @@ -173,17 +172,6 @@ func BaseAlias(name string) string { return name } -// StripTierSuffix removes a trailing "-t0", "-t1", or "-t2" suffix from name. -// Other suffixes are left unchanged. 
-// Example: "alice-t0" → "alice", "bob-t2" → "bob", "carol-t3" → "carol-t3". -func StripTierSuffix(name string) string { - n := len(name) - if n >= 3 && name[n-3] == '-' && name[n-2] == 't' && name[n-1] >= '0' && name[n-1] <= '2' { - return name[:n-3] - } - return name -} - // aliasKey is the deduplication key for alias-based dedup: one record is // allowed per (normalized alias, mount type) pair. type aliasKey struct { diff --git a/internal/normalizer/normalizer_test.go b/internal/normalizer/normalizer_test.go index 6ebefa5..55c7069 100644 --- a/internal/normalizer/normalizer_test.go +++ b/internal/normalizer/normalizer_test.go @@ -241,7 +241,7 @@ func TestBaseAlias(t *testing.T) { {"alice@corp.com", "alice"}, {"sbishop@hashicorp.com", "sbishop"}, {"abc@234", "abc"}, - {"sbishop-t0", "sbishop-t0"}, // BaseAlias alone does not strip tier + {"sbishop-t0", "sbishop-t0"}, {"plain", "plain"}, {"", ""}, {"@leading", ""}, @@ -254,44 +254,6 @@ func TestBaseAlias(t *testing.T) { } } -func TestStripTierSuffix(t *testing.T) { - cases := []struct{ in, want string }{ - {"alice-t0", "alice"}, - {"alice-t1", "alice"}, - {"alice-t2", "alice"}, - {"alice-t3", "alice-t3"}, // only t0–t2 are stripped - {"alice-t10", "alice-t10"}, - {"alice-T0", "alice-T0"}, // case-sensitive - {"alice", "alice"}, - {"-t0", ""}, // degenerate: only the suffix - {"t0", "t0"}, // no hyphen - {"", ""}, - } - for _, c := range cases { - got := StripTierSuffix(c.in) - if got != c.want { - t.Errorf("StripTierSuffix(%q) = %q, want %q", c.in, got, c.want) - } - } -} - -func TestStripTierSuffix_AfterBaseAlias(t *testing.T) { - // Strip domain then tier suffix. 
- cases := []struct{ in, want string }{ - {"alice-t0@corp.com", "alice"}, - {"alice-t1@corp.com", "alice"}, - {"alice@corp.com", "alice"}, - {"alice-t0", "alice"}, - {"alice", "alice"}, - } - for _, c := range cases { - got := StripTierSuffix(BaseAlias(c.in)) - if got != c.want { - t.Errorf("StripTierSuffix(BaseAlias(%q)) = %q, want %q", c.in, got, c.want) - } - } -} - // helpers for alias dedup tests func clientIDs(records []Record) []string { ids := make([]string, len(records)) diff --git a/internal/tfgen/tfgen.go b/internal/tfgen/tfgen.go new file mode 100644 index 0000000..ea8960d --- /dev/null +++ b/internal/tfgen/tfgen.go @@ -0,0 +1,182 @@ +// Package tfgen generates Terraform HCL stubs for Vault entity clients that +// have no entity alias in the activity export. These stubs create a +// vault_identity_entity and a vault_identity_entity_alias for each such record. +package tfgen + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/vault-csv-normalizer/internal/normalizer" +) + +var petAdjectives = []string{ + "amber", "bold", "calm", "dark", "eager", "fair", "glad", "hardy", + "ivory", "jolly", "keen", "lofty", "merry", "noble", "proud", + "quiet", "rapid", "silver", "tawny", "vivid", "warm", "young", +} + +var petNouns = []string{ + "bear", "crane", "deer", "eagle", "fox", "goose", "hawk", "ibis", + "jay", "kite", "lark", "mole", "newt", "otter", "panda", "quail", + "raven", "swift", "teal", "vole", "wren", "yak", "zebra", +} + +// nextPetName picks the next unused (adj_noun) name, cycling through all +// combinations. Falls back to "entity_N" if all 500+ combinations are used. 
+func nextPetName(used map[string]struct{}, counter *int) string { + total := len(petAdjectives) * len(petNouns) + for i := 0; i < total; i++ { + idx := (*counter + i) % total + name := petAdjectives[idx/len(petNouns)] + "_" + petNouns[idx%len(petNouns)] + if _, ok := used[name]; !ok { + *counter = (idx + 1) % total + used[name] = struct{}{} + return name + } + } + *counter++ + name := fmt.Sprintf("entity_%d", *counter) + used[name] = struct{}{} + return name +} + +// sanitizeID converts a string to a valid Terraform identifier by lowercasing +// and replacing non-alphanumeric characters with underscores. +func sanitizeID(s string) string { + var b strings.Builder + for _, ch := range strings.ToLower(s) { + if (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9') { + b.WriteRune(ch) + } else { + b.WriteByte('_') + } + } + result := b.String() + for strings.Contains(result, "__") { + result = strings.ReplaceAll(result, "__", "_") + } + return strings.Trim(result, "_") +} + +func buildMethodGroupMap(groups [][]string) map[string]string { + m := make(map[string]string) + for _, g := range groups { + if len(g) == 0 { + continue + } + canonical := g[0] + for _, method := range g { + m[method] = canonical + } + } + return m +} + +// GenerateTF scans records for method-group members that have a non-empty +// mount_accessor but a blank entity_alias_name. These clients exist in Vault +// without an entity alias and need one created. For each such record a +// vault_identity_entity and vault_identity_entity_alias stub is written to +// outputPath. Mount accessors are emitted as Terraform variables so the caller +// can override them per environment. Returns the number of stubs written. 
+func GenerateTF(records []normalizer.Record, groups [][]string, outputPath string) (int, error) { + groupMap := buildMethodGroupMap(groups) + + var targets []normalizer.Record + for _, r := range records { + if r.EntityAliasName != "" { + continue // already aliased — skip + } + if r.MountAccessor == "" { + continue // no mount accessor — cannot create alias + } + if normalizer.IsPKIClient(r) { + continue // PKI clients are excluded from alias management + } + mt := r.MountType + if mt == "" { + mt = r.AuthMethod + } + if _, ok := groupMap[mt]; !ok { + continue // not in any configured method group + } + targets = append(targets, r) + } + + if len(targets) == 0 { + return 0, nil + } + + type mountInfo struct { + accessor string + mountPath string + mountType string + varName string + } + + accessorSeen := make(map[string]*mountInfo) + var accessorOrder []string + for _, r := range targets { + if _, ok := accessorSeen[r.MountAccessor]; !ok { + mi := &mountInfo{ + accessor: r.MountAccessor, + mountPath: r.MountPath, + mountType: r.MountType, + varName: "accessor_" + sanitizeID(r.MountAccessor), + } + accessorSeen[r.MountAccessor] = mi + accessorOrder = append(accessorOrder, r.MountAccessor) + } + } + + var sb strings.Builder + + sb.WriteString("# Generated by vault-csv-normalizer\n") + sb.WriteString("# These stubs represent entity clients with no alias in the Vault export.\n") + sb.WriteString("# Fill in the TODO values and verify entity names before applying.\n\n") + + for _, acc := range accessorOrder { + mi := accessorSeen[acc] + desc := mi.mountPath + if mi.mountType != "" { + desc += " (" + mi.mountType + ")" + } + sb.WriteString(fmt.Sprintf("variable %q {\n", mi.varName)) + sb.WriteString(fmt.Sprintf(" description = %q\n", "Mount accessor for "+strings.TrimSpace(desc))) + sb.WriteString(" type = string\n") + sb.WriteString(fmt.Sprintf(" default = %q\n", mi.accessor)) + sb.WriteString("}\n\n") + } + + used := make(map[string]struct{}) + counter := 0 + + for _, 
r := range targets { + name := nextPetName(used, &counter) + mi := accessorSeen[r.MountAccessor] + + sb.WriteString(fmt.Sprintf("# client_id: %s | source: %s | mount: %s\n", + r.ClientID, filepath.Base(r.Source), r.MountPath)) + + sb.WriteString(fmt.Sprintf("resource \"vault_identity_entity\" %q {\n", name)) + entityName := r.EntityName + if entityName == "" { + entityName = "TODO" + } + sb.WriteString(fmt.Sprintf(" name = %q\n", entityName)) + sb.WriteString("}\n\n") + + sb.WriteString(fmt.Sprintf("resource \"vault_identity_entity_alias\" %q {\n", name)) + sb.WriteString(" name = \"TODO\" # alias identifier used by the auth method\n") + sb.WriteString(fmt.Sprintf(" mount_accessor = var.%s\n", mi.varName)) + sb.WriteString(fmt.Sprintf(" canonical_id = vault_identity_entity.%s.id\n", name)) + sb.WriteString("}\n\n") + } + + if err := os.WriteFile(outputPath, []byte(sb.String()), 0644); err != nil { + return 0, fmt.Errorf("writing %s: %w", outputPath, err) + } + return len(targets), nil +} From 09f076fc611299a29e86cbd9afa64fe1369aa13d Mon Sep 17 00:00:00 2001 From: Schuyler Bishop Date: Fri, 29 May 2026 10:00:05 -0500 Subject: [PATCH 8/8] Refactor README and main logic for Vault CSV normalization; add tests for Terraform generation --- README.md | 7 +- cmd/vault-csv-normalizer/main.go | 16 +- internal/normalizer/normalizer.go | 7 - internal/normalizer/normalizer_test.go | 44 ----- internal/tfgen/tfgen.go | 163 +++++++++------- internal/tfgen/tfgen_test.go | 252 +++++++++++++++++++++++++ 6 files changed, 356 insertions(+), 133 deletions(-) create mode 100644 internal/tfgen/tfgen_test.go diff --git a/README.md b/README.md index cb8d1fe..c0547ef 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,12 @@ -# vault-csv-normalizer +# vault-csv-count [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) +> **Disclaimer:** This is an unofficial, community-provided tool. It is not +> created, endorsed, or supported by HashiCorp or IBM. 
Use at your own risk. +> No warranty is provided. For official Vault client counting guidance, refer +> to the [HashiCorp Vault documentation](https://developer.hashicorp.com/vault/docs). + A CLI tool that reads one or more **HashiCorp Vault client export CSV files**, normalizes their data (consistent column names, types, and values across Vault versions), and displays a summary of client counts by mount path and type. diff --git a/cmd/vault-csv-normalizer/main.go b/cmd/vault-csv-normalizer/main.go index 30132c5..3ae4a37 100644 --- a/cmd/vault-csv-normalizer/main.go +++ b/cmd/vault-csv-normalizer/main.go @@ -129,12 +129,13 @@ func main() { } preDedup := normalized + var aliasGroups [][]normalizer.Record if len(methodGroupsPerFile) > 0 { - groups := normalizer.FindAliasDuplicatesForMethodsPerFile(preDedup, methodGroupsPerFile) - if len(groups) > 0 { - fmt.Fprintf(os.Stdout, "Per-file method-scoped alias duplicates found (%d group(s))\n", len(groups)) + aliasGroups = normalizer.FindAliasDuplicatesForMethodsPerFile(preDedup, methodGroupsPerFile) + if len(aliasGroups) > 0 { + fmt.Fprintf(os.Stdout, "Per-file method-scoped alias duplicates found (%d group(s))\n", len(aliasGroups)) fmt.Fprintln(os.Stdout, "=====================================================") - for _, group := range groups { + for _, group := range aliasGroups { r0 := group[0] fmt.Fprintf(os.Stdout, "\nAlias group: %q file: %s\n", normalizer.BaseAlias(r0.EntityAliasName), filepath.Base(r0.Source)) @@ -157,9 +158,6 @@ func main() { fmt.Fprintln(os.Stdout, strings.Repeat("-", 70)) } - // Snapshot post-dedup records before filters for --generate-tf. - preFilterRecords := normalized - // Apply filters. 
if filterNS != "" { normalized = normalizer.FilterByNamespace(normalized, filterNS) @@ -186,13 +184,13 @@ func main() { if len(methodGroupsPerFile) == 0 { fmt.Fprintln(os.Stderr, "warning: --generate-tf has no effect without --dedup-methods-per-file") } else { - n, err := tfgen.GenerateTF(preFilterRecords, methodGroupsPerFile, "vault-aliases.tf") + n, err := tfgen.GenerateTF(aliasGroups, "vault-aliases.tf") if err != nil { fmt.Fprintf(os.Stderr, "error: --generate-tf: %v\n", err) os.Exit(1) } if n == 0 { - fmt.Fprintln(os.Stdout, "generate-tf: no unaliased clients found in the specified method groups") + fmt.Fprintln(os.Stdout, "generate-tf: no alias groups found — nothing to generate") } else { fmt.Fprintf(os.Stdout, "generate-tf: wrote %d entity stub(s) to vault-aliases.tf\n", n) } diff --git a/internal/normalizer/normalizer.go b/internal/normalizer/normalizer.go index 3b4b491..65ecc7d 100644 --- a/internal/normalizer/normalizer.go +++ b/internal/normalizer/normalizer.go @@ -172,13 +172,6 @@ func BaseAlias(name string) string { return name } -// aliasKey is the deduplication key for alias-based dedup: one record is -// allowed per (normalized alias, mount type) pair. -type aliasKey struct { - base string - mountType string -} - // buildMethodGroupMap converts a list of groups (each a slice of mount-type // strings) into a map from every member to the group's canonical value (the // first element of the group). 
Methods not present in any group are absent diff --git a/internal/normalizer/normalizer_test.go b/internal/normalizer/normalizer_test.go index 55c7069..c70ee53 100644 --- a/internal/normalizer/normalizer_test.go +++ b/internal/normalizer/normalizer_test.go @@ -1,7 +1,6 @@ package normalizer import ( - "strings" "testing" "time" @@ -144,49 +143,6 @@ func TestFilterByClientType(t *testing.T) { } } -func TestFilterAbandonedClients(t *testing.T) { - records := []Record{ - // removed as merged/deleted: mount path present - {ClientID: "drop-merged-1", EntityName: "", EntityAliasName: "", MountPath: "auth/ldap/", MountType: "ldap"}, - // removed as merged/deleted: mount path present even if mount type is blank - {ClientID: "drop-merged-2", EntityName: "", EntityAliasName: "", MountPath: "auth/oidc/", MountType: ""}, - // removed as no mount: mount path missing - {ClientID: "drop-nomount-1", EntityName: "", EntityAliasName: "", MountPath: "", MountType: "ldap"}, - // removed as merged/deleted PKI (auth_cert accessor, mount present) - {ClientID: "drop-merged-pki-1", EntityName: "", EntityAliasName: "", MountPath: "auth/cert/", MountType: "cert", MountAccessor: "auth_cert_abc123"}, - // removed as no-mount PKI (auth_cert accessor, mount missing) - {ClientID: "drop-nomount-pki-1", EntityName: "", EntityAliasName: "", MountPath: "", MountType: "cert", MountAccessor: "auth_cert_xyz789"}, - // keep: entity name present - {ClientID: "keep-3", EntityName: "Alice", EntityAliasName: "", MountPath: "auth/ldap/", MountType: "ldap"}, - // keep: entity alias present - {ClientID: "keep-4", EntityName: "", EntityAliasName: "alice", MountPath: "auth/ldap/", MountType: "ldap"}, - } - - out, counts := FilterAbandonedClients(records) - if counts.NoMount != 2 { - t.Fatalf("expected NoMount=2, got %d", counts.NoMount) - } - if counts.NoMountPKI != 1 { - t.Fatalf("expected NoMountPKI=1, got %d", counts.NoMountPKI) - } - if counts.MergedDeleted != 3 { - t.Fatalf("expected MergedDeleted=3, got 
%d", counts.MergedDeleted) - } - if counts.MergedDeletedPKI != 1 { - t.Fatalf("expected MergedDeletedPKI=1, got %d", counts.MergedDeletedPKI) - } - if counts.Total() != 5 { - t.Fatalf("expected Total=5, got %d", counts.Total()) - } - if len(out) != 2 { - t.Fatalf("expected 2 records after filter, got %d", len(out)) - } - for _, r := range out { - if strings.HasPrefix(r.ClientID, "drop-") { - t.Fatal("drop-* records should have been removed") - } - } -} func TestFilterSince(t *testing.T) { records := []Record{ diff --git a/internal/tfgen/tfgen.go b/internal/tfgen/tfgen.go index ea8960d..a0abdb7 100644 --- a/internal/tfgen/tfgen.go +++ b/internal/tfgen/tfgen.go @@ -1,6 +1,7 @@ -// Package tfgen generates Terraform HCL stubs for Vault entity clients that -// have no entity alias in the activity export. These stubs create a -// vault_identity_entity and a vault_identity_entity_alias for each such record. +// Package tfgen generates Terraform HCL to consolidate per-auth-method Vault +// client records into a single entity with multiple aliases. Each duplicate +// group found by --dedup-methods-per-file becomes one vault_identity_entity +// and one vault_identity_entity_alias per auth method in that group. package tfgen import ( @@ -24,8 +25,7 @@ var petNouns = []string{ "raven", "swift", "teal", "vole", "wren", "yak", "zebra", } -// nextPetName picks the next unused (adj_noun) name, cycling through all -// combinations. Falls back to "entity_N" if all 500+ combinations are used. +// nextPetName picks the next unused adj_noun name from the wordlists. func nextPetName(used map[string]struct{}, counter *int) string { total := len(petAdjectives) * len(petNouns) for i := 0; i < total; i++ { @@ -43,8 +43,7 @@ func nextPetName(used map[string]struct{}, counter *int) string { return name } -// sanitizeID converts a string to a valid Terraform identifier by lowercasing -// and replacing non-alphanumeric characters with underscores. 
+// sanitizeID converts a string to a valid Terraform identifier. func sanitizeID(s string) string { var b strings.Builder for _, ch := range strings.ToLower(s) { @@ -61,51 +60,25 @@ func sanitizeID(s string) string { return strings.Trim(result, "_") } -func buildMethodGroupMap(groups [][]string) map[string]string { - m := make(map[string]string) - for _, g := range groups { - if len(g) == 0 { - continue - } - canonical := g[0] - for _, method := range g { - m[method] = canonical - } +// effectiveAlias returns the human-readable alias for a record. For OIDC, +// entity_alias_metadata_username holds the username; entity_alias_name may be +// a subject identifier that doesn't match other methods. +func effectiveAlias(r normalizer.Record) string { + if (r.MountType == "oidc" || r.AuthMethod == "oidc") && r.EntityAliasMetadataUsername != "" { + return r.EntityAliasMetadataUsername } - return m + return r.EntityAliasName } -// GenerateTF scans records for method-group members that have a non-empty -// mount_accessor but a blank entity_alias_name. These clients exist in Vault -// without an entity alias and need one created. For each such record a -// vault_identity_entity and vault_identity_entity_alias stub is written to -// outputPath. Mount accessors are emitted as Terraform variables so the caller -// can override them per environment. Returns the number of stubs written. 
-func GenerateTF(records []normalizer.Record, groups [][]string, outputPath string) (int, error) { - groupMap := buildMethodGroupMap(groups) - - var targets []normalizer.Record - for _, r := range records { - if r.EntityAliasName != "" { - continue // already aliased — skip - } - if r.MountAccessor == "" { - continue // no mount accessor — cannot create alias - } - if normalizer.IsPKIClient(r) { - continue // PKI clients are excluded from alias management - } - mt := r.MountType - if mt == "" { - mt = r.AuthMethod - } - if _, ok := groupMap[mt]; !ok { - continue // not in any configured method group - } - targets = append(targets, r) - } - - if len(targets) == 0 { +// GenerateTF writes Terraform HCL to outputPath for each alias duplicate group. +// groups is the output of normalizer.FindAliasDuplicatesForMethodsPerFile — each +// inner slice is a set of records that represent the same person authenticated via +// different auth methods. For each group, one vault_identity_entity resource and +// one vault_identity_entity_alias per record are generated, consolidating the +// separate per-method client records into a single Vault entity. +// Returns the number of entity stubs written. +func GenerateTF(groups [][]normalizer.Record, outputPath string) (int, error) { + if len(groups) == 0 { return 0, nil } @@ -116,10 +89,14 @@ func GenerateTF(records []normalizer.Record, groups [][]string, outputPath strin varName string } + // Collect unique mount accessors across all groups. 
accessorSeen := make(map[string]*mountInfo) var accessorOrder []string - for _, r := range targets { - if _, ok := accessorSeen[r.MountAccessor]; !ok { + for _, group := range groups { + for _, r := range group { + if r.MountAccessor == "" || accessorSeen[r.MountAccessor] != nil { + continue + } mi := &mountInfo{ accessor: r.MountAccessor, mountPath: r.MountPath, @@ -134,8 +111,9 @@ func GenerateTF(records []normalizer.Record, groups [][]string, outputPath strin var sb strings.Builder sb.WriteString("# Generated by vault-csv-normalizer\n") - sb.WriteString("# These stubs represent entity clients with no alias in the Vault export.\n") - sb.WriteString("# Fill in the TODO values and verify entity names before applying.\n\n") + sb.WriteString("# Each entity block consolidates records that represent the same person\n") + sb.WriteString("# authenticated via different auth methods within a single billing period.\n") + sb.WriteString("# Verify names before applying.\n\n") for _, acc := range accessorOrder { mi := accessorSeen[acc] @@ -150,33 +128,74 @@ func GenerateTF(records []normalizer.Record, groups [][]string, outputPath strin sb.WriteString("}\n\n") } + // Group duplicate groups by source file, preserving order of first appearance. 
+ var fileOrder []string + byFile := make(map[string][][]normalizer.Record) + for _, group := range groups { + source := filepath.Base(group[0].Source) + if _, ok := byFile[source]; !ok { + fileOrder = append(fileOrder, source) + } + byFile[source] = append(byFile[source], group) + } + used := make(map[string]struct{}) counter := 0 - for _, r := range targets { - name := nextPetName(used, &counter) - mi := accessorSeen[r.MountAccessor] - - sb.WriteString(fmt.Sprintf("# client_id: %s | source: %s | mount: %s\n", - r.ClientID, filepath.Base(r.Source), r.MountPath)) + for _, source := range fileOrder { + fileGroups := byFile[source] + divider := strings.Repeat("#", 60) + sb.WriteString(fmt.Sprintf("%s\n# Source: %s (%d alias group(s))\n%s\n\n", + divider, source, len(fileGroups), divider)) + + for _, group := range fileGroups { + r0 := group[0] + entityName := normalizer.BaseAlias(effectiveAlias(r0)) + + methods := make([]string, 0, len(group)) + for _, r := range group { + mt := r.MountType + if mt == "" { + mt = r.AuthMethod + } + methods = append(methods, mt) + } - sb.WriteString(fmt.Sprintf("resource \"vault_identity_entity\" %q {\n", name)) - entityName := r.EntityName - if entityName == "" { - entityName = "TODO" + petname := nextPetName(used, &counter) + + sb.WriteString(fmt.Sprintf("# alias: %s | methods: %s\n", + entityName, strings.Join(methods, ", "))) + + sb.WriteString(fmt.Sprintf("resource \"vault_identity_entity\" %q {\n", petname)) + sb.WriteString(fmt.Sprintf(" name = %q\n", entityName)) + sb.WriteString("}\n\n") + + for i, r := range group { + mt := r.MountType + if mt == "" { + mt = r.AuthMethod + } + aliasName := r.EntityAliasName + if aliasName == "" { + aliasName = "TODO" + } + aliasResource := fmt.Sprintf("%s_%d", petname, i) + + sb.WriteString(fmt.Sprintf("resource \"vault_identity_entity_alias\" %q {\n", aliasResource)) + sb.WriteString(fmt.Sprintf(" name = %q # %s\n", aliasName, mt)) + if mi := accessorSeen[r.MountAccessor]; mi != nil { + 
sb.WriteString(fmt.Sprintf(" mount_accessor = var.%s\n", mi.varName)) + } else { + sb.WriteString(" mount_accessor = \"TODO\" # mount_accessor not in export\n") + } + sb.WriteString(fmt.Sprintf(" canonical_id = vault_identity_entity.%s.id\n", petname)) + sb.WriteString("}\n\n") + } } - sb.WriteString(fmt.Sprintf(" name = %q\n", entityName)) - sb.WriteString("}\n\n") - - sb.WriteString(fmt.Sprintf("resource \"vault_identity_entity_alias\" %q {\n", name)) - sb.WriteString(" name = \"TODO\" # alias identifier used by the auth method\n") - sb.WriteString(fmt.Sprintf(" mount_accessor = var.%s\n", mi.varName)) - sb.WriteString(fmt.Sprintf(" canonical_id = vault_identity_entity.%s.id\n", name)) - sb.WriteString("}\n\n") } if err := os.WriteFile(outputPath, []byte(sb.String()), 0644); err != nil { return 0, fmt.Errorf("writing %s: %w", outputPath, err) } - return len(targets), nil + return len(groups), nil } diff --git a/internal/tfgen/tfgen_test.go b/internal/tfgen/tfgen_test.go new file mode 100644 index 0000000..84edb44 --- /dev/null +++ b/internal/tfgen/tfgen_test.go @@ -0,0 +1,252 @@ +package tfgen + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/vault-csv-normalizer/internal/normalizer" +) + +func TestGenerateTF_EmptyGroups(t *testing.T) { + out := filepath.Join(t.TempDir(), "out.tf") + n, err := GenerateTF(nil, out) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if n != 0 { + t.Errorf("expected 0 stubs, got %d", n) + } + if _, err := os.Stat(out); !os.IsNotExist(err) { + t.Error("expected no file to be written for empty groups") + } +} + +func TestGenerateTF_SingleGroup(t *testing.T) { + groups := [][]normalizer.Record{ + { + { + ClientID: "ldap-001", + Source: "jan.csv", + MountAccessor: "auth_ldap_abc", + MountPath: "auth/ldap/", + MountType: "ldap", + ClientType: "entity", + EntityAliasName: "alice", + }, + { + ClientID: "oidc-001", + Source: "jan.csv", + MountAccessor: "auth_oidc_xyz", + MountPath: "auth/oidc/", + 
MountType: "oidc", + ClientType: "entity", + EntityAliasName: "alice@corp.com", + }, + }, + } + + out := filepath.Join(t.TempDir(), "out.tf") + n, err := GenerateTF(groups, out) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if n != 1 { + t.Errorf("expected 1 stub, got %d", n) + } + + content, err := os.ReadFile(out) + if err != nil { + t.Fatalf("reading output: %v", err) + } + tf := string(content) + + // One entity resource + if count := strings.Count(tf, "resource \"vault_identity_entity\""); count != 1 { + t.Errorf("expected 1 vault_identity_entity resource, got %d", count) + } + // Two alias resources (one per record in the group) + if count := strings.Count(tf, "resource \"vault_identity_entity_alias\""); count != 2 { + t.Errorf("expected 2 vault_identity_entity_alias resources, got %d", count) + } + // Entity name uses base alias (no domain) + if !strings.Contains(tf, `name = "alice"`) { + t.Error("expected entity name to be the base alias \"alice\"") + } + // LDAP alias name preserved as-is + if !strings.Contains(tf, `name = "alice" # ldap`) { + t.Error("expected LDAP alias name \"alice\"") + } + // OIDC alias name preserved as-is (full email) + if !strings.Contains(tf, `name = "alice@corp.com" # oidc`) { + t.Error("expected OIDC alias name \"alice@corp.com\"") + } + // Variables declared for both mount accessors + if !strings.Contains(tf, `variable "accessor_auth_ldap_abc"`) { + t.Error("expected variable for auth_ldap_abc") + } + if !strings.Contains(tf, `variable "accessor_auth_oidc_xyz"`) { + t.Error("expected variable for auth_oidc_xyz") + } + // canonical_id references the entity resource + if !strings.Contains(tf, "vault_identity_entity.") { + t.Error("expected canonical_id referencing vault_identity_entity") + } +} + +func TestGenerateTF_MultipleGroups(t *testing.T) { + groups := [][]normalizer.Record{ + { + {ClientID: "ldap-001", Source: "jan.csv", MountAccessor: "auth_ldap_abc", MountPath: "auth/ldap/", MountType: "ldap", 
EntityAliasName: "alice"}, + {ClientID: "oidc-001", Source: "jan.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: "alice@corp.com"}, + }, + { + {ClientID: "ldap-002", Source: "jan.csv", MountAccessor: "auth_ldap_abc", MountPath: "auth/ldap/", MountType: "ldap", EntityAliasName: "bob"}, + {ClientID: "oidc-002", Source: "jan.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: "bob@corp.com"}, + }, + } + + out := filepath.Join(t.TempDir(), "out.tf") + n, err := GenerateTF(groups, out) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if n != 2 { + t.Errorf("expected 2 stubs, got %d", n) + } + + content, _ := os.ReadFile(out) + tf := string(content) + + if count := strings.Count(tf, "resource \"vault_identity_entity\""); count != 2 { + t.Errorf("expected 2 vault_identity_entity resources, got %d", count) + } + if count := strings.Count(tf, "resource \"vault_identity_entity_alias\""); count != 4 { + t.Errorf("expected 4 vault_identity_entity_alias resources, got %d", count) + } + // Shared mount accessors declared only once each + if count := strings.Count(tf, `variable "accessor_auth_ldap_abc"`); count != 1 { + t.Errorf("expected mount accessor variable declared once, got %d", count) + } + if count := strings.Count(tf, `variable "accessor_auth_oidc_xyz"`); count != 1 { + t.Errorf("expected mount accessor variable declared once, got %d", count) + } +} + +func TestGenerateTF_PetnamesAreUnique(t *testing.T) { + // Build enough groups to exercise multiple petname assignments. 
+ aliases := []string{"alice", "bob", "carol", "dave", "eve"} + groups := make([][]normalizer.Record, len(aliases)) + for i, alias := range aliases { + groups[i] = []normalizer.Record{ + {ClientID: "ldap-" + alias, Source: "jan.csv", MountAccessor: "auth_ldap_abc", MountPath: "auth/ldap/", MountType: "ldap", EntityAliasName: alias}, + {ClientID: "oidc-" + alias, Source: "jan.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: alias + "@corp.com"}, + } + } + + out := filepath.Join(t.TempDir(), "out.tf") + _, err := GenerateTF(groups, out) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + content, _ := os.ReadFile(out) + tf := string(content) + + // Extract resource names and verify uniqueness. + seen := make(map[string]int) + for _, line := range strings.Split(tf, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "resource \"vault_identity_entity\" ") { + name := strings.Trim(strings.Fields(line)[2], `"{ `) + seen[name]++ + } + } + for name, count := range seen { + if count > 1 { + t.Errorf("petname %q used %d times — names must be unique", name, count) + } + } +} + +func TestGenerateTF_GroupedByFile(t *testing.T) { + groups := [][]normalizer.Record{ + // jan.csv — alice + { + {ClientID: "ldap-001", Source: "/data/jan.csv", MountAccessor: "auth_ldap_abc", MountPath: "auth/ldap/", MountType: "ldap", EntityAliasName: "alice"}, + {ClientID: "oidc-001", Source: "/data/jan.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: "alice@corp.com"}, + }, + // feb.csv — alice (same person, different file) + { + {ClientID: "ldap-002", Source: "/data/feb.csv", MountAccessor: "auth_ldap_abc", MountPath: "auth/ldap/", MountType: "ldap", EntityAliasName: "alice"}, + {ClientID: "oidc-002", Source: "/data/feb.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: "alice@corp.com"}, + }, + // feb.csv — bob (second 
group in the same file) + { + {ClientID: "ldap-003", Source: "/data/feb.csv", MountAccessor: "auth_ldap_abc", MountPath: "auth/ldap/", MountType: "ldap", EntityAliasName: "bob"}, + {ClientID: "oidc-003", Source: "/data/feb.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: "bob@corp.com"}, + }, + } + + out := filepath.Join(t.TempDir(), "out.tf") + n, err := GenerateTF(groups, out) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if n != 3 { + t.Errorf("expected 3 stubs, got %d", n) + } + + content, _ := os.ReadFile(out) + tf := string(content) + + // File headers present for both source files. + if !strings.Contains(tf, "Source: jan.csv") { + t.Error("expected file header for jan.csv") + } + if !strings.Contains(tf, "Source: feb.csv") { + t.Error("expected file header for feb.csv") + } + // jan.csv header shows 1 group, feb.csv header shows 2 groups. + if !strings.Contains(tf, "jan.csv (1 alias group(s))") { + t.Error("expected jan.csv to show 1 alias group") + } + if !strings.Contains(tf, "feb.csv (2 alias group(s))") { + t.Error("expected feb.csv to show 2 alias groups") + } + // jan.csv header appears before feb.csv header. + if strings.Index(tf, "jan.csv") > strings.Index(tf, "feb.csv") { + t.Error("expected jan.csv section before feb.csv section") + } + // Total: 3 entities, 6 aliases. 
+ if count := strings.Count(tf, "resource \"vault_identity_entity\""); count != 3 { + t.Errorf("expected 3 vault_identity_entity resources, got %d", count) + } + if count := strings.Count(tf, "resource \"vault_identity_entity_alias\""); count != 6 { + t.Errorf("expected 6 vault_identity_entity_alias resources, got %d", count) + } +} + +func TestGenerateTF_MissingMountAccessor(t *testing.T) { + groups := [][]normalizer.Record{ + { + {ClientID: "ldap-001", Source: "jan.csv", MountAccessor: "", MountPath: "auth/ldap/", MountType: "ldap", EntityAliasName: "alice"}, + {ClientID: "oidc-001", Source: "jan.csv", MountAccessor: "auth_oidc_xyz", MountPath: "auth/oidc/", MountType: "oidc", EntityAliasName: "alice@corp.com"}, + }, + } + + out := filepath.Join(t.TempDir(), "out.tf") + _, err := GenerateTF(groups, out) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + content, _ := os.ReadFile(out) + tf := string(content) + + // Record with no mount_accessor gets a TODO placeholder, not a var reference. + if !strings.Contains(tf, `mount_accessor = "TODO"`) { + t.Error("expected TODO placeholder for missing mount_accessor") + } +}