Skip to content

Commit 6f9efb7

Browse files
authored
Legislator matching between OCPF and legislature IDs (#2168)
* ocpf-id-matching * remove member from flags doc once matched * cleaned up the matching logic * removing unused import * clarified logging for ambiguous cases * further cleaning up log message * use all_filers file for matching
1 parent d3b610b commit 6f9efb7

5 files changed

Lines changed: 390 additions & 7 deletions

File tree

functions/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@
1313
},
1414
"main": "lib/index.js",
1515
"dependencies": {
16+
"@google-cloud/aiplatform": "^3.9.0",
1617
"@google-cloud/firestore": "^5.0.2",
1718
"@google-cloud/pubsub": "^3.0.1",
1819
"assemblyai": "^4.9.0",
1920
"axios": "^0.25.0",
2021
"date-fns": "^2.30.0",
2122
"firebase-admin": "^12.0.0",
22-
"@google-cloud/aiplatform": "^3.9.0",
2323
"firebase-functions": "^5.1.1",
2424
"fluent-ffmpeg": "^2.1.3",
2525
"fuse.js": "6.5.3",
@@ -34,6 +34,7 @@
3434
"runtypes": "6.6.0",
3535
"ssl-root-cas": "^1.3.1",
3636
"typesense": "^1.2.2",
37+
"unzipper": "^0.12.3",
3738
"zod": "^3.20.2"
3839
},
3940
"devDependencies": {
@@ -43,6 +44,7 @@
4344
"@types/luxon": "^2.0.9",
4445
"@types/object-hash": "^2.2.1",
4546
"@types/pdf-parse": "1.1.5",
47+
"@types/unzipper": "^0.10.10",
4648
"copyfiles": "^2.4.1",
4749
"firebase-functions-test": "^0.3.3",
4850
"firebase-tools": "^13.18.0",

functions/src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ export {
6060

6161
export { transcription } from "./webhooks"
6262

63+
export { matchOcpfMembers } from "./ocpf/matchOcpfMembers"
64+
6365
export * from "./triggerPubsubFunction"
6466

6567
// Export the health check last so it is loaded last.
Lines changed: 279 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,279 @@
1+
import * as functions from "firebase-functions"
2+
import { getAuth } from "firebase-admin/auth"
3+
import axios from "axios"
4+
import unzipper from "unzipper"
5+
import { db } from "../firebase"
6+
import { currentGeneralCourt } from "../shared"
7+
import { MemberContent } from "../members/types"
8+
import {
9+
OcpfFilerRow,
10+
OcpfMemberMapping,
11+
OcpfMemberMappingEntry,
12+
OcpfMemberMappingFlags,
13+
OcpfMemberMappingFlagsEntry
14+
} from "./types"
15+
16+
/**
 * HTTP endpoint that matches members of the current General Court to OCPF
 * filers and records the outcome in Firestore.
 *
 * - Only `POST` is accepted (405 otherwise).
 * - Outside the Functions emulator the caller must send a
 *   `Authorization: Bearer <ID token>` header whose decoded token has
 *   `role === "admin"` (401 for a missing/invalid token, 403 for a non-admin).
 * - Unique matches are merged into `/config/ocpfMemberMapping`; the
 *   unmatched/ambiguous lists overwrite `/config/ocpfMemberMappingFlags`.
 * - Responds 200 with the counts and both flag lists, or 500 if loading,
 *   matching or persisting fails.
 */
export const matchOcpfMembers = functions.https.onRequest(async (req, res) => {
  if (req.method !== "POST") {
    res.status(405).send("Method Not Allowed. Use POST.")
    return
  }

  // Authentication is skipped when running under the Functions emulator.
  if (process.env.FUNCTIONS_EMULATOR !== "true") {
    const authHeader = req.headers.authorization
    if (!authHeader?.startsWith("Bearer ")) {
      res.status(401).send("Unauthorized")
      return
    }
    try {
      // slice(7) drops the "Bearer " prefix.
      const decoded = await getAuth().verifyIdToken(authHeader.slice(7))
      if (decoded["role"] !== "admin") {
        res.status(403).send("Forbidden")
        return
      }
    } catch {
      res.status(401).send("Unauthorized")
      return
    }
  }

  try {
    // The three inputs are independent of each other, so fetch them in parallel.
    const [filers, members, existingMappingDoc] = await Promise.all([
      downloadAndParseFilers(),
      loadMembers(),
      db.doc("/config/ocpfMemberMapping").get()
    ])
    const existingMapping = (existingMappingDoc.data() ??
      {}) as OcpfMemberMapping

    const mapping: OcpfMemberMapping = {}
    const unmatched: OcpfMemberMappingFlagsEntry[] = []
    const ambiguous: OcpfMemberMappingFlagsEntry[] = []

    for (const member of members) {
      const branch = member.Branch
      if (!branch || (branch !== "Senate" && branch !== "House")) continue

      // Lower-case once per member instead of once per filer comparison.
      const lastName = extractLastName(member.Name).toLowerCase()
      const lastNameAndBranchMatches = filers.filter(
        f => f.lastName.toLowerCase() === lastName && f.officeSought === branch
      )

      // Narrow by first name: compare first word of each (e.g. "Daniel" from "Daniel J. Ryan"
      // vs "Daniel" from "Daniel Joseph"). If none align, falls back to matches by last name and branch.
      const mapleFirstName = member.Name.trim().split(/\s+/)[0].toLowerCase()
      const firstNameMatches = lastNameAndBranchMatches.filter(
        f => f.firstName.trim().split(/\s+/)[0].toLowerCase() === mapleFirstName
      )
      const candidates =
        firstNameMatches.length > 0 ? firstNameMatches : lastNameAndBranchMatches

      if (firstNameMatches.length === 1) {
        // Exactly one filer agrees on last name, branch and first name.
        const entry: OcpfMemberMappingEntry = {
          cpfId: candidates[0].cpfId,
          name: member.Name
        }
        mapping[member.MemberCode] = entry
      } else if (member.MemberCode in existingMapping) {
        // Matching was likely fixed manually; keep the stored entry and do not flag.
        continue
      } else if (candidates.length === 1 && firstNameMatches.length === 0) {
        // Single last name match but first name didn't align. Flag rather than auto-match,
        // since the OCPF filer may be a different person (e.g. original member changed office sought,
        // and another person with same last name is running for original office).
        ambiguous.push({ memberCode: member.MemberCode, name: member.Name })
        functions.logger.warn(
          "Ambiguous OCPF match. Single last name match but first name did not align.",
          {
            memberCode: member.MemberCode,
            name: member.Name,
            district: member.District,
            branch,
            ocpfFirstName: candidates[0].firstName,
            ocpfLastName: candidates[0].lastName,
            ocpfDistrict: candidates[0].district,
            ocpfOfficeSought: candidates[0].officeSought
          }
        )
      } else if (candidates.length === 0) {
        unmatched.push({ memberCode: member.MemberCode, name: member.Name })
        functions.logger.warn("No OCPF match.", {
          memberCode: member.MemberCode,
          name: member.Name,
          district: member.District,
          branch
        })
      } else {
        // More than one plausible filer remains.
        ambiguous.push({ memberCode: member.MemberCode, name: member.Name })
        functions.logger.warn("Ambiguous OCPF match.", {
          memberCode: member.MemberCode,
          name: member.Name,
          district: member.District,
          branch,
          candidates: candidates.map(c => ({
            cpfId: c.cpfId,
            firstName: c.firstName,
            lastName: c.lastName,
            district: c.district,
            officeSought: c.officeSought
          }))
        })
      }
    }

    const flags: OcpfMemberMappingFlags = { unmatched, ambiguous }

    // merge: true keeps entries for members that were not auto-matched in this run;
    // the flags document is replaced wholesale so resolved members drop out of it.
    await db.doc("/config/ocpfMemberMapping").set(mapping, { merge: true })
    await db.doc("/config/ocpfMemberMappingFlags").set(flags)

    const results = {
      matched: Object.keys(mapping).length,
      unmatched: unmatched.length,
      ambiguous: ambiguous.length
    }
    functions.logger.info("matchOcpfMembers complete", results)

    res.status(200).json({
      results,
      unmatched_members: unmatched,
      ambiguous_members: ambiguous
    })
  } catch (err: unknown) {
    // Without this, a failed download/parse/write would escape the handler
    // without an explicit HTTP response being sent.
    functions.logger.error("matchOcpfMembers failed", {
      error: err instanceof Error ? err.message : String(err)
    })
    if (!res.headersSent) {
      res.status(500).send("Internal Server Error")
    }
  }
})
146+
147+
/**
 * Downloads `ocpf-filers.zip`, extracts `all_filers.txt` and returns the rows
 * that are still open (empty closed date) and seek a "Senate" or "House" office.
 *
 * @returns the parsed filer rows
 * @throws if the download fails, the archive has no `all_filers.txt`, or a
 *   required column is missing from the header row
 */
async function downloadAndParseFilers(): Promise<OcpfFilerRow[]> {
  const response = await axios.get(
    "https://ocpf2.blob.core.windows.net/downloads/data2/ocpf-filers.zip",
    { responseType: "arraybuffer" }
  )

  const buffer = Buffer.from(response.data as ArrayBuffer)
  functions.logger.info("Downloaded ocpf-filers.zip", {
    status: response.status,
    contentType: response.headers["content-type"],
    bytes: buffer.length,
    firstBytes: buffer.subarray(0, 4).toString("hex") // should be 504b0304 for a valid ZIP
  })
  const directory = await unzipper.Open.buffer(buffer)

  // Using all_filers.txt to catch more edge cases. If want to filter out more ambiguity,
  // such as not including Z account types, we can switch to candidates.txt
  const entry = directory.files.find(
    f => f.type === "File" && f.path.toLowerCase() === "all_filers.txt"
  )
  if (!entry)
    throw new Error("No all_filers.txt file found inside ocpf-filers.zip")

  const content = await entry.buffer()
  const filers = parseFilersText(content.toString("utf8"))

  functions.logger.info("Parsed active state legislators from OCPF", {
    count: filers.length
  })
  return filers
}

/**
 * Parses the tab-separated text of `all_filers.txt` into filer rows.
 *
 * Rows are dropped when they are blank, have a non-empty closed date, seek an
 * office other than "Senate"/"House", or carry a CPF id that is not an integer.
 *
 * @param text full file contents; the first line is the header row
 * @throws if a required column cannot be located in the header row
 */
function parseFilersText(text: string): OcpfFilerRow[] {
  // Values in the file are wrapped in double quotes — strip them after splitting
  const unquote = (value: string | undefined): string =>
    (value ?? "").trim().replace(/^"|"$/g, "")

  const lines = text.split(/\r?\n/)

  // Headers get the same unquoting as values so a quoted header row still resolves.
  const rawHeaders = lines[0].split("\t").map(h => unquote(h))
  functions.logger.info("OCPF filers headers", { headers: rawHeaders })

  const colIndex = buildColumnIndex(rawHeaders, [
    "cpfId",
    "lastName",
    "firstName",
    "officeSought",
    "district",
    "closedDate"
  ])

  const filers: OcpfFilerRow[] = []
  let invalidIdRows = 0

  for (let i = 1; i < lines.length; i++) {
    const line = lines[i]
    if (!line.trim()) continue

    const cols = line.split("\t")
    const closedDate = unquote(cols[colIndex.closedDate])
    const officeSought = unquote(cols[colIndex.officeSought])

    if (closedDate !== "") continue
    if (officeSought !== "Senate" && officeSought !== "House") continue

    // parseInt yields NaN for an empty or non-numeric cell; such a row can
    // never produce a usable mapping entry, so leave it out.
    const cpfId = parseInt(unquote(cols[colIndex.cpfId]), 10)
    if (Number.isNaN(cpfId)) {
      invalidIdRows++
      continue
    }

    filers.push({
      cpfId,
      lastName: unquote(cols[colIndex.lastName]),
      firstName: unquote(cols[colIndex.firstName]),
      officeSought,
      district: unquote(cols[colIndex.district]),
      closedDate
    })
  }

  if (invalidIdRows > 0) {
    functions.logger.warn("Skipped OCPF filer rows with a non-numeric CPF id", {
      count: invalidIdRows
    })
  }

  return filers
}
217+
218+
/**
 * Accepted (normalized) header names for each logical field of a filer row.
 * Fields without an entry fall back to their own lower-cased name.
 */
const COLUMN_ALIASES: Record<string, string[]> = {
  cpfId: ["cpf_id"],
  lastName: ["candidate_last_name"],
  firstName: ["candidate_first_name"],
  officeSought: ["office_type_sought"],
  district: ["district_name_sought"],
  closedDate: ["closed_date"]
}

/**
 * Locates the column position of every requested field in a header row.
 *
 * Headers are compared after lower-casing and replacing each whitespace run
 * with "_". For a field, aliases are tried in order and the first header that
 * equals an alias wins.
 *
 * @param headers header cells as read from the file
 * @param fields logical field names to resolve
 * @returns a map from field name to zero-based column position
 * @throws Error if any field has no matching header
 */
function buildColumnIndex(
  headers: string[],
  fields: string[]
): Record<string, number> {
  const canonicalHeaders = headers.map(header =>
    header.toLowerCase().replace(/\s+/g, "_")
  )
  const positions: Record<string, number> = {}

  for (const field of fields) {
    const aliasList = COLUMN_ALIASES[field] ?? [field.toLowerCase()]

    let position = -1
    for (const alias of aliasList) {
      position = canonicalHeaders.indexOf(alias)
      if (position !== -1) break
    }

    if (position === -1) {
      throw new Error(
        `Required column '${field}' not found in OCPF filers file. ` +
          `Headers: ${headers.join(", ")}`
      )
    }
    positions[field] = position
  }

  return positions
}
255+
256+
/** Trailing name tokens (lower-cased, without "." or ",") that are not surnames. */
const GENERATIONAL_SUFFIXES = new Set(["jr", "sr", "ii", "iii", "iv", "v"])

/**
 * Returns the surname token of a full name.
 *
 * Walks back from the final whitespace-separated token past any generational
 * suffixes (never past the first token), then removes a single trailing
 * "," or "." from the token it stops on.
 *
 * @param fullName e.g. "John Smith, Jr."
 * @returns e.g. "Smith"; an empty string for blank input
 */
function extractLastName(fullName: string): string {
  const tokens = fullName.trim().split(/\s+/)

  let surnameAt = tokens.length - 1
  for (; surnameAt > 0; surnameAt--) {
    const bare = tokens[surnameAt].toLowerCase().replace(/[.,]/g, "")
    if (!GENERATIONAL_SUFFIXES.has(bare)) break
  }

  return tokens[surnameAt].replace(/[,.]$/, "")
}
267+
268+
/**
 * Reads every member document of the current General Court and returns the
 * `content` field of each, leaving out documents where it is missing or falsy.
 */
async function loadMembers(): Promise<MemberContent[]> {
  const membersRef = db.collection(
    `/generalCourts/${currentGeneralCourt}/members`
  )
  const { docs } = await membersRef.get()

  const contents: MemberContent[] = []
  for (const doc of docs) {
    const content = doc.data()?.content as MemberContent | undefined
    if (content) contents.push(content)
  }
  return contents
}

functions/src/ocpf/types.ts

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/** One active filer row parsed from all_filers.txt (inside ocpf-filers.zip). */
export interface OcpfFilerRow {
  cpfId: number
  lastName: string
  firstName: string
  /** "Senate" | "House" */
  officeSought: string
  district: string
  /** Empty string = active. */
  closedDate: string
}
10+
11+
/** A single member's entry in the OCPF member mapping. */
export interface OcpfMemberMappingEntry {
  cpfId: number
  name: string
}

/**
 * Firestore: /config/ocpfMemberMapping
 * memberCode → { cpfId, name }, e.g. { "SND1": { cpfId: 15031, name: "Sal N. DiDomenico" } }
 */
export type OcpfMemberMapping = Record<string, OcpfMemberMappingEntry>
19+
20+
/** A member that could not be matched automatically to an OCPF filer. */
export interface OcpfMemberMappingFlagsEntry {
  memberCode: string
  name: string
}

/** Firestore: /config/ocpfMemberMappingFlags */
export interface OcpfMemberMappingFlags {
  /** Members with no candidate filer. */
  unmatched: OcpfMemberMappingFlagsEntry[]
  /** Members whose candidate filer(s) could not be confirmed as a unique match. */
  ambiguous: OcpfMemberMappingFlagsEntry[]
}
30+
31+
/**
 * A count paired with an amount.
 * NOTE(review): not used by the matching code visible here; presumably the
 * number and total of contributions in one category — confirm with the consumer.
 */
export interface FinanceBreakdownEntry {
  count: number
  amount: number
}

/**
 * Breakdown split into individual, committee, union and unitemized parts.
 * NOTE(review): units and the meaning of each category are not established in
 * this file — confirm with the consumer.
 */
export interface MembersFinanceBreakdown {
  individual: FinanceBreakdownEntry
  committee: FinanceBreakdownEntry
  union: FinanceBreakdownEntry
  /** Carries an amount only, no count. */
  unitemized: { amount: number }
}

0 commit comments

Comments
 (0)