Skip to content

Commit 3c7bfa7

Browse files
waleedlatif1 and claude authored
improvement(kb): deferred content fetching and metadata-based hashes for connectors (#4044)
* improvement(kb): deferred content fetching and metadata-based hashes for connectors
* fix(kb): remove message count from outlook contentHash to prevent list/get divergence
* fix(kb): increase outlook getDocument message limit from 50 to 250
* fix(kb): skip outlook messages without conversationId to prevent broken stubs
* fix(kb): scope outlook getDocument to same folder as listDocuments to prevent hash divergence
* fix(kb): add missing connector sync cron job to Helm values

The connector sync endpoint existed but had no cron job configured to trigger it, meaning scheduled syncs would never fire.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d0d35dd commit 3c7bfa7

File tree

17 files changed

+350
-305
lines changed

17 files changed

+350
-305
lines changed

apps/sim/connectors/asana/asana.ts

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
22
import { AsanaIcon } from '@/components/icons'
33
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
44
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
5-
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
5+
import { joinTagArray, parseTagDate } from '@/connectors/utils'
66

77
const logger = createLogger('AsanaConnector')
88

@@ -240,7 +240,6 @@ export const asanaConnector: ConnectorConfig = {
240240

241241
for (const task of result.data) {
242242
const content = buildTaskContent(task)
243-
const contentHash = await computeContentHash(content)
244243
const tagNames = task.tags?.map((t) => t.name).filter(Boolean) || []
245244

246245
documents.push({
@@ -249,7 +248,7 @@ export const asanaConnector: ConnectorConfig = {
249248
content,
250249
mimeType: 'text/plain',
251250
sourceUrl: task.permalink_url || undefined,
252-
contentHash,
251+
contentHash: `asana:${task.gid}:${task.modified_at ?? ''}`,
253252
metadata: {
254253
project: currentProjectGid,
255254
assignee: task.assignee?.name,
@@ -315,7 +314,6 @@ export const asanaConnector: ConnectorConfig = {
315314
if (!task) return null
316315

317316
const content = buildTaskContent(task)
318-
const contentHash = await computeContentHash(content)
319317
const tagNames = task.tags?.map((t) => t.name).filter(Boolean) || []
320318

321319
return {
@@ -324,7 +322,7 @@ export const asanaConnector: ConnectorConfig = {
324322
content,
325323
mimeType: 'text/plain',
326324
sourceUrl: task.permalink_url || undefined,
327-
contentHash,
325+
contentHash: `asana:${task.gid}:${task.modified_at ?? ''}`,
328326
metadata: {
329327
assignee: task.assignee?.name,
330328
completed: task.completed,

apps/sim/connectors/fireflies/fireflies.ts

Lines changed: 24 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
22
import { FirefliesIcon } from '@/components/icons'
33
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
44
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
5-
import { computeContentHash, parseTagDate } from '@/connectors/utils'
5+
import { parseTagDate } from '@/connectors/utils'
66

77
const logger = createLogger('FirefliesConnector')
88

@@ -196,50 +196,34 @@ export const firefliesConnector: ConnectorConfig = {
196196
id
197197
name
198198
}
199-
sentences {
200-
index
201-
speaker_name
202-
text
203-
}
204-
summary {
205-
keywords
206-
action_items
207-
overview
208-
short_summary
209-
}
210199
}
211200
}`,
212201
variables
213202
)
214203

215204
const transcripts = (data.transcripts || []) as FirefliesTranscript[]
216205

217-
const documents: ExternalDocument[] = await Promise.all(
218-
transcripts.map(async (transcript) => {
219-
const content = formatTranscriptContent(transcript)
220-
const contentHash = await computeContentHash(content)
221-
222-
const meetingDate = transcript.date ? new Date(transcript.date).toISOString() : undefined
223-
const speakerNames = transcript.speakers?.map((s) => s.name).filter(Boolean) ?? []
224-
225-
return {
226-
externalId: transcript.id,
227-
title: transcript.title || 'Untitled Meeting',
228-
content,
229-
mimeType: 'text/plain' as const,
230-
sourceUrl: transcript.transcript_url || undefined,
231-
contentHash,
232-
metadata: {
233-
hostEmail: transcript.host_email,
234-
duration: transcript.duration,
235-
meetingDate,
236-
participants: transcript.participants,
237-
speakers: speakerNames,
238-
keywords: transcript.summary?.keywords,
239-
},
240-
}
241-
})
242-
)
206+
const documents: ExternalDocument[] = transcripts.map((transcript) => {
207+
const meetingDate = transcript.date ? new Date(transcript.date).toISOString() : undefined
208+
const speakerNames = transcript.speakers?.map((s) => s.name).filter(Boolean) ?? []
209+
210+
return {
211+
externalId: transcript.id,
212+
title: transcript.title || 'Untitled Meeting',
213+
content: '',
214+
contentDeferred: true,
215+
mimeType: 'text/plain' as const,
216+
sourceUrl: transcript.transcript_url || undefined,
217+
contentHash: `fireflies:${transcript.id}:${transcript.date ?? ''}:${transcript.duration ?? ''}`,
218+
metadata: {
219+
hostEmail: transcript.host_email,
220+
duration: transcript.duration,
221+
meetingDate,
222+
participants: transcript.participants,
223+
speakers: speakerNames,
224+
},
225+
}
226+
})
243227

244228
const totalFetched = ((syncContext?.totalDocsFetched as number) ?? 0) + documents.length
245229
if (syncContext) syncContext.totalDocsFetched = totalFetched
@@ -296,7 +280,7 @@ export const firefliesConnector: ConnectorConfig = {
296280
if (!transcript) return null
297281

298282
const content = formatTranscriptContent(transcript)
299-
const contentHash = await computeContentHash(content)
283+
const contentHash = `fireflies:${transcript.id}:${transcript.date ?? ''}:${transcript.duration ?? ''}`
300284

301285
const meetingDate = transcript.date ? new Date(transcript.date).toISOString() : undefined
302286
const speakerNames = transcript.speakers?.map((s) => s.name).filter(Boolean) ?? []
@@ -305,6 +289,7 @@ export const firefliesConnector: ConnectorConfig = {
305289
externalId: transcript.id,
306290
title: transcript.title || 'Untitled Meeting',
307291
content,
292+
contentDeferred: false,
308293
mimeType: 'text/plain',
309294
sourceUrl: transcript.transcript_url || undefined,
310295
contentHash,

apps/sim/connectors/google-calendar/google-calendar.ts

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
22
import { GoogleCalendarIcon } from '@/components/icons'
33
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
44
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
5-
import { computeContentHash, parseTagDate } from '@/connectors/utils'
5+
import { parseTagDate } from '@/connectors/utils'
66

77
const logger = createLogger('GoogleCalendarConnector')
88

@@ -195,14 +195,12 @@ function getTimeRange(sourceConfig: Record<string, unknown>): { timeMin: string;
195195
/**
196196
* Converts a CalendarEvent to an ExternalDocument.
197197
*/
198-
async function eventToDocument(event: CalendarEvent): Promise<ExternalDocument | null> {
198+
function eventToDocument(event: CalendarEvent): ExternalDocument | null {
199199
if (event.status === 'cancelled') return null
200200

201201
const content = eventToContent(event)
202202
if (!content.trim()) return null
203203

204-
const contentHash = await computeContentHash(content)
205-
206204
const startTime = event.start?.dateTime || event.start?.date || ''
207205
const attendeeCount = event.attendees?.filter((a) => !a.resource).length || 0
208206

@@ -212,7 +210,7 @@ async function eventToDocument(event: CalendarEvent): Promise<ExternalDocument |
212210
content,
213211
mimeType: 'text/plain',
214212
sourceUrl: event.htmlLink || `https://calendar.google.com/calendar/event?eid=${event.id}`,
215-
contentHash,
213+
contentHash: `gcal:${event.id}:${event.updated ?? ''}`,
216214
metadata: {
217215
startTime,
218216
endTime: event.end?.dateTime || event.end?.date || '',
@@ -348,7 +346,7 @@ export const googleCalendarConnector: ConnectorConfig = {
348346

349347
const documents: ExternalDocument[] = []
350348
for (const event of events) {
351-
const doc = await eventToDocument(event)
349+
const doc = eventToDocument(event)
352350
if (doc) documents.push(doc)
353351
}
354352

@@ -392,7 +390,7 @@ export const googleCalendarConnector: ConnectorConfig = {
392390

393391
if (event.status === 'cancelled') return null
394392

395-
return eventToDocument(event)
393+
return eventToDocument(event) ?? null
396394
},
397395

398396
validateConfig: async (

apps/sim/connectors/google-docs/google-docs.ts

Lines changed: 30 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
22
import { GoogleDocsIcon } from '@/components/icons'
33
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
44
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
5-
import { computeContentHash, joinTagArray, parseTagDate } from '@/connectors/utils'
5+
import { joinTagArray, parseTagDate } from '@/connectors/utils'
66

77
const logger = createLogger('GoogleDocsConnector')
88

@@ -117,40 +117,23 @@ async function fetchDocContent(accessToken: string, documentId: string): Promise
117117
}
118118

119119
/**
120-
* Converts a Drive file entry into an ExternalDocument by fetching its content
121-
* from the Google Docs API.
120+
* Creates a lightweight stub from a Drive file entry. Content is deferred
121+
* and only fetched via getDocument for new or changed documents.
122122
*/
123-
async function fileToDocument(
124-
accessToken: string,
125-
file: DriveFile
126-
): Promise<ExternalDocument | null> {
127-
try {
128-
const content = await fetchDocContent(accessToken, file.id)
129-
if (!content.trim()) {
130-
logger.info(`Skipping empty document: ${file.name} (${file.id})`)
131-
return null
132-
}
133-
134-
const contentHash = await computeContentHash(content)
135-
136-
return {
137-
externalId: file.id,
138-
title: file.name || 'Untitled',
139-
content,
140-
mimeType: 'text/plain',
141-
sourceUrl: file.webViewLink || `https://docs.google.com/document/d/${file.id}/edit`,
142-
contentHash,
143-
metadata: {
144-
modifiedTime: file.modifiedTime,
145-
createdTime: file.createdTime,
146-
owners: file.owners?.map((o) => o.displayName || o.emailAddress).filter(Boolean),
147-
},
148-
}
149-
} catch (error) {
150-
logger.warn(`Failed to extract content from document: ${file.name} (${file.id})`, {
151-
error: error instanceof Error ? error.message : String(error),
152-
})
153-
return null
123+
function fileToStub(file: DriveFile): ExternalDocument {
124+
return {
125+
externalId: file.id,
126+
title: file.name || 'Untitled',
127+
content: '',
128+
contentDeferred: true,
129+
mimeType: 'text/plain',
130+
sourceUrl: file.webViewLink || `https://docs.google.com/document/d/${file.id}/edit`,
131+
contentHash: `gdocs:${file.id}:${file.modifiedTime ?? ''}`,
132+
metadata: {
133+
modifiedTime: file.modifiedTime,
134+
createdTime: file.createdTime,
135+
owners: file.owners?.map((o) => o.displayName || o.emailAddress).filter(Boolean),
136+
},
154137
}
155138
}
156139

@@ -246,18 +229,11 @@ export const googleDocsConnector: ConnectorConfig = {
246229
const maxDocs = sourceConfig.maxDocs ? Number(sourceConfig.maxDocs) : 0
247230
const previouslyFetched = (syncContext?.totalDocsFetched as number) ?? 0
248231

249-
const CONCURRENCY = 5
250-
const documents: ExternalDocument[] = []
251-
for (let i = 0; i < files.length; i += CONCURRENCY) {
252-
if (maxDocs > 0 && previouslyFetched + documents.length >= maxDocs) break
253-
const batch = files.slice(i, i + CONCURRENCY)
254-
const results = await Promise.all(batch.map((file) => fileToDocument(accessToken, file)))
255-
documents.push(...(results.filter(Boolean) as ExternalDocument[]))
256-
}
232+
let documents = files.map(fileToStub)
257233
if (maxDocs > 0) {
258234
const remaining = maxDocs - previouslyFetched
259235
if (documents.length > remaining) {
260-
documents.splice(remaining)
236+
documents = documents.slice(0, remaining)
261237
}
262238
}
263239

@@ -300,7 +276,17 @@ export const googleDocsConnector: ConnectorConfig = {
300276
if (file.trashed) return null
301277
if (file.mimeType !== 'application/vnd.google-apps.document') return null
302278

303-
return fileToDocument(accessToken, file)
279+
try {
280+
const content = await fetchDocContent(accessToken, file.id)
281+
if (!content.trim()) return null
282+
283+
return { ...fileToStub(file), content, contentDeferred: false }
284+
} catch (error) {
285+
logger.warn(`Failed to extract content from document: ${file.name} (${file.id})`, {
286+
error: error instanceof Error ? error.message : String(error),
287+
})
288+
return null
289+
}
304290
},
305291

306292
validateConfig: async (

apps/sim/connectors/google-sheets/google-sheets.ts

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { createLogger } from '@sim/logger'
22
import { GoogleSheetsIcon } from '@/components/icons'
33
import { fetchWithRetry, VALIDATE_RETRY_OPTIONS } from '@/lib/knowledge/documents/utils'
44
import type { ConnectorConfig, ExternalDocument, ExternalDocumentList } from '@/connectors/types'
5-
import { computeContentHash, parseTagDate } from '@/connectors/utils'
5+
import { parseTagDate } from '@/connectors/utils'
66

77
const logger = createLogger('GoogleSheetsConnector')
88

@@ -168,7 +168,6 @@ async function sheetToDocument(
168168
return null
169169
}
170170

171-
const contentHash = await computeContentHash(content)
172171
const rowCount = dataRows.length
173172

174173
return {
@@ -177,7 +176,7 @@ async function sheetToDocument(
177176
content,
178177
mimeType: 'text/plain',
179178
sourceUrl: `https://docs.google.com/spreadsheets/d/${spreadsheetId}/edit#gid=${sheet.sheetId}`,
180-
contentHash,
179+
contentHash: `gsheets:${spreadsheetId}:${sheet.sheetId}:${modifiedTime ?? ''}`,
181180
metadata: {
182181
spreadsheetId,
183182
spreadsheetTitle,
@@ -259,22 +258,24 @@ export const googleSheetsConnector: ConnectorConfig = {
259258
sheetCount: sheets.length,
260259
})
261260

262-
const documents: ExternalDocument[] = []
263-
for (let i = 0; i < sheets.length; i += CONCURRENCY) {
264-
const batch = sheets.slice(i, i + CONCURRENCY)
265-
const results = await Promise.all(
266-
batch.map((sheet) =>
267-
sheetToDocument(
268-
accessToken,
269-
spreadsheetId,
270-
metadata.properties.title,
271-
sheet,
272-
modifiedTime
273-
)
274-
)
275-
)
276-
documents.push(...(results.filter(Boolean) as ExternalDocument[]))
277-
}
261+
const documents: ExternalDocument[] = sheets.map((sheet) => ({
262+
externalId: `${spreadsheetId}__sheet__${sheet.sheetId}`,
263+
title: `${metadata.properties.title} - ${sheet.title}`,
264+
content: '',
265+
contentDeferred: true,
266+
mimeType: 'text/plain',
267+
sourceUrl: `https://docs.google.com/spreadsheets/d/${spreadsheetId}/edit#gid=${sheet.sheetId}`,
268+
contentHash: `gsheets:${spreadsheetId}:${sheet.sheetId}:${modifiedTime ?? ''}`,
269+
metadata: {
270+
spreadsheetId,
271+
spreadsheetTitle: metadata.properties.title,
272+
sheetTitle: sheet.title,
273+
sheetId: sheet.sheetId,
274+
rowCount: sheet.gridProperties?.rowCount,
275+
columnCount: sheet.gridProperties?.columnCount,
276+
...(modifiedTime ? { modifiedTime } : {}),
277+
},
278+
}))
278279

279280
return {
280281
documents,
@@ -324,13 +325,15 @@ export const googleSheetsConnector: ConnectorConfig = {
324325
return null
325326
}
326327

327-
return sheetToDocument(
328+
const doc = await sheetToDocument(
328329
accessToken,
329330
spreadsheetId,
330331
metadata.properties.title,
331332
sheetEntry.properties,
332333
modifiedTime
333334
)
335+
if (!doc) return null
336+
return { ...doc, contentDeferred: false }
334337
},
335338

336339
validateConfig: async (

0 commit comments

Comments
 (0)