// Source: framework/packages/spec/src/automation/etl.zod.ts (objectstack-ai/framework, branch main)
// Copyright (c) 2025 ObjectStack. Licensed under the Apache-2.0 license.

import { z } from 'zod';
import { CronExpressionInputSchema } from '../shared/expression.zod';

/**
 * ETL (Extract, Transform, Load) Pipeline Protocol - LEVEL 2: Data Engineering
 *
 * Inspired by modern data integration platforms like Airbyte, Fivetran, and Apache NiFi.
 *
 * **Positioning in 3-Layer Architecture:**
 * - **L1: Simple Sync** (automation/sync.zod.ts) - Business users - Sync Salesforce to Sheets
 * - **L2: ETL Pipeline** (THIS FILE) - Data engineers - Aggregate 10 sources to warehouse
 * - **L3: Enterprise Connector** (integration/connector.zod.ts) - System integrators - Full SAP integration
 *
 * ETL pipelines enable automated data synchronization between systems, transforming
 * data as it moves from source to destination.
 *
 * **SCOPE: Advanced multi-source, multi-stage transformations.**
 * Supports complex operations: joins, aggregations, filtering, custom SQL.
 *
 * ## When to Use This Layer
 *
 * **Use ETL Pipeline when:**
 * - Combining data from multiple sources
 * - Need aggregations, joins, transformations
 * - Building data warehouses or analytics platforms
 * - Complex data transformations required
 *
 * **Examples:**
 * - Sales data from Salesforce + Marketing from HubSpot → Data Warehouse
 * - Multi-region databases → Consolidated reporting
 * - Legacy system migration with transformation
 *
 * **When to downgrade:**
 * - Simple 1:1 sync → Use {@link file://./sync.zod.ts | Simple Sync}
 *
 * **When to upgrade:**
 * - Need full connector lifecycle (auth, webhooks, rate limits) → Use {@link file://../integration/connector.zod.ts | Enterprise Connector}
 *
 * @see {@link file://./sync.zod.ts} for Level 1 (simple sync)
 * @see {@link file://../integration/connector.zod.ts} for Level 3 (enterprise integration)
 *
 * ## Use Cases
 *
 * 1. **Data Warehouse Population**
 *    - Extract from multiple operational systems
 *    - Transform to analytical schema
 *    - Load into data warehouse
 *
 * 2. **System Integration**
 *    - Sync data between CRM and Marketing Automation
 *    - Keep product catalog synchronized across e-commerce platforms
 *    - Replicate data for backup/disaster recovery
 *
 * 3. **Data Migration**
 *    - Move data from legacy systems to modern platforms
 *    - Consolidate data from multiple sources
 *    - Split monolithic databases into microservices
 *
 * @see https://airbyte.com/
 * @see https://docs.fivetran.com/
 * @see https://nifi.apache.org/
 *
 * @example
 * ```typescript
 * const salesforceToDB: ETLPipeline = {
 *   name: 'salesforce_to_postgres',
 *   label: 'Salesforce Accounts to PostgreSQL',
 *   source: {
 *     type: 'api',
 *     connector: 'salesforce',
 *     config: { object: 'Account' }
 *   },
 *   destination: {
 *     type: 'database',
 *     connector: 'postgres',
 *     config: { table: 'accounts' }
 *   },
 *   transformations: [
 *     { type: 'map', config: { 'Name': 'account_name' } }
 *   ],
 *   schedule: '0 2 * * *' // Daily at 2 AM
 * }
 * ```
 */

import { lazySchema } from '../shared/lazy-schema';

/**
 * ETL Source/Destination Type
 *
 * Enumerates the kinds of system a pipeline endpoint can be. The same schema
 * validates the `type` field of both the source and the destination
 * configuration in this file.
 *
 * NOTE: this doc comment is placed directly above the declaration (after the
 * import) so that editors and doc generators associate it with the schema
 * rather than with the import statement.
 */
export const ETLEndpointTypeSchema = lazySchema(() => z.enum([
  'database',    // SQL/NoSQL databases
  'api',         // REST/GraphQL APIs
  'file',        // CSV, JSON, XML, Excel files
  'stream',      // Kafka, RabbitMQ, Kinesis
  'object',      // ObjectStack object
  'warehouse',   // Data warehouse (Snowflake, BigQuery, Redshift)
  'storage',     // S3, Azure Blob, Google Cloud Storage
  'spreadsheet', // Google Sheets, Excel Online
]));

/** String-literal union of the endpoint kinds accepted by {@link ETLEndpointTypeSchema}. */
export type ETLEndpointType = z.infer<typeof ETLEndpointTypeSchema>;

/**
 * ETL Source Configuration
 *
 * Shape of the "extract" side of a pipeline: which kind of endpoint to read
 * from, an optional connector reference, a free-form config record, and
 * optional incremental-extraction settings.
 */
export const ETLSourceSchema = lazySchema(() => {
  // Incremental extraction settings. `enabled` defaults to false, but
  // `cursorField` is required whenever this object is supplied at all.
  const incrementalSettings = z.object({
    enabled: z.boolean().default(false),
    cursorField: z.string().describe('Field to track progress (e.g., updated_at)'),
    cursorValue: z.unknown().optional().describe('Last processed value'),
  });

  return z.object({
    /** Kind of endpoint being read from. */
    type: ETLEndpointTypeSchema.describe('Source type'),

    /**
     * Identifier of a registered connector (optional).
     *
     * @example "salesforce", "postgres", "mysql", "s3"
     */
    connector: z.string().optional().describe('Connector ID'),

    /**
     * Free-form, source-specific settings; values are not validated here.
     *
     * @example For database: { table: 'customers', schema: 'public' }
     * @example For API: { endpoint: '/api/users', method: 'GET' }
     * @example For file: { path: 's3://bucket/data.csv', format: 'csv' }
     */
    config: z.record(z.string(), z.unknown()).describe('Source configuration'),

    /** Optional settings for extracting only changed data. */
    incremental: incrementalSettings.optional().describe('Incremental extraction config'),
  });
});

/** Type inferred from {@link ETLSourceSchema}. */
export type ETLSource = z.infer<typeof ETLSourceSchema>;

/**
 * ETL Destination Configuration
 *
 * Shape of the "load" side of a pipeline: endpoint kind, optional connector,
 * free-form config, how records are written, and optional key fields.
 */
export const ETLDestinationSchema = lazySchema(() => {
  // Supported write strategies; 'append' is applied when the field is omitted.
  const writeModeOptions = z.enum([
    'append',      // Add new records
    'overwrite',   // Replace all data
    'upsert',      // Insert or update based on key
    'merge',       // Smart merge based on business rules
  ]);

  return z.object({
    /** Kind of endpoint being written to. */
    type: ETLEndpointTypeSchema.describe('Destination type'),

    /** Identifier of a connector (optional). */
    connector: z.string().optional().describe('Connector ID'),

    /** Free-form, destination-specific settings; values are not validated here. */
    config: z.record(z.string(), z.unknown()).describe('Destination configuration'),

    /** Write strategy, defaulting to 'append'. */
    writeMode: writeModeOptions.default('append').describe('How to write data'),

    /** Key fields intended for the upsert/merge modes (optional, not cross-validated against writeMode). */
    primaryKey: z.array(z.string()).optional().describe('Primary key fields'),
  });
});

/** Type inferred from {@link ETLDestinationSchema}. */
export type ETLDestination = z.infer<typeof ETLDestinationSchema>;

/**
 * ETL Transformation Type
 *
 * Enumerates the kinds of transformation step a pipeline may contain.
 */
export const ETLTransformationTypeSchema = lazySchema(() => {
  // Declared as a readonly tuple so the enum keeps its literal member types.
  const transformationKinds = [
    'map',         // Field mapping/renaming
    'filter',      // Row filtering
    'aggregate',   // Aggregation/grouping
    'join',        // Joining with other data
    'script',      // Custom JavaScript/Python script
    'lookup',      // Enrich with lookup data
    'split',       // Split one record into multiple
    'merge',       // Merge multiple records into one
    'normalize',   // Data normalization
    'deduplicate', // Remove duplicates
  ] as const;

  return z.enum(transformationKinds);
});

/** String-literal union of the values accepted by {@link ETLTransformationTypeSchema}. */
export type ETLTransformationType = z.infer<typeof ETLTransformationTypeSchema>;

/**
 * ETL Transformation Configuration
 *
 * One step in a pipeline's transformation list: an optional name, the step
 * kind, a free-form config record, and an error-handling flag.
 */
export const ETLTransformationSchema = lazySchema(() => {
  // Step settings are an open string-keyed record; values are not validated here.
  const stepSettings = z.record(z.string(), z.unknown());

  return z.object({
    /** Optional name for the step. */
    name: z.string().optional().describe('Transformation name'),

    /** Which kind of transformation this step performs. */
    type: ETLTransformationTypeSchema.describe('Transformation type'),

    /**
     * Settings specific to the chosen transformation kind.
     *
     * @example For map: { oldField: 'newField' }
     * @example For filter: { condition: 'status == "active"' }
     * @example For script: { language: 'javascript', code: '...' }
     */
    config: stepSettings.describe('Transformation config'),

    /** Whether to continue on error; defaults to false. */
    continueOnError: z.boolean().default(false).describe('Continue on error'),
  });
});

/** Type inferred from {@link ETLTransformationSchema}. */
export type ETLTransformation = z.infer<typeof ETLTransformationSchema>;

/**
 * ETL Sync Mode
 *
 * Enumerates how much data a pipeline run extracts.
 */
export const ETLSyncModeSchema = lazySchema(() => {
  // Declared as a readonly tuple so the enum keeps its literal member types.
  const syncModes = [
    'full',        // Full refresh - extract all data every time
    'incremental', // Only extract changed data
    'cdc',         // Change Data Capture - real-time streaming
  ] as const;

  return z.enum(syncModes);
});

/** String-literal union of the values accepted by {@link ETLSyncModeSchema}. */
export type ETLSyncMode = z.infer<typeof ETLSyncModeSchema>;

/**
 * ETL Pipeline Schema
 *
 * Top-level pipeline definition: identity, one source, one destination, an
 * optional ordered list of transformations, and operational settings
 * (sync mode, schedule, enablement, retry, notifications, tags, metadata).
 */
export const ETLPipelineSchema = lazySchema(() => {
  // Lowercase letters, digits and underscores only; the first character may not be a digit.
  const identifierPattern = /^[a-z_][a-z0-9_]*$/;

  // Retry policy: both values are non-negative integers with defaults of 3 and 60000.
  const retryPolicy = z.object({
    maxAttempts: z.number().int().min(0).default(3).describe('Max retry attempts'),
    backoffMs: z.number().int().min(0).default(60000).describe('Backoff in milliseconds'),
  });

  // Notification recipients: plain string lists, not validated as email addresses.
  const notificationSettings = z.object({
    onSuccess: z.array(z.string()).optional().describe('Email addresses for success notifications'),
    onFailure: z.array(z.string()).optional().describe('Email addresses for failure notifications'),
  });

  return z.object({
    /** Pipeline identifier (snake_case). */
    name: z.string()
      .regex(identifierPattern)
      .describe('Pipeline identifier (snake_case)'),

    /** Human-readable display name (optional). */
    label: z.string().optional().describe('Pipeline display name'),

    /** Free-text description (optional). */
    description: z.string().optional().describe('Pipeline description'),

    /** Where data is extracted from. */
    source: ETLSourceSchema.describe('Data source'),

    /** Where data is loaded to. */
    destination: ETLDestinationSchema.describe('Data destination'),

    /** Transformation steps, applied in order from source to destination (optional). */
    transformations: z.array(ETLTransformationSchema)
      .optional()
      .describe('Transformation pipeline'),

    /** Extraction strategy; defaults to 'full'. */
    syncMode: ETLSyncModeSchema.default('full').describe('Sync mode'),

    /**
     * Execution schedule (cron expression), optional.
     *
     * @example "0 2 * * *" - Daily at 2 AM
     * @example "0 *\/4 * * *" - Every 4 hours
     * @example "0 0 * * 0" - Weekly on Sunday
     */
    schedule: CronExpressionInputSchema.optional().describe('Cron schedule expression'),

    /** Whether the pipeline is enabled; defaults to true. */
    enabled: z.boolean().default(true).describe('Pipeline enabled status'),

    /** Retry settings for failed runs (optional). */
    retry: retryPolicy.optional().describe('Retry configuration'),

    /** Who to notify on success or failure (optional). */
    notifications: notificationSettings.optional().describe('Notification settings'),

    /** Tags for organizing pipelines (optional). */
    tags: z.array(z.string()).optional().describe('Pipeline tags'),

    /** Arbitrary string-keyed custom metadata (optional). */
    metadata: z.record(z.string(), z.unknown()).optional().describe('Custom metadata'),
  });
});

/** Type inferred from {@link ETLPipelineSchema}. */
export type ETLPipeline = z.infer<typeof ETLPipelineSchema>;

/**
 * ETL Run Status
 *
 * Enumerates the states a pipeline run can be reported in.
 */
export const ETLRunStatusSchema = lazySchema(() => {
  // Declared as a readonly tuple so the enum keeps its literal member types.
  const runStates = [
    'pending',    // Queued for execution
    'running',    // Currently executing
    'succeeded',  // Completed successfully
    'failed',     // Failed with errors
    'cancelled',  // Manually cancelled
    'timeout',    // Timed out
  ] as const;

  return z.enum(runStates);
});

/** String-literal union of the values accepted by {@link ETLRunStatusSchema}. */
export type ETLRunStatus = z.infer<typeof ETLRunStatusSchema>;

/**
 * ETL Pipeline Run Result
 *
 * Record of a single pipeline execution: identity, status, timing, optional
 * counters, optional error information, and optional log lines.
 */
export const ETLPipelineRunSchema = lazySchema(() => {
  // Integer counters; each defaults to 0 when omitted from a supplied stats object.
  const runCounters = z.object({
    recordsRead: z.number().int().default(0).describe('Records extracted'),
    recordsWritten: z.number().int().default(0).describe('Records loaded'),
    recordsErrored: z.number().int().default(0).describe('Records with errors'),
    bytesProcessed: z.number().int().default(0).describe('Bytes processed'),
  });

  // Error information: only `message` is required.
  const runError = z.object({
    message: z.string().describe('Error message'),
    code: z.string().optional().describe('Error code'),
    details: z.unknown().optional().describe('Error details'),
  });

  return z.object({
    /** Run identifier. */
    id: z.string().describe('Run identifier'),

    /** Name of the pipeline this run belongs to. */
    pipelineName: z.string().describe('Pipeline name'),

    /** Run status. */
    status: ETLRunStatusSchema.describe('Run status'),

    /** Start timestamp, validated as a datetime string. */
    startedAt: z.string().datetime().describe('Start time'),

    /** End timestamp, validated as a datetime string (optional). */
    completedAt: z.string().datetime().optional().describe('Completion time'),

    /** Duration in milliseconds (optional). */
    durationMs: z.number().optional().describe('Duration in ms'),

    /** Run statistics (optional). */
    stats: runCounters.optional().describe('Run statistics'),

    /** Error information (optional). */
    error: runError.optional().describe('Error information'),

    /** Execution log lines (optional). */
    logs: z.array(z.string()).optional().describe('Execution logs'),
  });
});

/** Type inferred from {@link ETLPipelineRunSchema}. */
export type ETLPipelineRun = z.infer<typeof ETLPipelineRunSchema>;

/**
 * Helper factory for creating ETL pipelines.
 *
 * Each helper returns a plain pipeline object literal; nothing is parsed or
 * validated here. A string `schedule` is wrapped as
 * `{ dialect: 'cron', source: <string> }`; any other value (including
 * `undefined`) is passed through unchanged.
 */
export const ETL = {
  /**
   * Create a simple database-to-database pipeline.
   *
   * The result reads from `sourceTable`, writes to `destTable` with the
   * 'upsert' write mode, and uses the 'incremental' sync mode.
   *
   * @param params - Pipeline name, source/destination table names, and an optional schedule.
   * @returns The assembled pipeline definition, marked enabled.
   */
  databaseSync: (params: {
    name: string;
    sourceTable: string;
    destTable: string;
    schedule?: import("../shared/expression.zod").CronExpressionInput;
  }): ETLPipeline => {
    const { name, sourceTable, destTable, schedule } = params;

    return {
      name,
      source: {
        type: 'database',
        config: { table: sourceTable },
      },
      destination: {
        type: 'database',
        config: { table: destTable },
        writeMode: 'upsert',
      },
      syncMode: 'incremental',
      // Bare cron strings are wrapped; other values pass through as given.
      schedule: typeof schedule === 'string' ? { dialect: 'cron', source: schedule } : schedule,
      enabled: true,
    };
  },

  /**
   * Create an API to database pipeline.
   *
   * The result reads through the `apiConnector` connector with an empty
   * source config, writes to `destTable` with the 'append' write mode, and
   * uses the 'full' sync mode.
   *
   * @param params - Pipeline name, API connector ID, destination table name, and an optional schedule.
   * @returns The assembled pipeline definition, marked enabled.
   */
  apiToDatabase: (params: {
    name: string;
    apiConnector: string;
    destTable: string;
    schedule?: import("../shared/expression.zod").CronExpressionInput;
  }): ETLPipeline => {
    const { name, apiConnector, destTable, schedule } = params;

    return {
      name,
      source: {
        type: 'api',
        connector: apiConnector,
        config: {},
      },
      destination: {
        type: 'database',
        config: { table: destTable },
        writeMode: 'append',
      },
      syncMode: 'full',
      // Bare cron strings are wrapped; other values pass through as given.
      schedule: typeof schedule === 'string' ? { dialect: 'cron', source: schedule } : schedule,
      enabled: true,
    };
  },
} as const;