Merge pull request #1066 from constructive-io/feat/data-file-embedding

pyramation · web-flow · commit a5ae7a6ec65f · 2026-05-07T15:17:39.000-07:00
feat: add DataFileEmbedding node type definition
diff --git a/packages/node-type-registry/src/data/data-file-embedding.ts b/packages/node-type-registry/src/data/data-file-embedding.ts
@@ -0,0 +1,198 @@
+import type { NodeTypeDefinition } from '../types';
+/** Registry entry for the 'data_file_embedding' node type; parameter_schema below describes its vector, MIME, job, extraction, chunking and stale-tracking options. */
+export const DataFileEmbedding: NodeTypeDefinition = {
+  name: 'DataFileEmbedding',
+  slug: 'data_file_embedding',
+  category: 'data',
+  display_name: 'File Embedding',
+  description:
+    'Generic, MIME-scoped embedding node for file tables. Supports two modes: ' +
+    'direct (whole-file to single vector, e.g. CLIP for images) when extraction ' +
+    'is omitted, or extract (file to text to chunks to per-chunk vectors) when ' +
+    'extraction config is provided. Composes SearchVector + DataJobTrigger ' +
+    'internally. Multiple instances can coexist on the same table with different ' +
+    'MIME scopes, field names, and embedding strategies.',
+  parameter_schema: {
+    type: 'object',
+    properties: {
+      // No `required` list is declared, so every property below is optional at the schema level.
+      // ── Vector config: embedding column, dimensions, index and metric (passed through to SearchVector) ──
+      field_name: {
+        type: 'string',
+        format: 'column-ref',
+        description: 'Name of the vector embedding column',
+        default: 'embedding'
+      },
+      dimensions: {
+        type: 'integer',
+        description: 'Vector dimensions (e.g. 512 for CLIP, 768 for nomic, 1536 for ada-002)',
+        default: 768
+      },
+      index_method: {
+        type: 'string',
+        enum: ['hnsw', 'ivfflat'],
+        description: 'Index type for similarity search',
+        default: 'hnsw'
+      },
+      metric: {
+        type: 'string',
+        enum: ['cosine', 'l2', 'ip'],
+        description: 'Distance metric',
+        default: 'cosine'
+      },
+      index_options: {
+        type: 'object',
+        description: 'Index-specific options. HNSW: {m, ef_construction}. IVFFlat: {lists}.',
+        default: {}
+      },
+
+      // ── MIME scoping: LIKE patterns selecting which files this instance applies to ──
+      mime_patterns: {
+        type: 'array',
+        items: { type: 'string' },
+        description:
+          'MIME type LIKE patterns to match. Multiple patterns are OR\'d together. ' +
+          'Examples: [\'image/%\'], [\'application/pdf\', \'text/%\'], [\'audio/%\'].',
+        default: ['image/%']
+      },
+
+      // ── Job routing: worker task, trigger events, job payload and extra trigger conditions ──
+      task_identifier: {
+        type: 'string',
+        description:
+          'Job task identifier for the worker. In direct mode this is the ' +
+          'embedding worker; in extract mode this is the extraction worker.',
+        default: 'process_file_embedding'
+      },
+      events: {
+        type: 'array',
+        items: { type: 'string', enum: ['INSERT', 'UPDATE'] },
+        description: 'Trigger events that fire the job',
+        default: ['INSERT']
+      },
+      payload_custom: {
+        type: 'object',
+        additionalProperties: { type: 'string', format: 'column-ref' },
+        description: 'Custom payload key-to-column mapping for the job trigger',
+        default: {
+          file_id: 'id',
+          key: 'key',
+          mime_type: 'mime_type',
+          bucket_id: 'bucket_id'
+        }
+      },
+      trigger_conditions: {
+        description:
+          'Additional compound conditions beyond MIME filtering. ' +
+          'Merged with the auto-generated MIME conditions via AND. ' +
+          'Use this to add status checks, field guards, etc.',
+        'x-codegen-type': 'TriggerCondition | TriggerCondition[]', // presumably a type hint for code generation — TODO confirm
+        oneOf: [ // a single condition, or an array of conditions
+          { $ref: '#/$defs/triggerCondition' }, // NOTE(review): no $defs is declared in this schema — presumably supplied elsewhere; confirm
+          { type: 'array', items: { $ref: '#/$defs/triggerCondition' } }
+        ]
+      },
+
+      // ── Extraction config (optional — present enables extract mode, absent means direct mode) ──
+      extraction: {
+        type: 'object',
+        description:
+          'Text extraction configuration. When present, the generator creates ' +
+          'extraction output fields on the table and configures SearchVector with ' +
+          'source_fields + stale tracking. When absent, the node operates in direct ' +
+          'mode (single vector per file, no text extraction).',
+        properties: {
+          text_field: {
+            type: 'string',
+            format: 'column-ref',
+            description: 'Field to store extracted text/markdown',
+            default: 'extracted_text'
+          },
+          metadata_field: {
+            type: 'string',
+            format: 'column-ref',
+            description: 'JSONB field for extraction metadata (page count, language, etc.)',
+            default: 'extracted_metadata'
+          },
+          status_field: {
+            type: 'string',
+            format: 'column-ref',
+            description: 'Extraction lifecycle status field',
+            default: 'extraction_status'
+          }
+        }
+      },
+
+      // ── Chunking config (optional — creates embedding_chunks; only meaningful alongside extraction) ──
+      chunks: {
+        type: 'object',
+        description:
+          'Chunking configuration. Creates an embedding_chunks record that drives ' +
+          'automatic text splitting and per-chunk embedding. Only meaningful when ' +
+          'extraction is also provided.',
+        properties: {
+          content_field_name: {
+            type: 'string',
+            format: 'column-ref',
+            description: 'Name of the text content column in the chunks table',
+            default: 'content'
+          },
+          chunk_size: {
+            type: 'integer',
+            description: 'Maximum number of characters per chunk',
+            default: 1000
+          },
+          chunk_overlap: {
+            type: 'integer',
+            description: 'Number of overlapping characters between consecutive chunks',
+            default: 200
+          },
+          chunk_strategy: {
+            type: 'string',
+            enum: ['fixed', 'sentence', 'paragraph', 'semantic'],
+            description: 'Strategy for splitting text into chunks',
+            default: 'paragraph'
+          },
+          metadata_fields: {
+            type: 'object',
+            description: 'Metadata fields from parent to copy into chunks'
+          },
+          enqueue_chunking_job: {
+            type: 'boolean',
+            description: 'Whether to auto-enqueue a chunking job on insert/update',
+            default: true
+          },
+          chunking_task_name: {
+            type: 'string',
+            description: 'Task identifier for the chunking job queue',
+            default: 'generate_chunks'
+          }
+        }
+      },
+
+      // ── Stale tracking: how embedding staleness is recorded (meaningful in extract mode) ──
+      stale_strategy: {
+        type: 'string',
+        enum: ['column', 'null', 'hash'],
+        description:
+          'Strategy for tracking embedding staleness when extraction is enabled. ' +
+          'column: embedding_stale boolean. null: set embedding to NULL. hash: md5 hash.',
+        default: 'column'
+      },
+      include_stale_field: {
+        type: 'boolean',
+        description: 'Whether to include the embedding_stale boolean field (extract mode)',
+        default: true
+      }
+    }
+  },
+  tags: [
+    'embedding',
+    'vector',
+    'ai',
+    'composition',
+    'jobs',
+    'multimodal',
+    'files'
+  ]
+};
diff --git a/packages/node-type-registry/src/data/index.ts b/packages/node-type-registry/src/data/index.ts
@@ -1,6 +1,7 @@
 export { DataCompositeField } from './data-composite-field';
 export { DataDirectOwner } from './data-direct-owner';
 export { DataEntityMembership } from './data-entity-membership';
+export { DataFileEmbedding } from './data-file-embedding';
 export { DataFeatureFlag } from './data-feature-flag';
 export { DataForceCurrentUser } from './data-force-current-user';
 export { DataId } from './data-id';