Skip to content

Commit a5ae7a6

Browse files
authored
Merge pull request #1066 from constructive-io/feat/data-file-embedding
feat: add DataFileEmbedding node type definition
2 parents 2659593 + 81b5321 commit a5ae7a6

2 files changed

Lines changed: 199 additions & 0 deletions

File tree

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
import type { NodeTypeDefinition } from '../types';
2+
3+
/**
 * Node type definition for `DataFileEmbedding`, a MIME-scoped embedding node
 * for file tables.
 *
 * Per the schema below, the node runs in one of two modes:
 * - **direct**: `extraction` is omitted; one vector is produced per file.
 * - **extract**: `extraction` is provided; the file is turned into text, and
 *   (with `chunks`) split into chunks that are embedded individually.
 *
 * The parameter schema is grouped into vector config, MIME scoping, job
 * routing, extraction, chunking, and stale tracking. No parameter is marked
 * required; every scalar option declares a default.
 */
export const DataFileEmbedding: NodeTypeDefinition = {
  name: 'DataFileEmbedding',
  slug: 'data_file_embedding',
  category: 'data',
  display_name: 'File Embedding',
  description:
    'Generic, MIME-scoped embedding node for file tables. Supports two modes: ' +
    'direct (whole-file to single vector, e.g. CLIP for images) when extraction ' +
    'is omitted, or extract (file to text to chunks to per-chunk vectors) when ' +
    'extraction config is provided. Composes SearchVector + DataJobTrigger ' +
    'internally. Multiple instances can coexist on the same table with different ' +
    'MIME scopes, field names, and embedding strategies.',
  parameter_schema: {
    type: 'object',
    properties: {

      // ── Vector config (passed through to SearchVector) ─────────────
      field_name: {
        type: 'string',
        format: 'column-ref',
        description: 'Name of the vector embedding column',
        default: 'embedding'
      },
      dimensions: {
        type: 'integer',
        description: 'Vector dimensions (e.g. 512 for CLIP, 768 for nomic, 1536 for ada-002)',
        default: 768
      },
      index_method: {
        type: 'string',
        enum: ['hnsw', 'ivfflat'],
        description: 'Index type for similarity search',
        default: 'hnsw'
      },
      metric: {
        type: 'string',
        enum: ['cosine', 'l2', 'ip'],
        description: 'Distance metric',
        default: 'cosine'
      },
      // Free-form object: the accepted keys depend on `index_method`.
      index_options: {
        type: 'object',
        description: 'Index-specific options. HNSW: {m, ef_construction}. IVFFlat: {lists}.',
        default: {}
      },

      // ── MIME scoping ───────────────────────────────────────────────
      // Patterns use SQL LIKE syntax (`%` wildcard); the default scopes the
      // node to images only.
      mime_patterns: {
        type: 'array',
        items: { type: 'string' },
        description:
          'MIME type LIKE patterns to match. Multiple patterns are OR\'d together. ' +
          'Examples: [\'image/%\'], [\'application/pdf\', \'text/%\'], [\'audio/%\'].',
        default: ['image/%']
      },

      // ── Job routing ────────────────────────────────────────────────
      task_identifier: {
        type: 'string',
        description:
          'Job task identifier for the worker. In direct mode this is the ' +
          'embedding worker; in extract mode this is the extraction worker.',
        default: 'process_file_embedding'
      },
      events: {
        type: 'array',
        items: { type: 'string', enum: ['INSERT', 'UPDATE'] },
        description: 'Trigger events that fire the job',
        default: ['INSERT']
      },
      // Maps payload keys (left) to source column names (right).
      payload_custom: {
        type: 'object',
        additionalProperties: { type: 'string', format: 'column-ref' },
        description: 'Custom payload key-to-column mapping for the job trigger',
        default: {
          file_id: 'id',
          key: 'key',
          mime_type: 'mime_type',
          bucket_id: 'bucket_id'
        }
      },
      // Accepts either a single condition or an array of conditions.
      // NOTE(review): '#/$defs/triggerCondition' is referenced here, but no
      // `$defs` section is declared in this schema; presumably the registry or
      // codegen supplies it; confirm before relying on standalone validation.
      trigger_conditions: {
        description:
          'Additional compound conditions beyond MIME filtering. ' +
          'Merged with the auto-generated MIME conditions via AND. ' +
          'Use this to add status checks, field guards, etc.',
        'x-codegen-type': 'TriggerCondition | TriggerCondition[]',
        oneOf: [
          { $ref: '#/$defs/triggerCondition' },
          { type: 'array', items: { $ref: '#/$defs/triggerCondition' } }
        ]
      },

      // ── Extraction config (optional — enables extract mode) ────────
      extraction: {
        type: 'object',
        description:
          'Text extraction configuration. When present, the generator creates ' +
          'extraction output fields on the table and configures SearchVector with ' +
          'source_fields + stale tracking. When absent, the node operates in direct ' +
          'mode (single vector per file, no text extraction).',
        properties: {
          text_field: {
            type: 'string',
            format: 'column-ref',
            description: 'Field to store extracted text/markdown',
            default: 'extracted_text'
          },
          metadata_field: {
            type: 'string',
            format: 'column-ref',
            description: 'JSONB field for extraction metadata (page count, language, etc.)',
            default: 'extracted_metadata'
          },
          status_field: {
            type: 'string',
            format: 'column-ref',
            description: 'Extraction lifecycle status field',
            default: 'extraction_status'
          }
        }
      },

      // ── Chunking config (optional — creates embedding_chunks) ──────
      chunks: {
        type: 'object',
        description:
          'Chunking configuration. Creates an embedding_chunks record that drives ' +
          'automatic text splitting and per-chunk embedding. Only meaningful when ' +
          'extraction is also provided.',
        properties: {
          content_field_name: {
            type: 'string',
            format: 'column-ref',
            description: 'Name of the text content column in the chunks table',
            default: 'content'
          },
          // Size and overlap are both measured in characters.
          chunk_size: {
            type: 'integer',
            description: 'Maximum number of characters per chunk',
            default: 1000
          },
          chunk_overlap: {
            type: 'integer',
            description: 'Number of overlapping characters between consecutive chunks',
            default: 200
          },
          chunk_strategy: {
            type: 'string',
            enum: ['fixed', 'sentence', 'paragraph', 'semantic'],
            description: 'Strategy for splitting text into chunks',
            default: 'paragraph'
          },
          // No default is declared for this field.
          metadata_fields: {
            type: 'object',
            description: 'Metadata fields from parent to copy into chunks'
          },
          enqueue_chunking_job: {
            type: 'boolean',
            description: 'Whether to auto-enqueue a chunking job on insert/update',
            default: true
          },
          chunking_task_name: {
            type: 'string',
            description: 'Task identifier for the chunking job queue',
            default: 'generate_chunks'
          }
        }
      },

      // ── Stale tracking (meaningful in extract mode) ────────────────
      stale_strategy: {
        type: 'string',
        enum: ['column', 'null', 'hash'],
        description:
          'Strategy for tracking embedding staleness when extraction is enabled. ' +
          'column: embedding_stale boolean. null: set embedding to NULL. hash: md5 hash.',
        default: 'column'
      },
      include_stale_field: {
        type: 'boolean',
        description: 'Whether to include the embedding_stale boolean field (extract mode)',
        default: true
      }
    }
  },
  tags: [
    'embedding',
    'vector',
    'ai',
    'composition',
    'jobs',
    'multimodal',
    'files'
  ]
};

packages/node-type-registry/src/data/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Barrel re-exports for the data node type definitions visible in this hunk.
// Entries are kept in alphabetical order by module name, so DataFileEmbedding
// sits after DataFeatureFlag rather than before it.
export { DataCompositeField } from './data-composite-field';
export { DataDirectOwner } from './data-direct-owner';
export { DataEntityMembership } from './data-entity-membership';
export { DataFeatureFlag } from './data-feature-flag';
export { DataFileEmbedding } from './data-file-embedding';
export { DataForceCurrentUser } from './data-force-current-user';
export { DataId } from './data-id';

0 commit comments

Comments
 (0)