Skip to content

Commit 4247a3f

Browse files
authored
ENG-1822 Modify upsert_concepts so it accepts embedded content (#1097)
1 parent 33a3224 commit 4247a3f

10 files changed

Lines changed: 657 additions & 199 deletions

File tree

apps/roam/src/utils/syncDgNodesToSupabase.ts

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,16 @@ import { fetchEmbeddingsForNodes } from "./upsertNodesAsContentWithEmbeddings";
2020
import { convertRoamNodeToLocalContent } from "./upsertNodesAsContentWithEmbeddings";
2121
import type { DGSupabaseClient } from "@repo/database/lib/client";
2222
import { intersection } from "@repo/utils/setOperations";
23-
import type { Json, CompositeTypes, Enums } from "@repo/database/dbTypes";
23+
import type { Json, Enums } from "@repo/database/dbTypes";
2424
import { render as renderToast } from "roamjs-components/components/Toast";
2525
import internalError from "~/utils/internalError";
26-
type LocalContentDataInput = Partial<CompositeTypes<"content_local_input">>;
27-
type AccountLocalInput = CompositeTypes<"account_local_input">;
2826
import { FatalError } from "@repo/database/lib/contextFunctions";
2927
import { getAllPages } from "@repo/database/lib/pagination";
28+
import type {
29+
LocalConceptDataInput,
30+
LocalContentDataInput,
31+
LocalAccountDataInput,
32+
} from "@repo/database/inputTypes";
3033

3134
const SYNC_FUNCTION = "embedding";
3235
// Minimal interval between syncs of all clients for this task.
@@ -43,8 +46,6 @@ type SyncTaskInfo = {
4346
shouldProceed: boolean;
4447
};
4548

46-
type LocalConceptDataInput = Partial<CompositeTypes<"concept_local_input">>;
47-
4849
const chunk = <T>(array: T[], size: number): T[][] => {
4950
const chunks: T[][] = [];
5051
for (let i = 0; i < array.length; i += size) {
@@ -389,7 +390,7 @@ export const upsertNodesToSupabaseAsContentWithEmbeddings = async (
389390
await uploadBatches(chunk(nodesWithEmbeddings, BATCH_SIZE));
390391
};
391392

392-
const getAllUsers = async (): Promise<AccountLocalInput[]> => {
393+
const getAllUsers = async (): Promise<LocalAccountDataInput[]> => {
393394
const query = `[:find ?author_local_id ?author_name
394395
:keys author_local_id name
395396
:where
@@ -413,7 +414,7 @@ const getAllUsers = async (): Promise<AccountLocalInput[]> => {
413414
};
414415

415416
const upsertUsers = async (
416-
users: AccountLocalInput[],
417+
users: LocalAccountDataInput[],
417418
supabaseClient: DGSupabaseClient,
418419
context: SupabaseContext,
419420
) => {

apps/roam/src/utils/upsertNodesAsContentWithEmbeddings.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@ import { type RoamDiscourseNodeData } from "./getAllDiscourseNodesSince";
22
import { type SupabaseContext } from "./supabaseContext";
33
import { nextApiRoot } from "@repo/utils/execContext";
44
import type { DGSupabaseClient } from "@repo/database/lib/client";
5-
import type { Json, CompositeTypes } from "@repo/database/dbTypes";
6-
7-
type LocalContentDataInput = Partial<CompositeTypes<"content_local_input">>;
5+
import type { Json } from "@repo/database/dbTypes";
6+
import type { LocalContentDataInput } from "@repo/database/inputTypes";
87

98
const EMBEDDING_BATCH_SIZE = 200;
109
const EMBEDDING_MODEL = "openai_text_embedding_3_small_1536";

packages/database/doc/upsert_content.md

Lines changed: 0 additions & 69 deletions
This file was deleted.
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
/*
2+
A few examples of using upsert_accounts_in_space / upsert_documents / upsert_content / upsert_concepts.
3+
4+
In general, for external references such as `author_id`, you can either:
5+
6+
1. Use the database id as is: `author_id`
7+
2. Use the platform id, with the field name transformed, such as `author_local_id`
8+
3. Put the data inline as a subobject, with the field name transformed, such as `author_inline`
9+
10+
Note that embeddings are always inline.
11+
12+
Here are complete examples:
13+
*/
14+
15+
import type {
16+
LocalAccountDataInput,
17+
LocalDocumentDataInput,
18+
LocalContentDataInput,
19+
LocalConceptDataInput,
20+
} from "@repo/database/inputTypes";
21+
import type { Json } from "@repo/database/dbTypes";
22+
import type { DGSupabaseClient } from "@repo/database/lib/client";
23+
24+
export const demoCode = async (client: DGSupabaseClient) => {
25+
const accounts: LocalAccountDataInput[] = [
26+
{
27+
account_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
28+
name: "maparent",
29+
},
30+
];
31+
32+
const docs: LocalDocumentDataInput[] = [
33+
{
34+
source_local_id: "page1_uid",
35+
created: "2000/01/01",
36+
last_modified: "2001/01/02",
37+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
38+
},
39+
];
40+
41+
const contents: LocalContentDataInput[] = [
42+
{
43+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
44+
author_inline: {
45+
account_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
46+
name: "maparent",
47+
},
48+
document_inline: {
49+
source_local_id: "page1_uid",
50+
created: "2000/01/01",
51+
last_modified: "2001/01/02",
52+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
53+
},
54+
source_local_id: "a_roam_uid",
55+
scale: "document",
56+
created: "2000/01/01",
57+
last_modified: "2001/01/02",
58+
text: "[[CLM]] a claim",
59+
},
60+
{
61+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
62+
document_local_id: "page1_uid",
63+
source_local_id: "a_roam_uid2",
64+
scale: "section",
65+
created: "2000/01/02",
66+
last_modified: "2001/01/03",
67+
part_of_local_id: "a_roam_uid",
68+
text: "Some subsection",
69+
},
70+
{
71+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
72+
document_inline: docs[0],
73+
source_local_id: "a_roam_uid3",
74+
scale: "paragraph",
75+
created: "2000/01/02",
76+
last_modified: "2001/01/03",
77+
part_of_local_id: "a_roam_uid2",
78+
text: "Some paragraph",
79+
embedding_inline: {
80+
model: "openai_text_embedding_3_small_1536",
81+
vector: [0], // assume that the vector has the requisite length
82+
},
83+
},
84+
];
85+
86+
const concepts: LocalConceptDataInput[] = [
87+
{
88+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
89+
source_local_id: "a_roam_uid3",
90+
created: "2000/01/01",
91+
last_modified: "2001/01/02",
92+
name: "Some subsubtext",
93+
schema_represented_by_local_id: "known_claim_schema_local_id",
94+
},
95+
];
96+
97+
// Base scenario: insert references in order of dependencies
98+
// 1. upsert accounts.
99+
{
100+
const { data, error } = await client.rpc("upsert_accounts_in_space", {
101+
space_id_: 12,
102+
accounts,
103+
});
104+
if (error) console.error(error);
105+
console.log(data);
106+
}
107+
108+
// 2. upsert documents.
109+
{
110+
const { data, error } = await client.rpc("upsert_documents", {
111+
v_space_id: 12,
112+
data: docs as Json,
113+
});
114+
if (error) console.error(error);
115+
console.log(data);
116+
}
117+
118+
// 3. content
119+
{
120+
const { data, error } = await client.rpc("upsert_content", {
121+
v_space_id: 12,
122+
data: contents as Json,
123+
v_creator_id: 63,
124+
});
125+
if (error) console.error(error);
126+
console.log(data);
127+
}
128+
129+
// 4. upsert concept
130+
{
131+
const { data, error } = await client.rpc("upsert_concepts", {
132+
v_space_id: 12,
133+
data: concepts as Json,
134+
});
135+
if (error) console.error(error);
136+
console.log(data);
137+
}
138+
139+
// Variant: if all content is known to also be a document, you can upsert both as if it were a single object
140+
const pageContents: LocalContentDataInput[] = [
141+
{
142+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
143+
source_local_id: "a_page_uid2",
144+
created: "2000/01/02",
145+
last_modified: "2001/01/03",
146+
text: "Some other page",
147+
},
148+
];
149+
150+
{
151+
const { data, error } = await client.rpc("upsert_content", {
152+
v_space_id: 12,
153+
data: pageContents as Json,
154+
v_creator_id: 63,
155+
content_as_document: true,
156+
});
157+
if (error) console.error(error);
158+
console.log(data);
159+
}
160+
161+
// Nesting: you can nest information. Nested types can inherit information from the type above.
162+
// Eg 1: upserting content with nested documents and authors
163+
const contentWithNestedDoc: LocalContentDataInput[] = [
164+
{
165+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
166+
source_local_id: "a_page_uid3",
167+
created: "2000/01/02",
168+
scale: "document",
169+
last_modified: "2001/01/03",
170+
text: "Yet another page",
171+
author_inline: {
172+
account_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
173+
name: "maparent",
174+
},
175+
document_inline: {
176+
source_local_id: "a_page_uid3",
177+
// note that created, last_modified and author_inline will be inherited
178+
},
179+
},
180+
];
181+
{
182+
const { data, error } = await client.rpc("upsert_content", {
183+
v_space_id: 12,
184+
data: contentWithNestedDoc as Json,
185+
v_creator_id: 63,
186+
});
187+
if (error) console.error(error);
188+
console.log(data);
189+
}
190+
191+
// Eg 2: nesting multiple contents in a concept
192+
const conceptsWithNestedContent: LocalConceptDataInput[] = [
193+
{
194+
author_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
195+
source_local_id: "a_roam_uid4",
196+
created: "2000/01/01",
197+
last_modified: "2001/01/02",
198+
name: "[[CLM]] another claim",
199+
schema_represented_by_local_id: "known_claim_schema_local_id",
200+
author_inline: {
201+
account_local_id: "sR22zZ470dNPkIf9PpjQXXdTBjG2",
202+
name: "maparent",
203+
},
204+
document_inline: {
205+
source_local_id: "a_page_uid3",
206+
// note that created, last_modified and author_inline will be inherited
207+
},
208+
contents_inline: [
209+
{
210+
text: "The claim page contents",
211+
variant: "full",
212+
// here, source_local_id, document_inline, author_inline, created, last_modified are inherited
213+
},
214+
{
215+
text: "[[CLM]] another claim",
216+
variant: "direct",
217+
embedding_inline: {
218+
model: "openai_text_embedding_3_small_1536",
219+
vector: [0], // assume that the vector has the requisite length
220+
// again source_local_id etc. are inherited
221+
},
222+
// same variables are inherited
223+
},
224+
],
225+
},
226+
];
227+
228+
{
229+
const { data, error } = await client.rpc("upsert_concepts", {
230+
v_space_id: 12,
231+
data: conceptsWithNestedContent as Json,
232+
});
233+
if (error) console.error(error);
234+
console.log(data);
235+
}
236+
};

0 commit comments

Comments
 (0)