Skip to content

Commit 2e99530

Browse files
authored
Merge pull request #37 from constructive-io/feat/documents-loader
feat: add @agentic-db/documents-loader package and CLI docs command
2 parents e92daa9 + 2376a08 commit 2e99530

23 files changed

Lines changed: 7856 additions & 9117 deletions

.github/workflows/integration-test.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,33 @@ jobs:
421421
cd packages/agentic-db
422422
pnpm test -- --forceExit --detectOpenHandles __tests__/rag.test.ts __tests__/rag-unified-search.test.ts __tests__/cli-search-integration.test.ts
423423
424+
# CI job for the documents-loader package: checks out the repo, sets up pnpm 10.22.0
# and Node.js 22 (with the pnpm cache), installs with a frozen lockfile, then runs
# `pnpm test` inside packages/documents-loader. The job is capped at 10 minutes.
documents-loader-tests:
425+
runs-on: ubuntu-latest
426+
timeout-minutes: 10
427+
428+
steps:
429+
- name: Checkout
430+
uses: actions/checkout@v4
431+
432+
- name: Setup pnpm
433+
uses: pnpm/action-setup@v2
434+
with:
435+
version: 10.22.0
436+
437+
- name: Setup Node.js
438+
uses: actions/setup-node@v4
439+
with:
440+
node-version: '22'
441+
cache: 'pnpm'
442+
443+
- name: Install dependencies
444+
run: pnpm install --frozen-lockfile
445+
446+
- name: Run documents-loader tests
447+
run: |
448+
cd packages/documents-loader
449+
pnpm test
450+
424451
cli-e2e-tests:
425452
runs-on: ubuntu-latest
426453
timeout-minutes: 25
@@ -487,6 +514,9 @@ jobs:
487514
- name: Build SDK
488515
run: pnpm --filter @agentic-db/sdk run build
489516

517+
- name: Build documents-loader
518+
run: pnpm --filter @agentic-db/documents-loader run build
519+
490520
- name: Wait for Ollama and pull model
491521
run: |
492522
echo "Waiting for Ollama to be ready..."
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# @agentic-db/documents-loader
2+
3+
Load, import, and export text-based files (markdown, MDX, plain text, etc.) into the agentic-db `documents` table.
4+
5+
## Features
6+
7+
- **Import** a directory of markdown/text files into the documents table
8+
- **Export** documents back to disk as files, preserving directory structure
9+
- **Bidirectional sync** between Git repositories and the database
10+
- **Frontmatter parsing** for `.md` and `.mdx` files (title, tags, metadata)
11+
- **Last-write-wins** conflict resolution for seamless workflows
12+
- Supports `.md`, `.mdx`, `.txt`, `.rst`, `.html`, `.xml`, `.json`, `.yaml`, `.yml`, `.csv`, `.tsv`
13+
14+
## Usage
15+
16+
### As a library
17+
18+
```typescript
19+
import {
20+
importDirectory,
21+
exportDocuments,
22+
createDocumentClient,
23+
} from '@agentic-db/documents-loader';
24+
import { createClient } from '@agentic-db/sdk';
25+
26+
const sdk = createClient({ endpoint: '...', headers: { ... } });
27+
const client = createDocumentClient(sdk);
28+
29+
// Import files from a directory
30+
const importStats = await importDirectory('./my-docs', client, {
31+
repoName: 'my-repo',
32+
tags: ['docs'],
33+
commitHash: 'abc123',
34+
});
35+
36+
// Export documents back to disk
37+
const exportStats = await exportDocuments('./output', client, {
38+
repoName: 'my-repo',
39+
includeFrontmatter: true,
40+
});
41+
```
42+
43+
### Via the CLI
44+
45+
```bash
46+
# Import a directory of docs
47+
agentic-db docs import ./my-docs --repo my-repo --tags docs,internal
48+
49+
# Export documents to a directory
50+
agentic-db docs export ./output --repo my-repo
51+
52+
# List documents for a repo
53+
agentic-db docs list --repo my-repo
54+
```
55+
56+
## How it works
57+
58+
### Import
59+
60+
1. Scans the directory for supported text files
61+
2. Parses frontmatter from `.md`/`.mdx` files to extract title, tags, and metadata
62+
3. Matches files to existing documents by `repo_name + file_path`
63+
4. Creates new documents or updates existing ones (last-write-wins)
64+
5. The database's auto-embed triggers handle embedding generation
65+
66+
### Export
67+
68+
1. Fetches all documents for the specified `repo_name`
69+
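2. Writes each document to disk at its `file_path`; when `file_path` is missing, a file name is derived from the title (e.g. "My Great Document" becomes `my-great-document.md`), and documents with neither a `file_path` nor a title are skipped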
70+
3. Includes frontmatter (title, tags, metadata) in markdown files by default; pass `includeFrontmatter: false` to write only the raw document content
71+
4. Creates nested directories as needed
72+
73+
### Conflict Resolution
74+
75+
Uses **last-write-wins**: whichever operation runs last (import or manual DB edit) determines the current state. This keeps the workflow simple and predictable.
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
import { mkdtempSync, readFileSync, rmSync, existsSync } from 'fs';
2+
import { join } from 'path';
3+
import { tmpdir } from 'os';
4+
5+
import { exportDocuments } from '../src/exporter';
6+
import { DocumentClient, DocumentRecord } from '../src/importer';
7+
8+
/**
 * Builds an in-memory `DocumentClient` test double backed by a fixed list of records.
 *
 * Lookups read from `docs`; `create` and `update` return the record they would
 * produce but never modify `docs`, and `delete` does nothing.
 *
 * @param docs - Records the mock serves from its lookup methods.
 * @returns A client whose methods resolve against `docs`.
 */
function createMockClientWithDocs(docs: DocumentRecord[]): DocumentClient {
  const client: DocumentClient = {
    // Resolves the first record matching both repo and path, or null when none matches.
    findByRepoAndPath: async (repoName: string, filePath: string) => {
      const match = docs.find(
        (record) => record.repoName === repoName && record.filePath === filePath
      );
      return match ? match : null;
    },

    // Resolves every record that belongs to the given repo.
    findByRepo: async (repoName: string) =>
      docs.filter((record) => record.repoName === repoName),

    // Echoes the input as a record; the trailing keys normalise falsy optionals to null.
    create: async (doc) => {
      const tags = doc.tags || null;
      const metadata = doc.metadata || null;
      const commitHash = doc.commitHash || null;
      return { id: 'new', ...doc, updatedAt: null, tags, metadata, commitHash };
    },

    // Returns the stored record overlaid with the patch; rejects for an unknown id.
    update: async (id, patch) => {
      const current = docs.find((record) => record.id === id);
      if (!current) {
        throw new Error('Not found');
      }
      return { ...current, ...patch };
    },

    // Deletion is intentionally a no-op in this mock.
    delete: async () => {},
  };

  return client;
}
33+
34+
/**
 * Exercises `exportDocuments` against an in-memory mock client, writing into a
 * fresh temporary directory for every test case.
 */
describe('exporter', () => {
  const repoName = 'test-repo';
  let tempDir: string;

  /**
   * Creates a document record for a test case. Any field not supplied falls back
   * to the baseline below (id '1', repo 'test-repo', everything optional null).
   */
  const makeDoc = (fields: Partial<DocumentRecord>): DocumentRecord => ({
    id: '1',
    title: null,
    content: 'Content',
    repoName,
    filePath: null,
    commitHash: null,
    tags: null,
    metadata: null,
    updatedAt: null,
    ...fields,
  });

  /** Reads a file from the export directory as UTF-8 text. */
  const readExported = (...segments: string[]): string =>
    readFileSync(join(tempDir, ...segments), 'utf-8');

  /** Reports whether a path exists inside the export directory. */
  const wasExported = (...segments: string[]): boolean =>
    existsSync(join(tempDir, ...segments));

  beforeEach(() => {
    tempDir = mkdtempSync(join(tmpdir(), 'docloader-export-'));
  });

  afterEach(() => {
    // force: true keeps cleanup from failing if the directory is already gone.
    rmSync(tempDir, { recursive: true, force: true });
  });

  it('should export documents as markdown files', async () => {
    const client = createMockClientWithDocs([
      makeDoc({
        title: 'Getting Started',
        content: '# Getting Started\n\nWelcome!',
        filePath: 'getting-started.md',
        tags: ['docs'],
        updatedAt: '2024-01-01T00:00:00Z',
      }),
    ]);

    const stats = await exportDocuments(tempDir, client, { repoName });

    expect(stats.written).toBe(1);
    expect(stats.errors).toBe(0);

    const content = readExported('getting-started.md');
    const expectedFragments = [
      '---',
      'title: "Getting Started"',
      'tags: ["docs"]',
      '# Getting Started',
      'Welcome!',
    ];
    for (const fragment of expectedFragments) {
      expect(content).toContain(fragment);
    }
  });

  it('should create nested directories for file paths', async () => {
    const client = createMockClientWithDocs([
      makeDoc({
        title: 'API Ref',
        content: '# API Reference',
        filePath: 'docs/api/reference.md',
      }),
    ]);

    await exportDocuments(tempDir, client, { repoName });

    expect(wasExported('docs', 'api', 'reference.md')).toBe(true);
    expect(readExported('docs', 'api', 'reference.md')).toContain('# API Reference');
  });

  it('should skip documents without file_path or title', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: null, content: 'No path or title', filePath: null }),
    ]);

    const stats = await exportDocuments(tempDir, client, { repoName });

    expect(stats.skipped).toBe(1);
    expect(stats.written).toBe(0);
  });

  it('should generate file path from title if no file_path', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: 'My Great Document', content: 'Content here', filePath: null }),
    ]);

    await exportDocuments(tempDir, client, { repoName });

    expect(wasExported('my-great-document.md')).toBe(true);
  });

  it('should export without frontmatter when disabled', async () => {
    const client = createMockClientWithDocs([
      makeDoc({
        title: 'Raw Doc',
        content: '# Raw Content',
        filePath: 'raw.md',
        tags: ['tag1'],
      }),
    ]);

    await exportDocuments(tempDir, client, { repoName, includeFrontmatter: false });

    const content = readExported('raw.md');
    expect(content).not.toContain('---');
    expect(content).toBe('# Raw Content');
  });

  it('should include metadata in frontmatter', async () => {
    const client = createMockClientWithDocs([
      makeDoc({
        title: 'Meta Doc',
        content: 'Content',
        filePath: 'meta.md',
        metadata: { author: 'Dan', category: 'guide' },
      }),
    ]);

    await exportDocuments(tempDir, client, { repoName });

    const content = readExported('meta.md');
    expect(content).toContain('author: "Dan"');
    expect(content).toContain('category: "guide"');
  });

  it('should report progress events', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ title: 'Doc', content: 'Content', filePath: 'doc.md' }),
    ]);
    const events: string[] = [];

    await exportDocuments(tempDir, client, {
      repoName,
      onProgress: (event) => events.push(event.type),
    });

    for (const expectedType of ['exporting', 'written', 'done']) {
      expect(events).toContain(expectedType);
    }
  });

  it('should handle empty repo', async () => {
    const client = createMockClientWithDocs([]);

    const stats = await exportDocuments(tempDir, client, { repoName: 'empty-repo' });

    expect(stats.total).toBe(0);
    expect(stats.written).toBe(0);
  });

  it('should export multiple documents', async () => {
    const client = createMockClientWithDocs([
      makeDoc({ id: '1', title: 'Doc A', content: 'Content A', filePath: 'a.md' }),
      makeDoc({ id: '2', title: 'Doc B', content: 'Content B', filePath: 'b.md' }),
      // Belongs to a different repo, so it must be left out of the export.
      makeDoc({
        id: '3',
        title: 'Other Repo',
        content: 'Should not appear',
        repoName: 'other-repo',
        filePath: 'c.md',
      }),
    ]);

    const stats = await exportDocuments(tempDir, client, { repoName });

    expect(stats.total).toBe(2);
    expect(stats.written).toBe(2);
    expect(wasExported('a.md')).toBe(true);
    expect(wasExported('b.md')).toBe(true);
    expect(wasExported('c.md')).toBe(false);
  });
});

0 commit comments

Comments
 (0)