Commit 6c2c2e7 (parent 22386fb)

Enhance dataset generation with new export formats and update documentation. Added support for 'chat_template' export format in dataset generation, updated README to reflect new output options, and improved dataset writer to handle dynamic schemas for Parquet format.

7 files changed

Lines changed: 329 additions & 31 deletions

packages/torque/AGENTS.md

Lines changed: 7 additions & 1 deletion
```diff
@@ -1,24 +1,28 @@
 # @qforge/torque Agent Guide
 
 ## Product Surface
+
 - Declarative DSL for composing LLM datasets (messages, tools, metadata) with deterministic RNG helpers.
 - Shipping artifacts: `dist/` bundle, README examples, StackBlitz templates under `stackblitz-templates/`.
 - Consumers rely on stable builder APIs (`generatedUser`, `oneOf`, `times`, `metadata`, schema helpers) and Bun-friendly ESM output.
 
 ## Code Map
+
 - `src/generators.ts`, `schema.ts`, `schema-rng.ts`: core composition primitives and RNG utilities.
 - `src/faker.ts`, `src/seed.ts`, `src/utils.ts`: deterministic seeding & Faker wiring.
-- `src/writer.ts`, `src/dataset.ts`, `src/cli-renderer.ts`: dataset materialization, Parquet/JSONL writers, CLI UX.
+- `src/writer.ts`, `src/dataset.ts`, `src/cli-renderer.ts`, `src/formatter.ts`: dataset materialization, formatters, Parquet/JSONL writers, CLI UX.
 - Tests co-located in `src/*.test.ts` using `bun:test`; keep new tests near the code they cover.
 
 ## Implementation Guardrails
+
 1. **Determinism first** – Always thread `seed` + `withSeed` helpers through new flows; never call `Math.random` or instantiate Faker ad-hoc.
 2. **Immutable schemas** – Treat schema objects as frozen after the `check` phase; copy before mutating and respect `phase` on `IMessageSchemaContext`.
 3. **Types are the contract** – Update `src/types.ts` alongside behavior changes and re-run `tsc -p tsconfig.build.json`.
 4. **Error messaging** – Use descriptive errors (see `schema.ts` for tone) and prefer `ZodError`-style aggregates when validating user structures.
 5. **CLI/story templates** – If a change affects example output, refresh snippets in the README and regenerate StackBlitz templates (`bun run generate:templates`).
 
 ## Testing & Verification
+
 - Unit tests: `bun test packages/torque/src` (Bun discovers `*.test.ts`).
 - Type check + build: `bun run --filter @qforge/torque build`.
 - For RNG-sensitive code, add golden tests that fix a seed and assert exact arrays/messages.
@@ -40,11 +44,13 @@ const mockModel = new MockLanguageModelV2({
 - Document manual verification steps (e.g., running `examples/*.ts`) when automated tests are insufficient.
 
 ## When to Loop In a Human
+
 - Introducing new public builder APIs or altering existing function signatures.
 - Changes that risk breaking template compatibility, dataset schemas, or CLI output formats.
 - Work that requires new dependencies, native bindings, or non-Bun tooling.
 
 ## Definition of Done
+
 - Code is deterministic, typed, and tested.
 - README + templates reflect surface changes.
 - `dist/` is regenerated only during release—do not commit build artifacts.
```
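The "determinism first" guardrail above can be illustrated with a small sketch. This uses a generic mulberry32 PRNG as a stand-in; torque's actual `seed`/`withSeed` helpers are not reproduced here, so `withSeedSketch` is a hypothetical name:

```typescript
// Sketch of the "determinism first" guardrail: a seeded PRNG threaded
// explicitly through the flow instead of calling Math.random.
// mulberry32 is a generic stand-in, not torque's actual implementation.
function mulberry32(seed: number): () => number {
  let a = seed >>> 0;
  return () => {
    a = (a + 0x6d2b79f5) | 0;
    let t = Math.imul(a ^ (a >>> 15), 1 | a);
    t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
    // Normalize to a float in [0, 1)
    return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
  };
}

// Hypothetical analogue of a withSeed-style helper: the RNG is created
// from the seed and passed down, never pulled from global state.
function withSeedSketch<T>(seed: number, fn: (rng: () => number) => T): T {
  return fn(mulberry32(seed));
}

// Same seed, same sequence — the property golden tests rely on.
const a = withSeedSketch(42, (rng) => [rng(), rng(), rng()]);
const b = withSeedSketch(42, (rng) => [rng(), rng(), rng()]);
```

Golden tests can then fix a seed and assert the exact output arrays, as the Testing & Verification section recommends.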

packages/torque/README.md

Lines changed: 13 additions & 7 deletions
````diff
@@ -354,31 +354,37 @@ When Bun workers are unavailable, Torque automatically falls back to in-process
 
 ### Output Formats
 
-Choose your preferred output format for generated datasets:
+Choose your preferred output file format and data structure:
 
 ```typescript
-// Export as JSONL (default - line-delimited JSON)
+// Export as JSONL with default ai-sdk structure (default)
 await generateDataset(schema, {
   count: 100,
   model: openai("gpt-4o-mini"),
-  format: "jsonl", // default, can be omitted
+  format: "jsonl",
   output: "data/dataset.jsonl",
 });
 
-// Export as Parquet (columnar format, efficient for analytics)
+// Export in OpenAI Chat Completions format (tools + messages structure)
 await generateDataset(schema, {
   count: 100,
   model: openai("gpt-4o-mini"),
-  format: "parquet",
-  output: "data/dataset.parquet",
+  format: "jsonl",
+  exportFormat: "chat_template",
+  output: "data/finetune.jsonl",
 });
 ```
 
-**Supported formats:**
+**Supported File Formats (`format`):**
 
 - **`jsonl`** (default) - JSON Lines format, one row per line. Best for streaming and line-by-line processing.
 - **`parquet`** - Apache Parquet columnar format. More efficient for large datasets and analytics tools (e.g., Pandas, DuckDB, Apache Spark).
 
+**Supported Data Structures (`exportFormat`):**
+
+- **`ai-sdk`** (default) - Internal Torque format, compatible with Vercel AI SDK. Includes schema metadata, tool definitions, and full message objects.
+- **`chat_template`** - OpenAI Chat Completions compatible format. Flattened message structure with `tools` and `messages` top-level keys. Ideal for fine-tuning or direct API usage.
+
 Both formats write rows incrementally as they're generated, so large datasets won't consume excessive memory.
 
 > 💡 When `format` is specified without `output`, the file extension is automatically set based on the format.
````
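For reference, a `chat_template` JSONL row carries only the flattened `tools` and `messages` keys. The sketch below builds one such row by hand; the field values (`calculator`, `call_1`, etc.) are hypothetical, but the shape follows the formatter introduced in this commit:

```typescript
// Hand-built sketch of one chat_template row. Values are illustrative;
// the shape mirrors the "tools" + "messages" structure described above.
const row = {
  tools: [
    {
      type: "function",
      function: {
        name: "calculator",
        description: "Performs math",
        parameters: {
          type: "object",
          properties: { a: { type: "number" } },
          required: ["a"],
        },
      },
    },
  ],
  messages: [
    { role: "user", content: "What is 1 + 1?" },
    {
      role: "assistant",
      content: null,
      tool_calls: [
        {
          id: "call_1",
          type: "function",
          function: { name: "calculator", arguments: { a: 1 } },
        },
      ],
    },
    { role: "tool", tool_call_id: "call_1", name: "calculator", content: "2" },
  ],
};

// One JSONL line is simply the JSON-serialized row.
const jsonlLine = JSON.stringify(row);
```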

packages/torque/src/dataset.ts

Lines changed: 6 additions & 2 deletions
```diff
@@ -29,6 +29,7 @@ import type {
   IGenerateDatasetArgsMultiSchema,
 } from "./types";
 import { createWriter } from "./writer";
+import { createFormatter } from "./formatter";
 import { TokenCounterPool } from "./token-counting/tokenCounterPool";
 import { hoistSystemMessages } from "./ai-message-order";
 
@@ -67,6 +68,7 @@ export async function generateDataset(
     seed,
     output,
     format = "jsonl",
+    exportFormat = "ai-sdk",
     model,
     concurrency = 5,
     generationContext,
@@ -99,7 +101,8 @@ export async function generateDataset(
   await fsp.mkdir(outputDir, { recursive: true });
 
   // Initialize the writer for the specified format
-  const writer = createWriter(format, outputPath);
+  const formatter = createFormatter(exportFormat);
+  const writer = createWriter(format, outputPath, formatter.parquetSchema);
   await writer.init();
 
   // Initialize the CLI renderer
@@ -174,7 +177,8 @@ export async function generateDataset(
 
       // Write row immediately after generation
       // Thread-safety is handled internally by the writer
-      await writer.appendRow(row);
+      const formattedRow = formatter.format(row);
+      await writer.appendRow(formattedRow);
 
       // Mark generation as completed
       renderer.completeGeneration(task.index);
```
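The change above threads every generated row through the formatter before the writer sees it. A minimal standalone sketch of that format-then-append pipeline (the `Formatter`/`ArrayWriter` types here are stubs for illustration, not the package's real classes):

```typescript
// Stand-in for torque's IDatasetFormatter: transforms a row before writing.
interface Formatter {
  format(row: Record<string, any>): Record<string, any>;
}

// Toy formatter so the pipeline has an observable effect.
class UppercaseRoleFormatter implements Formatter {
  format(row: Record<string, any>): Record<string, any> {
    return { ...row, role: String(row.role).toUpperCase() };
  }
}

// Stand-in writer that collects rows in memory instead of JSONL/Parquet.
class ArrayWriter {
  rows: Record<string, any>[] = [];
  async appendRow(row: Record<string, any>): Promise<void> {
    this.rows.push(row);
  }
}

// Mirror of the dataset.ts change: format each row, then append immediately,
// so rows stream out incrementally rather than accumulating in memory.
async function writeAll(
  rows: Record<string, any>[],
  formatter: Formatter,
  writer: ArrayWriter
): Promise<void> {
  for (const row of rows) {
    const formatted = formatter.format(row);
    await writer.appendRow(formatted);
  }
}

const writer = new ArrayWriter();
await writeAll(
  [{ role: "user" }, { role: "tool" }],
  new UppercaseRoleFormatter(),
  writer
);
```

Keeping the formatter a separate step means the writer never needs to know which export structure was requested.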
packages/torque/src/formatter.test.ts

Lines changed: 149 additions & 0 deletions (new file)

```typescript
import { describe, it, expect } from "bun:test";
import { ChatTemplateFormatter } from "./formatter";
import type { IDatasetRow } from "./types";

describe("ChatTemplateFormatter", () => {
  const formatter = new ChatTemplateFormatter();

  it("should transform tools to OpenAI format", () => {
    const row: IDatasetRow = {
      messages: [],
      tools: [
        {
          name: "calculator",
          description: "Performs math",
          parameters: {
            type: "object",
            properties: { a: { type: "number" } },
            required: ["a"],
          },
          output: {},
        },
      ],
      schema: {} as any,
      meta: {} as any,
    };

    const result = formatter.format(row);
    expect(result.tools).toHaveLength(1);
    expect(result.tools[0]).toEqual({
      type: "function",
      function: {
        name: "calculator",
        description: "Performs math",
        parameters: {
          type: "object",
          properties: { a: { type: "number" } },
          required: ["a"],
        },
      },
    });
  });

  it("should transform user messages", () => {
    const row: IDatasetRow = {
      messages: [
        {
          role: "user",
          content: "Hello",
          generationId: "1",
        },
      ],
      tools: [],
      schema: {} as any,
      meta: {} as any,
    };

    const result = formatter.format(row);
    expect(result.messages).toHaveLength(1);
    expect(result.messages[0]).toEqual({
      role: "user",
      content: "Hello",
    });
  });

  it("should transform assistant messages with tool calls", () => {
    const row: IDatasetRow = {
      messages: [
        {
          role: "assistant",
          content: [
            { type: "text", text: "Thinking..." },
            {
              type: "tool-call",
              toolCallId: "call_1",
              toolName: "calc",
              input: { a: 1 },
            },
          ],
          generationId: "1",
        } as any, // Casting because IDatasetMessage content type is strict in tests
      ],
      tools: [],
      schema: {} as any,
      meta: {} as any,
    };

    const result = formatter.format(row);
    expect(result.messages).toHaveLength(1);
    expect(result.messages[0]).toEqual({
      role: "assistant",
      content: "Thinking...",
      tool_calls: [
        {
          id: "call_1",
          type: "function",
          function: {
            name: "calc",
            arguments: { a: 1 },
          },
        },
      ],
    });
  });

  it("should flatten tool result messages", () => {
    const row: IDatasetRow = {
      messages: [
        {
          role: "tool",
          content: [
            {
              type: "tool-result",
              toolCallId: "call_1",
              toolName: "calc",
              result: 2,
              output: 2, // dataset.ts populates output
            },
            {
              type: "tool-result",
              toolCallId: "call_2",
              toolName: "calc",
              result: 4,
              output: 4,
            },
          ],
          generationId: "1",
        } as any,
      ],
      tools: [],
      schema: {} as any,
      meta: {} as any,
    };

    const result = formatter.format(row);
    expect(result.messages).toHaveLength(2);
    expect(result.messages[0]).toEqual({
      role: "tool",
      tool_call_id: "call_1",
      name: "calc",
      content: "2",
    });
    expect(result.messages[1]).toEqual({
      role: "tool",
      tool_call_id: "call_2",
      name: "calc",
      content: "4",
    });
  });
});
```

packages/torque/src/formatter.ts

Lines changed: 124 additions & 0 deletions
```typescript
import type { IDatasetRow, DatasetExportFormat } from "./types";

export interface IDatasetFormatter {
  format(row: IDatasetRow): Record<string, any>;
  parquetSchema: Record<string, any>;
}

export class AiSdkFormatter implements IDatasetFormatter {
  parquetSchema = {
    messages: { type: "UTF8" }, // JSON string
    tools: { type: "UTF8" }, // JSON string
    schema: { type: "UTF8" }, // JSON string
    meta: { type: "UTF8" }, // JSON string
  };

  format(row: IDatasetRow): Record<string, any> {
    return row as unknown as Record<string, any>;
  }
}

export class ChatTemplateFormatter implements IDatasetFormatter {
  parquetSchema = {
    tools: { type: "UTF8" }, // JSON string
    messages: { type: "UTF8" }, // JSON string
  };

  format(row: IDatasetRow): Record<string, any> {
    const tools = row.tools.map((tool) => ({
      type: "function",
      function: {
        name: tool.name,
        description: tool.description,
        parameters: tool.parameters,
      },
    }));

    const messages = row.messages.flatMap((msg) => {
      if (msg.role === "tool") {
        // Flatten tool results
        if (Array.isArray(msg.content)) {
          return msg.content
            .map((part: any) => {
              if (part.type === "tool-result") {
                return {
                  role: "tool",
                  tool_call_id: part.toolCallId,
                  name: part.toolName,
                  content: JSON.stringify(part.output),
                };
              }
              return null;
            })
            .filter(Boolean);
        }
        return [];
      }

      if (msg.role === "assistant") {
        const toolCalls: any[] = [];
        let contentString = "";

        if (Array.isArray(msg.content)) {
          for (const part of msg.content) {
            if (part.type === "tool-call") {
              toolCalls.push({
                id: part.toolCallId,
                type: "function",
                function: {
                  name: part.toolName,
                  arguments: part.input,
                },
              });
            } else if (part.type === "text") {
              contentString += part.text;
            }
            // Skip reasoning for chat_template
          }
        } else if (typeof msg.content === "string") {
          contentString = msg.content;
        }

        const newMsg: any = {
          role: "assistant",
          content: contentString || null,
        };
        if (toolCalls.length > 0) {
          newMsg.tool_calls = toolCalls;
        }
        return [newMsg];
      }

      // User / System
      let content = msg.content;
      if (Array.isArray(content)) {
        // Ensure content parts are compatible.
        // OpenAI accepts an array of text/image parts; for now, pass through.
      }

      return [
        {
          role: msg.role,
          content,
        },
      ];
    });

    return { tools, messages };
  }
}

export function createFormatter(
  format: DatasetExportFormat
): IDatasetFormatter {
  switch (format) {
    case "ai-sdk":
      return new AiSdkFormatter();
    case "chat_template":
      return new ChatTemplateFormatter();
    default:
      throw new Error(`Unsupported export format: ${format}`);
  }
}
```
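The tool mapping in `ChatTemplateFormatter` can be exercised in isolation. The sketch below re-declares a simplified version of that mapping (`toOpenAiTools` is an illustrative name, not a package export) to show how a torque tool definition becomes an OpenAI-style function tool:

```typescript
// Simplified stand-in for the torque tool shape (name, description, parameters).
type TorqueTool = {
  name: string;
  description: string;
  parameters: Record<string, any>;
};

// Re-creation of the chat_template tool mapping, for illustration only:
// each tool is wrapped as { type: "function", function: {...} }.
function toOpenAiTools(tools: TorqueTool[]) {
  return tools.map((tool) => ({
    type: "function" as const,
    function: {
      name: tool.name,
      description: tool.description,
      parameters: tool.parameters,
    },
  }));
}

const converted = toOpenAiTools([
  { name: "calc", description: "Adds numbers", parameters: { type: "object" } },
]);
```

Keeping the wrapper logic in one place (the formatter) means downstream writers stay format-agnostic, which is why `createFormatter` is the only switch point for `exportFormat`.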
