Skip to content

Commit f9f0f80

Browse files
Merge pull request #17 from lab68dev/feature/ai-model-training
Feature/ai model training
2 parents 7b1452c + b4d69bc commit f9f0f80

14 files changed

Lines changed: 914 additions & 1303 deletions

File tree

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ jobs:
2020

2121
- uses: pnpm/action-setup@v2
2222
with:
23-
version: 8
23+
version: 10
2424

2525
- name: Cache pnpm
2626
uses: actions/cache@v4
@@ -53,7 +53,7 @@ jobs:
5353

5454
- uses: pnpm/action-setup@v2
5555
with:
56-
version: 8
56+
version: 10
5757

5858
- name: Cache pnpm
5959
uses: actions/cache@v4

.gitignore

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,14 @@ yarn-error.log*
2424

2525
# typescript
2626
*.tsbuildinfo
27-
next-env.d.ts
27+
next-env.d.ts
28+
29+
# Python
30+
__pycache__/
31+
*.py[cod]
32+
*$py.class
33+
*.so
34+
.Python
35+
venv/
36+
ENV/
37+
.venv

ai-model/README.md

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# AI Model Training Pipeline for Lab68Dev
2+
3+
Custom NLP model fine-tuned for task creation and tech support.
4+
5+
## Quick Start
6+
7+
### 1. Setup Environment
8+
9+
```bash
10+
# Create virtual environment
11+
python -m venv venv
12+
13+
# Activate (Windows)
14+
.\venv\Scripts\activate
15+
16+
# Activate (Linux/macOS)
17+
source venv/bin/activate
18+

# Install dependencies
19+
pip install -r requirements.txt
20+
```
21+
22+
### 2. Generate Training Data
23+
24+
```bash
25+
python data/generate_dataset.py
26+
```
27+
28+
### 3. Train Model
29+
30+
```bash
31+
python train.py
32+
```
33+
34+
### 4. Run Inference Server
35+
36+
```bash
37+
# Set allowed CORS origins (optional, defaults to http://localhost:3000)
38+
export ALLOWED_ORIGINS="http://localhost:3000,http://localhost:3001"
39+
40+
# Run the server
41+
python inference/server.py
42+
```
43+
44+
#### Environment Variables
45+
46+
- `ALLOWED_ORIGINS`: Comma-separated list of allowed CORS origins (default: `http://localhost:3000`)
47+
- Example: `http://localhost:3000,https://example.com`
48+
- For production, set this to your specific domain(s) for security
49+
50+
## Hardware Requirements
51+
52+
- **Minimum:** RTX 4060 (8GB VRAM)
53+
- **Recommended:** RTX 4070+ (12GB+ VRAM)
54+
55+
## Model Details
56+
57+
- **Base Model:** TinyLlama-1.1B-Chat-v1.0
58+
- **Fine-tuning:** LoRA (rank 16)
59+
- **Dataset:** ~4000 synthetic examples (task creation + tech Q&A)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Training Configuration for Lab68Dev AI Model
2+
3+
# Model Settings
4+
model:
5+
name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
6+
max_length: 512
7+
load_in_4bit: true
8+
9+
# LoRA Configuration
10+
lora:
11+
r: 16
12+
lora_alpha: 32
13+
lora_dropout: 0.05
14+
target_modules:
15+
- "q_proj"
16+
- "k_proj"
17+
- "v_proj"
18+
- "o_proj"
19+
- "gate_proj"
20+
- "up_proj"
21+
- "down_proj"
22+
23+
# Training Hyperparameters
24+
training:
25+
output_dir: "./models/lab68dev-assistant"
26+
num_train_epochs: 3
27+
per_device_train_batch_size: 2
28+
gradient_accumulation_steps: 8
29+
learning_rate: 2.0e-4
30+
weight_decay: 0.01
31+
warmup_ratio: 0.03
32+
lr_scheduler_type: "cosine"
33+
logging_steps: 10
34+
save_steps: 100
35+
eval_steps: 100
36+
save_total_limit: 3
37+
fp16: true
38+
optim: "paged_adamw_8bit"
39+
40+
# Dataset Settings
41+
dataset:
42+
train_file: "./data/dataset/train.jsonl"
43+
val_file: "./data/dataset/val.jsonl"
44+
train_size: 0.9
45+
46+
# Generation Settings (for inference)
47+
generation:
48+
max_new_tokens: 256
49+
temperature: 0.7
50+
top_p: 0.9
51+
do_sample: true

ai-model/data/generate_dataset.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python3
2+
"""Dataset Generator for Lab68Dev AI Model"""
3+
import json
4+
import random
5+
from pathlib import Path
6+
from typing import List, Dict
7+
8+
# Candidate technologies per task category; sampled into each task's
# "tech_stack" field.
TECH_STACKS = {
    "frontend": ["React", "Next.js", "Vue.js", "TypeScript", "Tailwind"],
    "backend": ["Node.js", "Python", "FastAPI", "Express", "Django"],
    "database": ["PostgreSQL", "MongoDB", "Redis", "Supabase"],
    "devops": ["Docker", "AWS", "GitHub Actions", "Kubernetes"],
}

# (user prompt, category) pairs used to synthesize task-creation examples.
TASK_PROMPTS = [
    ("Create a task for building a navbar in React", "frontend"),
    ("Create a task for building a login form", "frontend"),
    ("I need to implement dark mode for my app", "frontend"),
    ("Create a task for setting up a REST API", "backend"),
    ("I need to build a user authentication system", "backend"),
    ("Create a task for database schema design", "database"),
    ("I need to set up CI/CD pipeline", "devops"),
    ("Create a task for Docker containerization", "devops"),
]

# (question, answer) pairs used to synthesize tech-Q&A examples.
QA_PAIRS = [
    ("Explain async/await in JavaScript", "Async/await is syntactic sugar for Promises. The async keyword marks a function as asynchronous, and await pauses execution until the Promise resolves. Use try/catch for error handling."),
    ("Explain closures in JavaScript", "A closure is a function that has access to variables from its outer scope, even after the outer function has returned. Common uses include data privacy and factory functions."),
    ("How to implement state management in React?", "Options include: useState for local state, useReducer for complex state, Context API for global state, or external libraries like Zustand or Redux for large applications."),
    ("How to handle errors in Node.js?", "Use try/catch blocks for async/await. Create custom error classes. Implement error middleware in Express. Always log errors with proper context."),
    ("Explain REST APIs", "REST uses HTTP methods (GET, POST, PUT, DELETE) for CRUD operations. Resources are identified by URLs. The architecture is stateless and uses proper status codes."),
    ("What is the difference between let and const?", "Both are block-scoped. let allows reassignment while const does not. Use const by default and let only when you need to reassign the variable."),
    ("How to optimize database queries?", "Use indexes on frequently queried columns. Select only needed columns. Use EXPLAIN to analyze query plans. Implement pagination for large datasets."),
]

# System message prepended to every formatted training example.
SYSTEM_PROMPT = "You are Lab68Dev Assistant, an AI specialized in software development tasks and technical explanations."
37+
38+
39+
def extract_title(prompt: str) -> str:
    """Derive a task title from a user prompt.

    Strips a known leading phrase ("Create a task for " / "I need to ")
    and upper-cases the first character of what remains.

    Args:
        prompt: Raw user prompt, e.g. "Create a task for building a navbar in React".

    Returns:
        The title with only its first letter upper-cased; the rest of the
        text is left untouched. (The previous implementation used
        ``str.capitalize()``, which lowercases every non-initial character
        and mangled proper nouns such as "React" -> "react".)
    """
    def _upper_first(s: str) -> str:
        # Upper-case only the first character; safe on the empty string.
        return s[:1].upper() + s[1:]

    for prefix in ("Create a task for ", "I need to "):
        if prompt.startswith(prefix):
            return _upper_first(prompt[len(prefix):])
    return _upper_first(prompt)
45+
46+
47+
def generate_task(prompt: str, category: str) -> Dict:
    """Build one structured task record for a training example.

    Args:
        prompt: The user instruction the task is derived from.
        category: A TECH_STACKS key; unknown categories fall back to a
            plain JavaScript stack.

    Returns:
        A dict with title, category, random priority/estimate, a sampled
        tech stack, and a fixed four-step checklist.
    """
    stack = TECH_STACKS.get(category, ["JavaScript"])
    # Random fields drawn in the same order as before (priority,
    # estimate, then stack sample) so seeded runs stay reproducible.
    priority = random.choice(["low", "medium", "high"])
    estimate = random.choice([2, 4, 8, 16])
    chosen = random.sample(stack, min(2, len(stack)))
    step_texts = [
        "Research requirements",
        "Implement core functionality",
        "Write tests",
        "Review and deploy",
    ]
    return {
        "title": extract_title(prompt),
        "category": category,
        "priority": priority,
        "estimated_hours": estimate,
        "tech_stack": chosen,
        "steps": [
            {"step": n, "description": text, "status": "pending"}
            for n, text in enumerate(step_texts, start=1)
        ],
    }
63+
64+
65+
def generate_examples(num_tasks: int = 2000, num_qa: int = 2000) -> List[Dict]:
    """Create a shuffled mix of task-creation and tech-Q&A examples.

    Args:
        num_tasks: Number of task-creation examples to generate.
        num_qa: Number of Q&A examples to generate.

    Returns:
        A shuffled list of dicts, each with "instruction", "output",
        and "type" ("task_creation" or "tech_qa") keys.
    """
    examples: List[Dict] = []

    # Task-creation examples: the structured task is serialized as
    # pretty-printed JSON so the model learns to emit JSON output.
    for _ in range(num_tasks):
        prompt, category = random.choice(TASK_PROMPTS)
        examples.append({
            "instruction": prompt,
            "output": json.dumps(generate_task(prompt, category), indent=2),
            "type": "task_creation",
        })

    # Q&A examples: plain-text answers.
    for _ in range(num_qa):
        question, answer = random.choice(QA_PAIRS)
        examples.append({
            "instruction": question,
            "output": answer,
            "type": "tech_qa",
        })

    random.shuffle(examples)
    return examples
90+
91+
92+
def format_for_training(examples: List[Dict]) -> List[Dict]:
    """Render examples into TinyLlama's chat-template text format.

    Each example becomes a single {"text": ...} record of the form:
        <|system|>\\n{SYSTEM_PROMPT}</s>\\n<|user|>\\n{instruction}</s>\\n<|assistant|>\\n{output}</s>

    Args:
        examples: Dicts with "instruction" and "output" keys.

    Returns:
        A list of {"text": formatted_string} dicts, one per example.
    """
    def render(instruction: str, output: str) -> str:
        # Concatenation reproduces the exact tag/newline layout expected
        # by the TinyLlama chat template.
        return (
            "<|system|>" + "\n" + SYSTEM_PROMPT + "</s>" + "\n"
            + "<|user|>" + "\n" + instruction + "</s>" + "\n"
            + "<|assistant|>" + "\n" + output + "</s>"
        )

    return [
        {"text": render(ex["instruction"], ex["output"])}
        for ex in examples
    ]
109+
110+
111+
def main():
    """Generate the synthetic dataset and write train/val JSONL files."""
    print("Generating synthetic dataset...")

    # Ensure the output directory exists before writing.
    output_dir = Path("data/dataset")
    output_dir.mkdir(parents=True, exist_ok=True)

    examples = generate_examples(2000, 2000)

    # 90/10 train/validation split; examples are pre-shuffled by
    # generate_examples, so a positional cut is a random split.
    cut = int(len(examples) * 0.9)
    splits = {
        "train.jsonl": format_for_training(examples[:cut]),
        "val.jsonl": format_for_training(examples[cut:]),
    }

    # One JSON object per line (JSONL), UTF-8, non-ASCII kept readable.
    for filename, rows in splits.items():
        with open(output_dir / filename, "w", encoding="utf-8") as fh:
            fh.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

    print(f"Generated {len(splits['train.jsonl'])} training examples")
    print(f"Generated {len(splits['val.jsonl'])} validation examples")
    print(f"Saved to {output_dir}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)