lab68dev
diff --git a/.github/workflows/ci.yml
Lines changed: 2 additions & 2 deletions b/.github/workflows/ci.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.gitignore
Lines changed: 11 additions & 1 deletion b/.gitignore
Lines changed: 11 additions & 1 deletion
diff --git a/ai-model/README.md
Lines changed: 59 additions & 0 deletions b/ai-model/README.md
Lines changed: 59 additions & 0 deletions
diff --git a/ai-model/config/training_config.yaml
Lines changed: 51 additions & 0 deletions b/ai-model/config/training_config.yaml
Lines changed: 51 additions & 0 deletions
diff --git a/ai-model/data/generate_dataset.py
Lines changed: 141 additions & 0 deletions b/ai-model/data/generate_dataset.py
Lines changed: 141 additions & 0 deletions
@@ -20,7 +20,7 @@ jobs:
 
       - uses: pnpm/action-setup@v2
         with:
-          version: 8
+          version: 10
 
       - name: Cache pnpm
         uses: actions/cache@v4
@@ -53,7 +53,7 @@ jobs:
 
       - uses: pnpm/action-setup@v2
         with:
-          version: 8
+          version: 10
 
       - name: Cache pnpm
         uses: actions/cache@v4
 
@@ -24,4 +24,14 @@ yarn-error.log*
 
 # typescript
 *.tsbuildinfo
-next-env.d.ts
+next-env.d.ts
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+ENV/
+.venv
@@ -0,0 +1,59 @@
+# AI Model Training Pipeline for Lab68Dev
+
+A custom language model (TinyLlama fine-tuned with LoRA) for task creation and technical Q&A.
+
+## Quick Start
+
+### 1. Setup Environment
+
+```bash
+# Create virtual environment
+python -m venv venv
+
+# Activate (Windows)
+.\venv\Scripts\activate
+
+# Activate (Linux/macOS)
+source venv/bin/activate
+
+# Install dependencies
+pip install -r requirements.txt
+```
+
+### 2. Generate Training Data
+
+```bash
+python data/generate_dataset.py
+```
+
+### 3. Train Model
+
+```bash
+python train.py
+```
+
+### 4. Run Inference Server
+
+```bash
+# Set allowed CORS origins (optional, defaults to http://localhost:3000)
+export ALLOWED_ORIGINS="http://localhost:3000,http://localhost:3001"
+
+# Run the server
+python inference/server.py
+```
+
+#### Environment Variables
+
+- `ALLOWED_ORIGINS`: Comma-separated list of allowed CORS origins (default: `http://localhost:3000`)
+  - Example: `http://localhost:3000,https://example.com`
+  - For production, set this to your specific domain(s) for security
+
+## Hardware Requirements
+
+- **Minimum:** RTX 4060 (8GB VRAM)
+- **Recommended:** RTX 4070+ (12GB+ VRAM)
+
+## Model Details
+
+- **Base Model:** TinyLlama-1.1B-Chat-v1.0
+- **Fine-tuning:** LoRA (rank 16)
+- **Dataset:** ~4000 synthetic examples (task creation + tech Q&A)
@@ -0,0 +1,51 @@
+# Training Configuration for Lab68Dev AI Model
+
+# Model Settings
+model:
+  name: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+  max_length: 512
+  load_in_4bit: true
+
+# LoRA Configuration
+lora:
+  r: 16
+  lora_alpha: 32
+  lora_dropout: 0.05
+  target_modules:
+    - "q_proj"
+    - "k_proj"
+    - "v_proj"
+    - "o_proj"
+    - "gate_proj"
+    - "up_proj"
+    - "down_proj"
+
+# Training Hyperparameters
+training:
+  output_dir: "./models/lab68dev-assistant"
+  num_train_epochs: 3
+  per_device_train_batch_size: 2
+  gradient_accumulation_steps: 8
+  learning_rate: 2.0e-4
+  weight_decay: 0.01
+  warmup_ratio: 0.03
+  lr_scheduler_type: "cosine"
+  logging_steps: 10
+  save_steps: 100
+  eval_steps: 100
+  save_total_limit: 3
+  fp16: true
+  optim: "paged_adamw_8bit"
+
+# Dataset Settings
+dataset:
+  train_file: "./data/dataset/train.jsonl"
+  val_file: "./data/dataset/val.jsonl"
+  train_size: 0.9
+
+# Generation Settings (for inference)
+generation:
+  max_new_tokens: 256
+  temperature: 0.7
+  top_p: 0.9
+  do_sample: true
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""Dataset Generator for Lab68Dev AI Model"""
+import json
+import random
+from pathlib import Path
+from typing import List, Dict
+
+TECH_STACKS = {  # category name -> technologies generate_task() samples into "tech_stack"
+    "frontend": ["React", "Next.js", "Vue.js", "TypeScript", "Tailwind"],
+    "backend": ["Node.js", "Python", "FastAPI", "Express", "Django"],
+    "database": ["PostgreSQL", "MongoDB", "Redis", "Supabase"],
+    "devops": ["Docker", "AWS", "GitHub Actions", "Kubernetes"]
+}
+
+TASK_PROMPTS = [  # (user prompt, TECH_STACKS category) pairs drawn for task-creation examples
+    ("Create a task for building a navbar in React", "frontend"),
+    ("Create a task for building a login form", "frontend"),
+    ("I need to implement dark mode for my app", "frontend"),
+    ("Create a task for setting up a REST API", "backend"),
+    ("I need to build a user authentication system", "backend"),
+    ("Create a task for database schema design", "database"),
+    ("I need to set up CI/CD pipeline", "devops"),
+    ("Create a task for Docker containerization", "devops"),
+]
+
+QA_PAIRS = [  # (question, reference answer) pairs drawn for tech Q&A examples
+    ("Explain async/await in JavaScript", "Async/await is syntactic sugar for Promises. The async keyword marks a function as asynchronous, and await pauses execution until the Promise resolves. Use try/catch for error handling."),
+    ("Explain closures in JavaScript", "A closure is a function that has access to variables from its outer scope, even after the outer function has returned. Common uses include data privacy and factory functions."),
+    ("How to implement state management in React?", "Options include: useState for local state, useReducer for complex state, Context API for global state, or external libraries like Zustand or Redux for large applications."),
+    ("How to handle errors in Node.js?", "Use try/catch blocks for async/await. Create custom error classes. Implement error middleware in Express. Always log errors with proper context."),
+    ("Explain REST APIs", "REST uses HTTP methods (GET, POST, PUT, DELETE) for CRUD operations. Resources are identified by URLs. The architecture is stateless and uses proper status codes."),
+    ("What is the difference between let and const?", "Both are block-scoped. let allows reassignment while const does not. Use const by default and let only when you need to reassign the variable."),
+    ("How to optimize database queries?", "Use indexes on frequently queried columns. Select only needed columns. Use EXPLAIN to analyze query plans. Implement pagination for large datasets."),
+]
+
+SYSTEM_PROMPT = "You are Lab68Dev Assistant, an AI specialized in software development tasks and technical explanations."  # system turn that format_for_training() puts at the start of every example
+
+
+def extract_title(prompt: str) -> str:
+    """Extract task title from prompt."""
+    for prefix in ["Create a task for ", "I need to "]:
+        if prompt.startswith(prefix):
+            return prompt[len(prefix):].capitalize()
+    return prompt.capitalize()
+
+
+def generate_task(prompt: str, category: str) -> Dict:
+    """Generate structured task response."""
+    tech = TECH_STACKS.get(category, ["JavaScript"])
+    return {
+        "title": extract_title(prompt),
+        "category": category,
+        "priority": random.choice(["low", "medium", "high"]),
+        "estimated_hours": random.choice([2, 4, 8, 16]),
+        "tech_stack": random.sample(tech, min(2, len(tech))),
+        "steps": [
+            {"step": 1, "description": "Research requirements", "status": "pending"},
+            {"step": 2, "description": "Implement core functionality", "status": "pending"},
+            {"step": 3, "description": "Write tests", "status": "pending"},
+            {"step": 4, "description": "Review and deploy", "status": "pending"},
+        ]
+    }
+
+
+def generate_examples(num_tasks: int = 2000, num_qa: int = 2000) -> List[Dict]:
+    """Generate training examples."""
+    examples = []
+    
+    # Task creation examples
+    for _ in range(num_tasks):
+        prompt, category = random.choice(TASK_PROMPTS)
+        task = generate_task(prompt, category)
+        examples.append({
+            "instruction": prompt,
+            "output": json.dumps(task, indent=2),
+            "type": "task_creation"
+        })
+    
+    # Q&A examples
+    for _ in range(num_qa):
+        question, answer = random.choice(QA_PAIRS)
+        examples.append({
+            "instruction": question,
+            "output": answer,
+            "type": "tech_qa"
+        })
+    
+    random.shuffle(examples)
+    return examples
+
+
+def format_for_training(examples: List[Dict]) -> List[Dict]:
+    """Format examples for TinyLlama chat format."""
+    # Define tags for TinyLlama chat format
+    sys_open = "<|system|>"
+    user_open = "<|user|>"
+    asst_open = "<|assistant|>"
+    end_tag = "</s>"
+    newline = "\n"
+    
+    formatted = []
+    for ex in examples:
+        text = sys_open + newline + SYSTEM_PROMPT + end_tag + newline
+        text += user_open + newline + ex["instruction"] + end_tag + newline
+        text += asst_open + newline + ex["output"] + end_tag
+        formatted.append({"text": text})
+    
+    return formatted
+
+
+def main():
+    print("Generating synthetic dataset...")
+    
+    # Create output directory
+    output_dir = Path("data/dataset")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    # Generate examples
+    examples = generate_examples(2000, 2000)
+    
+    # Split into train/val
+    split_idx = int(len(examples) * 0.9)
+    train_examples = format_for_training(examples[:split_idx])
+    val_examples = format_for_training(examples[split_idx:])
+    
+    # Save to JSONL files
+    with open(output_dir / "train.jsonl", "w", encoding="utf-8") as f:
+        for ex in train_examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+    
+    with open(output_dir / "val.jsonl", "w", encoding="utf-8") as f:
+        for ex in val_examples:
+            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
+    
+    print(f"Generated {len(train_examples)} training examples")
+    print(f"Generated {len(val_examples)} validation examples")
+    print(f"Saved to {output_dir}")
+
+
+if __name__ == "__main__":
+    main()