Merge pull request #1 from nmdra/copilot/fine-tune-smollm2-135m-lora

nmdra · web-flow · commit 4848d6ef7e5b · 2026-04-19T14:52:05.000+05:30
Add SmolLM2 training pipeline with script + notebook outputs
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+*.py[cod]
+.venv/
+outputs/
+smollm-student-extractor/
+smollm-student-gguf/
+data/dataset.json
diff --git a/README.md b/README.md
@@ -1,2 +1,56 @@
 # Assignment-Metadata-Extractor
-A lightweight, locally hosted LLM pipeline that extracts and normalizes unstructured student assignment text into a strict JSON schema using a fine-tuned SmolLM2-135M model served via Ollama.
+
+A lightweight, locally hosted LLM pipeline that extracts and normalizes unstructured student assignment text into a strict JSON schema using a fine-tuned SmolLM2 model served via Ollama.
+
+## Model
+
+Hugging Face model: https://huggingface.co/nimendraai/SmolLM2-360M-Assignment-Metadata-Extractor
+
+## Training (Part 1)
+
+This repository includes:
+
+- `data/generate_dataset.py`: Generates a diverse synthetic training dataset.
+- `training/train.py`: Fine-tunes `HuggingFaceTB/SmolLM2-135M-Instruct` with Unsloth + LoRA and exports GGUF.
+- `training/train.ipynb`: Jupyter notebook version of the same training pipeline.
+
+## Environment Setup (uv)
+
+This project uses **uv** for package management.
+
+```bash
+uv venv
+source .venv/bin/activate   # Linux/macOS
+# .venv\Scripts\activate    # Windows PowerShell
+
+uv sync
+```
+
+If you want to run the notebook locally:
+
+```bash
+uv run python -m ipykernel install --user --name assignment-metadata-extractor
+```
+
+## Generate Dataset
+
+```bash
+uv run python data/generate_dataset.py --size 400 --output data/dataset.json
+```
+
+## Run Fine-Tuning Script
+
+```bash
+uv run python training/train.py
+```
+
+Outputs:
+
+- `./smollm-student-extractor/` (HuggingFace format)
+- `./smollm-student-gguf/model-Q4_K_M.gguf` (Ollama-ready GGUF)
+
+## Run Notebook Version
+
+```bash
+uv run jupyter notebook training/train.ipynb
+```
diff --git a/data/generate_dataset.py b/data/generate_dataset.py
@@ -0,0 +1,89 @@
+import argparse
+import json
+import random
+from pathlib import Path
+
+# Label variants for the student-number field; make_example picks one at
+# random per record.
+STUDENT_NUMBER_KEYS = [
+    "Student No",
+    "Stu. ID",
+    "Student ID",
+    "ID",
+    "Reg No",
+    "Registration No",
+    "Index No",
+    "Reg. Number",
+]
+# Label variants for the student-name field.
+STUDENT_NAME_KEYS = ["Name", "Full Name", "Student Name", "Student", "Stu. Name"]
+# Label variants for the assignment-number field.
+ASSIGNMENT_KEYS = [
+    "Assignment #",
+    "Assignment No",
+    "HW",
+    "Task No",
+    "Submission No",
+    "Assgn #",
+    "Worksheet No",
+]
+# Strings placed between a label and its value.
+# NOTE(review): ": " is listed twice, so random.choice selects it twice as
+# often as the other separators - confirm this weighting is intentional.
+SEPARATORS = [": ", " - ", " = ", ": "]
+# Strings placed between consecutive fields of one record (not only newlines).
+LINE_BREAKS = ["\n", " | ", ", ", "  "]
+
+# Pool of student names sampled by generate_dataset.
+NAMES = [
+    "Amal Perera",
+    "Nimal Silva",
+    "Kasun Fernando",
+    "Dilini Rathnayake",
+    "Chamara Bandara",
+    "Sithum Jayawardena",
+    "Amali Gunasekara",
+]
+
+
+def make_example(student_num: int, name: str, assign_num: int) -> dict:
+    sk = random.choice(STUDENT_NUMBER_KEYS)
+    nk = random.choice(STUDENT_NAME_KEYS)
+    ak = random.choice(ASSIGNMENT_KEYS)
+    sep = random.choice(SEPARATORS)
+    lb = random.choice(LINE_BREAKS)
+    text = f"{sk}{sep}{student_num}{lb}{nk}{sep}{name}{lb}{ak}{sep}{assign_num}"
+    return {
+        "instruction": "Extract student info as JSON from the following text.",
+        "input": text,
+        "output": json.dumps(
+            {
+                "student_number": str(student_num),
+                "student_name": name,
+                "assignment_number": str(assign_num),
+            }
+        ),
+    }
+
+
+def generate_dataset(size: int) -> list[dict]:
+    dataset = [
+        make_example(20210000 + i, random.choice(NAMES), (i % 10) + 1) for i in range(size)
+    ]
+    random.shuffle(dataset)
+    return dataset
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Generate student extraction dataset.")
+    parser.add_argument("--size", type=int, default=400, help="Number of examples to generate.")
+    parser.add_argument(
+        "--output",
+        default="data/dataset.json",
+        help="Output JSON path.",
+    )
+    args = parser.parse_args()
+
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    dataset = generate_dataset(args.size)
+    with output_path.open("w", encoding="utf-8") as f:
+        json.dump(dataset, f, indent=2)
+
+    print(f"Generated {len(dataset)} examples at {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "assignment-metadata-extractor"
+version = "0.1.0"
+description = "SmolLM2 fine-tuning pipeline for student assignment metadata extraction"
+readme = "README.md"
+requires-python = ">=3.10,<3.12"
+dependencies = [
+  "accelerate>=1.2.1",
+  "datasets>=3.2.0",
+  "ipykernel>=6.29.5",
+  "jupyter>=1.1.1",
+  "llama-cpp-python>=0.3.7",
+  "trl>=0.12.2",
+  "unsloth>=2025.2.15",
+]
+
+[tool.uv]
+package = false
diff --git a/training/train.ipynb b/training/train.ipynb
diff --git a/training/train.py b/training/train.py