Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,14 +107,16 @@ Somm.dev uses **six specialized AI agents** built with LangGraph to evaluate you

## 📊 Evaluation Criteria

Somm.dev provides **4 customizable evaluation modes**:
Somm.dev provides **6 evaluation modes**:

| Mode | Use Case | Weights |
|------|----------|---------|
| **Basic** | General code review | Code Quality (25%), Architecture (20%), Documentation (20%), Testing (20%), Security (15%) |
| **Hackathon** | Gemini 3 Hackathon judging | Technical (40%), Innovation (30%), Impact (20%), Presentation (10%) |
| **Academic** | Research projects | Novelty (25%), Methodology (25%), Reproducibility (20%), Documentation (20%), Impact (10%) |
| **Custom** | Special requirements | User-defined criteria |
| Mode | Use Case | Description |
|------|----------|-------------|
| **six_sommeliers** | Default evaluation | 6 AI sommelier agents with parallel fan-out pattern |
| **grand_tasting** | Quick evaluation | P0 priority techniques only for faster results |
| **full_techniques** | Comprehensive evaluation | All 75 techniques across 8 categories with deep synthesis |
| **basic** | General code review | Code Quality (25%), Architecture (20%), Documentation (20%), Testing (20%), Security (15%) |
| **hackathon** | Gemini 3 Hackathon judging | Technical (40%), Innovation (30%), Impact (20%), Presentation (10%) |
| **academic** | Research projects | Novelty (25%), Methodology (25%), Reproducibility (20%), Documentation (20%), Impact (10%) |

---

Expand Down Expand Up @@ -145,6 +147,14 @@ Somm.dev evaluates repositories on a **0-100 point scale**:
| `/api/evaluate/{id}/result` | GET | Get evaluation results |
| `/api/history` | GET | User evaluation history |

### Techniques API

| Endpoint | Method | Description |
|----------|--------|-------------|
| `/api/techniques` | GET | List all 75 techniques (filter by category, hat, mode) |
| `/api/techniques/stats` | GET | Get aggregated statistics |
| `/api/techniques/{id}` | GET | Get detailed technique definition |

### Request Example
```bash
curl -X POST http://localhost:8000/api/evaluate \
Expand Down
312 changes: 311 additions & 1 deletion backend/app/agents/code_grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from dataclasses import dataclass
from typing import Any

from app.criteria.llm_grader import LLMGrader


@dataclass
class CodeGraderResult:
Expand All @@ -14,7 +16,42 @@ class CodeGraderResult:


class CodeGraderAgent:
EVALUABLE_ITEMS = ["D1", "D2", "C4", "B1", "B2"]
EVALUABLE_ITEMS = [
"A1",
"A2",
"A3",
"A4",
"B1",
"B2",
"B3",
"B4",
"C1",
"C2",
"C3",
"C4",
"C5",
"D1",
"D2",
"D3",
"D4",
]

OBJECTIVE_ITEMS = ["C1", "C2", "C4", "B1", "B2", "D1", "D2"]

SUBJECTIVE_ITEMS = [
"A1",
"A2",
"A3",
"A4",
"B3",
"B4",
"C3",
"C5",
"D3",
"D4",
]
Comment on lines +41 to +52

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The item B2 (System Architecture) is incorrectly included in SUBJECTIVE_ITEMS. B2 is evaluated by a deterministic heuristic (_evaluate_system_architecture) and is correctly listed in OBJECTIVE_ITEMS. To avoid confusion and maintain a clear distinction between objective and subjective evaluations, it should be removed from this list.

    SUBJECTIVE_ITEMS = [
        "A1",
        "A2",
        "A3",
        "A4",
        "B3",
        "B4",
        "C3",
        "C5",
        "D3",
        "D4",
    ]


ORIGINAL_ITEMS = ["D1", "D2", "C4", "B1", "B2"]

def __init__(self):
self.name = "Code Grader"
Expand All @@ -23,7 +60,18 @@ def __init__(self):
def get_evaluable_items(self) -> list[str]:
    """Return a new list holding the codes in ``EVALUABLE_ITEMS``.

    A copy is handed out so callers can mutate the result without
    touching the shared class-level list.
    """
    return [*self.EVALUABLE_ITEMS]

def get_objective_items(self) -> list[str]:
    """Return a new list holding the codes in ``OBJECTIVE_ITEMS``.

    A copy is handed out so callers can mutate the result without
    touching the shared class-level list.
    """
    return [*self.OBJECTIVE_ITEMS]

def get_subjective_items(self) -> list[str]:
    """Return a new list holding the codes in ``SUBJECTIVE_ITEMS``.

    A copy is handed out so callers can mutate the result without
    touching the shared class-level list.
    """
    return [*self.SUBJECTIVE_ITEMS]

def evaluate(self, context: dict[str, Any]) -> dict[str, Any]:
"""Evaluate repository using deterministic heuristics.

Maintains backward compatibility - returns only original 5 items.
Use evaluate_all() for full 17-item evaluation.
"""
repo_context = context.get("repo_context", {})

if not repo_context:
Expand Down Expand Up @@ -62,6 +110,77 @@ def evaluate(self, context: dict[str, Any]) -> dict[str, Any]:
},
}

async def evaluate_all(
self, context: dict[str, Any], llm: Any = None
) -> dict[str, Any]:
"""Evaluate all 17 BMAD items using objective heuristics and optional LLM.

Args:
context: Repository context dictionary
llm: Optional LLM client for subjective items. If None, subjective
items return placeholder scores with confidence="low".

Returns:
Dictionary with all 17 item scores and usage tracking
"""
repo_context = context.get("repo_context", {})

if not repo_context:
return {
"item_scores": {},
"_usage": {
"input_tokens": 0,
"output_tokens": 0,
"duration_ms": 0,
"cost_usd": 0,
},
}

item_scores = {}

objective_map = {
"C1": self._evaluate_code_quality,
"C2": self._evaluate_testing_coverage,
"C4": self._evaluate_test_existence,
"B1": self._evaluate_tech_stack,
"B2": self._evaluate_system_architecture,
"D1": self._evaluate_readme_quality,
"D2": self._evaluate_code_comments,
}

for item_code, method in objective_map.items():
result = method(repo_context)
if result:
item_score = self._to_item_score(result)
item_scores[item_code] = item_score

llm_grader = LLMGrader()
for item_code in self.SUBJECTIVE_ITEMS:
if llm is not None or item_code not in item_scores:
result = await llm_grader.grade_item(item_code, repo_context, llm)
item_scores[item_code] = {
"item_code": result["item_id"],
"score": result["score"],
"max_score": result["max_score"],
"evidence": result["evidence"],
"rationale": result["rationale"],
"confidence": result["confidence"],
"metrics": {},
}
Comment on lines +156 to +169

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

evaluate_all에서 llm_grader.grade_item 호출 시 llm_grader.py의 버그 영향을 받습니다.

llm_grader.py의 grade_item에서 response["score"] 대신 response["parsed"]["score"]를 참조해야 하는 치명적 버그가 있습니다. LLM이 제공되면 이 코드 경로에서 KeyError가 발생하여 결국 재시도 후 fallback 결과(score=0, confidence="low")가 반환됩니다. 즉, LLM을 사용한 주관적 평가가 항상 실패하게 됩니다.

llm_grader.py의 수정이 선행되어야 합니다.

🤖 Prompt for AI Agents
In `@backend/app/agents/code_grader.py` around lines 156 - 169, The evaluate_all
loop calls LLMGrader.grade_item and is failing because grade_item in
llm_grader.py reads response["score"] instead of the actual parsed value
response["parsed"]["score"]; update LLMGrader.grade_item to extract score,
max_score, confidence, rationale, and evidence from response["parsed"] (e.g.,
use response["parsed"]["score"] etc.) and ensure evaluate_all's usage of
LLMGrader.grade_item remains the same so subjective items use the corrected
parsed fields rather than causing KeyError and fallback.


llm_usage = llm_grader.get_usage()
usage = {
"input_tokens": llm_usage["prompt_tokens"],
"output_tokens": llm_usage["completion_tokens"],
"duration_ms": 0,
"cost_usd": llm_usage["cost_usd"],
}

return {
"item_scores": item_scores,
"_usage": usage,
}

def _to_item_score(self, result: CodeGraderResult) -> dict[str, Any]:
return {
"item_code": result.item_code,
Expand Down Expand Up @@ -559,3 +678,194 @@ def _evaluate_system_architecture(
rationale=f"시스템 아키텍처 평가: {score}/6점",
metrics=metrics,
)

def _evaluate_code_quality(
    self, repo_context: dict[str, Any]
) -> CodeGraderResult | None:
    """Evaluate code quality (C1) from tooling configs and file-length metrics.

    Points are awarded as follows (7 in total):
      * 2 - a known linter config file is present
      * 1 - a known formatter config file is present
      * 2 - a known type-checker config file is present
      * 1 - the average length of ``main_files`` is at most 500 lines
      * 1 - a ``.pre-commit-config.yaml`` file is present

    Args:
        repo_context: Repository context. Reads ``file_tree`` (iterable of
            "/"-separated paths) and ``main_files`` (list of dicts whose
            ``content`` value is the file text). Missing or ``None`` values
            are treated as empty.

    Returns:
        A ``CodeGraderResult`` for item ``C1`` with the score capped at 7.
    """
    # ``or []`` also covers keys that are present but explicitly None.
    file_tree = repo_context.get("file_tree") or []
    main_files = repo_context.get("main_files") or []

    max_score = 7
    score = 0
    evidence = []
    metrics = {
        "has_linter_config": False,
        "has_formatter_config": False,
        "has_type_checker": False,
        "avg_file_length": 0,
        "total_files": len(main_files),
    }

    # Match on the lower-cased basename only, so nested configs count too.
    filenames = {path.split("/")[-1].lower() for path in file_tree}

    linter_configs = {
        "ruff.toml": "ruff",
        ".flake8": "flake8",
        ".pylintrc": "pylint",
        ".eslintrc": "eslint",
        ".eslintrc.js": "eslint",
        ".eslintrc.json": "eslint",
        "eslint.config.js": "eslint",
        ".golangci.yml": "golangci-lint",
    }
    formatter_configs = {
        ".prettierrc": "prettier",
        ".prettierrc.js": "prettier",
        ".prettierrc.json": "prettier",
        "biome.json": "biome",
        "black.toml": "black",
        "pyproject.toml": "black/isort",
    }
    type_configs = {
        "mypy.ini": "mypy",
        ".mypy.ini": "mypy",
        "pyrightconfig.json": "pyright",
        "tsconfig.json": "typescript",
    }

    # (config map, points awarded, evidence label, metrics flag)
    tool_checks = (
        (linter_configs, 2, "린터 설정", "has_linter_config"),
        (formatter_configs, 1, "포맷터 설정", "has_formatter_config"),
        (type_configs, 2, "타입 체커", "has_type_checker"),
    )
    for config_map, points, label, metric_key in tool_checks:
        # Sort the de-duplicated tool names: iterating a bare set of strings
        # is not stable across interpreter runs (hash randomization), which
        # made the evidence text non-deterministic.
        detected = sorted(
            {tool for cfg, tool in config_map.items() if cfg.lower() in filenames}
        )
        if detected:
            score += points
            evidence.append(f"{label}: {', '.join(detected)}")
            metrics[metric_key] = True

    if main_files:
        # Files with empty or missing content add zero lines but still count
        # in the denominator.
        total_lines = 0
        for file_info in main_files:
            content = file_info.get("content", "")
            if content:
                total_lines += len(content.split("\n"))

        avg_length = total_lines / len(main_files)
        metrics["avg_file_length"] = round(avg_length, 2)

        # Both tiers earn the same single point; only the wording differs.
        if avg_length <= 300:
            score += 1
            evidence.append(f"적절한 파일 길이 (평균 {avg_length:.0f}줄)")
        elif avg_length <= 500:
            score += 1
            evidence.append(f"양호한 파일 길이 (평균 {avg_length:.0f}줄)")

    if ".pre-commit-config.yaml" in filenames:
        score += 1
        evidence.append("pre-commit 훅 설정")

    if not evidence:
        evidence.append("코드 품질 도구 설정 없음")

    return CodeGraderResult(
        item_code="C1",
        score=min(score, max_score),
        max_score=max_score,
        evidence=evidence,
        rationale=f"코드 품질 평가: {score}/7점",
        metrics=metrics,
    )

def _evaluate_testing_coverage(
self, repo_context: dict[str, Any]
) -> CodeGraderResult | None:
"""Evaluate testing coverage (C2) based on test files and coverage configs."""
file_tree = repo_context.get("file_tree", [])

score = 0
evidence = []
metrics = {
"test_file_count": 0,
"has_coverage_config": False,
"has_test_framework": False,
"test_to_source_ratio": 0.0,
}

test_patterns = ["test_", "_test.", ".test.", ".spec."]
test_files = [
path for path in file_tree if any(p in path.lower() for p in test_patterns)
]
test_file_count = len(test_files)
metrics["test_file_count"] = test_file_count

if test_file_count > 0:
score += 1
evidence.append(f"테스트 파일 {test_file_count}개")

if test_file_count >= 5:
score += 1
evidence.append("충분한 테스트 파일 (5개 이상)")

coverage_configs = [
".coveragerc",
"coverage.ini",
".nycrc",
"codecov.yml",
".codecov.yml",
]
filenames_for_coverage = {path.split("/")[-1].lower() for path in file_tree}
has_coverage = any(
cfg.lower() in filenames_for_coverage for cfg in coverage_configs
)
Comment on lines +812 to +822

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

커버리지 설정 감지 로직에서 오탐(false positive) 가능성이 있습니다.

any(cfg in path.lower() for path in file_tree for cfg in coverage_configs) 패턴은 경로의 일부 문자열과 매칭합니다. 예를 들어 "src/coverage.ini.bak" 같은 경로도 매칭됩니다. _evaluate_test_existence 등 다른 메서드에서는 파일명만 추출(path.split("/")[-1])하여 비교하는데, 여기서는 전체 경로에서 부분 문자열 매칭을 사용하여 일관성이 떨어집니다.

🔧 파일명 기반 매칭으로 수정
+        filenames = {path.split("/")[-1].lower() for path in file_tree}
         coverage_configs = [
             ".coveragerc",
             "coverage.ini",
             ".nycrc",
             "codecov.yml",
             ".codecov.yml",
         ]
-        has_coverage = any(
-            cfg in path.lower() for path in file_tree for cfg in coverage_configs
-        )
+        has_coverage = any(cfg in filenames for cfg in coverage_configs)
🤖 Prompt for AI Agents
In `@backend/app/agents/code_grader.py` around lines 812 - 821, The coverage
detection currently uses substring matching which causes false positives; change
the logic that sets has_coverage to compare only the filename portion against
the exact set of coverage filenames (coverage_configs) using a basename
extraction (e.g., path.split("/")[-1] or os.path.basename) and normalize to
lower-case before equality checks so entries like "src/coverage.ini.bak" no
longer match; update the code around coverage_configs and has_coverage to
implement this exact filename comparison using file_tree and the filename
extraction.

if has_coverage:
score += 2
evidence.append("커버리지 설정 존재")
metrics["has_coverage_config"] = True

framework_configs = {
"pytest.ini": "pytest",
"setup.cfg": "pytest/distutils",
"jest.config.js": "jest",
"jest.config.ts": "jest",
"vitest.config.js": "vitest",
"vitest.config.ts": "vitest",
"karma.conf.js": "karma",
}
filenames = {path.split("/")[-1].lower() for path in file_tree}
detected_frameworks = [
fw for cfg, fw in framework_configs.items() if cfg.lower() in filenames
]
if detected_frameworks:
score += 1
evidence.append(f"테스트 프레임워크: {', '.join(set(detected_frameworks))}")
metrics["has_test_framework"] = True

source_files = [
f
for f in file_tree
if f.endswith((".py", ".js", ".ts", ".go", ".java", ".rs"))
and not any(
p in f.lower() for p in test_patterns + ["tests/", "__tests__/"]
)
]
if source_files and test_files:
ratio = len(test_files) / len(source_files)
metrics["test_to_source_ratio"] = round(ratio, 2)
if ratio >= 0.5:
score += 1
evidence.append(f"양호한 테스트 비율 ({ratio:.2f})")

if not evidence:
evidence.append("테스트 관련 설정 부족")

return CodeGraderResult(
item_code="C2",
score=min(score, 6),
max_score=6,
evidence=evidence,
rationale=f"테스트 커버리지 평가: {score}/6점",
metrics=metrics,
)
Loading
Loading