Skip to content

Commit b2269ff

Browse files
tbitcs and oz-agent committed
fix: import quality — diff marker stripping, dedup, existing doc detection
P0: Diff marker detection + auto-stripping
- Detects |-, |+, || prefixes and merge conflict markers in AGENTS.md
- Strips them before section parsing, prints warnings to stderr
- Prevents broken formatting from propagating into governance files

P0: Paragraph-level deduplication
- After splitting into governance files, removes duplicate paragraphs within each file using first-200-char normalized comparison
- Eliminates ~50 lines of pure duplication in typical FPGA projects

P2: Existing doc detection
- Skip generating docs/architecture.md stub when project already has docs/**/architecture* or docs/**/ARCHITECTURE*
- Same for docs/REQUIREMENTS.md and docs/TEST_SPEC.md
- Prevents conflicting stubs alongside real project documentation

Filed #47 (topic-aware classification) and #48 (type-specific thresholds) for remaining P1/P2 improvements.

Co-Authored-By: Oz <oz-agent@warp.dev>
1 parent a8db64c commit b2269ff

1 file changed

Lines changed: 106 additions & 12 deletions

File tree

src/specsmith/importer.py

Lines changed: 106 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -634,11 +634,75 @@ def _extract_git_contributors(root: Path) -> list[str]:
634634
]
635635

636636

637+
def _clean_diff_markers(text: str) -> str:
638+
"""Strip git diff/merge conflict artifacts from text.
639+
640+
Removes leading |-, |+, || prefixes and <<<<<<< / >>>>>>> markers.
641+
"""
642+
import re
643+
644+
cleaned_lines: list[str] = []
645+
skip_block = False
646+
for line in text.splitlines():
647+
# Skip merge conflict markers entirely
648+
if line.startswith(("<<<<<<", ">>>>>>", "======")):
649+
skip_block = not skip_block if line.startswith("<<<<<<") else skip_block
650+
if line.startswith(">>>>>>>"):
651+
skip_block = False
652+
continue
653+
if skip_block:
654+
continue
655+
# Strip diff marker prefixes: |-, |+, ||, leading +, leading -
656+
stripped = re.sub(r"^\|{1,2}[-+]\s?", "", line)
657+
stripped = re.sub(r"^[-+]\s(?=[A-Z])", "", stripped) # +/- before prose
658+
cleaned_lines.append(stripped)
659+
return "\n".join(cleaned_lines)
660+
661+
662+
def _detect_content_issues(text: str) -> list[str]:
663+
"""Detect quality issues in source text. Returns list of warnings."""
664+
import re
665+
666+
warnings: list[str] = []
667+
lines = text.splitlines()
668+
diff_marker_count = 0
669+
for i, line in enumerate(lines, 1):
670+
# Diff markers
671+
if re.match(r"^\|{1,2}[-+]", line):
672+
diff_marker_count += 1
673+
# Merge conflict markers
674+
if line.startswith(("<<<<<<", ">>>>>>")):
675+
warnings.append(f" Line {i}: unresolved merge conflict marker")
676+
if diff_marker_count > 0:
677+
warnings.append(
678+
f" {diff_marker_count} line(s) with git diff markers (|-, |+) — auto-stripped"
679+
)
680+
return warnings
681+
682+
683+
def _deduplicate_paragraphs(text: str) -> str:
684+
"""Remove duplicate paragraphs within a text block."""
685+
paragraphs = text.split("\n\n")
686+
seen: set[str] = set()
687+
unique: list[str] = []
688+
for para in paragraphs:
689+
normalized = para.strip()
690+
if not normalized:
691+
continue
692+
# Use first 200 chars as dedup key (handles minor formatting diffs)
693+
key = normalized[:200].lower()
694+
if key not in seen:
695+
seen.add(key)
696+
unique.append(para)
697+
return "\n\n".join(unique)
698+
699+
637700
def _extract_governance_sections(root: Path) -> dict[str, str]:
638701
"""Extract modular governance content from existing AGENTS.md.
639702
640703
If AGENTS.md exists and is large, extract sections into modular files.
641704
Unmatched sections are collected into rules.md so nothing is lost.
705+
Diff markers are stripped and duplicate paragraphs are removed.
642706
"""
643707
defaults = {
644708
"rules": (
@@ -668,10 +732,21 @@ def _extract_governance_sections(root: Path) -> dict[str, str]:
668732
if not agents_path.exists():
669733
return defaults
670734

671-
content = agents_path.read_text(encoding="utf-8")
672-
if len(content.splitlines()) < 50:
735+
raw_content = agents_path.read_text(encoding="utf-8")
736+
if len(raw_content.splitlines()) < 50:
673737
return defaults # Too short to extract from
674738

739+
# P0: Detect and report content issues, then clean diff markers
740+
issues = _detect_content_issues(raw_content)
741+
if issues:
742+
import sys
743+
744+
print("\n[specsmith] Content quality warnings in AGENTS.md:", file=sys.stderr) # noqa: T201
745+
for w in issues:
746+
print(w, file=sys.stderr) # noqa: T201
747+
748+
content = _clean_diff_markers(raw_content)
749+
675750
# Parse AGENTS.md into sections by ## headings
676751
sections: dict[str, str] = {}
677752
current_heading = ""
@@ -744,7 +819,8 @@ def _extract_governance_sections(root: Path) -> dict[str, str]:
744819
parts.append(f"## {heading}\n")
745820
parts.append(body)
746821
parts.append("")
747-
result[category] = "\n".join(parts) + "\n"
822+
# P1: Deduplicate paragraphs within each governance file
823+
result[category] = _deduplicate_paragraphs("\n".join(parts)) + "\n"
748824

749825
return result
750826

@@ -896,8 +972,14 @@ def _write(rel_path: str, content: str) -> None:
896972
f"- Build system: {result.build_system}\n",
897973
)
898974

899-
# docs/REQUIREMENTS.md
900-
reqs = "# Requirements\n\nRequirements auto-generated from project detection.\n\n"
975+
# docs/REQUIREMENTS.md — skip if project already has one (anywhere under docs/)
976+
existing_reqs = list(target.glob("docs/**/REQUIREMENTS*")) + list(
977+
target.glob("docs/**/requirements*")
978+
)
979+
if existing_reqs and not force:
980+
pass # Preserve existing requirements doc
981+
else:
982+
reqs = "# Requirements\n\nRequirements auto-generated from project detection.\n\n"
901983
for module in result.modules:
902984
mu = module.upper().replace(" ", "-")
903985
reqs += (
@@ -913,21 +995,33 @@ def _write(rel_path: str, content: str) -> None:
913995
"- **Status**: Draft\n"
914996
f"- **Description**: Project builds successfully with {result.build_system}\n\n"
915997
)
916-
_write("docs/REQUIREMENTS.md", reqs)
998+
_write("docs/REQUIREMENTS.md", reqs)
917999

918-
# docs/TEST_SPEC.md
919-
tests = "# Test Specification\n\nTests auto-generated from project detection.\n\n"
1000+
# docs/TEST_SPEC.md — skip if project already has one
1001+
existing_tests = list(target.glob("docs/**/TEST_SPEC*")) + list(
1002+
target.glob("docs/**/test_spec*")
1003+
)
1004+
if existing_tests and not force:
1005+
pass # Preserve existing test spec
1006+
else:
1007+
tests = "# Test Specification\n\nTests auto-generated from project detection.\n\n"
9201008
for i, test_file in enumerate(result.test_files[:20], 1):
9211009
tests += f"## TEST-{i:03d}\n- **File**: {test_file}\n- **Status**: Detected\n"
9221010
for module in result.modules:
9231011
if module in test_file:
9241012
tests += f"- **Requirement**: REQ-{module.upper()}-001\n"
9251013
break
9261014
tests += "\n"
927-
_write("docs/TEST_SPEC.md", tests)
1015+
_write("docs/TEST_SPEC.md", tests)
9281016

929-
# docs/architecture.md
930-
arch = (
1017+
# docs/architecture.md — skip if project has architecture doc anywhere under docs/
1018+
existing_arch = list(target.glob("docs/**/architecture*")) + list(
1019+
target.glob("docs/**/ARCHITECTURE*")
1020+
)
1021+
if existing_arch and not force:
1022+
pass # Preserve existing architecture doc
1023+
else:
1024+
arch = (
9311025
f"# Architecture — {name}\n\n"
9321026
"Architecture auto-generated from project detection.\n\n"
9331027
"## Overview\n"
@@ -949,7 +1043,7 @@ def _write(rel_path: str, content: str) -> None:
9491043
arch += "## Language Distribution\n"
9501044
for lang_name, count in sorted(result.languages.items(), key=lambda x: -x[1]):
9511045
arch += f"- {lang_name}: {count} files\n"
952-
_write("docs/architecture.md", arch)
1046+
_write("docs/architecture.md", arch)
9531047

9541048
# --- Modular governance files ---
9551049
# If AGENTS.md exists and is rich, extract sections from it.

0 commit comments

Comments
 (0)