@@ -634,11 +634,75 @@ def _extract_git_contributors(root: Path) -> list[str]:
634634]
635635
636636
637+ def _clean_diff_markers (text : str ) -> str :
638+ """Strip git diff/merge conflict artifacts from text.
639+
640+ Removes leading |-, |+, || prefixes and <<<<<<< / >>>>>>> markers.
641+ """
642+ import re
643+
644+ cleaned_lines : list [str ] = []
645+ skip_block = False
646+ for line in text .splitlines ():
647+ # Skip merge conflict markers entirely
648+ if line .startswith (("<<<<<<" , ">>>>>>" , "======" )):
649+ skip_block = not skip_block if line .startswith ("<<<<<<" ) else skip_block
650+ if line .startswith (">>>>>>>" ):
651+ skip_block = False
652+ continue
653+ if skip_block :
654+ continue
655+ # Strip diff marker prefixes: |-, |+, ||, leading +, leading -
656+ stripped = re .sub (r"^\|{1,2}[-+]\s?" , "" , line )
657+ stripped = re .sub (r"^[-+]\s(?=[A-Z])" , "" , stripped ) # +/- before prose
658+ cleaned_lines .append (stripped )
659+ return "\n " .join (cleaned_lines )
660+
661+
662+ def _detect_content_issues (text : str ) -> list [str ]:
663+ """Detect quality issues in source text. Returns list of warnings."""
664+ import re
665+
666+ warnings : list [str ] = []
667+ lines = text .splitlines ()
668+ diff_marker_count = 0
669+ for i , line in enumerate (lines , 1 ):
670+ # Diff markers
671+ if re .match (r"^\|{1,2}[-+]" , line ):
672+ diff_marker_count += 1
673+ # Merge conflict markers
674+ if line .startswith (("<<<<<<" , ">>>>>>" )):
675+ warnings .append (f" Line { i } : unresolved merge conflict marker" )
676+ if diff_marker_count > 0 :
677+ warnings .append (
678+ f" { diff_marker_count } line(s) with git diff markers (|-, |+) — auto-stripped"
679+ )
680+ return warnings
681+
682+
683+ def _deduplicate_paragraphs (text : str ) -> str :
684+ """Remove duplicate paragraphs within a text block."""
685+ paragraphs = text .split ("\n \n " )
686+ seen : set [str ] = set ()
687+ unique : list [str ] = []
688+ for para in paragraphs :
689+ normalized = para .strip ()
690+ if not normalized :
691+ continue
692+ # Use first 200 chars as dedup key (handles minor formatting diffs)
693+ key = normalized [:200 ].lower ()
694+ if key not in seen :
695+ seen .add (key )
696+ unique .append (para )
697+ return "\n \n " .join (unique )
698+
699+
637700def _extract_governance_sections (root : Path ) -> dict [str , str ]:
638701 """Extract modular governance content from existing AGENTS.md.
639702
640703 If AGENTS.md exists and is large, extract sections into modular files.
641704 Unmatched sections are collected into rules.md so nothing is lost.
705+ Diff markers are stripped and duplicate paragraphs are removed.
642706 """
643707 defaults = {
644708 "rules" : (
@@ -668,10 +732,21 @@ def _extract_governance_sections(root: Path) -> dict[str, str]:
668732 if not agents_path .exists ():
669733 return defaults
670734
671- content = agents_path .read_text (encoding = "utf-8" )
672- if len (content .splitlines ()) < 50 :
735+ raw_content = agents_path .read_text (encoding = "utf-8" )
736+ if len (raw_content .splitlines ()) < 50 :
673737 return defaults # Too short to extract from
674738
739+ # P0: Detect and report content issues, then clean diff markers
740+ issues = _detect_content_issues (raw_content )
741+ if issues :
742+ import sys
743+
744+ print ("\n [specsmith] Content quality warnings in AGENTS.md:" , file = sys .stderr ) # noqa: T201
745+ for w in issues :
746+ print (w , file = sys .stderr ) # noqa: T201
747+
748+ content = _clean_diff_markers (raw_content )
749+
675750 # Parse AGENTS.md into sections by ## headings
676751 sections : dict [str , str ] = {}
677752 current_heading = ""
@@ -744,7 +819,8 @@ def _extract_governance_sections(root: Path) -> dict[str, str]:
744819 parts .append (f"## { heading } \n " )
745820 parts .append (body )
746821 parts .append ("" )
747- result [category ] = "\n " .join (parts ) + "\n "
822+ # P1: Deduplicate paragraphs within each governance file
823+ result [category ] = _deduplicate_paragraphs ("\n " .join (parts )) + "\n "
748824
749825 return result
750826
@@ -896,8 +972,14 @@ def _write(rel_path: str, content: str) -> None:
896972 f"- Build system: { result .build_system } \n " ,
897973 )
898974
899- # docs/REQUIREMENTS.md
900- reqs = "# Requirements\n \n Requirements auto-generated from project detection.\n \n "
975+ # docs/REQUIREMENTS.md — skip if project already has one (anywhere under docs/)
976+ existing_reqs = list (target .glob ("docs/**/REQUIREMENTS*" )) + list (
977+ target .glob ("docs/**/requirements*" )
978+ )
979+ if existing_reqs and not force :
980+ pass # Preserve existing requirements doc
981+ else :
982+ reqs = "# Requirements\n \n Requirements auto-generated from project detection.\n \n "
901983 for module in result .modules :
902984 mu = module .upper ().replace (" " , "-" )
903985 reqs += (
@@ -913,21 +995,33 @@ def _write(rel_path: str, content: str) -> None:
913995 "- **Status**: Draft\n "
914996 f"- **Description**: Project builds successfully with { result .build_system } \n \n "
915997 )
916- _write ("docs/REQUIREMENTS.md" , reqs )
998+ _write ("docs/REQUIREMENTS.md" , reqs )
917999
918- # docs/TEST_SPEC.md
919- tests = "# Test Specification\n \n Tests auto-generated from project detection.\n \n "
1000+ # docs/TEST_SPEC.md — skip if project already has one
1001+ existing_tests = list (target .glob ("docs/**/TEST_SPEC*" )) + list (
1002+ target .glob ("docs/**/test_spec*" )
1003+ )
1004+ if existing_tests and not force :
1005+ pass # Preserve existing test spec
1006+ else :
1007+ tests = "# Test Specification\n \n Tests auto-generated from project detection.\n \n "
9201008 for i , test_file in enumerate (result .test_files [:20 ], 1 ):
9211009 tests += f"## TEST-{ i :03d} \n - **File**: { test_file } \n - **Status**: Detected\n "
9221010 for module in result .modules :
9231011 if module in test_file :
9241012 tests += f"- **Requirement**: REQ-{ module .upper ()} -001\n "
9251013 break
9261014 tests += "\n "
927- _write ("docs/TEST_SPEC.md" , tests )
1015+ _write ("docs/TEST_SPEC.md" , tests )
9281016
929- # docs/architecture.md
930- arch = (
1017+ # docs/architecture.md — skip if project has architecture doc anywhere under docs/
1018+ existing_arch = list (target .glob ("docs/**/architecture*" )) + list (
1019+ target .glob ("docs/**/ARCHITECTURE*" )
1020+ )
1021+ if existing_arch and not force :
1022+ pass # Preserve existing architecture doc
1023+ else :
1024+ arch = (
9311025 f"# Architecture — { name } \n \n "
9321026 "Architecture auto-generated from project detection.\n \n "
9331027 "## Overview\n "
@@ -949,7 +1043,7 @@ def _write(rel_path: str, content: str) -> None:
9491043 arch += "## Language Distribution\n "
9501044 for lang_name , count in sorted (result .languages .items (), key = lambda x : - x [1 ]):
9511045 arch += f"- { lang_name } : { count } files\n "
952- _write ("docs/architecture.md" , arch )
1046+ _write ("docs/architecture.md" , arch )
9531047
9541048 # --- Modular governance files ---
9551049 # If AGENTS.md exists and is rich, extract sections from it.
0 commit comments