3737from crowdgit .models .service_execution import ServiceExecution
3838from crowdgit .services .base .base_service import BaseService
3939from crowdgit .services .maintainer .bedrock import invoke_bedrock
40- from crowdgit .services .utils import run_shell_command
40+ from crowdgit .services .maintainer .section_extractor import SectionExtractor
41+ from crowdgit .services .utils import run_shell_command , safe_decode
4142from crowdgit .settings import MAINTAINER_RETRY_INTERVAL_DAYS , MAINTAINER_UPDATE_INTERVAL_HOURS
4243
4344
@@ -93,6 +94,7 @@ class MaintainerService(BaseService):
9394 "code_owners" ,
9495 "emeritus" ,
9596 "workgroup" ,
97+ "readme" ,
9698 }
9799
98100 VALID_EXTENSIONS = {
@@ -132,6 +134,17 @@ class MaintainerService(BaseService):
132134 STEM_MATCH_SCORE = 50
133135 PARTIAL_STEM_SCORE = 25
134136
137+ # Files in KNOWN_PATHS that still need section filtering (contain non-governance content)
138+ SECTION_FILTERED_PATHS = {"readme.md" , "governance.md" }
139+ SCORING_KEYWORDS_SET = frozenset (SCORING_KEYWORDS )
140+
141+ _section_extractor = SectionExtractor ()
142+
@staticmethod
async def _read_text_file(file_path: str) -> str:
    """Read the file at *file_path* and return its content as text.

    The file is opened in binary mode, so no codec is applied while
    reading; the raw bytes are passed to ``safe_decode`` to obtain the
    returned string.

    NOTE(review): presumably ``safe_decode`` tolerates bytes that are not
    valid UTF-8 -- confirm against ``crowdgit.services.utils``.
    """
    async with aiofiles.open(file_path, "rb") as handle:
        raw_bytes = await handle.read()
    # Decoding needs only the in-memory bytes, so it runs after the file is closed.
    return safe_decode(raw_bytes)
147+
135148 def make_role (self , title : str ):
136149 title = title .lower ()
137150 title = (
@@ -360,8 +373,10 @@ async def process_chunk(chunk_index: int, chunk: str):
360373 self .get_extraction_prompt (maintainer_filename , content ),
361374 pydantic_model = MaintainerInfo ,
362375 )
363- self .logger .info ("Maintainers file content analyzed by AI" )
364- self .logger .info (f"Maintainers response: { maintainer_info } " )
376+ info_count = len (maintainer_info .output .info ) if maintainer_info .output .info else 0
377+ self .logger .info (
378+ f"Maintainers file content analyzed by AI (found={ info_count } , cost={ maintainer_info .cost :.4f} )"
379+ )
365380 if maintainer_info .output .info is not None :
366381 return AggregatedMaintainerInfo (
367382 output = AggregatedMaintainerInfoItems (info = maintainer_info .output .info ),
@@ -373,7 +388,7 @@ async def process_chunk(chunk_index: int, chunk: str):
373388 )
374389 else :
375390 self .logger .error (
376- f"Expected a list of maintainer info or an error message, got: { str ( maintainer_info ) } "
391+ f"Expected a list of maintainer info or an error message, got error= { maintainer_info . output . error } "
377392 )
378393 raise MaintanerAnalysisError (
379394 error_message = "Unexpected response from AI for Maintainers analysis" ,
@@ -544,8 +559,7 @@ async def find_candidate_files(
544559 for candidate_path in all_paths :
545560 file_path = os .path .join (repo_path , candidate_path )
546561 try :
547- async with aiofiles .open (file_path , "r" , encoding = "utf-8" ) as f :
548- content = await f .read ()
562+ content = await self ._read_text_file (file_path )
549563 except Exception as e :
550564 self .logger .warning (f"Failed to read candidate { candidate_path } : { repr (e )} " )
551565 continue
@@ -586,6 +600,16 @@ async def analyze_and_build_result(self, filename: str, content: str) -> Maintai
586600 f"Skipping README file '{ filename } ': no governance keyword found in content"
587601 )
588602 raise MaintanerAnalysisError (error_code = ErrorCode .NO_MAINTAINER_FOUND )
603+
604+ fname = os .path .basename (filename ).lower ()
605+ if fname not in self .KNOWN_PATHS or fname in self .SECTION_FILTERED_PATHS :
606+ extracted = self ._section_extractor .extract (fname , content , self .SCORING_KEYWORDS_SET )
607+ if extracted :
608+ self .logger .info (f"Using extracted sections for '{ filename } '" )
609+ content = extracted
610+ else :
611+ self .logger .debug (f"No sections extracted for '{ filename } ', using full content" )
612+
589613 result = await self .analyze_file_content (filename , content )
590614
591615 if not result .output .info :
@@ -618,9 +642,7 @@ async def try_saved_maintainer_file(
618642 f"Saved maintainer file exists, reading content: '{ saved_maintainer_file } '"
619643 )
620644 try :
621- async with aiofiles .open (file_path , "r" , encoding = "utf-8" ) as f :
622- content = await f .read ()
623-
645+ content = await self ._read_text_file (file_path )
624646 result = await self .analyze_and_build_result (saved_maintainer_file , content )
625647 cost += result .total_cost
626648 return result , cost
@@ -664,12 +686,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
664686 root_candidates , subdir_candidates = await self .find_candidate_files (repo_path )
665687 all_candidates = root_candidates + subdir_candidates
666688 candidate_files = [(path , score ) for path , _ , score in all_candidates ][:100 ]
667- self .logger .debug (
668- f"Detection step 2: { len (root_candidates )} root candidate(s), "
669- f"{ len (subdir_candidates )} subdir candidate(s); "
670- f"root={ [p for p , _ , _ in root_candidates ]} , "
671- f"subdir_top={ [p for p , _ , _ in subdir_candidates [:3 ]]} "
672- )
673689
674690 # Step 3: Try root-level files first (in score order), then top subdirectory file
675691 failed_candidates : set [str ] = set ()
@@ -757,7 +773,6 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
757773 f"Passing { len (ai_input_files )} files to AI for maintainer file detection "
758774 f"(total repo files: { len (file_names )} )"
759775 )
760- self .logger .debug (f"AI input files: { [f for f , _ in ai_input_files ]} " )
761776 ai_file_name , ai_cost = await self .find_maintainer_file_with_ai (ai_input_files )
762777 ai_suggested_file = ai_file_name
763778 total_cost += ai_cost
@@ -771,8 +786,7 @@ def _attach_metadata(result: MaintainerResult) -> MaintainerResult:
771786 )
772787 else :
773788 try :
774- async with aiofiles .open (file_path , "r" , encoding = "utf-8" ) as f :
775- content = await f .read ()
789+ content = await self ._read_text_file (file_path )
776790 result = await self .analyze_and_build_result (ai_file_name , content )
777791 total_cost += result .total_cost
778792 return _attach_metadata (result )
0 commit comments