77import requests
88from datetime import datetime , timedelta
99from dateutil import parser
10- from collections import OrderedDict
10+ from collections import OrderedDict , defaultdict
1111import pytz
1212import logging
1313import time
@@ -523,24 +523,24 @@ def _get_documents_from_url(self, url):
523523 constructor , filters = _parse_filter (url )
524524 return self .wiki .get_documents (filters , constructor = constructor )
525525
def clean(
    self, url_or_filename, lang, ask_before_saving, thread_url, clean_bbcode=False
):
    """
    Clean a set of documents.

    The documents are loaded with ``get_documents`` and handed to
    ``_process_documents`` together with a report header naming their source.

    :param url_or_filename: Camptocamp.org URL, or filename
    :param lang: lang identifier
    :param ask_before_saving: Boolean
    :param thread_url: passed through to ``_process_documents``, which hands
        it to the forum when posting the run report
    :param clean_bbcode: Boolean
    :return: the value returned by ``_process_documents`` for this run

    """

    documents = self.get_documents(url_or_filename)
    report_header = f"Clean documents from `{url_or_filename}`"

    # Propagate the report instead of discarding it, so callers can inspect
    # the outcome of the run (callers that ignore the result are unaffected).
    return self._process_documents(
        documents, lang, ask_before_saving, report_header, thread_url, clean_bbcode
    )
546546 def report (self , url_or_filename , lang ):
@@ -581,48 +581,77 @@ def report(self, url_or_filename, lang):
581581 print ("\n " .join (stdout_report ))
582582
def _process_documents(
    self,
    documents,
    lang,
    ask_before_saving,
    report_header,
    thread_url,
    clean_bbcode=False,
):
    """
    Run the automatic replacements on each document and post a summary.

    Every document that is not a redirection is counted as inspected. A
    document is then skipped when it is in the exclusion list, when it is
    protected or personal and the bot is not a moderator, or when it is
    not valid. Otherwise every production-ready processor is applied and
    the document is saved if at least one processor reported a change.

    :param documents: iterable of documents to process
    :param lang: lang identifier, given to each processor as a
        one-element list
    :param ask_before_saving: Boolean, forwarded to ``document.save``
    :param report_header: title of the report posted to the forum
    :param thread_url: passed to ``self.forum.post_message`` along with
        the report
    :param clean_bbcode: Boolean, forwarded to
        ``get_automatic_replacments``
    :return: mapping of report bucket name to number of documents

    """

    excluded_document_ids = (
        996571,  # article with all automatic corrections
    )

    # The production-ready filter does not depend on the document, so it is
    # evaluated once here rather than for every document.
    processors = [
        processor
        for processor in get_automatic_replacments(self, clean_bbcode=clean_bbcode)
        if processor.ready_for_production
    ]

    report = defaultdict(int)

    for document in documents:
        if "redirects_to" in document:
            continue  # document id is not available...

        document_url = document.get_url()
        report["Inspected"] += 1

        if document.document_id in excluded_document_ids:
            # Counted as inspected, but never modified and never reported
            # in a dedicated bucket.
            continue

        if document.get("protected", False) and not self.moderator:
            logging.info(f"{document_url} is protected")
            report["Skipped because protected"] += 1
            continue

        if document.is_personal() and not self.moderator:
            logging.info(f"{document_url} is a personal document")
            report["Skipped because is not CC-BY-SA"] += 1
            continue

        if not document.is_valid():
            reason = document.get_invalidity_reason()
            logging.info(f"{document_url} : {reason}")
            report[f"Skipped because {reason}"] += 1
            continue

        # Every processor is called, in order, even once one of them has
        # already reported a change; the comments of those that did are
        # joined into the save comment.
        messages = [
            processor.comment
            for processor in processors
            if processor(document, [lang])
        ]

        if not messages:
            continue  # nothing to save

        try:
            new_document = document.save(
                ", ".join(messages), ask_before_saving=ask_before_saving
            )
        except Exception:
            # Best effort: a failure on one document must not abort the
            # whole run. The traceback is kept in the log.
            report["Unexpected error"] += 1
            logging.exception(f"Error while saving {document_url}")
        else:
            # NOTE(review): save() presumably returns None when the
            # operator declines the change -- confirm against save().
            if new_document is None:
                report["Skipped by bot owner"] += 1
            else:
                report["Corrected"] += 1

    log_report = "\n".join(
        [f"* `{bucket}`: {count}" for bucket, count in report.items()]
    )
    self.forum.post_message(f"### {report_header}\n\n{log_report}", thread_url)

    return report
626655
627656 def export (self , url , filename = None ):
628657 """
@@ -719,6 +748,10 @@ def write(**kwargs):
719748 def get_modified_documents (
720749 self , lang , oldest_date = None , newest_date = None , excluded_users = ()
721750 ):
751+ logging .info (
752+ f"Get modified documents from { oldest_date } to { newest_date } in lang:{ lang } "
753+ )
754+
722755 result = OrderedDict ()
723756 for contrib in self .wiki .get_contributions (
724757 oldest_date = oldest_date , newest_date = newest_date
@@ -738,15 +771,11 @@ def get_modified_documents(
738771
739772 return result
740773
741- def clean_recent_changes (self , days , lang , ask_before_saving ):
774+ def clean_recent_changes (self , days , lang , ask_before_saving , thread_url ):
742775 newest_date = utils .today ().replace (hour = 0 , minute = 0 , second = 0 , microsecond = 0 )
743776 oldest_date = newest_date - timedelta (days = days )
744777
745- excluded_ids = [
746- 996571 ,
747- ]
748-
749- processors = get_automatic_replacments (self )
778+ report_header = f"Clean recent change from `{ oldest_date } ` to `{ newest_date } `"
750779
751780 def get_documents ():
752781
@@ -761,12 +790,11 @@ def get_documents():
761790 document_id , document_type = document_type
762791 )
763792
764- if document_id not in excluded_ids :
765- yield document
793+ yield document
766794
767- print ( "Fix recent changes" )
768- self . _process_documents ( get_documents (), processors , [ lang ,], ask_before_saving )
769- print ( "Fix recent changes finished" )
795+ self . _process_documents (
796+ get_documents (), lang , ask_before_saving , report_header , thread_url
797+ )
770798
771799 def get_new_contributors (self , contrib_threshold = 20 , outings_threshold = 15 ):
772800 with open ("contributors.txt" , "r" ) as f :
0 commit comments