diff --git a/changedetectionio/blueprint/ui/templates/edit.html b/changedetectionio/blueprint/ui/templates/edit.html index f6e7f6a086d..3b90d266a68 100644 --- a/changedetectionio/blueprint/ui/templates/edit.html +++ b/changedetectionio/blueprint/ui/templates/edit.html @@ -328,8 +328,9 @@

Text filtering

{{ render_checkbox_field(form.filter_text_replaced) }} {{ render_checkbox_field(form.filter_text_removed) }} Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example.
-  So it's always better to select Added+Replaced when you're interested in new content.
-  When content is merely moved in a list, it will also trigger an addition, consider enabling Only trigger when unique lines appear + So it's always better to select Added+Replaced when you're interested in new content.
+ When content is merely moved within a list, it will also trigger an addition; consider enabling Only trigger when unique lines appear.
+ The full snapshot is still saved (this option does not strip added/changed/removed lines); it only limits what triggers a change detection.
{{ render_checkbox_field(form.check_unique_lines) }} diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py index e877ef7cc9e..d774e68fcbd 100644 --- a/changedetectionio/processors/text_json_diff/processor.py +++ b/changedetectionio/processors/text_json_diff/processor.py @@ -459,15 +459,27 @@ def run_changedetection(self, watch): # Save text before ignore filters (for diff calculation) text_content_before_ignored_filter = stripped_text + # Save full content before diff filtering for consistent MD5 calculation + full_content_for_md5 = None + # === DIFF FILTERING === # If user wants specific diff types (added/removed/replaced only) if watch.has_special_diff_filter_options_set() and len(watch.history.keys()): - stripped_text = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter) - if stripped_text is None: - # No differences found, but content exists + # Save full content BEFORE applying diff filtering + # This ensures MD5 is always calculated from full content, not the filtered diff + full_content_for_md5 = stripped_text + + filtered_diff = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter) + if filtered_diff is None: + # No matching differences found (e.g., only removed lines when user wants added/replaced) + # Calculate MD5 of full content and return early c = ChecksumCalculator.calculate(text_content_before_ignored_filter, ignore_whitespace=True) return False, {'previous_md5': c}, text_content_before_ignored_filter.encode('utf-8') + # Has matching changes - use filtered diff for trigger_text evaluation and display, + # but full_content_for_md5 will be used later for MD5 calculation + stripped_text = filtered_diff + # === EMPTY PAGE CHECK === empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False) if not stream_content_type.is_json and not empty_pages_are_a_change and 
len(stripped_text.strip()) == 0: @@ -495,17 +507,23 @@ def run_changedetection(self, watch): stripped_text = transformer.sort_alphabetically(stripped_text) # === CHECKSUM CALCULATION === - text_for_checksuming = stripped_text + # When diff filtering is active, use full content for MD5, not the filtered diff + # This ensures consistent MD5 calculation regardless of what changes occurred + if full_content_for_md5 is not None: + text_for_checksuming = full_content_for_md5 + else: + text_for_checksuming = stripped_text # Apply ignore_text for checksum calculation if filter_config.ignore_text: - text_for_checksuming = html_tools.strip_ignore_text(stripped_text, filter_config.ignore_text) + text_for_checksuming = html_tools.strip_ignore_text(text_for_checksuming, filter_config.ignore_text) # Optionally remove ignored lines from output + # Note: Only apply to stripped_text if we're not using full_content_for_md5 strip_ignored_lines = watch.get('strip_ignored_lines') if strip_ignored_lines is None: strip_ignored_lines = self.datastore.data['settings']['application'].get('strip_ignored_lines') - if strip_ignored_lines: + if strip_ignored_lines and full_content_for_md5 is None: stripped_text = text_for_checksuming # Calculate checksum @@ -571,7 +589,12 @@ def run_changedetection(self, watch): if 'text_for_checksuming' in locals() and text_for_checksuming is not stripped_text: del text_for_checksuming - return changed_detected, update_obj, stripped_text + # When diff filtering is active, return full content for history snapshots + # stripped_text contains only the filtered diff, which would create confusing/broken snapshots + if full_content_for_md5 is not None: + return changed_detected, update_obj, full_content_for_md5 + else: + return changed_detected, update_obj, stripped_text def _apply_diff_filtering(self, watch, stripped_text, text_before_filter): """Apply user's diff filtering preferences (show only added/removed/replaced lines).""" diff --git 
a/changedetectionio/tests/test_add_replace_remove_filter.py b/changedetectionio/tests/test_add_replace_remove_filter.py index ef38b9ad518..07b1e9141ad 100644 --- a/changedetectionio/tests/test_add_replace_remove_filter.py +++ b/changedetectionio/tests/test_add_replace_remove_filter.py @@ -183,3 +183,104 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa assert '网站监测 内容更新了'.encode('utf-8') in response delete_all_watches(client) + + +def test_consistent_md5_with_diff_filtering(client, live_server, measure_memory_usage): + """ + Test that MD5 checksums are calculated consistently when diff filtering is active. + + This test ensures that after a change is detected with diff filtering enabled, + subsequent checks with identical content don't trigger false positives. + + Bug: Previously, MD5 was calculated from the filtered diff (partial content) + when changes were found, but from full content when no changes were found. + This caused false positives on the next check with identical content. + + Fix: Always calculate MD5 from full content, regardless of diff filtering. + """ + + delete_all_watches(client) + time.sleep(1) + + # Setup initial content + set_original() + test_url = url_for('test_endpoint', _external=True) + uuid = client.application.config.get('DATASTORE').add_watch(url=test_url) + + # Configure: Only track ADDED and REPLACED lines, ignore REMOVED lines + res = client.post( + url_for("ui.ui_edit.edit_page", uuid="first"), + data={ + "url": test_url, + 'processor': 'text_json_diff', + 'fetch_backend': "html_requests", + 'filter_text_added': 'y', # Track added lines + 'filter_text_replaced': 'y', # Track replaced lines + 'filter_text_removed': '', # Don't track removed lines + "time_between_check_use_default": "y" + }, + follow_redirects=True + ) + assert b"Updated watch." 
in res.data + + # CHECK 1: Initial baseline + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + assert b'has-unread-changes' not in res.data # First check, no change + + # Mark as viewed to start fresh + client.get(url_for("ui.mark_all_viewed"), follow_redirects=True) + + # CHECK 2: Remove a line (should NOT trigger - removed lines are filtered out) + set_original(excluding='Something irrelevant') + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + assert b'has-unread-changes' not in res.data # No change (removed line filtered) + + # CHECK 3: Add a line (should trigger - added lines are tracked) + set_original(excluding='Something irrelevant', add_line='

New exciting feature!

') + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + assert b'has-unread-changes' in res.data # Change detected (added line) + + # Mark as viewed + client.get(url_for("ui.mark_all_viewed"), follow_redirects=True) + + # CHECK 4: Same content as CHECK 3 (THE CRITICAL TEST - should NOT trigger) + # This is where the bug would manifest: false positive change detection + # because previous MD5 was from filtered diff, current MD5 is from full content + set_original(excluding='Something irrelevant', add_line='

New exciting feature!

') + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + + # CRITICAL ASSERTION: Should NOT detect change (content is identical to CHECK 3) + assert b'has-unread-changes' not in res.data, \ + "False positive! Content identical to previous check but change was detected. " \ + "MD5 calculation is inconsistent with diff filtering." + + # CHECK 5: Verify system still detects real changes (replace a line) + # Change "Some initial text" to "Some modified text" + modified_content = """ + +

Some modified text

+

So let's see what happens.

+

and a new line!

+

The golden line

+

New exciting feature!

+

A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"

+ + + """ + with open("test-datastore/endpoint-content.txt", "w") as f: + f.write(modified_content) + + client.get(url_for("ui.form_watch_checknow"), follow_redirects=True) + wait_for_all_checks(client) + res = client.get(url_for("watchlist.index")) + assert b'has-unread-changes' in res.data # Change detected (replaced line) + + delete_all_watches(client)