diff --git a/changedetectionio/blueprint/ui/templates/edit.html b/changedetectionio/blueprint/ui/templates/edit.html
index f6e7f6a086d..3b90d266a68 100644
--- a/changedetectionio/blueprint/ui/templates/edit.html
+++ b/changedetectionio/blueprint/ui/templates/edit.html
@@ -328,8 +328,9 @@
Text filtering
{{ render_checkbox_field(form.filter_text_replaced) }}
{{ render_checkbox_field(form.filter_text_removed) }}
Note: Depending on the length and similarity of the text on each line, the algorithm may consider an addition instead of replacement for example.
- So it's always better to select Added +Replaced when you're interested in new content.
- When content is merely moved in a list, it will also trigger an addition , consider enabling Only trigger when unique lines appear
+ So it's always better to select Added +Replaced when you're interested in new content.
+ When content is merely moved in a list, it will also trigger an addition; consider enabling Only trigger when unique lines appear.
+ The full snapshot is still saved: these filters do not strip added/changed/removed lines from it; they only limit which changes trigger a detection.
{{ render_checkbox_field(form.check_unique_lines) }}
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
index e877ef7cc9e..d774e68fcbd 100644
--- a/changedetectionio/processors/text_json_diff/processor.py
+++ b/changedetectionio/processors/text_json_diff/processor.py
@@ -459,15 +459,27 @@ def run_changedetection(self, watch):
# Save text before ignore filters (for diff calculation)
text_content_before_ignored_filter = stripped_text
+ # Save full content before diff filtering for consistent MD5 calculation
+ full_content_for_md5 = None
+
# === DIFF FILTERING ===
# If user wants specific diff types (added/removed/replaced only)
if watch.has_special_diff_filter_options_set() and len(watch.history.keys()):
- stripped_text = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter)
- if stripped_text is None:
- # No differences found, but content exists
+ # Save full content BEFORE applying diff filtering
+ # This ensures MD5 is always calculated from full content, not the filtered diff
+ full_content_for_md5 = stripped_text
+
+ filtered_diff = self._apply_diff_filtering(watch, stripped_text, text_content_before_ignored_filter)
+ if filtered_diff is None:
+ # No matching differences found (e.g., only removed lines when user wants added/replaced)
+ # Calculate MD5 of full content and return early
c = ChecksumCalculator.calculate(text_content_before_ignored_filter, ignore_whitespace=True)
return False, {'previous_md5': c}, text_content_before_ignored_filter.encode('utf-8')
+ # Has matching changes - use filtered diff for trigger_text evaluation and display,
+ # but full_content_for_md5 will be used later for MD5 calculation
+ stripped_text = filtered_diff
+
# === EMPTY PAGE CHECK ===
empty_pages_are_a_change = self.datastore.data['settings']['application'].get('empty_pages_are_a_change', False)
if not stream_content_type.is_json and not empty_pages_are_a_change and len(stripped_text.strip()) == 0:
@@ -495,17 +507,23 @@ def run_changedetection(self, watch):
stripped_text = transformer.sort_alphabetically(stripped_text)
# === CHECKSUM CALCULATION ===
- text_for_checksuming = stripped_text
+ # When diff filtering is active, use full content for MD5, not the filtered diff
+ # This ensures consistent MD5 calculation regardless of what changes occurred
+ if full_content_for_md5 is not None:
+ text_for_checksuming = full_content_for_md5
+ else:
+ text_for_checksuming = stripped_text
# Apply ignore_text for checksum calculation
if filter_config.ignore_text:
- text_for_checksuming = html_tools.strip_ignore_text(stripped_text, filter_config.ignore_text)
+ text_for_checksuming = html_tools.strip_ignore_text(text_for_checksuming, filter_config.ignore_text)
# Optionally remove ignored lines from output
+ # Note: skipped when diff filtering is active (full_content_for_md5 is set), because stripped_text then holds the filtered diff
strip_ignored_lines = watch.get('strip_ignored_lines')
if strip_ignored_lines is None:
strip_ignored_lines = self.datastore.data['settings']['application'].get('strip_ignored_lines')
- if strip_ignored_lines:
+ if strip_ignored_lines and full_content_for_md5 is None:
stripped_text = text_for_checksuming
# Calculate checksum
@@ -571,7 +589,12 @@ def run_changedetection(self, watch):
if 'text_for_checksuming' in locals() and text_for_checksuming is not stripped_text:
del text_for_checksuming
- return changed_detected, update_obj, stripped_text
+ # When diff filtering is active, return full content for history snapshots
+ # stripped_text contains only the filtered diff, which would create confusing/broken snapshots
+ if full_content_for_md5 is not None:
+ return changed_detected, update_obj, full_content_for_md5
+ else:
+ return changed_detected, update_obj, stripped_text
def _apply_diff_filtering(self, watch, stripped_text, text_before_filter):
"""Apply user's diff filtering preferences (show only added/removed/replaced lines)."""
diff --git a/changedetectionio/tests/test_add_replace_remove_filter.py b/changedetectionio/tests/test_add_replace_remove_filter.py
index ef38b9ad518..07b1e9141ad 100644
--- a/changedetectionio/tests/test_add_replace_remove_filter.py
+++ b/changedetectionio/tests/test_add_replace_remove_filter.py
@@ -183,3 +183,104 @@ def test_check_add_line_contains_trigger(client, live_server, measure_memory_usa
assert '网站监测 内容更新了'.encode('utf-8') in response
delete_all_watches(client)
+
+
+def test_consistent_md5_with_diff_filtering(client, live_server, measure_memory_usage):
+ """
+ Test that MD5 checksums are calculated consistently when diff filtering is active.
+
+ This test ensures that after a change is detected with diff filtering enabled,
+ subsequent checks with identical content don't trigger false positives.
+
+ Bug: Previously, MD5 was calculated from the filtered diff (partial content)
+ when changes were found, but from full content when no changes were found.
+ This caused false positives on the next check with identical content.
+
+ Fix: Always calculate MD5 from full content, regardless of diff filtering.
+ """
+
+ delete_all_watches(client)
+ time.sleep(1)
+
+ # Setup initial content
+ set_original()
+ test_url = url_for('test_endpoint', _external=True)
+ uuid = client.application.config.get('DATASTORE').add_watch(url=test_url)
+
+ # Configure: Only track ADDED and REPLACED lines, ignore REMOVED lines
+ res = client.post(
+ url_for("ui.ui_edit.edit_page", uuid="first"),
+ data={
+ "url": test_url,
+ 'processor': 'text_json_diff',
+ 'fetch_backend': "html_requests",
+ 'filter_text_added': 'y', # Track added lines
+ 'filter_text_replaced': 'y', # Track replaced lines
+ 'filter_text_removed': '', # Don't track removed lines
+ "time_between_check_use_default": "y"
+ },
+ follow_redirects=True
+ )
+ assert b"Updated watch." in res.data
+
+ # CHECK 1: Initial baseline
+ client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+ wait_for_all_checks(client)
+ res = client.get(url_for("watchlist.index"))
+ assert b'has-unread-changes' not in res.data # First check, no change
+
+ # Mark as viewed to start fresh
+ client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
+
+ # CHECK 2: Remove a line (should NOT trigger - removed lines are filtered out)
+ set_original(excluding='Something irrelevant')
+ client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+ wait_for_all_checks(client)
+ res = client.get(url_for("watchlist.index"))
+ assert b'has-unread-changes' not in res.data # No change (removed line filtered)
+
+ # CHECK 3: Add a line (should trigger - added lines are tracked)
+ set_original(excluding='Something irrelevant', add_line='New exciting feature!\n')
+ client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+ wait_for_all_checks(client)
+ res = client.get(url_for("watchlist.index"))
+ assert b'has-unread-changes' in res.data # Change detected (added line)
+
+ # Mark as viewed
+ client.get(url_for("ui.mark_all_viewed"), follow_redirects=True)
+
+ # CHECK 4: Same content as CHECK 3 (THE CRITICAL TEST - should NOT trigger)
+ # This is where the bug would manifest: false positive change detection
+ # because previous MD5 was from filtered diff, current MD5 is from full content
+ set_original(excluding='Something irrelevant', add_line='New exciting feature!\n')
+ client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+ wait_for_all_checks(client)
+ res = client.get(url_for("watchlist.index"))
+
+ # CRITICAL ASSERTION: Should NOT detect change (content is identical to CHECK 3)
+ assert b'has-unread-changes' not in res.data, \
+ "False positive! Content identical to previous check but change was detected. " \
+ "MD5 calculation is inconsistent with diff filtering."
+
+ # CHECK 5: Verify system still detects real changes (replace a line)
+ # Change "Some initial text" to "Some modified text"
+ modified_content = """
+
+ Some modified text
+ So let's see what happens.
+ and a new line!
+ The golden line
+ New exciting feature!
+ A BREAK TO MAKE THE TOP LINE STAY AS "REMOVED" OR IT WILL GET COUNTED AS "CHANGED INTO"
+
+
+ """
+ with open("test-datastore/endpoint-content.txt", "w") as f:
+ f.write(modified_content)
+
+ client.get(url_for("ui.form_watch_checknow"), follow_redirects=True)
+ wait_for_all_checks(client)
+ res = client.get(url_for("watchlist.index"))
+ assert b'has-unread-changes' in res.data # Change detected (replaced line)
+
+ delete_all_watches(client)