From 4fbe66282d6822786a3d74b15e42e39561affa10 Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Wed, 24 Jun 2026 03:37:37 -0700 Subject: [PATCH] Fix 'Filter by value' top/bottom n returning more items than requested --- processors/filtering/column_filter.py | 35 +++++++++++++++++++-------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/processors/filtering/column_filter.py b/processors/filtering/column_filter.py index 8b0729d9c..5045089f5 100644 --- a/processors/filtering/column_filter.py +++ b/processors/filtering/column_filter.py @@ -239,20 +239,35 @@ def filter_top(self, column, top_n, bottom=False): :param bool bottom: If true, return bottom results instead :return: """ - possible_values = set() + value_counts = {} top_n = convert_to_int(top_n, 10) - for item in self.source_dataset.iterate_items(): - possible_values.add(item.get(column)) + if top_n <= 0: + return - ranked_items = 0 - top_values = sorted(list(possible_values), reverse=(not bottom))[:top_n] for item in self.source_dataset.iterate_items(processor=self): - if item.get(column) in top_values: - ranked_items = 0 - yield item + value = item.get(column) + value_counts[value] = value_counts.get(value, 0) + 1 - if ranked_items >= top_n: - return + value_quotas = {} + remaining = top_n + for value in sorted(value_counts, reverse=(not bottom)): + value_quotas[value] = min(value_counts[value], remaining) + remaining -= value_quotas[value] + if remaining <= 0: + break + + yielded_values = {} + for item in self.source_dataset.iterate_items(processor=self): + value = item.get(column) + if value not in value_quotas: + continue + + yielded_for_value = yielded_values.get(value, 0) + if yielded_for_value >= value_quotas[value]: + continue + + yield item + yielded_values[value] = yielded_for_value + 1 class ColumnProcessorFilter(ColumnFilter):