Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/data_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,8 @@ class Histogram(BaseModel):
max_bins: int = 250
x_axis_labels: bool = True
density: bool = False
# Fraction of the data trimmed from each tail when computing the additional
# truncated histogram, e.g. 0.05 keeps the 5th-95th percentile range (0.0 disables it)
percentile_cutoff: float = 0.0


class CatFrequencyPlot(BaseModel):
Expand Down
36 changes: 36 additions & 0 deletions src/data_profiling/model/summary_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,42 @@ def histogram_compute(
)

stats[name] = hist

# Compute truncated histogram if percentile_cutoff is set
cutoff = hist_config.percentile_cutoff
if cutoff > 0.0:
lower = np.percentile(finite, cutoff * 100)
upper = np.percentile(finite, (1 - cutoff) * 100)
mask = (finite_values >= lower) & (finite_values <= upper)
truncated_values = finite_values[mask]
truncated_weights = weights[mask] if weights is not None else None

if len(truncated_values) > 0:
t_vmin = float(np.min(truncated_values))
t_vmax = float(np.max(truncated_values))
t_range = t_vmax - t_vmin

if t_range == 0:
eps = 0.5 if t_vmin == 0 else abs(t_vmin) * 0.1
t_bins = np.array([t_vmin - eps, t_vmin + eps])
else:
requested_bins = hist_config.bins if hist_config.bins > 0 else "auto"
if isinstance(requested_bins, int):
safe_bins = min(requested_bins, n_unique, hist_config.max_bins)
safe_bins = max(1, safe_bins)
t_bins = np.linspace(t_vmin, t_vmax, safe_bins + 1)
else:
t_bins = np.histogram_bin_edges(truncated_values, bins="auto")
if len(t_bins) - 1 > hist_config.max_bins:
t_bins = np.linspace(t_vmin, t_vmax, hist_config.max_bins + 1)

stats[f"{name}_truncated"] = np.histogram(
truncated_values,
bins=t_bins,
weights=truncated_weights,
density=hist_config.density,
)

return stats


Expand Down
18 changes: 16 additions & 2 deletions src/data_profiling/report/structure/variables/render_real.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def render_real(config: Settings, summary: dict) -> dict:
)

summary_histogram = summary.get("histogram", [])
summary_histogram_truncated = summary.get("histogram_truncated", None)

mini_hist_data = None

Expand Down Expand Up @@ -268,7 +269,6 @@ def render_real(config: Settings, summary: dict) -> dict:
f"<strong>Histogram with fixed size bins</strong> "
f"(bins={len(summary_histogram[1]) - 1})"
)

hist = image_or_empty(
hist_data,
alt="Histogram",
Expand All @@ -277,6 +277,20 @@ def render_real(config: Settings, summary: dict) -> dict:
name="Histogram",
anchor_id=f"{varid}histogram",
)
# Truncated histogram
truncated_hist_data = None
if summary_histogram_truncated is not None:
cutoff = config.plot.histogram.percentile_cutoff
truncated_hist_data = histogram(config, *summary_histogram_truncated)

truncated_hist = image_or_empty(
truncated_hist_data,
alt="Truncated Histogram",
image_format=image_format,
caption=f"<strong>Histogram (truncated)</strong> ({cutoff:.0%} - {1-cutoff:.0%} percentile)" if truncated_hist_data else None,
name="Truncated Histogram",
anchor_id=f"{varid}histogram_truncated",
)

fq = FrequencyTable(
template_variables["freq_table_rows"],
Expand Down Expand Up @@ -306,7 +320,7 @@ def render_real(config: Settings, summary: dict) -> dict:
)

template_variables["bottom"] = Container(
[statistics, hist, fq, evs],
[statistics, hist, truncated_hist, fq, evs],
sequence_type="tabs",
anchor_id=f"{varid}bottom",
)
Expand Down