Skip to content

Commit bc9b27e

Browse files
committed
Update
1 parent b121595 commit bc9b27e

8 files changed

Lines changed: 326 additions & 400 deletions

File tree

Cargo.lock

Lines changed: 126 additions & 253 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dataframely/_native.pyi

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
from typing import overload
22

3+
import polars as pl
4+
35
def format_rule_failures(
46
failures: list[tuple[str, int]],
5-
examples: dict[str, list[str]] | None = None,
7+
*,
8+
failures_from: pl.DataFrame | None,
9+
examples_from: pl.DataFrame | None,
10+
primary_key_columns: list[str],
611
) -> str:
712
"""
813
Format rule failures with the same logic that produces validation errors from the
@@ -11,9 +16,13 @@ def format_rule_failures(
1116
Args:
1217
failures: The names of the failures and their counts. This should only include
1318
failures with a count of at least 1.
14-
examples: Optional mapping from rule name to a list of example row strings.
15-
When provided, up to ``len(examples[rule])`` distinct examples are included
16-
in the formatted message for each rule.
19+
failures_from: The data frame containing the rule columns providing the
20+
failures.
21+
examples_from: The data frame from which example rows are taken for each
22+
failure. May be `None`.
23+
primary_key_columns: The primary key columns of the schema for which to format
24+
rule failures. This is only relevant if `examples_from` is provided and
25+
allows for better error messages for the "primary_key" rule.
1726
1827
Returns:
1928
The formatted rule failures.

dataframely/_plugin.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def all_rules_required(
6060
null_is_valid: bool = True,
6161
schema_name: str,
6262
data_columns: Iterable[IntoExpr] | None = None,
63+
primary_key_columns: list[str] | None,
6364
) -> pl.Expr:
6465
"""Execute :mod:`~polars.all_horizontal` and `.all` for a set of rules.
6566
@@ -80,6 +81,8 @@ def all_rules_required(
8081
data_columns: Optional data columns to include for generating example rows in
8182
error messages. If provided, up to 5 distinct example rows are included
8283
for each failing rule.
84+
primary_key_columns: Optional list of primary key columns which are used for
85+
better error messages if data columns are provided.
8386
8487
Returns:
8588
A scalar boolean expression.
@@ -95,6 +98,7 @@ def all_rules_required(
9598
"null_is_valid": null_is_valid,
9699
"schema_name": schema_name,
97100
"num_rule_columns": num_rule_columns,
101+
"primary_key_columns": primary_key_columns or [],
98102
},
99103
use_abs_path=True,
100104
is_elementwise=True,

dataframely/collection/collection.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -409,13 +409,19 @@ def validate(
409409
# information to properly construct a useful error message.
410410
filtered, failures = cls.filter(data, cast=cast, eager=True)
411411
if any(len(failure) > 0 for failure in failures.values()):
412-
errors = {
413-
member: format_rule_failures(
414-
list(failure.counts().items()), failure.examples()
412+
errors: dict[str, str] = {}
413+
for member, failure in failures.items():
414+
if len(failure) == 0:
415+
continue
416+
417+
counts = failure.counts()
418+
errors[member] = format_rule_failures(
419+
list(counts.items()),
420+
failures_from=failure._df.select(counts.keys()),
421+
examples_from=failure.invalid(),
422+
primary_key_columns=cls.member_schemas()[member].primary_key(),
415423
)
416-
for member, failure in failures.items()
417-
if len(failure) > 0
418-
}
424+
419425
details = [
420426
f" > Member '{member}' failed validation:\n"
421427
+ textwrap.indent(error, " ")
@@ -453,7 +459,11 @@ def validate(
453459
)
454460
.filter(
455461
all_rules_required(
456-
filter_names, null_is_valid=False, schema_name=name
462+
filter_names,
463+
null_is_valid=False,
464+
schema_name=name,
465+
data_columns=cls.common_primary_key(),
466+
primary_key_columns=cls.common_primary_key(),
457467
)
458468
)
459469
.drop(filter_names)

dataframely/filter_result.py

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -146,22 +146,6 @@ def counts(self) -> dict[str, int]:
146146
"""
147147
return _compute_counts(self._df, self._rule_columns)
148148

149-
def examples(self, max_examples: int = 5) -> dict[str, list[str]]:
150-
"""Example rows for each failing rule.
151-
152-
For each rule that has at least one failure, returns up to `max_examples`
153-
distinct example rows (as formatted strings) from the original data columns.
154-
155-
Args:
156-
max_examples: The maximum number of distinct example rows to return per
157-
rule.
158-
159-
Returns:
160-
A mapping from rule name to a list of example row strings. Rules with no
161-
failures are not included.
162-
"""
163-
return _compute_examples(self._df, self._rule_columns, max_examples)
164-
165149
def cooccurrence_counts(self) -> dict[frozenset[str], int]:
166150
"""The number of validation failures per co-occurring rule validation failure.
167151
@@ -425,28 +409,6 @@ def _compute_counts(df: pl.DataFrame, rule_columns: list[str]) -> dict[str, int]
425409
}
426410

427411

428-
def _compute_examples(
429-
df: pl.DataFrame, rule_columns: list[str], max_examples: int
430-
) -> dict[str, list[str]]:
431-
if len(rule_columns) == 0:
432-
return {}
433-
434-
data_columns = [c for c in df.columns if c not in rule_columns]
435-
if not data_columns:
436-
return {}
437-
438-
result = {}
439-
for rule_name in rule_columns:
440-
failing = df.filter(pl.col(rule_name).not_())
441-
if len(failing) == 0:
442-
continue
443-
examples_df = (
444-
failing.select(data_columns).unique(maintain_order=True).head(max_examples)
445-
)
446-
result[rule_name] = [str(row) for row in examples_df.to_dicts()]
447-
return result
448-
449-
450412
def _compute_cooccurrence_counts(
451413
df: pl.DataFrame, rule_columns: list[str]
452414
) -> dict[frozenset[str], int]:

dataframely/schema.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -576,9 +576,13 @@ def validate(
576576
if eager:
577577
out, failure = cls.filter(df, cast=cast, eager=True)
578578
if len(failure) > 0:
579+
counts = failure.counts()
579580
raise ValidationError(
580581
format_rule_failures(
581-
list(failure.counts().items()), failure.examples()
582+
list(counts.items()),
583+
failures_from=failure._df.select(counts.keys()),
584+
examples_from=failure.invalid(),
585+
primary_key_columns=cls.primary_key(),
582586
)
583587
)
584588
return out
@@ -594,6 +598,7 @@ def validate(
594598
rules.keys(),
595599
schema_name=cls.__name__,
596600
data_columns=cls.column_names(),
601+
primary_key_columns=cls.primary_key(),
597602
)
598603
)
599604
.drop(rules.keys())

src/polars_plugin/mod.rs

Lines changed: 22 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,6 @@ mod rule_failure;
22
mod utils;
33
mod validation_error;
44

5-
use std::collections::{HashMap, HashSet};
6-
75
use polars::prelude::*;
86
use polars_core::POOL;
97
use pyo3_polars::derive::polars_expr;
@@ -64,55 +62,11 @@ pub fn all_rules(inputs: &[Series]) -> PolarsResult<Series> {
6462
struct RequiredValidationKwargs {
6563
schema_name: String,
6664
null_is_valid: bool,
65+
primary_key_columns: Option<Vec<String>>,
6766
#[serde(default)]
6867
num_rule_columns: Option<usize>,
6968
}
7069

71-
/// The maximum number of distinct example rows included in validation error messages.
72-
const MAX_EXAMPLES: usize = 5;
73-
74-
/// Format a single data row (at `row_idx`) from the given data series as a Python-like dict string.
75-
fn format_example_row(data_series: &[Series], row_idx: usize) -> String {
76-
let kvs: Vec<String> = data_series
77-
.iter()
78-
.map(|s| {
79-
let val = s.get(row_idx).unwrap_or(AnyValue::Null);
80-
format!("'{}': {}", s.name(), val)
81-
})
82-
.collect();
83-
format!("{{{}}}", kvs.join(", "))
84-
}
85-
86-
/// Compute up to `max_examples` distinct example rows for a failing rule.
87-
fn compute_examples(
88-
bool_ca: &BooleanChunked,
89-
null_is_valid: bool,
90-
data_series: &[Series],
91-
max_examples: usize,
92-
) -> Vec<String> {
93-
let mut seen: HashSet<String> = HashSet::new();
94-
let mut examples: Vec<String> = Vec::new();
95-
96-
for (i, val) in bool_ca.iter().enumerate() {
97-
let is_failure = match val {
98-
Some(false) => true,
99-
None => !null_is_valid,
100-
_ => false,
101-
};
102-
if is_failure {
103-
let row_str = format_example_row(data_series, i);
104-
if seen.insert(row_str.clone()) {
105-
examples.push(row_str);
106-
if examples.len() >= max_examples {
107-
break;
108-
}
109-
}
110-
}
111-
}
112-
113-
examples
114-
}
115-
11670
/// Reduce a set of boolean columns into a single boolean scalar, AND-ing all values.
11771
/// Null values are treated as `true`.
11872
/// In contrast to `all_rules`, this function raises an error if the returned value would be
@@ -140,26 +94,26 @@ pub fn all_rules_required(
14094
return Ok(column.take_materialized_series());
14195
}
14296

143-
// Compute examples for each failing rule using the data columns.
144-
let examples: HashMap<String, Vec<String>> = if data_inputs.is_empty() {
145-
HashMap::new()
146-
} else {
147-
failures
97+
// Aggregate failures into a validation error
98+
let failures_from = DataFrame::new(
99+
rule_inputs[0].len(),
100+
rule_inputs
148101
.iter()
149-
.map(|failure| {
150-
let rule_series = rule_inputs
151-
.iter()
152-
.find(|s| s.name().as_str() == failure.rule)
153-
.expect("failing rule not found in inputs");
154-
let bool_ca = as_bool(rule_series)?;
155-
let examples =
156-
compute_examples(bool_ca, kwargs.null_is_valid, data_inputs, MAX_EXAMPLES);
157-
Ok((failure.rule.to_string(), examples))
158-
})
159-
.collect::<PolarsResult<HashMap<_, _>>>()?
160-
};
161-
162-
// Aggregate failure counts into a validation error.
163-
let error = RuleValidationError::new(failures);
164-
Err(polars_err!(ComputeError: format!("\n{}", error.to_string(Some(&kwargs.schema_name), Some(&examples)))))
102+
.map(|s| s.clone().into_column())
103+
.collect(),
104+
)?;
105+
let examples_from = DataFrame::new(
106+
data_inputs[0].len(),
107+
data_inputs
108+
.iter()
109+
.map(|s| s.clone().into_column())
110+
.collect(),
111+
)?;
112+
let error = RuleValidationError::new(
113+
failures,
114+
Some(failures_from),
115+
Some(examples_from),
116+
kwargs.primary_key_columns.unwrap_or_default(),
117+
);
118+
Err(polars_err!(ComputeError: format!("\n{}", error.to_string(Some(&kwargs.schema_name)))))
165119
}

0 commit comments

Comments
 (0)