@@ -2,8 +2,6 @@ mod rule_failure;
22mod utils;
33mod validation_error;
44
5- use std:: collections:: { HashMap , HashSet } ;
6-
75use polars:: prelude:: * ;
86use polars_core:: POOL ;
97use pyo3_polars:: derive:: polars_expr;
@@ -64,55 +62,11 @@ pub fn all_rules(inputs: &[Series]) -> PolarsResult<Series> {
6462struct RequiredValidationKwargs {
6563 schema_name : String ,
6664 null_is_valid : bool ,
65+ primary_key_columns : Option < Vec < String > > ,
6766 #[ serde( default ) ]
6867 num_rule_columns : Option < usize > ,
6968}
7069
/// Upper bound on the number of distinct example rows listed per failing rule in a
/// validation error message.
const MAX_EXAMPLES: usize = 5;
73-
74- /// Format a single data row (at `row_idx`) from the given data series as a Python-like dict string.
75- fn format_example_row ( data_series : & [ Series ] , row_idx : usize ) -> String {
76- let kvs: Vec < String > = data_series
77- . iter ( )
78- . map ( |s| {
79- let val = s. get ( row_idx) . unwrap_or ( AnyValue :: Null ) ;
80- format ! ( "'{}': {}" , s. name( ) , val)
81- } )
82- . collect ( ) ;
83- format ! ( "{{{}}}" , kvs. join( ", " ) )
84- }
85-
86- /// Compute up to `max_examples` distinct example rows for a failing rule.
87- fn compute_examples (
88- bool_ca : & BooleanChunked ,
89- null_is_valid : bool ,
90- data_series : & [ Series ] ,
91- max_examples : usize ,
92- ) -> Vec < String > {
93- let mut seen: HashSet < String > = HashSet :: new ( ) ;
94- let mut examples: Vec < String > = Vec :: new ( ) ;
95-
96- for ( i, val) in bool_ca. iter ( ) . enumerate ( ) {
97- let is_failure = match val {
98- Some ( false ) => true ,
99- None => !null_is_valid,
100- _ => false ,
101- } ;
102- if is_failure {
103- let row_str = format_example_row ( data_series, i) ;
104- if seen. insert ( row_str. clone ( ) ) {
105- examples. push ( row_str) ;
106- if examples. len ( ) >= max_examples {
107- break ;
108- }
109- }
110- }
111- }
112-
113- examples
114- }
115-
11670/// Reduce a set of boolean columns into a single boolean scalar, AND-ing all values.
11771/// Null values are treated as `true`.
11872/// In contrast to `all_rules`, this function raises an error if the returned value would be
@@ -140,26 +94,26 @@ pub fn all_rules_required(
14094 return Ok ( column. take_materialized_series ( ) ) ;
14195 }
14296
143- // Compute examples for each failing rule using the data columns.
144- let examples: HashMap < String , Vec < String > > = if data_inputs. is_empty ( ) {
145- HashMap :: new ( )
146- } else {
147- failures
97+ // Aggregate failures into a validation error
98+ let failures_from = DataFrame :: new (
99+ rule_inputs[ 0 ] . len ( ) ,
100+ rule_inputs
148101 . iter ( )
149- . map ( |failure| {
150- let rule_series = rule_inputs
151- . iter ( )
152- . find ( |s| s. name ( ) . as_str ( ) == failure. rule )
153- . expect ( "failing rule not found in inputs" ) ;
154- let bool_ca = as_bool ( rule_series) ?;
155- let examples =
156- compute_examples ( bool_ca, kwargs. null_is_valid , data_inputs, MAX_EXAMPLES ) ;
157- Ok ( ( failure. rule . to_string ( ) , examples) )
158- } )
159- . collect :: < PolarsResult < HashMap < _ , _ > > > ( ) ?
160- } ;
161-
162- // Aggregate failure counts into a validation error.
163- let error = RuleValidationError :: new ( failures) ;
164- Err ( polars_err ! ( ComputeError : format!( "\n {}" , error. to_string( Some ( & kwargs. schema_name) , Some ( & examples) ) ) ) )
102+ . map ( |s| s. clone ( ) . into_column ( ) )
103+ . collect ( ) ,
104+ ) ?;
105+ let examples_from = DataFrame :: new (
106+ data_inputs[ 0 ] . len ( ) ,
107+ data_inputs
108+ . iter ( )
109+ . map ( |s| s. clone ( ) . into_column ( ) )
110+ . collect ( ) ,
111+ ) ?;
112+ let error = RuleValidationError :: new (
113+ failures,
114+ Some ( failures_from) ,
115+ Some ( examples_from) ,
116+ kwargs. primary_key_columns . unwrap_or_default ( ) ,
117+ ) ;
118+ Err ( polars_err ! ( ComputeError : format!( "\n {}" , error. to_string( Some ( & kwargs. schema_name) ) ) ) )
165119}
0 commit comments