11# -*- coding: utf-8 -*-
22import unittest
33from pyspark .sql import Row
4- from pydeequ .analyzers import KLLParameters
5- from pydeequ .profiles import ColumnProfilerRunBuilder , ColumnProfilerRunner
4+ from pydeequ .profiles import ColumnProfilerRunBuilder , ColumnProfilerRunner , DistributionValue , StringColumnProfile
65from pydeequ .analyzers import KLLParameters , DataTypeInstances
76from tests .conftest import setup_pyspark
87
@@ -11,7 +10,7 @@ class TestProfiles(unittest.TestCase):
1110 def setUpClass (cls ):
1211 cls .spark = setup_pyspark ().appName ("test-profiles-local" ).getOrCreate ()
1312 cls .sc = cls .spark .sparkContext
14- cls .df = cls .sc .parallelize ([Row (a = "foo" , b = 1 , c = 5 ), Row (a = "bar" , b = 2 , c = 6 ), Row (a = "baz " , b = 3 , c = None )]).toDF ()
13+ cls .df = cls .sc .parallelize ([Row (a = "foo" , b = 1 , c = 5 ), Row (a = "bar" , b = 2 , c = 6 ), Row (a = "bazz " , b = 3 , c = None )]).toDF ()
1514
1615 @classmethod
1716 def tearDownClass (cls ):
@@ -76,6 +75,26 @@ def test_profile_numRecords(self):
7675 result = ColumnProfilerRunner (self .spark ).onData (self .df ).run ()
7776 self .assertEqual (result .numRecords , 3 )
7877
def test_StringColumnProfile(self):
    """Check the profile reported for column "a" of the class-level fixture.

    Runs the column profiler over ``self.df`` and verifies, in order: the
    profile type, the string length bounds, the generic column metrics and
    the value histogram.
    """
    profiles = ColumnProfilerRunner(self.spark).onData(self.df).run().profiles
    profile_a = profiles["a"]

    # String-specific part of the profile.
    self.assertIsInstance(profile_a, StringColumnProfile)
    self.assertEqual(profile_a.minLength, 3)
    self.assertEqual(profile_a.maxLength, 4)

    # Metrics every column profile exposes.
    self.assertEqual(profile_a.completeness, 1.0)
    self.assertEqual(profile_a.approximateNumDistinctValues, 3)
    self.assertEqual(profile_a.typeCounts["String"], 3)
    self.assertEqual(profile_a.isDataTypeInferred, False)

    # Each expected value occurs once, i.e. in one third of the rows.
    expected_histogram = [
        DistributionValue(value, 1, 1 / 3) for value in ("bar", "bazz", "foo")
    ]
    # Sort before comparing so the check does not depend on the order in
    # which the histogram lists its entries.
    actual_histogram = sorted(profile_a.histogram)
    self.assertListEqual(actual_histogram, expected_histogram)
97+
7998
if __name__ == "__main__":
    # Executed as a script: hand control to unittest's command-line runner.
    unittest.main()
0 commit comments