Skip to content

Commit cd60ca6

Browse files
authored
Profiler: add StringColumnProfile (#248)
* Profiler: add StringColumnProfile. * Profiler: remove JavaObject import used for typing purposes. * Profiler: use assertAlmostEquals in test for float values, refine sorting. * Profiler: use assertAlmostEqual in test instead of assertAlmostEquals. * Profiler: add assertion for StringColumnProfile.__str__ result. * StringColumnProfile: build self.all from scratch instead of updating parent's. * StringColumnProfile: shorten __str__ result assertion. * StringColumnProfile: add assertion for __str__ result. * StringColumProfile: use dedicated df for testing. * Profile tests: apply black formatting. * StringColumProfile: refactor to subclass ColumnProfile. * StringColumProfile: add tests for None values
1 parent c1f9cff commit cd60ca6

3 files changed

Lines changed: 101 additions & 8 deletions

File tree

docs/profiles.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,5 @@ Here are the current supported functionalities of Profiles.
2222
| | property: profiles | Done |
2323
| | property: numRecords | Done |
2424
| StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
25+
| StringColumnProfile | StringColumnProfile(spark_session, column, java_column_profile) | Done |
2526
| NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |

pydeequ/profiles.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
""" Profiles file for all the Profiles classes in Deequ"""
33
import json
44
from collections import namedtuple
5+
from typing import Optional
56

67
from pyspark.sql import DataFrame, SparkSession
78
from pydeequ.analyzers import KLLParameters
@@ -241,9 +242,8 @@ def __init__(self, spark_session: SparkSession):
241242
self._numRecords = 0
242243
self.columnProfileClasses = {
243244
"StandardColumnProfile": StandardColumnProfile,
244-
"StringColumnProfile": StandardColumnProfile,
245+
"StringColumnProfile": StringColumnProfile,
245246
"NumericColumnProfile": NumericColumnProfile,
246-
247247
}
248248

249249
def _columnProfilesFromColumnRunBuilderRun(self, run):
@@ -528,3 +528,40 @@ def approxPercentiles(self):
528528
"""
529529
return self._approxPercentiles
530530

531+
532+
class StringColumnProfile(ColumnProfile):
533+
"""
534+
String Column Profile class
535+
536+
:param SparkSession spark_session: sparkSession
537+
:param str column: the designated column of which the profile is run on
538+
:param JavaObject java_column_profile: The profile mapped as a Java map
539+
"""
540+
541+
def __init__(
542+
self, spark_session: SparkSession, column: str, java_column_profile
543+
) -> None:
544+
super().__init__(spark_session, column, java_column_profile)
545+
self._minLength = get_or_else_none(java_column_profile.minLength())
546+
self._maxLength = get_or_else_none(java_column_profile.maxLength())
547+
self.all = {
548+
"completeness": self.completeness,
549+
"approximateNumDistinctValues": self.approximateNumDistinctValues,
550+
"dataType": self.dataType,
551+
"isDataTypeInferred": self.isDataTypeInferred,
552+
"typeCounts": self.typeCounts,
553+
"histogram": self.histogram,
554+
"minLength": self._minLength,
555+
"maxLength": self._maxLength,
556+
}
557+
558+
@property
559+
def minLength(self) -> Optional[int]:
560+
return self._minLength
561+
562+
@property
563+
def maxLength(self) -> Optional[int]:
564+
return self._maxLength
565+
566+
def __str__(self) -> str:
567+
return f"StringProfiles for column: {self.column}: {json.dumps(self.all, indent=4)}"

tests/test_profiles.py

Lines changed: 61 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,43 @@
11
# -*- coding: utf-8 -*-
22
import unittest
33
from pyspark.sql import Row
4-
from pydeequ.analyzers import KLLParameters
5-
from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
4+
from pydeequ.profiles import (
5+
ColumnProfilerRunBuilder,
6+
ColumnProfilerRunner,
7+
DistributionValue,
8+
StringColumnProfile,
9+
)
610
from pydeequ.analyzers import KLLParameters, DataTypeInstances
711
from tests.conftest import setup_pyspark
812

13+
914
class TestProfiles(unittest.TestCase):
1015
@classmethod
1116
def setUpClass(cls):
1217
cls.spark = setup_pyspark().appName("test-profiles-local").getOrCreate()
1318
cls.sc = cls.spark.sparkContext
14-
cls.df = cls.sc.parallelize([Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]).toDF()
19+
cls.df = cls.sc.parallelize(
20+
[Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]
21+
).toDF()
1522

1623
@classmethod
1724
def tearDownClass(cls):
1825
cls.spark.sparkContext._gateway.shutdown_callback_server()
1926
cls.spark.stop()
2027

2128
def test_setPredefinedTypes(self):
22-
result = ColumnProfilerRunner(self.spark) \
23-
.onData(self.df) \
24-
.setPredefinedTypes({'a': DataTypeInstances.Unknown, 'b': DataTypeInstances.String, 'c': DataTypeInstances.Fractional}) \
29+
result = (
30+
ColumnProfilerRunner(self.spark)
31+
.onData(self.df)
32+
.setPredefinedTypes(
33+
{
34+
"a": DataTypeInstances.Unknown,
35+
"b": DataTypeInstances.String,
36+
"c": DataTypeInstances.Fractional,
37+
}
38+
)
2539
.run()
40+
)
2641
print(result)
2742
for col, profile in result.profiles.items():
2843
print("Profiles:", profile)
@@ -76,6 +91,46 @@ def test_profile_numRecords(self):
7691
result = ColumnProfilerRunner(self.spark).onData(self.df).run()
7792
self.assertEqual(result.numRecords, 3)
7893

94+
def test_StringColumnProfile(self):
95+
df = self.sc.parallelize(
96+
[
97+
Row(a="ant", b="dragonfly"),
98+
Row(a="bee", b="earwig"),
99+
Row(a="bee", b=None),
100+
Row(a="cricket", b=None),
101+
]
102+
).toDF()
103+
result = ColumnProfilerRunner(self.spark).onData(df).run()
104+
column_profile = result.profiles["a"]
105+
self.assertIsInstance(column_profile, StringColumnProfile)
106+
self.assertEqual(column_profile.minLength, 3)
107+
self.assertEqual(column_profile.maxLength, 7)
108+
self.assertEqual(str(column_profile)[0:29], "StringProfiles for column: a:")
109+
self.assertIn('"minLength": 3', str(column_profile))
110+
111+
self.assertEqual(column_profile.completeness, 1)
112+
self.assertEqual(column_profile.approximateNumDistinctValues, 3)
113+
self.assertEqual(column_profile.typeCounts["String"], 4)
114+
self.assertEqual(column_profile.isDataTypeInferred, False)
115+
actual_histogram = sorted(column_profile.histogram, key=lambda x: x.value)
116+
self.assertEqual(len(actual_histogram), 3)
117+
expected_histogram = [
118+
DistributionValue("ant", 1, 0.25),
119+
DistributionValue("bee", 2, 0.5),
120+
DistributionValue("cricket", 1, 0.25),
121+
]
122+
for actual, expected in zip(actual_histogram, expected_histogram):
123+
self.assertEqual(actual.value, expected.value)
124+
self.assertEqual(actual.count, expected.count)
125+
self.assertAlmostEqual(actual.ratio, expected.ratio)
126+
127+
column_profile = result.profiles["b"]
128+
self.assertEqual(column_profile.completeness, 0.5)
129+
self.assertEqual(column_profile.approximateNumDistinctValues, 2)
130+
self.assertEqual(column_profile.typeCounts["String"], 2)
131+
self.assertEqual(column_profile.minLength, 0)
132+
self.assertEqual(column_profile.maxLength, 9)
133+
79134

80135
if __name__ == "__main__":
81136
unittest.main()

0 commit comments

Comments
 (0)