Skip to content

Commit 67e8dbb

Browse files
committed
Profiler: add StringColumnProfile.
1 parent c1f9cff commit 67e8dbb

3 files changed

Lines changed: 59 additions & 5 deletions

File tree

docs/profiles.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,5 @@ Here are the current supported functionalities of Profiles.
2222
| | property: profiles | Done |
2323
| | property: numRecords | Done |
2424
| StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
25+
| StringColumnProfile | StringColumnProfile(spark_session, column, java_column_profile) | Done |
2526
| NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |

pydeequ/profiles.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
""" Profiles file for all the Profiles classes in Deequ"""
33
import json
44
from collections import namedtuple
5+
from typing import Optional
56

7+
from py4j.java_gateway import JavaObject
68
from pyspark.sql import DataFrame, SparkSession
79
from pydeequ.analyzers import KLLParameters
810
from pydeequ.metrics import BucketDistribution
@@ -241,9 +243,8 @@ def __init__(self, spark_session: SparkSession):
241243
self._numRecords = 0
242244
self.columnProfileClasses = {
243245
"StandardColumnProfile": StandardColumnProfile,
244-
"StringColumnProfile": StandardColumnProfile,
246+
"StringColumnProfile": StringColumnProfile,
245247
"NumericColumnProfile": NumericColumnProfile,
246-
247248
}
248249

249250
def _columnProfilesFromColumnRunBuilderRun(self, run):
@@ -528,3 +529,36 @@ def approxPercentiles(self):
528529
"""
529530
return self._approxPercentiles
530531

532+
533+
class StringColumnProfile(StandardColumnProfile):
    """
    Column profile for string columns.

    Adds the ``minLength`` and ``maxLength`` values read from the Java
    column profile on top of everything the standard profile provides.

    :param SparkSession spark_session: sparkSession
    :param str column: the designated column of which the profile is run on
    :param JavaObject java_column_profile: The profile mapped as a Java map
    """

    def __init__(
        self, spark_session: SparkSession, column: str, java_column_profile: JavaObject
    ) -> None:
        super().__init__(spark_session, column, java_column_profile)

        # Both lengths are passed through get_or_else_none, so either may be None.
        shortest = get_or_else_none(java_column_profile.minLength())
        longest = get_or_else_none(java_column_profile.maxLength())
        self._minLength = shortest
        self._maxLength = longest

        # Publish the length metrics alongside the entries already held in ``all``.
        length_metrics = {
            "minLength": shortest,
            "maxLength": longest,
        }
        self.all.update(length_metrics)

    @property
    def minLength(self) -> Optional[int]:
        """
        Getter for the minLength value captured at construction

        :return: the stored minLength, possibly None
        """
        return self._minLength

    @property
    def maxLength(self) -> Optional[int]:
        """
        Getter for the maxLength value captured at construction

        :return: the stored maxLength, possibly None
        """
        return self._maxLength

    def __str__(self) -> str:
        """
        Render the column name followed by ``all`` as indented JSON

        :return: human-readable description of this profile
        """
        column_name = self.column
        rendered = json.dumps(self.all, indent=4)
        return f"StringProfiles for column: {column_name}: {rendered}"

tests/test_profiles.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
# -*- coding: utf-8 -*-
22
import unittest
33
from pyspark.sql import Row
4-
from pydeequ.analyzers import KLLParameters
5-
from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner
4+
from pydeequ.profiles import ColumnProfilerRunBuilder, ColumnProfilerRunner, DistributionValue, StringColumnProfile
65
from pydeequ.analyzers import KLLParameters, DataTypeInstances
76
from tests.conftest import setup_pyspark
87

@@ -11,7 +10,7 @@ class TestProfiles(unittest.TestCase):
1110
def setUpClass(cls):
1211
cls.spark = setup_pyspark().appName("test-profiles-local").getOrCreate()
1312
cls.sc = cls.spark.sparkContext
14-
cls.df = cls.sc.parallelize([Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="baz", b=3, c=None)]).toDF()
13+
cls.df = cls.sc.parallelize([Row(a="foo", b=1, c=5), Row(a="bar", b=2, c=6), Row(a="bazz", b=3, c=None)]).toDF()
1514

1615
@classmethod
1716
def tearDownClass(cls):
@@ -76,6 +75,26 @@ def test_profile_numRecords(self):
7675
result = ColumnProfilerRunner(self.spark).onData(self.df).run()
7776
self.assertEqual(result.numRecords, 3)
7877

78+
def test_StringColumnProfile(self):
    """Profile column ``a`` and check both the string-specific and standard metrics."""
    run_result = ColumnProfilerRunner(self.spark).onData(self.df).run()
    profile_a = run_result.profiles["a"]

    # String-specific part of the profile.
    self.assertIsInstance(profile_a, StringColumnProfile)
    self.assertEqual(profile_a.minLength, 3)
    self.assertEqual(profile_a.maxLength, 4)

    # Metrics shared with the standard column profile.
    self.assertEqual(profile_a.completeness, 1.0)
    self.assertEqual(profile_a.approximateNumDistinctValues, 3)
    self.assertEqual(profile_a.typeCounts["String"], 3)
    self.assertEqual(profile_a.isDataTypeInferred, False)

    # Sort the histogram before comparing with the expected entries.
    observed_histogram = sorted(profile_a.histogram)
    expected_histogram = [
        DistributionValue("bar", 1, 1/3),
        DistributionValue("bazz", 1, 1/3),
        DistributionValue("foo", 1, 1/3),
    ]
    self.assertListEqual(observed_histogram, expected_histogram)
97+
7998

8099
if __name__ == "__main__":
81100
unittest.main()

0 commit comments

Comments
 (0)