|
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 | """ Profiles file for all the Profiles classes in Deequ""" |
3 | | -# from pydeequ.analyzers import * |
4 | | -# from pydeequ.metrics import * |
5 | 3 | import json |
6 | 4 | from collections import namedtuple |
7 | 5 |
|
8 | 6 | from pyspark.sql import DataFrame, SparkSession |
9 | | - |
10 | 7 | from pydeequ.analyzers import KLLParameters |
11 | 8 | from pydeequ.metrics import BucketDistribution |
12 | 9 | from pydeequ.pandas_utils import ensure_pyspark_df |
| 10 | +from enum import Enum |
13 | 11 | from pydeequ.scala_utils import ( |
14 | 12 | get_or_else_none, |
15 | 13 | java_list_to_python_list, |
@@ -181,14 +179,18 @@ def setKLLParameters(self, kllParameters: KLLParameters): |
181 | 179 | self._ColumnProfilerRunBuilder.setKLLParameters(self._jvm.scala.Option.apply(kllParameters._param)) |
182 | 180 | return self |
183 | 181 |
|
184 | | - def setPredefinedTypes(self, dataTypes: dict): |
| 182 | + def setPredefinedTypes(self, dataTypesDict: dict): |
185 | 183 | """ |
186 | 184 | Set predefined data types for each column (e.g. baseline) |
187 | 185 |
|
188 | | - :param dict dataTypes: dataType map for baseline columns |
189 | | - :return: Baseline for each column |
| 186 | + :param dict dataTypesDict: map of column name to DataTypeInstance, e.g. {"columnName": DataTypeInstance}, giving the predefined data type for each baseline column.
| 187 | + :return: self (this run builder), after the predefined DataTypeInstance of each listed column has been passed on to the underlying profiler run builder.
190 | 188 | """ |
191 | | - self._ColumnProfilerRunBuilder.setPredefinedTypes(to_scala_map(self._spark_session, dataTypes)) |
| 189 | + dataType_scala_map = {} |
| 190 | + for key, value in dataTypesDict.items(): |
| 191 | + val = value._create_java_object(self._jvm) |
| 192 | + dataType_scala_map[key] = val |
| 193 | + self._ColumnProfilerRunBuilder.setPredefinedTypes(to_scala_map(self._spark_session, dataType_scala_map)) |
192 | 194 | return self |
193 | 195 |
|
194 | 196 | def useRepository(self, repository): |
@@ -513,3 +515,4 @@ def approxPercentiles(self): |
513 | 515 | :return: gets the approximate percentiles of the column |
514 | 516 | """ |
515 | 517 | return self._approxPercentiles |
| 518 | + |
0 commit comments