Skip to content

Commit fac0079

Browse files
author
Thierry RAMORASOAVINA
committed
Add a way to add a variable to a dictionary using a complete specification
1 parent 1b0b3dc commit fac0079

File tree

6 files changed

+346
-114
lines changed

6 files changed

+346
-114
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
comments, and dictionary and variable block internal comments.
1414
- (`core`) Dictionary `Rule` class and supporting API for adding and getting
1515
rules to / from variables and variable blocks.
16+
- (`core`) New `Dictionary.add_variable_from_spec` method to add a variable to a dictionary from a complete specification.
1617
- (`sklearn`) `Text` Khiops type support at the estimator level.
1718

1819
### Fixed

doc/samples/samples.rst

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -79,40 +79,28 @@ Samples
7979
{"name": "ts", "type": "TimestampTZ"},
8080
]
8181
for var_spec in simple_variables:
82-
var = kh.Variable()
83-
var.name = var_spec["name"]
84-
var.type = var_spec["type"]
85-
root_dictionary.add_variable(var)
82+
root_dictionary.add_variable_from_spec(name=var_spec["name"], type=var_spec["type"])
8683
8784
# Create a second dictionary
8885
second_dictionary = kh.Dictionary(
8986
json_data={"name": "Service", "key": ["Id", "id_product"]}
9087
)
91-
second_dictionary.add_variable(
92-
kh.Variable(json_data={"name": "Id", "type": "Categorical"})
93-
)
94-
second_dictionary.add_variable(
95-
kh.Variable(json_data={"name": "id_product", "type": "Categorical"})
96-
)
88+
second_dictionary.add_variable_from_spec(name="Id", type="Categorical")
89+
second_dictionary.add_variable_from_spec(name="id_product", type="Categorical")
90+
9791
# Create a third dictionary
9892
third_dictionary = kh.Dictionary(json_data={"name": "Address", "key": ["Id"]})
99-
third_dictionary.add_variable(
100-
kh.Variable(json_data={"name": "StreetNumber", "type": "Numerical"})
101-
)
102-
third_dictionary.add_variable(
103-
kh.Variable(json_data={"name": "StreetName", "type": "Categorical"})
104-
)
105-
third_dictionary.add_variable(
106-
kh.Variable(json_data={"name": "id_city", "type": "Categorical"})
107-
)
93+
third_dictionary.add_variable_from_spec(name="StreetNumber", type="Numerical")
94+
third_dictionary.add_variable_from_spec(name="StreetName", type="Categorical")
95+
third_dictionary.add_variable_from_spec(name="id_city", type="Categorical")
10896
10997
# Add the variables used in a multi-table context in the first dictionary.
11098
# They link the root dictionary to the additional ones
111-
root_dictionary.add_variable(
112-
kh.Variable(json_data={"name": "Services", "type": "Table(Service)"})
99+
root_dictionary.add_variable_from_spec(
100+
name="Services", type="Table", object_type="Service"
113101
)
114-
root_dictionary.add_variable(
115-
kh.Variable(json_data={"name": "Address", "type": "Entity(Address)"})
102+
root_dictionary.add_variable_from_spec(
103+
name="Address", type="Entity", object_type="Address"
116104
)
117105
118106
# Create a DictionaryDomain (set of dictionaries)
@@ -651,25 +639,19 @@ Samples
651639
652640
# Add a random fold index variable to the learning dictionary
653641
fold_number = 5
654-
fold_index_variable = kh.Variable()
655-
fold_index_variable.name = "FoldIndex"
656-
fold_index_variable.type = "Numerical"
657-
fold_index_variable.used = False
658-
dictionary.add_variable(fold_index_variable)
642+
dictionary.add_variable_from_spec(name="FoldIndex", type="Numerical", used=False)
659643
660644
# Create fold indexing rule and set it on `fold_index_variable`
661-
dictionary.get_variable(fold_index_variable.name).set_rule(
645+
fold_index_variable = dictionary.get_variable("FoldIndex")
646+
fold_index_variable.set_rule(
662647
kh.Rule("Ceil", kh.Rule("Product", fold_number, kh.Rule("Random()"))),
663648
)
664649
665650
# Add variables that indicate if the instance is in the train dataset:
666651
for fold_index in range(1, fold_number + 1):
667-
is_in_train_dataset_variable = kh.Variable()
668-
is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index)
669-
is_in_train_dataset_variable.type = "Numerical"
670-
is_in_train_dataset_variable.used = False
671-
dictionary.add_variable(is_in_train_dataset_variable)
672-
dictionary.get_variable(is_in_train_dataset_variable.name).set_rule(
652+
name = "IsInTrainDataset" + str(fold_index)
653+
dictionary.add_variable_from_spec(name=name, type="Numerical", used=False)
654+
dictionary.get_variable(name).set_rule(
673655
kh.Rule("NEQ", fold_index_variable, fold_index),
674656
)
675657

khiops/core/dictionary.py

Lines changed: 205 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from khiops.core.exceptions import KhiopsJSONError
2424
from khiops.core.internals.common import (
2525
deprecation_message,
26+
is_dict_like,
27+
is_list_like,
2628
is_string_like,
2729
type_error_message,
2830
)
@@ -43,10 +45,6 @@ def _format_name(name):
4345
4446
Otherwise, it returns the name between backquotes (backquotes within are doubled)
4547
"""
46-
# Check that the type of name is string or bytes
47-
if not is_string_like(name):
48-
raise TypeError(type_error_message("name", name, "string-like"))
49-
5048
# Check if the name is an identifier
5149
# Python isalnum is not used because of utf-8 encoding (accentuated chars
5250
# are considered alphanumeric)
@@ -81,6 +79,70 @@ def _quote_value(value):
8179
return quoted_value
8280

8381

82+
def _check_name(name):
    r"""Validates a variable name against the Khiops core name constraints

    Both plain strings and bytes are accepted as input. A name is rejected
    when it

    - is longer than 128 characters (or bytes),
    - contains a line feed (``\n``), or
    - starts or ends with whitespace.

    .. note::
        The Khiops core also forbids empty names, but an empty name is *not*
        rejected by this function. Callers that require a non-empty name
        must check it themselves.

    Parameters
    ----------
    name : str or bytes
        Name to be validated.

    Raises
    ------
    `TypeError`
        If the name is neither a string nor a bytes object.
    `ValueError`
        If the name does not comply with the constraints listed above.
    """
    # Only string-like names (str or bytes) can be validated
    if not is_string_like(name):
        raise TypeError(type_error_message("name", name, "string-like"))

    # The line feed must be searched with a needle of the same kind as the name
    if isinstance(name, str):
        has_line_feed = "\n" in name
    else:
        assert isinstance(name, bytes)
        has_line_feed = b"\n" in name

    # NOTE(review): the documented lower bound of the length (1) is not
    # enforced here - confirm whether empty names should be rejected as well
    is_too_long = len(name) > 128
    if is_too_long or has_line_feed or name.strip() != name:
        raise ValueError(
            f"Variable name '{name}' cannot be accepted "
            "(invalid length or characters)"
        )
117+
118+
119+
def _is_valid_type(type_str):
    """Tells whether ``type_str`` names a known variable type

    A type is known when it is a native type, an object (relational) type or
    one of the internal types ``"TextList"`` and ``"Structure"``.
    """
    if _is_native_type(type_str):
        return True
    if _is_object_type(type_str):
        return True

    # Internal types
    return type_str in ["TextList", "Structure"]
126+
127+
128+
def _is_native_type(type_str):
    """Tells whether ``type_str`` names a native type

    Native types are the ones that are neither internal ("TextList",
    "Structure") nor relational ("Entity", "Table").
    """
    native_type_names = (
        "Categorical",
        "Numerical",
        "Time",
        "Date",
        "Timestamp",
        "TimestampTZ",
        "Text",
    )
    return type_str in native_type_names
139+
140+
141+
def _is_object_type(type_str):
    """Tells whether ``type_str`` names an object (relational) type

    The object types are "Entity" and "Table".
    """
    object_type_names = ("Entity", "Table")
    return type_str in object_type_names
144+
145+
84146
class DictionaryDomain(KhiopsJSONObject):
85147
"""Main class containing the information of a Khiops dictionary file
86148
@@ -769,6 +831,130 @@ def add_variable(self, variable):
769831
self.variables.append(variable)
770832
self._variables_by_name[variable.name] = variable
771833

834+
def add_variable_from_spec(
    self,
    name,
    type,
    label="",
    used=True,
    object_type=None,
    structure_type=None,
    rule=None,
    meta_data=None,
):
    """Adds a variable to this dictionary using a complete specification

    Parameters
    ----------
    name : str
        Variable name.
    type : str
        Variable type. See `Variable`.
    label : str, default ""
        Label of the variable.
    used : bool, default ``True``
        Usage status of the variable.
    object_type : str, optional
        Object type (e.g. the name of the referenced dictionary). Mandatory
        and non-empty when ``type`` is "Entity" or "Table". Must not be set
        when ``type`` is a native type.
    structure_type : str, optional
        Structure type, meant for variables of type "Structure". Must not
        be set when ``type`` is a native type.
    rule : str, optional
        Variable rule (in verbatim).
    meta_data : dict, optional
        A Python dictionary which holds the metadata specification
        with the following mandatory keys:

        - keys : list
            list of meta-data keys
        - values : list
            list of meta-data values.
            The values can be str, bool, float or int.

        Both lists must have the same size.

    Raises
    ------
    `TypeError`
        - If ``label``, ``object_type``, ``structure_type`` or ``rule`` is
          set and is not string-like.
        - If ``meta_data`` is not dict-like, or if its "keys" or "values"
          entry is not list-like.
    `ValueError`
        - If the variable name is empty or does not comply
          with the formatting constraints.
        - If there is already a variable with the same name.
        - If the given variable type is unknown.
        - If a native type is given 'object_type' or 'structure_type'.
        - If an "Entity" or "Table" type is given no (or an empty)
          'object_type'.
        - If 'meta_data' lacks the "keys" or "values" entry, or if these
          entries do not have the same size.
    """
    # Name checks
    if not name:
        raise ValueError(
            "Cannot add to dictionary unnamed variable " f"(name = '{name}')"
        )
    if name in self._variables_by_name:
        raise ValueError(f"Dictionary already has a variable named '{name}'")

    # Type checks of the optional string-like parameters
    # They are done first so the emptiness tests below apply to real strings
    optional_string_params = (
        ("label", label),
        ("object_type", object_type),
        ("structure_type", structure_type),
        ("rule", rule),
    )
    for param_name, param_value in optional_string_params:
        if param_value is not None and not is_string_like(param_value):
            raise TypeError(
                type_error_message(param_name, param_value, "string-like")
            )

    # Variable type checks
    if not _is_valid_type(type):
        raise ValueError(f"Invalid type '{type}'")
    if _is_native_type(type) and (object_type or structure_type):
        raise ValueError(
            f"Native type '{type}' "
            "cannot have 'object_type' or 'structure_type'"
        )
    # An empty object type is as unusable as a missing one: it is rejected too
    if _is_object_type(type) and not object_type:
        raise ValueError(f"'object_type' must be provided for type '{type}'")
    # NOTE(review): a "Structure" variable without 'structure_type' is
    # accepted here - confirm whether it should be rejected as well

    # Meta-data specification checks
    if meta_data is not None:
        if not is_dict_like(meta_data):
            raise TypeError(type_error_message("meta_data", meta_data, "dict-like"))
        if "keys" not in meta_data or "values" not in meta_data:
            raise ValueError(
                "'meta_data' does not contain "
                "the mandatory keys 'keys' and 'values'"
            )
        for entry in ("keys", "values"):
            if not is_list_like(meta_data[entry]):
                raise TypeError(
                    type_error_message(
                        f"meta_data['{entry}']", meta_data[entry], "list-like"
                    )
                )
        if len(meta_data["keys"]) != len(meta_data["values"]):
            raise ValueError(
                "'meta_data' keys and values " "do not have the same size"
            )

    # Variable initialization
    # The dictionary is modified only once the variable is fully built
    variable = Variable()
    variable.name = name
    variable.type = type
    variable.used = used
    if meta_data is not None:
        for key, value in zip(meta_data["keys"], meta_data["values"]):
            variable.meta_data.add_value(key, value)
    if label is not None:
        variable.label = label
    if object_type is not None:
        variable.object_type = object_type
    if structure_type is not None:
        variable.structure_type = structure_type
    if rule is not None:
        variable.rule = Rule(verbatim=rule)
    self.add_variable(variable)
957+
772958
def remove_variable(self, variable_name):
773959
"""Removes the specified variable from this dictionary
774960
@@ -1017,7 +1203,9 @@ def __init__(self, json_data=None):
10171203
raise TypeError(type_error_message("json_data", json_data, dict))
10181204

10191205
# Main attributes
1020-
self.name = ""
1206+
# The variable name is a protected attribute accessible only via a property
1207+
# to ensure it is always valid
1208+
self._name = ""
10211209
self.label = ""
10221210
self.comments = []
10231211
self.used = True
@@ -1058,7 +1246,7 @@ def __init__(self, json_data=None):
10581246
self.type = json_data.get("type")
10591247

10601248
# Initialize complement of the type
1061-
if self.type in ("Entity", "Table"):
1249+
if _is_object_type(self.type):
10621250
self.object_type = json_data.get("objectType")
10631251
elif self.type == "Structure":
10641252
self.structure_type = json_data.get("structureType")
@@ -1072,7 +1260,7 @@ def __init__(self, json_data=None):
10721260
self.meta_data = MetaData(json_meta_data)
10731261

10741262
def __repr__(self):
1075-
"""Returns a human readable string representation"""
1263+
"""Returns a human-readable string representation"""
10761264
return f"Variable ({self.name})"
10771265

10781266
def __str__(self):
@@ -1081,6 +1269,15 @@ def __str__(self):
10811269
self.write(writer)
10821270
return str(stream.getvalue(), encoding="utf8", errors="replace")
10831271

1272+
@property
def name(self):
    """str or bytes : Name of the variable

    The value is validated with ``_check_name`` each time it is set through
    this property.
    """
    return self._name

@name.setter
def name(self, value):
    # Validate before storing: a rejected value must not replace the
    # current name
    _check_name(value)
    self._name = value
1280+
10841281
def copy(self):
10851282
"""Copies this variable instance
10861283
@@ -1179,7 +1376,7 @@ def full_type(self):
11791376
basic.
11801377
"""
11811378
full_type = self.type
1182-
if self.type in ("Entity", "Table"):
1379+
if _is_object_type(self.type):
11831380
full_type += f"({self.object_type})"
11841381
elif self.type == "Structure":
11851382
full_type += f"({self.structure_type})"

0 commit comments

Comments
 (0)