Skip to content

Commit f9fdfcc

Browse files
author
Thierry RAMORASOAVINA
committed
Add a way to add a variable to a dictionary using a complete specification
1 parent 5533702 commit f9fdfcc

File tree

6 files changed

+325
-111
lines changed

6 files changed

+325
-111
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
comments, and dictionary and variable block internal comments.
1414
- (`core`) Dictionary `Rule` class and supporting API for adding and getting
1515
rules to / from variables and variable blocks.
16+
- (`core`) New way to add a variable to a dictionary using a complete specification.
1617
- (`sklearn`) `Text` Khiops type support at the estimator level.
1718

1819
### Fixed

doc/samples/samples.rst

Lines changed: 22 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -79,40 +79,34 @@ Samples
7979
{"name": "ts", "type": "TimestampTZ"},
8080
]
8181
for var_spec in simple_variables:
82-
var = kh.Variable()
83-
var.name = var_spec["name"]
84-
var.type = var_spec["type"]
85-
root_dictionary.add_variable(var)
82+
root_dictionary.add_variable_from_spec(name=var_spec["name"], type=var_spec["type"])
8683
8784
# Create a second dictionary
8885
second_dictionary = kh.Dictionary(
8986
json_data={"name": "Service", "key": ["Id", "id_product"]}
9087
)
91-
second_dictionary.add_variable(
92-
kh.Variable(json_data={"name": "Id", "type": "Categorical"})
93-
)
94-
second_dictionary.add_variable(
95-
kh.Variable(json_data={"name": "id_product", "type": "Categorical"})
96-
)
88+
second_dictionary.add_variable_from_spec(name="Id", type="Categorical")
89+
second_dictionary.add_variable_from_spec(name="id_product", type="Categorical")
90+
9791
# Create a third dictionary
9892
third_dictionary = kh.Dictionary(json_data={"name": "Address", "key": ["Id"]})
99-
third_dictionary.add_variable(
100-
kh.Variable(json_data={"name": "StreetNumber", "type": "Numerical"})
101-
)
102-
third_dictionary.add_variable(
103-
kh.Variable(json_data={"name": "StreetName", "type": "Categorical"})
104-
)
105-
third_dictionary.add_variable(
106-
kh.Variable(json_data={"name": "id_city", "type": "Categorical"})
93+
third_dictionary.add_variable_from_spec(name="StreetNumber", type="Numerical")
94+
third_dictionary.add_variable_from_spec(name="StreetName", type="Categorical")
95+
third_dictionary.add_variable_from_spec(name="id_city", type="Categorical")
96+
# Add a variable with a rule
97+
third_dictionary.add_variable_from_spec(
98+
name="computed",
99+
type="Numerical",
100+
rule=kh.Rule("Ceil", kh.Rule("Product", 3, kh.Rule("Random()"))),
107101
)
108102
109103
# Add the variables used in a multi-table context in the first dictionary.
110104
# They link the root dictionary to the additional ones
111-
root_dictionary.add_variable(
112-
kh.Variable(json_data={"name": "Services", "type": "Table(Service)"})
105+
root_dictionary.add_variable_from_spec(
106+
name="Services", type="Table", object_type="Service"
113107
)
114-
root_dictionary.add_variable(
115-
kh.Variable(json_data={"name": "Address", "type": "Entity(Address)"})
108+
root_dictionary.add_variable_from_spec(
109+
name="Address", type="Entity", object_type="Address"
116110
)
117111
118112
# Create a DictionaryDomain (set of dictionaries)
@@ -651,25 +645,19 @@ Samples
651645
652646
# Add a random fold index variable to the learning dictionary
653647
fold_number = 5
654-
fold_index_variable = kh.Variable()
655-
fold_index_variable.name = "FoldIndex"
656-
fold_index_variable.type = "Numerical"
657-
fold_index_variable.used = False
658-
dictionary.add_variable(fold_index_variable)
648+
dictionary.add_variable_from_spec(name="FoldIndex", type="Numerical", used=False)
659649
660650
# Create fold indexing rule and set it on `fold_index_variable`
661-
dictionary.get_variable(fold_index_variable.name).set_rule(
651+
fold_index_variable = dictionary.get_variable("FoldIndex")
652+
fold_index_variable.set_rule(
662653
kh.Rule("Ceil", kh.Rule("Product", fold_number, kh.Rule("Random()"))),
663654
)
664655
665656
# Add variables that indicate if the instance is in the train dataset:
666657
for fold_index in range(1, fold_number + 1):
667-
is_in_train_dataset_variable = kh.Variable()
668-
is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index)
669-
is_in_train_dataset_variable.type = "Numerical"
670-
is_in_train_dataset_variable.used = False
671-
dictionary.add_variable(is_in_train_dataset_variable)
672-
dictionary.get_variable(is_in_train_dataset_variable.name).set_rule(
658+
name = "IsInTrainDataset" + str(fold_index)
659+
dictionary.add_variable_from_spec(name=name, type="Numerical", used=False)
660+
dictionary.get_variable(name).set_rule(
673661
kh.Rule("NEQ", fold_index_variable, fold_index),
674662
)
675663

khiops/core/dictionary.py

Lines changed: 176 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from khiops.core.exceptions import KhiopsJSONError
2424
from khiops.core.internals.common import (
2525
deprecation_message,
26+
is_dict_like,
2627
is_string_like,
2728
type_error_message,
2829
)
@@ -43,10 +44,6 @@ def _format_name(name):
4344
4445
Otherwise, it returns the name between backquotes (backquotes within are doubled)
4546
"""
46-
# Check that the type of name is string or bytes
47-
if not is_string_like(name):
48-
raise TypeError(type_error_message("name", name, "string-like"))
49-
5047
# Check if the name is an identifier
5148
# Python isalnum is not used because of utf-8 encoding (accentuated chars
5249
# are considered alphanumeric)
@@ -81,6 +78,70 @@ def _quote_value(value):
8178
return quoted_value
8279

8380

81+
def _check_name(name):
    r"""Validates a variable name against the Khiops core name constraints

    Both plain strings and bytes are accepted as input.

    The Khiops core forbids a name:

    - with a length outside the [1, 128] interval
    - containing a line feed (``\n``)
    - with leading or trailing whitespace.

    This function enforces the maximum length, the absence of line feeds and
    the absence of surrounding whitespace.

    NOTE(review): the lower length bound is not enforced here, so an empty
    name passes this check. `Dictionary.add_variable_from_spec` rejects empty
    names on its own before building the variable; confirm whether this
    function should reject them as well.

    Parameters
    ----------
    name : str or bytes
        Name to be validated.

    Raises
    ------
    `TypeError`
        If ``name`` is not string-like.
    `ValueError`
        If ``name`` is longer than 128 items, contains a line feed, or has
        leading or trailing whitespace.
    """
    # Check that the type of name is string or bytes
    if not is_string_like(name):
        raise TypeError(type_error_message("name", name, "string-like"))

    # The ``in`` operator needs a needle of the same kind as the name
    # (str in str, bytes in bytes), so pick the matching line-feed literal
    line_feed = "\n" if isinstance(name, str) else b"\n"

    # Check the name complies with the Khiops core constraints
    if len(name) > 128 or line_feed in name or name != name.strip():
        raise ValueError(
            f"Variable name '{name}' cannot be accepted "
            "(invalid length or characters)"
        )
116+
117+
118+
def _is_valid_type(type_str):
    """Tells whether ``type_str`` is a known variable type

    A type is known when it is a native type, an object (relational) type, or
    one of the internal types ``"TextList"`` and ``"Structure"``.
    """
    if _is_native_type(type_str) or _is_object_type(type_str):
        return True

    # Neither native nor relational: only the internal types remain
    internal_types = ("TextList", "Structure")
    return type_str in internal_types
125+
126+
127+
def _is_native_type(type_str):
128+
"""Checks whether the type is native (not internal or relational)"""
129+
return type_str in [
130+
"Categorical",
131+
"Numerical",
132+
"Time",
133+
"Date",
134+
"Timestamp",
135+
"TimestampTZ",
136+
"Text",
137+
]
138+
139+
140+
def _is_object_type(type_str):
141+
"""Checks whether the type is an object one (relational)"""
142+
return type_str in ["Entity", "Table"]
143+
144+
84145
class DictionaryDomain(KhiopsJSONObject):
85146
"""Main class containing the information of a Khiops dictionary file
86147
@@ -769,6 +830,102 @@ def add_variable(self, variable):
769830
self.variables.append(variable)
770831
self._variables_by_name[variable.name] = variable
771832

833+
def add_variable_from_spec(
    self,
    name,
    type,
    label="",
    used=True,
    object_type=None,
    structure_type=None,
    rule=None,
    meta_data=None,
):
    """Adds a variable to this dictionary using a complete specification

    The arguments are validated, a new `Variable` is built from them and it is
    then registered with `add_variable`.

    Parameters
    ----------
    name : str
        Variable name. It must be non-empty and not already used by a variable
        of this dictionary.
    type : str
        Variable type. See `Variable`.
    label : str, default ""
        Label of the variable.
    used : bool, default ``True``
        Usage status of the variable.
    object_type : str, optional
        Object type, e.g. ``"Service"`` for a ``Table(Service)`` variable.
        Mandatory when ``type`` is "Entity" or "Table"; rejected when ``type``
        is a native type.
    structure_type : str, optional
        Structure type. Mandatory when ``type`` is "Structure"; rejected when
        ``type`` is a native type.
    rule : `Rule`, optional
        Variable rule.
    meta_data : dict, optional
        A Python dictionary which holds the metadata specification.
        The dictionary keys are str. The values can be str, bool, float or int.

    Raises
    ------
    `ValueError`
        - If the variable name is empty or does not comply
          with the formatting constraints.
        - If there is already a variable with the same name.
        - If the given variable type is unknown.
        - If a native type is given 'object_type' or 'structure_type'.
        - If 'object_type' is missing for an "Entity" or "Table" type.
        - If 'structure_type' is missing for a "Structure" type.
    `TypeError`
        - If 'meta_data' is not dict-like.
        - If 'object_type' or 'structure_type' is not string-like.
        - If 'rule' is not a `Rule`.
    """
    # NOTE: the ``type`` parameter shadows the builtin inside this method. It
    # is kept because callers pass it by keyword (``type=...``).

    # Value checks on the name and the type
    if not name:
        raise ValueError(
            f"Cannot add to dictionary unnamed variable (name = '{name}')"
        )
    if name in self._variables_by_name:
        raise ValueError(f"Dictionary already has a variable named '{name}'")
    if not _is_valid_type(type):
        raise ValueError(f"Invalid type '{type}'")

    # Consistency between the type and its complement
    if _is_native_type(type) and (object_type or structure_type):
        raise ValueError(
            f"Native type '{type}' "
            "cannot have 'object_type' or 'structure_type'"
        )
    if _is_object_type(type) and object_type is None:
        raise ValueError(f"'object_type' must be provided for type '{type}'")
    if type == "Structure" and structure_type is None:
        raise ValueError(f"'structure_type' must be provided for type '{type}'")

    # Type checks of the optional arguments (only when they are provided)
    if meta_data is not None and not is_dict_like(meta_data):
        raise TypeError(type_error_message("meta_data", meta_data, "dict-like"))
    if object_type is not None and not is_string_like(object_type):
        raise TypeError(
            type_error_message("object_type", object_type, "string-like")
        )
    if structure_type is not None and not is_string_like(structure_type):
        raise TypeError(
            type_error_message("structure_type", structure_type, "string-like")
        )
    if rule is not None and not isinstance(rule, Rule):
        raise TypeError(type_error_message("rule", rule, Rule))

    # Variable initialization
    # Assigning the name goes through the `Variable.name` property, which
    # validates the name format
    variable = Variable()
    variable.name = name
    variable.type = type
    variable.used = used
    if meta_data is not None:
        for key, value in meta_data.items():
            variable.meta_data.add_value(key, value)
    variable.label = label
    if object_type is not None:
        variable.object_type = object_type
    if structure_type is not None:
        variable.structure_type = structure_type
    if rule is not None:
        variable.set_rule(rule)

    # Register the fully built variable in this dictionary
    self.add_variable(variable)
928+
772929
def remove_variable(self, variable_name):
773930
"""Removes the specified variable from this dictionary
774931
@@ -1017,7 +1174,9 @@ def __init__(self, json_data=None):
10171174
raise TypeError(type_error_message("json_data", json_data, dict))
10181175

10191176
# Main attributes
1020-
self.name = ""
1177+
# The variable name is a protected attribute accessible only via a property
1178+
# to ensure it is always valid
1179+
self._name = ""
10211180
self.label = ""
10221181
self.comments = []
10231182
self.used = True
@@ -1058,7 +1217,7 @@ def __init__(self, json_data=None):
10581217
self.type = json_data.get("type")
10591218

10601219
# Initialize complement of the type
1061-
if self.type in ("Entity", "Table"):
1220+
if _is_object_type(self.type):
10621221
self.object_type = json_data.get("objectType")
10631222
elif self.type == "Structure":
10641223
self.structure_type = json_data.get("structureType")
@@ -1072,7 +1231,7 @@ def __init__(self, json_data=None):
10721231
self.meta_data = MetaData(json_meta_data)
10731232

10741233
def __repr__(self):
1075-
"""Returns a human readable string representation"""
1234+
"""Returns a human-readable string representation"""
10761235
return f"Variable ({self.name})"
10771236

10781237
def __str__(self):
@@ -1081,6 +1240,15 @@ def __str__(self):
10811240
self.write(writer)
10821241
return str(stream.getvalue(), encoding="utf8", errors="replace")
10831242

1243+
@property
def name(self):
    """str or bytes : Name of the variable

    Assigning to this property validates the new value with `_check_name`
    before storing it.
    """
    return self._name

@name.setter
def name(self, value):
    # Validate first so that an invalid name never replaces the current one
    _check_name(value)
    self._name = value
1251+
10841252
def copy(self):
10851253
"""Copies this variable instance
10861254
@@ -1179,7 +1347,7 @@ def full_type(self):
11791347
basic.
11801348
"""
11811349
full_type = self.type
1182-
if self.type in ("Entity", "Table"):
1350+
if _is_object_type(self.type):
11831351
full_type += f"({self.object_type})"
11841352
elif self.type == "Structure":
11851353
full_type += f"({self.structure_type})"

0 commit comments

Comments
 (0)