Skip to content

Commit 51feb8a

Browse files
author
Thierry RAMORASOAVINA
committed
Add a way to add a variable to a dictionary using a complete specification
1 parent 1b0b3dc commit 51feb8a

File tree

13 files changed

+345
-129
lines changed

13 files changed

+345
-129
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
comments, and dictionary and variable block internal comments.
1414
- (`core`) Dictionary `Rule` class and supporting API for adding and getting
1515
rules to / from variables and variable blocks.
16+
- (`core`) New way to add a variable to a dictionary using a complete specification.
1617
- (`sklearn`) `Text` Khiops type support at the estimator level.
1718

1819
### Fixed

doc/samples/samples.rst

Lines changed: 17 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -79,40 +79,28 @@ Samples
7979
{"name": "ts", "type": "TimestampTZ"},
8080
]
8181
for var_spec in simple_variables:
82-
var = kh.Variable()
83-
var.name = var_spec["name"]
84-
var.type = var_spec["type"]
85-
root_dictionary.add_variable(var)
82+
root_dictionary.add_variable_from_spec(name=var_spec["name"], type=var_spec["type"])
8683
8784
# Create a second dictionary
8885
second_dictionary = kh.Dictionary(
8986
json_data={"name": "Service", "key": ["Id", "id_product"]}
9087
)
91-
second_dictionary.add_variable(
92-
kh.Variable(json_data={"name": "Id", "type": "Categorical"})
93-
)
94-
second_dictionary.add_variable(
95-
kh.Variable(json_data={"name": "id_product", "type": "Categorical"})
96-
)
88+
second_dictionary.add_variable_from_spec(name="Id", type="Categorical")
89+
second_dictionary.add_variable_from_spec(name="id_product", type="Categorical")
90+
9791
# Create a third dictionary
9892
third_dictionary = kh.Dictionary(json_data={"name": "Address", "key": ["Id"]})
99-
third_dictionary.add_variable(
100-
kh.Variable(json_data={"name": "StreetNumber", "type": "Numerical"})
101-
)
102-
third_dictionary.add_variable(
103-
kh.Variable(json_data={"name": "StreetName", "type": "Categorical"})
104-
)
105-
third_dictionary.add_variable(
106-
kh.Variable(json_data={"name": "id_city", "type": "Categorical"})
107-
)
93+
third_dictionary.add_variable_from_spec(name="StreetNumber", type="Numerical")
94+
third_dictionary.add_variable_from_spec(name="StreetName", type="Categorical")
95+
third_dictionary.add_variable_from_spec(name="id_city", type="Categorical")
10896
10997
# Add the variables used in a multi-table context in the first dictionary.
11098
# They link the root dictionary to the additional ones
111-
root_dictionary.add_variable(
112-
kh.Variable(json_data={"name": "Services", "type": "Table(Service)"})
99+
root_dictionary.add_variable_from_spec(
100+
name="Services", type="Table", object_type="Service"
113101
)
114-
root_dictionary.add_variable(
115-
kh.Variable(json_data={"name": "Address", "type": "Entity(Address)"})
102+
root_dictionary.add_variable_from_spec(
103+
name="Address", type="Entity", object_type="Address"
116104
)
117105
118106
# Create a DictionaryDomain (set of dictionaries)
@@ -651,25 +639,19 @@ Samples
651639
652640
# Add a random fold index variable to the learning dictionary
653641
fold_number = 5
654-
fold_index_variable = kh.Variable()
655-
fold_index_variable.name = "FoldIndex"
656-
fold_index_variable.type = "Numerical"
657-
fold_index_variable.used = False
658-
dictionary.add_variable(fold_index_variable)
642+
dictionary.add_variable_from_spec(name="FoldIndex", type="Numerical", used=False)
659643
660644
# Create fold indexing rule and set it on `fold_index_variable`
661-
dictionary.get_variable(fold_index_variable.name).set_rule(
645+
fold_index_variable = dictionary.get_variable("FoldIndex")
646+
fold_index_variable.set_rule(
662647
kh.Rule("Ceil", kh.Rule("Product", fold_number, kh.Rule("Random()"))),
663648
)
664649
665650
# Add variables that indicate if the instance is in the train dataset:
666651
for fold_index in range(1, fold_number + 1):
667-
is_in_train_dataset_variable = kh.Variable()
668-
is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index)
669-
is_in_train_dataset_variable.type = "Numerical"
670-
is_in_train_dataset_variable.used = False
671-
dictionary.add_variable(is_in_train_dataset_variable)
672-
dictionary.get_variable(is_in_train_dataset_variable.name).set_rule(
652+
name = "IsInTrainDataset" + str(fold_index)
653+
dictionary.add_variable_from_spec(name=name, type="Numerical", used=False)
654+
dictionary.get_variable(name).set_rule(
673655
kh.Rule("NEQ", fold_index_variable, fold_index),
674656
)
675657

khiops/core/dictionary.py

Lines changed: 195 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323
from khiops.core.exceptions import KhiopsJSONError
2424
from khiops.core.internals.common import (
2525
deprecation_message,
26+
is_dict_like,
27+
is_list_like,
2628
is_string_like,
2729
type_error_message,
2830
)
@@ -43,25 +45,17 @@ def _format_name(name):
4345
4446
Otherwise, it returns the name between backquotes (backquotes within are doubled)
4547
"""
46-
# Check that the type of name is string or bytes
47-
if not is_string_like(name):
48-
raise TypeError(type_error_message("name", name, "string-like"))
48+
is_valid_identifier = _check_name(name)
4949

50-
# Check if the name is an identifier
51-
# Python isalnum is not used because of utf-8 encoding (accentuated chars
52-
# are considered alphanumeric)
5350
# Return original name if is an identifier, otherwise between backquotes
54-
identifier_pattern = r"^[a-zA-Z_][a-zA-Z0-9_]*"
55-
str_identifier_regex = re.compile(identifier_pattern)
56-
bytes_identifier_regex = re.compile(bytes(identifier_pattern, encoding="ascii"))
5751
if isinstance(name, str):
58-
if str_identifier_regex.fullmatch(name) is not None:
52+
if is_valid_identifier:
5953
formatted_name = name
6054
else:
6155
formatted_name = "`" + name.replace("`", "``") + "`"
6256
else:
6357
assert isinstance(name, bytes)
64-
if bytes_identifier_regex.fullmatch(name) is not None:
58+
if is_valid_identifier:
6559
formatted_name = name
6660
else:
6761
formatted_name = b"`" + name.replace(b"`", b"``") + b"`"
@@ -81,6 +75,62 @@ def _quote_value(value):
8175
return quoted_value
8276

8377

78+
def _check_name(name):
    r"""Checks whether the variable name is a valid identifier

    Plain string or bytes are both accepted as input.
    Please note the Khiops core forbids a name

    - with a length outside the [1,128] interval
    - containing a simple (Unix) carriage-return (\n)
    - with leading and trailing spaces
      (\s in Perl-Compatible-Regular-Expressions syntax).

    This function must check at least these constraints.

    Parameters
    ----------
    name : str or bytes
        The candidate variable name.

    Returns
    -------
    bool
        ``True`` if the whole name matches the extended identifier pattern,
        ``False`` otherwise.

    Raises
    ------
    `TypeError`
        If ``name`` is not string-like.
    """
    # Check that the type of name is string or bytes
    if not is_string_like(name):
        raise TypeError(type_error_message("name", name, "string-like"))

    # Accented characters (code points between 128 and 255) are also accepted
    # BUT NOT the Greek ones.
    # The pattern has no anchors: ``fullmatch`` is used below because ``$``
    # also matches just before a trailing "\n", which would let through a
    # name ending with a newline.
    extended_identifier_pattern = r"[a-zA-Z_\x80-\xFF][a-zA-Z0-9_\x80-\xFF]{0,127}"
    if isinstance(name, str):
        return re.fullmatch(extended_identifier_pattern, name) is not None

    assert isinstance(name, bytes)
    bytes_identifier_pattern = bytes(extended_identifier_pattern, encoding="ascii")
    return re.fullmatch(bytes_identifier_pattern, name) is not None
105+
106+
107+
def _is_valid_type(type_str):
    """Tells whether ``type_str`` names a known variable type

    A type is known when it is a native type, an object (relational) type or
    one of the internal types "TextList" and "Structure".
    """
    if _is_native_type(type_str) or _is_object_type(type_str):
        return True

    # Internal types
    internal_types = ("TextList", "Structure")
    return type_str in internal_types
114+
115+
116+
def _is_native_type(type_str):
117+
"""Checks whether the type is native (not internal or relational)"""
118+
return type_str in [
119+
"Categorical",
120+
"Numerical",
121+
"Time",
122+
"Date",
123+
"Timestamp",
124+
"TimestampTZ",
125+
"Text",
126+
]
127+
128+
129+
def _is_object_type(type_str):
130+
"""Checks whether the type is an object one (relational)"""
131+
return type_str in ["Entity", "Table"]
132+
133+
84134
class DictionaryDomain(KhiopsJSONObject):
85135
"""Main class containing the information of a Khiops dictionary file
86136
@@ -769,6 +819,137 @@ def add_variable(self, variable):
769819
self.variables.append(variable)
770820
self._variables_by_name[variable.name] = variable
771821

822+
def add_variable_from_spec(
    self,
    name,
    type,
    label=None,
    used=True,
    object_type=None,
    structure_type=None,
    rule=None,
    meta_data=None,
):
    """Adds a variable to this dictionary using a complete specification

    Parameters
    ----------
    name : str
        Variable name.
    type : str
        Variable type. See `Variable`.
    label : str, optional
        Label of the variable.
    used : bool, default ``True``
        Usage status of the variable.
    object_type : str, optional
        Object type. Mandatory when the variable type is "Entity" or
        "Table". Must not be set for a native type.
    structure_type : str, optional
        Structure type. Must not be set for a native type.
    rule : str, optional
        Variable rule (in verbatim).
    meta_data : dict, optional
        A Python dictionary which holds the metadata specification
        with the following mandatory keys:

        - keys : list
            list of meta-data keys
        - values : list
            list of meta-data values, of the same size as ``keys``.
            The values can be str, bool, float or int.

    Raises
    ------
    `TypeError`
        - If a non-empty ``name`` is not string-like.
        - If ``meta_data`` is not dict-like, or if its 'keys' or 'values'
          entries are not list-like.
        - If ``label``, ``object_type``, ``structure_type`` or ``rule`` is
          set and is not string-like.
    `ValueError`
        - If the variable name is empty or does not comply
          with the formatting constraints.
        - If there is already a variable with the same name.
        - If the given variable type is unknown.
        - If a native type is given 'object_type' or 'structure_type'.
        - If 'object_type' is missing for an "Entity" or "Table" type.
        - If 'meta_data' lacks the 'keys' or 'values' entries, or if they
          do not have the same size.
    """
    # Name checks
    if not name:
        raise ValueError(
            f"Cannot add to dictionary unnamed variable (name = '{name}')"
        )
    if not _check_name(name):
        raise ValueError(
            f"New variable name '{name}' cannot be accepted "
            "(invalid length or characters)"
        )
    if name in self._variables_by_name:
        raise ValueError(f"Dictionary already has a variable named '{name}'")

    # Type checks, including the coherence with the type complements
    if not _is_valid_type(type):
        raise ValueError(f"Invalid type '{type}'")
    if _is_native_type(type) and (object_type or structure_type):
        raise ValueError(
            f"Native type '{type}' cannot have 'object_type' or 'structure_type'"
        )
    if _is_object_type(type) and object_type is None:
        raise ValueError(f"'object_type' must be provided for type '{type}'")

    # Meta-data specification checks
    if meta_data is not None:
        if not is_dict_like(meta_data):
            raise TypeError(type_error_message("meta_data", meta_data, "dict-like"))
        if "keys" not in meta_data or "values" not in meta_data:
            raise ValueError(
                "'meta_data' does not contain the mandatory keys 'keys' and 'values'"
            )
        for field in ("keys", "values"):
            if not is_list_like(meta_data[field]):
                raise TypeError(
                    type_error_message(
                        f"meta_data['{field}']", meta_data[field], "list-like"
                    )
                )
        if len(meta_data["keys"]) != len(meta_data["values"]):
            raise ValueError("'meta_data' keys and values do not have the same size")

    # Optional string arguments checks
    # Each error message reports the offending argument itself (the check on
    # 'rule' used to report the value of 'structure_type')
    optional_string_arguments = (
        ("label", label),
        ("object_type", object_type),
        ("structure_type", structure_type),
        ("rule", rule),
    )
    for argument_name, argument_value in optional_string_arguments:
        if argument_value is not None and not is_string_like(argument_value):
            raise TypeError(
                type_error_message(argument_name, argument_value, "string-like")
            )

    # Variable initialization
    variable = Variable()
    variable.name = name
    variable.type = type
    variable.used = used
    if meta_data is not None:
        for key, value in zip(meta_data["keys"], meta_data["values"]):
            variable.meta_data.add_value(key, value)
    if label is not None:
        variable.label = label
    if object_type is not None:
        variable.object_type = object_type
    if structure_type is not None:
        variable.structure_type = structure_type
    if rule is not None:
        variable.rule = Rule(verbatim=rule)
    self.add_variable(variable)
952+
772953
def remove_variable(self, variable_name):
773954
"""Removes the specified variable from this dictionary
774955
@@ -1058,7 +1239,7 @@ def __init__(self, json_data=None):
10581239
self.type = json_data.get("type")
10591240

10601241
# Initialize complement of the type
1061-
if self.type in ("Entity", "Table"):
1242+
if _is_object_type(self.type):
10621243
self.object_type = json_data.get("objectType")
10631244
elif self.type == "Structure":
10641245
self.structure_type = json_data.get("structureType")
@@ -1072,7 +1253,7 @@ def __init__(self, json_data=None):
10721253
self.meta_data = MetaData(json_meta_data)
10731254

10741255
def __repr__(self):
1075-
"""Returns a human readable string representation"""
1256+
"""Returns a human-readable string representation"""
10761257
return f"Variable ({self.name})"
10771258

10781259
def __str__(self):
@@ -1179,7 +1360,7 @@ def full_type(self):
11791360
basic.
11801361
"""
11811362
full_type = self.type
1182-
if self.type in ("Entity", "Table"):
1363+
if _is_object_type(self.type):
11831364
full_type += f"({self.object_type})"
11841365
elif self.type == "Structure":
11851366
full_type += f"({self.structure_type})"

0 commit comments

Comments
 (0)