Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning].

## [Unreleased]

## [0.0.27] - 2025-08-06

### Changed in 0.0.27

- Corrected relationship counts in sz_snapshot
- Corrected reported bugs and requests in sz_explorer

## [0.0.26] - 2025-07-11

### Changed in 0.0.26
Expand Down
84 changes: 46 additions & 38 deletions sz_tools/sz_explorer
Original file line number Diff line number Diff line change
Expand Up @@ -1206,12 +1206,9 @@
distinct_usage_type = distinct_feat_record.get("USAGE_TYPE", "")
# a search request may not contain FEAT_DESC_VALUES, so wrap the record itself as its only value
if not distinct_feat_record.get("FEAT_DESC_VALUES"):
feat_record = distinct_feat_record.copy()
distinct_feat_record["FEAT_DESC_VALUES"] = [
feat_record,
]
distinct_feat_record["FEAT_DESC_VALUES"] = [distinct_feat_record]
for feat_record in distinct_feat_record["FEAT_DESC_VALUES"]:
feat_record["RECORD_COUNT"] = 0 # will be incremented later
feat_record = feat_record.copy()
feat_record["FTYPE_CODE"] = ftype_code
feat_record["USAGE_TYPE"] = feat_record.get("USAGE_TYPE", distinct_usage_type)
if feat_record["USAGE_TYPE"] == "PRIMARY":
Expand All @@ -1220,25 +1217,26 @@
feat_record["USAGE_TYPE_SORT"] = f"2-{feat_record['USAGE_TYPE']}"
else:
feat_record["USAGE_TYPE_SORT"] = "3-UNSPECIFIED"
features_by_type[ftype_code].append(feat_record)
lib_feat_id = feat_record["LIB_FEAT_ID"]
if lib_feat_id in features_by_id: # may be a different usage type
usage_type1 = features_by_id.get("USAGE_TYPE", "")
usage_type1 = features_by_id[lib_feat_id].get("USAGE_TYPE", "")
usage_type2 = feat_record.get("USAGE_TYPE", "")
if usage_type2 and usage_type2 != usage_type1:
delim = ", " if usage_type1 else ""
features_by_id[lib_feat_id]["USAGE_TYPE"] += delim + usage_type2
if usage_type1 and usage_type2 and usage_type2 != usage_type1:
features_by_id[lib_feat_id]["USAGE_TYPE"] += ", " + usage_type2
else:
features_by_id[lib_feat_id] = feat_record

# features_by_type is built afterwards, in case there are duplicate lib_feat_ids
for feat_record in features_by_id.values():
features_by_type[feat_record["FTYPE_CODE"]].append(feat_record)

return {"BY_ID": features_by_id, "BY_TYPE": features_by_type}

def get_record_features(self, entity_features, feature_list):
features_by_id = {}
features_by_type = {}
for feat_record in feature_list:
lib_feat_id = feat_record["LIB_FEAT_ID"]
if lib_feat_id in entity_features:
entity_features[lib_feat_id]["RECORD_COUNT"] += 1
feature_data = entity_features[lib_feat_id].copy()
feature_data["USAGE_TYPE"] = feat_record.get("USAGE_TYPE", "")
if feature_data["USAGE_TYPE"] == "PRIMARY":
Expand All @@ -1248,18 +1246,21 @@
else:
feature_data["USAGE_TYPE_SORT"] = "3-UNSPECIFIED"
ftype_code = feature_data["FTYPE_CODE"]
if ftype_code not in features_by_type:
features_by_type[ftype_code] = [feature_data]
else:
features_by_type[ftype_code].append(feature_data)
if lib_feat_id in features_by_id: # may be a different usage type
usage_type1 = features_by_id.get("USAGE_TYPE", "")
usage_type2 = feature_data.get("USAGE_TYPE", "")
if usage_type2 and usage_type2 != usage_type1:
delim = ", " if usage_type1 else ""
features_by_id[lib_feat_id]["USAGE_TYPE"] += delim + usage_type2
usage_type1 = features_by_id[lib_feat_id].get("USAGE_TYPE", "")
usage_type2 = feat_record.get("USAGE_TYPE", "")
if usage_type1 and usage_type2 and usage_type2 != usage_type1:
features_by_id[lib_feat_id]["USAGE_TYPE"] += ", " + usage_type2
else:
features_by_id[lib_feat_id] = feature_data
# features_by_type is built afterwards, in case there are duplicate lib_feat_ids
features_by_type = {}
for feat_record in features_by_id.values():
if feat_record["FTYPE_CODE"] not in features_by_type:
features_by_type[feat_record["FTYPE_CODE"]] = [feat_record]
else:
features_by_type[feat_record["FTYPE_CODE"]].append(feat_record)

return {"BY_ID": features_by_id, "BY_TYPE": features_by_type}

def regroup_by_type(self, features_by_id):
Expand Down Expand Up @@ -1340,11 +1341,9 @@
score_records = {}
if match_info.get("WHY_KEY_DETAILS"):
for best_score in match_info["WHY_KEY_DETAILS"].get("CONFIRMATIONS", []):
best_score["TOKEN"] = "+" + best_score["TOKEN"]
key = f"{best_score['INBOUND_FEAT_ID']}-{best_score['CANDIDATE_FEAT_ID']}"
score_records[key] = best_score
for best_score in match_info["WHY_KEY_DETAILS"].get("DENIALS", []):
best_score["TOKEN"] = "-" + best_score["TOKEN"]
key = f"{best_score['INBOUND_FEAT_ID']}-{best_score['CANDIDATE_FEAT_ID']}"
score_records[key] = best_score
for ftype_code in match_info.get("FEATURE_SCORES", {}):
Expand Down Expand Up @@ -1496,6 +1495,11 @@

errule_code = step["MATCH_INFO"].get("ERRULE_CODE", "None")
match_key = step["MATCH_INFO"].get("MATCH_KEY", "None")
if not match_key.startswith("+NAME"):
# eventually get match_key details and check name score
reason = "Match_key has no or partial name match"
summary["INTERESTING_STEP"][step_num] = reason

if errule_code not in summary["PRINCIPLE"]:
summary["PRINCIPLE"][errule_code] = {"COUNT": 1, "MATCH_KEY": {}}
else:
Expand All @@ -1522,7 +1526,6 @@
"STEPS": steps,
"FINAL_ENTITIES": final_entities,
}

return how_data

def reorder_search_results(self, resolved_entities):
Expand Down Expand Up @@ -1660,11 +1663,6 @@
if any(x in stats for x in ("~", "!", "#")):
feat_color += ",dim" if feat_color else "dim"

# mixed_color = False / mixed_color and wrap text would not work well with each other
if feature.get("MATCH_KEY_TOKEN"):
feat_color = "good" if feature["MATCH_KEY_TOKEN"].startswith("+") else "bad"
# mixed_color = feature["FTYPE_CODE"] = feature["MATCH_KEY_TOKEN"][1:]

display_list.extend(wrap_text(f"{feat_desc} {stats}", attr_width, feat_color))

if feature.get("MATCHED_FEAT_DESC"):
Expand Down Expand Up @@ -1960,7 +1958,7 @@

entity_only_features = []
for feat_data in entity_features.values():
if feat_data["RECORD_COUNT"] == 0:
if feat_data["FTYPE_CODE"] in self.senzing_features:
entity_only_features.append(f"{colorize(feat_data['FTYPE_CODE'], 'bad')}: {feat_data['FEAT_DESC']}")
if entity_only_features:
tbl.rows.insert(0, [colorize_dsrc("SENZING"), "\n".join(entity_only_features), ""])
Expand Down Expand Up @@ -2297,7 +2295,7 @@
virtual_id1 = step_data["ENTITY_LIST"][0]["VIRTUAL_ID"]
virtual_id2 = step_data["ENTITY_LIST"][1]["VIRTUAL_ID"]

# debug_print(step_data["ENTITY_LIST"][0])
# debug_print(step_data)
# source_row.append(self.fmt_record_list(records, 1, attr_width))

tbl = eda_table()
Expand All @@ -2315,6 +2313,13 @@
self.fmt_record_list(step_data["ENTITY_LIST"][1]["RECORDS"], **kwargs), # .replace("\n", " | ")[0:80],
],
]
if step_data["STEP_TYPE"] == "Create virtual entity":
left_side_feat = "CANDIDATE_FEAT_DESC"
right_side_feat = "INBOUND_FEAT_DESC"
else:
left_side_feat = "INBOUND_FEAT_DESC"
right_side_feat = "CANDIDATE_FEAT_DESC"

match_key = step_data["MATCH_INFO"].get("MATCH_KEY", "")
feature_scores = step_data["MATCH_INFO"]["FEATURE_SCORES"]
for ftype_code in sorted(set(feature_scores), key=lambda k: self.ftype_code_order[k]):
Expand Down Expand Up @@ -2344,9 +2349,9 @@
tbl.rows.append(
[
colorize(ftype_code, "dim"),
best_score["INBOUND_FEAT_DESC"],
best_score[left_side_feat],
colorize(score_value, feat_color),
best_score["CANDIDATE_FEAT_DESC"],
best_score[right_side_feat],
]
)
# tbl.rows[0][0].append(colorize(ftype_code, "dim"))
Expand Down Expand Up @@ -2605,7 +2610,7 @@

def how_summary(self, how_data):
summary_node = eda_node("summary")
summary_node.node_desc = colorize("HOW STATISTICS", "") # "highlight2")
summary_node.node_desc = f"How summary for {self.fmt_entity_desc(how_data)}"

category_node = eda_node("resolution")
category_node.node_desc = self.fmt_how_statistic_hdr("RESOLUTION SUMMARY")
Expand Down Expand Up @@ -2705,12 +2710,14 @@
root_node.node_desc = "n/a"
else:
root_node.node_desc = colorize("Re-evaluation needed! ", "bad")
final_node_cnt = 0
for final_entity in final_entities:
final_id = final_entity["VIRTUAL_ENTITY_ID"]
if len(final_entities) == 1:
final_node_msg = "final entity"
else:
final_node_msg = "final entity"
final_node_cnt += 1
final_node_msg = f"final entity {final_node_cnt} of {len(final_entities)}"
final_node = eda_node(final_id)
final_node.node_desc = f"{colorize_entity(final_id)} {final_node_msg}"
final_node.step_created = 0
Expand Down Expand Up @@ -2768,7 +2775,7 @@
node.node_desc += " was orphaned!"
ordered_nodes.append(node)
root_node.children = ordered_nodes
return root_node.render_tree()
return f"How decision tree for {self.fmt_entity_desc(how_data)}\n\n" + root_node.render_tree()

def how_columnar(self, how_data, **kwargs):
entity_id = how_data["ENTITY_ID"]
Expand All @@ -2786,7 +2793,8 @@
"MATCH_INFO": step_data[step_num]["MATCH_INFO"],
}
# entities, how_data = self.why_how(entity_id)
kwargs["report_title"] = f"How for entity: {colorize_entity(entity_id)}"

kwargs["report_title"] = f"Columnar how for {self.fmt_entity_desc(how_data)}"
kwargs["row1_title"] = "VIRTUAL_ID"
if how_data["SUMMARY"]["REEVALUATION_MSG"]:
kwargs["report_title"] += " " + colorize(how_data["SUMMARY"]["REEVALUATION_MSG"], "bad")
Expand Down Expand Up @@ -3396,7 +3404,7 @@
elif token.upper() in sdk_wrapper.ftype_code_lookup:
feature_list.append(token)
elif token.lower() in sdk_wrapper.how_views.values():
kwarg_dict["how_view"] = token.lower
kwarg_dict["how_view"] = token.lower()
else:
remaining_tokens.append(token)
prior_token = token.upper()
Expand Down Expand Up @@ -3471,7 +3479,7 @@
ftype_id = sdk_wrapper.ftype_code_lookup[ftype]["FTYPE_ID"]
if operator == "LIKE":
if sz_dbo_uri.startswith("postgres"):
operator = "ILIKE" # makes case insensitive

Check warning on line 3482 in sz_tools/sz_explorer

View workflow job for this annotation

GitHub Actions / spellcheck

Unknown word (ILIKE)
elif sz_dbo_uri.startswith("sqlite"):
sz_dbo.sqlExec("PRAGMA case_sensitive_like=OFF")
if "%" not in value:
Expand Down
27 changes: 15 additions & 12 deletions sz_tools/sz_snapshot
Original file line number Diff line number Diff line change
Expand Up @@ -479,10 +479,8 @@ class SnapshotWriter:
stat_keys.extend(["PRINCIPLES", principle_matchkey])
self.update_stat_pack(stat_keys, {"COUNT": 1, "SAMPLE": [entity_id]})

# if len(entity0_sources) > 1:
# include single source so can find non-matches, ie customers not on watch list or in reference file
multi_source_key = "||".join(sorted(entity0_sources.keys()))
self.update_stat_pack(["ENTITY_SOURCES", multi_source_key], {"ENTITY_COUNT": 1, "SAMPLE": [entity_id]})
source_key = "||".join(sorted(entity0_sources.keys()))
self.update_stat_pack(["ENTITY_SOURCES", source_key], {"ENTITY_COUNT": 1, "SAMPLE": [entity_id]})
elif related_id > entity_id:
sample = f"{entity_id} {related_id}"
principle_matchkey = list(resume_data[related_id]["PRINCIPLES"].keys())[0]
Expand All @@ -494,16 +492,21 @@ class SnapshotWriter:

for data_source1 in entity0_sources:
for data_source2 in resume_data[related_id]["DATA_SOURCES"]:
stat_key_list = []
if data_source1 == data_source2:
stat_keys = ["DATA_SOURCES", data_source1, match_level]
stat_key_list.append(["DATA_SOURCES", data_source1, match_level])
else:
data_source_pair = f"{data_source1}||{data_source2}"
if data_source_pair not in self.stat_pack["CROSS_SOURCES"]:
self.initialize_match_levels(["CROSS_SOURCES", data_source_pair])
stat_keys = ["CROSS_SOURCES", data_source_pair, match_level]
self.update_stat_pack(stat_keys, {"RELATION_COUNT": 1})
stat_keys.extend(["PRINCIPLES", principle_matchkey])
self.update_stat_pack(stat_keys, {"COUNT": 1, "SAMPLE": [sample]})
for data_source_pair in [
f"{data_source1}||{data_source2}",
f"{data_source2}||{data_source1}",
]:
if data_source_pair not in self.stat_pack["CROSS_SOURCES"]:
self.initialize_match_levels(["CROSS_SOURCES", data_source_pair])
stat_key_list.append(["CROSS_SOURCES", data_source_pair, match_level])
for stat_keys in stat_key_list:
self.update_stat_pack(stat_keys, {"RELATION_COUNT": 1})
stat_keys.extend(["PRINCIPLES", principle_matchkey])
self.update_stat_pack(stat_keys, {"COUNT": 1, "SAMPLE": [sample]})


def check_stat_pack(stats_file_name, csv_file_name, args):
Expand Down
Loading