Skip to content

Commit 9f74361

Browse files
committed
#175 and updates for sdk default flag changes
1 parent 5269070 commit 9f74361

1 file changed

Lines changed: 52 additions & 48 deletions

File tree

sz_tools/sz_explorer

Lines changed: 52 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,10 @@ except ImportError as ex:
4040
sys.exit(1)
4141

4242

43-
def print_exception_info(ex):
44-
print_message(ex, "error")
45-
response = get_char_with_prompt("press T for traceback...")
46-
if response.upper() == "T":
43+
def print_exception(ex):
44+
print_message(f"{ex}, press any key to continue", "error")
45+
if get_char() in "tTdD":
4746
print(traceback.format_exc())
48-
get_char_with_prompt("press any key any key to continue...")
4947
print()
5048

5149

@@ -411,12 +409,13 @@ class EdaReports:
411409
# (feature_counts)
412410
for ftype_code in feature_counts:
413411
ftype_excl = sdk_wrapper.ftype_code_lookup[ftype_code]["FTYPE_EXCL"]
412+
ftype_freq = sdk_wrapper.ftype_code_lookup[ftype_code]["FTYPE_FREQ"]
414413
ftype_count = feature_counts[ftype_code]
415414
if ftype_count > entity_size: # a single record has reported multiple
416415
continue
417416
if ftype_excl == "Yes" and ftype_count > 1:
418417
review_features.append(ftype_code)
419-
elif ftype_count > review_max:
418+
elif ftype_freq in ("F1", "FF") and ftype_count > review_max:
420419
review_features.append(ftype_code)
421420
return review_features
422421

@@ -561,7 +560,7 @@ class EdaReports:
561560
tbl = eda_table()
562561
tbl.title = "Review categories..."
563562
tbl.columns = [
564-
{"name": "row", "width": 5, "align": "center"},
563+
{"name": "Row", "width": 5, "align": "center"},
565564
{"name": "Category", "width": 25, "align": "left"},
566565
{"name": "Count", "width": 25, "align": "right"},
567566
]
@@ -618,7 +617,7 @@ class EdaReports:
618617
tbl = eda_table()
619618
tbl.title = f"Entity size breakdown from {self.snapshot_file}"
620619
tbl.columns = [
621-
{"name": "row", "width": 5, "align": "center"},
620+
{"name": "Row", "width": 5, "align": "center"},
622621
{"name": "Size Group", "width": 10, "align": "center"},
623622
{"name": "Entity Count", "width": 10, "align": "right"},
624623
{"name": "Review Count", "width": 10, "align": "right"},
@@ -648,7 +647,7 @@ class EdaReports:
648647
tbl = eda_table()
649648
tbl.title = f"Data Source Summary from {self.snapshot_file}"
650649
tbl.columns = [
651-
{"name": "\nrow", "width": 5, "align": "center"},
650+
{"name": "\nRow", "width": 5, "align": "center"},
652651
{"name": "\nData Source", "width": 25, "align": "left"},
653652
{"name": "\nRecords", "width": 15, "align": "right"},
654653
{"name": "\nEntities", "width": 15, "align": "right"},
@@ -702,7 +701,7 @@ class EdaReports:
702701
tbl = eda_table()
703702
tbl.title = f"Cross Source Summary from {self.snapshot_file}"
704703
tbl.columns = [
705-
{"name": "\nrow", "width": 5, "align": "center"},
704+
{"name": "\nRow", "width": 5, "align": "center"},
706705
{"name": "From\nData Source", "width": 25, "align": "center"},
707706
{"name": "To\nData Source", "width": 25, "align": "center"},
708707
{"name": "Matched\nRecords", "width": 15, "align": "right"},
@@ -747,25 +746,25 @@ class EdaReports:
747746
select_levels = ["DATA_SOURCES", "MATCH_LEVEL", "MATCH_KEY"]
748747
self.drill_into(report_table, report_data, select_levels)
749748

750-
def multi_source_summary(self, data_source_filter):
749+
def entity_source_summary(self, data_source_filter):
751750
self.check_for_snapshot()
752751
report_data = {}
753752
tbl = eda_table()
754-
tbl.title = f"Multi-Source Summary from {self.snapshot_file}"
753+
tbl.title = f"Entity Source Summary from {self.snapshot_file}"
755754
tbl.columns = [
756-
{"name": "row", "width": 5, "align": "center"},
755+
{"name": "Row", "width": 5, "align": "center"},
757756
{"name": "Data Sources", "width": 100, "align": "left"},
758-
{"name": "Records", "width": 15, "align": "right"},
757+
{"name": "Entities", "width": 15, "align": "right"},
759758
]
760759
tbl.rows = []
761760
row_num = 0
762-
_data = self.snapshot_data["MULTI_SOURCES"]
761+
_data = self.snapshot_data["ENTITY_SOURCES"]
763762
sorted_data = sorted(_data, key=lambda k: _data[k]["ENTITY_COUNT"], reverse=True)
764763
for data_sources in sorted_data:
765764
if data_source_filter and data_source_filter.upper() not in data_sources:
766765
continue
767766
row_num += 1
768-
report_segment = self.snapshot_data["MULTI_SOURCES"][data_sources]
767+
report_segment = self.snapshot_data["ENTITY_SOURCES"][data_sources]
769768
row = [
770769
colorize(row_num, "bold"),
771770
colorize(" | ", "dim").join(colorize_dsrc(x) for x in data_sources.split("||")),
@@ -805,7 +804,7 @@ class EdaReports:
805804
tbl = eda_table()
806805
tbl.title = f"Principles Used Report from {self.snapshot_file}"
807806
tbl.columns = [
808-
{"name": "row", "width": 5, "align": "center"},
807+
{"name": "Row", "width": 5, "align": "center"},
809808
{"name": "Match level", "width": 25, "align": "left"},
810809
{"name": "Count", "width": 15, "align": "right"},
811810
]
@@ -840,7 +839,7 @@ class EdaReports:
840839
tbl = eda_table()
841840
tbl.title = f"Selected {prior_keys}"
842841
tbl.columns = [
843-
{"name": "row", "width": 5, "align": "center"},
842+
{"name": "Row", "width": 5, "align": "center"},
844843
{"name": select_level.lower(), "width": 100, "align": "left"},
845844
{"name": "count", "width": 10, "align": "right"},
846845
]
@@ -2073,7 +2072,7 @@ class EdaSdkWrapper:
20732072
tree_nodes[related_id].node_desc = rel_desc
20742073
tree_nodes[group_node_id].add_child(tree_nodes[related_id])
20752074

2076-
if len(entities[related_id]["RELATIONSHIPS"]) != 0:
2075+
if len(entities[related_id]["RELATIONSHIPS"]) != 0 and len(parents) < build_out_degree:
20772076
related_entities = sorted(
20782077
entities[related_id]["RELATIONSHIPS"],
20792078
key=lambda k: (k["MATCH_CATEGORY_SORT"], k["ERRULE_ID"], k["ENTITY_ID"]),
@@ -2361,25 +2360,25 @@ class EdaSdkWrapper:
23612360
return tbl.render_table(no_lines=True)
23622361

23632362
def why_search(self, search_json, **kwargs):
2363+
search_flag_list = [
2364+
"SZ_SEARCH_INCLUDE_REQUEST_DETAILS",
2365+
"SZ_INCLUDE_FEATURE_SCORES",
2366+
"SZ_INCLUDE_MATCH_KEY_DETAILS",
2367+
"SZ_ENTITY_DEFAULT_FLAGS",
2368+
"SZ_ENTITY_INCLUDE_ENTITY_NAME",
2369+
"SZ_ENTITY_INCLUDE_INTERNAL_FEATURES",
2370+
"SZ_ENTITY_INCLUDE_FEATURE_STATS",
2371+
"SZ_ENTITY_INCLUDE_RECORD_FEATURES",
2372+
]
2373+
23642374
if kwargs["search"] == 0:
2365-
search_flag_list = [
2366-
"SZ_SEARCH_INCLUDE_REQUEST_DETAILS",
2367-
"SZ_SEARCH_INCLUDE_ALL_CANDIDATES",
2368-
"SZ_INCLUDE_MATCH_KEY_DETAILS",
2369-
"SZ_WHY_ENTITIES_DEFAULT_FLAGS",
2370-
]
2375+
search_flag_list.append("SZ_SEARCH_INCLUDE_ALL_CANDIDATES")
23712376
try:
23722377
json_data = self.call_sdk("search_by_attributes", search_flag_list, json.dumps(search_json))
23732378
except SzError as err:
23742379
raise err from err
23752380
matched_list = sdk_wrapper.reorder_search_results(json_data.get("RESOLVED_ENTITIES", []))
23762381
else:
2377-
search_flag_list = [
2378-
"SZ_SEARCH_INCLUDE_REQUEST_DETAILS",
2379-
"SZ_INCLUDE_MATCH_KEY_DETAILS",
2380-
"SZ_INCLUDE_FEATURE_SCORES",
2381-
"SZ_WHY_ENTITIES_DEFAULT_FLAGS",
2382-
]
23832382
try:
23842383
json_data = self.call_sdk("why_search", search_flag_list, [json.dumps(search_json), kwargs["search"]])
23852384
except SzError as err:
@@ -2982,18 +2981,22 @@ class EdaCmd(cmd.Cmd):
29822981
tree {colorize("- see a tree view of an entity's relationships through 1 or 2 degrees.", 'dim')}
29832982
export {colorize("- export the json records for an entity for debugging or correcting and reloading.", 'dim')}
29842983
2985-
{colorize('Snapshot reports:', 'highlight2')} {colorize('(requires a json file created with sz_snapshot)', 'italics')}
2984+
{colorize('Snapshot reports:', 'highlight2')} {colorize('(requires a json file generated by sz_snapshot)', 'italics')}
29862985
data_source_summary {colorize('– shows how many duplicates were detected within each data source, as well as ', 'dim')}
29872986
{colorize('the possible matches and relationships that were derived. For example, how many duplicate customers ', 'dim')}
29882987
{colorize('there are, and are any of them related to each other.', 'dim')}
29892988
cross_source_summary {colorize('– shows how many matches were made across data sources. For example, how many ', 'dim')}
29902989
{colorize('employees are related to customers.', 'dim')}
2990+
entity_source_summary {colorize('– shows the number of entities by the set of data sources they can be found in. For example, ', 'dim')}
2991+
{colorize('how many entities are only in one data source, how many are only in these two data sources, etc.', 'dim')}
29912992
entity_size_breakdown {colorize("– shows how many entities of what size were created. For instance, some entities ", 'dim')}
29922993
{colorize("are singletons, some might have connected 2 records, some 3, etc. This report is primarily used to", 'dim')}
29932994
{colorize("ensure there are no instances of over matching. For instance, it’s ok for an entity to have hundreds", 'dim')}
29942995
{colorize("of records as long as there are not too many different names, addresses, identifiers, etc.", 'dim')}
2996+
principles_used {colorize('– shows what principles and match_keys are firing across all data sources. For example, ', 'dim')}
2997+
{colorize('how many name and address matches, how many address only, etc.', 'dim')}
29952998
2996-
{colorize('Audit report:', 'highlight2')} {colorize('(requires a json file created with sz_audit)', 'italics')}
2999+
{colorize('Audit report:', 'highlight2')} {colorize('(requires a json file generated by sz_audit)', 'italics')}
29973000
audit_summary {colorize("- shows the precision, recall and F1 scores with the ability to browse the entities that", 'dim')}
29983001
{colorize("were split or merged.", 'dim')}
29993002
@@ -3194,7 +3197,7 @@ class EdaCmd(cmd.Cmd):
31943197
tbl = eda_table()
31953198
tbl.title = "Data source counts"
31963199
tbl.columns = [
3197-
{"name": "id", "width": 5, "align": "center"},
3200+
{"name": "ID", "width": 5, "align": "center"},
31983201
{"name": "DataSource", "width": 30, "align": "left"},
31993202
{"name": "ActualRecordCount", "width": 20, "align": "right"},
32003203
{"name": "DistinctRecordCount", "width": 20, "align": "right"},
@@ -3247,7 +3250,7 @@ class EdaCmd(cmd.Cmd):
32473250
try:
32483251
self.eda_reports.audit_summary()
32493252
except Exception as ex:
3250-
print_exception_info(ex)
3253+
print_exception(ex)
32513254

32523255
def help_entity_size_breakdown(self):
32533256
print(
@@ -3271,7 +3274,7 @@ class EdaCmd(cmd.Cmd):
32713274
try:
32723275
self.eda_reports.entity_size_breakdown()
32733276
except Exception as ex:
3274-
print_exception_info(ex)
3277+
print_exception(ex)
32753278

32763279
def complete_data_source_summary(self, text, line, begidx, endidx):
32773280
possibles = sorted(self.eda_reports.snapshot_data.get("DATA_SOURCES", {}).keys())
@@ -3296,7 +3299,7 @@ class EdaCmd(cmd.Cmd):
32963299
try:
32973300
self.eda_reports.data_source_summary(arg)
32983301
except Exception as ex:
3299-
print_exception_info(ex)
3302+
print_exception(ex)
33003303

33013304
def help_cross_source_summary(self):
33023305
print(
@@ -3321,9 +3324,9 @@ class EdaCmd(cmd.Cmd):
33213324
try:
33223325
self.eda_reports.cross_source_summary(arg)
33233326
except Exception as ex:
3324-
print_exception_info(ex)
3327+
print_exception(ex)
33253328

3326-
def help_multi_source_summary(self):
3329+
def help_entity_source_summary(self):
33273330
print(
33283331
textwrap.dedent(
33293332
f"""\
@@ -3335,22 +3338,22 @@ class EdaCmd(cmd.Cmd):
33353338
can only show the matches, not the possible matches and relationships.
33363339
33373340
{colorize('Syntax:', 'highlight2')}
3338-
multi_source_summary [dataSource]
3341+
entity_source_summary [dataSource]
33393342
"""
33403343
)
33413344
)
33423345

3343-
def complete_multi_source_summary(self, text, line, begidx, endidx):
3346+
def complete_entity_source_summary(self, text, line, begidx, endidx):
33443347
possibles = sorted(self.eda_reports.snapshot_data.get("DATA_SOURCES", {}).keys())
33453348
if text:
33463349
return [i for i in possibles if i.startswith(text.upper())]
33473350
return possibles
33483351

3349-
def do_multi_source_summary(self, arg):
3352+
def do_entity_source_summary(self, arg):
33503353
try:
3351-
self.eda_reports.multi_source_summary(arg)
3354+
self.eda_reports.entity_source_summary(arg)
33523355
except Exception as ex:
3353-
print_exception_info(ex)
3356+
print_exception(ex)
33543357

33553358
def help_principles_used(self):
33563359
print(
@@ -3370,7 +3373,7 @@ class EdaCmd(cmd.Cmd):
33703373
try:
33713374
self.eda_reports.principles_used_report()
33723375
except Exception as ex:
3373-
print_exception_info(ex)
3376+
print_exception(ex)
33743377

33753378
# adhoc commands
33763379

@@ -3774,7 +3777,7 @@ class EdaCmd(cmd.Cmd):
37743777
print_message(err, "error")
37753778
return
37763779
except Exception as ex:
3777-
print_exception_info(ex)
3780+
print_exception(ex)
37783781
return
37793782
view_report(report)
37803783
if len(arg_tokens) == 1: # supports previous/next
@@ -3909,7 +3912,8 @@ class EdaCmd(cmd.Cmd):
39093912
why <entity_id1> {colorize('actually runs a columnar how instead!', 'dim')}
39103913
why <entity_id1> <entity_id2> {colorize('shows why two or more different entities did not resolve', 'dim')}
39113914
why <data_source1> <record_id1> <data_source2> <record_id2> {colorize('shows if the two data source records could resolve or relate', 'dim')}
3912-
3915+
why search [optional entity_id] {colorize('shows the features and keys generated by a search against any candidate entities', 'dim')}
3916+
39133917
{colorize('Color legend:', 'highlight2')}
39143918
{colorize('green', 'good')} indicates the values matched and contributed to the overall score
39153919
{colorize('red', 'bad')} indicates the values did not match and hurt the overall score
@@ -4225,7 +4229,7 @@ if __name__ == "__main__":
42254229
try:
42264230
sdk_wrapper = EdaSdkWrapper(engine_config, debug_trace=args.debug_trace, webapp_url=args.webapp_url)
42274231
except SzError:
4228-
print_exception_info()
4232+
print_exception()
42294233
sys.exit(1)
42304234

42314235
try:

0 commit comments

Comments
 (0)