@@ -40,12 +40,10 @@ except ImportError as ex:
4040 sys .exit (1 )
4141
4242
43- def print_exception_info (ex ):
44- print_message (ex , "error" )
45- response = get_char_with_prompt ("press T for traceback..." )
46- if response .upper () == "T" :
43+ def print_exception (ex ):
44+ print_message (f"{ ex } , press any key to continue" , "error" )
45+ if get_char () in "tTdD" :
4746 print (traceback .format_exc ())
48- get_char_with_prompt ("press any key any key to continue..." )
4947 print ()
5048
5149
@@ -411,12 +409,13 @@ class EdaReports:
411409 # (feature_counts)
412410 for ftype_code in feature_counts :
413411 ftype_excl = sdk_wrapper .ftype_code_lookup [ftype_code ]["FTYPE_EXCL" ]
412+ ftype_freq = sdk_wrapper .ftype_code_lookup [ftype_code ]["FTYPE_FREQ" ]
414413 ftype_count = feature_counts [ftype_code ]
415414 if ftype_count > entity_size : # a single record has reported multiple
416415 continue
417416 if ftype_excl == "Yes" and ftype_count > 1 :
418417 review_features .append (ftype_code )
419- elif ftype_count > review_max :
418+ elif ftype_freq in ( "F1" , "FF" ) and ftype_count > review_max :
420419 review_features .append (ftype_code )
421420 return review_features
422421
@@ -561,7 +560,7 @@ class EdaReports:
561560 tbl = eda_table ()
562561 tbl .title = "Review categories..."
563562 tbl .columns = [
564- {"name" : "row " , "width" : 5 , "align" : "center" },
563+ {"name" : "Row " , "width" : 5 , "align" : "center" },
565564 {"name" : "Category" , "width" : 25 , "align" : "left" },
566565 {"name" : "Count" , "width" : 25 , "align" : "right" },
567566 ]
@@ -618,7 +617,7 @@ class EdaReports:
618617 tbl = eda_table ()
619618 tbl .title = f"Entity size breakdown from { self .snapshot_file } "
620619 tbl .columns = [
621- {"name" : "row " , "width" : 5 , "align" : "center" },
620+ {"name" : "Row " , "width" : 5 , "align" : "center" },
622621 {"name" : "Size Group" , "width" : 10 , "align" : "center" },
623622 {"name" : "Entity Count" , "width" : 10 , "align" : "right" },
624623 {"name" : "Review Count" , "width" : 10 , "align" : "right" },
@@ -648,7 +647,7 @@ class EdaReports:
648647 tbl = eda_table ()
649648 tbl .title = f"Data Source Summary from { self .snapshot_file } "
650649 tbl .columns = [
651- {"name" : "\n row " , "width" : 5 , "align" : "center" },
650+ {"name" : "\n Row " , "width" : 5 , "align" : "center" },
652651 {"name" : "\n Data Source" , "width" : 25 , "align" : "left" },
653652 {"name" : "\n Records" , "width" : 15 , "align" : "right" },
654653 {"name" : "\n Entities" , "width" : 15 , "align" : "right" },
@@ -702,7 +701,7 @@ class EdaReports:
702701 tbl = eda_table ()
703702 tbl .title = f"Cross Source Summary from { self .snapshot_file } "
704703 tbl .columns = [
705- {"name" : "\n row " , "width" : 5 , "align" : "center" },
704+ {"name" : "\n Row " , "width" : 5 , "align" : "center" },
706705 {"name" : "From\n Data Source" , "width" : 25 , "align" : "center" },
707706 {"name" : "To\n Data Source" , "width" : 25 , "align" : "center" },
708707 {"name" : "Matched\n Records" , "width" : 15 , "align" : "right" },
@@ -747,25 +746,25 @@ class EdaReports:
747746 select_levels = ["DATA_SOURCES" , "MATCH_LEVEL" , "MATCH_KEY" ]
748747 self .drill_into (report_table , report_data , select_levels )
749748
750- def multi_source_summary (self , data_source_filter ):
749+ def entity_source_summary (self , data_source_filter ):
751750 self .check_for_snapshot ()
752751 report_data = {}
753752 tbl = eda_table ()
754- tbl .title = f"Multi- Source Summary from { self .snapshot_file } "
753+ tbl .title = f"Entity Source Summary from { self .snapshot_file } "
755754 tbl .columns = [
756- {"name" : "row " , "width" : 5 , "align" : "center" },
755+ {"name" : "Row " , "width" : 5 , "align" : "center" },
757756 {"name" : "Data Sources" , "width" : 100 , "align" : "left" },
758- {"name" : "Records " , "width" : 15 , "align" : "right" },
757+ {"name" : "Entities " , "width" : 15 , "align" : "right" },
759758 ]
760759 tbl .rows = []
761760 row_num = 0
762- _data = self .snapshot_data ["MULTI_SOURCES " ]
761+ _data = self .snapshot_data ["ENTITY_SOURCES " ]
763762 sorted_data = sorted (_data , key = lambda k : _data [k ]["ENTITY_COUNT" ], reverse = True )
764763 for data_sources in sorted_data :
765764 if data_source_filter and data_source_filter .upper () not in data_sources :
766765 continue
767766 row_num += 1
768- report_segment = self .snapshot_data ["MULTI_SOURCES " ][data_sources ]
767+ report_segment = self .snapshot_data ["ENTITY_SOURCES " ][data_sources ]
769768 row = [
770769 colorize (row_num , "bold" ),
771770 colorize (" | " , "dim" ).join (colorize_dsrc (x ) for x in data_sources .split ("||" )),
@@ -805,7 +804,7 @@ class EdaReports:
805804 tbl = eda_table ()
806805 tbl .title = f"Principles Used Report from { self .snapshot_file } "
807806 tbl .columns = [
808- {"name" : "row " , "width" : 5 , "align" : "center" },
807+ {"name" : "Row " , "width" : 5 , "align" : "center" },
809808 {"name" : "Match level" , "width" : 25 , "align" : "left" },
810809 {"name" : "Count" , "width" : 15 , "align" : "right" },
811810 ]
@@ -840,7 +839,7 @@ class EdaReports:
840839 tbl = eda_table ()
841840 tbl .title = f"Selected { prior_keys } "
842841 tbl .columns = [
843- {"name" : "row " , "width" : 5 , "align" : "center" },
842+ {"name" : "Row " , "width" : 5 , "align" : "center" },
844843 {"name" : select_level .lower (), "width" : 100 , "align" : "left" },
845844 {"name" : "count" , "width" : 10 , "align" : "right" },
846845 ]
@@ -2073,7 +2072,7 @@ class EdaSdkWrapper:
20732072 tree_nodes [related_id ].node_desc = rel_desc
20742073 tree_nodes [group_node_id ].add_child (tree_nodes [related_id ])
20752074
2076- if len (entities [related_id ]["RELATIONSHIPS" ]) != 0 :
2075+ if len (entities [related_id ]["RELATIONSHIPS" ]) != 0 and len ( parents ) < build_out_degree :
20772076 related_entities = sorted (
20782077 entities [related_id ]["RELATIONSHIPS" ],
20792078 key = lambda k : (k ["MATCH_CATEGORY_SORT" ], k ["ERRULE_ID" ], k ["ENTITY_ID" ]),
@@ -2361,25 +2360,25 @@ class EdaSdkWrapper:
23612360 return tbl .render_table (no_lines = True )
23622361
23632362 def why_search (self , search_json , ** kwargs ):
2363+ search_flag_list = [
2364+ "SZ_SEARCH_INCLUDE_REQUEST_DETAILS" ,
2365+ "SZ_INCLUDE_FEATURE_SCORES" ,
2366+ "SZ_INCLUDE_MATCH_KEY_DETAILS" ,
2367+ "SZ_ENTITY_DEFAULT_FLAGS" ,
2368+ "SZ_ENTITY_INCLUDE_ENTITY_NAME" ,
2369+ "SZ_ENTITY_INCLUDE_INTERNAL_FEATURES" ,
2370+ "SZ_ENTITY_INCLUDE_FEATURE_STATS" ,
2371+ "SZ_ENTITY_INCLUDE_RECORD_FEATURES" ,
2372+ ]
2373+
23642374 if kwargs ["search" ] == 0 :
2365- search_flag_list = [
2366- "SZ_SEARCH_INCLUDE_REQUEST_DETAILS" ,
2367- "SZ_SEARCH_INCLUDE_ALL_CANDIDATES" ,
2368- "SZ_INCLUDE_MATCH_KEY_DETAILS" ,
2369- "SZ_WHY_ENTITIES_DEFAULT_FLAGS" ,
2370- ]
2375+ search_flag_list .append ("SZ_SEARCH_INCLUDE_ALL_CANDIDATES" )
23712376 try :
23722377 json_data = self .call_sdk ("search_by_attributes" , search_flag_list , json .dumps (search_json ))
23732378 except SzError as err :
23742379 raise err from err
23752380 matched_list = sdk_wrapper .reorder_search_results (json_data .get ("RESOLVED_ENTITIES" , []))
23762381 else :
2377- search_flag_list = [
2378- "SZ_SEARCH_INCLUDE_REQUEST_DETAILS" ,
2379- "SZ_INCLUDE_MATCH_KEY_DETAILS" ,
2380- "SZ_INCLUDE_FEATURE_SCORES" ,
2381- "SZ_WHY_ENTITIES_DEFAULT_FLAGS" ,
2382- ]
23832382 try :
23842383 json_data = self .call_sdk ("why_search" , search_flag_list , [json .dumps (search_json ), kwargs ["search" ]])
23852384 except SzError as err :
@@ -2982,18 +2981,22 @@ class EdaCmd(cmd.Cmd):
29822981 tree { colorize ("- see a tree view of an entity's relationships through 1 or 2 degrees." , 'dim' )}
29832982 export { colorize ("- export the json records for an entity for debugging or correcting and reloading." , 'dim' )}
29842983
2985- { colorize ('Snapshot reports:' , 'highlight2' )} { colorize ('(requires a json file created with sz_snapshot)' , 'italics' )}
2984+ { colorize ('Snapshot reports:' , 'highlight2' )} { colorize ('(requires a json file generated by sz_snapshot)' , 'italics' )}
29862985 data_source_summary { colorize ('– shows how many duplicates were detected within each data source, as well as ' , 'dim' )}
29872986 { colorize ('the possible matches and relationships that were derived. For example, how many duplicate customers ' , 'dim' )}
29882987 { colorize ('there are, and are any of them related to each other.' , 'dim' )}
29892988 cross_source_summary { colorize ('– shows how many matches were made across data sources. For example, how many ' , 'dim' )}
29902989 { colorize ('employees are related to customers.' , 'dim' )}
2990+ entity_source_summary { colorize ('– shows the number of entities by the set of data sources they can be found in. For example, ' , 'dim' )}
2991+ { colorize ('how many entities are only in one data source, how many are only in these two data sources, etc.' , 'dim' )}
29912992 entity_size_breakdown { colorize ("– shows how many entities of what size were created. For instance, some entities " , 'dim' )}
29922993 { colorize ("are singletons, some might have connected 2 records, some 3, etc. This report is primarily used to" , 'dim' )}
29932994 { colorize ("ensure there are no instances of over matching. For instance, it’s ok for an entity to have hundreds" , 'dim' )}
29942995 { colorize ("of records as long as there are not too many different names, addresses, identifiers, etc." , 'dim' )}
2996+ principles_used { colorize ('– shows what principles and match_keys are firing across all data sources. For example, ' , 'dim' )}
2997+ { colorize ('how many name and address matches, how many address only, etc.' , 'dim' )}
29952998
2996- { colorize ('Audit report:' , 'highlight2' )} { colorize ('(requires a json file created with sz_audit)' , 'italics' )}
2999+ { colorize ('Audit report:' , 'highlight2' )} { colorize ('(requires a json file generated by sz_audit)' , 'italics' )}
29973000 audit_summary { colorize ("- shows the precision, recall and F1 scores with the ability to browse the entities that" , 'dim' )}
29983001 { colorize ("were split or merged." , 'dim' )}
29993002
@@ -3194,7 +3197,7 @@ class EdaCmd(cmd.Cmd):
31943197 tbl = eda_table ()
31953198 tbl .title = "Data source counts"
31963199 tbl .columns = [
3197- {"name" : "id " , "width" : 5 , "align" : "center" },
3200+ {"name" : "ID " , "width" : 5 , "align" : "center" },
31983201 {"name" : "DataSource" , "width" : 30 , "align" : "left" },
31993202 {"name" : "ActualRecordCount" , "width" : 20 , "align" : "right" },
32003203 {"name" : "DistinctRecordCount" , "width" : 20 , "align" : "right" },
@@ -3247,7 +3250,7 @@ class EdaCmd(cmd.Cmd):
32473250 try :
32483251 self .eda_reports .audit_summary ()
32493252 except Exception as ex :
3250- print_exception_info (ex )
3253+ print_exception (ex )
32513254
32523255 def help_entity_size_breakdown (self ):
32533256 print (
@@ -3271,7 +3274,7 @@ class EdaCmd(cmd.Cmd):
32713274 try :
32723275 self .eda_reports .entity_size_breakdown ()
32733276 except Exception as ex :
3274- print_exception_info (ex )
3277+ print_exception (ex )
32753278
32763279 def complete_data_source_summary (self , text , line , begidx , endidx ):
32773280 possibles = sorted (self .eda_reports .snapshot_data .get ("DATA_SOURCES" , {}).keys ())
@@ -3296,7 +3299,7 @@ class EdaCmd(cmd.Cmd):
32963299 try :
32973300 self .eda_reports .data_source_summary (arg )
32983301 except Exception as ex :
3299- print_exception_info (ex )
3302+ print_exception (ex )
33003303
33013304 def help_cross_source_summary (self ):
33023305 print (
@@ -3321,9 +3324,9 @@ class EdaCmd(cmd.Cmd):
33213324 try :
33223325 self .eda_reports .cross_source_summary (arg )
33233326 except Exception as ex :
3324- print_exception_info (ex )
3327+ print_exception (ex )
33253328
3326- def help_multi_source_summary (self ):
3329+ def help_entity_source_summary (self ):
33273330 print (
33283331 textwrap .dedent (
33293332 f"""\
@@ -3335,22 +3338,22 @@ class EdaCmd(cmd.Cmd):
33353338 can only show the matches, not the possible matches and relationships.
33363339
33373340 { colorize ('Syntax:' , 'highlight2' )}
3338- multi_source_summary [dataSource]
3341+ entity_source_summary [dataSource]
33393342 """
33403343 )
33413344 )
33423345
3343- def complete_multi_source_summary (self , text , line , begidx , endidx ):
3346+ def complete_entity_source_summary (self , text , line , begidx , endidx ):
33443347 possibles = sorted (self .eda_reports .snapshot_data .get ("DATA_SOURCES" , {}).keys ())
33453348 if text :
33463349 return [i for i in possibles if i .startswith (text .upper ())]
33473350 return possibles
33483351
3349- def do_multi_source_summary (self , arg ):
3352+ def do_entity_source_summary (self , arg ):
33503353 try :
3351- self .eda_reports .multi_source_summary (arg )
3354+ self .eda_reports .entity_source_summary (arg )
33523355 except Exception as ex :
3353- print_exception_info (ex )
3356+ print_exception (ex )
33543357
33553358 def help_principles_used (self ):
33563359 print (
@@ -3370,7 +3373,7 @@ class EdaCmd(cmd.Cmd):
33703373 try :
33713374 self .eda_reports .principles_used_report ()
33723375 except Exception as ex :
3373- print_exception_info (ex )
3376+ print_exception (ex )
33743377
33753378 # adhoc commands
33763379
@@ -3774,7 +3777,7 @@ class EdaCmd(cmd.Cmd):
37743777 print_message (err , "error" )
37753778 return
37763779 except Exception as ex :
3777- print_exception_info (ex )
3780+ print_exception (ex )
37783781 return
37793782 view_report (report )
37803783 if len (arg_tokens ) == 1 : # supports previous/next
@@ -3909,7 +3912,8 @@ class EdaCmd(cmd.Cmd):
39093912 why <entity_id1> { colorize ('actually runs a columnar how instead!' , 'dim' )}
39103913 why <entity_id1> <entity_id2> { colorize ('shows why two or more different entities did not resolve' , 'dim' )}
39113914 why <data_source1> <record_id1> <data_source2> <record_id2> { colorize ('shows if the two data source records could resolve or relate' , 'dim' )}
3912-
3915+ why search [optional entity_id] { colorize ('shows the features and keys generated by a search against any candidate entities' , 'dim' )}
3916+
39133917 { colorize ('Color legend:' , 'highlight2' )}
39143918 { colorize ('green' , 'good' )} indicates the values matched and contributed to the overall score
39153919 { colorize ('red' , 'bad' )} indicates the values did not match and hurt the overall score
@@ -4225,7 +4229,7 @@ if __name__ == "__main__":
42254229 try :
42264230 sdk_wrapper = EdaSdkWrapper (engine_config , debug_trace = args .debug_trace , webapp_url = args .webapp_url )
42274231 except SzError :
4228- print_exception_info ()
4232+ print_exception ()
42294233 sys .exit (1 )
42304234
42314235 try :
0 commit comments