11include (" functions_classification.jl" )
22using Latexify
33
# Helper functions
"""
    non_optimal_couples(df, tup, metric; aside_tuples=Tuple{Symbol,Symbol}[])

Return the couples `(N, T)` for which the `(method, clustering)` combination `tup` does
not reach the best value of `metric` among the combinations kept in `df`.

# Arguments
- `df::DataFrame`: table with the columns `:parameter`, `:T`, `:method`, `:clustering`
  and `Symbol(metric, :_mean)`. It is left untouched.
- `tup::Tuple{Symbol,Symbol}`: the `(method, clustering)` combination under scrutiny; it is
  matched against the string form of the `:method` and `:clustering` columns.
- `metric::Symbol`: `:er` (the highest value is optimal) or `:mr` (the lowest value is
  optimal).

# Keywords
- `aside_tuples`: `(method, clustering)` combinations removed from `df` before looking
  for the optimum.

# Returns
A `DataFrame` with the columns `:N` (renamed from `:parameter`) and `:T`, holding one row
per couple where `tup` is strictly worse than the best remaining combination.

# Throws
- `ArgumentError` if `metric` is neither `:er` nor `:mr`.
"""
function non_optimal_couples(
    df::DataFrame,
    tup::Tuple{Symbol,Symbol},
    metric::Symbol;
    aside_tuples::Vector{Tuple{Symbol,Symbol}}=Tuple{Symbol,Symbol}[],
)
    metric in (:er, :mr) ||
        throw(ArgumentError("metric must be :er or :mr, got $(repr(metric))"))

    # Turn both metrics into a "higher is better" score: ER as is, MR negated.
    score = metric == :er ? identity : (-)
    group_cols = [:parameter, :T]
    combo_cols = [:method, :clustering]
    target_combo = string.(tup)
    excluded_combos = [string.(combo) for combo in aside_tuples]

    # `subset` builds a new table, so `df` is never mutated and no copy is required.
    kept = subset(
        df, combo_cols => ByRow((m, c) -> !any(==((m, c)), excluded_combos))
    )

    scored = transform(
        kept,
        Symbol(metric, :_mean) => ByRow(score) => :target_metric,
        combo_cols => ByRow((m, c) -> (m, c) == target_combo) => :is_target,
    )

    # The target is compared with the optimum inside each (parameter, T) group, so the
    # result does not rely on the target rows being stored in the same order as the groups.
    flagged = combine(
        groupby(scored, group_cols),
        [:target_metric, :is_target] =>
            ((vals, is_target) -> any(vals[is_target] .< maximum(vals))) => :non_optimal,
    )

    return rename(flagged[flagged.non_optimal, group_cols], :parameter => :N)
end
34+
"""
    round_percent(x)

Express the proportion `x` as a percentage rounded to the nearest integer.
"""
function round_percent(x)
    percentage = 100 * x
    return round(Int, percentage)
end
36+
437# # Load table
538df_wide = estimatorsload (" data/CO24/data_for_color_plot" )
639df_mean_bands = mmr_per (df_wide)
740
8- # ## helper function
9- round_percent (x) = round (Int, 100 * x)
10-
1141# # Misclassification rate - table
1242df_mr = @chain df_mean_bands begin
1343 # value of interest is MMR ± standard error
@@ -19,14 +49,14 @@ df_mr = @chain df_mean_bands begin
1949 :mr = string (round_percent (:mr_mean )) * " ± " * string (round_percent (:mr_std )),
2050 )
2151 # create a column with the combination of clustering and method to be able to unstack both at the same time
22- @rtransform! (:col_name = string (:clustering , " _" , :method ))
52+ @rtransform! (:col_name = string (:method , " _" , :clustering ))
2353 # unstack the table to have one column per method and clustering combination
2454 unstack ([:N , :T ], :col_name , :mr )
2555end
2656
# couples are selected so that the lowest MMR (usually ag_kmeans) is around 0.02
"""
    isselected(n, t)

Return `true` when `(n, t)` is one of the `(N, T)` couples kept for the tables.
"""
function isselected(n, t)
    # A tuple literal avoids allocating a fresh array on every call.
    return (n, t) in ((34, 2641), (94, 7903), (142, 10534), (190, 13165), (250, 18427))
end
3161latexify (@rsubset (df_mr, isselected (:N , :T )))
3262
@@ -37,20 +67,31 @@ latexify(@rsubset(df_mr, isselected(:N, :T)))
3767end
3868
3969# # Misclassification rate - best method ?
40- # ## Most of the time, the lowest Mean Misclassification Rate (MMR) is achieved for ag_kmeans
41- # ## 25 couples (N, T) out of 420 where the lowest MMR is not ag_kmeans
42- @chain df_mean_bands begin
43- @groupby ([:parameter , :T ])
44- @combine (:id_lowest_mmr = argmin (:mr_mean ))
45- @rsubset (:id_lowest_mmr != 1 )
46- end
### ag_threshold reaches the lowest Mean Misclassification Rate (MMR) most of the time:
### it is not amongst the lowest MMR for 25 couples (N, T) out of 420
first(size(non_optimal_couples(df_mean_bands, (:ag, :threshold), :mr)))
#### Once threshold clustering is set aside (for both ag and sp),
#### ag_kmeans is not amongst the lowest MMR for 27 couples
first(
    size(
        non_optimal_couples(
            df_mean_bands,
            (:ag, :kmeans),
            :mr;
            aside_tuples=[(:ag, :threshold), (:sp, :threshold)],
        ),
    ),
)
4783
48- # ## When MMR is rounded up to 2 decimals, only 11 couples (N, T) out of 420
49- @chain df_mean_bands begin
50- @groupby ([:parameter , :T ])
51- @combine (:id_lowest_mmr = argmin (round .(100 * :mr_mean )))
52- @rsubset (:id_lowest_mmr != 1 )
53- end
### Same counts once the MMR is rounded to the nearest percent (i.e. to 2 decimals)
df_rounded = @rtransform(df_mean_bands, :mr_mean = round_percent(:mr_mean))
### ag_threshold is then not amongst the lowest MMR for only 13 couples
first(size(non_optimal_couples(df_rounded, (:ag, :threshold), :mr)))
#### Once threshold clustering is set aside (for both ag and sp),
#### ag_kmeans is not amongst the lowest MMR for 14 couples
first(
    size(
        non_optimal_couples(
            df_rounded,
            (:ag, :kmeans),
            :mr;
            aside_tuples=[(:ag, :threshold), (:sp, :threshold)],
        ),
    ),
)
5495
5596# # Exact recovery
5697factor = quantile (Normal (), 0.975 ) / sqrt (metadata (df_mean_bands, " Number of simulations" ))
@@ -67,7 +108,7 @@ df_er = @chain df_mean_bands begin
67108 string (round_percent (factor * :er_std )),
68109 )
69110 # create a column with the combination of clustering and method to be able to unstack both at the same time
70- @rtransform! (:col_name = string (:clustering , " _" , :method ))
111+ @rtransform! (:col_name = string (:method , " _" , :clustering ))
71112 # unstack the table to have one column per method and clustering combination
72113 unstack ([:N , :T ], :col_name , :er )
73114end
@@ -85,17 +126,28 @@ latexify(@rsubset(df_er, isselected(:N, :T)))
85126end
86127
87128# # Exact recovery - best method ?
88- # ## Most of the time, the highest Probability of Exact Recovery (PER) is achieved for ag_kmeans
89- # ## 11 couples (N, T) out of 420 where the highest PER is not ag_kmeans
90- @chain df_mean_bands begin
91- @groupby ([:parameter , :T ])
92- @combine (:id_highest_er = argmax (:er_mean ))
93- @rsubset (:id_highest_er != 1 )
94- end
### ag_threshold reaches the highest Probability of Exact Recovery (PER) most of the time:
### it is not amongst the highest PER for 7 couples (N, T) out of 420
first(size(non_optimal_couples(df_mean_bands, (:ag, :threshold), :er)))
#### Once threshold clustering is set aside (for both ag and sp),
#### ag_kmeans is not amongst the highest PER for 11 couples
first(
    size(
        non_optimal_couples(
            df_mean_bands,
            (:ag, :kmeans),
            :er;
            aside_tuples=[(:ag, :threshold), (:sp, :threshold)],
        ),
    ),
)
95142
96- # ## When PER is rounded up to 2 decimals, ag_kmeans is always among the best
97- @chain df_mean_bands begin
98- @groupby ([:parameter , :T ])
99- @combine (:id_highest_er = argmax (round .(100 * :er_mean )))
100- @rsubset (:id_highest_er != 1 )
101- end
### Same counts once the PER is rounded to the nearest percent (i.e. to 2 decimals)
df_rounded = @rtransform(df_mean_bands, :er_mean = round_percent(:er_mean))
### ag_threshold is then not amongst the highest PER for only 1 couple
first(size(non_optimal_couples(df_rounded, (:ag, :threshold), :er)))
#### Once threshold clustering is set aside (for both ag and sp),
#### ag_kmeans is not amongst the highest PER for 2 couples
first(
    size(
        non_optimal_couples(
            df_rounded,
            (:ag, :kmeans),
            :er;
            aside_tuples=[(:ag, :threshold), (:sp, :threshold)],
        ),
    ),
)
0 commit comments