@@ -378,16 +378,37 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
378378 plot .savefig (plot_file_path )
379379
380380
381- def plot_feature_distribution (feature_values : pd .Series , feature_name : str , title : str , plot_file_path : str ) -> None :
381+ def plot_feature_distribution (
382+ feature_values : pd .Series ,
383+ feature_name : str ,
384+ short_names : pd .Series ,
385+ title : str ,
386+ plot_file_path : str ,
387+ log_y_axis : bool = False ,
388+ standard_deviation_lines : bool = True ,
389+ number_of_outliers_to_annotate : int = 4 ,
390+ ) -> None :
382391 """
383392 Plots the distribution of feature's values.
384393
385394 Parameters
386395 ----------
387396 feature_values : pd.Series
388397 Series containing feature values.
389- text_prefix: str
390- Text at the beginning of the title
398+ feature_name : str
399+ Name of the feature to be displayed on the x-axis.
400+ short_names : pd.Series
401+ Series containing the short names of the data points for annotation.
402+ title : str
403+ Title of the plot.
404+ plot_file_path : str
405+ File path to save the plot.
406+ log_y_axis : bool
407+ Whether to use logarithmic scale for the y-axis.
408+ standard_deviation_lines : bool
409+ Whether to plot standard deviation lines.
410+ number_of_outliers_to_annotate : int
411+ Number of outliers to annotate on each side of the distribution.
391412 """
392413 if feature_values .empty :
393414 print ("No data available to plot." )
@@ -400,18 +421,50 @@ def plot_feature_distribution(feature_values: pd.Series, feature_name: str, titl
400421 plot .xlabel (feature_name )
401422 plot .ylabel ('Frequency' )
402423 plot .xlim (left = feature_values .min (), right = feature_values .max ())
403- # plot.yscale('log') # Use logarithmic scale for better visibility of differences
424+ if log_y_axis :
425+ plot .yscale ('log' ) # Use logarithmic scale for better visibility of differences
404426 plot .grid (True )
405427
406428 mean = feature_values .mean ()
407429 standard_deviation = feature_values .std ()
408430
409- # Vertical line for the mean
410- plot_standard_deviation_lines ('red' , mean , standard_deviation , standard_deviation_factor = 0 )
411- # Vertical line for 1 x standard deviations + mean (=z-score of 1)
412- plot_standard_deviation_lines ('orange' , mean , standard_deviation , standard_deviation_factor = 1 )
413- # Vertical line for 2 x standard deviations + mean (=z-score of 2)
414- plot_standard_deviation_lines ('green' , mean , standard_deviation , standard_deviation_factor = 2 )
431+ if standard_deviation_lines :
432+ # Vertical line for the mean
433+ plot_standard_deviation_lines ('red' , mean , standard_deviation , standard_deviation_factor = 0 )
434+ # Vertical line for 1 x standard deviations + mean (=z-score of 1)
435+ plot_standard_deviation_lines ('orange' , mean , standard_deviation , standard_deviation_factor = 1 )
436+ # Vertical line for 2 x standard deviations + mean (=z-score of 2)
437+ plot_standard_deviation_lines ('green' , mean , standard_deviation , standard_deviation_factor = 2 )
438+
439+ def annotate_distribution_outliers (
440+ outliers : pd .Series ,
441+ names : pd .Series ,
442+ ) -> None :
443+ if outliers .empty :
444+ return
445+ rank = 0
446+ for item_index , value in outliers .items ():
447+ index = typing .cast (int , item_index )
448+ rank = rank + 1
449+ short_name = names [index ]
450+
451+ x_index_offset = (rank % 6 ) * 10
452+ if value > mean :
453+ x_index_offset = - x_index_offset
454+
455+ plot .annotate (
456+ text = f'{ short_name } ' ,
457+ xy = (value , 1 ),
458+ xytext = (x_index_offset , 60 ), # offset in points (uses 'textcoords': 'offset points')
459+ rotation = 90 ,
460+ ** plot_annotation_style ,
461+ )
462+
463+ positive_outliers = feature_values .sort_values (ascending = False ).head (number_of_outliers_to_annotate )
464+ annotate_distribution_outliers (positive_outliers , short_names )
465+
466+ negative_outliers = feature_values .sort_values (ascending = True ).head (number_of_outliers_to_annotate )
467+ annotate_distribution_outliers (negative_outliers , short_names )
415468
416469 plot .tight_layout ()
417470 plot .savefig (plot_file_path )
@@ -835,17 +888,48 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
# Histograms of the node features.
# Every entry consists of: data column, x-axis label, plot title, file name (without path)
# and the keyword arguments that deviate from the defaults of plot_feature_distribution.
_feature_distribution_plots = (
    (
        'clusteringCoefficient',
        'Clustering Coefficient',
        f"{title_prefix} clustering coefficient distribution",
        "ClusteringCoefficient_distribution",
        {'standard_deviation_lines': False},
    ),
    (
        'betweenness',
        'Betweenness',
        f"{title_prefix} betweenness centrality distribution",
        "BetweennessCentrality_distribution",
        {'log_y_axis': True},
    ),
    (
        'degree',
        'Degree',
        f"{title_prefix} degree distribution",
        "Degree_distribution",
        {'log_y_axis': True},
    ),
    (
        'incomingDependencies',
        'incomingDependencies',
        f"{title_prefix} incoming dependencies distribution",
        "IncomingDependencies_distribution",
        {'log_y_axis': True},
    ),
    (
        'outgoingDependencies',
        'outgoingDependencies',
        f"{title_prefix} outgoing dependencies distribution",
        "OutgoingDependencies_distribution",
        {'log_y_axis': True},
    ),
)

# Plot one distribution per entry, in the order listed above.
for _column, _axis_label, _title, _file_name, _options in _feature_distribution_plots:
    plot_feature_distribution(
        feature_values=data[_column],
        feature_name=_axis_label,
        short_names=data['shortCodeUnitName'],
        title=_title,
        plot_file_path=get_file_path(_file_name, parameters),
        **_options,
    )
932+
849933plot_clustering_coefficient_vs_page_rank (
850934 data ['clusteringCoefficient' ],
851935 data ['pageRank' ],
0 commit comments