Skip to content

Commit fa0cb95

Browse files
authored
Merge pull request #497 from JohT/feature/plot-additional-anomaly-detection-feature-distributions
Print more anomaly detection feature distributions
2 parents d45913c + 4b8f223 commit fa0cb95

File tree

6 files changed

+1037
-46
lines changed

6 files changed

+1037
-46
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Stores the combined dependency degree on every node that has incoming or outgoing dependencies:
//   "dependencyDegree"         = incomingDependencies       + outgoingDependencies
//   "dependencyDegreeWeighted" = incomingDependenciesWeight + outgoingDependenciesWeight
// A missing property on either side is treated as 0.
// Requires all "Set_Incoming*.cypher" and "Set_Outgoing*.cypher".

MATCH (codeUnit)
WHERE codeUnit.incomingDependencies IS NOT NULL
   OR codeUnit.outgoingDependencies IS NOT NULL
// Sum both directions up front so that the SET clause only assigns precomputed values.
WITH codeUnit,
     coalesce(codeUnit.incomingDependencies, 0) + coalesce(codeUnit.outgoingDependencies, 0) AS totalDegree,
     coalesce(codeUnit.incomingDependenciesWeight, 0) + coalesce(codeUnit.outgoingDependenciesWeight, 0) AS totalDegreeWeighted
SET codeUnit.dependencyDegree = totalDegree,
    codeUnit.dependencyDegreeWeighted = totalDegreeWeighted
// Summary row: number of updated nodes and the largest values written.
RETURN count(*) AS writtenNodes,
       max(codeUnit.dependencyDegree) AS maxDependencyDegree,
       max(codeUnit.dependencyDegreeWeighted) AS maxDependencyDegreeWeighted
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Writes "dependencyDegreeRank" to every node that has a "dependencyDegree" property.
// Ranking is dense: nodes sharing the same degree get the same rank, the highest degree
// gets rank 1, and ranks increase by one per distinct degree value.
// Requires "Set_Degree.cypher".

MATCH (codeUnit)
WHERE codeUnit.dependencyDegree IS NOT NULL
// One row per distinct degree value, holding all nodes that share it.
WITH codeUnit.dependencyDegree AS degreeValue, collect(codeUnit) AS tiedNodes
ORDER BY degreeValue DESC
// Collect the ordered groups into a list so that the list position can serve as the rank.
WITH collect(tiedNodes) AS rankedGroups
UNWIND range(1, size(rankedGroups)) AS denseRank
UNWIND rankedGroups[denseRank - 1] AS codeUnit
SET codeUnit.dependencyDegreeRank = denseRank
// Summary row: number of updated nodes and the highest (= worst) rank assigned.
RETURN count(*) AS writtenNodes,
       max(codeUnit.dependencyDegreeRank) AS maxDependencyDegreeRank

domains/anomaly-detection/anomalyDetectionFeaturePlots.py

Lines changed: 95 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -378,16 +378,37 @@ def annotate_outliers(outliers: pd.DataFrame) -> None:
378378
plot.savefig(plot_file_path)
379379

380380

381-
def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title: str, plot_file_path: str) -> None:
381+
def plot_feature_distribution(
382+
feature_values: pd.Series,
383+
feature_name: str,
384+
short_names: pd.Series,
385+
title: str,
386+
plot_file_path: str,
387+
log_y_axis: bool = False,
388+
standard_deviation_lines: bool = True,
389+
number_of_outliers_to_annotate: int = 4,
390+
) -> None:
382391
"""
383392
Plots the distribution of a feature's values.
384393
385394
Parameters
386395
----------
387396
feature_values : pd.Series
388397
Series containing feature values.
389-
text_prefix: str
390-
Text at the beginning of the title
398+
feature_name : str
399+
Name of the feature to be displayed on the x-axis.
400+
short_names : pd.Series
401+
Series containing the short names of the data points for annotation.
402+
title : str
403+
Title of the plot.
404+
plot_file_path : str
405+
File path to save the plot.
406+
log_y_axis : bool
407+
Whether to use logarithmic scale for the y-axis.
408+
standard_deviation_lines : bool
409+
Whether to plot standard deviation lines.
410+
number_of_outliers_to_annotate : int
411+
Number of outliers to annotate on each side of the distribution.
391412
"""
392413
if feature_values.empty:
393414
print("No data available to plot.")
@@ -400,18 +421,50 @@ def plot_feature_distribution(feature_values: pd.Series, feature_name: str, titl
400421
plot.xlabel(feature_name)
401422
plot.ylabel('Frequency')
402423
plot.xlim(left=feature_values.min(), right=feature_values.max())
403-
# plot.yscale('log') # Use logarithmic scale for better visibility of differences
424+
if log_y_axis:
425+
plot.yscale('log') # Use logarithmic scale for better visibility of differences
404426
plot.grid(True)
405427

406428
mean = feature_values.mean()
407429
standard_deviation = feature_values.std()
408430

409-
# Vertical line for the mean
410-
plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
411-
# Vertical line for 1 x standard deviations + mean (=z-score of 1)
412-
plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
413-
# Vertical line for 2 x standard deviations + mean (=z-score of 2)
414-
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
431+
if standard_deviation_lines:
432+
# Vertical line for the mean
433+
plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)
434+
# Vertical line for 1 x standard deviations + mean (=z-score of 1)
435+
plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)
436+
# Vertical line for 2 x standard deviations + mean (=z-score of 2)
437+
plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)
438+
439+
def annotate_distribution_outliers(
440+
outliers: pd.Series,
441+
names: pd.Series,
442+
) -> None:
443+
if outliers.empty:
444+
return
445+
rank = 0
446+
for item_index, value in outliers.items():
447+
index = typing.cast(int, item_index)
448+
rank = rank + 1
449+
short_name = names[index]
450+
451+
x_index_offset = (rank % 6) * 10
452+
if value > mean:
453+
x_index_offset = -x_index_offset
454+
455+
plot.annotate(
456+
text=f'{short_name}',
457+
xy=(value, 1),
458+
xytext=(x_index_offset, 60), # offset in points (uses 'textcoords': 'offset points')
459+
rotation=90,
460+
**plot_annotation_style,
461+
)
462+
463+
positive_outliers = feature_values.sort_values(ascending=False).head(number_of_outliers_to_annotate)
464+
annotate_distribution_outliers(positive_outliers, short_names)
465+
466+
negative_outliers = feature_values.sort_values(ascending=True).head(number_of_outliers_to_annotate)
467+
annotate_distribution_outliers(negative_outliers, short_names)
415468

416469
plot.tight_layout()
417470
plot.savefig(plot_file_path)
@@ -835,17 +888,48 @@ def get_common_plot_parameters(data: pd.DataFrame) -> dict:
835888
plot_feature_distribution(
836889
feature_values=data['clusteringCoefficient'],
837890
feature_name='Clustering Coefficient',
891+
short_names=data['shortCodeUnitName'],
838892
title=f"{title_prefix} clustering coefficient distribution",
839-
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters)
893+
standard_deviation_lines=False,
894+
plot_file_path=get_file_path("ClusteringCoefficient_distribution", parameters),
840895
)
841896

842897
plot_feature_distribution(
843898
feature_values=data['betweenness'],
844899
feature_name='Betweenness',
900+
short_names=data['shortCodeUnitName'],
845901
title=f"{title_prefix} betweenness centrality distribution",
902+
log_y_axis=True,
846903
plot_file_path=get_file_path("BetweennessCentrality_distribution", parameters)
847904
)
848905

906+
plot_feature_distribution(
907+
feature_values=data['degree'],
908+
feature_name='Degree',
909+
short_names=data['shortCodeUnitName'],
910+
title=f"{title_prefix} degree distribution",
911+
log_y_axis=True,
912+
plot_file_path=get_file_path("Degree_distribution", parameters)
913+
)
914+
915+
plot_feature_distribution(
916+
feature_values=data['incomingDependencies'],
917+
feature_name='incomingDependencies',
918+
short_names=data['shortCodeUnitName'],
919+
title=f"{title_prefix} incoming dependencies distribution",
920+
log_y_axis=True,
921+
plot_file_path=get_file_path("IncomingDependencies_distribution", parameters)
922+
)
923+
924+
plot_feature_distribution(
925+
feature_values=data['outgoingDependencies'],
926+
feature_name='outgoingDependencies',
927+
short_names=data['shortCodeUnitName'],
928+
title=f"{title_prefix} outgoing dependencies distribution",
929+
log_y_axis=True,
930+
plot_file_path=get_file_path("OutgoingDependencies_distribution", parameters)
931+
)
932+
849933
plot_clustering_coefficient_vs_page_rank(
850934
data['clusteringCoefficient'],
851935
data['pageRank'],

0 commit comments

Comments
 (0)