Skip to content

Commit 27c82e6

Browse files
committed
Remove constant features from anomaly detection
1 parent 1a293f6 commit 27c82e6

2 files changed

Lines changed: 42 additions & 5 deletions

File tree

domains/anomaly-detection/explore/AnomalyDetectionIsolationForestExploration.ipynb

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -407,14 +407,35 @@
407407
" return scaler.fit_transform(features_to_scale)"
408408
]
409409
},
410+
{
411+
"cell_type": "code",
412+
"execution_count": null,
413+
"id": "04d510a6",
414+
"metadata": {},
415+
"outputs": [],
416+
"source": [
417+
"def remove_constant_features(features: pd.DataFrame, feature_names: list[str]) -> list[str]:\n",
418+
" \"\"\"\n",
419+
" Removes constant features from the feature list.\n",
420+
" \"\"\"\n",
421+
" non_constant_features = []\n",
422+
" for feature in feature_names:\n",
423+
" if features[feature].nunique() > 1:\n",
424+
" non_constant_features.append(feature)\n",
425+
" else:\n",
426+
" print(f\"Removed constant feature: {feature}\")\n",
427+
" return non_constant_features"
428+
]
429+
},
410430
{
411431
"cell_type": "code",
412432
"execution_count": null,
413433
"id": "2de5ade1",
414434
"metadata": {},
415435
"outputs": [],
416436
"source": [
417-
"java_package_anomaly_detection_features_standardized = standardize_features(java_package_anomaly_detection_features, java_package_features_to_standardize)"
437+
"java_package_anomaly_detection_feature_names_to_standardize = remove_constant_features(java_package_anomaly_detection_features, java_package_features_to_standardize)\n",
438+
"java_package_anomaly_detection_features_standardized = standardize_features(java_package_anomaly_detection_features, java_package_anomaly_detection_feature_names_to_standardize)"
418439
]
419440
},
420441
{
@@ -491,7 +512,7 @@
491512
"outputs": [],
492513
"source": [
493514
"java_package_anomaly_detection_features_prepared = np.hstack([java_package_anomaly_detection_features_standardized, java_package_anomaly_detection_node_embeddings_reduced])\n",
494-
"java_package_anomaly_detection_feature_names = list(java_package_features_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_package_anomaly_detection_node_embeddings_reduced.shape[1])]"
515+
"java_package_anomaly_detection_feature_names = list(java_package_anomaly_detection_feature_names_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_package_anomaly_detection_node_embeddings_reduced.shape[1])]"
495516
]
496517
},
497518
{
@@ -1986,13 +2007,14 @@
19862007
"outputs": [],
19872008
"source": [
19882009
"validate_data(java_type_anomaly_detection_features)\n",
1989-
"java_type_anomaly_detection_features_standardized = standardize_features(java_type_anomaly_detection_features, java_type_features_to_standardize)\n",
2010+
"java_type_anomaly_detection_feature_names_to_standardize = remove_constant_features(java_type_anomaly_detection_features, java_type_features_to_standardize )\n",
2011+
"java_type_anomaly_detection_features_standardized = standardize_features(java_type_anomaly_detection_features, java_type_anomaly_detection_feature_names_to_standardize)\n",
19902012
"java_type_anomaly_detection_node_embeddings_reduced = reduce_dimensionality_of_node_embeddings(java_type_anomaly_detection_features, max_dimensions=35)\n",
19912013
"\n",
19922014
"java_type_anomaly_detection_features_prepared = np.hstack([java_type_anomaly_detection_features_standardized, java_type_anomaly_detection_node_embeddings_reduced])\n",
1993-
"java_type_anomaly_detection_feature_names = list(java_type_features_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_type_anomaly_detection_node_embeddings_reduced.shape[1])]\n",
2015+
"java_type_anomaly_detection_feature_names = list(java_type_anomaly_detection_feature_names_to_standardize) + [f'nodeEmbeddingPCA_{i}' for i in range(java_type_anomaly_detection_node_embeddings_reduced.shape[1])]\n",
19942016
"\n",
1995-
"plot_feature_correlation_matrix(java_type_anomaly_detection_features[java_type_features_to_standardize])"
2017+
"plot_feature_correlation_matrix(java_type_anomaly_detection_features[java_type_anomaly_detection_feature_names_to_standardize])"
19962018
]
19972019
},
19982020
{

domains/anomaly-detection/tunedAnomalyDetectionExplained.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,20 @@ def standardize_features(features: pd.DataFrame, feature_list: list[str]) -> num
296296
return scaler.fit_transform(features_to_scale)
297297

298298

299+
def remove_constant_features(features: pd.DataFrame, feature_names: list[str], is_verbose: bool = False) -> list[str]:
    """
    Removes constant features from the feature list.

    A feature is kept only when its column in ``features`` holds more than one
    distinct value (``nunique() > 1``). Note that pandas' ``nunique`` ignores
    missing values by default, so a column that is entirely NaN, or that has a
    single value plus NaN, is also treated as constant.

    Args:
        features: Data frame that contains a column for every entry in ``feature_names``.
        feature_names: Names of the candidate feature columns to check.
        is_verbose: When True, prints one line for every removed feature.

    Returns:
        The names of the non-constant features, in their original order.
        The given data frame and list are not modified.
    """
    non_constant_features = []
    for feature in feature_names:
        if features[feature].nunique() > 1:
            non_constant_features.append(feature)
        elif is_verbose:
            # The f-prefix is essential: without it the literal text "{feature}"
            # is printed instead of the name of the removed column.
            print(f"tunedAnomalyDetectionExplained: Removing constant feature {feature}")
    return non_constant_features
311+
312+
299313
def reduce_dimensionality_of_node_embeddings(
300314
features: pd.DataFrame,
301315
min_dimensions: int = 20,
@@ -1162,6 +1176,7 @@ def output_top_shap_explained_global_features_as_markdown_table(
11621176
sys.exit(0)
11631177

11641178
features_to_standardize = features.columns.drop(features_for_visualization_to_exclude_from_training + ['embedding']).to_list()
1179+
features_to_standardize = remove_constant_features(features, features_to_standardize, is_verbose=parameters.is_verbose())
11651180
features_standardized = standardize_features(features, features_to_standardize)
11661181
node_embeddings_reduced = reduce_dimensionality_of_node_embeddings(features)
11671182
features_prepared = np.hstack([features_standardized, node_embeddings_reduced])

0 commit comments

Comments
 (0)