|
644 | 644 | "metadata": {}, |
645 | 645 | "outputs": [], |
646 | 646 | "source": [ |
647 | | - "def plot_feature_distribution(feature_values: pd.Series, feature_name: str, title_prefix: str) -> None:\n", |
| 647 | + "def plot_feature_distribution(\n", |
| 648 | + " feature_values: pd.Series,\n", |
| 649 | + " feature_name: str,\n", |
| 650 | + " short_names: pd.Series,\n", |
| 651 | + " title_prefix: str,\n", |
| 652 | + " log_y_axis: bool = False,\n", |
| 653 | + " standard_deviation_lines: bool = True,\n", |
| 654 | + " number_of_outliers_to_annotate: int = 4,\n", |
| 655 | + " ) -> None:\n", |
648 | 656 | " \"\"\"\n", |
649 | 657 | "    Plots the distribution of a feature's values.\n", |
650 | 658 | " \n", |
651 | 659 | " Parameters\n", |
652 | 660 | " ----------\n", |
653 | 661 | " feature_values : pd.Series\n", |
654 | 662 | " Series containing feature values.\n", |
655 | | - " text_prefix: str\n", |
656 | | - " Text at the beginning of the title\n", |
| 663 | + " feature_name : str\n", |
| 664 | + " Name of the feature to be displayed on the x-axis.\n", |
| 665 | + " short_names : pd.Series\n", |
| 666 | + " Series containing the short names of the data points for annotation.\n", |
| 667 | + " title_prefix : str\n", |
| 668 | + " Prefix for the title of the plot.\n", |
| 669 | + " log_y_axis : bool\n", |
| 670 | + " Whether to use logarithmic scale for the y-axis.\n", |
| 671 | + " standard_deviation_lines : bool\n", |
| 672 | + " Whether to plot standard deviation lines.\n", |
| 673 | + " number_of_outliers_to_annotate : int\n", |
| 674 | + " Number of outliers to annotate on each side of the distribution.\n", |
657 | 675 | " \"\"\"\n", |
658 | 676 | " if feature_values.empty:\n", |
659 | 677 | " print(\"No data available to plot.\")\n", |
|
666 | 684 | " plot.xlabel(feature_name)\n", |
667 | 685 | " plot.ylabel('Frequency')\n", |
668 | 686 | " plot.xlim(left=feature_values.min(), right=feature_values.max())\n", |
669 | | - " # plot.yscale('log') # Use logarithmic scale for better visibility of differences\n", |
| 687 | + " if log_y_axis:\n", |
| 688 | + " plot.yscale('log') # Use logarithmic scale for better visibility of differences\n", |
670 | 689 | " plot.grid(True)\n", |
671 | 690 | " plot.tight_layout()\n", |
672 | 691 | "\n", |
673 | 692 | " mean = feature_values.mean()\n", |
674 | 693 | " standard_deviation = feature_values.std()\n", |
675 | 694 | "\n", |
676 | | - " # Vertical line for the mean\n", |
677 | | - " plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n", |
678 | | - " # Vertical line for 1 x standard deviations + mean (=z-score of 1)\n", |
679 | | - " plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n", |
680 | | - " # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n", |
681 | | - " plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n", |
| 695 | + " if standard_deviation_lines:\n", |
| 696 | + " # Vertical line for the mean\n", |
| 697 | + " plot_standard_deviation_lines('red', mean, standard_deviation, standard_deviation_factor=0)\n", |
| 698 | + "        # Vertical line for 1 x standard deviation + mean (=z-score of 1)\n", |
| 699 | + " plot_standard_deviation_lines('orange', mean, standard_deviation, standard_deviation_factor=1)\n", |
| 700 | + " # Vertical line for 2 x standard deviations + mean (=z-score of 2)\n", |
| 701 | + " plot_standard_deviation_lines('green', mean, standard_deviation, standard_deviation_factor=2)\n", |
| 702 | + "\n", |
| 703 | + " def annotate_distribution_outliers(\n", |
| 704 | + " outliers: pd.Series,\n", |
| 705 | + " names: pd.Series,\n", |
| 706 | + " ) -> None:\n", |
| 707 | + " if outliers.empty:\n", |
| 708 | + " return\n", |
| 709 | + " rank = 0\n", |
| 710 | + " for item_index, value in outliers.items():\n", |
| 711 | + " index = typing.cast(int, item_index)\n", |
| 712 | + " rank = rank + 1\n", |
| 713 | + " short_name = names[index]\n", |
| 714 | + " \n", |
| 715 | + " x_index_offset = (rank % 6) * 10\n", |
| 716 | + " if value > mean:\n", |
| 717 | + " x_index_offset = -x_index_offset\n", |
| 718 | + "\n", |
| 719 | + " plot.annotate(\n", |
| 720 | + " text=f'{short_name}',\n", |
| 721 | + " xy=(value, 1),\n", |
| 722 | + " xytext=(x_index_offset, 60),\n", |
| 723 | + " rotation=90,\n", |
| 724 | + " **plot_annotation_style,\n", |
| 725 | + " )\n", |
| 726 | + "\n", |
| 727 | + " positive_outliers = feature_values.sort_values(ascending=False).head(number_of_outliers_to_annotate)\n", |
| 728 | + " annotate_distribution_outliers(positive_outliers, short_names)\n", |
| 729 | + "\n", |
| 730 | + " negative_outliers = feature_values.sort_values(ascending=True).head(number_of_outliers_to_annotate)\n", |
| 731 | + " annotate_distribution_outliers(negative_outliers, short_names)\n", |
682 | 732 | "\n", |
683 | 733 | " plot.show()" |
684 | 734 | ] |
685 | 735 | }, |
686 | | - { |
687 | | - "cell_type": "code", |
688 | | - "execution_count": null, |
689 | | - "id": "ed900c59", |
690 | | - "metadata": {}, |
691 | | - "outputs": [], |
692 | | - "source": [ |
693 | | - "def plot_clustering_coefficient_distribution(clustering_coefficients: pd.Series, title_prefix: str) -> None:\n", |
694 | | - " \"\"\"\n", |
695 | | - " Plots the distribution of clustering coefficients.\n", |
696 | | - " \n", |
697 | | - " Parameters\n", |
698 | | - " ----------\n", |
699 | | - " clustering_coefficients : pd.Series\n", |
700 | | - " Series containing clustering coefficient values.\n", |
701 | | - " text_prefix: str\n", |
702 | | - " Text at the beginning of the title\n", |
703 | | - " \"\"\"\n", |
704 | | - " plot_feature_distribution(clustering_coefficients, 'Clustering Coefficient', title_prefix)" |
705 | | - ] |
706 | | - }, |
707 | 736 | { |
708 | 737 | "cell_type": "code", |
709 | 738 | "execution_count": null, |
710 | 739 | "id": "92aff8d9", |
711 | 740 | "metadata": {}, |
712 | 741 | "outputs": [], |
713 | 742 | "source": [ |
714 | | - "plot_feature_distribution(java_package_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Package\")" |
| 743 | + "plot_feature_distribution(\n", |
| 744 | + " java_package_features['clusteringCoefficient'],\n", |
| 745 | + " 'Clustering Coefficient', \n", |
| 746 | + " java_package_features['shortCodeUnitName'], \n", |
| 747 | + " title_prefix=\"Java Package\",\n", |
| 748 | + " standard_deviation_lines=False\n", |
| 749 | + ")" |
715 | 750 | ] |
716 | 751 | }, |
717 | 752 | { |
|
836 | 871 | "metadata": {}, |
837 | 872 | "outputs": [], |
838 | 873 | "source": [ |
839 | | - "plot_feature_distribution(java_package_features['betweenness'], 'Betweenness', title_prefix=\"Java Package\")" |
| 874 | + "plot_feature_distribution(\n", |
| 875 | + " java_package_features['betweenness'], \n", |
| 876 | + " 'Betweenness', \n", |
| 877 | + " java_package_features['shortCodeUnitName'],\n", |
| 878 | + " title_prefix=\"Java Package\",\n", |
| 879 | + " log_y_axis=True,\n", |
| 880 | + ")" |
840 | 881 | ] |
841 | 882 | }, |
842 | 883 | { |
|
1443 | 1484 | "metadata": {}, |
1444 | 1485 | "outputs": [], |
1445 | 1486 | "source": [ |
1446 | | - "plot_feature_distribution(java_type_features['clusteringCoefficient'], 'Clustering Coefficient', title_prefix=\"Java Type\")" |
| 1487 | + "plot_feature_distribution(\n", |
| 1488 | + " java_type_features['clusteringCoefficient'],\n", |
| 1489 | + " 'Clustering Coefficient',\n", |
| 1490 | + " java_type_features['shortCodeUnitName'],\n", |
| 1491 | + " title_prefix=\"Java Type\",\n", |
| 1492 | + " standard_deviation_lines=False\n", |
| 1493 | + ")" |
1447 | 1494 | ] |
1448 | 1495 | }, |
1449 | 1496 | { |
|
1477 | 1524 | "metadata": {}, |
1478 | 1525 | "outputs": [], |
1479 | 1526 | "source": [ |
1480 | | - "plot_feature_distribution(java_type_features['betweenness'], 'Betweenness', title_prefix=\"Java Type\")" |
| 1527 | + "plot_feature_distribution(\n", |
| 1528 | + " java_type_features['betweenness'],\n", |
| 1529 | + " 'Betweenness',\n", |
| 1530 | + " java_type_features['shortCodeUnitName'],\n", |
| 1531 | + " log_y_axis=True,\n", |
| 1532 | + " title_prefix=\"Java Type\"\n", |
| 1533 | + ")" |
1481 | 1534 | ] |
1482 | 1535 | }, |
1483 | 1536 | { |
|
0 commit comments