|
233 | 233 | " return embeddings" |
234 | 234 | ] |
235 | 235 | }, |
| 236 | + { |
| 237 | + "cell_type": "code", |
| 238 | + "execution_count": null, |
| 239 | + "id": "e2b52e51", |
| 240 | + "metadata": {}, |
| 241 | + "outputs": [], |
| 242 | + "source": [ |
| 243 | + "def create_node_embeddings_with_GraphSAGE(parameters: dict) -> pd.DataFrame: \n", |
| 244 | + " \"\"\"\n", |
| 245 | + " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n", |
| 246 | + " enriches it with a degree centrality property for every node, trains GraphSAGE \n", |
| 247 | + " and returns the resulting node embeddings as a DataFrame. Already calculated embeddings are reused instead of training again.\n", |
| 248 | + " \n", |
| 249 | + " parameters\n", |
| 250 | + " ----------\n", |
| 251 | + " dependencies_projection : str\n", |
| 252 | + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", |
| 253 | + " dependencies_projection_node : str\n", |
| 254 | + " The label of the nodes that will be used for the projection. Example: \"Package\"\n", |
| 255 | + " dependencies_projection_weight_property : str\n", |
| 256 | + " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", |
| 257 | + " dependencies_projection_embedding_dimension : str\n", |
| 258 | + " The number of dimensions, and therefore the size, of the resulting embedding array of floating point numbers. Example: \"32\"\n", |
| 259 | + " \"\"\"\n", |
| 260 | + " \n", |
| 261 | + " is_data_available=create_undirected_projection(parameters)\n", |
| 262 | + " \n", |
| 263 | + " if not is_data_available:\n", |
| 264 | + " print(\"No projected data for node embeddings calculation available\")\n", |
| 265 | + " empty_result = pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n", |
| 266 | + " return empty_result\n", |
| 267 | + " \n", |
| 268 | + " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
| 269 | + " embeddings=query_cypher_to_data_frame(existing_embeddings_query_filename, parameters)\n", |
| 270 | + " if embeddings.empty:\n", |
| 271 | + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", |
| 272 | + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", |
| 273 | + " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", |
| 274 | + " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", |
| 275 | + " else:\n", |
| 276 | + " print(\"The results have been provided by the query filename: \" + existing_embeddings_query_filename)\n", |
| 277 | + " \n", |
| 278 | + " display(embeddings.head()) # Display the first entries of the table\n", |
| 279 | + " return embeddings" |
| 280 | + ] |
| 281 | + }, |
236 | 282 | { |
237 | 283 | "cell_type": "code", |
238 | 284 | "execution_count": null, |
|
699 | 745 | "plot_2d_node_embeddings(embeddings_node2vec, get_plot_title(\"TypeScript Modules\", \"node2vec\", scores_node2vec))" |
700 | 746 | ] |
701 | 747 | }, |
| 748 | + { |
| 749 | + "cell_type": "markdown", |
| 750 | + "id": "059d162c", |
| 751 | + "metadata": {}, |
| 752 | + "source": [ |
| 753 | + "### 1.6 Node Embeddings for TypeScript Modules using GraphSAGE" |
| 754 | + ] |
| 755 | + }, |
| 756 | + { |
| 757 | + "cell_type": "code", |
| 758 | + "execution_count": null, |
| 759 | + "id": "2c5664b9", |
| 760 | + "metadata": {}, |
| 761 | + "outputs": [], |
| 762 | + "source": [ |
| 763 | + "typescript_module_embeddings_parameters={\n", |
| 764 | + " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", |
| 765 | + " \"dependencies_projection_node\": \"Module\",\n", |
| 766 | + " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", |
| 767 | + " \"dependencies_projection_write_property\": \"embeddingsGraphSAGE\",\n", |
| 768 | + " \"dependencies_projection_embedding_dimension\":\"32\"\n", |
| 769 | + "}\n", |
| 770 | + "embeddings_graphSAGE= create_node_embeddings_with_GraphSAGE(typescript_module_embeddings_parameters)\n", |
| 771 | + "embeddings_graphSAGE = prepare_node_embeddings_for_2d_visualization(embeddings_graphSAGE)\n", |
| 772 | + "scores_graphSAGE = CommunityScores.calculate(embeddings_graphSAGE)\n", |
| 773 | + "plot_2d_node_embeddings(embeddings_graphSAGE, get_plot_title(\"TypeScript Modules\", \"GraphSAGE\", scores_graphSAGE))" |
| 774 | + ] |
| 775 | + }, |
702 | 776 | { |
703 | 777 | "cell_type": "markdown", |
704 | 778 | "id": "c5c73bd3", |
|
717 | 791 | "outputs": [], |
718 | 792 | "source": [ |
719 | 793 | "plot_all_2d_node_embeddings_in_grid(\n", |
720 | | - " embeddings=[embeddings_fastRP, embeddings_hashGNN, embeddings_node2vec],\n", |
| 794 | + " embeddings=[embeddings_fastRP, embeddings_hashGNN, embeddings_node2vec, embeddings_graphSAGE],\n", |
721 | 795 | " titles=[\n", |
722 | 796 | " get_plot_title(\"TypeScript Modules\", \"Fast Random Projection\", scores_fastRP),\n", |
723 | 797 | " get_plot_title(\"TypeScript Modules\", \"HashGNN\", scores_hashGNN),\n", |
724 | 798 | " get_plot_title(\"TypeScript Modules\", \"node2vec\", scores_node2vec),\n", |
| 799 | + " get_plot_title(\"TypeScript Modules\", \"GraphSAGE\", scores_graphSAGE),\n", |
725 | 800 | " ],\n", |
726 | 801 | ")" |
727 | 802 | ] |
| 803 | + }, |
| 804 | + { |
| 805 | + "cell_type": "markdown", |
| 806 | + "id": "75acc17d", |
| 807 | + "metadata": {}, |
| 808 | + "source": [ |
| 809 | + "#### Interpreting Node Embedding Results\n", |
| 810 | + "\n", |
| 811 | + "##### Summary of Observations\n", |
| 812 | + "\n", |
| 813 | + "- **FastRP** and **node2vec** show clear, well-separated clusters\n", |
| 814 | + "- **HashGNN** and **GraphSAGE** produce more diffuse embeddings\n", |
| 815 | + "- Silhouette scores are high for FastRP / node2vec and low for HashGNN / GraphSAGE\n", |
| 816 | + "\n", |
| 817 | + "These differences are expected and stem from the **fundamentally different objectives** of the algorithms.\n", |
| 818 | + "\n", |
| 819 | + "##### Key Takeaways\n", |
| 820 | + "\n", |
| 821 | + "- **FastRP and node2vec** are well-suited for **community discovery and visualization**\n", |
| 822 | + "- **HashGNN** is best viewed as a **fast structural fingerprint**, not a clustering embedding\n", |
| 823 | + "- **GraphSAGE** requires meaningful node features or labels and performs poorly in dense, feature-poor settings\n", |
| 824 | + "- Poor silhouette scores for HashGNN and GraphSAGE are **expected and theoretically consistent**" |
| 825 | + ] |
728 | 826 | } |
729 | 827 | ], |
730 | 828 | "metadata": { |
|
0 commit comments