|
185 | 185 | " return True" |
186 | 186 | ] |
187 | 187 | }, |
| 188 | + { |
| 189 | + "cell_type": "code", |
| 190 | + "execution_count": null, |
| 191 | + "id": "25a0fbd3", |
| 192 | + "metadata": {}, |
| 193 | + "outputs": [], |
| 194 | + "source": [ |
| 195 | + "def get_projected_graph_statistics(projection_name: str) -> pd.DataFrame:\n", |
| 196 | + " \"\"\"\n", |
| 197 | + " Returns the projection statistics for the given parameters.\n", |
| 198 | + " Parameters\n", |
| 199 | + " ----------\n", |
| 200 | + " projection_name : str\n", |
| 201 | + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", |
| 202 | + " \"\"\"\n", |
| 203 | + "\n", |
| 204 | + " parameters = dict(\n", |
| 205 | + " dependencies_projection=projection_name,\n", |
| 206 | + " )\n", |
| 207 | + " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", |
| 208 | + "\n", |
| 209 | + "\n", |
| 210 | + "def get_projected_graph_node_count(projection_name: str) -> int:\n", |
| 211 | + " \"\"\"\n", |
| 212 | + " Returns the number of nodes in the projected graph.\n", |
| 213 | + " Parameters\n", |
| 214 | + " ----------\n", |
| 215 | + " projection_name : str\n", |
| 216 | + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", |
| 217 | + " \"\"\"\n", |
| 218 | + "\n", |
| 219 | + " graph_statistics = get_projected_graph_statistics(projection_name)\n", |
| 220 | + " if graph_statistics.empty:\n", |
| 221 | + " return 0\n", |
| 222 | + " return graph_statistics[\"nodeCount\"].values[0]" |
| 223 | + ] |
| 224 | + }, |
| 225 | + { |
| 226 | + "cell_type": "code", |
| 227 | + "execution_count": null, |
| 228 | + "id": "511cb6ea", |
| 229 | + "metadata": {}, |
| 230 | + "outputs": [], |
| 231 | + "source": [ |
| 232 | + "def empty_embeddings() -> pd.DataFrame:\n", |
| 233 | + " return pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding', 'x', 'y'])" |
| 234 | + ] |
| 235 | + }, |
188 | 236 | { |
189 | 237 | "cell_type": "code", |
190 | 238 | "execution_count": null, |
|
218 | 266 | " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", |
219 | 267 | " dependencies_projection_embedding_dimension : str\n", |
220 | 268 | " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", |
| 269 | + " dependencies_projection_write_property : str\n", |
| 270 | + " The name of the node property where the resulting embeddings will be stored. Example: \"embedding\"\n", |
221 | 271 | " \"\"\"\n", |
222 | 272 | " \n", |
223 | | - " is_data_available=create_undirected_projection(parameters)\n", |
224 | | - " \n", |
225 | | - " if not is_data_available:\n", |
| 273 | + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", |
| 274 | + " if node_count <= 0:\n", |
226 | 275 | " print(\"No projected data for node embeddings calculation available\")\n", |
227 | | - " empty_result = pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n", |
228 | | - " return empty_result\n", |
| 276 | + " return empty_embeddings()\n", |
229 | 277 | "\n", |
230 | 278 | " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
231 | 279 | " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", |
|
258 | 306 | " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", |
259 | 307 | " \"\"\"\n", |
260 | 308 | " \n", |
261 | | - " is_data_available=create_undirected_projection(parameters)\n", |
262 | | - " \n", |
263 | | - " if not is_data_available:\n", |
| 309 | + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", |
| 310 | + " if node_count <= 0:\n", |
264 | 311 | " print(\"No projected data for node embeddings calculation available\")\n", |
265 | | - " empty_result = pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n", |
266 | | - " return empty_result\n", |
| 312 | + " return empty_embeddings()\n", |
267 | 313 | " \n", |
268 | | - " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", |
269 | | - " embeddings=query_cypher_to_data_frame(existing_embeddings_query_filename, parameters)\n", |
270 | | - " if embeddings.empty:\n", |
271 | | - " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", |
272 | | - " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", |
273 | | - " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", |
274 | | - " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", |
275 | | - " else:\n", |
276 | | - " print(\"The results have been provided by the query filename: \" + existing_embeddings_query_filename)\n", |
| 314 | + " if node_count > 500:\n", |
| 315 | + " print(\"GraphSAGE node embeddings training will be skipped for \" + str(node_count) + \" (>500) nodes, since it is computationally expensive and not eagerly needed for demonstration purposes.\")\n", |
| 316 | + " return empty_embeddings()\n", |
| 317 | + "\n", |
| 318 | + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", |
| 319 | + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", |
| 320 | + " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", |
| 321 | + " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", |
277 | 322 | " \n", |
278 | 323 | " display(embeddings.head()) # Display the first entries of the table\n", |
279 | 324 | " return embeddings" |
|
610 | 655 | "## 1. Java Packages" |
611 | 656 | ] |
612 | 657 | }, |
| 658 | + { |
| 659 | + "cell_type": "markdown", |
| 660 | + "id": "515db579", |
| 661 | + "metadata": {}, |
| 662 | + "source": [ |
| 663 | + "### 1.1 Create Dependency Graph Projection for Java Packages\n", |
| 664 | + "\n", |
| 665 | + "The projection and related common parameters are shared across all embedding algorithms below." |
| 666 | + ] |
| 667 | + }, |
| 668 | + { |
| 669 | + "cell_type": "code", |
| 670 | + "execution_count": null, |
| 671 | + "id": "5631c434", |
| 672 | + "metadata": {}, |
| 673 | + "outputs": [], |
| 674 | + "source": [ |
| 675 | + "common_projection_parameters={\n", |
| 676 | + " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", |
| 677 | + " \"dependencies_projection_node\": \"Package\",\n", |
| 678 | + " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", |
| 679 | + "}\n", |
| 680 | + "if create_undirected_projection(common_projection_parameters):\n", |
| 681 | + " display(get_projected_graph_statistics(common_projection_parameters[\"dependencies_projection\"]))\n", |
| 682 | + "else:\n", |
| 683 | + " print(f\"No data for projection creation available: {common_projection_parameters}\")" |
| 684 | + ] |
| 685 | + }, |
613 | 686 | { |
614 | 687 | "cell_type": "markdown", |
615 | 688 | "id": "145dca19", |
616 | 689 | "metadata": {}, |
617 | 690 | "source": [ |
618 | | - "### 1.1 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n", |
| 691 | + "### 1.2 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n", |
619 | 692 | "\n", |
620 | 693 | "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. Nodes with similar neighborhoods result in node embeddings with similar vectors.\n", |
621 | 694 | "\n", |
|
630 | 703 | "outputs": [], |
631 | 704 | "source": [ |
632 | 705 | "java_package_embeddings_parameters={\n", |
633 | | - " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", |
634 | | - " \"dependencies_projection_node\": \"Package\",\n", |
635 | | - " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", |
| 706 | + " **common_projection_parameters,\n", |
636 | 707 | " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", |
637 | 708 | " \"dependencies_projection_embedding_dimension\":\"32\"\n", |
638 | 709 | "}\n", |
|
644 | 715 | "id": "76d8bca1", |
645 | 716 | "metadata": {}, |
646 | 717 | "source": [ |
647 | | - "### 1.2 Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n", |
| 718 | + "### 1.3 Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n", |
648 | 719 | "\n", |
649 | 720 | "This step takes the original node embeddings in their high dimensionality, e.g. 32 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function \"prepare_node_embeddings_for_2d_visualization\".\n", |
650 | 721 | "\n", |
|
671 | 742 | "id": "f908c47f", |
672 | 743 | "metadata": {}, |
673 | 744 | "source": [ |
674 | | - "### 1.3 Visualization of the node embeddings reduced to two dimensions" |
| 745 | + "### 1.4 Visualization of the node embeddings reduced to two dimensions" |
675 | 746 | ] |
676 | 747 | }, |
677 | 748 | { |
|
689 | 760 | "id": "b690b9a7", |
690 | 761 | "metadata": {}, |
691 | 762 | "source": [ |
692 | | - "### 1.4 Node Embeddings for Java Packages using HashGNN\n", |
| 763 | + "### 1.5 Node Embeddings for Java Packages using HashGNN\n", |
693 | 764 | "\n", |
694 | 765 | "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). Here, the previous three steps (embedding generation, dimensionality reduction, and visualization) are combined into one for HashGNN." |
695 | 766 | ] |
|
702 | 773 | "outputs": [], |
703 | 774 | "source": [ |
704 | 775 | "java_package_embeddings_parameters={\n", |
705 | | - " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", |
706 | | - " \"dependencies_projection_node\": \"Package\",\n", |
707 | | - " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", |
| 776 | + " **common_projection_parameters,\n", |
708 | 777 | " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", |
709 | 778 | " \"dependencies_projection_embedding_dimension\":\"64\"\n", |
710 | 779 | "}\n", |
|
719 | 788 | "id": "248d88b4", |
720 | 789 | "metadata": {}, |
721 | 790 | "source": [ |
722 | | - "### 1.5 Node Embeddings for Java Packages using node2vec" |
| 791 | + "### 1.6 Node Embeddings for Java Packages using node2vec" |
723 | 792 | ] |
724 | 793 | }, |
725 | 794 | { |
|
730 | 799 | "outputs": [], |
731 | 800 | "source": [ |
732 | 801 | "java_package_embeddings_parameters={\n", |
733 | | - " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", |
734 | | - " \"dependencies_projection_node\": \"Package\",\n", |
735 | | - " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", |
| 802 | + " **common_projection_parameters,\n", |
736 | 803 | " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", |
737 | 804 | " \"dependencies_projection_embedding_dimension\":\"32\"\n", |
738 | 805 | "}\n", |
|
747 | 814 | "id": "873d6a4e", |
748 | 815 | "metadata": {}, |
749 | 816 | "source": [ |
750 | | - "### 1.6 Node Embeddings for Java Packages using GraphSAGE" |
| 817 | + "### 1.7 Node Embeddings for Java Packages using GraphSAGE" |
751 | 818 | ] |
752 | 819 | }, |
753 | 820 | { |
|
0 commit comments