Skip to content

Commit ca6d81f

Browse files
committed
Optimize performance in projection creation
1 parent bca582e commit ca6d81f

2 files changed

Lines changed: 203 additions & 71 deletions

File tree

jupyter/NodeEmbeddingsJava.ipynb

Lines changed: 101 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,54 @@
185185
" return True"
186186
]
187187
},
188+
{
189+
"cell_type": "code",
190+
"execution_count": null,
191+
"id": "25a0fbd3",
192+
"metadata": {},
193+
"outputs": [],
194+
"source": [
195+
"def get_projected_graph_statistics(projection_name: str) -> pd.DataFrame:\n",
196+
" \"\"\"\n",
197+
" Returns the projection statistics for the given projection name.\n",
198+
" Parameters\n",
199+
" ----------\n",
200+
" projection_name : str\n",
201+
" The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n",
202+
" \"\"\"\n",
203+
"\n",
204+
" parameters = dict(\n",
205+
" dependencies_projection=projection_name,\n",
206+
" )\n",
207+
" return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n",
208+
"\n",
209+
"\n",
210+
"def get_projected_graph_node_count(projection_name: str) -> int:\n",
211+
" \"\"\"\n",
212+
" Returns the number of nodes in the projected graph.\n",
213+
" Parameters\n",
214+
" ----------\n",
215+
" projection_name : str\n",
216+
" The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n",
217+
" \"\"\"\n",
218+
"\n",
219+
" graph_statistics = get_projected_graph_statistics(projection_name)\n",
220+
" if graph_statistics.empty:\n",
221+
" return 0\n",
222+
" return graph_statistics[\"nodeCount\"].values[0]"
223+
]
224+
},
225+
{
226+
"cell_type": "code",
227+
"execution_count": null,
228+
"id": "511cb6ea",
229+
"metadata": {},
230+
"outputs": [],
231+
"source": [
232+
"def empty_embeddings() -> pd.DataFrame:\n",
233+
" return pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding', 'x', 'y'])"
234+
]
235+
},
188236
{
189237
"cell_type": "code",
190238
"execution_count": null,
@@ -218,14 +266,14 @@
218266
" The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n",
219267
" dependencies_projection_embedding_dimension : str\n",
220268
" The number of the dimensions and therefore size of the resulting array of floating point numbers\n",
269+
" dependencies_projection_write_property : str\n",
270+
" The name of the node property where the resulting embeddings will be stored. Example: \"embedding\"\n",
221271
" \"\"\"\n",
222272
" \n",
223-
" is_data_available=create_undirected_projection(parameters)\n",
224-
" \n",
225-
" if not is_data_available:\n",
273+
" node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n",
274+
" if node_count <= 0:\n",
226275
" print(\"No projected data for node embeddings calculation available\")\n",
227-
" empty_result = pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
228-
" return empty_result\n",
276+
" return empty_embeddings()\n",
229277
"\n",
230278
" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
231279
" embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n",
@@ -258,22 +306,19 @@
258306
" The number of the dimensions and therefore size of the resulting array of floating point numbers\n",
259307
" \"\"\"\n",
260308
" \n",
261-
" is_data_available=create_undirected_projection(parameters)\n",
262-
" \n",
263-
" if not is_data_available:\n",
309+
" node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n",
310+
" if node_count <= 0:\n",
264311
" print(\"No projected data for node embeddings calculation available\")\n",
265-
" empty_result = pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n",
266-
" return empty_result\n",
312+
" return empty_embeddings()\n",
267313
" \n",
268-
" existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n",
269-
" embeddings=query_cypher_to_data_frame(existing_embeddings_query_filename, parameters)\n",
270-
" if embeddings.empty:\n",
271-
" query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n",
272-
" query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n",
273-
" display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n",
274-
" embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n",
275-
" else:\n",
276-
" print(\"The results have been provided by the query filename: \" + existing_embeddings_query_filename)\n",
314+
" if node_count > 500:\n",
315+
" print(\"GraphSAGE node embeddings training will be skipped for \" + str(node_count) + \" (>500) nodes, since it is computationally expensive and not eagerly needed for demonstration purposes.\")\n",
316+
" return empty_embeddings()\n",
317+
"\n",
318+
" query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n",
319+
" query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n",
320+
" display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n",
321+
" embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n",
277322
" \n",
278323
" display(embeddings.head()) # Display the first entries of the table\n",
279324
" return embeddings"
@@ -610,12 +655,40 @@
610655
"## 1. Java Packages"
611656
]
612657
},
658+
{
659+
"cell_type": "markdown",
660+
"id": "515db579",
661+
"metadata": {},
662+
"source": [
663+
"### 1.1 Create Dependency Graph Projection for Java Packages\n",
664+
"\n",
665+
"The projection and related common parameters are shared across all embedding algorithms below."
666+
]
667+
},
668+
{
669+
"cell_type": "code",
670+
"execution_count": null,
671+
"id": "5631c434",
672+
"metadata": {},
673+
"outputs": [],
674+
"source": [
675+
"common_projection_parameters={\n",
676+
" \"dependencies_projection\": \"java-package-embeddings-notebook\",\n",
677+
" \"dependencies_projection_node\": \"Package\",\n",
678+
" \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
679+
"}\n",
680+
"if create_undirected_projection(common_projection_parameters):\n",
681+
" display(get_projected_graph_statistics(common_projection_parameters[\"dependencies_projection\"]))\n",
682+
"else:\n",
683+
" print(f\"No data for projection creation available: {common_projection_parameters}\")"
684+
]
685+
},
613686
{
614687
"cell_type": "markdown",
615688
"id": "145dca19",
616689
"metadata": {},
617690
"source": [
618-
"### 1.1 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n",
691+
"### 1.2 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n",
619692
"\n",
620693
"[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. Nodes with similar neighborhoods result in node embeddings with similar vectors.\n",
621694
"\n",
@@ -630,9 +703,7 @@
630703
"outputs": [],
631704
"source": [
632705
"java_package_embeddings_parameters={\n",
633-
" \"dependencies_projection\": \"java-package-embeddings-notebook\",\n",
634-
" \"dependencies_projection_node\": \"Package\",\n",
635-
" \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
706+
" **common_projection_parameters,\n",
636707
" \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n",
637708
" \"dependencies_projection_embedding_dimension\":\"32\"\n",
638709
"}\n",
@@ -644,7 +715,7 @@
644715
"id": "76d8bca1",
645716
"metadata": {},
646717
"source": [
647-
"### 1.2 Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n",
718+
"### 1.3 Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n",
648719
"\n",
649720
"This step takes the original node embeddings in their high dimensionality, e.g. 32 floating point numbers, and reduces them into a two-dimensional array for visualization. For more details, see the function \"prepare_node_embeddings_for_2d_visualization\".\n",
650721
"\n",
@@ -671,7 +742,7 @@
671742
"id": "f908c47f",
672743
"metadata": {},
673744
"source": [
674-
"### 1.3 Visualization of the node embeddings reduced to two dimensions"
745+
"### 1.4 Visualization of the node embeddings reduced to two dimensions"
675746
]
676747
},
677748
{
@@ -689,7 +760,7 @@
689760
"id": "b690b9a7",
690761
"metadata": {},
691762
"source": [
692-
"### 1.4 Node Embeddings for Java Packages using HashGNN\n",
763+
"### 1.5 Node Embeddings for Java Packages using HashGNN\n",
693764
"\n",
694765
"[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). Here, the previous three steps (embedding generation, dimensionality reduction, and visualization) are combined into one for HashGNN."
695766
]
@@ -702,9 +773,7 @@
702773
"outputs": [],
703774
"source": [
704775
"java_package_embeddings_parameters={\n",
705-
" \"dependencies_projection\": \"java-package-embeddings-notebook\",\n",
706-
" \"dependencies_projection_node\": \"Package\",\n",
707-
" \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
776+
" **common_projection_parameters,\n",
708777
" \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n",
709778
" \"dependencies_projection_embedding_dimension\":\"64\"\n",
710779
"}\n",
@@ -719,7 +788,7 @@
719788
"id": "248d88b4",
720789
"metadata": {},
721790
"source": [
722-
"### 1.5 Node Embeddings for Java Packages using node2vec"
791+
"### 1.6 Node Embeddings for Java Packages using node2vec"
723792
]
724793
},
725794
{
@@ -730,9 +799,7 @@
730799
"outputs": [],
731800
"source": [
732801
"java_package_embeddings_parameters={\n",
733-
" \"dependencies_projection\": \"java-package-embeddings-notebook\",\n",
734-
" \"dependencies_projection_node\": \"Package\",\n",
735-
" \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n",
802+
" **common_projection_parameters,\n",
736803
" \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n",
737804
" \"dependencies_projection_embedding_dimension\":\"32\"\n",
738805
"}\n",
@@ -747,7 +814,7 @@
747814
"id": "873d6a4e",
748815
"metadata": {},
749816
"source": [
750-
"### 1.6 Node Embeddings for Java Packages using GraphSAGE"
817+
"### 1.7 Node Embeddings for Java Packages using GraphSAGE"
751818
]
752819
},
753820
{

0 commit comments

Comments
 (0)