diff --git a/conda-environment.yml b/conda-environment.yml index 0cf45ac5a..43f0cabdf 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -15,8 +15,6 @@ dependencies: - pandas=2.2.3 - pip=25.0.1 - setuptools=80.9.0 # opentsne uses sklearn.base uses joblib uses distutils missing in Python >= 12 (TODO use native openTSNE?) - - typing-extensions=4.15.0 # Also needed for opentsne and Python >= 3.12 - - opentsne=1.0.4 # to visualize node embeddings in 2D (t-SNE dimensionality reduction) - wordcloud=1.9.4 - monotonic=1.6 - plotly=6.5.0 diff --git a/cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher b/cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher new file mode 100644 index 000000000..8d7bf3ba9 --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher @@ -0,0 +1,28 @@ +// Node Embeddings 0b: Prepare: Calculate Degree Property. + +CALL gds.degree.mutate( + $dependencies_projection + '-cleaned', { + orientation: 'UNDIRECTED' + ,relationshipWeightProperty: CASE $dependencies_projection_weight_property WHEN '' THEN null ELSE $dependencies_projection_weight_property END + ,mutateProperty: 'degreeForNodeEmbeddings' +}) + YIELD nodePropertiesWritten + ,preProcessingMillis + ,computeMillis + ,mutateMillis + ,postProcessingMillis + ,centralityDistribution +RETURN nodePropertiesWritten + ,preProcessingMillis + ,computeMillis + ,mutateMillis + ,postProcessingMillis + ,centralityDistribution.min + ,centralityDistribution.mean + ,centralityDistribution.max + ,centralityDistribution.p50 + ,centralityDistribution.p75 + ,centralityDistribution.p90 + ,centralityDistribution.p95 + ,centralityDistribution.p99 + ,centralityDistribution.p999 \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher b/cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher new file mode 100644 index 000000000..8eebe754c --- /dev/null +++ 
b/cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher @@ -0,0 +1,21 @@ +// Node Embeddings 0c: Prepare: Drop a potentially existing GraphSAGE model. + +CALL gds.model.drop($dependencies_projection + '-graphSAGE', false) +YIELD modelName, + modelType, + modelInfo, + creationTime, + trainConfig, + graphSchema, + loaded, + stored, + published +RETURN modelName, + modelType, + modelInfo, + creationTime, + trainConfig, + graphSchema, + loaded, + stored, + published \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher b/cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher new file mode 100644 index 000000000..924e6a3b1 --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher @@ -0,0 +1,27 @@ +// Node Embeddings 4b using GraphSAGE (Graph Neural Networks): Train. Requires: "Node_Embeddings_0b_Prepare_Degree.cypher". + +CALL gds.beta.graphSage.train( + $dependencies_projection + '-cleaned', { + modelName: $dependencies_projection + '-graphSAGE' + ,featureProperties: ['degreeForNodeEmbeddings'] + ,embeddingDimension: toInteger($dependencies_projection_embedding_dimension) + ,relationshipWeightProperty: CASE $dependencies_projection_weight_property WHEN '' THEN null ELSE $dependencies_projection_weight_property END + ,batchSize: 64 + ,activationFunction: 'relu' + ,sampleSizes: [25, 20, 20, 10] + //,aggregator: 'pool' + //,epochs: 10 + //,penaltyL2: 0.0000001 + //,tolerance: 0.0001 + //,learningRate: 0.1 + //,searchDepth: 5 + ,randomSeed: 47 + } +) +YIELD modelInfo AS info, trainMillis +RETURN + info.modelName AS modelName, + info.metrics.didConverge AS didConverge, + info.metrics.ranEpochs AS ranEpochs, + info.metrics.epochLosses AS epochLosses, + trainMillis AS trainingTimeMilliseconds \ No newline at end of file diff --git a/cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher b/cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher new file mode 100644 index 
000000000..df2c74f40 --- /dev/null +++ b/cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher @@ -0,0 +1,22 @@ +// Node Embeddings 4d using GraphSAGE: Stream. Requires "Node_Embeddings_4b_GraphSAGE_Train.cypher". + +CALL gds.beta.graphSage.stream( + $dependencies_projection + '-cleaned', { + modelName: $dependencies_projection + '-graphSAGE' + } +) +YIELD nodeId, embedding + WITH gds.util.asNode(nodeId) AS codeUnit + ,embedding +OPTIONAL MATCH (artifact:Java:Artifact)-[:CONTAINS]->(codeUnit) + WITH *, artifact.name AS artifactName +OPTIONAL MATCH (projectRoot:Directory)<-[:HAS_ROOT]-(proj:TS:Project)-[:CONTAINS]->(codeUnit) + WITH *, last(split(projectRoot.absoluteFileName, '/')) AS projectName + RETURN DISTINCT + coalesce(codeUnit.fqn, codeUnit.globalFqn, codeUnit.fileName, codeUnit.signature, codeUnit.name) AS codeUnitName + ,codeUnit.name AS shortCodeUnitName + ,elementId(codeUnit) AS nodeElementId + ,coalesce(artifactName, projectName) AS projectName + ,coalesce(codeUnit.communityLeidenId, 0) AS communityId + ,coalesce(codeUnit.centralityPageRank, 0.01) AS centrality + ,embedding \ No newline at end of file diff --git a/jupyter/NodeEmbeddingsJava.ipynb b/jupyter/NodeEmbeddingsJava.ipynb index 1a340fdd9..69ba0f8ab 100644 --- a/jupyter/NodeEmbeddingsJava.ipynb +++ b/jupyter/NodeEmbeddingsJava.ipynb @@ -6,7 +6,7 @@ "id": "2f0eabc4", "metadata": {}, "source": [ - "# Node Embeddings\n", + "# Node Embeddings for Java\n", "\n", "This notebook demonstrates different methods for node embeddings and how to further reduce their dimensionality to be able to visualize them in a 2D plot. \n", "\n", @@ -20,7 +20,7 @@ "- Clean the data, e.g. filter out very few nodes with extremely high degree that aren't actually that important\n", "- Try directed vs. undirected projections\n", "- Tune the embedding algorithm, e.g. use a higher dimensionality\n", - "- Tune t-SNE that is used to reduce the node embeddings dimension to two dimensions for visualization. 
\n", + "- Tune UMAP that is used to reduce the node embeddings dimension to two dimensions for visualization. \n", "\n", "It could also be the case that the node embeddings are good enough and well suited the way they are despite their visualization for the down stream task like node classification or link prediction. In that case it makes sense to see how the whole pipeline performs before tuning the node embeddings in detail. \n", "\n", @@ -37,7 +37,7 @@ "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", "- [Tutorial: Applied Graph Embeddings](https://neo4j.com/developer/graph-data-science/applied-graph-embeddings)\n", "- [Visualizing the embeddings in 2D](https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb)\n", - "- [scikit-learn TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n", + "- [UMAP](https://umap-learn.readthedocs.io/en/latest)\n", "- [AttributeError: 'list' object has no attribute 'shape'](https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape)\n", "- [Fast Random Projection (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp)\n", "- [HashGNN (neo4j)](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn)\n", @@ -64,18 +64,6 @@ "" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "b23a5047", - "metadata": {}, - "outputs": [], - "source": [ - "# Main Colormap\n", - "# main_color_map = 'nipy_spectral'\n", - "main_color_map = 'viridis'" - ] - }, { "cell_type": "code", "execution_count": null, @@ -89,8 +77,10 @@ "import matplotlib.pyplot as plot\n", "import typing as typ\n", "import numpy as np\n", - "from openTSNE.sklearn import TSNE\n", - "from neo4j import GraphDatabase" + "import umap\n", + "from neo4j import GraphDatabase\n", + "from sklearn.metrics import silhouette_score\n", + "from sklearn.metrics 
import davies_bouldin_score" ] }, { @@ -100,9 +90,17 @@ "metadata": {}, "outputs": [], "source": [ - "from openTSNE import __version__ as openTSNE_version\n", - "print('The openTSNE version is: {}'.format(openTSNE_version))\n", - "print('The pandas version is: {}'.format(pd.__version__))\n" + "print('The numpy version is: {}'.format(np.__version__))\n", + "print('The pandas version is: {}'.format(pd.__version__))\n", + "\n", + "from umap import __version__ as umap_version\n", + "print('The UMAP version is: {}'.format(umap_version))\n", + "\n", + "from matplotlib import __version__ as matplotlib_version\n", + "print('The matplotlib version is: {}'.format(matplotlib_version))\n", + "\n", + "from sklearn import __version__ as sklearn_version\n", + "print('The sklearn version is: {}'.format(sklearn_version))" ] }, { @@ -159,8 +157,6 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO option to choose between directed and undirected projection\n", - "\n", "def create_undirected_projection(parameters: dict) -> bool: \n", " \"\"\"\n", " Creates an undirected homogenous in-memory Graph projection for/with Neo4j Graph Data Science Plugin.\n", @@ -189,6 +185,54 @@ " return True" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "25a0fbd3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_projected_graph_statistics(projection_name: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns the projection statistics for the given parameters.\n", + " Parameters\n", + " ----------\n", + " projection_name : str\n", + " The name prefix for the in-memory projection for dependencies. 
Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + "\n", + " parameters = dict(\n", + " dependencies_projection=projection_name,\n", + " )\n", + " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", + "\n", + "\n", + "def get_projected_graph_node_count(projection_name: str) -> int:\n", + " \"\"\"\n", + " Returns the number of nodes in the projected graph.\n", + " Parameters\n", + " ----------\n", + " projection_name : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + "\n", + " graph_statistics = get_projected_graph_statistics(projection_name)\n", + " if graph_statistics.empty:\n", + " return 0\n", + " return graph_statistics[\"nodeCount\"].values[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "511cb6ea", + "metadata": {}, + "outputs": [], + "source": [ + "def empty_embeddings() -> pd.DataFrame:\n", + " return pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding', 'x', 'y'])" + ] + }, { "cell_type": "code", "execution_count": null, @@ -196,11 +240,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Feature ideas\n", - "# TODO option to choose between directed and undirected projection\n", - "# TODO option to not read already existing node embeddings to experiment with different (hpyer) parameters\n", - "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n", - "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n", + "# Feature Ideas:\n", + "# - Option to choose between directed and undirected projection?\n", + "# - Option to not read already existing node embeddings to experiment with different hyper-parameters?\n", + "# - Run a community detection algorithm co-located in here when \"communityId\" is 
missing?\n", + "# - Run a centrality algorithm co-located in here when \"centrality\" score is missing?\n", "\n", "def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n", " \"\"\"\n", @@ -222,14 +266,14 @@ " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", " dependencies_projection_embedding_dimension : str\n", " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " dependencies_projection_write_property : str\n", + " The name of the node property where the resulting embeddings will be stored. Example: \"embedding\n", " \"\"\"\n", " \n", - " is_data_available=create_undirected_projection(parameters)\n", - " \n", - " if not is_data_available:\n", + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", + " if node_count <= 0:\n", " print(\"No projected data for node embeddings calculation available\")\n", - " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n", - " return empty_result\n", + " return empty_embeddings()\n", "\n", " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", @@ -238,17 +282,83 @@ ] }, { - "cell_type": "markdown", - "id": "f6ec6a9b", + "cell_type": "code", + "execution_count": null, + "id": "48cb52c6", "metadata": {}, + "outputs": [], "source": [ - "### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "def create_node_embeddings_with_GraphSAGE(parameters: dict) -> pd.DataFrame: \n", + " \"\"\"\n", + " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n", + " enriches it with a degree centrality property for every node, trains GraphSAGE 
\n", + " and returns the resulting node embeddings as DataFrame.\n", + " \n", + " parameters\n", + " ----------\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " dependencies_projection_node : str\n", + " The label of the nodes that will be used for the projection. Example: \"Package\"\n", + " dependencies_projection_weight_property : str\n", + " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", + " dependencies_projection_embedding_dimension : str\n", + " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " \"\"\"\n", + " \n", + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", + " if node_count <= 0:\n", + " print(\"No projected data for node embeddings calculation available\")\n", + " return empty_embeddings()\n", + " \n", + " if node_count > 500:\n", + " print(\"GraphSAGE node embeddings training will be skipped for \" + str(node_count) + \" (>500) nodes, since it is computationally expensive and not eagerly needed for demonstration purposes.\")\n", + " return empty_embeddings()\n", + "\n", + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", + " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", + " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", + " \n", + " display(embeddings.head()) # Display the first entries of the table\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e3dd1de", + "metadata": {}, + "outputs": [], + 
"source": [ + "class CommunityScores:\n", + " \n", + " def __init__(self, silhouette_score: float, davies_bouldin_score: float):\n", + " self.silhouette_score = silhouette_score\n", + " self.davies_bouldin_score = davies_bouldin_score\n", + "\n", + " def __repr__(self):\n", + " return f\"CommunityScores(silhouette_score={self.silhouette_score}, davies_bouldin_score={self.davies_bouldin_score})\"\n", "\n", - "The following function takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. \n", + " @classmethod\n", + " def calculate(cls, data: pd.DataFrame, metric: str = \"cosine\"):\n", + " \"\"\"\n", + " data: pandas DataFrame with columns:\n", + " - 'communityId': int\n", + " - 'embedding': array-like (same length for all rows)\n", + " metric: 'cosine', 'euclidean', etc.\n", + " \"\"\"\n", + " # ensure we pass a sequence/array of arrays to sklearn / numpy\n", + " X = np.array(data[\"embedding\"].to_list())\n", + " labels = data[\"communityId\"].astype(int)\n", "\n", - "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", + " if len(np.unique(labels)) < 2:\n", + " print(\"Community scores require at least 2 communities\")\n", + " return cls(0.0, 0.0)\n", "\n", - "(see https://opentsne.readthedocs.io)" + " silhouette = silhouette_score(X, labels, metric=metric)\n", + " davies_bouldin = davies_bouldin_score(X, labels)\n", + " return cls(float(silhouette), float(davies_bouldin))" ] }, { @@ -261,39 +371,131 @@ "def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:\n", " \"\"\"\n", " Reduces the dimensionality of the node embeddings (e.g. 
64 floating point numbers in an array)\n", - " to two dimensions for 2D visualization.\n", - " see https://opentsne.readthedocs.io\n", + " to two dimensions for 2D visualization using Uniform Manifold Approximation and Projection (UMAP).\n", + " see https://umap-learn.readthedocs.io\n", " \"\"\"\n", "\n", - " if embeddings.empty: \n", - " print(\"No projected data for node embeddings dimensionality reduction available\")\n", + " if embeddings.empty:\n", + " print(\"No projected data for node embeddings dimensionality reduction available with UMAP.\")\n", " return embeddings\n", - " \n", - " # Calling the fit_transform method just with a list doesn't seem to work (anymore?). \n", - " # It leads to an error with the following message: 'list' object has no attribute 'shape'\n", - " # This can be solved by converting the list to a numpy array using np.array(..).\n", - " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", + "\n", + " # Convert the list of embeddings to a numpy array\n", " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", "\n", - " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", - " # of the previously calculated node embeddings to 2 dimensions for visualization\n", - " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=1, random_state=47)\n", - " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", - " display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", + " # Use UMAP to reduce the dimensionality to 2D for visualization\n", + " reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=42, n_jobs=1, verbose=False)\n", + " two_dimensional_node_embeddings = reducer.fit_transform(embeddings_as_numpy_array)\n", + " \n", + " # Convert to dense numpy array (works for both sparse and dense input)\n", + " 
two_dimensional_node_embeddings = np.asarray(two_dimensional_node_embeddings)\n", + " # display(two_dimensional_node_embeddings.shape) # Display the shape of the UMAP result\n", "\n", " # Create a new DataFrame with the results of the 2 dimensional node embeddings\n", " # and the code unit and artifact name of the query above as preparation for the plot\n", - " node_embeddings_for_visualization = pd.DataFrame(data = {\n", - " \"codeUnit\": embeddings.codeUnitName,\n", - " \"artifact\": embeddings.projectName,\n", - " \"communityId\": embeddings.communityId,\n", - " \"centrality\": embeddings.centrality,\n", - " \"x\": [value[0] for value in two_dimension_node_embeddings],\n", - " \"y\": [value[1] for value in two_dimension_node_embeddings]\n", - " })\n", - " display(node_embeddings_for_visualization.head()) # Display the first line of the results\n", - " return node_embeddings_for_visualization\n", - " " + " embeddings[\"x\"] = [value[0] for value in two_dimensional_node_embeddings]\n", + " embeddings[\"y\"] = [value[1] for value in two_dimensional_node_embeddings]\n", + " # display(embeddings.head()) # Display the first line of the results\n", + " \n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a132ce29", + "metadata": {}, + "outputs": [], + "source": [ + "def find_community_medoids(\n", + " data: pd.DataFrame,\n", + " community_column_name: str = \"communityId\",\n", + " x_column_name: str = \"x\",\n", + " y_column_name: str = \"y\",\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " Return one representative (geometric, less prone to outliers medoid) row per community.\n", + " The medoid is defined as the point closest to the community centroid.\n", + "\n", + " Parameters\n", + " ----------\n", + " data : pd.DataFrame\n", + " Input dataframe containing embeddings\n", + " community_column_name : str\n", + " Column identifying communities\n", + " x_column_name, y_column_name : str\n", + " Coordinate columns\n", + 
"\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " Subset of df with one row per community (the medoids)\n", + " \"\"\"\n", + " medoids = []\n", + "\n", + " for _, group in data.groupby(community_column_name):\n", + " center_x = group[x_column_name].median()\n", + " center_y = group[y_column_name].median()\n", + "\n", + " distances = (group[x_column_name] - center_x) ** 2 + (group[y_column_name] - center_y) ** 2\n", + " medoid_index = distances.idxmin()\n", + "\n", + " medoids.append(data.loc[medoid_index])\n", + "\n", + " return pd.DataFrame(medoids).reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41082be5", + "metadata": {}, + "outputs": [], + "source": [ + "def find_top_k_community_medoids(data, k=20, **kwargs):\n", + " top_communities = (\n", + " data.groupby(\"communityId\")\n", + " .size()\n", + " .nlargest(k)\n", + " .index\n", + " )\n", + " return find_community_medoids(\n", + " data[data.communityId.isin(top_communities)],\n", + " **kwargs\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15f07225", + "metadata": {}, + "outputs": [], + "source": [ + "plot_annotation_style: dict = {\n", + " 'textcoords': 'offset points',\n", + " 'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3),\n", + " 'fontsize': 6,\n", + " 'backgroundcolor': 'white',\n", + " 'bbox': dict(boxstyle='round,pad=0.3',\n", + " edgecolor='silver',\n", + " facecolor='whitesmoke',\n", + " alpha=0.8\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a7d6307", + "metadata": {}, + "outputs": [], + "source": [ + "def get_plot_title(code_unit_type:str, algorithm_name: str, scores: CommunityScores) -> str:\n", + " main_title = f\"{code_unit_type} dependency graph node embeddings\"\n", + " scores_description = f\"Silhouette Score (aim higher)={scores.silhouette_score:.4f}, Davies-Bouldin Score (aim lower)={scores.davies_bouldin_score:.4f}\"\n", + " 
algorithm_description = f\"{algorithm_name} -> UMAP\"\n", + " return f\"{main_title}\\n{scores_description}\\n{algorithm_description}\"" ] }, { @@ -303,19 +505,115 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n", - " if node_embeddings_for_visualization.empty:\n", + "def plot_2d_node_embeddings_on_axes(axes: plot.Axes, embeddings: pd.DataFrame, title: str):\n", + " if embeddings.empty:\n", " print(\"No projected data to plot available\")\n", " return\n", + " \n", + " def normalize(values: pd.Series) -> pd.Series:\n", + " max_value = values.max()\n", + " min_value = values.min()\n", + " range_value = max_value - min_value\n", + " return (values - min_value) / range_value if range_value != 0 else values\n", + "\n", + " normalized_centrality = normalize(embeddings.centrality)\n", + " base_size = np.clip(normalized_centrality * 50, None, 30) + 2\n", "\n", - " plot.scatter(\n", - " x=node_embeddings_for_visualization.x,\n", - " y=node_embeddings_for_visualization.y,\n", - " s=node_embeddings_for_visualization.centrality * 60,\n", - " c=node_embeddings_for_visualization.communityId,\n", - " cmap=main_color_map,\n", + " common_parameters = {\n", + " 'x': embeddings.x,\n", + " 'y': embeddings.y,\n", + " 'c': embeddings.communityId,\n", + " 'cmap': 'nipy_spectral', # nipy_spectral, gist_ncar, jet, turbo, gist_stern, rainbow, viridis\n", + " 'linewidths': 1,\n", + " }\n", + " \n", + " # Transparent 'halo' around the main points\n", + " axes.scatter(\n", + " **common_parameters,\n", + " s=base_size * 6 + 12,\n", + " alpha=0.12,\n", " )\n", - " plot.title(title)\n", + "\n", + " # Main points\n", + " axes.scatter(\n", + " **common_parameters,\n", + " s=base_size,\n", + " alpha=1.0\n", + " )\n", + " \n", + " # Annotate medoids (representative points of communities, the node closest to the community center)\n", + " medoids = find_top_k_community_medoids(embeddings)\n", + " for _, row in 
medoids.iterrows():\n", + " axes.annotate(\n", + " f\"{row.shortCodeUnitName}({row.communityId})\",\n", + " (row.x, row.y),\n", + " xytext=(5, 5),\n", + " **plot_annotation_style,\n", + " )\n", + " \n", + " # Annotate top centrality nodes\n", + " top_centrality_nodes = embeddings.nlargest(5, 'centrality')\n", + " for _, row in top_centrality_nodes.iterrows():\n", + " axes.annotate(\n", + " row.shortCodeUnitName,\n", + " (row.x, row.y),\n", + " xytext=(5, 5),\n", + " color='grey',\n", + " **plot_annotation_style,\n", + " )\n", + "\n", + " # Finalize plot\n", + " axes.set_title(title, fontsize=9)\n", + " axes.set_xticks([])\n", + " axes.set_yticks([])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59a55342", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_2d_node_embeddings(embeddings: pd.DataFrame, title: str, **kwargs):\n", + " if embeddings.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " figure, axes = plot.subplots(figsize=(8, 6))\n", + " plot_2d_node_embeddings_on_axes(axes=axes, embeddings=embeddings, title=title, **kwargs)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdadf585", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_all_2d_node_embeddings_in_grid(\n", + " embeddings: typ.List[pd.DataFrame],\n", + " titles: typ.List[str],\n", + " number_of_columns: int = 2\n", + "):\n", + " if embeddings[0].empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " number_of_rows = (len(embeddings) + number_of_columns - 1) // number_of_columns\n", + " figure, axes = plot.subplots(number_of_rows, number_of_columns, figsize=(6 * number_of_columns, 4.5 * number_of_rows))\n", + " axes = np.array(axes).flatten()\n", + " i = -1\n", + "\n", + " for i, (node_embeddings_for_visualization, title) in enumerate(zip(embeddings, titles)):\n", + " 
plot_2d_node_embeddings_on_axes(axes=axes[i], embeddings=node_embeddings_for_visualization, title=title)\n", + "\n", + " for j in range(i + 1, len(axes)):\n", + " axes[j].axis('off')\n", + "\n", + " plot.tight_layout()\n", " plot.show()" ] }, @@ -350,22 +648,39 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "c2496caf", + "cell_type": "markdown", + "id": "0c68aa20", "metadata": {}, - "outputs": [], "source": [ - "# Main Colormap\n", - "main_color_map = 'nipy_spectral'" + "## 1. Java Packages" ] }, { "cell_type": "markdown", - "id": "0c68aa20", + "id": "515db579", "metadata": {}, "source": [ - "## 1. Java Packages" + "### 1.1 Create Dependency Graph Projection for Java Packages\n", + "\n", + "The projection and related common parameters are shared across all embedding algorithms below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5631c434", + "metadata": {}, + "outputs": [], + "source": [ + "common_projection_parameters={\n", + " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Package\",\n", + " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", + "}\n", + "if create_undirected_projection(common_projection_parameters):\n", + " display(get_projected_graph_statistics(common_projection_parameters[\"dependencies_projection\"]))\n", + "else:\n", + " print(f\"No data for projection creation available: {common_projection_parameters}\")" ] }, { @@ -373,7 +688,7 @@ "id": "145dca19", "metadata": {}, "source": [ - "### 1.1 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n", + "### 1.2 Generate Node Embeddings using Fast Random Projection (Fast RP) for Java Packages\n", "\n", "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving most of the distance information. 
Nodes with similar neighborhood result in node embedding with similar vectors.\n", "\n", @@ -388,13 +703,11 @@ "outputs": [], "source": [ "java_package_embeddings_parameters={\n", - " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Package\",\n", - " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", + " **common_projection_parameters,\n", " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n" + "embeddings_fastRP = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n" ] }, { @@ -402,9 +715,15 @@ "id": "76d8bca1", "metadata": {}, "source": [ - "### 1.2 Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "### 1.3 Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n", + "\n", + "This step takes the original node embeddings in their high dimensionality, e.g. 32 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function \"prepare_node_embeddings_for_2d_visualization\".\n", + "\n", + "**About UMAP:**\n", "\n", - "This step takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function declaration for \"prepare_node_embeddings_for_2d_visualization\"." 
+ "> The embedding is found by searching for a low dimensional projection of the data that has the closest possible equivalent fuzzy topological structure.\n", + "\n", + "(see https://umap-learn.readthedocs.io)" ] }, { @@ -414,7 +733,8 @@ "metadata": {}, "outputs": [], "source": [ - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)" + "embeddings_fastRP = prepare_node_embeddings_for_2d_visualization(embeddings_fastRP)\n", + "scores_fastRP = CommunityScores.calculate(embeddings_fastRP)" ] }, { @@ -422,7 +742,7 @@ "id": "f908c47f", "metadata": {}, "source": [ - "### 1.3 Visualization of the node embeddings reduced to two dimensions" + "### 1.4 Visualization of the node embeddings reduced to two dimensions" ] }, { @@ -432,10 +752,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Package positioned by their dependency relationships (FastRP node embeddings + t-SNE)\"\n", - ")" + "plot_2d_node_embeddings(embeddings_fastRP, get_plot_title(\"Java Packages\", \"Fast Random Projection\", scores_fastRP))" ] }, { @@ -443,7 +760,7 @@ "id": "b690b9a7", "metadata": {}, "source": [ - "### 1.4 Node Embeddings for Java Packages using HashGNN\n", + "### 1.5 Node Embeddings for Java Packages using HashGNN\n", "\n", "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). Here, the latter 3 steps are combined into one for HashGNN." 
] @@ -456,18 +773,14 @@ "outputs": [], "source": [ "java_package_embeddings_parameters={\n", - " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Package\",\n", - " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", + " **common_projection_parameters,\n", " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", " \"dependencies_projection_embedding_dimension\":\"64\"\n", "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n", - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Package positioned by their dependency relationships (HashGNN node embeddings + t-SNE)\"\n", - ")" + "embeddings_hashGNN = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n", + "embeddings_hashGNN = prepare_node_embeddings_for_2d_visualization(embeddings_hashGNN)\n", + "scores_hashGNN = CommunityScores.calculate(embeddings_hashGNN)\n", + "plot_2d_node_embeddings(embeddings_hashGNN, get_plot_title(\"Java Packages\", \"HashGNN\", scores_hashGNN))" ] }, { @@ -475,7 +788,7 @@ "id": "248d88b4", "metadata": {}, "source": [ - "### 2.5 Node Embeddings for Java Packages using node2vec" + "### 1.6 Node Embeddings for Java Packages using node2vec" ] }, { @@ -484,21 +797,96 @@ "id": "62c40c45", "metadata": {}, "outputs": [], + "source": [ + "java_package_embeddings_parameters={\n", + " **common_projection_parameters,\n", + " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", + " \"dependencies_projection_embedding_dimension\":\"32\"\n", + "}\n", + "embeddings_node2vec = 
create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n", + "embeddings_node2vec = prepare_node_embeddings_for_2d_visualization(embeddings_node2vec)\n", + "scores_node2vec = CommunityScores.calculate(embeddings_node2vec)\n", + "plot_2d_node_embeddings(embeddings_node2vec, get_plot_title(\"Java Packages\", \"node2vec\", scores_node2vec))" + ] + }, + { + "cell_type": "markdown", + "id": "873d6a4e", + "metadata": {}, + "source": [ + "### 1.7 Node Embeddings for Java Packages using GraphSAGE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f25a062f", + "metadata": {}, + "outputs": [], "source": [ "java_package_embeddings_parameters={\n", " \"dependencies_projection\": \"java-package-embeddings-notebook\",\n", " \"dependencies_projection_node\": \"Package\",\n", " \"dependencies_projection_weight_property\": \"weight25PercentInterfaces\",\n", - " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", + " \"dependencies_projection_write_property\": \"embeddingsGraphSAGE\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n", - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Package positioned by their dependency relationships (node2vec node embeddings + t-SNE)\"\n", + "embeddings_graphSAGE= create_node_embeddings_with_GraphSAGE(java_package_embeddings_parameters)\n", + "embeddings_graphSAGE = prepare_node_embeddings_for_2d_visualization(embeddings_graphSAGE)\n", + "scores_graphSAGE = CommunityScores.calculate(embeddings_graphSAGE)\n", + "plot_2d_node_embeddings(embeddings_graphSAGE, get_plot_title(\"Java Packages\", \"GraphSAGE\", scores_graphSAGE))" + 
] + }, + { + "cell_type": "markdown", + "id": "b9a5d57b", + "metadata": {}, + "source": [ + "### 2. Compare Node Embeddings\n", + "\n", + "In this section we will compare all node embedding methods from above in a grid plot. This helps to see how well the different algorithms were able to capture the structure of the graph and how well the communities are separated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c725a0ba", + "metadata": {}, + "outputs": [], + "source": [ + "plot_all_2d_node_embeddings_in_grid(\n", + " embeddings=[embeddings_fastRP, embeddings_hashGNN, embeddings_node2vec, embeddings_graphSAGE],\n", + " titles=[\n", + " get_plot_title(\"Java Packages\", \"Fast Random Projection\", scores_fastRP),\n", + " get_plot_title(\"Java Packages\", \"HashGNN\", scores_hashGNN),\n", + " get_plot_title(\"Java Packages\", \"node2vec\", scores_node2vec),\n", + " get_plot_title(\"Java Packages\", \"GraphSAGE\", scores_graphSAGE),\n", + " ],\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "6d55b6f2", + "metadata": {}, + "source": [ + "#### Interpreting Node Embedding Results\n", + "\n", + "##### Summary of Observations\n", + "\n", + "- **FastRP** and **node2vec** show clear, well-separated clusters\n", + "- **HashGNN** and **GraphSAGE** produce more diffuse embeddings\n", + "- Silhouette scores are high for FastRP / node2vec and low for HashGNN / GraphSAGE\n", + "\n", + "These differences are expected and stem from the **fundamentally different objectives** of the algorithms.\n", + "\n", + "##### Key Takeaways\n", + "\n", + "- **FastRP and node2vec** are well-suited for **community discovery and visualization**\n", + "- **HashGNN** is best viewed as a **fast structural fingerprint**, not a clustering embedding\n", + "- **GraphSAGE** requires meaningful node features or labels and performs poorly in dense, feature-poor settings\n", + "- Poor silhouette scores for HashGNN and GraphSAGE are **expected and theoretically consistent**" + ] 
} ], "metadata": { diff --git a/jupyter/NodeEmbeddingsTypescript.ipynb b/jupyter/NodeEmbeddingsTypescript.ipynb index bd19314d8..af08c1839 100644 --- a/jupyter/NodeEmbeddingsTypescript.ipynb +++ b/jupyter/NodeEmbeddingsTypescript.ipynb @@ -6,7 +6,7 @@ "id": "2f0eabc4", "metadata": {}, "source": [ - "# Node Embeddings\n", + "# Node Embeddings for TypeScript\n", "\n", "This notebook demonstrates different methods for node embeddings and how to further reduce their dimensionality to be able to visualize them in a 2D plot. \n", "\n", @@ -20,7 +20,7 @@ "- Clean the data, e.g. filter out very few nodes with extremely high degree that aren't actually that important\n", "- Try directed vs. undirected projections\n", "- Tune the embedding algorithm, e.g. use a higher dimensionality\n", - "- Tune t-SNE that is used to reduce the node embeddings dimension to two dimensions for visualization. \n", + "- Tune UMAP that is used to reduce the node embeddings dimension to two dimensions for visualization. \n", "\n", "It could also be the case that the node embeddings are good enough and well suited the way they are despite their visualization for the down stream task like node classification or link prediction. In that case it makes sense to see how the whole pipeline performs before tuning the node embeddings in detail. 
\n", "\n", @@ -37,7 +37,7 @@ "- [Neo4j Python Driver](https://neo4j.com/docs/api/python-driver/current)\n", "- [Tutorial: Applied Graph Embeddings](https://neo4j.com/developer/graph-data-science/applied-graph-embeddings)\n", "- [Visualizing the embeddings in 2D](https://github.com/openai/openai-cookbook/blob/main/examples/Visualizing_embeddings_in_2D.ipynb)\n", - "- [scikit-learn TSNE](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE)\n", + "- [UMAP](https://umap-learn.readthedocs.io/en/latest)\n", "- [AttributeError: 'list' object has no attribute 'shape'](https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape)\n", "- [Fast Random Projection (neo4j)](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp)\n", "- [HashGNN (neo4j)](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn)\n", @@ -67,18 +67,6 @@ { "cell_type": "code", "execution_count": null, - "id": "8b650672", - "metadata": {}, - "outputs": [], - "source": [ - "# Main Colormap\n", - "# main_color_map = 'nipy_spectral'\n", - "main_color_map = 'viridis'" - ] - }, - { - "cell_type": "code", - "execution_count": 1, "id": "4191f259", "metadata": {}, "outputs": [], @@ -89,8 +77,10 @@ "import matplotlib.pyplot as plot\n", "import typing as typ\n", "import numpy as np\n", - "from openTSNE.sklearn import TSNE\n", - "from neo4j import GraphDatabase" + "import umap\n", + "from neo4j import GraphDatabase\n", + "from sklearn.metrics import silhouette_score\n", + "from sklearn.metrics import davies_bouldin_score" ] }, { @@ -100,9 +90,17 @@ "metadata": {}, "outputs": [], "source": [ - "from openTSNE import __version__ as openTSNE_version\n", - "print('The openTSNE version is: {}'.format(openTSNE_version))\n", - "print('The pandas version is {}.'.format(pd.__version__))\n" + "print('The numpy version is: {}'.format(np.__version__))\n", + "print('The pandas version is: 
{}'.format(pd.__version__))\n", + "\n", + "from matplotlib import __version__ as matplotlib_version\n", + "print('The matplotlib version is: {}'.format(matplotlib_version))\n", + "\n", + "from sklearn import __version__ as sklearn_version\n", + "print('The sklearn version is: {}'.format(sklearn_version))\n", + "\n", + "from umap import __version__ as umap_version\n", + "print('The UMAP version is: {}'.format(umap_version))" ] }, { @@ -159,8 +157,6 @@ "metadata": {}, "outputs": [], "source": [ - "# TODO option to choose between directed and undirected projection\n", - "\n", "def create_undirected_projection(parameters: dict) -> bool: \n", " \"\"\"\n", " Creates an undirected homogenous in-memory Graph projection for/with Neo4j Graph Data Science Plugin.\n", @@ -189,6 +185,54 @@ " return True" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c316f25", + "metadata": {}, + "outputs": [], + "source": [ + "def get_projected_graph_statistics(projection_name: str) -> pd.DataFrame:\n", + " \"\"\"\n", + " Returns the projection statistics for the given parameters.\n", + " Parameters\n", + " ----------\n", + " projection_name : str\n", + " The name prefix for the in-memory projection for dependencies. Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + "\n", + " parameters = dict(\n", + " dependencies_projection=projection_name,\n", + " )\n", + " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", + "\n", + "\n", + "def get_projected_graph_node_count(projection_name: str) -> int:\n", + " \"\"\"\n", + " Returns the number of nodes in the projected graph.\n", + " Parameters\n", + " ----------\n", + " projection_name : str\n", + " The name prefix for the in-memory projection for dependencies. 
Example: \"java-package-embeddings-notebook\"\n", + " \"\"\"\n", + "\n", + " graph_statistics = get_projected_graph_statistics(projection_name)\n", + " if graph_statistics.empty:\n", + " return 0\n", + " return graph_statistics[\"nodeCount\"].values[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e98795d", + "metadata": {}, + "outputs": [], + "source": [ + "def empty_embeddings() -> pd.DataFrame:\n", + " return pd.DataFrame(columns=[\"codeUnitName\", \"shortCodeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding', 'x', 'y'])" + ] + }, { "cell_type": "code", "execution_count": null, @@ -196,11 +240,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Feature ideas\n", - "# TODO option to choose between directed and undirected projection\n", - "# TODO option to not read already existing node embeddings to experiment with different (hpyer) parameters\n", - "# TODO run a community detection algorithm co-located in here when \"communityId\" is missing\n", - "# TODO run a centrality algorithm co-located in here when \"centrality\" score is missing\n", + "# Feature Ideas:\n", + "# - Option to choose between directed and undirected projection?\n", + "# - Option to not read already existing node embeddings to experiment with different hyper-parameters?\n", + "# - Run a community detection algorithm co-located in here when \"communityId\" is missing?\n", + "# - Run a centrality algorithm co-located in here when \"centrality\" score is missing?\n", "\n", "def create_node_embeddings(cypher_file_name: str, parameters: dict) -> pd.DataFrame: \n", " \"\"\"\n", @@ -222,14 +266,14 @@ " The name of the node property that contains the dependency weight. 
Example: \"lowCouplingElement25PercentWeight\"\n", " dependencies_projection_embedding_dimension : str\n", " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " dependencies_projection_write_property : str\n", + " The name of the node property where the resulting embeddings will be stored. Example: \"embedding\n", " \"\"\"\n", " \n", - " is_data_available=create_undirected_projection(parameters)\n", - " \n", - " if not is_data_available:\n", + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", + " if node_count <= 0:\n", " print(\"No projected data for node embeddings calculation available\")\n", - " empty_result = pd.DataFrame(columns=[\"codeUnitName\", 'projectName', 'communityId', 'centrality', 'embedding'])\n", - " return empty_result\n", + " return empty_embeddings()\n", "\n", " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", @@ -238,17 +282,83 @@ ] }, { - "cell_type": "markdown", - "id": "f6ec6a9b", + "cell_type": "code", + "execution_count": null, + "id": "e2b52e51", "metadata": {}, + "outputs": [], "source": [ - "### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "def create_node_embeddings_with_GraphSAGE(parameters: dict) -> pd.DataFrame: \n", + " \"\"\"\n", + " Creates an in-memory Graph projection by calling \"create_undirected_projection\", \n", + " enriches it with a degree centrality property for every node, trains GraphSAGE \n", + " and returns the resulting node embeddings as DataFrame.\n", + " \n", + " parameters\n", + " ----------\n", + " dependencies_projection : str\n", + " The name prefix for the in-memory projection for dependencies. 
Example: \"java-package-embeddings-notebook\"\n", + " dependencies_projection_node : str\n", + " The label of the nodes that will be used for the projection. Example: \"Package\"\n", + " dependencies_projection_weight_property : str\n", + " The name of the node property that contains the dependency weight. Example: \"weight25PercentInterfaces\"\n", + " dependencies_projection_embedding_dimension : str\n", + " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", + " \"\"\"\n", + " \n", + " node_count = get_projected_graph_node_count(parameters[\"dependencies_projection\"])\n", + " if node_count <= 0:\n", + " print(\"No projected data for node embeddings calculation available\")\n", + " return empty_embeddings()\n", + " \n", + " if node_count > 500:\n", + " print(\"GraphSAGE node embeddings training will be skipped for \" + str(node_count) + \" (>500) nodes, since it is computationally expensive and not eagerly needed for demonstration purposes.\")\n", + " return empty_embeddings()\n", + " \n", + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", + " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", + " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", + " \n", + " display(embeddings.head()) # Display the first entries of the table\n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be400bdb", + "metadata": {}, + "outputs": [], + "source": [ + "class CommunityScores:\n", + " \n", + " def __init__(self, silhouette_score: float, davies_bouldin_score: float):\n", + " self.silhouette_score = silhouette_score\n", + " self.davies_bouldin_score = 
davies_bouldin_score\n", "\n", - "The following function takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. \n", + " def __repr__(self):\n", + " return f\"CommunityScores(silhouette_score={self.silhouette_score}, davies_bouldin_score={self.davies_bouldin_score})\"\n", "\n", - "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", + " @classmethod\n", + " def calculate(cls, data: pd.DataFrame, metric: str = \"cosine\"):\n", + " \"\"\"\n", + " data: pandas DataFrame with columns:\n", + " - 'communityId': int\n", + " - 'embedding': array-like (same length for all rows)\n", + " metric: 'cosine', 'euclidean', etc.\n", + " \"\"\"\n", + " # ensure we pass a sequence/array of arrays to sklearn / numpy\n", + " X = np.array(data[\"embedding\"].to_list())\n", + " labels = data[\"communityId\"].astype(int)\n", "\n", - "(see https://opentsne.readthedocs.io)" + " if len(np.unique(labels)) < 2:\n", + " print(\"Community scores require at least 2 communities\")\n", + " return cls(0.0, 0.0)\n", + "\n", + " silhouette = silhouette_score(X, labels, metric=metric)\n", + " davies_bouldin = davies_bouldin_score(X, labels)\n", + " return cls(float(silhouette), float(davies_bouldin))" ] }, { @@ -260,62 +370,250 @@ "source": [ "def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:\n", " \"\"\"\n", - " Reduces the dimensionality of the node embeddings (e.g. 32 floating point numbers in an array)\n", - " to two dimensions for 2D visualization.\n", - " see https://opentsne.readthedocs.io\n", + " Reduces the dimensionality of the node embeddings (e.g. 
64 floating point numbers in an array)\n", + " to two dimensions for 2D visualization using Uniform Manifold Approximation and Projection (UMAP).\n", + " see https://umap-learn.readthedocs.io\n", " \"\"\"\n", "\n", - " if embeddings.empty: \n", - " print(\"No projected data for node embeddings dimensionality reduction available\")\n", + " if embeddings.empty:\n", + " print(\"No projected data for node embeddings dimensionality reduction available with UMAP.\")\n", " return embeddings\n", - " \n", - " # Calling the fit_transform method just with a list doesn't seem to work (anymore?). \n", - " # It leads to an error with the following message: 'list' object has no attribute 'shape'\n", - " # This can be solved by converting the list to a numpy array using np.array(..).\n", - " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", + "\n", + " # Convert the list of embeddings to a numpy array\n", " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", "\n", - " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", - " # of the previously calculated node embeddings to 2 dimensions for visualization\n", - " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=1, random_state=47)\n", - " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", - " display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", + " # Use UMAP to reduce the dimensionality to 2D for visualization\n", + " reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=42, n_jobs=1, verbose=False)\n", + " two_dimensional_node_embeddings = reducer.fit_transform(embeddings_as_numpy_array)\n", + " \n", + " # Convert to dense numpy array (works for both sparse and dense input)\n", + " two_dimensional_node_embeddings = np.asarray(two_dimensional_node_embeddings)\n", + " # 
display(two_dimensional_node_embeddings.shape) # Display the shape of the UMAP result\n", "\n", " # Create a new DataFrame with the results of the 2 dimensional node embeddings\n", " # and the code unit and artifact name of the query above as preparation for the plot\n", - " node_embeddings_for_visualization = pd.DataFrame(data = {\n", - " \"codeUnit\": embeddings.codeUnitName,\n", - " \"artifact\": embeddings.projectName,\n", - " \"communityId\": embeddings.communityId,\n", - " \"centrality\": embeddings.centrality,\n", - " \"x\": [value[0] for value in two_dimension_node_embeddings],\n", - " \"y\": [value[1] for value in two_dimension_node_embeddings]\n", - " })\n", - " display(node_embeddings_for_visualization.head()) # Display the first line of the results\n", - " return node_embeddings_for_visualization\n", - " " + " embeddings[\"x\"] = [value[0] for value in two_dimensional_node_embeddings]\n", + " embeddings[\"y\"] = [value[1] for value in two_dimensional_node_embeddings]\n", + " # display(embeddings.head()) # Display the first line of the results\n", + " \n", + " return embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b31d031", + "metadata": {}, + "outputs": [], + "source": [ + "def find_community_medoids(\n", + " data: pd.DataFrame,\n", + " community_column_name: str = \"communityId\",\n", + " x_column_name: str = \"x\",\n", + " y_column_name: str = \"y\",\n", + ") -> pd.DataFrame:\n", + " \"\"\"\n", + " Return one representative (geometric, less prone to outliers medoid) row per community.\n", + " The medoid is defined as the point closest to the community centroid.\n", + "\n", + " Parameters\n", + " ----------\n", + " data : pd.DataFrame\n", + " Input dataframe containing embeddings\n", + " community_column_name : str\n", + " Column identifying communities\n", + " x_column_name, y_column_name : str\n", + " Coordinate columns\n", + "\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " Subset of df with one row 
per community (the medoids)\n", + " \"\"\"\n", + " medoids = []\n", + "\n", + " for _, group in data.groupby(community_column_name):\n", + " center_x = group[x_column_name].median()\n", + " center_y = group[y_column_name].median()\n", + "\n", + " distances = (group[x_column_name] - center_x) ** 2 + (group[y_column_name] - center_y) ** 2\n", + " medoid_index = distances.idxmin()\n", + "\n", + " medoids.append(data.loc[medoid_index])\n", + "\n", + " return pd.DataFrame(medoids).reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b64b67d4", + "metadata": {}, + "outputs": [], + "source": [ + "def find_top_k_community_medoids(data, k=20, **kwargs):\n", + " top_communities = (\n", + " data.groupby(\"communityId\")\n", + " .size()\n", + " .nlargest(k)\n", + " .index\n", + " )\n", + " return find_community_medoids(\n", + " data[data.communityId.isin(top_communities)],\n", + " **kwargs\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a97c2bca", + "metadata": {}, + "outputs": [], + "source": [ + "plot_annotation_style: dict = {\n", + " 'textcoords': 'offset points',\n", + " 'arrowprops': dict(arrowstyle='->', color='black', alpha=0.3),\n", + " 'fontsize': 6,\n", + " 'backgroundcolor': 'white',\n", + " 'bbox': dict(boxstyle='round,pad=0.3',\n", + " edgecolor='silver',\n", + " facecolor='whitesmoke',\n", + " alpha=0.8\n", + " )\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2d184ae", + "metadata": {}, + "outputs": [], + "source": [ + "def get_plot_title(code_unit_type:str, algorithm_name: str, scores: CommunityScores) -> str:\n", + " main_title = f\"{code_unit_type} dependency graph node embeddings\"\n", + " scores_description = f\"Silhouette Score (aim higher)={scores.silhouette_score:.4f}, Davies-Bouldin Score (aim lower)={scores.davies_bouldin_score:.4f}\"\n", + " algorithm_description = f\"{algorithm_name} -> UMAP\"\n", + " return 
f\"{main_title}\\n{scores_description}\\n{algorithm_description}\"" ] }, { "cell_type": "code", "execution_count": null, - "id": "d937e26e", + "id": "5ee8e702", "metadata": {}, "outputs": [], "source": [ - "def plot_2d_node_embeddings(node_embeddings_for_visualization: pd.DataFrame, title: str):\n", - " if node_embeddings_for_visualization.empty:\n", + "def plot_2d_node_embeddings_on_axes(axes: plot.Axes, embeddings: pd.DataFrame, title: str):\n", + " if embeddings.empty:\n", " print(\"No projected data to plot available\")\n", " return\n", + " \n", + " def normalize(values: pd.Series) -> pd.Series:\n", + " max_value = values.max()\n", + " min_value = values.min()\n", + " range_value = max_value - min_value\n", + " return (values - min_value) / range_value if range_value != 0 else values\n", "\n", - " plot.scatter(\n", - " x=node_embeddings_for_visualization.x,\n", - " y=node_embeddings_for_visualization.y,\n", - " s=node_embeddings_for_visualization.centrality * 60,\n", - " c=node_embeddings_for_visualization.communityId,\n", - " cmap=main_color_map,\n", + " normalized_centrality = normalize(embeddings.centrality)\n", + " base_size = np.clip(normalized_centrality * 50, None, 30) + 2\n", + "\n", + " common_parameters = {\n", + " 'x': embeddings.x,\n", + " 'y': embeddings.y,\n", + " 'c': embeddings.communityId,\n", + " 'cmap': 'nipy_spectral', # nipy_spectral, gist_ncar, jet, turbo, gist_stern, rainbow, viridis\n", + " 'linewidths': 1,\n", + " }\n", + " \n", + " # Transparent 'halo' around the main points\n", + " axes.scatter(\n", + " **common_parameters,\n", + " s=base_size * 6 + 12,\n", + " alpha=0.12,\n", + " )\n", + "\n", + " # Main points\n", + " axes.scatter(\n", + " **common_parameters,\n", + " s=base_size,\n", + " alpha=1.0\n", " )\n", - " plot.title(title)\n", + " \n", + " # Annotate medoids (representative points of communities, the node closest to the community center)\n", + " medoids = find_top_k_community_medoids(embeddings)\n", + " for _, row in 
medoids.iterrows():\n", + " axes.annotate(\n", + " f\"{row.shortCodeUnitName}({row.communityId})\",\n", + " (row.x, row.y),\n", + " xytext=(5, 5),\n", + " **plot_annotation_style,\n", + " )\n", + " \n", + " # Annotate top centrality nodes\n", + " top_centrality_nodes = embeddings.nlargest(5, 'centrality')\n", + " for _, row in top_centrality_nodes.iterrows():\n", + " axes.annotate(\n", + " row.shortCodeUnitName,\n", + " (row.x, row.y),\n", + " xytext=(5, 5),\n", + " color='grey',\n", + " **plot_annotation_style,\n", + " )\n", + "\n", + " # Finalize plot\n", + " axes.set_title(title, fontsize=9)\n", + " axes.set_xticks([])\n", + " axes.set_yticks([])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73a5f165", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_2d_node_embeddings(embeddings: pd.DataFrame, title: str, **kwargs):\n", + " if embeddings.empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " figure, axes = plot.subplots(figsize=(8, 6))\n", + " plot_2d_node_embeddings_on_axes(axes=axes, embeddings=embeddings, title=title, **kwargs)\n", + " plot.tight_layout()\n", + " plot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74bd9937", + "metadata": {}, + "outputs": [], + "source": [ + "def plot_all_2d_node_embeddings_in_grid(\n", + " embeddings: typ.List[pd.DataFrame],\n", + " titles: typ.List[str],\n", + " number_of_columns: int = 2\n", + "):\n", + " if embeddings[0].empty:\n", + " print(\"No projected data to plot available\")\n", + " return\n", + " \n", + " number_of_rows = (len(embeddings) + number_of_columns - 1) // number_of_columns\n", + " figure, axes = plot.subplots(number_of_rows, number_of_columns, figsize=(6 * number_of_columns, 4.5 * number_of_rows))\n", + " axes = np.array(axes).flatten()\n", + " i = -1\n", + "\n", + " for i, (node_embeddings_for_visualization, title) in enumerate(zip(embeddings, titles)):\n", + " 
plot_2d_node_embeddings_on_axes(axes=axes[i], embeddings=node_embeddings_for_visualization, title=title)\n", + "\n", + " for j in range(i + 1, len(axes)):\n", + " axes[j].axis('off')\n", + "\n", + " plot.tight_layout()\n", " plot.show()" ] }, @@ -350,22 +648,39 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "c2496caf", + "cell_type": "markdown", + "id": "0b42163d", "metadata": {}, - "outputs": [], "source": [ - "# Main Colormap\n", - "main_color_map = 'nipy_spectral'" + "## 1. Typescript Modules" ] }, { "cell_type": "markdown", - "id": "0b42163d", + "id": "f2e86bae", "metadata": {}, "source": [ - "## 1. Typescript Modules" + "### 1.1 Create Dependency Graph Projection for TypeScript Modules\n", + "\n", + "The projection and related common parameters are shared across all embedding algorithms below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93d806bb", + "metadata": {}, + "outputs": [], + "source": [ + "common_projection_parameters={\n", + " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", + " \"dependencies_projection_node\": \"Module\",\n", + " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + "}\n", + "if create_undirected_projection(common_projection_parameters):\n", + " display(get_projected_graph_statistics(common_projection_parameters[\"dependencies_projection\"]))\n", + "else:\n", + " print(f\"No data for projection creation available: {common_projection_parameters}\")" ] }, { @@ -373,7 +688,7 @@ "id": "3b468bae", "metadata": {}, "source": [ - "### 1.1 Generate Node Embeddings for Typescript Modules using Fast Random Projection (Fast RP)\n", + "### 1.2 Generate Node Embeddings for Typescript Modules using Fast Random Projection (Fast RP)\n", "\n", "[Fast Random Projection](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/fastrp) is used to reduce the dimensionality of the node feature space while preserving 
most of the distance information. Nodes with similar neighborhood result in node embedding with similar vectors.\n", "\n", @@ -388,13 +703,11 @@ "outputs": [], "source": [ "typescript_module_embeddings_parameters={\n", - " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Module\",\n", - " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + " **common_projection_parameters,\n", " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", " \"dependencies_projection_embedding_dimension\":\"32\" \n", "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n" + "embeddings_fastRP = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n" ] }, { @@ -402,9 +715,15 @@ "id": "ad17607c", "metadata": {}, "source": [ - "### 1.2 Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "### 1.3 Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n", + "\n", + "This step takes the original node embeddings in their high dimensionality, e.g. 32 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function \"prepare_node_embeddings_for_2d_visualization\".\n", + "\n", + "**About UMAP:**\n", + "\n", + "> The embedding is found by searching for a low dimensional projection of the data that has the closest possible equivalent fuzzy topological structure.\n", "\n", - "This step takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. 
For more details look up the function declaration for \"prepare_node_embeddings_for_2d_visualization\"." + "(see https://umap-learn.readthedocs.io)" ] }, { @@ -414,7 +733,8 @@ "metadata": {}, "outputs": [], "source": [ - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)" + "embeddings_fastRP = prepare_node_embeddings_for_2d_visualization(embeddings_fastRP)\n", + "scores_fastRP = CommunityScores.calculate(embeddings_fastRP)" ] }, { @@ -422,7 +742,7 @@ "id": "20084589", "metadata": {}, "source": [ - "### 1.3 Plot the node embeddings reduced to two dimensions for Typescript" + "### 1.4 Plot the node embeddings reduced to two dimensions for Typescript" ] }, { @@ -432,10 +752,7 @@ "metadata": {}, "outputs": [], "source": [ - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Typescript Modules positioned by their dependency relationships (FastRP node embeddings + t-SNE)\"\n", - ")" + "plot_2d_node_embeddings(embeddings_fastRP, get_plot_title(\"TypeScript Modules\", \"Fast Random Projection\", scores_fastRP))" ] }, { @@ -443,7 +760,7 @@ "id": "6cac9be7", "metadata": {}, "source": [ - "### 1.4 Node Embeddings for Typescript Modules using HashGNN\n", + "### 1.5 Node Embeddings for Typescript Modules using HashGNN\n", "\n", "[HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn) resembles Graph Neural Networks (GNN) but does not include a model or require training. It combines ideas of GNNs and fast randomized algorithms. For more details see [HashGNN](https://neo4j.com/docs/graph-data-science/2.6/machine-learning/node-embeddings/hashgnn). Here, the latter 3 steps are combined into one for HashGNN." 
] @@ -456,18 +773,14 @@ "outputs": [], "source": [ "typescript_module_embeddings_parameters={\n", - " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Module\",\n", - " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + " **common_projection_parameters,\n", " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", typescript_module_embeddings_parameters)\n", - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Typescript Modules positioned by their dependency relationships (HashGNN node embeddings + t-SNE)\"\n", - ")" + "embeddings_hashGNN = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", typescript_module_embeddings_parameters)\n", + "embeddings_hashGNN = prepare_node_embeddings_for_2d_visualization(embeddings_hashGNN)\n", + "scores_hashGNN = CommunityScores.calculate(embeddings_hashGNN)\n", + "plot_2d_node_embeddings(embeddings_hashGNN, get_plot_title(\"TypeScript Modules\", \"HashGNN\", scores_hashGNN))" ] }, { @@ -475,7 +788,7 @@ "id": "0a7d66f5", "metadata": {}, "source": [ - "### 1.5 Node Embeddings for Typescript Modules using node2vec\n", + "### 1.6 Node Embeddings for Typescript Modules using node2vec\n", "\n", "[node2vec](https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/node2vec) computes a vector representation of a node based on second order random walks in the graph. 
\n", "The [node2vec](https://towardsdatascience.com/complete-guide-to-understanding-node2vec-algorithm-4e9a35e5d147) algorithm is a transductive node embedding algorithm, meaning that it needs the whole graph to be available to learn the node embeddings." @@ -489,19 +802,92 @@ "outputs": [], "source": [ "typescript_module_embeddings_parameters={\n", - " \"dependencies_projection\": \"typescript-module-embeddings-notebook\",\n", - " \"dependencies_projection_node\": \"Module\",\n", - " \"dependencies_projection_weight_property\": \"lowCouplingElement25PercentWeight\",\n", + " **common_projection_parameters,\n", " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", typescript_module_embeddings_parameters)\n", - "node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", - "plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Typescript Modules positioned by their dependency relationships (node2vec node embeddings + t-SNE)\"\n", + "embeddings_node2vec = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", typescript_module_embeddings_parameters)\n", + "embeddings_node2vec = prepare_node_embeddings_for_2d_visualization(embeddings_node2vec)\n", + "scores_node2vec = CommunityScores.calculate(embeddings_node2vec)\n", + "plot_2d_node_embeddings(embeddings_node2vec, get_plot_title(\"TypeScript Modules\", \"node2vec\", scores_node2vec))" + ] + }, + { + "cell_type": "markdown", + "id": "059d162c", + "metadata": {}, + "source": [ + "### 1.7 Node Embeddings for Java Packages using GraphSAGE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c5664b9", + "metadata": {}, + "outputs": [], + "source": [ + "typescript_module_embeddings_parameters={\n", + " 
**common_projection_parameters,\n", + " \"dependencies_projection_write_property\": \"embeddingsGraphSAGE\",\n", + " \"dependencies_projection_embedding_dimension\":\"32\"\n", + "}\n", + "embeddings_graphSAGE= create_node_embeddings_with_GraphSAGE(typescript_module_embeddings_parameters)\n", + "embeddings_graphSAGE = prepare_node_embeddings_for_2d_visualization(embeddings_graphSAGE)\n", + "scores_graphSAGE = CommunityScores.calculate(embeddings_graphSAGE)\n", + "plot_2d_node_embeddings(embeddings_graphSAGE, get_plot_title(\"TypeScript Modules\", \"GraphSAGE\", scores_graphSAGE))" + ] + }, + { + "cell_type": "markdown", + "id": "c5c73bd3", + "metadata": {}, + "source": [ + "### 2. Compare Node Embeddings\n", + "\n", + "In this section we will compare all node embedding methods from above in a grid plot. This helps to see how well the different algorithms were able to capture the structure of the graph and how well the communities are separated." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a557fb2", + "metadata": {}, + "outputs": [], + "source": [ + "plot_all_2d_node_embeddings_in_grid(\n", + " embeddings=[embeddings_fastRP, embeddings_hashGNN, embeddings_node2vec, embeddings_graphSAGE],\n", + " titles=[\n", + " get_plot_title(\"TypeScript Modules\", \"Fast Random Projection\", scores_fastRP),\n", + " get_plot_title(\"TypeScript Modules\", \"HashGNN\", scores_hashGNN),\n", + " get_plot_title(\"TypeScript Modules\", \"node2vec\", scores_node2vec),\n", + " get_plot_title(\"TypeScript Modules\", \"GraphSAGE\", scores_graphSAGE),\n", + " ],\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "75acc17d", + "metadata": {}, + "source": [ + "#### Interpreting Node Embedding Results\n", + "\n", + "##### Summary of Observations\n", + "\n", + "- **FastRP** and **node2vec** show clear, well-separated clusters\n", + "- **HashGNN** and **GraphSAGE** produce more diffuse embeddings\n", + "- Silhouette scores are high for FastRP / node2vec and low for 
HashGNN / GraphSAGE\n", + "\n", + "These differences are expected and stem from the **fundamentally different objectives** of the algorithms.\n", + "\n", + "##### Key Takeaways\n", + "\n", + "- **FastRP and node2vec** are well-suited for **community discovery and visualization**\n", + "- **HashGNN** is best viewed as a **fast structural fingerprint**, not a clustering embedding\n", + "- **GraphSAGE** requires meaningful node features or labels and performs poorly in dense, feature-poor settings\n", + "- Poor silhouette scores for HashGNN and GraphSAGE are **expected and theoretically consistent**" + ] } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 67ed7cd82..16245100f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,6 @@ numpy==1.26.4 pandas==2.3.3 pip==25.3 setuptools==80.9.0 # opentsne uses sklearn.base uses joblib uses distutils missing in Python >= 12 (TODO use native openTSNE?) -typing-extensions==4.15.0 # Needed for opentsne and Python >= 3.12 # --- Visualization --- wordcloud==1.9.4 @@ -29,5 +28,4 @@ neo4j==5.28.2 # --- Native/scientific packages (may require compilation) --- # These are included but may cause install errors in pip/venv -opentsne==1.0.4 # Dimensionality reduction to visualize node embeddings in 2D. Might get replaced by umap. shap==0.49.1 # For e.g. explaining anomaly detection results \ No newline at end of file