diff --git a/.github/workflows/internal-check-notebooks.yml b/.github/workflows/internal-check-notebooks.yml
new file mode 100644
index 000000000..adcb04e90
--- /dev/null
+++ b/.github/workflows/internal-check-notebooks.yml
@@ -0,0 +1,109 @@
+name: Check Jupyter Notebooks
+
+on:
+  pull_request:
+    branches:
+      - main
+    # Run when notebooks, Python dependencies, or this workflow change
+    paths:
+      - 'domains/**/explore/*.ipynb'
+      - 'pyproject.toml'
+      - 'uv.lock'
+      - 'scripts/activateUvEnvironment.sh'
+      - '.github/workflows/internal-check-notebooks.yml'
+
+jobs:
+  check-notebook-syntax-and-imports:
+    runs-on: ubuntu-22.04
+
+    steps:
+    - name: Checkout GIT Repository
+      uses: actions/checkout@v6
+
+    - name: (uv Setup) Install uv
+      uses: astral-sh/setup-uv@v6
+      with:
+        python-version: '3.12'
+
+    - name: (uv Setup) Sync dependencies from lockfile
+      run: uv sync --frozen
+
+    - name: Check notebook syntax and imports
+      # For each notebook: parse each Python code cell as Python AST to catch SyntaxErrors,
+      # then collect every unique import statement across all notebooks and run each of them
+      # in a single Python process to catch ModuleNotFoundError / ImportError.
+      # Cell magics (%%html, %%bash, …) are skipped; line magics (%matplotlib, …) and
+      # shell escapes (!pip, …) are blanked out — they are not Python and would cause
+      # false-positive SyntaxErrors.
+      # No kernel execution — no Neo4j needed, finishes in seconds.
+      run: |
+        uv run python3 - <<'PYEOF'
+        import ast
+        import json
+        import sys
+        from pathlib import Path
+
+        notebooks = sorted(Path("domains").glob("**/explore/*.ipynb"))
+        import_lines = set()
+        syntax_failures = []
+
+        for notebook in notebooks:
+            print(f"Parsing {notebook}", flush=True)
+            # Notebooks are UTF-8 encoded JSON; do not depend on the runner's locale.
+            nb = json.loads(notebook.read_text(encoding="utf-8"))
+            # cell_number counts all cells (markdown included), starting at 1.
+            for cell_number, cell in enumerate(nb["cells"], start=1):
+                if cell["cell_type"] != "code":
+                    continue
+                source = "".join(cell["source"]).strip()
+                if not source:
+                    continue
+                # Skip cell magics (%%html, %%bash, etc.) — not Python code
+                if source.startswith("%%"):
+                    continue
+                # Blank out line magics (%matplotlib, %time, etc.) and shell escapes (!pip, etc.)
+                # instead of dropping them, so reported line numbers are not shifted.
+                python_source = "\n".join(
+                    "" if line.lstrip().startswith(("%", "!")) else line
+                    for line in source.split("\n")
+                )
+                if not python_source.strip():
+                    continue
+                try:
+                    tree = ast.parse(python_source)
+                except SyntaxError as e:
+                    syntax_failures.append(f"{notebook} (cell {cell_number}): SyntaxError line {e.lineno}: {e.msg}")
+                    continue
+                for node in ast.walk(tree):
+                    if isinstance(node, ast.Import):
+                        for alias in node.names:
+                            import_lines.add(f"import {alias.name}")
+                    # Relative imports (level > 0) cannot be resolved outside their package: skip them.
+                    elif isinstance(node, ast.ImportFrom) and node.module and node.level == 0:
+                        names = ", ".join(a.name for a in node.names)
+                        import_lines.add(f"from {node.module} import {names}")
+
+        if syntax_failures:
+            print("Syntax errors found:", file=sys.stderr)
+            for f in syntax_failures:
+                print(f"  {f}", file=sys.stderr)
+            sys.exit(1)
+
+        print(f"\nRunning {len(import_lines)} unique import statements from {len(notebooks)} notebooks...", flush=True)
+        import_failures = []
+        for import_line in sorted(import_lines):
+            try:
+                # Run every statement in its own namespace so imported names cannot shadow
+                # anything in this script, and keep going to report all failures at once.
+                exec(import_line, {})  # noqa: S102
+            except Exception as e:
+                import_failures.append(f"{import_line}: {type(e).__name__}: {e}")
+
+        if import_failures:
+            print("Import check failed:", file=sys.stderr)
+            for f in import_failures:
+                print(f"  {f}", file=sys.stderr)
+            sys.exit(1)
+
+        print(f"All {len(notebooks)} notebooks OK: syntax valid, {len(import_lines)} unique imports resolved.")
+        PYEOF
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 135c6de50..59c21cfbf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,14 @@
 
 This document describes the changes to the Code Graph Analysis Pipeline. The changes are grouped by version and date. The latest version is at the top.
 
+## v4.0.1 - Improve charts to look more similar to previous Jupyter notebook charts and add pipeline to validate Jupyter notebooks + +### 🎨 Improvements + +* **Python charts improved** — Python charts now look more similar to the previous Jupyter notebooks with improved visual presentation by @JohT +* **Jupyter notebook validation pipeline** — Add pipeline to quickly validate Jupyter notebooks by @JohT +* **Python dependency improvements** — Add `nbformat` library for plotly support in Jupyter notebooks and remove `setuptools` dependency since openTSNE is removed by @JohT + ## v4.0.0 - Vertical slice domains, uv as Python package manager, Jupyter removed ### ✨ Highlights diff --git a/README.md b/README.md index 70469f49c..d94ba92cc 100644 --- a/README.md +++ b/README.md @@ -56,16 +56,18 @@ If you think in architecture terms: domains are the vertical slices, report type ### Analysis Domains -- [Overview](./domains/overview/README.md) - High-level project structure, composition, counts, and complexity distributions. -- [External Dependencies](./domains/external-dependencies/README.md) - Usage of external libraries, packages, modules, and namespaces. -- [Internal Dependencies](./domains/internal-dependencies/README.md) - Internal dependency structure, path finding, topological order, OOD metrics, visibility metrics, and word clouds. -- [Cyclic Dependencies](./domains/cyclic-dependencies/README.md) - Dedicated cycle analysis for Java artifacts, Java packages, and TypeScript modules. -- [Java](./domains/java/README.md) - Java code quality, method metrics, annotations, and artifact dependency analysis. -- [Git History](./domains/git-history/README.md) - Change frequency, co-change patterns, authorship, and repository evolution. -- [Graph Algorithms](./domains/graph-algorithms/README.md) - Centrality, communities, similarity, and other Graph Data Science results. 
-- [Node Embeddings](./domains/node-embeddings/README.md) - Graph embeddings and 2D projections for structural exploration. -- [Anomaly Detection](./domains/anomaly-detection/README.md) - Machine-learning-supported structural anomaly detection. -- [Archetypes](./domains/archetypes/README.md) - Structural roles such as authority, bottleneck, and hub. +| Domain | Description | Java Example | TypeScript Example | Notebooks | Example Chart | +|--------|-------------|--------------|--------------------|-----------|---------------| +| [Anomaly Detection](./domains/anomaly-detection/README.md) | Machine-learning-supported structural anomaly detection | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/anomaly-detection/anomaly_detection_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/anomaly-detection/anomaly_detection_report.md) | [Explore](./domains/anomaly-detection/explore/AnomalyDetectionExploration.ipynb) | [Anomalies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/anomaly-detection/Java_Type/Anomalies.svg) | +| [Archetypes](./domains/archetypes/README.md) | Structural roles: authority, bottleneck, and hub | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/archetypes/archetypes_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/archetypes/archetypes_report.md) | — | [Treemap](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/archetypes/JavaTreemap1ArchetypesOverviewPerDirectory.svg) | +| [Cyclic Dependencies](./domains/cyclic-dependencies/README.md) | Cycle analysis for Java artifacts, packages, and TypeScript modules | 
[AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/cyclic-dependencies/cyclic_dependencies_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/cyclic-dependencies/cyclic_dependencies_report.md) | [Java](./domains/cyclic-dependencies/explore/CyclicDependenciesJavaExploration.ipynb) , [TypeScript](./domains/cyclic-dependencies/explore/CyclicDependenciesTypescriptExploration.ipynb) | [Cycle Graph](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/cyclic-dependencies/Java_Package/Graph_Visualizations/JavaPackageCyclicDependencies1.svg) | +| [External Dependencies](./domains/external-dependencies/README.md) | Usage of external libraries, packages, modules, and namespaces | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/external-dependencies/external_dependencies_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/external-dependencies/external_dependencies_report.md) | [Java](./domains/external-dependencies/explore/ExternalDependenciesJava.ipynb) , [TypeScript](./domains/external-dependencies/explore/ExternalDependenciesTypescript.ipynb) | [Most Spread Packages](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/external-dependencies/Java_Most_spread_packages_by_packages_above_threshold.svg) | +| [Git History](./domains/git-history/README.md) | Change frequency, co-change patterns, authorship, and repository evolution | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/git-history/git_history_report.md) | 
[react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/git-history/git_history_report.md) | [General](./domains/git-history/explore/GitHistoryGeneralExploration.ipynb) , [Correlation](./domains/git-history/explore/GitHistoryCorrelationExploration.ipynb) | [Co-Changing Files](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/git-history/CoChangingFiles.svg) | +| [Graph Algorithms](./domains/graph-algorithms/README.md) | Centrality, communities, similarity, and other Graph Data Science results | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/graph-algorithms/graph_algorithms_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/graph-algorithms/graph_algorithms_report.md) | — | — | +| [Internal Dependencies](./domains/internal-dependencies/README.md) | Internal structure, path finding, topological order, OOD metrics, visibility metrics, and word clouds | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/internal_dependencies_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/internal-dependencies/internal_dependencies_report.md) | [Java](./domains/internal-dependencies/explore/InternalDependenciesJava.ipynb) , [TypeScript](./domains/internal-dependencies/explore/InternalDependenciesTypescript.ipynb) | [Code Wordcloud](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/CodeNamesWordcloud.svg) | +| [Java](./domains/java/README.md) | Java code quality, method metrics, annotations, and artifact dependency analysis | 
[AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/java/java_report.md) | — | [Method Metrics](./domains/java/explore/MethodMetricsJavaExploration.ipynb) | [Artifact Dependencies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/java/ArtifactDependencies_OutgoingTop20_Bar.svg) | +| [Node Embeddings](./domains/node-embeddings/README.md) | Graph embeddings and 2D projections for structural exploration | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/node-embeddings/node_embeddings_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/node-embeddings/node_embeddings_report.md) | [Java](./domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb) , [TypeScript](./domains/node-embeddings/explore/NodeEmbeddingsTypescriptExploration.ipynb) | [Package Embeddings 2D](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/node-embeddings/Package_Embeddings_FastRP_UMAP2D_Scatter.svg) | +| [Overview](./domains/overview/README.md) | High-level project structure, composition, counts, and complexity distributions | [AxonFramework](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/overview/overview_report.md) | [react-router](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/react-router/latest/overview/overview_report.md) | [Java](./domains/overview/explore/OverviewJavaExploration.ipynb) , [TypeScript](./domains/overview/explore/OverviewTypescriptExploration.ipynb) | [Packages Per Artifact](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/overview/Overview_Java_Packages_Per_Artifact.svg) | ### Support Domain @@ -75,35 +77,36 @@ If you 
think in architecture terms: domains are the vertical slices, report type Here is a curated overview of report examples and exploratory notebooks from [code-graph-analysis-examples](https://github.com/JohT/code-graph-analysis-examples). These examples are grouped by user-facing output, not by domain. -- [External Dependencies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/external-dependencies-java/ExternalDependenciesJava.md) contains detailed information about external library usage ([Notebook](./domains/external-dependencies/explore/ExternalDependenciesJava.ipynb)). -- [Git History](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/git-history-general/GitHistoryGeneral.md) contains information about the git history of the analyzed code ([Notebook](./domains/git-history/explore/GitHistoryGeneralExploration.ipynb)). -- [Internal Dependencies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies-java/InternalDependenciesJava.md) is based on [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html) and also includes cyclic dependencies ([Notebook](./domains/internal-dependencies/explore/InternalDependenciesJava.ipynb)). -- [Method Metrics](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/method-metrics-java/MethodMetricsJava.md) shows how the effective number of lines of code and the cyclomatic complexity are distributed across the methods in the code ([Notebook](./domains/java/explore/MethodMetricsJavaExploration.ipynb)). 
-- [Node Embeddings](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/node-embeddings-java/NodeEmbeddingsJava.md) shows how to generate node embeddings and to further reduce their dimensionality to be able to visualize them in a 2D plot ([Notebook](./domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb)). -- [Object Oriented Design Quality Metrics](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/object-oriented-design-metrics-java/ObjectOrientedDesignMetricsJava.md) is based on [OO Design Quality Metrics by Robert Martin](https://api.semanticscholar.org/CorpusID:18246616) ([Notebook](./domains/internal-dependencies/explore/ObjectOrientedDesignMetricsJava.ipynb)). -- [Overview](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/overview-java/OverviewJava.md) contains overall statistics and details about methods and their complexity. ([Notebook](./domains/overview/explore/OverviewJavaExploration.ipynb)). -- [Visibility Metrics](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/visibility-metrics-java/VisibilityMetricsJava.md) ([Notebook](./domains/internal-dependencies/explore/VisibilityMetricsJava.ipynb)). -- [Wordcloud](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/wordcloud/Wordcloud.md) contains a visual representation of package and class names ([Notebook](./domains/internal-dependencies/explore/Wordcloud.ipynb)). 
-- [Java Archetypes Treemap](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/anomaly-detection/JavaTreemap2ArchetypesOverviewPerDirectory.svg) ([Python Script](./domains/anomaly-detection/treemapVisualizations.py)) +- [External Dependencies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/external-dependencies/external_dependencies_report.md) contains detailed information about external library usage ([Notebook](./domains/external-dependencies/explore/ExternalDependenciesJava.ipynb)). +- [Git History](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/git-history/git_history_report.md) contains information about the git history of the analyzed code ([Notebook](./domains/git-history/explore/GitHistoryGeneralExploration.ipynb)). +- [Internal Dependencies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/internal_dependencies_report.md) is based on [Analyze java package metrics in a graph database](https://joht.github.io/johtizen/data/2023/04/21/java-package-metrics-analysis.html) ([Notebook](./domains/internal-dependencies/explore/InternalDependenciesJava.ipynb)). +- [Cyclic Dependencies](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/cyclic-dependencies/cyclic_dependencies_report.md) contains information about cyclic dependencies in the analyzed code ([Notebook](./domains/cyclic-dependencies/explore/CyclicDependenciesJavaExploration.ipynb)). 
+- [Java Method Metrics](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/java/java_report.md#3-method-metrics) shows how the effective number of lines of code and the cyclomatic complexity are distributed across the methods in the code ([Notebook](./domains/java/explore/MethodMetricsJavaExploration.ipynb)). +- [Node Embeddings](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/node-embeddings/node_embeddings_report.md) shows how to generate node embeddings and to further reduce their dimensionality to be able to visualize them in a 2D plot ([Notebook](./domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb)). +- [Object Oriented Design Quality Metrics](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/internal_dependencies_report.md#8-object-oriented-design-metrics) is based on [OO Design Quality Metrics by Robert Martin](https://api.semanticscholar.org/CorpusID:18246616) ([Notebook](./domains/internal-dependencies/explore/ObjectOrientedDesignMetricsJava.ipynb)). +- [Overview](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/overview/overview_report.md) contains overall statistics and details about methods and their complexity ([Notebook](./domains/overview/explore/OverviewJavaExploration.ipynb)). +- [Visibility Metrics](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/internal_dependencies_report.md#9-visibility-metrics) ([Notebook](./domains/internal-dependencies/explore/VisibilityMetricsJava.ipynb)). 
+- [Wordcloud](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/internal_dependencies_report.md#10-code-vocabulary) contains a visual representation of package and class names ([Notebook](./domains/internal-dependencies/explore/Wordcloud.ipynb)). +- [Java Archetypes Treemap](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/archetypes/archetypes_report.md#13-overview-charts) ([Python Script](./domains/anomaly-detection/treemapVisualizations.py)) ### :blue_book: Graph Data Science Examples These examples show selected outputs powered by Neo4j's [Graph Data Science Library](https://neo4j.com/product/graph-data-science) across several domains. For a complete list, see the [CSV Cypher Query Report Reference](#page_with_curl-csv-cypher-query-report-reference). -- [Centrality with Page Rank](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/centrality-csv/Package_Centrality_Page_Rank.csv) ([Source Script](./domains/graph-algorithms/centralityCsv.sh)) -- [Community Detection with Leiden](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/community-csv/Package_communityLeidenId_Community__Metrics.csv) ([Source Script](./domains/graph-algorithms/communityCsv.sh)) -- [Node Embeddings with HashGNN](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/node-embeddings-csv/Package_Embeddings_HashGNN.csv) ([Source Script](./domains/node-embeddings/nodeEmbeddingsCsv.sh)) -- [Path Finding with all pairs shortest path](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/path-finding-csv/Package_all_pairs_shortest_paths_distribution_per_project.csv) ([Source Script](./domains/internal-dependencies/internalDependenciesCsv.sh)) -- [Similarity with 
Jaccard](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/similarity-csv/Package_Similarity.csv) ([Source Script](./domains/graph-algorithms/similarityCsv.sh)) -- [Topology Sort](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/topology-csv/Package_Topological_Sort.csv) ([Source Script](./domains/internal-dependencies/internalDependenciesCsv.sh)) +- [Centrality with Page Rank](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/graph-algorithms/Java_Package/centrality/Package_Centrality_Page_Rank.csv) ([Source Script](./domains/graph-algorithms/centralityCsv.sh)) +- [Community Detection with Leiden](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/graph-algorithms/Java_Package/communities/Package_communityLeidenId_Community__Metrics.csv) ([Source Script](./domains/graph-algorithms/communityCsv.sh)) +- [Node Embeddings with HashGNN](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/node-embeddings/Package_Embeddings_HashGNN.csv) ([Source Script](./domains/node-embeddings/nodeEmbeddingsCsv.sh)) +- [Path Finding with all pairs shortest path](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/Java_Package/Package_all_pairs_shortest_paths_distribution_per_project.csv) ([Source Script](./domains/internal-dependencies/internalDependenciesCsv.sh)) +- [Similarity with Jaccard](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/graph-algorithms/Java_Package/similarity/Package_Similarity.csv) ([Source Script](./domains/graph-algorithms/similarityCsv.sh)) +- [Topology 
Sort](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/Java_Package/Package_Topological_Sort.csv) ([Source Script](./domains/internal-dependencies/internalDependenciesCsv.sh)) ### :art: Graph Visualization Examples Here are some fully automated graph visualizations utilizing [GraphViz](https://graphviz.org) from [code-graph-analysis-examples](https://github.com/JohT/code-graph-analysis-examples): -- [Java Artifact Build Levels](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies-visualization/JavaArtifactBuildLevels.svg) ([Query](./domains/internal-dependencies/queries/internal-dependencies/Java_Artifact_build_levels_for_graphviz.cypher), [Source Script](./scripts/visualization/visualizeQueryResults.sh)) -- [Java Artifact Longest Path Contributors](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/path-finding-visualization/JavaArtifactLongestPaths.svg) ([Query](./domains/internal-dependencies/queries/path-finding/Path_Finding_6_Longest_paths_contributors_for_graphviz.cypher), [Source Script](./scripts/visualization/visualizeQueryResults.sh)) -- [Java Package Top #1 Authority Archetype and contributing packages](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/anomaly-detection/Java_Package/GraphVisualizations/TopAuthority1.svg) ([Query](./domains/archetypes/labels/ArchetypeAuthority.cypher), [Source Script](./domains/archetypes/graphs/archetypesGraphs.sh)) +- [Java Artifact Build Levels](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/Java_Artifact/Graph_Visualizations/JavaArtifactBuildLevels.svg) ([Query](./domains/internal-dependencies/queries/internal-dependencies/Java_Artifact_build_levels_for_graphviz.cypher), [Source 
Script](./scripts/visualization/visualizeQueryResults.sh)) +- [Java Artifact Longest Path Contributors](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/internal-dependencies/Java_Artifact/Graph_Visualizations/JavaArtifactLongestPaths.svg) ([Query](./domains/internal-dependencies/queries/path-finding/Path_Finding_6_Longest_paths_contributors_for_graphviz.cypher), [Source Script](./scripts/visualization/visualizeQueryResults.sh)) +- [Java Package Top #1 Authority Archetype and contributing packages](https://github.com/JohT/code-graph-analysis-examples/blob/main/analysis-results/AxonFramework/latest/archetypes/Java_Package/GraphVisualizations/TopAuthority1.svg) ([Query](./domains/archetypes/labels/ArchetypeAuthority.cypher), [Source Script](./domains/archetypes/graphs/archetypesGraphs.sh)) ## :book: Blog Articles diff --git a/conda-environment.yml b/conda-environment.yml index dd21b4e58..a63c97f12 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -8,11 +8,11 @@ dependencies: - python=3.12.9 - _python_rc=1 # Needed on Mac since Python >= 3.12 - ipykernel=7.2.0 + - nbformat=5.10.4 # For Jupyter notebook environments. Required for plotly to render plots in notebooks. - matplotlib=3.10.9 - numpy=2.2.5 - pandas=2.3.3 - pip=26.1 - - setuptools=80.10.2 # opentsne uses sklearn.base uses joblib uses distutils missing in Python >= 12 (TODO use native openTSNE?) 
- wordcloud=1.9.6 - monotonic=1.6 - plotly=6.6.0 # v6.7.0 in pyproject.toml not yet available in conda-forge diff --git a/domains/anomaly-detection/explore/NodeEmbeddingsHyperparameterTuningExploration.ipynb b/domains/anomaly-detection/explore/NodeEmbeddingsHyperparameterTuningExploration.ipynb index 9c9627e1d..11dc951d6 100644 --- a/domains/anomaly-detection/explore/NodeEmbeddingsHyperparameterTuningExploration.ipynb +++ b/domains/anomaly-detection/explore/NodeEmbeddingsHyperparameterTuningExploration.ipynb @@ -54,12 +54,16 @@ "source": [ "import os\n", "import contextlib\n", + "from typing import Literal\n", "\n", "from IPython.display import display\n", "import pandas as pd\n", "import typing as typ\n", "import numpy as np\n", - "from openTSNE.sklearn import TSNE\n", + "import numpy.typing as numpy_typing\n", + "from numpy.typing import NDArray\n", + "import umap\n", + "import optuna\n", "\n", "from sklearn.base import BaseEstimator\n", "from sklearn.model_selection import GridSearchCV\n", @@ -68,7 +72,7 @@ "from sklearn.cluster import HDBSCAN\n", "\n", "import matplotlib.pyplot as plot\n", - "import seaborn" + "import seaborn\n" ] }, { @@ -126,8 +130,8 @@ "from numpy import __version__ as numpy_version\n", "print('numpy version: {}'.format(numpy_version))\n", "\n", - "from openTSNE import __version__ as openTSNE_version\n", - "print('openTSNE version: {}'.format(openTSNE_version))\n", + "from umap import __version__ as umap_version\n", + "print('umap version: {}'.format(umap_version))\n", "\n", "from pandas import __version__ as pandas_version\n", "print('pandas version: {}'.format(pandas_version))\n", @@ -156,7 +160,7 @@ "\n", "driver = GraphDatabase.driver(\n", " uri=\"bolt://localhost:7687\", \n", - " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\"))\n", + " auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")) # type: ignore\n", ")\n", "driver.verify_connectivity()" ] @@ -174,7 +178,7 @@ " \n", "\n", "def 
query_cypher_to_data_frame(filename, parameters: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", - " records, summary, keys = driver.execute_query(query_=get_cypher_query_from_file(filename), parameters_=parameters)\n", + " records, summary, keys = driver.execute_query(query_=get_cypher_query_from_file(filename), parameters_=parameters) # type: ignore\n", " return pd.DataFrame([r.values() for r in records], columns=keys)\n", "\n", "\n", @@ -227,7 +231,7 @@ "metadata": {}, "outputs": [], "source": [ - "def write_batch_data_into_database(dataframe: pd.DataFrame, node_label: str, id_column: str = \"nodeElementId\", cypher_query_file: str = \"../cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher\", batch_size: int = 1000):\n", + "def write_batch_data_into_database(dataframe: pd.DataFrame, node_label: str, id_column: str = \"nodeElementId\", cypher_query_file: str = \"../../../cypher/Dependencies_Projection/Dependencies_14_Write_Batch_Data.cypher\", batch_size: int = 1000):\n", " \"\"\"\n", " Writes the given dataframe to the Neo4j database using a batch write operation.\n", " \n", @@ -256,7 +260,7 @@ " for start in range(0, len(dataframe), batch_size):\n", " batch_dataframe = dataframe.iloc[start:start + batch_size]\n", " batch_rows = prepare_rows(batch_dataframe)\n", - " return session.execute_write(update_batch, batch_rows)" + " return session.execute_write(update_batch, batch_rows)\n" ] }, { @@ -282,22 +286,22 @@ " The name of the node property that contains the dependency weight. 
Example: \"weight25PercentInterfaces\"\n", " \"\"\"\n", " \n", - " is_data_missing=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", + " is_data_missing=query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", " if is_data_missing: return False\n", "\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", dict(dependencies_projection=parameters[\"dependencies_projection\"] + '-cleaned-sampled'))\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", dict(dependencies_projection=parameters[\"dependencies_projection\"] + '-cleaned-sampled'))\n", " # To include the direction of the relationships use the following line to create the projection:\n", - " # query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", + " # query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", " node_count : int = 0\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", - " results=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", 
parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", + " results=query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", parameters)\n", " node_count=results[\"nodeCount\"].values[0]\n", " \n", " print(\"The number of nodes in the original projection is: \" + str(node_count))\n", "\n", - " return True" + " return True\n" ] }, { @@ -307,9 +311,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy.typing as numpy_typing\n", - "import numpy as np\n", - "\n", "def get_projected_graph_information(projection_name: str) -> pd.DataFrame:\n", " \"\"\"\n", " Returns the projection information for the given parameters.\n", @@ -322,7 +323,7 @@ " parameters = dict(\n", " dependencies_projection=projection_name,\n", " )\n", - " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", + " return query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", "\n", "\n", "def get_projected_graph_node_count(projection_name: str) -> int:\n", @@ -333,11 +334,12 @@ " projection_name : str\n", " The name prefix for the in-memory projection for dependencies. 
Example: \"java-package-embeddings-notebook\"\n", " \"\"\"\n", - " \n", - " graph_information = get_projected_graph_information(projection_name)\n", - " if graph_information.empty:\n", + "\n", + " graph_statistics = get_projected_graph_information(projection_name)\n", + " if graph_statistics.empty:\n", " return 0\n", - " return graph_information[\"nodeCount\"].values[0]\n", + " return int(graph_statistics[\"nodeCount\"].values[0])\n", + "\n", "\n", "\n", "def get_all_data_without_slicing_cross_validator_for_node_count(node_count: int) -> typ.List[typ.Tuple[np.ndarray, np.ndarray]]:\n", @@ -456,17 +458,17 @@ " return GraphSamplingResult.not_sampled(parameters)\n", " \n", " # Delete sampled graph projection if it already exists\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", dict(dependencies_projection=parameters[\"dependencies_projection\"] + '-cleaned-sampled-cleaned'))\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", dict(dependencies_projection=parameters[\"dependencies_projection\"] + '-cleaned-sampled-cleaned'))\n", "\n", " sampling_parameters = dict(\n", " dependencies_projection = parameters[\"dependencies_projection\"] + '-cleaned',\n", " dependencies_projection_sampling_ratio = graph_sampling_threshold / node_count\n", " )\n", - " results=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher\", sampling_parameters)\n", + " results=query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_13_Sample_Projected_Graph.cypher\", sampling_parameters)\n", " node_count=results[\"nodeCount\"].values[0]\n", " print(\"The number of nodes in the sampled projection is: \" + str(node_count))\n", " \n", - " return GraphSamplingResult(True, node_count, parameters)" + " return GraphSamplingResult(True, node_count, parameters)\n" ] }, { @@ -566,8 +568,6 @@ 
"metadata": {}, "outputs": [], "source": [ - "from numpy.typing import NDArray\n", - "\n", "def get_noise_ratio(clustering_results: NDArray) -> float:\n", " \"\"\"\n", " Returns the ratio of noise points in the clustering results.\n", @@ -650,7 +650,8 @@ "metadata": {}, "outputs": [], "source": [ - "from numpy.typing import NDArray\n", + "import copy\n", + "\n", "\n", "class TunedClusteringResult:\n", " def __init__(self, labels: NDArray, probabilities : NDArray):\n", @@ -701,7 +702,7 @@ " all_data_without_slicing_cross_validator = [(np.arange(len(embeddings)), np.arange(len(embeddings)))]\n", "\n", " tuned_hdbscan = GridSearchCV(\n", - " estimator=HDBSCAN(),\n", + " estimator=HDBSCAN(copy=True),\n", " refit=False, # Without refit, the estimator doesn't need to implement the 'predict' method. Drawback: Only the best parameters are returned, not the best model.\n", " param_grid=hyper_parameter_distributions,\n", " n_jobs=-1,\n", @@ -716,12 +717,12 @@ " print(\"Tuned HDBSCAN parameters:\", tuned_hdbscan.best_params_)\n", "\n", " # Run the clustering again with the best parameters\n", - " cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, n_jobs=-1, allow_single_cluster=False)\n", + " cluster_algorithm = HDBSCAN(**tuned_hdbscan.best_params_, n_jobs=-1, allow_single_cluster=False, copy=True)\n", " best_model = cluster_algorithm.fit(embeddings)\n", "\n", " results = TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n", " print(f\"Number of HDBSCAN clusters (excluding noise): {results.cluster_count:.0f}\")\n", - " return results" + " return results\n" ] }, { @@ -731,8 +732,6 @@ "metadata": {}, "outputs": [], "source": [ - "import optuna\n", - "\n", "def output_optuna_tuning_results(optimized_study: optuna.Study, name_of_the_optimized_algorithm: str):\n", " from typing import cast\n", " from optuna.importance import get_param_importances, MeanDecreaseImpurityImportanceEvaluator\n", @@ -755,8 +754,6 @@ "metadata": {}, "outputs": [], "source": [ - 
"from numpy.typing import NDArray\n", - "\n", "# TODO keep either this (additional optuna dependency) or the implementation above (no additional dependency but not as efficient)\n", "def tuned_hierarchical_density_based_spatial_clustering_optuna(embeddings: NDArray, reference_community_ids: NDArray) -> TunedClusteringResult:\n", " \"\"\"\n", @@ -782,7 +779,8 @@ " clusterer = HDBSCAN(\n", " **base_clustering_parameter,\n", " min_cluster_size=min_cluster_size,\n", - " min_samples=min_samples\n", + " min_samples=min_samples,\n", + " copy=True\n", " )\n", " labels = clusterer.fit_predict(embeddings)\n", " return adjusted_mutual_info_score_with_soft_ramp_noise_penalty(labels, reference_community_ids)\n", @@ -799,10 +797,10 @@ " output_optuna_tuning_results(study, 'HDBSCAN')\n", "\n", " # Run the clustering again with the best parameters\n", - " cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1)\n", + " cluster_algorithm = HDBSCAN(**base_clustering_parameter, **study.best_params, n_jobs=-1, copy=True)\n", " best_model = cluster_algorithm.fit(embeddings)\n", "\n", - " return TunedClusteringResult(best_model.labels_, best_model.probabilities_)" + " return TunedClusteringResult(best_model.labels_, best_model.probabilities_)\n" ] }, { @@ -812,8 +810,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy.typing as numpy_typing\n", - "\n", "class CommunityComparingScores:\n", " def __init__(self, adjusted_mutual_info_score: float, adjusted_rand_index: float, normalized_mutual_information: float):\n", " self.adjusted_mutual_info_score = adjusted_mutual_info_score\n", @@ -840,7 +836,7 @@ " ari = adjusted_rand_score(reference_community_ids[mask], cluster_labels[mask])\n", " nmi = float(normalized_mutual_info_score(reference_community_ids[mask], cluster_labels[mask]))\n", "\n", - " return CommunityComparingScores(ami, ari, nmi)" + " return CommunityComparingScores(ami, ari, nmi)\n" ] }, { @@ -850,9 +846,6 @@ "metadata": {}, "outputs": 
[], "source": [ - "from typing import Literal\n", - "import pandas as pd\n", - "\n", "def get_clustering_property_name(clustering_property_type: Literal['Label', 'Probability'] = 'Label', clustering_name: str = \"TunedHDBSCAN\"):\n", " \"\"\"\n", " Assembles the property name for clustering results.\n", @@ -877,7 +870,7 @@ " count=('codeUnitName', 'count'),\n", " communityIds=('communityId', lambda x: list(set(x))),\n", " codeUnitNames=('codeUnitName', lambda x: list(set(x))),\n", - " ).reset_index().sort_values(by='count', ascending=False)" + " ).reset_index().sort_values(by='count', ascending=False)\n" ] }, { @@ -1093,16 +1086,14 @@ " HierarchicalDensityClusteringScores\n", " An instance of HierarchicalDensityClusteringScores containing the clustering scores.\n", " \"\"\"\n", - " import numpy as np\n", - " from sklearn.cluster import HDBSCAN\n", - " \n", " hierarchical_density_based_spatial_clustering = HDBSCAN(\n", " cluster_selection_method='eom',\n", " metric='manhattan',\n", " min_samples=2,\n", " min_cluster_size=5,\n", " allow_single_cluster=False,\n", - " n_jobs=-1\n", + " n_jobs=-1,\n", + " copy=True\n", " )\n", " embeddings = np.array(embedding_column.tolist())\n", " clustering_result = hierarchical_density_based_spatial_clustering.fit(embeddings)\n", @@ -1961,60 +1952,6 @@ " return NodeEmbeddingsCreationResult(embeddings, sampling_result.is_sampled)" ] }, - { - "cell_type": "markdown", - "id": "f6ec6a9b", - "metadata": {}, - "source": [ - "### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", - "\n", - "The following function takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. 
\n", - "\n", - "> It converts similarities between data points to joint probabilities and tries to minimize the Kullback-Leibler divergence between the joint probabilities of the low-dimensional embedding and the high-dimensional data.\n", - "\n", - "(see https://opentsne.readthedocs.io)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "720aebd3", - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_node_embeddings_for_2d_visualization_tsne(embeddings: pd.DataFrame) -> pd.DataFrame:\n", - " \"\"\"\n", - " Reduces the dimensionality of the node embeddings (e.g. 64 floating point numbers in an array)\n", - " to two dimensions for 2D visualization.\n", - " see https://opentsne.readthedocs.io\n", - " \"\"\"\n", - "\n", - " if embeddings.empty: \n", - " print(\"No projected data for node embeddings dimensionality reduction available\")\n", - " return embeddings\n", - " \n", - " # Calling the fit_transform method just with a list doesn't seem to work (anymore?). 
\n", - " # It leads to an error with the following message: 'list' object has no attribute 'shape'\n", - " # This can be solved by converting the list to a numpy array using np.array(..).\n", - " # See https://bobbyhadz.com/blog/python-attributeerror-list-object-has-no-attribute-shape\n", - " embeddings_as_numpy_array = np.array(embeddings.embedding.to_list())\n", - "\n", - " # Use t-distributed stochastic neighbor embedding (t-SNE) to reduce the dimensionality \n", - " # of the previously calculated node embeddings to 2 dimensions for visualization\n", - " t_distributed_stochastic_neighbor_embedding = TSNE(n_components=2, verbose=False, random_state=47)\n", - " two_dimension_node_embeddings = t_distributed_stochastic_neighbor_embedding.fit_transform(embeddings_as_numpy_array)\n", - " # display(two_dimension_node_embeddings.shape) # Display the shape of the t-SNE result\n", - "\n", - " # Create a new DataFrame with the results of the 2 dimensional node embeddings\n", - " # and the code unit and artifact name of the query above as preparation for the plot\n", - " embeddings['embeddingVisualizationX'] = [value[0] for value in two_dimension_node_embeddings]\n", - " embeddings['embeddingVisualizationY'] = [value[1] for value in two_dimension_node_embeddings]\n", - "\n", - " # display(embeddings.head(10)) # Display the first line of the results\n", - " return embeddings\n", - " " - ] - }, { "cell_type": "markdown", "id": "dd9b83c2", @@ -2036,8 +1973,6 @@ "metadata": {}, "outputs": [], "source": [ - "import umap\n", - "\n", "def prepare_node_embeddings_for_2d_visualization_umap(embeddings: pd.DataFrame) -> pd.DataFrame:\n", " \"\"\"\n", " Reduces the dimensionality of the node embeddings (e.g. 
64 floating point numbers in an array)\n", @@ -2054,12 +1989,13 @@ "\n", " # Use UMAP to reduce the dimensionality to 2D for visualization\n", " # umap_reducer = umap.UMAP(min_dist=0.3, n_neighbors=15, n_components=2, metric='manhattan', random_state=47)\n", - " umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47)\n", + " umap_reducer = umap.UMAP(n_components=2, min_dist=0.3, random_state=47, n_jobs=1)\n", " two_dimension_node_embeddings = umap_reducer.fit_transform(embeddings_as_numpy_array)\n", + " two_dimension_node_embeddings = np.asarray(two_dimension_node_embeddings)\n", "\n", " # Add the 2D coordinates to the DataFrame\n", - " embeddings['embeddingUMAPVisualizationX'] = two_dimension_node_embeddings[:, 0]\n", - " embeddings['embeddingUMAPVisualizationY'] = two_dimension_node_embeddings[:, 1]\n", + " embeddings['embeddingVisualizationX'] = two_dimension_node_embeddings[:, 0]\n", + " embeddings['embeddingVisualizationY'] = two_dimension_node_embeddings[:, 1]\n", "\n", " return embeddings\n" ] @@ -2072,9 +2008,7 @@ "outputs": [], "source": [ "def prepare_node_embeddings_for_2d_visualization(embeddings: pd.DataFrame) -> pd.DataFrame:\n", - " embeddings = prepare_node_embeddings_for_2d_visualization_tsne(embeddings)\n", - " embeddings = prepare_node_embeddings_for_2d_visualization_umap(embeddings)\n", - " return embeddings" + " return prepare_node_embeddings_for_2d_visualization_umap(embeddings)\n" ] }, { @@ -2131,11 +2065,6 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plot\n", - "import seaborn\n", - "import numpy as np\n", - "\n", "def plot_2d_node_embeddings(\n", " node_embeddings_for_visualization: pd.DataFrame,\n", " title: str,\n", @@ -2444,8 +2373,6 @@ "# since the sampling is not necessary for Fast Random Projection embeddings.\n", "tuneable_fast_random_projection = 
create_tuneable(TuneableFastRandomProjectionNodeEmbeddings).with_projection_parameters(java_package_projection_parameters)\n", "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", "hyperparameter_tuning = GridSearchCV(\n", " estimator=tuneable_fast_random_projection,\n", " param_grid=tuneable_fast_random_projection_parameter_grid,\n", @@ -2464,7 +2391,7 @@ " display(embeddings.head())\n", "\n", " # Write the results back into the Neo4j database\n", - " tuned_fast_random_projection.best_estimator_.write_embeddings()" + " tuned_fast_random_projection.best_estimator_.write_embeddings()\n" ] }, { @@ -2472,9 +2399,9 @@ "id": "76d8bca1", "metadata": {}, "source": [ - "#### Dimensionality reduction with t-distributed stochastic neighbor embedding (t-SNE)\n", + "#### Dimensionality reduction with Uniform Manifold Approximation and Projection (UMAP)\n", "\n", - "This step takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. For more details look up the function declaration for \"prepare_node_embeddings_for_2d_visualization\"." + "This step takes the original node embeddings with a higher dimensionality, e.g. 64 floating point numbers, and reduces them into a two dimensional array for visualization. 
For more details look up the function declaration for \"prepare_node_embeddings_for_2d_visualization\".\n" ] }, { @@ -2506,14 +2433,8 @@ "if java_package_data_available:\n", " plot_2d_node_embeddings(\n", " node_embeddings_for_visualization, \n", - " \"Java Package positioned by their dependency relationships (FastRP node embeddings + t-SNE)\"\n", - " )\n", - " plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Package positioned by their dependency relationships (FastRP node embeddings + UMAP)\",\n", - " x_position_column='embeddingUMAPVisualizationX',\n", - " y_position_column='embeddingUMAPVisualizationY'\n", - " )" + " \"Java Package positioned by their dependency relationships (FastRP node embeddings + UMAP)\"\n", + " )\n" ] }, { @@ -2621,13 +2542,7 @@ " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", " plot_2d_node_embeddings(\n", " node_embeddings_for_visualization,\n", - " \"Java Packages positioned by their dependency relationships (HashGNN + t-SNE)\"\n", - " )\n", - " plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Packages positioned by their dependency relationships (HashGNN + UMAP)\",\n", - " x_position_column='embeddingUMAPVisualizationX',\n", - " y_position_column='embeddingUMAPVisualizationY'\n", + " \"Java Packages positioned by their dependency relationships (HashGNN + UMAP)\"\n", " )\n", "# -------\n", "tuneable_hashgnn_parameter_grid = {\n", @@ -2641,8 +2556,6 @@ "\n", "tuneable_hashgnn = create_tuneable(TuneableHashGNNNodeEmbeddings).with_projection_parameters(java_package_sampled_projection_parameters)\n", "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", "hyperparameter_tuning = GridSearchCV(\n", " estimator=tuneable_hashgnn,\n", " param_grid=tuneable_hashgnn_parameter_grid,\n", @@ -2665,8 +2578,8 @@ "\n", " plot_2d_node_embeddings(\n", " prepare_node_embeddings_for_2d_visualization(embeddings),\n", - " 
\"Java Packages positioned by their dependency relationships (HashGNN + t-SNE)\"\n", - " )" + " \"Java Packages positioned by their dependency relationships (HashGNN + UMAP)\"\n", + " )\n" ] }, { @@ -2754,13 +2667,7 @@ " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", " plot_2d_node_embeddings(\n", " node_embeddings_for_visualization,\n", - " \"Java Packages positioned by their dependency relationships (node2vec + t-SNE)\"\n", - " )\n", - " plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Packages positioned by their dependency relationships (node2vec + UMAP)\",\n", - " x_position_column='embeddingUMAPVisualizationX',\n", - " y_position_column='embeddingUMAPVisualizationY'\n", + " \"Java Packages positioned by their dependency relationships (node2vec + UMAP)\"\n", " )\n", "# -------\n", " \n", @@ -2779,8 +2686,6 @@ "\n", "tuneable_node2vec = create_tuneable(TuneableNode2VecNodeEmbeddings).with_projection_parameters(java_package_sampled_projection_parameters)\n", "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", "hyperparameter_tuning = GridSearchCV(\n", " estimator=tuneable_node2vec,\n", " param_grid=tuneable_node2vec_parameter_grid,\n", @@ -2803,8 +2708,8 @@ "\n", " plot_2d_node_embeddings(\n", " prepare_node_embeddings_for_2d_visualization(embeddings),\n", - " \"Java Packages positioned by their dependency relationships (node2vec + t-SNE)\"\n", - " )" + " \"Java Packages positioned by their dependency relationships (node2vec + UMAP)\"\n", + " )\n" ] }, { @@ -2910,8 +2815,6 @@ "# since the sampling is not necessary for Fast Random Projection embeddings.\n", "tuneable_fast_random_projection = create_tuneable(TuneableFastRandomProjectionNodeEmbeddings).with_projection_parameters(java_type_projection_parameters)\n", "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", "hyperparameter_tuning = GridSearchCV(\n", " 
estimator=tuneable_fast_random_projection,\n", " param_grid=tuneable_fast_random_projection_parameter_grid,\n", @@ -2935,13 +2838,7 @@ " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", " plot_2d_node_embeddings(\n", " node_embeddings_for_visualization, \n", - " \"Java Types positioned by their dependency relationships (Fast Random Projection + t-SNE)\"\n", - " )\n", - " plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Types positioned by their dependency relationships (Fast Random Projection + UMAP)\",\n", - " x_position_column='embeddingUMAPVisualizationX',\n", - " y_position_column='embeddingUMAPVisualizationY'\n", + " \"Java Types positioned by their dependency relationships (Fast Random Projection + UMAP)\"\n", " )\n", "\n", " data_to_write = pd.DataFrame(data = {\n", @@ -2951,7 +2848,7 @@ " 'embeddingFastRandomProjectionVisualizationX': embeddings[\"embeddingVisualizationX\"],\n", " 'embeddingFastRandomProjectionVisualizationY': embeddings[\"embeddingVisualizationY\"],\n", " })\n", - " write_batch_data_into_database(data_to_write, 'Type')" + " write_batch_data_into_database(data_to_write, 'Type')\n" ] }, { @@ -2987,13 +2884,7 @@ " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", " plot_2d_node_embeddings(\n", " node_embeddings_for_visualization,\n", - " \"Java Types positioned by their dependency relationships (HashGNN + t-SNE)\"\n", - " )\n", - " plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Types positioned by their dependency relationships (HashGNN + UMAP)\",\n", - " x_position_column='embeddingUMAPVisualizationX',\n", - " y_position_column='embeddingUMAPVisualizationY'\n", + " \"Java Types positioned by their dependency relationships (HashGNN + UMAP)\"\n", " )\n", "\n", "# -------\n", @@ -3009,8 +2900,6 @@ "\n", "tuneable_hashgnn = 
create_tuneable(TuneableHashGNNNodeEmbeddings).with_projection_parameters(java_type_sampled_projection_parameters)\n", "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", "hyperparameter_tuning = GridSearchCV(\n", " estimator=tuneable_hashgnn,\n", " param_grid=tuneable_hashgnn_parameter_grid,\n", @@ -3033,8 +2922,8 @@ "\n", " plot_2d_node_embeddings(\n", " prepare_node_embeddings_for_2d_visualization(embeddings),\n", - " \"Java Types positioned by their dependency relationships (HashGNN + t-SNE)\"\n", - " )" + " \"Java Types positioned by their dependency relationships (HashGNN + UMAP)\"\n", + " )\n" ] }, { @@ -3067,13 +2956,7 @@ " node_embeddings_for_visualization = prepare_node_embeddings_for_2d_visualization(embeddings)\n", " plot_2d_node_embeddings(\n", " node_embeddings_for_visualization,\n", - " \"Java Types positioned by their dependency relationships (node2vec + t-SNE)\"\n", - " )\n", - " plot_2d_node_embeddings(\n", - " node_embeddings_for_visualization, \n", - " \"Java Types positioned by their dependency relationships (node2vec + UMAP)\",\n", - " x_position_column='embeddingUMAPVisualizationX',\n", - " y_position_column='embeddingUMAPVisualizationY'\n", + " \"Java Types positioned by their dependency relationships (node2vec + UMAP)\"\n", " )\n", "# -------\n", "tuneable_node2vec_parameter_grid = {\n", @@ -3091,8 +2974,6 @@ "\n", "tuneable_node2vec = create_tuneable(TuneableNode2VecNodeEmbeddings).with_projection_parameters(java_type_sampled_projection_parameters)\n", "\n", - "from sklearn.model_selection import GridSearchCV\n", - "\n", "hyperparameter_tuning = GridSearchCV(\n", " estimator=tuneable_node2vec,\n", " param_grid=tuneable_node2vec_parameter_grid,\n", @@ -3115,8 +2996,8 @@ "\n", " plot_2d_node_embeddings(\n", " prepare_node_embeddings_for_2d_visualization(embeddings),\n", - " \"Java Types positioned by their dependency relationships (node2vec + t-SNE)\"\n", - " )" + " \"Java Types positioned by their dependency relationships 
(node2vec + UMAP)\"\n", + " )\n" ] } ], diff --git a/domains/external-dependencies/explore/ExternalDependenciesJava.ipynb b/domains/external-dependencies/explore/ExternalDependenciesJava.ipynb index bc307525a..8a11c2a07 100644 --- a/domains/external-dependencies/explore/ExternalDependenciesJava.ipynb +++ b/domains/external-dependencies/explore/ExternalDependenciesJava.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 233, + "execution_count": null, "id": "4191f259", "metadata": {}, "outputs": [], @@ -24,21 +24,26 @@ "import os\n", "import pandas as pd\n", "import matplotlib.pyplot as plot\n", - "from neo4j import GraphDatabase" + "from neo4j import GraphDatabase\n", + "from typing import Any" ] }, { "cell_type": "code", - "execution_count": 234, + "execution_count": null, "id": "1c5dab37", "metadata": {}, "outputs": [], "source": [ - "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", - "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell\n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\".\n", "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", "\n", - "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "neo4j_password = os.environ.get(\"NEO4J_INITIAL_PASSWORD\")\n", + "if not neo4j_password:\n", + " raise ValueError(\"NEO4J_INITIAL_PASSWORD environment variable must be set\")\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", neo4j_password))\n", "driver.verify_connectivity()" ] }, @@ -49,169 +54,177 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", - " with open(filename) as file:\n", - " return ' '.join(file.readlines())\n", + "from typing import cast, LiteralString\n", + "\n", + "\n", + "def get_cypher_query_from_file(filename: str) -> str:\n", + " with open(filename, encoding=\"utf-8\") as file:\n", + " return \" \".join(file.readlines())\n", "\n", "\n", - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" + "def query_cypher_to_data_frame(filename: str) -> pd.DataFrame:\n", + " records, _, keys = driver.execute_query(cast(LiteralString, get_cypher_query_from_file(filename)))\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", + "\n", + "\n", + "main_color_map: str = \"nipy_spectral\"" ] }, { "cell_type": "code", - "execution_count": 237, + "execution_count": null, "id": "f02d9944", "metadata": {}, "outputs": [], "source": [ - "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + "def 
group_to_others_below_threshold(\n", + " data_frame: pd.DataFrame, value_column: str, name_column: str, threshold: float\n", + ") -> pd.DataFrame:\n", " \"\"\"\n", - " Adds a new percentage column for the value column and \n", + " Adds a new percentage column for the value column and\n", " groups all values below the given threshold to \"others\" in the name column.\n", "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", - " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + " Args:\n", + " data_frame: Input pandas DataFrame\n", + " value_column: Name of the column that contains the numeric value\n", + " name_column: Name of the column that contains the group name that will be replaced by \"others\"\n", + " threshold: Threshold in % that is used to group values below it into the \"others\" group\n", "\n", " Returns:\n", - " int:Returning value\n", - "\n", + " DataFrame with grouped values sorted descending by percentage\n", " \"\"\"\n", - " result_data_frame = data_frame[[name_column, value_column]].copy();\n", + " result_data_frame = data_frame[[name_column, value_column]].copy()\n", "\n", - " percent_column_name = value_column + 'Percent';\n", + " percent_column_name = value_column + \"Percent\"\n", "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + " result_data_frame[percent_column_name] = (\n", + " result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0\n", + " )\n", "\n", - " # Convert name column to string values if it wasn't of that type 
before\n", " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", "\n", - " # Change the group name to \"others\" if it is called less than the specified threshold\n", - " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = \"others\"\n", "\n", - " # Group by name column (foremost the new \"others\" entries) and sum their percentage\n", - " #result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", - " result_data_frame = result_data_frame.groupby(name_column).sum();\n", - " # Sort by values descending\n", - " #return result_data_frame.sort_values(ascending=False).to_frame();\n", - " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + " result_data_frame = result_data_frame.groupby(name_column).sum()\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False)" ] }, { "cell_type": "code", - "execution_count": 238, + "execution_count": null, "id": "47cc11b0", "metadata": {}, "outputs": [], "source": [ - "def filter_values_below_threshold(data_frame : pd.DataFrame, value_column : str, upper_limit: float = 100.0) -> pd.DataFrame: \n", + "def filter_values_below_threshold(\n", + " data_frame: pd.DataFrame, value_column: str, upper_limit: float = 100.0\n", + ") -> pd.DataFrame:\n", " \"\"\"\n", - " Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", + " Adds a new percentage column for the value column and filters entries.\n", "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - upper_limit (float): Defaults to 100%. Filters out all entries exceeding this limit. 
Intended to drill down \"others\" in a second chart/table.\n", + " Args:\n", + " data_frame: Input pandas DataFrame\n", + " value_column: Name of the column that contains the numeric value\n", + " upper_limit: Defaults to 100%. Filters out all entries exceeding this limit.\n", + " Intended to drill down \"others\" in a second chart/table.\n", "\n", " Returns:\n", - " int:Returning value\n", - "\n", + " Filtered DataFrame sorted descending by percentage\n", " \"\"\"\n", - " result_data_frame = data_frame.copy();\n", + " result_data_frame = data_frame.copy()\n", "\n", - " percent_column_name = value_column + 'Percent';\n", + " percent_column_name = value_column + \"Percent\"\n", "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + " result_data_frame[percent_column_name] = (\n", + " result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0\n", + " )\n", "\n", - " # Limit entries to meet the an optional upper limit (in percentage)\n", - " result_data_frame = result_data_frame.query(\"`\" + percent_column_name + \"` <= \" + str(upper_limit))\n", + " result_data_frame = result_data_frame.query(f\"`{percent_column_name}` <= {upper_limit}\")\n", "\n", " result_data_frame = result_data_frame.reset_index(drop=True)\n", - " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + " return result_data_frame.sort_values(by=percent_column_name, ascending=False)" ] }, { "cell_type": "code", - "execution_count": 239, + "execution_count": null, "id": "89a12ec4", "metadata": {}, "outputs": [], "source": [ - "def explode_index_value(input_data_frame: pd.DataFrame, index_value_to_emphasize: str = 'others', base_value: float = 0.02, emphasize_value: float = 0.2):\n", + "import numpy as np\n", + "\n", + "\n", + "def explode_index_value(\n", + " 
input_data_frame: pd.DataFrame,\n", + " index_value_to_emphasize: str = \"others\",\n", + " base_value: float = 0.02,\n", + " emphasize_value: float = 0.2,\n", + ") -> np.ndarray[Any, np.dtype[np.floating[Any]]]:\n", " \"\"\"\n", - " \"Explode\" offsets slices in a pie chart plot by a given value.\n", - " The specified index value will be emphasized with a larger value to make it stand out in the pie chart plot.\n", + " Generate explode offsets for pie chart slices.\n", "\n", - " Parameters:\n", - " - input_data_frame (pd.DataFrame): Input pandas DataFrame with the data that will be plot. (Required)\n", - " - index_value_to_emphasize (str): Value of the index that will be emphasized. (Default= 'others')\n", - " - base_value (float): Base value for all pies in the chart. (Default=0.02)\n", - " - emphasize_value (float): Value for the emphasized pie in the chart. (Default=0.2)\n", + " The specified index value will be emphasized with a larger offset to make it stand out.\n", "\n", - " Returns:\n", - " Array with the same size as the number of rows/pies to plot containing the \"explode\" value for each of them\n", + " Args:\n", + " input_data_frame: Input pandas DataFrame with the data that will be plotted\n", + " index_value_to_emphasize: Value of the index that will be emphasized (Default=\"others\")\n", + " base_value: Base offset value for all slices (Default=0.02)\n", + " emphasize_value: Offset value for the emphasized slice (Default=0.2)\n", "\n", + " Returns:\n", + " Array with the same length as the number of rows containing the explode offset for each slice\n", " \"\"\"\n", - " # Each entry in the list corresponds to an x value\n", - " # The comparison with the index_value_to_emphasize produces an array of booleans where nth entry with the emphasized value is \"true\"\n", - " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the emphasized entry whilst \n", - " return (input_data_frame.index == index_value_to_emphasize) * 
emphasize_value + base_value " + " return (input_data_frame.index == index_value_to_emphasize) * emphasize_value + base_value" ] }, { "cell_type": "code", - "execution_count": 240, + "execution_count": null, "id": "e9b1ccad", "metadata": {}, "outputs": [], "source": [ - "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str):\n", + "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str) -> None:\n", + " \"\"\"Render and display a pie chart from a DataFrame.\"\"\"\n", " if input_data_frame.empty:\n", - " print(\"No data to plot for title '\" + title + \"'.\")\n", + " print(f\"No data to plot for title '{title}'.\")\n", " return\n", - " \n", - " name_of_the_first_column_containing_the_values=input_data_frame.columns[0]\n", + "\n", + " name_of_the_first_column_containing_the_values = input_data_frame.columns[0]\n", " total_sum = input_data_frame[name_of_the_first_column_containing_the_values].sum()\n", - " \n", - " def custom_auto_percentage_format(percentage):\n", - " return '{:1.2f}% ({:.0f})'.format(percentage, total_sum * percentage / 100.0)\n", - " \n", - " plot.figure();\n", + "\n", + " def custom_auto_percentage_format(percentage: float) -> str:\n", + " return f\"{percentage:1.2f}% ({total_sum * percentage / 100.0:.0f})\"\n", + "\n", + " plot.figure()\n", "\n", " axis = input_data_frame.plot(\n", - " kind='pie',\n", - " y=name_of_the_first_column_containing_the_values + 'Percent',\n", - " ylabel='',\n", + " kind=\"pie\",\n", + " y=name_of_the_first_column_containing_the_values + \"Percent\",\n", + " ylabel=\"\",\n", " legend=True,\n", " labeldistance=None,\n", " autopct=custom_auto_percentage_format,\n", - " textprops={'fontsize': 6},\n", + " textprops={\"fontsize\": 6},\n", " pctdistance=1.15,\n", " cmap=main_color_map,\n", - " figsize=(9,9),\n", - " explode=explode_index_value(input_data_frame, index_value_to_emphasize='others')\n", + " figsize=(9, 9),\n", + " explode=explode_index_value(input_data_frame, 
index_value_to_emphasize=\"others\"),\n", " )\n", " plot.title(title, pad=15)\n", - " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", + " axis.legend(bbox_to_anchor=(1.08, 1), loc=\"upper left\")\n", " plot.show()" ] }, { "cell_type": "code", - "execution_count": 241, + "execution_count": null, "id": "da9e8edb", "metadata": {}, "outputs": [], "source": [ - "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", - "#This is especially needed for PDF export of tables with multiple columns." + "# CSS styling for smaller dataframe tables in notebook export" ] }, { @@ -237,17 +250,6 @@ "" ] }, - { - "cell_type": "code", - "execution_count": 243, - "id": "c2496caf", - "metadata": {}, - "outputs": [], - "source": [ - "# Main Colormap\n", - "main_color_map = 'nipy_spectral'" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -297,7 +299,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_overall.cypher\")\n", + "external_package_usage=query_cypher_to_data_frame(\"../queries/External_package_usage_overall.cypher\")\n", "\n", "# Select columns and only show the first 20 entries (head)\n", "external_package_usage.head(20)" @@ -458,7 +460,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_grouped_package_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_second_level_package_usage_overall.cypher\")\n", + "external_grouped_package_usage=query_cypher_to_data_frame(\"../queries/External_second_level_package_usage_overall.cypher\")\n", "external_grouped_package_usage.head(20)" ] }, @@ -482,13 +484,13 @@ "source": [ "external_grouped_package_by_type_usage_significant = group_to_others_below_threshold(\n", " data_frame=external_grouped_package_usage,\n", - " value_column='numberOfExternalCallerTypes',\n", - " name_column='externalSecondLevelPackageName',\n", - " 
threshold= 0.7\n", - ");\n", + " value_column=\"numberOfExternalCallerTypes\",\n", + " name_column=\"externalSecondLevelPackageName\",\n", + " threshold=0.7,\n", + ")\n", "plot_pie_chart(\n", " input_data_frame=external_grouped_package_by_type_usage_significant,\n", - " title='Top external package (grouped by first 2 layers) usage [%] by type (more than 0.7% overall)'\n", + " title=\"Top external package (grouped by first 2 layers) usage [%] by type (more than 0.7% overall)\",\n", ")" ] }, @@ -543,13 +545,13 @@ "source": [ "external_grouped_package_by_package_usage_significant = group_to_others_below_threshold(\n", " data_frame=external_grouped_package_usage,\n", - " value_column='numberOfExternalCallerPackages',\n", - " name_column='externalSecondLevelPackageName',\n", - " threshold= 0.7\n", - ");\n", + " value_column=\"numberOfExternalCallerPackages\",\n", + " name_column=\"externalSecondLevelPackageName\",\n", + " threshold=0.7,\n", + ")\n", "plot_pie_chart(\n", " input_data_frame=external_grouped_package_by_package_usage_significant,\n", - " title='Top external package (grouped by first 2 layers) usage [%] by package (more than 0.7% overall)'\n", + " title=\"Top external package (grouped by first 2 layers) usage [%] by package (more than 0.7% overall)\",\n", ")" ] }, @@ -570,17 +572,19 @@ "metadata": {}, "outputs": [], "source": [ - "external_grouped_package_by_package_usage_drill_down_others=filter_values_below_threshold(external_grouped_package_usage, 'numberOfExternalCallerPackages', 0.7)\n", + "external_grouped_package_by_package_usage_drill_down_others = filter_values_below_threshold(\n", + " external_grouped_package_usage, \"numberOfExternalCallerPackages\", 0.7\n", + ")\n", "\n", - "external_grouped_package_by_package_usage_significant_drill_down_others = group_to_others_below_threshold(\n", - " data_frame=external_grouped_package_by_type_usage_drill_down_others,\n", - " value_column='numberOfExternalCallerPackages',\n", - " 
name_column='externalSecondLevelPackageName',\n", - " threshold= 0.3\n", + "external_grouped_package_package_usage_spread_significant_drill_down_others = group_to_others_below_threshold(\n", + " data_frame=external_grouped_package_by_package_usage_drill_down_others,\n", + " value_column=\"numberOfExternalCallerPackages\",\n", + " name_column=\"externalSecondLevelPackageName\",\n", + " threshold=0.3,\n", ")\n", "plot_pie_chart(\n", - " input_data_frame=external_grouped_package_by_package_usage_significant_drill_down_others,\n", - " title='Top external package (grouped by first 2 layers) usage [%] by package (less than 0.7% overall \"others\" drill-down)'\n", + " input_data_frame=external_grouped_package_package_usage_spread_significant_drill_down_others,\n", + " title=\"Top external package (grouped by first 2 layers) usage [%] by package (less than 0.7% overall 'others' drill-down)\",\n", ")" ] }, @@ -620,7 +624,7 @@ "source": [ "# Query the graph database to provide the \n", "# most widely spread external dependencies for the tables/charts below.\n", - "external_package_usage_spread=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_spread.cypher\")\n", + "external_package_usage_spread=query_cypher_to_data_frame(\"../queries/External_package_usage_spread.cypher\")\n", "external_package_usage_spread.head(10)" ] }, @@ -727,13 +731,13 @@ "source": [ "external_package_type_usage_spread_significant = group_to_others_below_threshold(\n", " data_frame=external_package_usage_spread,\n", - " value_column='sumNumberOfTypes',\n", - " name_column='externalPackageName',\n", - " threshold= 0.5\n", - ");\n", + " value_column=\"sumNumberOfTypes\",\n", + " name_column=\"externalPackageName\",\n", + " threshold=0.5,\n", + ")\n", "plot_pie_chart(\n", " input_data_frame=external_package_type_usage_spread_significant,\n", - " title='Top external package usage spread [%] by type (more than 0.5% overall)'\n", + " title=\"Top external package usage spread 
[%] by type (more than 0.5% overall)\",\n", ")" ] }, @@ -787,13 +791,13 @@ "source": [ "external_package_usage_package_spread_significant = group_to_others_below_threshold(\n", " data_frame=external_package_usage_spread,\n", - " value_column='sumNumberOfPackages',\n", - " name_column='externalPackageName',\n", - " threshold= 0.5\n", - ");\n", + " value_column=\"sumNumberOfPackages\",\n", + " name_column=\"externalPackageName\",\n", + " threshold=0.5,\n", + ")\n", "plot_pie_chart(\n", " input_data_frame=external_package_usage_package_spread_significant,\n", - " title='Top external package usage spread [%] by package (more than 0.5% overall)'\n", + " title=\"Top external package usage spread [%] by package (more than 0.5% overall)\",\n", ")" ] }, @@ -862,7 +866,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_grouped_package_usage_spread=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_second_level_package_usage_spread.cypher\")\n", + "external_grouped_package_usage_spread=query_cypher_to_data_frame(\"../queries/External_second_level_package_usage_spread.cypher\")\n", "external_grouped_package_usage_spread.head(20)" ] }, @@ -1013,14 +1017,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Sort by number of external type calls\n", - "external_package_least_used=external_package_usage.sort_values(by='numberOfExternalTypeCalls', ascending=True)\n", + "external_package_least_used = external_package_usage.sort_values(\n", + " by=\"numberOfExternalTypeCalls\", ascending=True\n", + ")\n", "\n", - "# Reset index\n", "external_package_least_used = external_package_least_used.reset_index(drop=True)\n", "\n", - "# Select columns and only show the first 10 entries (head)\n", - "external_package_least_used[['externalPackageName','numberOfExternalTypeCalls']].head(20)\n" + "external_package_least_used[[\"externalPackageName\", \"numberOfExternalTypeCalls\"]].head(20)" ] }, { @@ -1056,7 +1059,7 @@ "metadata": {}, "outputs": [], "source": [ - 
"query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_sorted.cypher\").head(40)" + "query_cypher_to_data_frame(\"../queries/External_package_usage_per_artifact_sorted.cypher\").head(40)" ] }, { @@ -1096,7 +1099,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_packages_per_artifact = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_and_external_package.cypher\")\n", + "external_packages_per_artifact = query_cypher_to_data_frame(\"../queries/External_package_usage_per_artifact_and_external_package.cypher\")\n", "external_packages_per_artifact.head(30)" ] }, @@ -1117,7 +1120,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_second_level_packages_per_artifact = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_second_level_package_usage_per_artifact_and_external_package.cypher\")\n", + "external_second_level_packages_per_artifact = query_cypher_to_data_frame(\"../queries/External_second_level_package_usage_per_artifact_and_external_package.cypher\")\n", "external_second_level_packages_per_artifact.head(30)" ] }, @@ -1133,13 +1136,13 @@ }, { "cell_type": "code", - "execution_count": 210, + "execution_count": null, "id": "fd9667a9", "metadata": {}, "outputs": [], "source": [ - "# Calculate the percentage for each value based on the global sum of all values\n", - "def percentage_global(data_frame : pd.DataFrame):\n", + "def percentage_global(data_frame: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Calculate percentage for each value based on the global sum of all values.\"\"\"\n", " total = data_frame.sum().sum()\n", " return data_frame / total * 100" ] @@ -1153,17 +1156,18 @@ "source": [ "external_packages_per_artifact_pivot = external_packages_per_artifact.copy()\n", "\n", - "# Every row represents the number of external package\n", - 
"external_packages_per_artifact_pivot=external_packages_per_artifact_pivot.pivot(index='externalPackageName', columns='artifactName', values='numberOfPackages')\n", + "external_packages_per_artifact_pivot = external_packages_per_artifact_pivot.pivot(\n", + " index=\"externalPackageName\", columns=\"artifactName\", values=\"numberOfPackages\"\n", + ")\n", "\n", - "# Sort by column sum and then take only the first 10 columns\n", "sum_of_external_packages_per_artifact = external_packages_per_artifact_pivot.sum()\n", - "external_packages_per_artifact_pivot = external_packages_per_artifact_pivot[sum_of_external_packages_per_artifact.sort_values(ascending=False).index[:15]]\n", + "external_packages_per_artifact_pivot = external_packages_per_artifact_pivot[\n", + " sum_of_external_packages_per_artifact.sort_values(ascending=False).index[:15]\n", + "]\n", "\n", - "# Fill missing values with zeroes\n", "external_packages_per_artifact_pivot.fillna(0, inplace=True)\n", "\n", - "external_packages_per_artifact_pivot.astype('int')" + "external_packages_per_artifact_pivot.astype(\"int\")" ] }, { @@ -1185,17 +1189,18 @@ "source": [ "external_second_level_packages_per_artifact_pivot = external_second_level_packages_per_artifact.copy()\n", "\n", - "# Every row represents the number of external package\n", - "external_second_level_packages_per_artifact_pivot=external_second_level_packages_per_artifact_pivot.pivot(index='externalPackageNameFirst2Levels', columns='artifactName', values='numberOfPackages')\n", + "external_second_level_packages_per_artifact_pivot = external_second_level_packages_per_artifact_pivot.pivot(\n", + " index=\"externalPackageNameFirst2Levels\", columns=\"artifactName\", values=\"numberOfPackages\"\n", + ")\n", "\n", - "# Sort by column sum and then take only the first 10 columns\n", "sum_of_external_second_level_packages_per_artifact = external_second_level_packages_per_artifact_pivot.sum()\n", - "external_second_level_packages_per_artifact_pivot = 
external_second_level_packages_per_artifact_pivot[sum_of_external_second_level_packages_per_artifact.sort_values(ascending=False).index[:15]]\n", + "external_second_level_packages_per_artifact_pivot = external_second_level_packages_per_artifact_pivot[\n", + " sum_of_external_second_level_packages_per_artifact.sort_values(ascending=False).index[:15]\n", + "]\n", "\n", - "# Fill missing values with zeroes\n", "external_second_level_packages_per_artifact_pivot.fillna(0, inplace=True)\n", "\n", - "external_second_level_packages_per_artifact_pivot.astype('int')" + "external_second_level_packages_per_artifact_pivot.astype(\"int\")" ] }, { @@ -1220,17 +1225,18 @@ "if external_packages_per_artifact_pivot.empty:\n", " print(\"No data to plot\")\n", "else:\n", - " plot.figure();\n", + " plot.figure()\n", " axes = external_packages_per_artifact_pivot.transpose().plot(\n", - " kind='bar', \n", + " kind=\"bar\",\n", " grid=True,\n", - " title='External package usage per artifact', \n", - " xlabel='artifact',\n", - " ylabel='number of packages',\n", + " title=\"External package usage per artifact\",\n", + " xlabel=\"artifact\",\n", + " ylabel=\"number of packages\",\n", " stacked=True,\n", " legend=True,\n", - " cmap=main_color_map\n", - " ).legend(bbox_to_anchor=(1.0, 1.0))\n", + " cmap=main_color_map,\n", + " )\n", + " axes.legend(bbox_to_anchor=(1.0, 1.0))\n", " plot.show()" ] }, @@ -1256,17 +1262,18 @@ "if external_second_level_packages_per_artifact_pivot.empty:\n", " print(\"No data to plot\")\n", "else:\n", - " plot.figure();\n", + " plot.figure()\n", " axes = external_second_level_packages_per_artifact_pivot.transpose().plot(\n", - " kind='bar', \n", + " kind=\"bar\",\n", " grid=True,\n", - " title='External package (first 2 levels) usage per artifact', \n", - " xlabel='artifact',\n", - " ylabel='number of packages',\n", + " title=\"External package (first 2 levels) usage per artifact\",\n", + " xlabel=\"artifact\",\n", + " ylabel=\"number of packages\",\n", " 
stacked=True,\n", " legend=True,\n", - " cmap=main_color_map\n", - " ).legend(bbox_to_anchor=(1.0, 1.0))\n", + " cmap=main_color_map,\n", + " )\n", + " axes.legend(bbox_to_anchor=(1.0, 1.0))\n", " plot.show()" ] }, @@ -1305,7 +1312,7 @@ "metadata": {}, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_sorted_top.cypher\").head(40)" + "query_cypher_to_data_frame(\"../queries/External_package_usage_per_artifact_sorted_top.cypher\").head(40)" ] }, { @@ -1339,7 +1346,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_per_package = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_and_package.cypher\")\n", + "external_package_usage_per_package = query_cypher_to_data_frame(\"../queries/External_package_usage_per_artifact_and_package.cypher\")\n", "external_package_usage_per_package.head(40)" ] }, @@ -1376,7 +1383,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_per_type = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_type.cypher\")\n", + "external_package_usage_per_type = query_cypher_to_data_frame(\"../queries/External_package_usage_per_type.cypher\")\n", "external_package_usage_per_type.head(20)" ] }, @@ -1411,8 +1418,21 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_per_artifact_distribution = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_distribution.cypher\")\n", - "external_package_usage_per_artifact_distribution_truncated=external_package_usage_per_artifact_distribution[['artifactName', 'artifactPackages', 'artifactTypes', 'numberOfExternalPackages', 'numberOfPackages', 'numberOfTypes', 'typesCallingExternalRate', 'packagesCallingExternalRate']].head(40)\n", + "external_package_usage_per_artifact_distribution = query_cypher_to_data_frame(\n", + " 
\"../queries/External_package_usage_per_artifact_distribution.cypher\"\n", + ")\n", + "external_package_usage_per_artifact_distribution_truncated = external_package_usage_per_artifact_distribution[\n", + " [\n", + " \"artifactName\",\n", + " \"artifactPackages\",\n", + " \"artifactTypes\",\n", + " \"numberOfExternalPackages\",\n", + " \"numberOfPackages\",\n", + " \"numberOfTypes\",\n", + " \"typesCallingExternalRate\",\n", + " \"packagesCallingExternalRate\",\n", + " ]\n", + "].head(40)\n", "external_package_usage_per_artifact_distribution_truncated" ] }, @@ -1439,16 +1459,18 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_per_package_distribution = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_internal_package_count.cypher\")\n", + "external_package_usage_per_package_distribution = query_cypher_to_data_frame(\n", + " \"../queries/External_package_usage_per_internal_package_count.cypher\"\n", + ")\n", + "\n", + "external_package_usage_per_package_distribution = external_package_usage_per_package_distribution.query(\n", + " \"`numberOfPackages` >= 2\"\n", + ")\n", "\n", - "# Only show external dependencies that are at least used in 2 internal packages. 
\n", - "# Filter out all rows with \"numberOfPackages\" <= 2.\n", - "external_package_usage_per_package_distribution=external_package_usage_per_package_distribution.query(\"`numberOfPackages` >= 2\")\n", - "# Organize artifacts in columns with the number of packages that call external packages as values using pivot\n", - "# Every row represents the number of external packages\n", - "external_package_usage_per_package_distribution=external_package_usage_per_package_distribution.pivot(index='numberOfPackages', columns='artifactName', values='maxPackagesCallingExternalRate')\n", + "external_package_usage_per_package_distribution = external_package_usage_per_package_distribution.pivot(\n", + " index=\"numberOfPackages\", columns=\"artifactName\", values=\"maxPackagesCallingExternalRate\"\n", + ")\n", "\n", - "# Fill missing values with zero\n", "external_package_usage_per_package_distribution.fillna(0, inplace=True)\n", "\n", "external_package_usage_per_package_distribution" @@ -1496,7 +1518,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_aggregated = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_package_usage_per_artifact_package_aggregated.cypher\")\n", + "external_package_usage_aggregated = query_cypher_to_data_frame(\"../queries/External_package_usage_per_artifact_package_aggregated.cypher\")\n", "\n", "external_package_usage_aggregated_packages = external_package_usage_aggregated[['artifactName', 'artifactPackages', 'numberOfExternalPackages', 'minNumberOfPackages', 'medNumberOfPackages', 'avgNumberOfPackages', 'maxNumberOfPackages', 'stdNumberOfPackages']]\n", "external_package_usage_aggregated_packages.head(30)" @@ -1573,39 +1595,48 @@ }, { "cell_type": "code", - "execution_count": 224, + "execution_count": null, "id": "ad1db8af", "metadata": {}, "outputs": [], "source": [ - "def annotate_plot(data_frame: pd.DataFrame, index: int):\n", - " \"\"\"\n", - " Annotates the data points identified by the \"index\" in 
the plot of the \"data_frame\" \n", - " \"\"\"\n", + "def annotate_plot(data_frame: pd.DataFrame, index: int) -> None:\n", + " \"\"\"Annotate a data point in the plot with artifact name and arrow.\"\"\"\n", " x_position = data_frame.numberOfExternalPackages[index].item()\n", " y_position = data_frame.maxNumberOfPackagesPercentage[index].item()\n", - " artifact_name = data_frame.artifactName[index].item()\n", - "\n", - " label_box=dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", - " plot.annotate(artifact_name\n", - " ,xy=(x_position, y_position)\n", - " ,xycoords='data'\n", - " ,xytext=(-30, -15)\n", - " ,textcoords='offset points'\n", - " ,size=6\n", - " ,bbox=label_box\n", - " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", + " artifact_name = data_frame.artifactName[index]\n", + "\n", + " label_box = dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", + " plot.annotate(\n", + " artifact_name,\n", + " xy=(x_position, y_position),\n", + " xycoords=\"data\",\n", + " xytext=(-30, -15),\n", + " textcoords=\"offset points\",\n", + " size=6,\n", + " bbox=label_box,\n", + " arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\"),\n", " )\n", "\n", - "def index_of_sorted(data_frame: pd.DataFrame, highest: list[str] = []):\n", + "\n", + "def index_of_sorted(\n", + " data_frame: pd.DataFrame, highest: list[str] | None = None\n", + ") -> int:\n", " \"\"\"\n", - " Sorts the \"data_frame\" by columns 'numberOfExternalPackages','maxNumberOfPackagesPercentage','artifactPackages', 'artifactName'\n", - " and returns the index of the first row.\n", - " Columns that are contained in the list of strings parameter \"highest\" will be sorted descending (highest first).\n", + " Sort the DataFrame and return the index of the first row.\n", + "\n", + " Columns in the highest list will be sorted descending (highest first).\n", " \"\"\"\n", - " by = ['numberOfExternalPackages','maxNumberOfPackagesPercentage','artifactPackages', 
'artifactName']\n", - " ascending = [('numberOfExternalPackages' not in highest), ('maxNumberOfPackagesPercentage' not in highest), False, True]\n", - " return data_frame.sort_values(by=by, ascending=ascending).head(1).index" + " if highest is None:\n", + " highest = []\n", + " by = [\"numberOfExternalPackages\", \"maxNumberOfPackagesPercentage\", \"artifactPackages\", \"artifactName\"]\n", + " ascending = [\n", + " \"numberOfExternalPackages\" not in highest,\n", + " \"maxNumberOfPackagesPercentage\" not in highest,\n", + " False,\n", + " True,\n", + " ]\n", + " return int(data_frame.sort_values(by=by, ascending=ascending).head(1).index[0])" ] }, { @@ -1618,29 +1649,30 @@ "if external_package_usage_aggregated.empty:\n", " print(\"No data to plot\")\n", "else:\n", - " plot.figure();\n", + " plot.figure()\n", " axes = external_package_usage_aggregated.plot(\n", - " kind='scatter',\n", - " title='External package usage - max internal packages %', \n", - " x='numberOfExternalPackages',\n", - " y='maxNumberOfPackagesPercentage',\n", - " s='artifactPackages',\n", - " c='stdNumberOfPackagesPercentage',\n", - " xlabel='external package count',\n", - " ylabel='max percentage of internal packages',\n", + " kind=\"scatter\",\n", + " title=\"External package usage - max internal packages %\",\n", + " x=\"numberOfExternalPackages\",\n", + " y=\"maxNumberOfPackagesPercentage\",\n", + " s=\"artifactPackages\",\n", + " c=\"stdNumberOfPackagesPercentage\",\n", + " xlabel=\"external package count\",\n", + " ylabel=\"max percentage of internal packages\",\n", " cmap=main_color_map,\n", " )\n", "\n", - " # Annotate the largest artifact with the highest number of external packages and max number of packages in percentage\n", - " annotation_index = index_of_sorted(highest=['numberOfExternalPackages','maxNumberOfPackagesPercentage'], data_frame=external_package_usage_aggregated)\n", + " annotation_index = index_of_sorted(\n", + " highest=[\"numberOfExternalPackages\", 
\"maxNumberOfPackagesPercentage\"],\n", + " data_frame=external_package_usage_aggregated,\n", + " )\n", " annotate_plot(external_package_usage_aggregated, annotation_index)\n", "\n", - "\n", - " # Annotate the largest artifact with the lowest number of external packages and the highest max number of packages in percentage\n", - " annotation_index = index_of_sorted(highest=['maxNumberOfPackagesPercentage'], data_frame=external_package_usage_aggregated)\n", + " annotation_index = index_of_sorted(\n", + " highest=[\"maxNumberOfPackagesPercentage\"], data_frame=external_package_usage_aggregated\n", + " )\n", " annotate_plot(external_package_usage_aggregated, annotation_index)\n", "\n", - " # Annotate the largest artifact with the lowest number of external packages and max number of packages in percentage\n", " annotation_index = index_of_sorted(highest=[], data_frame=external_package_usage_aggregated)\n", " annotate_plot(external_package_usage_aggregated, annotation_index)\n", "\n", @@ -1669,16 +1701,16 @@ "if external_package_usage_aggregated.empty:\n", " print(\"No data to plot\")\n", "else:\n", - " plot.figure();\n", + " plot.figure()\n", " axes = external_package_usage_aggregated.plot(\n", - " kind='scatter',\n", - " title='External package usage - median internal packages %', \n", - " x='numberOfExternalPackages',\n", - " y='medNumberOfPackagesPercentage',\n", - " s='artifactPackages',\n", - " c='stdNumberOfPackagesPercentage',\n", - " xlabel='external package count',\n", - " ylabel='median percentage of internal packages',\n", + " kind=\"scatter\",\n", + " title=\"External package usage - median internal packages %\",\n", + " x=\"numberOfExternalPackages\",\n", + " y=\"medNumberOfPackagesPercentage\",\n", + " s=\"artifactPackages\",\n", + " c=\"stdNumberOfPackagesPercentage\",\n", + " xlabel=\"external package count\",\n", + " ylabel=\"median percentage of internal packages\",\n", " cmap=main_color_map,\n", " )\n", " plot.show()" @@ -1705,12 +1737,12 @@ 
"metadata": {}, "outputs": [], "source": [ - "query_cypher_to_data_frame(\"../cypher/External_Dependencies/Maven_POMs_and_their_declared_dependencies.cypher\")" + "query_cypher_to_data_frame(\"../queries/Maven_POMs_and_their_declared_dependencies.cypher\")" ] }, { "cell_type": "code", - "execution_count": 228, + "execution_count": null, "id": "46baa3c1", "metadata": {}, "outputs": [], @@ -1727,7 +1759,7 @@ ], "celltoolbar": "Tags", "kernelspec": { - "display_name": "codegraph", + "display_name": "code-graph-analysis-pipeline (3.12.8)", "language": "python", "name": "python3" }, diff --git a/domains/external-dependencies/explore/ExternalDependenciesTypescript.ipynb b/domains/external-dependencies/explore/ExternalDependenciesTypescript.ipynb index fe055b430..9283f90d8 100644 --- a/domains/external-dependencies/explore/ExternalDependenciesTypescript.ipynb +++ b/domains/external-dependencies/explore/ExternalDependenciesTypescript.ipynb @@ -24,7 +24,8 @@ "import os\n", "import pandas as pd\n", "import matplotlib.pyplot as plot\n", - "from neo4j import GraphDatabase" + "from neo4j import GraphDatabase\n", + "from typing import Any" ] }, { @@ -34,11 +35,15 @@ "metadata": {}, "outputs": [], "source": [ - "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell \n", - "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", + "# Please set the environment variable \"NEO4J_INITIAL_PASSWORD\" in your shell\n", + "# before starting jupyter notebook to provide the password for the user \"neo4j\".\n", "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", "\n", - "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "neo4j_password = os.environ.get(\"NEO4J_INITIAL_PASSWORD\")\n", + "if not neo4j_password:\n", + " raise ValueError(\"NEO4J_INITIAL_PASSWORD environment variable must be set\")\n", + "\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", neo4j_password))\n", "driver.verify_connectivity()" ] }, @@ -49,14 +54,20 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", - " with open(filename) as file:\n", - " return ' '.join(file.readlines())\n", + "from typing import cast, LiteralString\n", + "\n", + "\n", + "def get_cypher_query_from_file(filename: str) -> str:\n", + " with open(filename, encoding=\"utf-8\") as file:\n", + " return \" \".join(file.readlines())\n", + "\n", "\n", + "def query_cypher_to_data_frame(filename: str) -> pd.DataFrame:\n", + " records, _, keys = driver.execute_query(cast(LiteralString, get_cypher_query_from_file(filename)))\n", + " return pd.DataFrame([r.values() for r in records], columns=keys)\n", "\n", - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", - " return pd.DataFrame([r.values() for r in records], columns=keys)" + "\n", + "main_color_map: str = \"nipy_spectral\"" ] }, { @@ -66,40 +77,35 @@ "metadata": {}, "outputs": [], "source": [ - "def group_to_others_below_threshold(data_frame : pd.DataFrame, value_column : str, name_column: str, threshold: float) -> pd.DataFrame: \n", + "def group_to_others_below_threshold(\n", + " data_frame: pd.DataFrame, value_column: str, 
name_column: str, threshold: float\n", + ") -> pd.DataFrame:\n", " \"\"\"\n", - " Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", + " Add percentage column and group values below threshold to \"others\".\n", "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - name_column (str): Name of the column that contains the group name that will be replaced by \"others\" for small values\n", - " - threshold (float): Threshold in % that is used to group values below it into the \"others\" group\n", + " Args:\n", + " data_frame: Input pandas DataFrame\n", + " value_column: Name of the column that contains the numeric value\n", + " name_column: Name of the column that contains the group name\n", + " threshold: Threshold in % to group values below it into \"others\"\n", "\n", " Returns:\n", - " int:Returning value\n", - "\n", + " DataFrame with grouped values sorted descending by percentage\n", " \"\"\"\n", - " result_data_frame = data_frame[[name_column, value_column]].copy();\n", + " result_data_frame = data_frame[[name_column, value_column]].copy()\n", "\n", - " percent_column_name = value_column + 'Percent';\n", + " percent_column_name = value_column + \"Percent\"\n", "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + " result_data_frame[percent_column_name] = (\n", + " result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0\n", + " )\n", "\n", - " # Convert name column to string values if it wasn't of that type before\n", " result_data_frame[name_column] = result_data_frame[name_column].astype(str)\n", "\n", - " # Change the group name to 
\"others\" if it is called less than the specified threshold\n", - " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = 'others';\n", + " result_data_frame.loc[result_data_frame[percent_column_name] < threshold, name_column] = \"others\"\n", "\n", - " # Group by name column (foremost the new \"others\" entries) and sum their percentage\n", - " #result_data_frame = result_data_frame.groupby(name_column)[percent_column_name].sum();\n", - " result_data_frame = result_data_frame.groupby(name_column).sum();\n", - " # Sort by values descending\n", - " #return result_data_frame.sort_values(ascending=False).to_frame();\n", - " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + " result_data_frame = result_data_frame.groupby(name_column).sum()\n", + " return result_data_frame.sort_values(by=percent_column_name, ascending=False)" ] }, { @@ -109,32 +115,32 @@ "metadata": {}, "outputs": [], "source": [ - "def filter_values_below_threshold(data_frame : pd.DataFrame, value_column : str, upper_limit: float = 100.0) -> pd.DataFrame: \n", + "def filter_values_below_threshold(\n", + " data_frame: pd.DataFrame, value_column: str, upper_limit: float = 100.0\n", + ") -> pd.DataFrame:\n", " \"\"\"\n", - " Adds a new percentage column for the value column and \n", - " groups all values below the given threshold to \"others\" in the name column.\n", + " Add percentage column and filter entries.\n", "\n", - " Parameters:\n", - " - data_frame (pd.DataFrame): Input pandas DataFrame\n", - " - value_column (str): Name of the column that contains the numeric value\n", - " - upper_limit (float): Defaults to 100%. Filters out all entries exceeding this limit. Intended to drill down \"others\" in a second chart/table.\n", + " Args:\n", + " data_frame: Input pandas DataFrame\n", + " value_column: Name of the column that contains the numeric value\n", + " upper_limit: Defaults to 100%. 
Filters out all entries exceeding this limit.\n", "\n", " Returns:\n", - " int:Returning value\n", - "\n", + " Filtered DataFrame sorted descending by percentage\n", " \"\"\"\n", - " result_data_frame = data_frame.copy();\n", + " result_data_frame = data_frame.copy()\n", "\n", - " percent_column_name = value_column + 'Percent';\n", + " percent_column_name = value_column + \"Percent\"\n", "\n", - " # Add column with the name given in \"percent_column_name\" with the percentage of the value column.\n", - " result_data_frame[percent_column_name] = result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0;\n", + " result_data_frame[percent_column_name] = (\n", + " result_data_frame[value_column] / result_data_frame[value_column].sum() * 100.0\n", + " )\n", "\n", - " # Limit entries to meet the an optional upper limit (in percentage)\n", - " result_data_frame = result_data_frame.query(\"`\" + percent_column_name + \"` <= \" + str(upper_limit))\n", + " result_data_frame = result_data_frame.query(f\"`{percent_column_name}` <= {upper_limit}\")\n", "\n", " result_data_frame = result_data_frame.reset_index(drop=True)\n", - " return result_data_frame.sort_values(by=percent_column_name, ascending=False);" + " return result_data_frame.sort_values(by=percent_column_name, ascending=False)" ] }, { @@ -148,25 +154,28 @@ }, "outputs": [], "source": [ - "def explode_index_value(input_data_frame: pd.DataFrame, index_value_to_emphasize: str = 'others', base_value: float = 0.02, emphasize_value: float = 0.2):\n", + "import numpy as np\n", + "\n", + "\n", + "def explode_index_value(\n", + " input_data_frame: pd.DataFrame,\n", + " index_value_to_emphasize: str = \"others\",\n", + " base_value: float = 0.02,\n", + " emphasize_value: float = 0.2,\n", + ") -> np.ndarray[Any, np.dtype[np.floating[Any]]]:\n", " \"\"\"\n", - " \"Explode\" offsets slices in a pie chart plot by a given value.\n", - " The specified index value will be emphasized with a larger value to make it 
stand out in the pie chart plot.\n", + " Generate explode offsets for pie chart slices.\n", "\n", - " Parameters:\n", - " - input_data_frame (pd.DataFrame): Input pandas DataFrame with the data that will be plot. (Required)\n", - " - index_value_to_emphasize (str): Value of the index that will be emphasized. (Default= 'others')\n", - " - base_value (float): Base value for all pies in the chart. (Default=0.02)\n", - " - emphasize_value (float): Value for the emphasized pie in the chart. (Default=0.2)\n", + " Args:\n", + " input_data_frame: Input pandas DataFrame with data to plot\n", + " index_value_to_emphasize: Value of the index to emphasize (Default=\"others\")\n", + " base_value: Base offset value for all slices (Default=0.02)\n", + " emphasize_value: Offset value for the emphasized slice (Default=0.2)\n", "\n", " Returns:\n", - " Array with the same size as the number of rows/pies to plot containing the \"explode\" value for each of them\n", - "\n", + " Array with explode offset for each slice\n", " \"\"\"\n", - " # Each entry in the list corresponds to an x value\n", - " # The comparison with the index_value_to_emphasize produces an array of booleans where nth entry with the emphasized value is \"true\"\n", - " # Multiplying it leads to 1 for True and 0 for False therefore \"exploding\" the emphasized entry whilst \n", - " return (input_data_frame.index == index_value_to_emphasize) * emphasize_value + base_value " + " return (input_data_frame.index == index_value_to_emphasize) * emphasize_value + base_value" ] }, { @@ -176,34 +185,35 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str):\n", + "def plot_pie_chart(input_data_frame: pd.DataFrame, title: str) -> None:\n", + " \"\"\"Render and display a pie chart from a DataFrame.\"\"\"\n", " if input_data_frame.empty:\n", - " print(\"No data to plot for title '\" + title + \"'.\")\n", + " print(f\"No data to plot for title '{title}'.\")\n", " return\n", - 
" \n", - " name_of_the_first_column_containing_the_values=input_data_frame.columns[0]\n", + "\n", + " name_of_the_first_column_containing_the_values = input_data_frame.columns[0]\n", " total_sum = input_data_frame[name_of_the_first_column_containing_the_values].sum()\n", - " \n", - " def custom_auto_percentage_format(percentage):\n", - " return '{:1.2f}% ({:.0f})'.format(percentage, total_sum * percentage / 100.0)\n", - " \n", - " plot.figure();\n", + "\n", + " def custom_auto_percentage_format(percentage: float) -> str:\n", + " return f\"{percentage:1.2f}% ({total_sum * percentage / 100.0:.0f})\"\n", + "\n", + " plot.figure()\n", "\n", " axis = input_data_frame.plot(\n", - " kind='pie',\n", - " y=name_of_the_first_column_containing_the_values + 'Percent',\n", - " ylabel='',\n", + " kind=\"pie\",\n", + " y=name_of_the_first_column_containing_the_values + \"Percent\",\n", + " ylabel=\"\",\n", " legend=True,\n", " labeldistance=None,\n", " autopct=custom_auto_percentage_format,\n", - " textprops={'fontsize': 6},\n", + " textprops={\"fontsize\": 6},\n", " pctdistance=1.15,\n", " cmap=main_color_map,\n", - " figsize=(9,9),\n", - " explode=explode_index_value(input_data_frame, index_value_to_emphasize='others')\n", + " figsize=(9, 9),\n", + " explode=explode_index_value(input_data_frame, index_value_to_emphasize=\"others\"),\n", " )\n", " plot.title(title, pad=15)\n", - " axis.legend(bbox_to_anchor=(1.08, 1), loc='upper left')\n", + " axis.legend(bbox_to_anchor=(1.08, 1), loc=\"upper left\")\n", " plot.show()" ] }, @@ -214,8 +224,7 @@ "metadata": {}, "outputs": [], "source": [ - "#The following cell uses the build-in %html \"magic\" to override the CSS style for tables to a much smaller size.\n", - "#This is especially needed for PDF export of tables with multiple columns." 
+ "# CSS styling for smaller dataframe tables in notebook export" ] }, { @@ -241,17 +250,6 @@ "" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2496caf", - "metadata": {}, - "outputs": [], - "source": [ - "# Main Colormap\n", - "main_color_map = 'nipy_spectral'" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -264,7 +262,7 @@ "\n", "An external Typescript module is marked with the label `ExternalModule` and the declarations it provides with `ExternalDeclaration`. In practice, the distinction between internal and external isn't always that clear. When there is a problem following the project configuration like discussed in [Missing Interfaces and other elements in the Graph](https://github.com/jqassistant-plugin/jqassistant-typescript-plugin/issues/35), some internal dependencies might be imported as external ones. \n", "\n", - "To have a second indicator, the property `isNodeModule` is written with [Add_module_properties.cypher](./../cypher/Typescript_Enrichment/Add_module_properties.cypher) in [prepareAnalysis.sh](./../scripts/prepareAnalysis.sh). For most package managers this should then be sufficient. As of now (June 2024), it might not work with [Yarn Plug'n'Play](https://yarnpkg.com/features/pnp)." + "To have a second indicator, the property `isNodeModule` is written with [Add_module_properties.cypher](../../../cypher/Typescript_Enrichment/Add_module_properties.cypher) in [prepareAnalysis.sh](../../../scripts/prepareAnalysis.sh). For most package managers this should then be sufficient. As of now (June 2024), it might not work with [Yarn Plug'n'Play](https://yarnpkg.com/features/pnp)." 
] }, { @@ -299,10 +297,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_module_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_module_usage_overall_for_Typescript.cypher\")\n", - "\n", - "# Select columns and only show the first 20 entries (head)\n", - "external_module_usage.head(20)" + "external_module_usage=query_cypher_to_data_frame(\"../queries/External_module_usage_overall_for_Typescript.cypher\")" ] }, { @@ -326,13 +321,13 @@ "source": [ "external_module_by_internal_element_usage_significant = group_to_others_below_threshold(\n", " data_frame=external_module_usage,\n", - " value_column='numberOfExternalCallerElements',\n", - " name_column='externalModuleName',\n", - " threshold= 0.7\n", - ");\n", + " value_column=\"numberOfExternalCallerElements\",\n", + " name_column=\"externalModuleName\",\n", + " threshold=0.7,\n", + ")\n", "plot_pie_chart(\n", " input_data_frame=external_module_by_internal_element_usage_significant,\n", - " title='Top external module usage [%] by internal elements (more than 0.7% overall)'\n", + " title=\"Top external module usage [%] by internal elements (more than 0.7% overall)\",\n", ")" ] }, @@ -353,17 +348,19 @@ "metadata": {}, "outputs": [], "source": [ - "external_module_by_internal_element_usage_drill_down_others=filter_values_below_threshold(external_module_usage, 'numberOfExternalCallerElements', 0.7)\n", + "external_module_by_internal_element_usage_drill_down_others = filter_values_below_threshold(\n", + " external_module_usage, \"numberOfExternalCallerElements\", 0.7\n", + ")\n", "\n", "external_module_by_internal_element_usage_significant_drill_down_others = group_to_others_below_threshold(\n", " data_frame=external_module_by_internal_element_usage_drill_down_others,\n", - " value_column='numberOfExternalCallerElements',\n", - " name_column='externalModuleName',\n", - " threshold= 0.3\n", + " value_column=\"numberOfExternalCallerElements\",\n", + " name_column=\"externalModuleName\",\n", 
+ " threshold=0.3,\n", ")\n", "plot_pie_chart(\n", " input_data_frame=external_module_by_internal_element_usage_significant_drill_down_others,\n", - " title='Top external module usage [%] by internal elements (less than 0.7% overall \"others\" drill-down)'\n", + " title=\"Top external module usage [%] by internal elements (less than 0.7% overall 'others' drill-down)\",\n", ")" ] }, @@ -387,13 +384,13 @@ "source": [ "external_module_used_by_internal_modules_significant = group_to_others_below_threshold(\n", " data_frame=external_module_usage,\n", - " value_column='numberOfExternalCallerModules',\n", - " name_column='externalModuleName',\n", - " threshold= 0.7\n", - ");\n", + " value_column=\"numberOfExternalCallerModules\",\n", + " name_column=\"externalModuleName\",\n", + " threshold=0.7,\n", + ")\n", "plot_pie_chart(\n", " input_data_frame=external_module_used_by_internal_modules_significant,\n", - " title='Top external module usage [%] by internal modules (more than 0.7% overall)'\n", + " title=\"Top external module usage [%] by internal modules (more than 0.7% overall)\",\n", ")" ] }, @@ -414,17 +411,19 @@ "metadata": {}, "outputs": [], "source": [ - "external_module_used_by_internal_modules_drill_down_others=filter_values_below_threshold(external_module_usage, 'numberOfExternalCallerModules', 0.7)\n", + "external_module_used_by_internal_modules_drill_down_others = filter_values_below_threshold(\n", + " external_module_usage, \"numberOfExternalCallerModules\", 0.7\n", + ")\n", "\n", "external_module_used_by_internal_modules_significant_drill_down_others = group_to_others_below_threshold(\n", " data_frame=external_module_used_by_internal_modules_drill_down_others,\n", - " value_column='numberOfExternalCallerModules',\n", - " name_column='externalModuleName',\n", - " threshold= 0.3\n", + " value_column=\"numberOfExternalCallerModules\",\n", + " name_column=\"externalModuleName\",\n", + " threshold=0.3,\n", ")\n", "plot_pie_chart(\n", " 
input_data_frame=external_module_used_by_internal_modules_significant_drill_down_others,\n", - " title='Top external module usage [%] by internal modules (less than 0.7% overall \"others\" drill-down)'\n", + " title=\"Top external module usage [%] by internal modules (less than 0.7% overall 'others' drill-down)\",\n", ")" ] }, @@ -460,7 +459,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_namespace_usage=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_namespace_usage_overall_for_Typescript.cypher\")\n", + "external_namespace_usage=query_cypher_to_data_frame(\"../queries/External_namespace_usage_overall_for_Typescript.cypher\")\n", "external_namespace_usage.head(20)" ] }, @@ -625,8 +624,7 @@ "source": [ "# Query the graph database to provide the \n", "# most widely spread external dependencies for the tables/charts below.\n", - "external_module_usage_spread=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_module_usage_spread_for_Typescript.cypher\")\n", - "external_module_usage_spread.head(10)" + "external_module_usage_spread=query_cypher_to_data_frame(\"../queries/External_module_usage_spread_for_Typescript.cypher\")" ] }, { @@ -898,8 +896,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_namespace_usage_spread=query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_namespace_usage_spread_for_Typescript.cypher\")\n", - "external_namespace_usage_spread.head(20)" + "external_namespace_usage_spread=query_cypher_to_data_frame(\"../queries/External_namespace_usage_spread_for_Typescript.cypher\")" ] }, { @@ -1161,8 +1158,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_modules_used_by_internal_modules = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_module_usage_per_internal_module_sorted_for_Typescript.cypher\")\n", - "external_modules_used_by_internal_modules.head(40)" + "external_modules_used_by_internal_modules = 
query_cypher_to_data_frame(\"../queries/External_module_usage_per_internal_module_sorted_for_Typescript.cypher\")" ] }, { @@ -1180,8 +1176,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_namespaces_used_by_internal_modules = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_namespace_usage_per_internal_module_sorted_for_Typescript.cypher\")\n", - "external_namespaces_used_by_internal_modules.head(40)" + "external_namespaces_used_by_internal_modules = query_cypher_to_data_frame(\"../queries/External_namespace_usage_per_internal_module_sorted_for_Typescript.cypher\")" ] }, { @@ -1389,9 +1384,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_per_artifact_distribution = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_module_usage_per_internal_module_distribution_for_Typescript.cypher\")\n", - "external_package_usage_per_artifact_distribution_truncated=external_package_usage_per_artifact_distribution[['internalModuleName', 'numberOfAllInternalElements', 'externalModuleCount', 'internalElementCount', 'internalElementsCallingExternalRate']].head(40)\n", - "external_package_usage_per_artifact_distribution_truncated" + "external_package_usage_per_artifact_distribution = query_cypher_to_data_frame(\"../queries/External_module_usage_per_internal_module_distribution_for_Typescript.cypher\")" ] }, { @@ -1434,10 +1427,7 @@ "metadata": {}, "outputs": [], "source": [ - "external_package_usage_aggregated = query_cypher_to_data_frame(\"../cypher/External_Dependencies/External_module_usage_per_internal_module_aggregated_for_Typescript.cypher\")\n", - "\n", - "external_package_usage_aggregated_packages = external_package_usage_aggregated[['internalModuleName', 'internalModuleElementsCount', 'numberOfExternalModules', 'minNumberOfInternalModules', 'medNumberOfInternalModules', 'avgNumberOfInternalModules', 'maxNumberOfInternalModules', 'stdNumberOfInternalModules']]\n", - 
"external_package_usage_aggregated_packages.head(30)" + "external_package_usage_aggregated = query_cypher_to_data_frame(\"../queries/External_module_usage_per_internal_module_aggregated_for_Typescript.cypher\")" ] }, { @@ -1497,24 +1487,43 @@ "metadata": {}, "outputs": [], "source": [ - "def annotate_plot(data_frame: pd.DataFrame, index: int):\n", + "def annotate_plot(data_frame: pd.DataFrame, index: int) -> None:\n", + " \"\"\"Annotate a data point in the plot with artifact name and arrow.\"\"\"\n", + " x_position = data_frame.numberOfExternalPackages[index].item()\n", + " y_position = data_frame.maxNumberOfPackagesPercentage[index].item()\n", + " artifact_name = data_frame.artifactName[index]\n", + "\n", + " label_box = dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", + " plot.annotate(\n", + " artifact_name,\n", + " xy=(x_position, y_position),\n", + " xycoords=\"data\",\n", + " xytext=(-30, -15),\n", + " textcoords=\"offset points\",\n", + " size=6,\n", + " bbox=label_box,\n", + " arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\"),\n", + " )\n", + "\n", + "\n", + "def index_of_sorted(\n", + " data_frame: pd.DataFrame, highest: list[str] | None = None\n", + ") -> int:\n", " \"\"\"\n", - " Annotates the data points identified by the \"index\" in the plot of the \"data_frame\" \n", + " Sort the DataFrame and return the index of the first row.\n", + "\n", + " Columns in the highest list will be sorted descending (highest first).\n", " \"\"\"\n", - " x_position = data_frame.numberOfExternalModules[index].item()\n", - " y_position = data_frame.maxNumberOfInternalElementsPercentage[index].item()\n", - " artifact_name = data_frame.internalModuleName[index].item()\n", - "\n", - " label_box=dict(boxstyle=\"round4,pad=0.5\", fc=\"w\", alpha=0.8)\n", - " plot.annotate(artifact_name\n", - " ,xy=(x_position, y_position)\n", - " ,xycoords='data'\n", - " ,xytext=(-30, -15)\n", - " ,textcoords='offset points'\n", - " ,size=6\n", - " 
,bbox=label_box\n", - " ,arrowprops=dict(arrowstyle=\"-|>\", mutation_scale=10, color=\"black\")\n", - " )" + " if highest is None:\n", + " highest = []\n", + " by = [\"numberOfExternalPackages\", \"maxNumberOfPackagesPercentage\", \"artifactPackages\", \"artifactName\"]\n", + " ascending = [\n", + " \"numberOfExternalPackages\" not in highest,\n", + " \"maxNumberOfPackagesPercentage\" not in highest,\n", + " False,\n", + " True,\n", + " ]\n", + " return int(data_frame.sort_values(by=by, ascending=ascending).head(1).index[0])" ] }, { diff --git a/domains/external-dependencies/externalDependencyCharts.py b/domains/external-dependencies/externalDependencyCharts.py index 3d3f4a3fa..1a3380010 100644 --- a/domains/external-dependencies/externalDependencyCharts.py +++ b/domains/external-dependencies/externalDependencyCharts.py @@ -182,8 +182,10 @@ def filter_entries_below_percentage_threshold( threshold_percent: float, ) -> pd.DataFrame: """ - Returns only rows whose percentage share of the *original* total is strictly - below threshold_percent. Used to drill down into the 'others' slice. + Returns only rows whose percentage share of the *original* total is below + threshold_percent. Used to drill down into the 'others' slice. + Matches the grouping logic of group_small_values_into_others (< not <=) + to avoid double-counting at the threshold boundary. 
""" result = add_percentage_column(data_frame, value_column) percent_column = value_column + "Percent" @@ -452,9 +454,8 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: # ── Top external packages (Table 1 equivalent) ──────────────────────────── if not overall_data.empty: - top20 = overall_data.head(20) save_pie_chart_pair( - source_data=top20, + source_data=overall_data, value_column="numberOfExternalCallerTypes", name_column="externalPackageName", chart_name_prefix="Java_Top_external_packages_by_types", @@ -463,7 +464,7 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: verbose=verbose, ) save_pie_chart_pair( - source_data=top20, + source_data=overall_data, value_column="numberOfExternalCallerPackages", name_column="externalPackageName", chart_name_prefix="Java_Top_external_packages_by_packages", @@ -474,9 +475,8 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: # ── Second-level package grouping (Table 2 equivalent) ──────────────────── if not second_level_overall_data.empty: - top20_second_level = second_level_overall_data.head(20) save_pie_chart_pair( - source_data=top20_second_level, + source_data=second_level_overall_data, value_column="numberOfExternalCallerTypes", name_column="externalSecondLevelPackageName", chart_name_prefix="Java_Top_second_level_packages_by_types", @@ -485,7 +485,7 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: verbose=verbose, ) save_pie_chart_pair( - source_data=top20_second_level, + source_data=second_level_overall_data, value_column="numberOfExternalCallerPackages", name_column="externalSecondLevelPackageName", chart_name_prefix="Java_Top_second_level_packages_by_packages", @@ -496,9 +496,8 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: # ── Most spread external packages (Table 3 equivalent) ──────────────────── if not spread_data.empty: - top20_spread = 
spread_data.head(20) save_pie_chart_pair( - source_data=top20_spread, + source_data=spread_data, value_column="sumNumberOfTypes", name_column="externalPackageName", chart_name_prefix="Java_Most_spread_packages_by_types", @@ -507,7 +506,7 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: verbose=verbose, ) save_pie_chart_pair( - source_data=top20_spread, + source_data=spread_data, value_column="sumNumberOfPackages", name_column="externalPackageName", chart_name_prefix="Java_Most_spread_packages_by_packages", @@ -518,9 +517,8 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: # ── Most spread second-level packages (Table 4 equivalent) ──────────────── if not second_level_spread_data.empty: - top20_second_level_spread = second_level_spread_data.head(20) save_pie_chart_pair( - source_data=top20_second_level_spread, + source_data=second_level_spread_data, value_column="sumNumberOfTypes", name_column="externalSecondLevelPackageName", chart_name_prefix="Java_Most_spread_second_level_packages_by_types", @@ -529,7 +527,7 @@ def generate_java_charts(queries_directory: str, report_directory: str, verbose: verbose=verbose, ) save_pie_chart_pair( - source_data=top20_second_level_spread, + source_data=second_level_spread_data, value_column="sumNumberOfPackages", name_column="externalSecondLevelPackageName", chart_name_prefix="Java_Most_spread_second_level_packages_by_packages", @@ -633,9 +631,8 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve # ── Module usage overall ─────────────────────────────────────────────────── if not module_overall_data.empty: - top20_modules = module_overall_data.head(20) save_pie_chart_pair( - source_data=top20_modules, + source_data=module_overall_data, value_column="numberOfExternalCallerElements", name_column="externalModuleName", chart_name_prefix="Typescript_Top_external_modules_by_elements", @@ -644,7 +641,7 @@ def 
generate_typescript_charts(queries_directory: str, report_directory: str, ve verbose=verbose, ) save_pie_chart_pair( - source_data=top20_modules, + source_data=module_overall_data, value_column="numberOfExternalCallerModules", name_column="externalModuleName", chart_name_prefix="Typescript_Top_external_modules_by_modules", @@ -655,9 +652,8 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve # ── Namespace usage overall ──────────────────────────────────────────────── if not namespace_overall_data.empty: - top20_namespaces = namespace_overall_data.head(20) save_pie_chart_pair( - source_data=top20_namespaces, + source_data=namespace_overall_data, value_column="numberOfExternalCallerElements", name_column="externalNamespaceName", chart_name_prefix="Typescript_Top_external_namespaces_by_elements", @@ -666,7 +662,7 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve verbose=verbose, ) save_pie_chart_pair( - source_data=top20_namespaces, + source_data=namespace_overall_data, value_column="numberOfExternalCallerModules", name_column="externalNamespaceName", chart_name_prefix="Typescript_Top_external_namespaces_by_modules", @@ -677,9 +673,8 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve # ── Module spread ────────────────────────────────────────────────────────── if not module_spread_data.empty: - top20_module_spread = module_spread_data.head(20) save_pie_chart_pair( - source_data=top20_module_spread, + source_data=module_spread_data, value_column="sumNumberOfUsedExternalDeclarations", name_column="externalModuleName", chart_name_prefix="Typescript_Most_spread_modules_by_declarations", @@ -688,7 +683,7 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve verbose=verbose, ) save_pie_chart_pair( - source_data=top20_module_spread, + source_data=module_spread_data, value_column="numberOfInternalModules", name_column="externalModuleName", 
chart_name_prefix="Typescript_Most_spread_modules_by_modules", @@ -699,9 +694,8 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve # ── Namespace spread ─────────────────────────────────────────────────────── if not namespace_spread_data.empty: - top20_namespace_spread = namespace_spread_data.head(20) save_pie_chart_pair( - source_data=top20_namespace_spread, + source_data=namespace_spread_data, value_column="sumNumberOfUsedExternalDeclarations", name_column="externalModuleNamespace", chart_name_prefix="Typescript_Most_spread_namespaces_by_declarations", @@ -710,7 +704,7 @@ def generate_typescript_charts(queries_directory: str, report_directory: str, ve verbose=verbose, ) save_pie_chart_pair( - source_data=top20_namespace_spread, + source_data=namespace_spread_data, value_column="numberOfInternalModules", name_column="externalModuleNamespace", chart_name_prefix="Typescript_Most_spread_namespaces_by_modules", diff --git a/domains/git-history/explore/GitHistoryGeneralExploration.ipynb b/domains/git-history/explore/GitHistoryGeneralExploration.ipynb index 7a308f6ed..6870d8650 100644 --- a/domains/git-history/explore/GitHistoryGeneralExploration.ipynb +++ b/domains/git-history/explore/GitHistoryGeneralExploration.ipynb @@ -175,10 +175,10 @@ " **plotly_main_layout_base_settings\n", ")\n", "plotly_treemap_figure_show_settings = dict(\n", - " renderer=\"svg\" if is_command_line_execution() else None,\n", - " width=680 if is_command_line_execution() else 1080,\n", - " height=680 if is_command_line_execution() else 1080,\n", - " config={'scrollZoom': False, 'displaylogo': False, 'displayModeBar': False} if is_command_line_execution() else {}\n", + " renderer=None,\n", + " width=1080,\n", + " height=1080,\n", + " config={}\n", ")\n", "\n", "plotly_treemap_marker_base_style = dict(\n", @@ -191,27 +191,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "03ee42af", - "metadata": {}, - "outputs": [], - "source": [ - "def 
get_plotly_figure_write_image_settings(name: str):\n", - " \"\"\"\n", - " Returns the settings for the plotly figure write_image method\n", - " :param name: Name of the figure\n", - " :return: Dictionary with settings for the write_image method\n", - " \"\"\"\n", - " return dict(\n", - " file=get_offline_path() + \"/\" + name + \".svg\", \n", - " format=\"svg\", \n", - " width=1080, \n", - " height=1080\n", - " )" - ] - }, { "cell_type": "code", "execution_count": null, @@ -601,6 +580,11 @@ "git_files_with_commit_statistics = add_directory_name_column(git_files_with_commit_statistics, 'directoryPath', 'directoryName')\n", "git_files_with_commit_statistics = add_parent_directory_column(git_files_with_commit_statistics, 'directoryPath', 'directoryParentPath')\n", "\n", + "# Convert date columns to strings for JSON serialization in Plotly\n", + "git_files_with_commit_statistics['lastCommitDate'] = git_files_with_commit_statistics['lastCommitDate'].astype(str)\n", + "git_files_with_commit_statistics['lastCreationDate'] = git_files_with_commit_statistics['lastCreationDate'].astype(str)\n", + "git_files_with_commit_statistics['lastModificationDate'] = git_files_with_commit_statistics['lastModificationDate'].astype(str)\n", + "\n", "# Debug\n", "# display(\"5. 
added parent and name columns ------------\")\n", "# display(git_files_with_commit_statistics)\n", @@ -683,9 +667,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Directories and their file count'\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"NumberOfFilesPerDirectory\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -721,9 +703,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Most frequent file extension per directory'\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"MostFrequentFileExtensionPerDirectory\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -757,9 +737,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Number of git commits',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"NumberOfGitCommits\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -793,9 +771,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Number of distinct commit authors',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"NumberOfDistinctCommitAuthors\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -837,9 +813,7 @@ " title='Number of distinct commit authors (red/black = only one or very few authors)',\n", ")\n", "\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"NumberOfDistinctCommitAuthorsLowFocus\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] 
}, { @@ -875,9 +849,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Main authors with highest number of commits'\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"MainAuthorsWithHighestNumberOfCommits\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -914,9 +886,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Second author with the second highest number of commits'\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"SecondAuthorWithTheSecondHighestNumberOfCommits\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -951,9 +921,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Days since last commit',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"DaysSinceLastCommit\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -988,9 +956,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Rank of days since last commit',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"DaysSinceLastCommitRanked\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1024,9 +990,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Days since last file creation',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"DaysSinceLastFileCreation\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1060,9 +1024,7 @@ " **plotly_treemap_layout_base_settings,\n", " 
title='Rank of days since last file creation',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"DaysSinceLastFileCreationRanked\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1096,9 +1058,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Days since last file modification',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"DaysSinceLastFileModification\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1132,9 +1092,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Rank of days since last file modification',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"DaysSinceLastFileModificationRanked\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1200,9 +1158,7 @@ " xaxis_title='file count',\n", " yaxis_title='commit count'\n", " )\n", - " figure.show(**plotly_treemap_figure_show_settings)\n", - " if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"ChangedFilesPerCommit\"))" + " figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1312,9 +1268,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Files that likely co-change with others in update commits',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFiles\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1340,9 +1294,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Co-Changing files in update commits max lift (1=random, >1=more than random, 
<1=less than random)',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesMaxLift\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1368,9 +1320,7 @@ " **plotly_treemap_layout_base_settings,\n", " title='Co-Changing files in update commits average lift (1=random, >1=more than random, <1=less than random)',\n", ")\n", - "figure.show(**plotly_treemap_figure_show_settings)\n", - "if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(\"CoChangingFilesAverageLift\"))" + "figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1577,9 +1527,7 @@ " showlegend=False\n", " )\n", "\n", - " figure.show(**plotly_treemap_figure_show_settings)\n", - " if is_command_line_execution():\n", - " figure.write_image(**get_plotly_figure_write_image_settings(output_file_name))" + " figure.show(**plotly_treemap_figure_show_settings)" ] }, { @@ -1935,7 +1883,7 @@ } ], "kernelspec": { - "display_name": "codegraph", + "display_name": "code-graph-analysis-pipeline (3.12.8)", "language": "python", "name": "python3" }, diff --git a/domains/java/explore/MethodMetricsJavaExploration.ipynb b/domains/java/explore/MethodMetricsJavaExploration.ipynb index 26c66ef63..0d262573c 100644 --- a/domains/java/explore/MethodMetricsJavaExploration.ipynb +++ b/domains/java/explore/MethodMetricsJavaExploration.ipynb @@ -38,7 +38,7 @@ "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", "\n", - "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\") or \"\"))\n", "driver.verify_connectivity()" ] }, @@ -49,13 +49,15 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", + "def get_cypher_query_from_file(filename: str) -> str:\n", + " \"\"\"Read and return the contents of a Cypher query file.\"\"\"\n", " with open(filename) as file:\n", - " return ' '.join(file.readlines())\n", + " return \" \".join(file.readlines())\n", " \n", "\n", - "def query_cypher_to_data_frame(filename):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename))\n", + "def query_cypher_to_data_frame(filename: str) -> pd.DataFrame:\n", + " \"\"\"Execute a Cypher query from file and return results as a DataFrame.\"\"\"\n", + " records, _, keys = driver.execute_query(get_cypher_query_from_file(filename)) # type: ignore[arg-type]\n", " return pd.DataFrame([r.values() for r in records], columns=keys)" ] }, @@ -192,24 +194,24 @@ "if effective_method_line_count_distribution_normalized.empty:\n", " print(\"No data to plot\")\n", "else:\n", - " plot.figure();\n", - " method_line_count_x_ticks=range(1,20)\n", + " plot.figure()\n", + " method_line_count_x_ticks = range(1, 20)\n", " axes = effective_method_line_count_distribution_normalized.head(20).plot(\n", - " kind='line',\n", + " kind=\"line\",\n", " logx=True,\n", " grid=True,\n", " xlim=[2, 20],\n", " ylim=[0, 20],\n", " xticks=method_line_count_x_ticks,\n", - " title='Effective Method Line Count Distribution', \n", - " xlabel='effective line count',\n", - " ylabel='percent of methods',\n", + " title=\"Effective Method Line Count Distribution\", \n", + " 
xlabel=\"effective line count\",\n", + " ylabel=\"percent of methods\",\n", " cmap=main_color_map,\n", " figsize=(10, 6),\n", " lw=2,\n", " )\n", - " axes.set_xticklabels(method_line_count_x_ticks)\n", - " axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes.set_xticklabels([str(i) for i in method_line_count_x_ticks])\n", + " axes.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", " plot.show()" ] }, @@ -312,7 +314,7 @@ "source": [ "cyclomatic_method_complexity_distribution_max_artifacts=15\n", "\n", - "cyclomatic_method_complexity_distribution=query_cypher_to_data_frame(\"../cypher/Overview/Cyclomatic_Method_Complexity_Distribution.cypher\")\n", + "cyclomatic_method_complexity_distribution=query_cypher_to_data_frame(\"../queries/method-metrics/Cyclomatic_Method_Complexity_Distribution.cypher\")\n", "cyclomatic_method_complexity_distribution=cyclomatic_method_complexity_distribution.pivot(index='cyclomaticComplexity', columns='artifactName', values='methods')\n", "\n", "# Fill missing values with zero\n", @@ -370,26 +372,26 @@ "if cyclomatic_method_complexity_distribution_normalized.empty:\n", " print(\"No data to plot\")\n", "else:\n", - " plot.figure();\n", - " method_line_count_x_ticks=range(1,11)\n", - " cyclomatic_complexity_y_ticks=[1, 2, 3, 4, 5, 7, 10, 20, 30, 40, 50, 100]\n", + " plot.figure()\n", + " method_line_count_x_ticks = range(1, 11)\n", + " cyclomatic_complexity_y_ticks = [1, 2, 3, 4, 5, 7, 10, 20, 30, 40, 50, 100]\n", " axes = cyclomatic_method_complexity_distribution_normalized.plot(\n", - " kind='line', \n", + " kind=\"line\",\n", " logx=True,\n", " logy=True,\n", " grid=True,\n", - " xlim=[1,11],\n", - " ylim=[1,100],\n", + " xlim=[1, 11],\n", + " ylim=[1, 100],\n", " xticks=method_line_count_x_ticks,\n", " yticks=cyclomatic_complexity_y_ticks,\n", - " title='Cyclomatic complexity distribution of methods', \n", - " xlabel='cyclomatic complexity',\n", - " ylabel='percentage of methods',\n", + " title=\"Cyclomatic 
complexity distribution of methods\", \n", + " xlabel=\"cyclomatic complexity\",\n", + " ylabel=\"percentage of methods\",\n", " cmap=main_color_map,\n", " )\n", - " axes.set_xticklabels(method_line_count_x_ticks)\n", - " axes.set_yticklabels(cyclomatic_complexity_y_ticks)\n", - " axes.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", + " axes.set_xticklabels([str(i) for i in method_line_count_x_ticks])\n", + " axes.set_yticklabels([str(i) for i in cyclomatic_complexity_y_ticks])\n", + " axes.legend(bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n", " plot.show()" ] }, diff --git a/domains/java/javaCharts.py b/domains/java/javaCharts.py index a34e4a028..308882a76 100644 --- a/domains/java/javaCharts.py +++ b/domains/java/javaCharts.py @@ -44,6 +44,9 @@ TOP_ANNOTATION_LIMIT = 15 HORIZONTAL_BAR_COLOR = "steelblue" +LINE_COUNT_DISTRIBUTION_MAX_ARTIFACTS = 20 +CYCLOMATIC_DISTRIBUTION_MAX_ARTIFACTS = 15 +DISTRIBUTION_CHART_COLORMAP = "nipy_spectral" # ── Parameters ──────────────────────────────────────────────────────────────── @@ -236,28 +239,91 @@ def generate_spread_per_dependent_chart(report_directory: str, verbose: bool) -> # ── Method metrics charts ───────────────────────────────────────────────────── -def generate_method_line_count_distribution_chart(report_directory: str, verbose: bool) -> None: - """Generate a histogram showing the distribution of effective method line counts.""" - data_frame = load_csv(report_directory, "EffectiveMethodLineCountDistribution.csv", verbose) +def generate_normalized_distribution_chart( + report_directory: str, + verbose: bool, + *, + csv_filename: str, + index_column: str, + max_artifacts: int, + empty_message: str, + x_ticks: list[int], + x_lim: tuple[float, float], + y_lim: tuple[float, float], + x_label: str, + y_label: str, + title: str, + output_filename: str, + y_scale: str = "linear", + y_ticks: list[int] | None = None, +) -> None: + """Generate a normalized per-artifact line chart of a distribution from a CSV 
file.""" + data_frame = load_csv(report_directory, csv_filename, verbose) if data_frame.empty: return - # Aggregate across all artifacts: sum method counts per line count - distribution = data_frame.groupby("effectiveLineCount")["methods"].sum().reset_index() - figure, axis = plot.subplots(figsize=(FIGURE_WIDTH, FIGURE_HEIGHT)) - axis.bar( - distribution["effectiveLineCount"], - distribution["methods"], - width=1.0, - color=HORIZONTAL_BAR_COLOR, - edgecolor="white", - linewidth=0.3, + distribution = ( + data_frame + .pivot(index=index_column, columns="artifactName", values="methods") + .fillna(0) + .astype(int) ) - axis.set_xlabel("Effective Line Count") - axis.set_ylabel("Number of Methods") - axis.set_title("Effective Method Line Count Distribution") + artifact_totals = distribution.sum() + top_artifacts = artifact_totals.sort_values(ascending=False).index[:max_artifacts] + distribution = distribution[top_artifacts] + + # Filter out columns with zero sum to prevent NaN/Inf after normalization. 
+ non_zero_columns = distribution.columns[distribution.sum(axis=0) > 0] + distribution = distribution[non_zero_columns] + + if distribution.empty: + print(f"{SCRIPT_NAME}: {empty_message}") + return + + normalized = distribution.div(distribution.sum(axis=0), axis=1).multiply(100) + + colormap = matplotlib.colormaps[DISTRIBUTION_CHART_COLORMAP] + num_artifacts = len(normalized.columns) + colors = [colormap(i / max(num_artifacts - 1, 1)) for i in range(num_artifacts)] + + figure, axis = plot.subplots(figsize=(10, 6)) + for i, column in enumerate(normalized.columns): + axis.plot(normalized.index, normalized[column], label=column, color=colors[i], linewidth=2) - save_figure(figure, report_directory, "MethodMetrics_LineCountDistribution_Histogram", verbose) + axis.set_xscale("log") + axis.set_yscale(y_scale) + axis.set_xlim(*x_lim) + axis.set_ylim(*y_lim) + axis.set_xticks(x_ticks) + axis.set_xticklabels([str(t) for t in x_ticks]) + if y_ticks is not None: + axis.set_yticks(y_ticks) + axis.set_yticklabels([str(t) for t in y_ticks]) + axis.set_xlabel(x_label) + axis.set_ylabel(y_label) + axis.set_title(title) + axis.grid(True) + axis.legend(bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=7) + + save_figure(figure, report_directory, output_filename, verbose) + + +def generate_method_line_count_distribution_chart(report_directory: str, verbose: bool) -> None: + """Generate a normalized per-artifact line chart of effective method line count distribution.""" + generate_normalized_distribution_chart( + report_directory, verbose, + csv_filename="EffectiveMethodLineCountDistribution.csv", + index_column="effectiveLineCount", + max_artifacts=LINE_COUNT_DISTRIBUTION_MAX_ARTIFACTS, + empty_message="No data for method line count distribution, skipping chart.", + x_ticks=list(range(2, 21)), + x_lim=(2, 20), + y_lim=(0, 20), + x_label="Effective Line Count", + y_label="Percent of Methods", + title="Effective Method Line Count Distribution (Normalized)", + 
output_filename="MethodMetrics_LineCountDistribution_Histogram", + ) def generate_top_types_by_loc_chart(report_directory: str, verbose: bool) -> None: @@ -298,6 +364,26 @@ def generate_top_packages_by_loc_chart(report_directory: str, verbose: bool) -> save_figure(figure, report_directory, "MethodMetrics_TopPackagesLOC_Bar", verbose) +def generate_cyclomatic_complexity_distribution_chart(report_directory: str, verbose: bool) -> None: + """Generate a normalized per-artifact line chart of cyclomatic method complexity distribution.""" + generate_normalized_distribution_chart( + report_directory, verbose, + csv_filename="CyclomaticMethodComplexityDistribution.csv", + index_column="cyclomaticComplexity", + max_artifacts=CYCLOMATIC_DISTRIBUTION_MAX_ARTIFACTS, + empty_message="No data for cyclomatic complexity distribution, skipping chart.", + x_ticks=list(range(1, 12)), + x_lim=(1, 11), + y_lim=(1, 100), + x_label="Cyclomatic Complexity", + y_label="Percentage of Methods", + title="Cyclomatic Complexity Distribution of Methods (Normalized)", + output_filename="MethodMetrics_CyclomaticComplexityDistribution_Normalized", + y_scale="log", + y_ticks=[1, 2, 3, 4, 5, 7, 10, 20, 30, 40, 50, 100], + ) + + # ── Java code quality charts ────────────────────────────────────────────────── def generate_annotation_type_distribution_chart(report_directory: str, verbose: bool) -> None: @@ -430,6 +516,7 @@ def generate_all_charts(report_directory: str, verbose: bool) -> None: generate_spread_per_dependency_chart(report_directory, verbose) generate_spread_per_dependent_chart(report_directory, verbose) generate_method_line_count_distribution_chart(report_directory, verbose) + generate_cyclomatic_complexity_distribution_chart(report_directory, verbose) generate_top_types_by_loc_chart(report_directory, verbose) generate_top_packages_by_loc_chart(report_directory, verbose) generate_annotation_type_distribution_chart(report_directory, verbose) diff --git a/domains/java/javaCsv.sh 
b/domains/java/javaCsv.sh index feef3952a..5ec2d6aac 100644 --- a/domains/java/javaCsv.sh +++ b/domains/java/javaCsv.sh @@ -83,6 +83,9 @@ execute_cypher "${JAVA_CODE_QUALITY_CYPHER_DIR}/JakartaEE_REST_Annotations.cyphe execute_cypher "${METHOD_METRICS_CYPHER_DIR}/Effective_Method_Line_Count_Distribution.cypher" \ > "${FULL_REPORT_DIRECTORY}/EffectiveMethodLineCountDistribution.csv" +execute_cypher "${METHOD_METRICS_CYPHER_DIR}/Cyclomatic_Method_Complexity_Distribution.cypher" \ + > "${FULL_REPORT_DIRECTORY}/CyclomaticMethodComplexityDistribution.csv" + execute_cypher "${METHOD_METRICS_CYPHER_DIR}/Effective_lines_of_method_code_per_type.cypher" \ > "${FULL_REPORT_DIRECTORY}/EffectiveLinesOfMethodCodePerType.csv" diff --git a/domains/java/queries/method-metrics/Cyclomatic_Method_Complexity_Distribution.cypher b/domains/java/queries/method-metrics/Cyclomatic_Method_Complexity_Distribution.cypher new file mode 100644 index 000000000..af4a2d983 --- /dev/null +++ b/domains/java/queries/method-metrics/Cyclomatic_Method_Complexity_Distribution.cypher @@ -0,0 +1,9 @@ + // Cyclomatic Method Complexity Distribution. + + MATCH (artifact:Artifact)-[:CONTAINS]->(type:Type)-[:DECLARES]->(method:Method) + WHERE method.effectiveLineCount > 0 + WITH last(split(artifact.fileName, '/')) AS artifactName + ,method.cyclomaticComplexity AS cyclomaticComplexity + ,count(method) AS methods +RETURN artifactName, cyclomaticComplexity, methods + ORDER BY artifactName ASC, cyclomaticComplexity \ No newline at end of file diff --git a/domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb b/domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb index eb34552fe..3620c6a8c 100644 --- a/domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb +++ b/domains/node-embeddings/explore/NodeEmbeddingsJavaExploration.ipynb @@ -114,7 +114,7 @@ "# before starting jupyter notebook to provide the password for the user \"neo4j\". 
\n", "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", "\n", - "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\"))) # pyright: ignore[reportArgumentType]\n", "driver.verify_connectivity()" ] }, @@ -125,13 +125,13 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", + "def get_cypher_query_from_file(filename) -> str:\n", " with open(filename) as file:\n", " return ' '.join(file.readlines())\n", " \n", "\n", - "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", + "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None) -> pd.DataFrame:\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_) # pyright: ignore\n", " return pd.DataFrame([r.values() for r in records], columns=keys)\n", "\n", "\n", @@ -173,15 +173,15 @@ " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", " \"\"\"\n", " \n", - " is_data_missing=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", + " is_data_missing=query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", " if is_data_missing: return False\n", "\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", 
parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", " # To include the direction of the relationships use the following line to create the projection:\n", - " # query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", parameters)\n", + " # query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", parameters)\n", " return True" ] }, @@ -204,7 +204,7 @@ " parameters = dict(\n", " dependencies_projection=projection_name,\n", " )\n", - " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", + " return query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", "\n", "\n", "def get_projected_graph_node_count(projection_name: str) -> int:\n", @@ -275,7 +275,7 @@ " print(\"No projected data for node embeddings calculation available\")\n", " return empty_embeddings()\n", "\n", - " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", + " 
existing_embeddings_query_filename=\"../queries/node-embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", " display(embeddings.head()) # Display the first entries of the table\n", " return embeddings" @@ -315,10 +315,10 @@ " print(\"GraphSAGE node embeddings training will be skipped for \" + str(node_count) + \" (>500) nodes, since it is computationally expensive and not eagerly needed for demonstration purposes.\")\n", " return empty_embeddings()\n", "\n", - " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", - " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", - " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", + " display(query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", + " embeddings=query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", " \n", " display(embeddings.head()) # Display the first entries of the table\n", " return embeddings" @@ -707,7 +707,7 @@ " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings_fastRP = 
create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n" + "embeddings_fastRP = create_node_embeddings(\"../queries/node-embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", java_package_embeddings_parameters)\n" ] }, { @@ -777,7 +777,7 @@ " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", " \"dependencies_projection_embedding_dimension\":\"64\"\n", "}\n", - "embeddings_hashGNN = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n", + "embeddings_hashGNN = create_node_embeddings(\"../queries/node-embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", java_package_embeddings_parameters)\n", "embeddings_hashGNN = prepare_node_embeddings_for_2d_visualization(embeddings_hashGNN)\n", "scores_hashGNN = CommunityScores.calculate(embeddings_hashGNN)\n", "plot_2d_node_embeddings(embeddings_hashGNN, get_plot_title(\"Java Packages\", \"HashGNN\", scores_hashGNN))" @@ -803,7 +803,7 @@ " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings_node2vec = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n", + "embeddings_node2vec = create_node_embeddings(\"../queries/node-embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", java_package_embeddings_parameters)\n", "embeddings_node2vec = prepare_node_embeddings_for_2d_visualization(embeddings_node2vec)\n", "scores_node2vec = CommunityScores.calculate(embeddings_node2vec)\n", "plot_2d_node_embeddings(embeddings_node2vec, get_plot_title(\"Java Packages\", \"node2vec\", scores_node2vec))" diff --git a/domains/node-embeddings/explore/NodeEmbeddingsTypescriptExploration.ipynb 
b/domains/node-embeddings/explore/NodeEmbeddingsTypescriptExploration.ipynb index 9bd3bbd34..726adf407 100644 --- a/domains/node-embeddings/explore/NodeEmbeddingsTypescriptExploration.ipynb +++ b/domains/node-embeddings/explore/NodeEmbeddingsTypescriptExploration.ipynb @@ -114,7 +114,7 @@ "# before starting jupyter notebook to provide the password for the user \"neo4j\". \n", "# It is not recommended to hardcode the password into jupyter notebook for security reasons.\n", "\n", - "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\")))\n", + "driver = GraphDatabase.driver(uri=\"bolt://localhost:7687\", auth=(\"neo4j\", os.environ.get(\"NEO4J_INITIAL_PASSWORD\"))) # pyright: ignore[reportArgumentType]\n", "driver.verify_connectivity()" ] }, @@ -125,13 +125,13 @@ "metadata": {}, "outputs": [], "source": [ - "def get_cypher_query_from_file(filename):\n", + "def get_cypher_query_from_file(filename) -> str:\n", " with open(filename) as file:\n", " return ' '.join(file.readlines())\n", " \n", "\n", - "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None):\n", - " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_)\n", + "def query_cypher_to_data_frame(filename, parameters_: typ.Optional[typ.Dict[str, typ.Any]] = None) -> pd.DataFrame:\n", + " records, summary, keys = driver.execute_query(get_cypher_query_from_file(filename),parameters_=parameters_) # pyright: ignore\n", " return pd.DataFrame([r.values() for r in records], columns=keys)\n", "\n", "\n", @@ -173,15 +173,15 @@ " The number of the dimensions and therefore size of the resulting array of floating point numbers\n", " \"\"\"\n", " \n", - " is_data_missing=query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", + " 
is_data_missing=query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_0_Check_Projectable.cypher\", parameters).empty\n", " if is_data_missing: return False\n", "\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_1_Delete_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_2_Delete_Subgraph.cypher\", parameters)\n", " # To include the direction of the relationships use the following line to create the projection:\n", - " # query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", parameters)\n", + " # query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_3_Create_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_4_Create_Undirected_Projection.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_5_Create_Subgraph.cypher\", parameters)\n", " return True" ] }, @@ -204,7 +204,7 @@ " parameters = dict(\n", " dependencies_projection=projection_name,\n", " )\n", - " return query_cypher_to_data_frame(\"../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", parameters)\n", + " return query_cypher_to_data_frame(\"../../../cypher/Dependencies_Projection/Dependencies_12_Get_Projection_Statistics.cypher\", 
parameters)\n", "\n", "\n", "def get_projected_graph_node_count(projection_name: str) -> int:\n", @@ -275,7 +275,7 @@ " print(\"No projected data for node embeddings calculation available\")\n", " return empty_embeddings()\n", "\n", - " existing_embeddings_query_filename=\"../cypher/Node_Embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", + " existing_embeddings_query_filename=\"../queries/node-embeddings/Node_Embeddings_0a_Query_Calculated.cypher\"\n", " embeddings = query_first_non_empty_cypher_to_data_frame(existing_embeddings_query_filename, cypher_file_name, parameters=parameters)\n", " display(embeddings.head()) # Display the first entries of the table\n", " return embeddings" @@ -315,10 +315,10 @@ " print(\"GraphSAGE node embeddings training will be skipped for \" + str(node_count) + \" (>500) nodes, since it is computationally expensive and not eagerly needed for demonstration purposes.\")\n", " return empty_embeddings()\n", " \n", - " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", - " query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", - " display(query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", - " embeddings=query_cypher_to_data_frame(\"../cypher/Node_Embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_0b_Prepare_Degree.cypher\", parameters)\n", + " query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_0c_Drop_Model.cypher\", parameters)\n", + " display(query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_4b_GraphSAGE_Train.cypher\", parameters))\n", + " embeddings=query_cypher_to_data_frame(\"../queries/node-embeddings/Node_Embeddings_4d_GraphSAGE_Stream.cypher\", parameters)\n", " \n", " display(embeddings.head()) # 
Display the first entries of the table\n", " return embeddings" @@ -707,7 +707,7 @@ " \"dependencies_projection_write_property\": \"embeddingsFastRandomProjection\",\n", " \"dependencies_projection_embedding_dimension\":\"32\" \n", "}\n", - "embeddings_fastRP = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n" + "embeddings_fastRP = create_node_embeddings(\"../queries/node-embeddings/Node_Embeddings_1d_Fast_Random_Projection_Stream.cypher\", typescript_module_embeddings_parameters)\n" ] }, { @@ -777,7 +777,7 @@ " \"dependencies_projection_write_property\": \"embeddingsHashGNN\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings_hashGNN = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", typescript_module_embeddings_parameters)\n", + "embeddings_hashGNN = create_node_embeddings(\"../queries/node-embeddings/Node_Embeddings_2d_Hash_GNN_Stream.cypher\", typescript_module_embeddings_parameters)\n", "embeddings_hashGNN = prepare_node_embeddings_for_2d_visualization(embeddings_hashGNN)\n", "scores_hashGNN = CommunityScores.calculate(embeddings_hashGNN)\n", "plot_2d_node_embeddings(embeddings_hashGNN, get_plot_title(\"TypeScript Modules\", \"HashGNN\", scores_hashGNN))" @@ -806,7 +806,7 @@ " \"dependencies_projection_write_property\": \"embeddingsNode2Vec\",\n", " \"dependencies_projection_embedding_dimension\":\"32\"\n", "}\n", - "embeddings_node2vec = create_node_embeddings(\"../cypher/Node_Embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", typescript_module_embeddings_parameters)\n", + "embeddings_node2vec = create_node_embeddings(\"../queries/node-embeddings/Node_Embeddings_3d_Node2Vec_Stream.cypher\", typescript_module_embeddings_parameters)\n", "embeddings_node2vec = prepare_node_embeddings_for_2d_visualization(embeddings_node2vec)\n", "scores_node2vec = 
CommunityScores.calculate(embeddings_node2vec)\n", "plot_2d_node_embeddings(embeddings_node2vec, get_plot_title(\"TypeScript Modules\", \"node2vec\", scores_node2vec))" diff --git a/pyproject.toml b/pyproject.toml index f9fffb450..6aae914d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,16 @@ [project] name = "code-graph-analysis-pipeline" -version = "4.0.0" +version = "4.0.1" requires-python = ">=3.12,<3.14" dependencies = [ # --- Jupyter (for notebook environments) --- "ipykernel==7.2.0", + "nbformat==5.10.4", # Required for plotly to render plots in notebooks. + # --- Data manipulation and plotting --- "matplotlib==3.10.9", "numpy==2.2.5", "pandas==2.3.3", "pip==26.1", - "setuptools==80.10.2", # opentsne uses sklearn.base uses joblib uses distutils missing in Python >= 12 (TODO use native openTSNE?) # --- Visualization --- "wordcloud==1.9.6", "monotonic==1.6", diff --git a/uv.lock b/uv.lock index dd69412a8..a0576a1a5 100644 --- a/uv.lock +++ b/uv.lock @@ -34,6 +34,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, ] +[[package]] +name = "attrs" +version = "26.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9a/8e/82a0fe20a541c03148528be8cac2408564a6c9a0cc7e9171802bc1d26985/attrs-26.1.0.tar.gz", hash = "sha256:d03ceb89cb322a8fd706d4fb91940737b6642aa36998fe130a9bc96c985eff32", size = 952055, upload-time = "2026-03-19T14:22:25.026Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/64/b4/17d4b0b2a2dc85a6df63d1157e028ed19f90d4cd97c36717afef2bc2f395/attrs-26.1.0-py3-none-any.whl", hash = "sha256:c647aa4a12dfbad9333ca4e71fe62ddc36f4e63b2d260a37a8b83d2f043ac309", size = 67548, upload-time = "2026-03-19T14:22:23.645Z" }, +] + 
[[package]] name = "cffi" version = "2.0.0" @@ -94,12 +103,13 @@ wheels = [ [[package]] name = "code-graph-analysis-pipeline" -version = "4.0.0" +version = "4.0.1" source = { virtual = "." } dependencies = [ { name = "ipykernel" }, { name = "matplotlib" }, { name = "monotonic" }, + { name = "nbformat" }, { name = "neo4j" }, { name = "numpy" }, { name = "optuna" }, @@ -108,7 +118,6 @@ dependencies = [ { name = "plotly", extra = ["kaleido"] }, { name = "scikit-learn" }, { name = "seaborn" }, - { name = "setuptools" }, { name = "shap" }, { name = "umap-learn" }, { name = "wordcloud" }, @@ -119,6 +128,7 @@ requires-dist = [ { name = "ipykernel", specifier = "==7.2.0" }, { name = "matplotlib", specifier = "==3.10.9" }, { name = "monotonic", specifier = "==1.6" }, + { name = "nbformat", specifier = "==5.10.4" }, { name = "neo4j", specifier = "==6.2.0" }, { name = "numpy", specifier = "==2.2.5" }, { name = "optuna", specifier = "==4.7.0" }, @@ -127,7 +137,6 @@ requires-dist = [ { name = "plotly", extras = ["kaleido"], specifier = "==6.7.0" }, { name = "scikit-learn", specifier = "==1.8.0" }, { name = "seaborn", specifier = "==0.13.2" }, - { name = "setuptools", specifier = "==80.10.2" }, { name = "shap", specifier = "==0.50.0" }, { name = "umap-learn", specifier = "==0.5.11" }, { name = "wordcloud", specifier = "==1.9.6" }, @@ -251,6 +260,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, ] +[[package]] +name = "fastjsonschema" +version = "2.21.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/b5/23b216d9d985a956623b6bd12d4086b60f0059b27799f23016af04a74ea1/fastjsonschema-2.21.2.tar.gz", hash = 
"sha256:b1eb43748041c880796cd077f1a07c3d94e93ae84bba5ed36800a33554ae05de", size = 374130, upload-time = "2025-08-14T18:49:36.666Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/a8/20d0723294217e47de6d9e2e40fd4a9d2f7c4b6ef974babd482a59743694/fastjsonschema-2.21.2-py3-none-any.whl", hash = "sha256:1c797122d0a86c5cace2e54bf4e819c36223b552017172f32c5c024a6b77e463", size = 24024, upload-time = "2025-08-14T18:49:34.776Z" }, +] + [[package]] name = "fonttools" version = "4.62.1" @@ -379,6 +397,33 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, ] +[[package]] +name = "jsonschema" +version = "4.26.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "jsonschema-specifications" }, + { name = "referencing" }, + { name = "rpds-py" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583, upload-time = "2026-01-07T13:41:07.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630, upload-time = "2026-01-07T13:41:05.306Z" }, +] + +[[package]] +name = "jsonschema-specifications" +version = "2025.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "referencing" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/19/74/a633ee74eb36c44aa6d1095e7cc5569bebf04342ee146178e2d36600708b/jsonschema_specifications-2025.9.1.tar.gz", hash = "sha256:b540987f239e745613c7a9176f3edb72b832a4ac465cf02712288397832b5e8d", size = 32855, upload-time = "2025-09-08T01:34:59.186Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/41/45/1a4ed80516f02155c51f51e8cedb3c1902296743db0bbc66608a0db2814f/jsonschema_specifications-2025.9.1-py3-none-any.whl", hash = "sha256:98802fee3a11ee76ecaca44429fda8a41bff98b00a0f2838151b113f210cc6fe", size = 18437, upload-time = "2025-09-08T01:34:57.871Z" }, +] + [[package]] name = "jupyter-client" version = "8.8.0" @@ -627,6 +672,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/e1/68c2256b69a314eba133673377ba9118c356f6342a0c02b61de449cf2bf2/narwhals-2.21.0-py3-none-any.whl", hash = "sha256:1e6617d0fca68ae1fda29e5397c4eaacd3ffc9fffe6bcd6ded0c690475e853be", size = 451943, upload-time = "2026-05-08T12:29:01.058Z" }, ] +[[package]] +name = "nbformat" +version = "5.10.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "fastjsonschema" }, + { name = "jsonschema" }, + { name = "jupyter-core" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/fd/91545e604bc3dad7dca9ed03284086039b294c6b3d75c0d2fa45f9e9caf3/nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a", size = 142749, upload-time = "2024-04-04T11:20:37.371Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/82/0340caa499416c78e5d8f5f05947ae4bc3cba53c9f038ab6e9ed964e22f1/nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b", size = 78454, upload-time = "2024-04-04T11:20:34.895Z" }, +] + [[package]] name = "neo4j" version = "6.2.0" @@ -1083,6 +1143,72 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/f6/fa/f8aea7a28b0641f31d40dea42d7ef003fded31e184ef47db696bc74cd610/pyzmq-27.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:6bb54ca21bcfe361e445256c15eedf083f153811c37be87e0514934d6913061e", size = 561541, upload-time = "2025-09-08T23:08:42.668Z" }, ] +[[package]] +name = "referencing" +version = "0.37.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "attrs" }, + { name = "rpds-py" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/58/ca301544e1fa93ed4f80d724bf5b194f6e4b945841c5bfd555878eea9fcb/referencing-0.37.0-py3-none-any.whl", hash = "sha256:381329a9f99628c9069361716891d34ad94af76e461dcb0335825aecc7692231", size = 26766, upload-time = "2025-10-13T15:30:47.625Z" }, +] + +[[package]] +name = "rpds-py" +version = "0.30.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" }, + { url = 
"https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" }, + { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" }, + { url = "https://files.pythonhosted.org/packages/d4/36/eb2eb8515e2ad24c0bd43c3ee9cd74c33f7ca6430755ccdb240fd3144c44/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a1010ed9524c73b94d15919ca4d41d8780980e1765babf85f9a2f90d247153dd", size = 408951, upload-time = "2025-11-30T20:22:23.408Z" }, + { url = "https://files.pythonhosted.org/packages/d6/65/ad8dc1784a331fabbd740ef6f71ce2198c7ed0890dab595adb9ea2d775a1/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8d1736cfb49381ba528cd5baa46f82fdc65c06e843dab24dd70b63d09121b3f", size = 514622, upload-time = "2025-11-30T20:22:25.16Z" }, + { url = "https://files.pythonhosted.org/packages/63/8e/0cfa7ae158e15e143fe03993b5bcd743a59f541f5952e1546b1ac1b5fd45/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d948b135c4693daff7bc2dcfc4ec57237a29bd37e60c2fabf5aff2bbacf3e2f1", size = 414492, upload-time = "2025-11-30T20:22:26.505Z" }, + { url = "https://files.pythonhosted.org/packages/60/1b/6f8f29f3f995c7ffdde46a626ddccd7c63aefc0efae881dc13b6e5d5bb16/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47f236970bccb2233267d89173d3ad2703cd36a0e2a6e92d0560d333871a3d23", size = 394080, upload-time = "2025-11-30T20:22:27.934Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/d5/a266341051a7a3ca2f4b750a3aa4abc986378431fc2da508c5034d081b70/rpds_py-0.30.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:2e6ecb5a5bcacf59c3f912155044479af1d0b6681280048b338b28e364aca1f6", size = 408680, upload-time = "2025-11-30T20:22:29.341Z" }, + { url = "https://files.pythonhosted.org/packages/10/3b/71b725851df9ab7a7a4e33cf36d241933da66040d195a84781f49c50490c/rpds_py-0.30.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a8fa71a2e078c527c3e9dc9fc5a98c9db40bcc8a92b4e8858e36d329f8684b51", size = 423589, upload-time = "2025-11-30T20:22:31.469Z" }, + { url = "https://files.pythonhosted.org/packages/00/2b/e59e58c544dc9bd8bd8384ecdb8ea91f6727f0e37a7131baeff8d6f51661/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73c67f2db7bc334e518d097c6d1e6fed021bbc9b7d678d6cc433478365d1d5f5", size = 573289, upload-time = "2025-11-30T20:22:32.997Z" }, + { url = "https://files.pythonhosted.org/packages/da/3e/a18e6f5b460893172a7d6a680e86d3b6bc87a54c1f0b03446a3c8c7b588f/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:5ba103fb455be00f3b1c2076c9d4264bfcb037c976167a6047ed82f23153f02e", size = 599737, upload-time = "2025-11-30T20:22:34.419Z" }, + { url = "https://files.pythonhosted.org/packages/5c/e2/714694e4b87b85a18e2c243614974413c60aa107fd815b8cbc42b873d1d7/rpds_py-0.30.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7cee9c752c0364588353e627da8a7e808a66873672bcb5f52890c33fd965b394", size = 563120, upload-time = "2025-11-30T20:22:35.903Z" }, + { url = "https://files.pythonhosted.org/packages/6f/ab/d5d5e3bcedb0a77f4f613706b750e50a5a3ba1c15ccd3665ecc636c968fd/rpds_py-0.30.0-cp312-cp312-win32.whl", hash = "sha256:1ab5b83dbcf55acc8b08fc62b796ef672c457b17dbd7820a11d6c52c06839bdf", size = 223782, upload-time = "2025-11-30T20:22:37.271Z" }, + { url = 
"https://files.pythonhosted.org/packages/39/3b/f786af9957306fdc38a74cef405b7b93180f481fb48453a114bb6465744a/rpds_py-0.30.0-cp312-cp312-win_amd64.whl", hash = "sha256:a090322ca841abd453d43456ac34db46e8b05fd9b3b4ac0c78bcde8b089f959b", size = 240463, upload-time = "2025-11-30T20:22:39.021Z" }, + { url = "https://files.pythonhosted.org/packages/f3/d2/b91dc748126c1559042cfe41990deb92c4ee3e2b415f6b5234969ffaf0cc/rpds_py-0.30.0-cp312-cp312-win_arm64.whl", hash = "sha256:669b1805bd639dd2989b281be2cfd951c6121b65e729d9b843e9639ef1fd555e", size = 230868, upload-time = "2025-11-30T20:22:40.493Z" }, + { url = "https://files.pythonhosted.org/packages/ed/dc/d61221eb88ff410de3c49143407f6f3147acf2538c86f2ab7ce65ae7d5f9/rpds_py-0.30.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f83424d738204d9770830d35290ff3273fbb02b41f919870479fab14b9d303b2", size = 374887, upload-time = "2025-11-30T20:22:41.812Z" }, + { url = "https://files.pythonhosted.org/packages/fd/32/55fb50ae104061dbc564ef15cc43c013dc4a9f4527a1f4d99baddf56fe5f/rpds_py-0.30.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e7536cd91353c5273434b4e003cbda89034d67e7710eab8761fd918ec6c69cf8", size = 358904, upload-time = "2025-11-30T20:22:43.479Z" }, + { url = "https://files.pythonhosted.org/packages/58/70/faed8186300e3b9bdd138d0273109784eea2396c68458ed580f885dfe7ad/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2771c6c15973347f50fece41fc447c054b7ac2ae0502388ce3b6738cd366e3d4", size = 389945, upload-time = "2025-11-30T20:22:44.819Z" }, + { url = "https://files.pythonhosted.org/packages/bd/a8/073cac3ed2c6387df38f71296d002ab43496a96b92c823e76f46b8af0543/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0a59119fc6e3f460315fe9d08149f8102aa322299deaa5cab5b40092345c2136", size = 407783, upload-time = "2025-11-30T20:22:46.103Z" }, + { url = 
"https://files.pythonhosted.org/packages/77/57/5999eb8c58671f1c11eba084115e77a8899d6e694d2a18f69f0ba471ec8b/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76fec018282b4ead0364022e3c54b60bf368b9d926877957a8624b58419169b7", size = 515021, upload-time = "2025-11-30T20:22:47.458Z" }, + { url = "https://files.pythonhosted.org/packages/e0/af/5ab4833eadc36c0a8ed2bc5c0de0493c04f6c06de223170bd0798ff98ced/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:692bef75a5525db97318e8cd061542b5a79812d711ea03dbc1f6f8dbb0c5f0d2", size = 414589, upload-time = "2025-11-30T20:22:48.872Z" }, + { url = "https://files.pythonhosted.org/packages/b7/de/f7192e12b21b9e9a68a6d0f249b4af3fdcdff8418be0767a627564afa1f1/rpds_py-0.30.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9027da1ce107104c50c81383cae773ef5c24d296dd11c99e2629dbd7967a20c6", size = 394025, upload-time = "2025-11-30T20:22:50.196Z" }, + { url = "https://files.pythonhosted.org/packages/91/c4/fc70cd0249496493500e7cc2de87504f5aa6509de1e88623431fec76d4b6/rpds_py-0.30.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9cf69cdda1f5968a30a359aba2f7f9aa648a9ce4b580d6826437f2b291cfc86e", size = 408895, upload-time = "2025-11-30T20:22:51.87Z" }, + { url = "https://files.pythonhosted.org/packages/58/95/d9275b05ab96556fefff73a385813eb66032e4c99f411d0795372d9abcea/rpds_py-0.30.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a4796a717bf12b9da9d3ad002519a86063dcac8988b030e405704ef7d74d2d9d", size = 422799, upload-time = "2025-11-30T20:22:53.341Z" }, + { url = "https://files.pythonhosted.org/packages/06/c1/3088fc04b6624eb12a57eb814f0d4997a44b0d208d6cace713033ff1a6ba/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5d4c2aa7c50ad4728a094ebd5eb46c452e9cb7edbfdb18f9e1221f597a73e1e7", size = 572731, upload-time = "2025-11-30T20:22:54.778Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/42/c612a833183b39774e8ac8fecae81263a68b9583ee343db33ab571a7ce55/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ba81a9203d07805435eb06f536d95a266c21e5b2dfbf6517748ca40c98d19e31", size = 599027, upload-time = "2025-11-30T20:22:56.212Z" }, + { url = "https://files.pythonhosted.org/packages/5f/60/525a50f45b01d70005403ae0e25f43c0384369ad24ffe46e8d9068b50086/rpds_py-0.30.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:945dccface01af02675628334f7cf49c2af4c1c904748efc5cf7bbdf0b579f95", size = 563020, upload-time = "2025-11-30T20:22:58.2Z" }, + { url = "https://files.pythonhosted.org/packages/0b/5d/47c4655e9bcd5ca907148535c10e7d489044243cc9941c16ed7cd53be91d/rpds_py-0.30.0-cp313-cp313-win32.whl", hash = "sha256:b40fb160a2db369a194cb27943582b38f79fc4887291417685f3ad693c5a1d5d", size = 223139, upload-time = "2025-11-30T20:23:00.209Z" }, + { url = "https://files.pythonhosted.org/packages/f2/e1/485132437d20aa4d3e1d8b3fb5a5e65aa8139f1e097080c2a8443201742c/rpds_py-0.30.0-cp313-cp313-win_amd64.whl", hash = "sha256:806f36b1b605e2d6a72716f321f20036b9489d29c51c91f4dd29a3e3afb73b15", size = 240224, upload-time = "2025-11-30T20:23:02.008Z" }, + { url = "https://files.pythonhosted.org/packages/24/95/ffd128ed1146a153d928617b0ef673960130be0009c77d8fbf0abe306713/rpds_py-0.30.0-cp313-cp313-win_arm64.whl", hash = "sha256:d96c2086587c7c30d44f31f42eae4eac89b60dabbac18c7669be3700f13c3ce1", size = 230645, upload-time = "2025-11-30T20:23:03.43Z" }, + { url = "https://files.pythonhosted.org/packages/ff/1b/b10de890a0def2a319a2626334a7f0ae388215eb60914dbac8a3bae54435/rpds_py-0.30.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:eb0b93f2e5c2189ee831ee43f156ed34e2a89a78a66b98cadad955972548be5a", size = 364443, upload-time = "2025-11-30T20:23:04.878Z" }, + { url = "https://files.pythonhosted.org/packages/0d/bf/27e39f5971dc4f305a4fb9c672ca06f290f7c4e261c568f3dea16a410d47/rpds_py-0.30.0-cp313-cp313t-macosx_11_0_arm64.whl", 
hash = "sha256:922e10f31f303c7c920da8981051ff6d8c1a56207dbdf330d9047f6d30b70e5e", size = 353375, upload-time = "2025-11-30T20:23:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/40/58/442ada3bba6e8e6615fc00483135c14a7538d2ffac30e2d933ccf6852232/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdc62c8286ba9bf7f47befdcea13ea0e26bf294bda99758fd90535cbaf408000", size = 383850, upload-time = "2025-11-30T20:23:07.825Z" }, + { url = "https://files.pythonhosted.org/packages/14/14/f59b0127409a33c6ef6f5c1ebd5ad8e32d7861c9c7adfa9a624fc3889f6c/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47f9a91efc418b54fb8190a6b4aa7813a23fb79c51f4bb84e418f5476c38b8db", size = 392812, upload-time = "2025-11-30T20:23:09.228Z" }, + { url = "https://files.pythonhosted.org/packages/b3/66/e0be3e162ac299b3a22527e8913767d869e6cc75c46bd844aa43fb81ab62/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1f3587eb9b17f3789ad50824084fa6f81921bbf9a795826570bda82cb3ed91f2", size = 517841, upload-time = "2025-11-30T20:23:11.186Z" }, + { url = "https://files.pythonhosted.org/packages/3d/55/fa3b9cf31d0c963ecf1ba777f7cf4b2a2c976795ac430d24a1f43d25a6ba/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:39c02563fc592411c2c61d26b6c5fe1e51eaa44a75aa2c8735ca88b0d9599daa", size = 408149, upload-time = "2025-11-30T20:23:12.864Z" }, + { url = "https://files.pythonhosted.org/packages/60/ca/780cf3b1a32b18c0f05c441958d3758f02544f1d613abf9488cd78876378/rpds_py-0.30.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a1234d8febafdfd33a42d97da7a43f5dcb120c1060e352a3fbc0c6d36e2083", size = 383843, upload-time = "2025-11-30T20:23:14.638Z" }, + { url = "https://files.pythonhosted.org/packages/82/86/d5f2e04f2aa6247c613da0c1dd87fcd08fa17107e858193566048a1e2f0a/rpds_py-0.30.0-cp313-cp313t-manylinux_2_31_riscv64.whl", 
hash = "sha256:eb2c4071ab598733724c08221091e8d80e89064cd472819285a9ab0f24bcedb9", size = 396507, upload-time = "2025-11-30T20:23:16.105Z" }, + { url = "https://files.pythonhosted.org/packages/4b/9a/453255d2f769fe44e07ea9785c8347edaf867f7026872e76c1ad9f7bed92/rpds_py-0.30.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6bdfdb946967d816e6adf9a3d8201bfad269c67efe6cefd7093ef959683c8de0", size = 414949, upload-time = "2025-11-30T20:23:17.539Z" }, + { url = "https://files.pythonhosted.org/packages/a3/31/622a86cdc0c45d6df0e9ccb6becdba5074735e7033c20e401a6d9d0e2ca0/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c77afbd5f5250bf27bf516c7c4a016813eb2d3e116139aed0096940c5982da94", size = 565790, upload-time = "2025-11-30T20:23:19.029Z" }, + { url = "https://files.pythonhosted.org/packages/1c/5d/15bbf0fb4a3f58a3b1c67855ec1efcc4ceaef4e86644665fff03e1b66d8d/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:61046904275472a76c8c90c9ccee9013d70a6d0f73eecefd38c1ae7c39045a08", size = 590217, upload-time = "2025-11-30T20:23:20.885Z" }, + { url = "https://files.pythonhosted.org/packages/6d/61/21b8c41f68e60c8cc3b2e25644f0e3681926020f11d06ab0b78e3c6bbff1/rpds_py-0.30.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:4c5f36a861bc4b7da6516dbdf302c55313afa09b81931e8280361a4f6c9a2d27", size = 555806, upload-time = "2025-11-30T20:23:22.488Z" }, + { url = "https://files.pythonhosted.org/packages/f9/39/7e067bb06c31de48de3eb200f9fc7c58982a4d3db44b07e73963e10d3be9/rpds_py-0.30.0-cp313-cp313t-win32.whl", hash = "sha256:3d4a69de7a3e50ffc214ae16d79d8fbb0922972da0356dcf4d0fdca2878559c6", size = 211341, upload-time = "2025-11-30T20:23:24.449Z" }, + { url = "https://files.pythonhosted.org/packages/0a/4d/222ef0b46443cf4cf46764d9c630f3fe4abaa7245be9417e56e9f52b8f65/rpds_py-0.30.0-cp313-cp313t-win_amd64.whl", hash = "sha256:f14fc5df50a716f7ece6a80b6c78bb35ea2ca47c499e422aa4463455dd96d56d", size = 225768, upload-time = 
"2025-11-30T20:23:25.908Z" }, +] + [[package]] name = "scikit-learn" version = "1.8.0" @@ -1170,15 +1296,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/83/11/00d3c3dfc25ad54e731d91449895a79e4bf2384dc3ac01809010ba88f6d5/seaborn-0.13.2-py3-none-any.whl", hash = "sha256:636f8336facf092165e27924f223d3c62ca560b1f2bb5dff7ab7fad265361987", size = 294914, upload-time = "2024-01-25T13:21:49.598Z" }, ] -[[package]] -name = "setuptools" -version = "80.10.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/95/faf61eb8363f26aa7e1d762267a8d602a1b26d4f3a1e758e92cb3cb8b054/setuptools-80.10.2.tar.gz", hash = "sha256:8b0e9d10c784bf7d262c4e5ec5d4ec94127ce206e8738f29a437945fbc219b70", size = 1200343, upload-time = "2026-01-25T22:38:17.252Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/94/b8/f1f62a5e3c0ad2ff1d189590bfa4c46b4f3b6e49cef6f26c6ee4e575394d/setuptools-80.10.2-py3-none-any.whl", hash = "sha256:95b30ddfb717250edb492926c92b5221f7ef3fbcc2b07579bcd4a27da21d0173", size = 1064234, upload-time = "2026-01-25T22:38:15.216Z" }, -] - [[package]] name = "shap" version = "0.50.0"