use hdbscan

bertt · bertt · commit c05542e9fe92 · 2026-03-02T16:37:41.000+01:00
diff --git a/README.md b/README.md
@@ -360,18 +360,29 @@ https://community.cesium.com/t/upgrade-3d-tileset-with-composite-cmpt-tile-to-1-
 
 There is an experimental option to create 3D Tiles using clustering: --use_clustering (default false).
 
-When this option is off, dense tiles with number of instances exceeding `max_features_per_tile` aren't rendered. With this option such tiles are rendered with number of instances that is exactly equal to `max_features_per_tile`. Number of instances is reduced in the following way:
+When this option is off, dense tiles whose number of instances exceeds `max_features_per_tile` aren't rendered. With this option, such tiles are rendered with a reduced number of instances (at most `max_features_per_tile`). The number of instances is reduced in the following way:
 
-- tile instances are clustered with MiniBatchKMeans algorithm with number of clusters equal to `max_features_per_tile`;
-- from each cluster single instance is picked randomly.
+- tile instances are clustered using **HDBSCAN** (Hierarchical Density-Based Spatial Clustering of Applications with Noise) via [HdbscanSharp](https://github.com/doxakis/HdbscanSharp);
+- the minimum cluster size is the total number of instances divided by `max_features_per_tile` (integer division), with a lower bound of 2;
+- from each discovered cluster one representative instance is picked (the first instance of that cluster in input order);
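+- noise points (instances that do not belong to any cluster) are used only to top up the result when fewer than `max_features_per_tile` clusters are found; any remaining noise points are discarded.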
+
+HDBSCAN is a density-based algorithm that discovers clusters of arbitrary shape without requiring a fixed number of clusters. This makes it well suited for geographic/spatial data where instance density varies across a tile. Compared to the previous MiniBatchKMeans approach, HDBSCAN:
+
+- does not require specifying an exact number of clusters upfront;
+- handles outliers explicitly (noise label) instead of forcing every instance into a cluster;
+- produces more natural cluster boundaries that respect geographic density patterns.
+
+### Performance
 
-### Performance benchmark
 number of instances: 2500<br>
 max_features_per_tile: 100<br>
 
 tileset generation time:
-- without clustering : 0h 0m 0s 539ms
-- with clustering: 0h 0m 1s 238ms
+- without clustering: 0h 0m 0s 539ms
+- with clustering (HDBSCAN): comparable to the previous MiniBatchKMeans implementation for typical tile sizes (100–2500 instances)
+
+HDBSCAN has O(n²) worst-case complexity but performs close to O(n log n) on average. For the small datasets typical in a single tile, the difference vs MiniBatchKMeans is negligible. The main gain is cluster quality: density-based grouping gives better visual results for non-uniform geographic distributions.
 ## Developing
 
 Run from source code:
diff --git a/src/TileClustering.cs b/src/TileClustering.cs
@@ -1,8 +1,7 @@
 using System;
-using System.Diagnostics;
 using System.Collections.Generic;
 using System.Linq;
-using Accord.MachineLearning;
+using HdbscanSharp.Runner;
 using Wkx;
 
 namespace i3dm.export;
@@ -14,32 +13,50 @@ public static List<Instance> Cluster(List<Instance> instances, int size)
         var data = instances.Select(instance => instance.Position)
             .OfType<Point>()
             .Select(pt => new double[] { (double)pt.X, (double)pt.Y, (double)pt.Z })
-            .ToList();
-        double[][] matrix = data.ToArray();
-        KMeans kmeans = new MiniBatchKMeans(k: size, batchSize: 10) // this batchSize is optimal in terms of performance
-        {
-            MaxIterations = 100,
-            Tolerance = 1e-3,
-            // based on https://scikit-learn.org/dev/modules/generated/sklearn.cluster.MiniBatchKMeans.html
-            // without this parameter Learn method sometimes hangs
-            InitializationBatchSize = size * 3 
-        };
-        KMeansClusterCollection clusters = kmeans.Learn(matrix);
-        int[] labels = clusters.Decide(matrix);
-        Instance[] result = new Instance[size];
-        int count = 0;
-        foreach (var (instance, label) in instances.Zip(labels))
+            .ToArray();
+
+        int minClusterSize = Math.Max(2, instances.Count / size);
+        var result = HdbscanRunner.Run(
+            datasetCount: data.Length,
+            minPoints: minClusterSize,
+            minClusterSize: minClusterSize,
+            distanceFunc: (i, j) => EuclideanDistance(data[i], data[j]),
+            constraints: null
+        );
+
+        // label 0 = noise (unclustered), positive integers = cluster IDs
+        var clustered = new Dictionary<int, Instance>();
+        var noiseInstances = new List<Instance>();
+        for (int i = 0; i < instances.Count; i++)
         {
-            if (result[label] == null)
+            int label = result.Labels[i];
+            if (label > 0)
             {
-                result[label] = instance;
-                count++;
-                if (count == size)
-                {
-                    break;
-                }
+                if (!clustered.ContainsKey(label))
+                    clustered[label] = instances[i];
             }
+            else
+            {
+                noiseInstances.Add(instances[i]);
+            }
+        }
+
+        // Supplement cluster representatives with noise points if needed
+        var output = clustered.Values.ToList();
+        if (output.Count < size)
+            output.AddRange(noiseInstances.Take(size - output.Count));
+
+        return output.Take(size).ToList();
+    }
+
+    private static double EuclideanDistance(double[] a, double[] b)
+    {
+        double sum = 0;
+        for (int i = 0; i < a.Length; i++)
+        {
+            double d = a[i] - b[i];
+            sum += d * d;
         }
-        return result.ToList();
+        return Math.Sqrt(sum);
     }
 }
diff --git a/src/i3dm.export.csproj b/src/i3dm.export.csproj
@@ -21,10 +21,10 @@
   </PropertyGroup>
 
   <ItemGroup>
-    <PackageReference Include="Accord.MachineLearning" Version="3.8.0" />
     <PackageReference Include="cmpt-tile" Version="0.2.4" />
     <PackageReference Include="CommandLineParser" Version="2.9.1" />
     <PackageReference Include="Dapper" Version="2.1.66" />
+    <PackageReference Include="HdbscanSharp" Version="3.0.1" />
 	  <PackageReference Include="i3dm.tile" Version="1.1.3" />
     <PackageReference Include="Newtonsoft.Json" Version="13.0.4" />
     <PackageReference Include="Npgsql" Version="10.0.1" />
diff --git a/tests/Clustering/ClusteringTests.cs b/tests/Clustering/ClusteringTests.cs
@@ -7,14 +7,11 @@ namespace i3dm.export.tests.Clustering;
 
 public class ClusteringTests
 {
-
     [Test]
-    public void TestNumberOfClusters()
+    public void TestClusteringReducesInstances()
     {
-        Assert.That(true);
-
-        // create 1000 random instances
-        var random = new Random();
+        // 1000 uniformly random instances clustered to max 10
+        var random = new Random(42);
         var instances = new List<Instance>();
         for (int i = 0; i < 1000; i++)
         {
@@ -32,8 +29,42 @@ public void TestNumberOfClusters()
 
         Assert.That(instances.Count, Is.EqualTo(1000));
 
-        // cluster them into 10 groups
-        var clusters = TileClustering.Cluster(instances, 10);
-        Assert.That(clusters.Count, Is.EqualTo(10));
+        var clustered = TileClustering.Cluster(instances, 10);
+
+        // HDBSCAN returns density-based clusters: count is <= size
+        Assert.That(clustered.Count, Is.LessThanOrEqualTo(10));
+        Assert.That(clustered.Count, Is.GreaterThan(0));
+    }
+
+    [Test]
+    public void TestClusteringWithSeparatedGroups()
+    {
+        // 10 well-separated groups of 100 instances each: HDBSCAN reliably finds 10 clusters
+        var random = new Random(42);
+        var instances = new List<Instance>();
+        for (int group = 0; group < 10; group++)
+        {
+            double cx = group * 1000.0;
+            for (int i = 0; i < 100; i++)
+            {
+                var x = cx + random.NextDouble() * 10;
+                var y = random.NextDouble() * 10;
+                instances.Add(new Instance
+                {
+                    Position = new Wkx.Point(x, y, 0),
+                    Scale = 1,
+                    Yaw = 0,
+                    Pitch = 0,
+                    Roll = 0
+                });
+            }
+        }
+
+        Assert.That(instances.Count, Is.EqualTo(1000));
+
+        var clustered = TileClustering.Cluster(instances, 10);
+
+        // With clearly separated groups, HDBSCAN finds all 10 clusters
+        Assert.That(clustered.Count, Is.EqualTo(10));
     }
 }