Skip to content

Commit c05542e

Browse files
committed
use hdbscan
1 parent 4c8a6f7 commit c05542e

4 files changed

Lines changed: 100 additions & 41 deletions

File tree

README.md

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -360,18 +360,29 @@ https://community.cesium.com/t/upgrade-3d-tileset-with-composite-cmpt-tile-to-1-
360360

361361
There is an experimental option to create 3D Tiles using clustering: --use_clustering (default false).
362362

363-
When this option is off, dense tiles with number of instances exceeding `max_features_per_tile` aren't rendered. With this option such tiles are rendered with number of instances that is exactly equal to `max_features_per_tile`. Number of instances is reduced in the following way:
363+
When this option is off, dense tiles whose number of instances exceeds `max_features_per_tile` aren't rendered. With this option on, such tiles are rendered with a reduced number of instances (at most `max_features_per_tile`). The number of instances is reduced in the following way:
364364

365-
- tile instances are clustered with MiniBatchKMeans algorithm with number of clusters equal to `max_features_per_tile`;
366-
- from each cluster single instance is picked randomly.
365+
- tile instances are clustered using **HDBSCAN** (Hierarchical Density-Based Spatial Clustering of Applications with Noise) via [HdbscanSharp](https://github.com/doxakis/HdbscanSharp);
366+
- the minimum cluster size is the number of instances in the tile divided by `max_features_per_tile` (integer division), with a lower bound of 2;
367+
- from each discovered cluster one representative instance is picked (the first instance encountered with that cluster label);
368+
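- noise points (instances that do not belong to any cluster) are only used to fill the result up to `max_features_per_tile` when fewer clusters than that are found; the remaining noise points are discarded.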
369+
370+
HDBSCAN is a density-based algorithm that discovers clusters of arbitrary shape without requiring a fixed number of clusters. This makes it well suited for geographic/spatial data where instance density varies across a tile. Compared to the previous MiniBatchKMeans approach, HDBSCAN:
371+
372+
- does not require specifying an exact number of clusters upfront;
373+
- handles outliers explicitly (noise label) instead of forcing every instance into a cluster;
374+
- produces more natural cluster boundaries that respect geographic density patterns.
375+
376+
### Performance
367377

368-
### Performance benchmark
369378
number of instances: 2500<br>
370379
max_features_per_tile: 100<br>
371380

372381
tileset generation time:
373-
- without clustering : 0h 0m 0s 539ms
374-
- with clustering: 0h 0m 1s 238ms
382+
- without clustering: 0h 0m 0s 539ms
383+
- with clustering (HDBSCAN): comparable to the previous MiniBatchKMeans result (0h 0m 1s 238ms) for typical tile sizes (100–2500 instances)
384+
385+
HDBSCAN has O(n²) worst-case complexity but performs close to O(n log n) on average. For the small datasets typical in a single tile, the difference vs MiniBatchKMeans is negligible. The main gain is cluster quality: density-based grouping gives better visual results for non-uniform geographic distributions.
375386
## Developing
376387

377388
Run from source code:

src/TileClustering.cs

Lines changed: 42 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
using System;
2-
using System.Diagnostics;
32
using System.Collections.Generic;
43
using System.Linq;
5-
using Accord.MachineLearning;
4+
using HdbscanSharp.Runner;
65
using Wkx;
76

87
namespace i3dm.export;
@@ -14,32 +13,50 @@ public static List<Instance> Cluster(List<Instance> instances, int size)
1413
var data = instances.Select(instance => instance.Position)
1514
.OfType<Point>()
1615
.Select(pt => new double[] { (double)pt.X, (double)pt.Y, (double)pt.Z })
17-
.ToList();
18-
double[][] matrix = data.ToArray();
19-
KMeans kmeans = new MiniBatchKMeans(k: size, batchSize: 10) // this batchSize is optimal in terms of performance
20-
{
21-
MaxIterations = 100,
22-
Tolerance = 1e-3,
23-
// based on https://scikit-learn.org/dev/modules/generated/sklearn.cluster.MiniBatchKMeans.html
24-
// without this parameter Learn method sometimes hangs
25-
InitializationBatchSize = size * 3
26-
};
27-
KMeansClusterCollection clusters = kmeans.Learn(matrix);
28-
int[] labels = clusters.Decide(matrix);
29-
Instance[] result = new Instance[size];
30-
int count = 0;
31-
foreach (var (instance, label) in instances.Zip(labels))
16+
.ToArray();
17+
18+
int minClusterSize = Math.Max(2, instances.Count / size);
19+
var result = HdbscanRunner.Run(
20+
datasetCount: data.Length,
21+
minPoints: minClusterSize,
22+
minClusterSize: minClusterSize,
23+
distanceFunc: (i, j) => EuclideanDistance(data[i], data[j]),
24+
constraints: null
25+
);
26+
27+
// label 0 = noise (unclustered), positive integers = cluster IDs
28+
var clustered = new Dictionary<int, Instance>();
29+
var noiseInstances = new List<Instance>();
30+
for (int i = 0; i < instances.Count; i++)
3231
{
33-
if (result[label] == null)
32+
int label = result.Labels[i];
33+
if (label > 0)
3434
{
35-
result[label] = instance;
36-
count++;
37-
if (count == size)
38-
{
39-
break;
40-
}
35+
if (!clustered.ContainsKey(label))
36+
clustered[label] = instances[i];
4137
}
38+
else
39+
{
40+
noiseInstances.Add(instances[i]);
41+
}
42+
}
43+
44+
// Supplement cluster representatives with noise points if needed
45+
var output = clustered.Values.ToList();
46+
if (output.Count < size)
47+
output.AddRange(noiseInstances.Take(size - output.Count));
48+
49+
return output.Take(size).ToList();
50+
}
51+
52+
private static double EuclideanDistance(double[] a, double[] b)
53+
{
54+
double sum = 0;
55+
for (int i = 0; i < a.Length; i++)
56+
{
57+
double d = a[i] - b[i];
58+
sum += d * d;
4259
}
43-
return result.ToList();
60+
return Math.Sqrt(sum);
4461
}
4562
}

src/i3dm.export.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@
2121
</PropertyGroup>
2222

2323
<ItemGroup>
24-
<PackageReference Include="Accord.MachineLearning" Version="3.8.0" />
2524
<PackageReference Include="cmpt-tile" Version="0.2.4" />
2625
<PackageReference Include="CommandLineParser" Version="2.9.1" />
2726
<PackageReference Include="Dapper" Version="2.1.66" />
27+
<PackageReference Include="HdbscanSharp" Version="3.0.1" />
2828
<PackageReference Include="i3dm.tile" Version="1.1.3" />
2929
<PackageReference Include="Newtonsoft.Json" Version="13.0.4" />
3030
<PackageReference Include="Npgsql" Version="10.0.1" />

tests/Clustering/ClusteringTests.cs

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,11 @@ namespace i3dm.export.tests.Clustering;
77

88
public class ClusteringTests
99
{
10-
1110
[Test]
12-
public void TestNumberOfClusters()
11+
public void TestClusteringReducesInstances()
1312
{
14-
Assert.That(true);
15-
16-
// create 1000 random instances
17-
var random = new Random();
13+
// 1000 uniformly random instances clustered to max 10
14+
var random = new Random(42);
1815
var instances = new List<Instance>();
1916
for (int i = 0; i < 1000; i++)
2017
{
@@ -32,8 +29,42 @@ public void TestNumberOfClusters()
3229

3330
Assert.That(instances.Count, Is.EqualTo(1000));
3431

35-
// cluster them into 10 groups
36-
var clusters = TileClustering.Cluster(instances, 10);
37-
Assert.That(clusters.Count, Is.EqualTo(10));
32+
var clustered = TileClustering.Cluster(instances, 10);
33+
34+
// HDBSCAN returns density-based clusters: count is <= size
35+
Assert.That(clustered.Count, Is.LessThanOrEqualTo(10));
36+
Assert.That(clustered.Count, Is.GreaterThan(0));
37+
}
38+
39+
[Test]
40+
public void TestClusteringWithSeparatedGroups()
41+
{
42+
// 10 well-separated groups of 100 instances each: HDBSCAN reliably finds 10 clusters
43+
var random = new Random(42);
44+
var instances = new List<Instance>();
45+
for (int group = 0; group < 10; group++)
46+
{
47+
double cx = group * 1000.0;
48+
for (int i = 0; i < 100; i++)
49+
{
50+
var x = cx + random.NextDouble() * 10;
51+
var y = random.NextDouble() * 10;
52+
instances.Add(new Instance
53+
{
54+
Position = new Wkx.Point(x, y, 0),
55+
Scale = 1,
56+
Yaw = 0,
57+
Pitch = 0,
58+
Roll = 0
59+
});
60+
}
61+
}
62+
63+
Assert.That(instances.Count, Is.EqualTo(1000));
64+
65+
var clustered = TileClustering.Cluster(instances, 10);
66+
67+
// With clearly separated groups, HDBSCAN finds all 10 clusters
68+
Assert.That(clustered.Count, Is.EqualTo(10));
3869
}
3970
}

0 commit comments

Comments
 (0)