Skip to content

Commit 8bbe4bd

Browse files
Copilot and MichaConrad committed
Round-2 optimizations from dotnet-trace profiling: ExpansionSum, ScaleExpansion, ExactInCircle
Co-authored-by: MichaCo <5837539+MichaCo@users.noreply.github.com>
1 parent 274b5c4 commit 8bbe4bd

2 files changed

Lines changed: 125 additions & 42 deletions

File tree

src/CDT.Core/Predicates/PredicatesAdaptive.cs

Lines changed: 115 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ public static class PredicatesAdaptive
6161
/// zero if collinear, or a negative value if to the right.
6262
/// </returns>
6363
/// <seealso cref="PredicatesExact.Orient2d(double, double, double, double, double, double)"/>
64-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
64+
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-15: aggressive JIT opt for fast-path Stage A
6565
[SkipLocalsInit]
6666
public static double Orient2d(
6767
double ax, double ay, double bx, double by, double cx, double cy)
@@ -195,7 +195,7 @@ public static float Orient2d(
195195
/// zero if on, or a negative value if outside.
196196
/// </returns>
197197
/// <seealso cref="PredicatesExact.InCircle(double, double, double, double, double, double, double, double)"/>
198-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
198+
[MethodImpl(MethodImplOptions.AggressiveInlining)] // opt-16: keep only AggressiveInlining (AggressiveOptimization inflated the Stage-B frame and hurt Stage-A)
199199
[SkipLocalsInit]
200200
public static double InCircle(
201201
double ax, double ay, double bx, double by,
@@ -390,6 +390,7 @@ internal static double MultTail(double a, double b, double p)
390390
/// Matches Lenthe <c>ExpansionBase::TwoTwoDiff</c>.
391391
/// </summary>
392392
[MethodImpl(MethodImplOptions.AggressiveInlining)]
393+
[SkipLocalsInit] // opt-11: x0..x3 are all unconditionally computed before conditional write
393394
internal static int TwoTwoDiff(double ax, double by, double ay, double bx, Span<double> h)
394395
{
395396
double axby1 = ax * by;
@@ -419,6 +420,8 @@ internal static int TwoTwoDiff(double ax, double by, double ay, double bx, Span<
419420
/// Matches Lenthe <c>ExpansionBase::ScaleExpansion</c>.
420421
/// Output has up to <c>2*elen</c> terms.
421422
/// </summary>
423+
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-7, opt-18
424+
[SkipLocalsInit] // opt-8: locals (hIdx, Q, hh, Ti, ti, Qi) are all written before read
422425
internal static int ScaleExpansion(Span<double> e, int elen, double b, Span<double> h)
423426
{
424427
if (elen == 0 || b == 0.0)
@@ -427,24 +430,30 @@ internal static int ScaleExpansion(Span<double> e, int elen, double b, Span<doub
427430
}
428431

429432
var (bHi, bLo) = Split(b);
430-
double Q = e[0] * b;
431-
double hh = DekkersPresplit(e[0], bHi, bLo, Q);
433+
434+
// opt-9: bounds-check-free loop via ref locals
435+
ref double eRef = ref MemoryMarshal.GetReference(e);
436+
ref double hRef = ref MemoryMarshal.GetReference(h);
437+
438+
double Q = Unsafe.Add(ref eRef, 0) * b;
439+
double hh = DekkersPresplit(Unsafe.Add(ref eRef, 0), bHi, bLo, Q);
432440
int hIdx = 0;
433-
if (hh != 0.0) { h[hIdx++] = hh; }
441+
if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; }
434442

435443
for (int i = 1; i < elen; i++)
436444
{
437-
double Ti = e[i] * b;
438-
double ti = DekkersPresplit(e[i], bHi, bLo, Ti);
445+
double ei = Unsafe.Add(ref eRef, i);
446+
double Ti = ei * b;
447+
double ti = DekkersPresplit(ei, bHi, bLo, Ti);
439448
double Qi = Q + ti;
440449
hh = PlusTail(Q, ti, Qi);
441-
if (hh != 0.0) { h[hIdx++] = hh; }
450+
if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; }
442451
Q = Ti + Qi;
443452
hh = FastPlusTail(Ti, Qi, Q);
444-
if (hh != 0.0) { h[hIdx++] = hh; }
453+
if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; }
445454
}
446455

447-
if (Q != 0.0) { h[hIdx++] = Q; }
456+
if (Q != 0.0) { Unsafe.Add(ref hRef, hIdx++) = Q; }
448457
return hIdx;
449458
}
450459

@@ -465,7 +474,7 @@ internal static double DekkersPresplit(double a, double bHi, double bLo, double
465474
/// Computes <c>e*s*s + e*t*t</c> as an expansion (two ScaleExpansion calls each, then sum).
466475
/// Max output: 32 terms for 4-term input (used for InCircle Stage B lift terms).
467476
/// </summary>
468-
[SkipLocalsInit]
477+
[SkipLocalsInit] // keep SkipLocalsInit; remove AggressiveInlining (inlining 3× into AdaptiveInCircle bloats Stage-A JIT frame)
469478
internal static int ScaleExpansionSum(Span<double> e, int elen, double s, double t, Span<double> h)
470479
{
471480
Span<double> es = stackalloc double[8];
@@ -485,52 +494,121 @@ internal static int ScaleExpansionSum(Span<double> e, int elen, double s, double
485494
/// Merge-then-accumulate two expansions. Matches Lenthe <c>ExpansionBase::ExpansionSum</c>:
486495
/// std::merge by |value| (stable), then sequential grow-expansion accumulation.
487496
/// </summary>
497+
[MethodImpl(MethodImplOptions.AggressiveOptimization)] // opt-20: aggressive JIT optimization for this hot method
488498
[SkipLocalsInit]
489499
internal static int ExpansionSum(Span<double> e, int elen, Span<double> f, int flen, Span<double> h)
490500
{
491501
if (elen == 0 && flen == 0) { return 0; }
492-
if (elen == 0) { f[..flen].CopyTo(h); return flen; }
493-
if (flen == 0) { e[..elen].CopyTo(h); return elen; }
502+
if (elen == 0)
503+
{
504+
// opt-5: Unsafe.CopyBlockUnaligned replaces Span.CopyTo (eliminates Memmove call overhead)
505+
Unsafe.CopyBlockUnaligned(
506+
ref Unsafe.As<double, byte>(ref MemoryMarshal.GetReference(h)),
507+
ref Unsafe.As<double, byte>(ref MemoryMarshal.GetReference(f)),
508+
(uint)(flen * sizeof(double)));
509+
return flen;
510+
}
511+
if (flen == 0)
512+
{
513+
// opt-5: same as above for flen==0 fast path
514+
Unsafe.CopyBlockUnaligned(
515+
ref Unsafe.As<double, byte>(ref MemoryMarshal.GetReference(h)),
516+
ref Unsafe.As<double, byte>(ref MemoryMarshal.GetReference(e)),
517+
(uint)(elen * sizeof(double)));
518+
return elen;
519+
}
494520

495521
int total = elen + flen;
496522

497-
// Merge sorted by |value| into temporary buffer.
498-
// Maximum merged size for InCircle Stage D is 192+192=384 ≤ 400, so
499-
// the stackalloc path is always taken for that call site.
500-
Span<double> merged = total <= 400 ? stackalloc double[400] : new double[total];
523+
// opt-1: Tiered stackalloc — allocate only as much as the actual input size requires.
524+
// Each tier stackallocs its own scratch buffer inside its branch and passes it straight to
525+
// ExpansionSumCore, so the Span is never assigned to a variable declared outside the branch
526+
// (which Roslyn disallows for stack-allocated Spans that might escape).
527+
if (total <= 16)
528+
{
529+
Span<double> merged16 = stackalloc double[16];
530+
return ExpansionSumCore(e, elen, f, flen, h, merged16);
531+
}
532+
if (total <= 64)
533+
{
534+
Span<double> merged64 = stackalloc double[64];
535+
return ExpansionSumCore(e, elen, f, flen, h, merged64);
536+
}
537+
if (total <= 400)
538+
{
539+
Span<double> merged400 = stackalloc double[400];
540+
return ExpansionSumCore(e, elen, f, flen, h, merged400);
541+
}
542+
return ExpansionSumCore(e, elen, f, flen, h, new double[total]);
543+
}
544+
545+
// opt-2, opt-3, opt-4: Core merge+accumulate logic — receives a pre-sized scratch buffer.
546+
// All span accesses use MemoryMarshal.GetReference + Unsafe.Add to eliminate bounds checks.
547+
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)]
548+
private static int ExpansionSumCore(
549+
Span<double> e, int elen, Span<double> f, int flen, Span<double> h, Span<double> merged)
550+
{
551+
ref double eRef = ref MemoryMarshal.GetReference(e);
552+
ref double fRef = ref MemoryMarshal.GetReference(f);
553+
ref double mRef = ref MemoryMarshal.GetReference(merged);
554+
501555
int ei = 0, fi = 0, mi = 0;
502556
while (ei < elen && fi < flen)
503557
{
504-
if (Math.Abs(f[fi]) < Math.Abs(e[ei]))
558+
double eVal = Unsafe.Add(ref eRef, ei);
559+
double fVal = Unsafe.Add(ref fRef, fi);
560+
if (Math.Abs(fVal) < Math.Abs(eVal))
505561
{
506-
merged[mi++] = f[fi++];
562+
Unsafe.Add(ref mRef, mi++) = fVal;
563+
fi++;
507564
}
508565
else
509566
{
510-
merged[mi++] = e[ei++];
567+
Unsafe.Add(ref mRef, mi++) = eVal;
568+
ei++;
511569
}
512570
}
513571

514-
while (ei < elen) { merged[mi++] = e[ei++]; }
515-
while (fi < flen) { merged[mi++] = f[fi++]; }
572+
// opt-4: tail copy loops → Unsafe.CopyBlockUnaligned
573+
if (ei < elen)
574+
{
575+
int rem = elen - ei;
576+
Unsafe.CopyBlockUnaligned(
577+
ref Unsafe.As<double, byte>(ref Unsafe.Add(ref mRef, mi)),
578+
ref Unsafe.As<double, byte>(ref Unsafe.Add(ref eRef, ei)),
579+
(uint)(rem * sizeof(double)));
580+
mi += rem;
581+
}
582+
if (fi < flen)
583+
{
584+
int rem = flen - fi;
585+
Unsafe.CopyBlockUnaligned(
586+
ref Unsafe.As<double, byte>(ref Unsafe.Add(ref mRef, mi)),
587+
ref Unsafe.As<double, byte>(ref Unsafe.Add(ref fRef, fi)),
588+
(uint)(rem * sizeof(double)));
589+
mi += rem;
590+
}
516591

517-
// Sequential accumulation
592+
// opt-3: bounds-check-free accumulation loop using ref locals
593+
ref double hRef = ref MemoryMarshal.GetReference(h);
518594
int hIdx = 0;
519-
double Q = merged[0];
520-
double Qnew = merged[1] + Q;
521-
double hh = FastPlusTail(merged[1], Q, Qnew);
595+
double Q = Unsafe.Add(ref mRef, 0);
596+
double m1 = Unsafe.Add(ref mRef, 1);
597+
double Qnew = m1 + Q;
598+
double hh = FastPlusTail(m1, Q, Qnew);
522599
Q = Qnew;
523-
if (hh != 0.0) { h[hIdx++] = hh; }
600+
if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; }
524601

525602
for (int g = 2; g < mi; g++)
526603
{
527-
Qnew = Q + merged[g];
528-
hh = PlusTail(Q, merged[g], Qnew);
604+
double mg = Unsafe.Add(ref mRef, g);
605+
Qnew = Q + mg;
606+
hh = PlusTail(Q, mg, Qnew);
529607
Q = Qnew;
530-
if (hh != 0.0) { h[hIdx++] = hh; }
608+
if (hh != 0.0) { Unsafe.Add(ref hRef, hIdx++) = hh; }
531609
}
532610

533-
if (Q != 0.0) { h[hIdx++] = Q; }
611+
if (Q != 0.0) { Unsafe.Add(ref hRef, hIdx++) = Q; }
534612
return hIdx;
535613
}
536614

@@ -545,16 +623,19 @@ internal static double Estimate(Span<double> e, int elen)
545623
for (; i <= elen - 4; i += 4)
546624
acc = Vector256.Add(acc, Vector256.LoadUnsafe(ref eRef, (nuint)i));
547625
double sum = Vector256.Sum(acc);
626+
// opt-13: bounds-check-free scalar tail using Unsafe.Add
548627
for (; i < elen; i++) sum += Unsafe.Add(ref eRef, i);
549628
return sum;
550629
}
551630

631+
// opt-13: bounds-check-free scalar loop
632+
ref double sRef = ref MemoryMarshal.GetReference(e);
552633
double s = 0.0;
553-
for (int i = 0; i < elen; i++) { s += e[i]; }
634+
for (int i = 0; i < elen; i++) { s += Unsafe.Add(ref sRef, i); }
554635
return s;
555636
}
556637

557-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
638+
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-12
558639
internal static double MostSignificant(Span<double> e, int elen)
559640
{
560641
for (int i = elen - 1; i >= 0; i--)
@@ -564,7 +645,7 @@ internal static double MostSignificant(Span<double> e, int elen)
564645
return 0.0;
565646
}
566647

567-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
648+
[MethodImpl(MethodImplOptions.AggressiveInlining | MethodImplOptions.AggressiveOptimization)] // opt-14
568649
internal static void NegateInto(Span<double> src, int len, Span<double> dst)
569650
{
570651
if (Vector256.IsHardwareAccelerated && len >= 4)

src/CDT.Core/Predicates/PredicatesExact.cs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ public static class PredicatesExact
4343
/// zero if collinear, or a negative value if to the right.
4444
/// </returns>
4545
/// <seealso cref="PredicatesAdaptive.Orient2d(double, double, double, double, double, double)"/>
46+
[MethodImpl(MethodImplOptions.AggressiveInlining)] // opt-17: inline into Adaptive.Orient2d Stage D
4647
[SkipLocalsInit]
4748
public static double Orient2d(
4849
double ax, double ay, double bx, double by, double cx, double cy)
@@ -125,6 +126,7 @@ public static float Orient2d(
125126
/// zero if on, or a negative value if outside.
126127
/// </returns>
127128
/// <seealso cref="PredicatesAdaptive.InCircle(double, double, double, double, double, double, double, double)"/>
129+
[MethodImpl(MethodImplOptions.AggressiveOptimization)] // opt-20: aggressive JIT opt for this hot method
128130
[SkipLocalsInit]
129131
public static double InCircle(
130132
double ax, double ay, double bx, double by,
@@ -176,22 +178,22 @@ public static double InCircle(
176178
int adetLen = PredicatesAdaptive.ScaleExpansionSum(bcd, bcdLen, ax, ay, adet);
177179

178180
// bdet = -(cda*bx*bx + cda*by*by)
179-
Span<double> bdetPos = stackalloc double[96];
180-
int bdetPosLen = PredicatesAdaptive.ScaleExpansionSum(cda, cdaLen, bx, by, bdetPos);
181+
// opt-18: eliminate bdetPos[96] by computing ScaleExpansionSum directly into bdet,
182+
// then negating in-place. Saves 768 bytes of stack per call.
181183
Span<double> bdet = stackalloc double[96];
182-
PredicatesAdaptive.NegateInto(bdetPos, bdetPosLen, bdet);
183-
int bdetLen = bdetPosLen;
184+
int bdetLen = PredicatesAdaptive.ScaleExpansionSum(cda, cdaLen, bx, by, bdet);
185+
PredicatesAdaptive.NegateInto(bdet, bdetLen, bdet);
184186

185187
// cdet = dab*cx*cx + dab*cy*cy
186188
Span<double> cdet = stackalloc double[96];
187189
int cdetLen = PredicatesAdaptive.ScaleExpansionSum(dab, dabLen, cx, cy, cdet);
188190

189191
// ddet = -(abc*dx*dx + abc*dy*dy)
190-
Span<double> ddetPos = stackalloc double[96];
191-
int ddetPosLen = PredicatesAdaptive.ScaleExpansionSum(abc, abcLen, dx, dy, ddetPos);
192+
// opt-19: same as opt-18 — eliminate ddetPos[96] stackalloc with in-place negate.
193+
// Saves another 768 bytes of stack per call (total savings: 1 536 bytes).
192194
Span<double> ddet = stackalloc double[96];
193-
PredicatesAdaptive.NegateInto(ddetPos, ddetPosLen, ddet);
194-
int ddetLen = ddetPosLen;
195+
int ddetLen = PredicatesAdaptive.ScaleExpansionSum(abc, abcLen, dx, dy, ddet);
196+
PredicatesAdaptive.NegateInto(ddet, ddetLen, ddet);
195197

196198
// deter = (adet + bdet) + (cdet + ddet)
197199
Span<double> ab2 = stackalloc double[192];

0 commit comments

Comments
 (0)