|
16 | 16 | */ |
17 | 17 | package org.apache.lucene.benchmark.jmh; |
18 | 18 |
|
19 | | -import java.lang.foreign.Arena; |
20 | | -import java.lang.foreign.MemorySegment; |
21 | | -import java.lang.foreign.ValueLayout; |
| 19 | +import java.lang.invoke.MethodHandle; |
| 20 | +import java.lang.invoke.MethodHandles; |
| 21 | +import java.lang.invoke.MethodType; |
22 | 22 | import java.util.concurrent.ThreadLocalRandom; |
23 | 23 | import java.util.concurrent.TimeUnit; |
24 | 24 | import org.apache.lucene.util.VectorUtil; |
@@ -52,12 +52,11 @@ static void compressBytes(byte[] raw, byte[] compressed) { |
52 | 52 | private float[] floatsB; |
53 | 53 | private int expectedhalfByteDotProduct; |
54 | 54 |
|
55 | | - private MemorySegment nativeBytesA; |
| 55 | + private Object nativeBytesA; |
| 56 | + private Object nativeBytesB; |
56 | 57 |
|
57 | | - private MemorySegment nativeBytesB; |
58 | | - |
59 | | - // @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) |
60 | | - @Param({"768"}) |
| 58 | + /** Length of the input arrays (and native byte segments) allocated in init(). */ |
| 59 | + @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) |
61 | 60 | int size; |
62 | 61 |
|
63 | 62 | @Setup(Level.Iteration) |
@@ -92,20 +91,76 @@ public void init() { |
92 | 91 | floatsA[i] = random.nextFloat(); |
93 | 92 | floatsB[i] = random.nextFloat(); |
94 | 93 | } |
95 | | - |
96 | | - Arena offHeap = Arena.ofAuto(); |
97 | | - nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); |
98 | | - nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); |
99 | | - for (int i = 0; i < size; ++i) { |
100 | | - nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); |
101 | | - nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); |
| 94 | + // Java 21+ specific initialization |
| 95 | + final int runtimeVersion = Runtime.version().feature(); |
| 96 | + if (runtimeVersion >= 21) { |
| 97 | + // Reflection based code to eliminate the use of Preview classes in JMH benchmarks |
| 98 | + try { |
| 99 | + final Class<?> vectorUtilSupportClass = VectorUtil.getVectorUtilSupportClass(); |
| 100 | + final var className = "org.apache.lucene.internal.vectorization.PanamaVectorUtilSupport"; |
| 101 | + if (vectorUtilSupportClass.getName().equals(className) == false) { |
| 102 | + nativeBytesA = null; |
| 103 | + nativeBytesB = null; |
| 104 | + } else { |
| 105 | + MethodHandles.Lookup lookup = MethodHandles.lookup(); |
| 106 | + final var MemorySegment = "java.lang.foreign.MemorySegment"; |
| 107 | + final var methodType = |
| 108 | + MethodType.methodType(lookup.findClass(MemorySegment), byte[].class); |
| 109 | + MethodHandle nativeMemorySegment = |
| 110 | + lookup.findStatic(vectorUtilSupportClass, "nativeMemorySegment", methodType); |
| 111 | + byte[] a = new byte[size]; |
| 112 | + byte[] b = new byte[size]; |
| 113 | + for (int i = 0; i < size; ++i) { |
| 114 | + a[i] = (byte) random.nextInt(128); |
| 115 | + b[i] = (byte) random.nextInt(128); |
| 116 | + } |
| 117 | + nativeBytesA = nativeMemorySegment.invoke(a); |
| 118 | + nativeBytesB = nativeMemorySegment.invoke(b); |
| 119 | + } |
| 120 | + } catch (Throwable e) { |
| 121 | + throw new RuntimeException(e); |
| 122 | + } |
| 123 | + /* |
| 124 | + Arena offHeap = Arena.ofAuto(); |
| 125 | + nativeBytesA = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); |
| 126 | + nativeBytesB = offHeap.allocate(size, ValueLayout.JAVA_BYTE.byteAlignment()); |
| 127 | + for (int i = 0; i < size; ++i) { |
| 128 | + nativeBytesA.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); |
| 129 | + nativeBytesB.set(ValueLayout.JAVA_BYTE, i, (byte) random.nextInt(128)); |
| 130 | + }*/ |
102 | 131 | } |
103 | 132 | } |
104 | 133 |
|
| 134 | + /** |
| 135 | + * High overhead (lower score) from using NATIVE_DOT_PRODUCT.invoke(nativeBytesA, nativeBytesB). |
| 136 | + * Both nativeBytesA and nativeBytesB are offHeap MemorySegments created by invoking the method |
| 137 | + * PanamaVectorUtilSupport.nativeMemorySegment(byte[]) which allocates these segments and copies |
| 138 | + * bytes from the supplied byte[] to offHeap memory. The benchmark output below shows |
| 139 | + * significantly more overhead. <b>NOTE:</b> Return type of dot8s() was set to void for the |
| 140 | + * benchmark run to avoid boxing/unboxing overhead. |
| 141 | + * |
| 142 | + * <pre> |
| 143 | + * Benchmark (size) Mode Cnt Score Error Units |
| 144 | + * VectorUtilBenchmark.dot8s 768 thrpt 15 36.406 ± 0.496 ops/us |
| 145 | + * </pre> |
| 146 | + * |
| 147 | + * Much lower overhead was observed when preview APIs were used directly in JMH benchmarking code |
| 148 | + * and exact method invocation was made as shown below <b>return (int) |
| 149 | + * VectorUtil.NATIVE_DOT_PRODUCT.invokeExact(nativeBytesA, nativeBytesB);</b> |
| 150 | + * |
| 151 | + * <pre> |
| 152 | + * Benchmark (size) Mode Cnt Score Error Units |
| 153 | + * VectorUtilBenchmark.dot8s 768 thrpt 15 43.662 ± 0.818 ops/us |
| 154 | + * </pre> |
| 155 | + */ |
105 | 156 | @Benchmark |
106 | 157 | @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) |
107 | | - public int dot8s() { |
108 | | - return VectorUtil.dot8s(nativeBytesA, nativeBytesB, size); |
| 158 | + public void dot8s() { |
| 159 | + try { |
| 160 | + VectorUtil.NATIVE_DOT_PRODUCT.invoke(nativeBytesA, nativeBytesB); |
| 161 | + } catch (Throwable e) { |
| 162 | + throw new RuntimeException(e); |
| 163 | + } |
109 | 164 | } |
110 | 165 |
|
111 | 166 | @Benchmark |
|
0 commit comments