-
Notifications
You must be signed in to change notification settings - Fork 456
Expand file tree
/
Copy pathkernel.cpp
More file actions
3036 lines (2939 loc) · 187 KB
/
kernel.cpp
File metadata and controls
3036 lines (2939 loc) · 187 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
// ################################################## utility functions ##################################################
)+R(float sq(const float x) {
return x*x;
}
)+R(float cb(const float x) {
return x*x*x;
}
)+R(float angle(const float3 v1, const float3 v2) {
return acos(dot(v1, v2)/(length(v1)*length(v2)));
}
)+R(float fast_rsqrt(const float x) { // slightly faster approximation
return as_float(0x5F37642F-(as_int(x)>>1));
}
)+R(float fast_asin(const float x) { // slightly faster approximation
return x*fma(0.5702f, sq(sq(sq(x))), 1.0f); // 0.5707964f = (pi-2)/2
}
)+R(float fast_acos(const float x) { // slightly faster approximation
return fma(fma(-0.5702f, sq(sq(sq(x))), -1.0f), x, 1.5712963f); // 0.5707964f = (pi-2)/2
}
)+R(void swap(float* x, float* y) {
const float t = *x;
*x = *y;
*y = t;
}
)+R(void lu_solve(float* M, float* x, float* b, const int N, const int Nsol) { // solves system of N linear equations M*x=b within dimensionality Nsol<=N
for(int i=0; i<Nsol; i++) { // decompose M in M=L*U
for(int j=i+1; j<Nsol; j++) {
M[N*j+i] /= M[N*i+i];
for(int k=i+1; k<Nsol; k++) M[N*j+k] -= M[N*j+i]*M[N*i+k];
}
}
for(int i=0; i<Nsol; i++) { // find solution of L*y=b
x[i] = b[i];
for(int k=0; k<i; k++) x[i] -= M[N*i+k]*x[k];
}
for(int i=Nsol-1; i>=0; i--) { // find solution of U*x=y
for(int k=i+1; k<Nsol; k++) x[i] -= M[N*i+k]*x[k];
x[i] /= M[N*i+i];
}
}
)+R(float trilinear(const float3 p, const float* v) { // trilinear interpolation, p: position in unit cube, v: corner values in unit cube
const float x1=p.x, y1=p.y, z1=p.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
return (x0*y0*z0)*v[0]+(x1*y0*z0)*v[1]+(x1*y0*z1)*v[2]+(x0*y0*z1)*v[3]+(x0*y1*z0)*v[4]+(x1*y1*z0)*v[5]+(x1*y1*z1)*v[6]+(x0*y1*z1)*v[7]; // perform trilinear interpolation
}
)+R(float3 trilinear3(const float3 p, const float3* v) { // trilinear interpolation, p: position in unit cube, v: corner vectors in unit cube
const float x1=p.x, y1=p.y, z1=p.z, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
return (x0*y0*z0)*v[0]+(x1*y0*z0)*v[1]+(x1*y0*z1)*v[2]+(x0*y0*z1)*v[3]+(x0*y1*z0)*v[4]+(x1*y1*z0)*v[5]+(x1*y1*z1)*v[6]+(x0*y1*z1)*v[7]; // perform trilinear interpolation
}
// ################################################## Line3D code ##################################################
// Line3D OpenCL C version (c) Dr. Moritz Lehmann
// draw_point(...) : draw 3D pixel
// draw_circle(...) : draw 3D circle
// draw_line(...) : draw 3D line
// draw_triangle(...) : draw 3D triangle
// graphics_clear() : kernel to reset bitmap and zbuffer
)+"#ifdef GRAPHICS"+R(
)+R(int color_from_floats(const float red, const float green, const float blue) {
return clamp((int)fma(255.0f, red, 0.5f), 0, 255)<<16|clamp((int)fma(255.0f, green, 0.5f), 0, 255)<<8|clamp((int)fma(255.0f, blue, 0.5f), 0, 255);
}
)+R(int color_mul(const int c, const float x) { // c*x
const int r = min((int)fma((float)((c>>16)&255), x, 0.5f), 255);
const int g = min((int)fma((float)((c>> 8)&255), x, 0.5f), 255);
const int b = min((int)fma((float)( c &255), x, 0.5f), 255);
return r<<16|g<<8|b; // values are already clamped
}
)+R(int color_average(const int c1, const int c2) { // (c1+c2)/s
const uchar4 cc1=as_uchar4(c1), cc2=as_uchar4(c2);
return as_int((uchar4)((uchar)((cc1.x+cc2.x)/2u), (uchar)((cc1.y+cc2.y)/2u), (uchar)((cc1.z+cc2.z)/2u), (uchar)0u));
}
)+R(int color_mix(const int c1, const int c2, const float w) { // w*c1+(1-w)*c2
const uchar4 cc1=as_uchar4(c1), cc2=as_uchar4(c2);
const float3 fc1=(float3)((float)cc1.x, (float)cc1.y, (float)cc1.z), fc2=(float3)((float)cc2.x, (float)cc2.y, (float)cc2.z);
const float3 fcm = fma(w, fc1, fma(1.0f-w, fc2, (float3)(0.5f, 0.5f, 0.5f)));
return as_int((uchar4)((uchar)fcm.x, (uchar)fcm.y, (uchar)fcm.z, (uchar)0u));
}
)+R(int color_mix_3(const int c0, const int c1, const int c2, const float w0, const float w1, const float w2) { // w1*c1+w2*c2+w3*c3, w0+w1+w2 = 1
const uchar4 cc0=as_uchar4(c0), cc1=as_uchar4(c1), cc2=as_uchar4(c2);
const float3 fc0=(float3)((float)cc0.x, (float)cc0.y, (float)cc0.z), fc1=(float3)((float)cc1.x, (float)cc1.y, (float)cc1.z), fc2=(float3)((float)cc2.x, (float)cc2.y, (float)cc2.z);
const float3 fcm = fma(w0, fc0, fma(w1, fc1, fma(w2, fc2, (float3)(0.5f, 0.5f, 0.5f))));
return as_int((uchar4)((uchar)fcm.x, (uchar)fcm.y, (uchar)fcm.z, (uchar)0u));
}
)+R(int hsv_to_rgb(const float h, const float s, const float v) {
const float c = v*s;
const float x = c*(1.0f-fabs(fmod(h/60.0f, 2.0f)-1.0f));
const float m = v-c;
float r=0.0f, g=0.0f, b=0.0f;
if(0.0f<=h&&h<60.0f) { r = c; g = x; }
else if(h<120.0f) { r = x; g = c; }
else if(h<180.0f) { g = c; b = x; }
else if(h<240.0f) { g = x; b = c; }
else if(h<300.0f) { r = x; b = c; }
else if(h<360.0f) { r = c; b = x; }
return color_from_floats(r+m, g+m, b+m);
}
)+R(int colorscale_rainbow(float x) { // coloring scheme (float [0, 1] -> int color)
x = clamp(6.0f*(1.0f-x), 0.0f, 6.0f);
float r=0.0f, g=0.0f, b=0.0f; // black
if(x<1.2f) { // red - yellow
r = 1.0f;
g = x*0.83333333f;
} else if(x>=1.2f&&x<2.0f) { // yellow - green
r = 2.5f-x*1.25f;
g = 1.0f;
} else if(x>=2.0f&&x<3.0f) { // green - cyan
g = 1.0f;
b = x-2.0f;
} else if(x>=3.0f&&x<4.0f) { // cyan - blue
g = 4.0f-x;
b = 1.0f;
} else if(x>=4.0f&&x<5.0f) { // blue - violet
r = x*0.4f-1.6f;
b = 3.0f-x*0.5f;
} else { // violet - black
r = 2.4f-x*0.4f;
b = 3.0f-x*0.5f;
}
return color_from_floats(r, g, b);
}
)+R(int colorscale_iron(float x) { // coloring scheme (float [0, 1] -> int color)
x = clamp(4.0f*(1.0f-x), 0.0f, 4.0f);
float r=1.0f, g=0.0f, b=0.0f;
if(x<0.66666667f) { // white - yellow
g = 1.0f;
b = 1.0f-x*1.5f;
} else if(x<2.0f) { // yellow - red
g = 1.5f-x*0.75f;
} else if(x<3.0f) { // red - violet
r = 2.0f-x*0.5f;
b = x-2.0f;
} else { // violet - black
r = 2.0f-x*0.5f;
b = 4.0f-x;
}
return color_from_floats(r, g, b);
}
)+R(int colorscale_twocolor(float x) { // coloring scheme (float [0, 1] -> int color)
return x>0.5f ? color_mix(0xFFAA00, def_background_color, clamp(2.0f*x-1.0f, 0.0f, 1.0f)) : color_mix(def_background_color, 0x0080FF, clamp(2.0f*x, 0.0f, 1.0f)); // red - gray - blue
}
)+R(int shading(const int c, const float3 p, const float3 normal, const float* camera_cache) { // calculate flat shading color of triangle
)+"#ifndef GRAPHICS_TRANSPARENCY"+R(
const float zoom = camera_cache[ 0]; // fetch camera parameters (rotation matrix, camera position, etc.)
const float dis = camera_cache[ 1];
const float3 pos = (float3)(camera_cache[ 2], camera_cache[ 3], camera_cache[ 4])-(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
const float3 Rz = (float3)(camera_cache[11], camera_cache[12], camera_cache[13]);
const float3 d = p-Rz*(dis/zoom)-pos; // distance vector between p and camera position
const float nl2 = sq(normal.x)+sq(normal.y)+sq(normal.z); // only one rsqrt instead of two
const float dl2 = sq(d.x)+sq(d.y)+sq(d.z);
return color_mul(c, max(1.5f*fabs(dot(normal, d))*rsqrt(nl2*dl2), 0.3f));
)+"#else"+R( // GRAPHICS_TRANSPARENCY
return c; // disable flat shading, just return input color
)+"#endif"+R( // GRAPHICS_TRANSPARENCY
}
)+R(bool is_off_screen(const int x, const int y, const int stereo) {
switch(stereo) {
default: return x< 0||x>=def_screen_width ||y<0||y>=def_screen_height; // entire screen
case -1: return x< 0||x>=def_screen_width/2||y<0||y>=def_screen_height; // left half
case +1: return x<def_screen_width/2||x>=def_screen_width ||y<0||y>=def_screen_height; // right half
}
}
)+R(void draw(const int x, const int y, const float z, const int color, global int* bitmap, volatile global int* zbuffer, const int stereo) {
const int index=x+y*def_screen_width, iz=(int)(z*1E3f); // use fixed-point int z-buffer and atomic_max to minimize noise in image, maximum render distance is 2.147E6f
)+"#ifndef GRAPHICS_TRANSPARENCY"+R(
if(!is_off_screen(x, y, stereo)&&iz>atomic_max(&zbuffer[index], iz)) bitmap[index] = color; // only draw if point is on screen and first in zbuffer
)+"#else"+R( // GRAPHICS_TRANSPARENCY
if(!is_off_screen(x, y, stereo)) { // transparent rendering (not quite order-independent transparency, but elegant solution for order-reversible transparency which is good enough here)
const float transparency = GRAPHICS_TRANSPARENCY;
const uchar4 cc4=as_uchar4(color), cb4=as_uchar4(def_background_color);
const float3 fc = (float3)((float)cc4.x, (float)cc4.y, (float)cc4.z); // new pixel color that is behind topmost drawn pixel color
const float3 fb = (float3)((float)cb4.x, (float)cb4.y, (float)cb4.z); // background color
const bool is_front = iz>atomic_max(&zbuffer[index], iz);
const uchar4 cp4 = as_uchar4(bitmap[index]);
const float3 fp = (float3)((float)cp4.x, (float)cp4.y, (float)cp4.z); // current pixel color
const int draw_count = (int)cp4.w; // use alpha color value to store how often the pixel has been over-drawn already
const float3 fn = fp+(1.0f-transparency)*( is_front ? fc-fp : pown(transparency, draw_count)*(fc-fb)); // black magic: either over-draw colors back-to-front, or add back colors as correction terms
bitmap[index] = as_int((uchar4)((uchar)clamp(fn.x+0.5f, 0.0f, 255.0f), (uchar)clamp(fn.y+0.5f, 0.0f, 255.0f), (uchar)clamp(fn.z+0.5f, 0.0f, 255.0f), (uchar)min(draw_count+1, 255)));
}
)+"#endif"+R( // GRAPHICS_TRANSPARENCY
}
)+R(bool convert(int* rx, int* ry, float* rz, const float3 p, const float* camera_cache, const int stereo) { // 3D -> 2D
const float zoom = camera_cache[0]; // fetch camera parameters (rotation matrix, camera position, etc.)
const float dis = camera_cache[1];
const float3 pos = (float3)(camera_cache[ 2], camera_cache[ 3], camera_cache[ 4])-(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
const float3 Rx = (float3)(camera_cache[ 5], camera_cache[ 6], camera_cache[ 7]);
const float3 Ry = (float3)(camera_cache[ 8], camera_cache[ 9], camera_cache[10]);
const float3 Rz = (float3)(camera_cache[11], camera_cache[12], camera_cache[13]);
const float eye_distance = vload_half(28, (half*)camera_cache);
float3 t, r;
t = p-pos-((float)stereo*eye_distance/zoom)*(float3)(Rx.x, Rx.y, 0.0f); // transformation
r.z = dot(Rz, t); // z-position for z-buffer
const float rs = zoom*dis/(dis-r.z*zoom); // perspective (reciprocal is more efficient)
if(rs<=0.0f) return false; // point is behins camera
const float tv = ((as_int(camera_cache[14])>>30)&0x1)&&stereo!=0 ? 0.5f : 1.0f;
r.x = (dot(Rx, t)*rs+(float)stereo*eye_distance)*tv+(0.5f+(float)stereo*0.25f)*(float)def_screen_width; // x position on screen
r.y = dot(Ry, t)*rs+0.5f*(float)def_screen_height; // y position on screen
*rx = (int)(r.x+0.5f);
*ry = (int)(r.y+0.5f);
*rz = r.z;
return true;
}
)+R(void convert_circle(float3 p, const float r, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer, const int stereo) { // 3D -> 2D
int rx, ry; float rz;
if(convert(&rx, &ry, &rz, p, camera_cache, stereo)) {
const float zoom = camera_cache[0];
const float dis = camera_cache[1];
const float rs = zoom*dis/(dis-rz*zoom);
const int radius = (int)(rs*r+0.5f);
switch(stereo) {
default: if(rx< -radius||rx>=(int)def_screen_width +radius || ry<-radius||ry>=(int)def_screen_height+radius) return; break; // cancel drawing if circle is off screen
case -1: if(rx< -radius||rx>=(int)def_screen_width/2+radius || ry<-radius||ry>=(int)def_screen_height+radius) return; break;
case +1: if(rx<(int)def_screen_width/2-radius||rx>=(int)def_screen_width +radius || ry<-radius||ry>=(int)def_screen_height+radius) return; break;
}
int d=-radius, x=radius, y=0; // Bresenham algorithm for circle
while(x>=y) {
draw(rx+x, ry+y, rz, color, bitmap, zbuffer, stereo);
draw(rx-x, ry+y, rz, color, bitmap, zbuffer, stereo);
draw(rx+x, ry-y, rz, color, bitmap, zbuffer, stereo);
draw(rx-x, ry-y, rz, color, bitmap, zbuffer, stereo);
draw(rx+y, ry+x, rz, color, bitmap, zbuffer, stereo);
draw(rx-y, ry+x, rz, color, bitmap, zbuffer, stereo);
draw(rx+y, ry-x, rz, color, bitmap, zbuffer, stereo);
draw(rx-y, ry-x, rz, color, bitmap, zbuffer, stereo);
d += 2*y+1;
y++;
if(d>0) d-=2*(--x);
}
}
}
)+R(void convert_line(const float3 p0, const float3 p1, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer, const int stereo) { // 3D -> 2D
int r0x, r0y, r1x, r1y; float r0z, r1z;
if(convert(&r0x, &r0y, &r0z, p0, camera_cache, stereo) && convert(&r1x, &r1y, &r1z, p1, camera_cache, stereo)
&& !(is_off_screen(r0x, r0y, stereo) && is_off_screen(r1x, r1y, stereo))) { // cancel drawing if both points are off screen
int x=r0x, y=r0y; // Bresenham algorithm
const float z = 0.5f*(r0z+r1z); // approximate line z position for each pixel to be equal
const int dx= abs(r1x-r0x), sx=2*(r0x<r1x)-1;
const int dy=-abs(r1y-r0y), sy=2*(r0y<r1y)-1;
int err = dx+dy;
while(x!=r1x||y!=r1y) {
draw(x, y, z, color, bitmap, zbuffer, stereo);
const int e2 = 2*err;
if(e2>dy) { err+=dy; x+=sx; }
if(e2<dx) { err+=dx; y+=sy; }
}
}
}
)+R(void convert_triangle(float3 p0, float3 p1, float3 p2, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer, const int stereo) { // 3D -> 2D
int r0x, r0y, r1x, r1y, r2x, r2y; float r0z, r1z, r2z;
if(convert(&r0x, &r0y, &r0z, p0, camera_cache, stereo) && convert(&r1x, &r1y, &r1z, p1, camera_cache, stereo) && convert(&r2x, &r2y, &r2z, p2, camera_cache, stereo)
&& !(is_off_screen(r0x, r0y, stereo) && is_off_screen(r1x, r1y, stereo) && is_off_screen(r2x, r2y, stereo))) { // cancel drawing if all points are off screen
if(r0x*(r1y-r2y)+r1x*(r2y-r0y)+r2x*(r0y-r1y)>40000 || (r0y==r1y&&r0y==r2y)) return; // return for large triangle area or degenerate triangles
//if(r1x*r0y+r2x*r1y+r0x*r2y>=r0x*r1y+r1x*r2y+r2x*r0y) return; // clockwise backface culling
if(r0y>r1y) { const int xt = r0x; const int yt = r0y; r0x = r1x; r0y = r1y; r1x = xt; r1y = yt; } // sort vertices ascending by y
if(r0y>r2y) { const int xt = r0x; const int yt = r0y; r0x = r2x; r0y = r2y; r2x = xt; r2y = yt; }
if(r1y>r2y) { const int xt = r1x; const int yt = r1y; r1x = r2x; r1y = r2y; r2x = xt; r2y = yt; }
const float z = (r0z+r1z+r2z)/3.0f; // approximate triangle z position for each pixel to be equal
for(int y=r0y; y<r1y; y++) { // Bresenham algorithm (lower triangle half)
const int xA = r0x+(r2x-r0x)*(y-r0y)/(r2y-r0y);
const int xB = r0x+(r1x-r0x)*(y-r0y)/(r1y-r0y);
for(int x=min(xA, xB); x<max(xA, xB); x++) {
draw(x, y, z, color, bitmap, zbuffer, stereo);
}
}
for(int y=r1y; y<r2y; y++) { // Bresenham algorithm (upper triangle half)
const int xA = r0x+(r2x-r0x)*(y-r0y)/(r2y-r0y);
const int xB = r1x+(r2x-r1x)*(y-r1y)/(r2y-r1y);
for(int x=min(xA, xB); x<max(xA, xB); x++) {
draw(x, y, z, color, bitmap, zbuffer, stereo);
}
}
}
}
)+R(void convert_triangle_interpolated(float3 p0, float3 p1, float3 p2, int c0, int c1, int c2, const float* camera_cache, global int* bitmap, global int* zbuffer, const int stereo) { // 3D -> 2D
int r0x, r0y, r1x, r1y, r2x, r2y; float r0z, r1z, r2z;
if(convert(&r0x, &r0y, &r0z, p0, camera_cache, stereo) && convert(&r1x, &r1y, &r1z, p1, camera_cache, stereo) && convert(&r2x, &r2y, &r2z, p2, camera_cache, stereo)
&& !(is_off_screen(r0x, r0y, stereo) && is_off_screen(r1x, r1y, stereo) && is_off_screen(r2x, r2y, stereo))) { // cancel drawing if all points are off screen
if(r0x*(r1y-r2y)+r1x*(r2y-r0y)+r2x*(r0y-r1y)>40000 || (r0y==r1y&&r0y==r2y)) return; // return for large triangle area or degenerate triangles
//if(r1x*r0y+r2x*r1y+r0x*r2y>=r0x*r1y+r1x*r2y+r2x*r0y) return; // clockwise backface culling
if(r0y>r1y) { const int xt = r0x; const int yt = r0y; r0x = r1x; r0y = r1y; r1x = xt; r1y = yt; const int ct = c0; c0 = c1; c1 = ct; } // sort vertices ascending by y
if(r0y>r2y) { const int xt = r0x; const int yt = r0y; r0x = r2x; r0y = r2y; r2x = xt; r2y = yt; const int ct = c0; c0 = c2; c2 = ct; }
if(r1y>r2y) { const int xt = r1x; const int yt = r1y; r1x = r2x; r1y = r2y; r2x = xt; r2y = yt; const int ct = c1; c1 = c2; c2 = ct; }
const float z = (r0z+r1z+r2z)/3.0f; // approximate triangle z position for each pixel to be equal
const float d = (float)((r1y-r2y)*(r0x-r2x)+(r2x-r1x)*(r0y-r2y));
for(int y=r0y; y<r1y; y++) { // Bresenham algorithm (lower triangle half)
const int xA = r0x+(r2x-r0x)*(y-r0y)/(r2y-r0y);
const int xB = r0x+(r1x-r0x)*(y-r0y)/(r1y-r0y);
for(int x=min(xA, xB); x<max(xA, xB); x++) {
const float w0 = (float)((r1y-r2y)*(x-r2x)+(r2x-r1x)*(y-r2y))/d; // barycentric coordinates
const float w1 = (float)((r2y-r0y)*(x-r2x)+(r0x-r2x)*(y-r2y))/d;
const float w2 = 1.0f-w0-w1;
const int color = color_mix_3(c0, c1, c2, w0, w1, w2); // interpolate color
draw(x, y, z, color, bitmap, zbuffer, stereo);
}
}
for(int y=r1y; y<r2y; y++) { // Bresenham algorithm (upper triangle half)
const int xA = r0x+(r2x-r0x)*(y-r0y)/(r2y-r0y);
const int xB = r1x+(r2x-r1x)*(y-r1y)/(r2y-r1y);
for(int x=min(xA, xB); x<max(xA, xB); x++) {
const float w0 = (float)((r1y-r2y)*(x-r2x)+(r2x-r1x)*(y-r2y))/d; // barycentric coordinates
const float w1 = (float)((r2y-r0y)*(x-r2x)+(r0x-r2x)*(y-r2y))/d;
const float w2 = 1.0f-w0-w1;
const int color = color_mix_3(c0, c1, c2, w0, w1, w2); // interpolate color
draw(x, y, z, color, bitmap, zbuffer, stereo);
}
}
}
}
)+R(void draw_point(const float3 p, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer) { // 3D -> 2D
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
int rx, ry; float rz;
if(!vr) {
if(convert(&rx, &ry, &rz, p, camera_cache, 0)) draw(rx, ry, rz, color, bitmap, zbuffer, 0);
} else {
if(convert(&rx, &ry, &rz, p, camera_cache, -1)) draw(rx, ry, rz, color, bitmap, zbuffer, -1); // left eye
if(convert(&rx, &ry, &rz, p, camera_cache, +1)) draw(rx, ry, rz, color, bitmap, zbuffer, +1); // right eye
}
}
)+R(void draw_circle(const float3 p, const float r, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer) { // 3D -> 2D
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
if(!vr) {
convert_circle(p, r, color, camera_cache, bitmap, zbuffer, 0);
} else {
convert_circle(p, r, color, camera_cache, bitmap, zbuffer, -1); // left eye
convert_circle(p, r, color, camera_cache, bitmap, zbuffer, +1); // right eye
}
}
)+R(void draw_line(const float3 p0, const float3 p1, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer) { // 3D -> 2D
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
if(!vr) {
convert_line(p0, p1, color, camera_cache, bitmap, zbuffer, 0);
} else {
convert_line(p0, p1, color, camera_cache, bitmap, zbuffer, -1); // left eye
convert_line(p0, p1, color, camera_cache, bitmap, zbuffer, +1); // right eye
}
}
)+R(void draw_triangle(const float3 p0, const float3 p1, const float3 p2, const int color, const float* camera_cache, global int* bitmap, global int* zbuffer) { // 3D -> 2D
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
if(!vr) {
convert_triangle(p0, p1, p2, color, camera_cache, bitmap, zbuffer, 0);
} else {
convert_triangle(p0, p1, p2, color, camera_cache, bitmap, zbuffer, -1); // left eye
convert_triangle(p0, p1, p2, color, camera_cache, bitmap, zbuffer, +1); // right eye
}
}
)+R(__attribute__((always_inline)) void draw_triangle_interpolated(const float3 p0, const float3 p1, const float3 p2, const int c0, const int c1, const int c2, const float* camera_cache, global int* bitmap, global int* zbuffer) { // 3D -> 2D
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
if(!vr) {
convert_triangle_interpolated(p0, p1, p2, c0, c1, c2, camera_cache, bitmap, zbuffer, 0);
} else {
convert_triangle_interpolated(p0, p1, p2, c0, c1, c2, camera_cache, bitmap, zbuffer, -1); // left eye
convert_triangle_interpolated(p0, p1, p2, c0, c1, c2, camera_cache, bitmap, zbuffer, +1); // right eye
}
}
)+R(kernel void graphics_clear(global int* bitmap, global int* zbuffer) {
const uint n = get_global_id(0);
bitmap[n] = def_background_color; // black background = 0x000000, use 0xFFFFFF for white background
zbuffer[n] = -2147483648;
}
)+R(constant uchar triangle_table_data[1920] = { // source: Paul Bourke, http://paulbourke.net/geometry/polygonise/, termination value 15, bit packed
255,255,255,255,255,255,255, 15, 56,255,255,255,255,255,255, 16,249,255,255,255,255,255, 31, 56,137,241,255,255,255,255, 33,250,255,255,255,255,255, 15, 56, 33,250,255,255,255,255, 41, 10,146,
255,255,255,255, 47, 56,162,168,137,255,255,255,179,242,255,255,255,255,255, 15, 43,184,240,255,255,255,255,145, 32,179,255,255,255,255, 31, 43,145,155,184,255,255,255,163,177, 58,255,255,255,
255, 15, 26,128,138,171,255,255,255,147, 48,155,171,249,255,255,159,168,138,251,255,255,255,255,116,248,255,255,255,255,255, 79, 3, 55,244,255,255,255,255, 16,137,116,255,255,255,255, 79,145,
116,113, 19,255,255,255, 33,138,116,255,255,255,255, 63,116, 3, 20,162,255,255,255, 41,154, 32, 72,247,255,255, 47,154,146, 39, 55,151,244,255, 72, 55, 43,255,255,255,255,191,116, 43, 36, 64,
255,255,255, 9,129,116, 50,251,255,255, 79,183, 73,155, 43, 41,241,255,163, 49,171,135,244,255,255, 31,171, 65, 27, 64,183,244,255,116,152,176,185,186, 48,255, 79,183,180,153,171,255,255,255,
89,244,255,255,255,255,255,159, 69,128,243,255,255,255,255, 80, 20, 5,255,255,255,255,143, 69, 56, 53, 81,255,255,255, 33,154, 69,255,255,255,255, 63,128, 33, 74, 89,255,255,255, 37, 90, 36,
4,242,255,255, 47, 90, 35, 53, 69, 67,248,255, 89, 36,179,255,255,255,255, 15, 43,128, 75, 89,255,255,255, 80, 4, 81, 50,251,255,255, 47, 81, 82, 40,184,132,245,255, 58,171, 49, 89,244,255,
255, 79, 89,128,129, 26,184,250,255, 69, 80,176,181,186, 48,255, 95,132,133,170,184,255,255,255,121, 88,151,255,255,255,255,159, 3, 89, 83, 55,255,255,255,112, 8,113, 81,247,255,255, 31, 53,
83,247,255,255,255,255,121,152,117, 26,242,255,255,175, 33, 89, 80, 3,117,243,255, 8,130, 82, 88,167, 37,255, 47, 90, 82, 51,117,255,255,255,151,117,152,179,242,255,255,159,117,121,146, 2,
114,251,255, 50, 11,129,113, 24,117,255,191, 18, 27,119, 81,255,255,255, 89,136,117, 26,163,179,255, 95, 7, 5,121, 11, 1,186, 10,171,176, 48, 90,128,112,117,176, 90,183,245,255,255,255,255,
106,245,255,255,255,255,255, 15, 56,165,246,255,255,255,255, 9, 81,106,255,255,255,255, 31, 56,145, 88,106,255,255,255, 97, 37, 22,255,255,255,255, 31, 86, 33, 54,128,255,255,255,105,149, 96,
32,246,255,255, 95,137,133, 82, 98, 35,248,255, 50,171, 86,255,255,255,255,191,128, 43,160, 86,255,255,255, 16, 41,179,165,246,255,255, 95,106,145,146, 43,137,251,255, 54,107, 53, 21,243,255,
255, 15,184,176, 5, 21,181,246,255,179, 6, 99, 96, 5,149,255,111,149,150,187,137,255,255,255,165, 70,135,255,255,255,255, 79, 3,116, 99,165,255,255,255,145, 80,106, 72,247,255,255,175, 86,
145, 23, 55,151,244,255, 22, 98, 21,116,248,255,255, 31, 82, 37, 54, 64, 67,247,255, 72,151, 80, 96, 5, 98,255,127,147,151, 52,146,149, 38,150,179,114, 72,106,245,255,255, 95,106,116, 66, 2,
114,251,255, 16, 73,135, 50, 91,106,255,159, 18,185,146,180,183, 84,106, 72, 55, 91, 83, 81,107,255, 95,177,181, 22,176,183, 4,180, 80, 9, 86, 48,182, 54, 72,103,149,150, 75,151,183,249,255,
74,105,164,255,255,255,255, 79,106,148, 10, 56,255,255,255, 10,161, 6, 70,240,255,255,143, 19, 24,134, 70, 22,250,255, 65, 25, 66, 98,244,255,255, 63,128, 33, 41,148, 98,244,255, 32, 68, 98,
255,255,255,255,143, 35, 40, 68, 98,255,255,255, 74,169, 70, 43,243,255,255, 15, 40,130, 75,169,164,246,255,179, 2, 97, 96,100,161,255,111, 20, 22, 74, 24, 18,139, 27,105,148, 99, 25,179, 54,
255,143, 27, 24,176, 22, 25,100, 20,179, 54, 6, 96,244,255,255,111,132,107,248,255,255,255,255,167,118,168,152,250,255,255, 15, 55,160, 7,169,118,250,255,106, 23,122,113, 24, 8,255,175,118,
122, 17, 55,255,255,255, 33, 22,134,129,137,118,255, 47,150,146, 97,151,144,115,147,135,112, 96, 6,242,255,255,127, 35,118,242,255,255,255,255, 50,171,134,138,137,118,255, 47,112,114, 11,121,
118,154,122,129, 16,135,161,103,167, 50,187, 18, 27,167, 22,118,241,255,152,134,118, 25,182, 54, 49, 6, 25,107,247,255,255,255,255,135,112, 96,179,176, 6,255,127,107,255,255,255,255,255,255,
103,251,255,255,255,255,255, 63,128,123,246,255,255,255,255, 16,185,103,255,255,255,255,143,145, 56,177,103,255,255,255, 26, 98,123,255,255,255,255, 31,162, 3,104,123,255,255,255,146, 32,154,
182,247,255,255,111,123,162,163, 56,154,248,255, 39, 99,114,255,255,255,255,127,128,103, 96, 2,255,255,255,114, 38,115, 16,249,255,255, 31, 38,129, 22,137,120,246,255,122,166,113, 49,247,255,
255,175,103,113, 26,120, 1,248,255, 48, 7,167,160,105,122,255,127,166,167,136,154,255,255,255,134,180,104,255,255,255,255, 63,182, 3, 6,100,255,255,255,104,139,100, 9,241,255,255,159,100,
105,147, 19, 59,246,255,134,100,139,162,241,255,255, 31,162, 3, 11,182, 64,246,255,180, 72,182, 32, 41,154,255,175, 57, 58,146, 52, 59, 70, 54, 40,131, 36,100,242,255,255, 15, 36,100,242,255,
255,255,255,145, 32, 67, 66, 70,131,255, 31, 73, 65, 34,100,255,255,255, 24,131, 22, 72,102, 26,255,175, 1, 10,102, 64,255,255,255,100, 67,131,166, 3,147,154,163, 73,166,244,255,255,255,255,
148,117,182,255,255,255,255, 15, 56,148,181,103,255,255,255, 5, 81, 4,103,251,255,255,191,103, 56, 52, 69, 19,245,255, 89,164, 33,103,251,255,255,111,123, 33, 10, 56,148,245,255,103, 91,164,
36, 74, 32,255, 63,132, 83, 52, 82, 90,178,103, 39,115, 38, 69,249,255,255,159, 69,128, 6, 38,134,247,255, 99, 50,103, 81, 80, 4,255,111,130,134, 39,129,132, 21,133, 89,164, 97,113, 22,115,
255, 31,166,113, 22,112,120,144, 69, 4, 74, 90, 48,106,122,115,122,166,167, 88,164,132,250,255,150,101,155,139,249,255,255, 63,182, 96, 3,101,144,245,255,176, 8,181, 16, 85,182,255,111, 59,
54, 85, 19,255,255,255, 33,154,181,185,184,101,255, 15, 59, 96, 11,105,101, 25,162,139,181,101, 8,165, 37, 32,101, 59, 54, 37, 58, 90,243,255,133, 89,130,101, 50, 40,255,159,101,105, 0, 38,
255,255,255, 81, 24, 8,101, 56, 40, 38, 24,101, 18,246,255,255,255,255, 49, 22,166,131, 86,150,152,166, 1, 10,150, 5,101,240,255, 48, 88,166,255,255,255,255,175,101,255,255,255,255,255,255,
91,122,181,255,255,255,255,191,165,123,133, 3,255,255,255,181, 87,186,145,240,255,255,175, 87,186,151, 24, 56,241,255, 27,178, 23, 87,241,255,255, 15, 56, 33, 23, 87, 39,251,255,121,149,114,
9, 34,123,255,127, 37, 39, 91, 41, 35,152, 40, 82, 42, 83,115,245,255,255,143, 2, 88,130, 87, 42,245,255, 9, 81, 58, 53, 55, 42,255,159, 40, 41,129, 39, 42,117, 37, 49, 53, 87,255,255,255,
255, 15,120,112, 17, 87,255,255,255, 9,147, 83, 53,247,255,255,159,120,149,247,255,255,255,255,133, 84,138,186,248,255,255, 95, 64,181, 80,186, 59,240,255, 16,137,164,168,171, 84,255,175, 75,
74,181, 67, 73, 49, 65, 82, 33, 88,178, 72,133,255, 15,180,176, 67,181,178, 81,177, 32, 5,149,178, 69,133,139,149, 84,178,243,255,255,255,255, 82, 58, 37, 67, 53, 72,255, 95, 42, 37, 68, 2,
255,255,255,163, 50,165,131, 69,133, 16, 89, 42, 37, 20, 41, 73,242,255, 72,133, 53, 83,241,255,255, 15, 84, 1,245,255,255,255,255, 72,133, 53, 9, 5, 83,255,159, 84,255,255,255,255,255,255,
180, 71,185,169,251,255,255, 15, 56,148,151,123,169,251,255,161, 27, 75, 65,112,180,255, 63, 65, 67, 24, 74, 71,171, 75,180,151, 75, 41,155, 33,255,159, 71,185,151,177,178, 1, 56,123,180, 36,
66,240,255,255,191, 71, 75,130, 67, 35,244,255,146, 42,151, 50,119,148,255,159,122,121,164,114,120, 32,112,115, 58, 42, 71, 26, 10, 4, 26, 42,120,244,255,255,255,255,148, 65,113, 23,243,255,
255, 79, 25, 20, 7, 24,120,241,255, 4,115, 52,255,255,255,255, 79,120,255,255,255,255,255,255,169,168,139,255,255,255,255, 63,144,147,187,169,255,255,255, 16, 10,138,168,251,255,255, 63,161,
59,250,255,255,255,255, 33, 27,155,185,248,255,255, 63,144,147, 27,146,178,249,255, 32,139,176,255,255,255,255, 63,178,255,255,255,255,255,255, 50, 40,168,138,249,255,255,159, 42,144,242,255,
255,255,255, 50, 40,168, 16, 24,138,255, 31, 42,255,255,255,255,255,255, 49,152,129,255,255,255,255, 15, 25,255,255,255,255,255,255, 48,248,255,255,255,255,255,255,255,255,255,255,255,255,255
};
)+R(uchar triangle_table(const uint i) {
return (triangle_table_data[i/2u]>>(4u*(i%2u)))&0xF;
}
)+R(float interpolate(const float v1, const float v2, const float iso) { // linearly interpolate position where isosurface cuts an edge between 2 vertices at 0 and 1
return (iso-v1)/(v2-v1);
}
)+R(uint marching_cubes(const float* v, const float iso, float3* triangles) { // input: 8 values v, isovalue; output: returns number of triangles, 15 triangle vertices t
uint cube = 0u; // determine index of which vertices are inside of the isosurface
for(uint i=0u; i<8u; i++) cube |= (v[i]<iso)<<i;
if(cube==0u||cube==255u) return 0u; // cube is entirely inside/outside of the isosurface
float3 vertex[12]; // find the vertices where the surface intersects the cube
vertex[ 0] = (float3)(interpolate(v[0], v[1], iso), 0.0f, 0.0f); // interpolate vertices on all 12 edges
vertex[ 1] = (float3)(1.0f, 0.0f, interpolate(v[1], v[2], iso)); // do interpolation only in 1D to reduce operations
vertex[ 2] = (float3)(interpolate(v[3], v[2], iso), 0.0f, 1.0f);
vertex[ 3] = (float3)(0.0f, 0.0f, interpolate(v[0], v[3], iso));
vertex[ 4] = (float3)(interpolate(v[4], v[5], iso), 1.0f, 0.0f);
vertex[ 5] = (float3)(1.0f, 1.0f, interpolate(v[5], v[6], iso));
vertex[ 6] = (float3)(interpolate(v[7], v[6], iso), 1.0f, 1.0f);
vertex[ 7] = (float3)(0.0f, 1.0f, interpolate(v[4], v[7], iso));
vertex[ 8] = (float3)(0.0f, interpolate(v[0], v[4], iso), 0.0f);
vertex[ 9] = (float3)(1.0f, interpolate(v[1], v[5], iso), 0.0f);
vertex[10] = (float3)(1.0f, interpolate(v[2], v[6], iso), 1.0f);
vertex[11] = (float3)(0.0f, interpolate(v[3], v[7], iso), 1.0f);
cube *= 15u;
uint i; // number of triangle vertices
for(i=0u; i<15u&&triangle_table(cube+i)!=15u; i+=3u) { // create the triangles
triangles[i ] = vertex[triangle_table(cube+i )];
triangles[i+1u] = vertex[triangle_table(cube+i+1u)];
triangles[i+2u] = vertex[triangle_table(cube+i+2u)];
}
return i/3u; // return number of triangles
}
)+R(uint marching_cubes_halfway(const bool* v, float3* triangles) { // input: 8 bool values v; output: returns number of triangles, 15 triangle vertices t
uint cube = 0u; // determine index of which vertices are inside of the isosurface
for(uint i=0u; i<8u; i++) cube |= (uint)(!v[i])<<i;
if(cube==0u||cube==255u) return 0u; // cube is entirely inside/outside of the isosurface
float3 vertex[12]; // find the vertices where the surface intersects the cube
vertex[ 0] = (float3)(0.5f, 0.0f, 0.0f); // vertices on all 12 edges
vertex[ 1] = (float3)(1.0f, 0.0f, 0.5f);
vertex[ 2] = (float3)(0.5f, 0.0f, 1.0f);
vertex[ 3] = (float3)(0.0f, 0.0f, 0.5f);
vertex[ 4] = (float3)(0.5f, 1.0f, 0.0f);
vertex[ 5] = (float3)(1.0f, 1.0f, 0.5f);
vertex[ 6] = (float3)(0.5f, 1.0f, 1.0f);
vertex[ 7] = (float3)(0.0f, 1.0f, 0.5f);
vertex[ 8] = (float3)(0.0f, 0.5f, 0.0f);
vertex[ 9] = (float3)(1.0f, 0.5f, 0.0f);
vertex[10] = (float3)(1.0f, 0.5f, 1.0f);
vertex[11] = (float3)(0.0f, 0.5f, 1.0f);
cube *= 15u;
uint i; // number of triangle vertices
for(i=0u; i<15u&&triangle_table(cube+i)!=15u; i+=3u) { // create the triangles
triangles[i ] = vertex[triangle_table(cube+i )];
triangles[i+1u] = vertex[triangle_table(cube+i+1u)];
triangles[i+2u] = vertex[triangle_table(cube+i+2u)];
}
return i/3u; // return number of triangles
}
)+R(typedef struct __attribute__((packed)) struct_ray {
float3 origin;
float3 direction;
} ray;
)+R(float intersect_sphere(const ray r, const float3 center, const float radius) {
const float3 oc = center-r.origin;
const float b=dot(oc, r.direction), c=sq(b)-dot(oc, oc)+sq(radius);
return c<0.0f ? -1.0f : b-sqrt(c);
}
)+R(float intersect_sphere_inside(const ray r, const float3 center, const float radius) {
const float3 oc = center-r.origin;
const float b=dot(oc, r.direction), c=sq(b)-dot(oc, oc)+sq(radius);
return c<0.0f ? -1.0f : b+sqrt(c);
}
)+R(float intersect_triangle(const ray r, const float3 p0, const float3 p1, const float3 p2) { // Moeller-Trumbore algorithm
const float3 u=p1-p0, v=p2-p0, w=r.origin-p0, h=cross(r.direction, v), q=cross(w, u);
const float g=dot(u, h), f=1.0f/g, s=f*dot(w, h), t=f*dot(r.direction, q);
return (g<=0.0f||s<-0.0001f||s>1.0001f||t<-0.0001f||s+t>1.0001f) ? -1.0f : f*dot(v, q); // add tolerance values to avoid graphical artifacts with axis-aligned camera
}
)+R(float intersect_triangle_bidirectional(const ray r, const float3 p0, const float3 p1, const float3 p2) { // Moeller-Trumbore algorithm
const float3 u=p1-p0, v=p2-p0, w=r.origin-p0, h=cross(r.direction, v), q=cross(w, u);
const float g=dot(u, h), f=1.0f/g, s=f*dot(w, h), t=f*dot(r.direction, q);
return (g==0.0f||s<-0.0001f||s>1.0001f||t<-0.0001f||s+t>1.0001f) ? -1.0f : f*dot(v, q); // add tolerance values to avoid graphical artifacts with axis-aligned camera
}
)+R(float intersect_rhombus(const ray r, const float3 p0, const float3 p1, const float3 p2) { // Moeller-Trumbore algorithm
const float3 u=p1-p0, v=p2-p0, w=r.origin-p0, h=cross(r.direction, v), q=cross(w, u);
const float g=dot(u, h), f=1.0f/g, s=f*dot(w, h), t=f*dot(r.direction, q);
return (g<=0.0f||s<-0.0001f||s>1.0001f||t<-0.0001f||t>1.0001f) ? -1.0f : f*dot(v, q); // add tolerance values to avoid graphical artifacts with axis-aligned camera
}
)+R(float intersect_plane(const ray r, const float3 p0, const float3 p1, const float3 p2) { // ray-triangle intersection, but skip barycentric coordinates
const float3 u=p1-p0, v=p2-p0, w=r.origin-p0, h=cross(r.direction, v);
const float g = dot(u, h);
return g<=0.0f ? -1.0f : dot(v, cross(w, u))/g;
}
)+R(float intersect_plane_bidirectional(const ray r, const float3 p0, const float3 p1, const float3 p2) { // ray-triangle intersection, but skip barycentric coordinates
const float3 u=p1-p0, v=p2-p0, w=r.origin-p0, h=cross(r.direction, v);
const float g = dot(u, h);
return g==0.0f ? -1.0f : dot(v, cross(w, u))/g;
}
)+R(bool intersect_cuboid_bool(const ray r, const float3 center, const float Lx, const float Ly, const float Lz) {
const float3 bmin = center-0.5f*(float3)(Lx, Ly, Lz);
const float3 bmax = center+0.5f*(float3)(Lx, Ly, Lz);
const float txa = (bmin.x-r.origin.x)/r.direction.x;
const float txb = (bmax.x-r.origin.x)/r.direction.x;
const float txmin = fmin(txa, txb);
const float txmax = fmax(txa, txb);
const float tya = (bmin.y-r.origin.y)/r.direction.y;
const float tyb = (bmax.y-r.origin.y)/r.direction.y;
const float tymin = fmin(tya, tyb);
const float tymax = fmax(tya, tyb);
if(txmin>tymax||tymin>txmax) return false;
const float tza = (bmin.z-r.origin.z)/r.direction.z;
const float tzb = (bmax.z-r.origin.z)/r.direction.z;
const float tzmin = fmin(tza, tzb);
const float tzmax = fmax(tza, tzb);
return fmax(txmin, tymin)<=tzmax&&tzmin<=fmin(txmax, tymax);
}
)+R(float intersect_cuboid(const ray r, const float3 center, const float Lx, const float Ly, const float Lz) {
const float3 bmin = center-0.5f*(float3)(Lx, Ly, Lz);
const float3 bmax = center+0.5f*(float3)(Lx, Ly, Lz);
if(r.origin.x>=bmin.x&&r.origin.y>=bmin.y&&r.origin.z>=bmin.z&&r.origin.x<=bmax.x&&r.origin.y<=bmax.y&&r.origin.z<=bmax.z) return 0.0f; // ray origin is within cuboid
float3 p[8]; // 8 cuboid vertices
p[0] = (float3)(bmin.x, bmin.y, bmin.z); // ---
p[1] = (float3)(bmax.x, bmin.y, bmin.z); // +--
p[2] = (float3)(bmax.x, bmin.y, bmax.z); // +-+
p[3] = (float3)(bmin.x, bmin.y, bmax.z); // --+
p[4] = (float3)(bmin.x, bmax.y, bmin.z); // -+-
p[5] = (float3)(bmax.x, bmax.y, bmin.z); // ++-
p[6] = (float3)(bmax.x, bmax.y, bmax.z); // +++
p[7] = (float3)(bmin.x, bmax.y, bmax.z); // -++
float intersect = -1.0f; // test for intersections with the 6 cuboid faces, ray will intersect with either 0 or 1 rhombuses
intersect = fmax(intersect, intersect_rhombus(r, p[0], p[3], p[4])); // +00 (normal vectors)
intersect = fmax(intersect, intersect_rhombus(r, p[2], p[1], p[6])); // -00
intersect = fmax(intersect, intersect_rhombus(r, p[1], p[2], p[0])); // 0+0
intersect = fmax(intersect, intersect_rhombus(r, p[7], p[6], p[4])); // 0-0
intersect = fmax(intersect, intersect_rhombus(r, p[1], p[0], p[5])); // 00+
intersect = fmax(intersect, intersect_rhombus(r, p[3], p[2], p[7])); // 00-
return intersect;
}
)+R(float intersect_cuboid_inside_with_normal(const ray r, const float3 center, const float Lx, const float Ly, const float Lz, float3* normal) {
const float3 bmin = center-0.5f*(float3)(Lx, Ly, Lz);
const float3 bmax = center+0.5f*(float3)(Lx, Ly, Lz);
float3 p[8]; // 8 cuboid vertices
p[0] = (float3)(bmin.x, bmin.y, bmin.z); // ---
p[1] = (float3)(bmax.x, bmin.y, bmin.z); // +--
p[2] = (float3)(bmax.x, bmin.y, bmax.z); // +-+
p[3] = (float3)(bmin.x, bmin.y, bmax.z); // --+
p[4] = (float3)(bmin.x, bmax.y, bmin.z); // -+-
p[5] = (float3)(bmax.x, bmax.y, bmin.z); // ++-
p[6] = (float3)(bmax.x, bmax.y, bmax.z); // +++
p[7] = (float3)(bmin.x, bmax.y, bmax.z); // -++
float intersect = -1.0f; // test for intersections with the 6 cuboid faces
float rhombus_intersect[6];
rhombus_intersect[0] = intersect_rhombus(r, p[2], p[6], p[1]); // +00 (normal vectors, points 2 and 3 are switched here to flip rhombus around)
rhombus_intersect[1] = intersect_rhombus(r, p[0], p[4], p[3]); // -00
rhombus_intersect[2] = intersect_rhombus(r, p[7], p[4], p[6]); // 0+0
rhombus_intersect[3] = intersect_rhombus(r, p[1], p[0], p[2]); // 0-0
rhombus_intersect[4] = intersect_rhombus(r, p[3], p[7], p[2]); // 00+
rhombus_intersect[5] = intersect_rhombus(r, p[1], p[5], p[0]); // 00-
uint side = 0u; // test for intersections with the 6 cuboid faces
for(uint i=0u; i<6u; i++) {
if(rhombus_intersect[i]>intersect) { // test for intersections with the 6 cuboid faces
intersect = rhombus_intersect[i]; // ray will intersect with either 0 or 1 rhombuses
side = i;
}
}
*normal = (float3)(side==0u ? 1.0f : side==1u ? -1.0f : 0.0f, side==2u ? 1.0f : side==3u ? -1.0f : 0.0f, side==4u ? 1.0f : side==5u ? -1.0f : 0.0f);
return intersect;
}
)+R(float3 reflect(const float3 direction, const float3 normal) {
return direction-2.0f*dot(direction, normal)*normal;
}
)+R(float3 refract(const float3 direction, const float3 normal, const float n) {
const float direction_normal = dot(direction, normal);
const float sqrt_part = sq(n)-1.0f+sq(direction_normal);
return sqrt_part>=0.0f ? (direction-(direction_normal+sqrt(sqrt_part))*normal)/n : direction-2.0f*direction_normal*normal; // refraction : total internal reflection
}
)+R(ray get_camray(const int x, const int y, const float* camera_cache) {
const float zoom = camera_cache[0]; // fetch camera parameters (rotation matrix, camera position, etc.)
const float dis = camera_cache[1];
const float3 pos = (float3)(camera_cache[ 2], camera_cache[ 3], camera_cache[ 4])-(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
const float3 Rx = (float3)(camera_cache[ 5], camera_cache[ 6], camera_cache[ 7]);
const float3 Ry = (float3)(camera_cache[ 8], camera_cache[ 9], camera_cache[10]);
const float3 Rz = (float3)(camera_cache[11], camera_cache[12], camera_cache[13]);
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
const float rtv = (as_int(camera_cache[14])>>30)&0x1 ? 2.0f : 1.0f;
const float eye_distance = vload_half(28, (half*)camera_cache);
const float stereo = (x<(int)def_screen_width/2 ? -1.0f : 1.0f);
float3 p0 = (float3)(!vr ? 0.0f : stereo*eye_distance/zoom, 0.0f, dis/zoom);
float3 p1 = p0+normalize((float3)(!vr ? (float)(x-(int)def_screen_width/2) : ((float)(x-(int)def_screen_width/2)-stereo*(float)(def_screen_width/4u))*rtv-stereo*eye_distance, (float)(y-(int)def_screen_height/2), -dis));
p0 = Rx*p0.x+Ry*p0.y+Rz*p0.z+pos; // reverse rotation and reverse transformation of p0
p1 = Rx*p1.x+Ry*p1.y+Rz*p1.z+pos; // reverse rotation and reverse transformation of p1
ray camray;
camray.origin = p0;
camray.direction = p1-p0;
return camray;
}
)+R(int skybox_bottom(const ray r, const int c1, const int c2, const int skybox_color) {
const float3 p0=(float3)(0.0f, 0.0f, -0.5f*(float)def_Nz), p1=(float3)(1.0f, 0.0f, -0.5f*(float)def_Nz), p2=(float3)(0.0f, 1.0f, -0.5f*(float)def_Nz);
const float distance = intersect_plane(r, p0, p1, p2);
if(distance>0.0f) { // ray intersects with bottom
const float3 normal = normalize(cross(p1-p0, p2-p0));
float3 intersection = r.origin+distance*r.direction;
const float scale = 2.0f/fmin((float)def_Nx, (float)def_Ny);
int a = abs((int)floor(scale*intersection.x));
int b = abs((int)floor(scale*intersection.y));
const float r = scale*sqrt(sq(intersection.x)+sq(intersection.y));
const int w = (a%2==b%2);
return color_mix(w*c1+(1-w)*c2, color_mix(c1, c2, 0.5f), clamp(10.0f/r, 0.0f, 1.0f));
} else {
return skybox_color;
}
}
)+R(int skybox_color_bw(const float x, const float y) {
return color_mul(0xFFFFFF, 1.0f-y);
}
)+R(int skybox_color_hsv(const float x, const float y) {
const float h = fmod(x*360.0f+120.0f, 360.0f);
const float s = y>0.5f ? 1.0f : 2.0f*y;
const float v = y>0.5f ? 2.0f-2.0f*y : 1.0f;
return hsv_to_rgb(h, s, v);
}
)+R(int skybox_color_sunset(const float x, const float y) {
return color_mix(255<<16|175<<8|55, y<0.5f ? 55<<16|111<<8|255 : 0, 2.0f*(0.5f-fabs(y-0.5f)));
}
)+R(int skybox_color_grid(const float x, const float y, const int c1, const int c2) {
int a = (int)(72.0f*x);
int b = (int)(36.0f*y);
const int w = (a%2==b%2);
return w*c1+(1-w)*c2;
}
)+R(int skybox_color(const ray r, const global int* skybox) {
const float3 direction = normalize(r.direction); // to avoid artifacts from asin(direction.z)
//const float x = fma(atan2(direction.x, direction.y), 0.5f/3.1415927f, 0.5f);
//const float y = fma(asin (direction.z ), -1.0f/3.1415927f, 0.5f);
//return skybox_color_bw(x, y);
//return color_mix(skybox_color_hsv(x, y), skybox_color_grid(x, y, 0xFFFFFF, 0x000000), 0.95f-0.33f*(2.0f*(0.5f-fabs(y-0.5f))));
//return skybox_bottom(r, 0xFFFFFF, 0xF0F0F0, skybox_color_grid(x, y, 0xFFFFFF, 0xF0F0F0));
const float fu = (float)def_skybox_width *fma(atan2(direction.x, direction.y), 0.5f/3.1415927f, 0.5f);
const float fv = (float)def_skybox_height*fma(asin (direction.z ), -1.0f/3.1415927f, 0.5f);
const int ua=clamp((int)fu, 0, (int)def_skybox_width-1), va=clamp((int)fv, 0, (int)def_skybox_height-1), ub=(ua+1)%def_skybox_width, vb=min(va+1, (int)def_skybox_height-1); // bilinear interpolation positions
const int s00=skybox[ua+va*def_skybox_width], s01=skybox[ua+vb*def_skybox_width], s10=skybox[ub+va*def_skybox_width], s11=skybox[ub+vb*def_skybox_width];
const float u1=fu-(float)ua, v1=fv-(float)va, u0=1.0f-u1, v0=1.0f-v1; // interpolation factors
return color_mix(color_mix(s00, s01, v0), color_mix(s10, s11, v0), u0); // perform bilinear interpolation
}
)+R(int last_ray(const ray reflection, const ray transmission, const float reflectivity, const float transmissivity, const global int* skybox) {
return color_mix(skybox_color(reflection, skybox), color_mix(skybox_color(transmission, skybox), def_absorption_color, transmissivity), reflectivity);
}
)+R(float interpolate_phi(const float3 p, const global float* phi, const uint Nx, const uint Ny, const uint Nz) { // trilinear interpolation of velocity at point p
const float xa=p.x-0.5f+1.5f*(float)Nx, ya=p.y-0.5f+1.5f*(float)Ny, za=p.z-0.5f+1.5f*(float)Nz; // subtract lattice offsets
const uint xb=(uint)xa, yb=(uint)ya, zb=(uint)za; // integer casting to find bottom left corner
const float x1=xa-(float)xb, y1=ya-(float)yb, z1=za-(float)zb, x0=1.0f-x1, y0=1.0f-y1, z0=1.0f-z1; // calculate interpolation factors
float phin[8]; // phi at unit cube corner points
for(uint c=0u; c<8u; c++) { // count over eight corner points
const uint i=(c&0x04u)>>2, j=(c&0x02u)>>1, k=c&0x01u; // disassemble c into corner indices ijk
const uint x=(xb+i)%Nx, y=(yb+j)%Ny, z=(zb+k)%Nz; // calculate corner lattice positions
const uxx n = (uxx)x+(uxx)(y+z*Ny)*(uxx)Nx; // calculate lattice linear index
phin[c] = phi[n]; // load velocity from lattice point
}
return (x0*y0*z0)*phin[0]+(x0*y0*z1)*phin[1]+(x0*y1*z0)*phin[2]+(x0*y1*z1)*phin[3]+(x1*y0*z0)*phin[4]+(x1*y0*z1)*phin[5]+(x1*y1*z0)*phin[6]+(x1*y1*z1)*phin[7]; // perform trilinear interpolation
}
)+R(float ray_grid_traverse(const ray r, const global float* phi, const global uchar* flags, float3* normal, const uint Nx, const uint Ny, const uint Nz) {
const float3 p = (float3)(r.origin.x-0.5f+0.5f*(float)Nx, r.origin.y-0.5f+0.5f*(float)Ny, r.origin.z-0.5f+0.5f*(float)Nz); // start point
const int dx=(int)sign(r.direction.x), dy=(int)sign(r.direction.y), dz=(int)sign(r.direction.z); // fast ray-grid-traversal
int3 xyz = (int3)((int)floor(p.x), (int)floor(p.y), (int)floor(p.z));
const float fxa=p.x-floor(p.x), fya=p.y-floor(p.y), fza=p.z-floor(p.z);
const float tdx = 1.0f/fmax(fabs(r.direction.x), 1E-6f);
const float tdy = 1.0f/fmax(fabs(r.direction.y), 1E-6f);
const float tdz = 1.0f/fmax(fabs(r.direction.z), 1E-6f);
float tmx = tdx*(dx>0 ? 1.0f-fxa : dx<0 ? fxa : 0.0f);
float tmy = tdy*(dy>0 ? 1.0f-fya : dy<0 ? fya : 0.0f);
float tmz = tdz*(dz>0 ? 1.0f-fza : dz<0 ? fza : 0.0f);
for(uint tc=0u; tc<Nx+Ny+Nz; tc++) { // limit number of traversed cells to space diagonal
if(tmx<tmy) { if(tmx<tmz) { xyz.x += dx; tmx += tdx; } else { xyz.z += dz; tmz += tdz; } }
else /****/ { if(tmy<tmz) { xyz.y += dy; tmy += tdy; } else { xyz.z += dz; tmz += tdz; } }
if(xyz.x<-1 || xyz.y<-1 || xyz.z<-1 || xyz.x>=(int)Nx || xyz.y>=(int)Ny || xyz.z>=(int)Nz) break; // out of simulation box
else if(xyz.x<0 || xyz.y<0 || xyz.z<0 || xyz.x>=(int)Nx-1 || xyz.y>=(int)Ny-1 || xyz.z>=(int)Nz-1) continue;
const uxx x0 = (uxx) xyz.x; // cube stencil
const uxx xp = (uxx) ( xyz.x+1);
const uxx y0 = (uxx)( (uint)xyz.y *Nx);
const uxx yp = (uxx)(((uint)xyz.y+1)*Nx);
const uxx z0 = (uxx) xyz.z *(uxx)(Ny*Nx);
const uxx zp = (uxx) ( xyz.z+1)*(uxx)(Ny*Nx);
uxx j[8];
j[0] = x0+y0+z0; // 000
j[1] = xp+y0+z0; // +00
j[2] = xp+y0+zp; // +0+
j[3] = x0+y0+zp; // 00+
j[4] = x0+yp+z0; // 0+0
j[5] = xp+yp+z0; // ++0
j[6] = xp+yp+zp; // +++
j[7] = x0+yp+zp; // 0++
uchar flags_cell = 0u; // check with cheap flags if the isosurface goes through the current marching-cubes cell (~15% performance boost)
for(uint i=0u; i<8u; i++) flags_cell |= flags[j[i]];
if(!(flags_cell&(TYPE_S|TYPE_E|TYPE_I))) continue; // cell is entirely inside/outside of the isosurface
float v[8];
for(uint i=0u; i<8u; i++) v[i] = phi[j[i]];
float3 triangles[15]; // maximum of 5 triangles with 3 vertices each
const uint tn = marching_cubes(v, 0.5f, triangles); // run marching cubes algorithm
if(tn==0u) continue; // if returned tn value is non-zero, iterate through triangles
const float3 offset = (float3)((float)xyz.x+0.5f-0.5f*(float)Nx, (float)xyz.y+0.5f-0.5f*(float)Ny, (float)xyz.z+0.5f-0.5f*(float)Nz);
for(uint i=0u; i<tn; i++) {
const float3 p0 = triangles[3u*i ]+offset;
const float3 p1 = triangles[3u*i+1u]+offset;
const float3 p2 = triangles[3u*i+2u]+offset;
const float intersect = intersect_triangle_bidirectional(r, p0, p1, p2); // for each triangle, check ray-triangle intersection
if(intersect>0.0f) { // intersection found (there can only be exactly 1 intersection)
const uxx xq = (uxx) (((uint)xyz.x +2u)%Nx); // central difference stencil on each cube corner point
const uxx xm = (uxx) (((uint)xyz.x+Nx-1u)%Nx);
const uxx yq = (uxx)((((uint)xyz.y +2u)%Ny)*Nx);
const uxx ym = (uxx)((((uint)xyz.y+Ny-1u)%Ny)*Nx);
const uxx zq = (uxx) (((uint)xyz.z +2u)%Nz)*(uxx)(Ny*Nx);
const uxx zm = (uxx) (((uint)xyz.z+Nz-1u)%Nz)*(uxx)(Ny*Nx);
float3 n[8];
n[0] = (float3)(phi[xm+y0+z0]-v[1], phi[x0+ym+z0]-v[4], phi[x0+y0+zm]-v[3]); // central difference stencil on each cube corner point
n[1] = (float3)(v[0]-phi[xq+y0+z0], phi[xp+ym+z0]-v[5], phi[xp+y0+zm]-v[2]); // compute normal vectors from gradient
n[2] = (float3)(v[3]-phi[xq+y0+zp], phi[xp+ym+zp]-v[6], v[1]-phi[xp+y0+zq]); // normalize later after trilinear interpolation
n[3] = (float3)(phi[xm+y0+zp]-v[2], phi[x0+ym+zp]-v[7], v[0]-phi[x0+y0+zq]);
n[4] = (float3)(phi[xm+yp+z0]-v[5], v[0]-phi[x0+yq+z0], phi[x0+yp+zm]-v[7]);
n[5] = (float3)(v[4]-phi[xq+yp+z0], v[1]-phi[xp+yq+z0], phi[xp+yp+zm]-v[6]);
n[6] = (float3)(v[7]-phi[xq+yp+zp], v[2]-phi[xp+yq+zp], v[5]-phi[xp+yp+zq]);
n[7] = (float3)(phi[xm+yp+zp]-v[6], v[3]-phi[x0+yq+zp], v[4]-phi[x0+yp+zq]);
const float3 p = r.origin+intersect*r.direction-offset; // intersection point minus offset
*normal = normalize(trilinear3(p-floor(p), n)); // perform trilinear interpolation and normalization
return intersect; // intersection found, exit loop
}
}
}
const float intersect = intersect_cuboid_inside_with_normal(r, (float3)(0.0f, 0.0f, 0.0f), (float)Nx-1.0f, (float)Ny-1.0f, (float)Nz-1.0f, normal); // -1 because marching-cubes surface ends between cells
return intersect>0.0f ? (interpolate_phi(r.origin+intersect*r.direction, phi, Nx, Ny, Nz)>0.5f ? intersect : -1.0f) : -1.0f; // interpolate phi at ray intersection point with simulation box, to check if ray is inside fluid
}
)+R(bool raytrace_phi_mirror(const ray ray_in, ray* ray_reflect, const global float* phi, const global uchar* flags, const global int* skybox, const uint Nx, const uint Ny, const uint Nz) { // only reflection
float3 normal;
float d = ray_grid_traverse(ray_in, phi, flags, &normal, Nx, Ny, Nz); // move ray through lattice, at each cell call marching_cubes
if(d==-1.0f) return false; // no intersection found
ray_reflect->origin = ray_in.origin+(d-0.005f)*ray_in.direction; // start intersection points a bit in front triangle to avoid self-reflection
ray_reflect->direction = reflect(ray_in.direction, normal);
return true;
}
)+R(bool raytrace_phi(const ray ray_in, ray* ray_reflect, ray* ray_transmit, float* reflectivity, float* transmissivity, const global float* phi, const global uchar* flags, const global int* skybox, const uint Nx, const uint Ny, const uint Nz) {
float3 normal;
float d = ray_grid_traverse(ray_in, phi, flags, &normal, Nx, Ny, Nz); // move ray through lattice, at each cell call marching_cubes
if(d==-1.0f) return false; // no intersection found
const float ray_in_normal = dot(ray_in.direction, normal);
const bool is_inside = ray_in_normal>0.0f; // camera is in fluid
ray_reflect->origin = ray_in.origin+(d-0.005f)*ray_in.direction; // start intersection points a bit in front triangle to avoid self-reflection
ray_reflect->direction = reflect(ray_in.direction, normal); // compute reflection ray
ray ray_internal; // compute internal ray and transmission ray
ray_internal.origin = ray_in.origin+(d+0.005f)*ray_in.direction; // start intersection points a bit behind triangle to avoid self-transmission
ray_internal.direction = refract(ray_in.direction, normal, def_n);
const float wr = clamp(sq(cb(2.0f*acospi(fabs(ray_in_normal)))), 0.0f, 1.0f); // increase reflectivity if ray intersects surface at shallow angle
if(is_inside) { // swap ray_reflect and ray_internal
const float3 ray_internal_origin = ray_internal.origin;
ray_internal.origin = ray_reflect->origin;
ray_internal.direction = ray_reflect->direction;
ray_reflect->origin = ray_internal_origin; // re-use internal ray origin
ray_reflect->direction = refract(ray_in.direction, -normal, 1.0f/def_n); // compute refraction again: refract out of fluid
if(sq(1.0f/def_n)-1.0f+sq(ray_in_normal)>=0.0f) { // refraction through Snell's window
ray_transmit->origin = ray_reflect->origin; // reflection ray and transmission ray are the same
ray_transmit->direction = ray_reflect->direction;
*reflectivity = 0.0f;
*transmissivity = exp(def_attenuation*d); // Beer-Lambert law
return true;
}
}
float d_internal = d;
d = ray_grid_traverse(ray_internal, phi, flags, &normal, Nx, Ny, Nz); // 2nd ray-grid traversal call: refraction (camera outside) or total internal reflection (camera inside)
ray_transmit->origin = d!=-1.0f ? ray_internal.origin+(d+0.005f)*ray_internal.direction : ray_internal.origin; // start intersection points a bit behind triangle to avoid self-transmission
ray_transmit->direction = d!=-1.0f||is_inside ? refract(ray_internal.direction, -normal, 1.0f/def_n) : ray_internal.direction; // internal ray intersects isosurface : internal ray does not intersect again
*reflectivity = is_inside ? 0.0f : wr; // is_inside means camera is inside fluid, so this is a total internal reflection down here
*transmissivity = d!=-1.0f||is_inside ? exp(def_attenuation*((float)is_inside*d_internal+d)) : (float)(def_attenuation==0.0f); // Beer-Lambert law
return true;
}
)+R(bool is_above_plane(const float3 point, const float3 plane_p, const float3 plane_n) {
return dot(point-plane_p, plane_n)>=0.0f;
}
)+R(bool is_below_plane(const float3 point, const float3 plane_p, const float3 plane_n) {
return dot(point-plane_p, plane_n)<=0.0f;
}
)+R(bool is_in_camera_frustrum(const float3 p, const float* camera_cache) { // returns true if point is located in camera frustrum
const float zoom = camera_cache[0]; // fetch camera parameters (rotation matrix, camera position, etc.)
const float dis = camera_cache[1];
const float3 pos = (float3)(camera_cache[ 2], camera_cache[ 3], camera_cache[ 4])-(float3)(def_domain_offset_x, def_domain_offset_y, def_domain_offset_z);
const float3 Rx = (float3)(camera_cache[ 5], camera_cache[ 6], camera_cache[ 7]);
const float3 Ry = (float3)(camera_cache[ 8], camera_cache[ 9], camera_cache[10]);
const float3 Rz = (float3)(camera_cache[11], camera_cache[12], camera_cache[13]);
const bool vr = (as_int(camera_cache[14])>>31)&0x1;
const float rtv = (as_int(camera_cache[14])>>30)&0x1 ? 2.0f : 1.0f;
const float3 p0 = (float3)(0.0f, 0.0f, dis/zoom);
const float3 camera_center = Rx*p0.x+Ry*p0.y+Rz*p0.z+pos; // reverse rotation and reverse transformation of p0
const float x_left = !vr ? (float)(-(int)def_screen_width/2 ) : ((float)(-(int)def_screen_width/2 )+(float)(def_screen_width/4u))*rtv;
const float x_right = !vr ? (float)( (int)def_screen_width/2-1) : ((float)( (int)def_screen_width/2-1)-(float)(def_screen_width/4u))*rtv;
const float y_top = (float)(-(int)def_screen_height/2 );
const float y_bottom = (float)((int)def_screen_height/2-1);
const float dis_clamped = fmin(dis, 1E4f); // avoid flickering at very small field of view
float3 r00 = p0+normalize((float3)(x_left , y_top , -dis_clamped)); // get 4 edge vectors of frustrum, get_camray(...) inlined and redundant parts eliminated
float3 r01 = p0+normalize((float3)(x_right, y_top , -dis_clamped));
float3 r10 = p0+normalize((float3)(x_left , y_bottom, -dis_clamped));
float3 r11 = p0+normalize((float3)(x_right, y_bottom, -dis_clamped));
r00 = Rx*r00.x+Ry*r00.y+Rz*r00.z+pos-camera_center; // reverse rotation and reverse transformation of r00
r01 = Rx*r01.x+Ry*r01.y+Rz*r01.z+pos-camera_center; // reverse rotation and reverse transformation of r01
r10 = Rx*r10.x+Ry*r10.y+Rz*r10.z+pos-camera_center; // reverse rotation and reverse transformation of r10
r11 = Rx*r11.x+Ry*r11.y+Rz*r11.z+pos-camera_center; // reverse rotation and reverse transformation of r11
const float3 plane_n_top = cross(r00, r01); // get 4 frustrum planes
const float3 plane_n_bottom = cross(r11, r10);
const float3 plane_n_left = cross(r10, r00);
const float3 plane_n_right = cross(r01, r11);
const float3 plane_p_top = camera_center-2.0f*plane_n_top; // move frustrum planes outward by 2 units
const float3 plane_p_bottom = camera_center-2.0f*plane_n_bottom;
const float3 plane_p_left = camera_center-(2.0f+8.0f*(float)vr)*plane_n_left; // move frustrum planes outward by 2 units, for stereoscopic rendering a bit more
const float3 plane_p_right = camera_center-(2.0f+8.0f*(float)vr)*plane_n_right;
return is_above_plane(p, plane_p_top, plane_n_top)&&is_above_plane(p, plane_p_bottom, plane_n_bottom)&&is_above_plane(p, plane_p_left, plane_n_left)&&is_above_plane(p, plane_p_right, plane_n_right);
}
)+"#endif"+R( // GRAPHICS
// ################################################## LBM code ##################################################
)+R(uint3 coordinates(const uxx n) { // disassemble 1D index to 3D coordinates (n -> x,y,z)
const uint t = (uint)(n%(uxx)(def_Nx*def_Ny));
return (uint3)(t%def_Nx, t/def_Nx, (uint)(n/(uxx)(def_Nx*def_Ny))); // n = x+(y+z*Ny)*Nx
}
)+R(uxx index(const uint3 xyz) { // assemble 1D index from 3D coordinates (x,y,z -> n)
return (uxx)xyz.x+(uxx)(xyz.y+xyz.z*def_Ny)*(uxx)def_Nx; // n = x+(y+z*Ny)*Nx
}
)+R(float3 position(const uint3 xyz) { // 3D coordinates to 3D position
return (float3)((float)xyz.x+0.5f-0.5f*(float)def_Nx, (float)xyz.y+0.5f-0.5f*(float)def_Ny, (float)xyz.z+0.5f-0.5f*(float)def_Nz);
}
)+R(uint3 closest_coordinates(const float3 p) { // return closest lattice point to point p
return (uint3)((uint)(p.x+1.5f*(float)def_Nx)%def_Nx, (uint)(p.y+1.5f*(float)def_Ny)%def_Ny, (uint)(p.z+1.5f*(float)def_Nz)%def_Nz);
}
)+R(float3 mirror_position(const float3 p) { // mirror position into periodic boundaries
float3 r;
r.x = sign(p.x)*(fmod(fabs(p.x)+0.5f*(float)def_GNx, (float)def_GNx)-0.5f*(float)def_GNx);
r.y = sign(p.y)*(fmod(fabs(p.y)+0.5f*(float)def_GNy, (float)def_GNy)-0.5f*(float)def_GNy);
r.z = sign(p.z)*(fmod(fabs(p.z)+0.5f*(float)def_GNz, (float)def_GNz)-0.5f*(float)def_GNz);
return r;
}
)+R(float3 mirror_distance(const float3 d) { // mirror distance vector into periodic boundaries
return mirror_position(d);
}
)+R(bool is_halo(const uxx n) {
const uint3 xyz = coordinates(n);
return ((def_Dx>1u)&(xyz.x==0u||xyz.x>=def_Nx-1u))||((def_Dy>1u)&(xyz.y==0u||xyz.y>=def_Ny-1u))||((def_Dz>1u)&(xyz.z==0u||xyz.z>=def_Nz-1u));
}
)+R(bool is_halo_q(const uint3 xyz) {
return ((def_Dx>1u)&(xyz.x==0u||xyz.x>=def_Nx-2u))||((def_Dy>1u)&(xyz.y==0u||xyz.y>=def_Ny-2u))||((def_Dz>1u)&(xyz.z==0u||xyz.z>=def_Nz-2u)); // halo data is kept up-to-date, so allow using halo data for rendering
}
)+R(float half_to_float_custom(const ushort x) { // custom 16-bit floating-point format, 1-4-11, exp-15, +-1.99951168, +-6.10351562E-5, +-2.98023224E-8, 3.612 digits
const uint e = (x&0x7800)>>11; // exponent
const uint m = (x&0x07FF)<<12; // mantissa
const uint v = as_uint((float)m)>>23; // evil log2 bit hack to count leading zeros in denormalized format
return as_float((x&0x8000)<<16 | (e!=0)*((e+112)<<23|m) | ((e==0)&(m!=0))*((v-37)<<23|((m<<(150-v))&0x007FF000))); // sign : normalized : denormalized
}
)+R(ushort float_to_half_custom(const float x) { // custom 16-bit floating-point format, 1-4-11, exp-15, +-1.99951168, +-6.10351562E-5, +-2.98023224E-8, 3.612 digits
const uint b = as_uint(x)+0x00000800; // round-to-nearest-even: add last bit after truncated mantissa
const uint e = (b&0x7F800000)>>23; // exponent
const uint m = b&0x007FFFFF; // mantissa; in line below: 0x007FF800 = 0x00800000-0x00000800 = decimal indicator flag - initial rounding
return (b&0x80000000)>>16 | (e>112)*((((e-112)<<11)&0x7800)|m>>12) | ((e<113)&(e>100))*((((0x007FF800+m)>>(124-e))+1)>>1); // sign : normalized : denormalized (assume [-2,2])
}
)+R(ulong index_f(const uxx n, const uint i) { // 64-bit indexing for DDFs
return (ulong)i*def_N+(ulong)n; // SoA (>2x faster on GPUs)
}
)+R(float c(const uint i) { // avoid constant keyword by encapsulating data in function which gets inlined by compiler
const float c[3u*def_velocity_set] = {
)+"#if defined(D2Q9)"+R(
0, 1,-1, 0, 0, 1,-1, 1,-1, // x
0, 0, 0, 1,-1, 1,-1,-1, 1, // y
0, 0, 0, 0, 0, 0, 0, 0, 0 // z
)+"#elif defined(D3Q15)"+R(
0, 1,-1, 0, 0, 0, 0, 1,-1, 1,-1, 1,-1,-1, 1, // x
0, 0, 0, 1,-1, 0, 0, 1,-1, 1,-1,-1, 1, 1,-1, // y
0, 0, 0, 0, 0, 1,-1, 1,-1,-1, 1, 1,-1, 1,-1 // z
)+"#elif defined(D3Q19)"+R(
0, 1,-1, 0, 0, 0, 0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0, // x
0, 0, 0, 1,-1, 0, 0, 1,-1, 0, 0, 1,-1,-1, 1, 0, 0, 1,-1, // y
0, 0, 0, 0, 0, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0,-1, 1,-1, 1 // z
)+"#elif defined(D3Q27)"+R(
0, 1,-1, 0, 0, 0, 0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0, 1,-1, 1,-1, 1,-1,-1, 1, // x
0, 0, 0, 1,-1, 0, 0, 1,-1, 0, 0, 1,-1,-1, 1, 0, 0, 1,-1, 1,-1, 1,-1,-1, 1, 1,-1, // y
0, 0, 0, 0, 0, 1,-1, 0, 0, 1,-1, 1,-1, 0, 0,-1, 1,-1, 1, 1,-1,-1, 1, 1,-1, 1,-1 // z
)+"#endif"+R( // D3Q27
};
return c[i];
}
)+R(float w(const uint i) { // avoid constant keyword by encapsulating data in function which gets inlined by compiler
const float w[def_velocity_set] = { def_w0, // velocity set weights
)+"#if defined(D2Q9)"+R(
def_ws, def_ws, def_ws, def_ws, def_we, def_we, def_we, def_we
)+"#elif defined(D3Q15)"+R(
def_ws, def_ws, def_ws, def_ws, def_ws, def_ws,
def_wc, def_wc, def_wc, def_wc, def_wc, def_wc, def_wc, def_wc
)+"#elif defined(D3Q19)"+R(
def_ws, def_ws, def_ws, def_ws, def_ws, def_ws,
def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we
)+"#elif defined(D3Q27)"+R(
def_ws, def_ws, def_ws, def_ws, def_ws, def_ws,
def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we, def_we,
def_wc, def_wc, def_wc, def_wc, def_wc, def_wc, def_wc, def_wc
)+"#endif"+R( // D3Q27
};
return w[i];
}
)+R(void calculate_indices(const uxx n, uxx* x0, uxx* xp, uxx* xm, uxx* y0, uxx* yp, uxx* ym, uxx* z0, uxx* zp, uxx* zm) {
const uint3 xyz = coordinates(n);
*x0 = (uxx) xyz.x; // pre-calculate indices (periodic boundary conditions)
*xp = (uxx) ((xyz.x +1u)%def_Nx);
*xm = (uxx) ((xyz.x+def_Nx-1u)%def_Nx);
*y0 = (uxx)( xyz.y *def_Nx);
*yp = (uxx)(((xyz.y +1u)%def_Ny)*def_Nx);
*ym = (uxx)(((xyz.y+def_Ny-1u)%def_Ny)*def_Nx);
*z0 = (uxx) xyz.z *(uxx)(def_Ny*def_Nx);
*zp = (uxx) ((xyz.z +1u)%def_Nz)*(uxx)(def_Ny*def_Nx);
*zm = (uxx) ((xyz.z+def_Nz-1u)%def_Nz)*(uxx)(def_Ny*def_Nx);
} // calculate_indices()
)+R(void neighbors(const uxx n, uxx* j) { // calculate neighbor indices
uxx x0, xp, xm, y0, yp, ym, z0, zp, zm;
calculate_indices(n, &x0, &xp, &xm, &y0, &yp, &ym, &z0, &zp, &zm);
j[0] = n;
)+"#if defined(D2Q9)"+R(
j[ 1] = xp+y0; j[ 2] = xm+y0; // +00 -00
j[ 3] = x0+yp; j[ 4] = x0+ym; // 0+0 0-0
j[ 5] = xp+yp; j[ 6] = xm+ym; // ++0 --0
j[ 7] = xp+ym; j[ 8] = xm+yp; // +-0 -+0
)+"#elif defined(D3Q15)"+R(
j[ 1] = xp+y0+z0; j[ 2] = xm+y0+z0; // +00 -00
j[ 3] = x0+yp+z0; j[ 4] = x0+ym+z0; // 0+0 0-0
j[ 5] = x0+y0+zp; j[ 6] = x0+y0+zm; // 00+ 00-
j[ 7] = xp+yp+zp; j[ 8] = xm+ym+zm; // +++ ---
j[ 9] = xp+yp+zm; j[10] = xm+ym+zp; // ++- --+
j[11] = xp+ym+zp; j[12] = xm+yp+zm; // +-+ -+-
j[13] = xm+yp+zp; j[14] = xp+ym+zm; // -++ +--
)+"#elif defined(D3Q19)"+R(
j[ 1] = xp+y0+z0; j[ 2] = xm+y0+z0; // +00 -00
j[ 3] = x0+yp+z0; j[ 4] = x0+ym+z0; // 0+0 0-0
j[ 5] = x0+y0+zp; j[ 6] = x0+y0+zm; // 00+ 00-
j[ 7] = xp+yp+z0; j[ 8] = xm+ym+z0; // ++0 --0
j[ 9] = xp+y0+zp; j[10] = xm+y0+zm; // +0+ -0-
j[11] = x0+yp+zp; j[12] = x0+ym+zm; // 0++ 0--
j[13] = xp+ym+z0; j[14] = xm+yp+z0; // +-0 -+0
j[15] = xp+y0+zm; j[16] = xm+y0+zp; // +0- -0+
j[17] = x0+yp+zm; j[18] = x0+ym+zp; // 0+- 0-+
)+"#elif defined(D3Q27)"+R(
j[ 1] = xp+y0+z0; j[ 2] = xm+y0+z0; // +00 -00
j[ 3] = x0+yp+z0; j[ 4] = x0+ym+z0; // 0+0 0-0
j[ 5] = x0+y0+zp; j[ 6] = x0+y0+zm; // 00+ 00-
j[ 7] = xp+yp+z0; j[ 8] = xm+ym+z0; // ++0 --0
j[ 9] = xp+y0+zp; j[10] = xm+y0+zm; // +0+ -0-
j[11] = x0+yp+zp; j[12] = x0+ym+zm; // 0++ 0--
j[13] = xp+ym+z0; j[14] = xm+yp+z0; // +-0 -+0
j[15] = xp+y0+zm; j[16] = xm+y0+zp; // +0- -0+
j[17] = x0+yp+zm; j[18] = x0+ym+zp; // 0+- 0-+
j[19] = xp+yp+zp; j[20] = xm+ym+zm; // +++ ---
j[21] = xp+yp+zm; j[22] = xm+ym+zp; // ++- --+
j[23] = xp+ym+zp; j[24] = xm+yp+zm; // +-+ -+-
j[25] = xm+yp+zp; j[26] = xp+ym+zm; // -++ +--
)+"#endif"+R( // D3Q27
} // neighbors()
)+R(float3 load3(const uxx n, const global float* v) {
return (float3)(v[n], v[def_N+(ulong)n], v[2ul*def_N+(ulong)n]);
}
)+R(float3 closest_u(const float3 p, const global float* u) { // return velocity of closest lattice point to point p
return load3(index(closest_coordinates(p)), u);
}
)+R(float3 interpolate_u(const float3 p, const global float* u) { // trilinear interpolation of velocity at point p
const float xa=p.x-0.5f+1.5f*(float)def_Nx, ya=p.y-0.5f+1.5f*(float)def_Ny, za=p.z-0.5f+1.5f*(float)def_Nz; // subtract lattice offsets
const uint xb=(uint)xa, yb=(uint)ya, zb=(uint)za; // integer casting to find bottom left corner
const float3 pn = (float3)(xa-(float)xb, ya-(float)yb, za-(float)zb); // calculate interpolation factors
float3 un[8]; // velocitiy at unit cube corner points
for(uint c=0u; c<8u; c++) { // count over eight corner points
const uint i=(c&0x04u)>>2, j=(c&0x02u)>>1, k=c&0x01u; // disassemble c into corner indices ijk
const uint x=(xb+i)%def_Nx, y=(yb+j)%def_Ny, z=(zb+k)%def_Nz; // calculate corner lattice positions
const uxx n = (uxx)x+(uxx)(y+z*def_Ny)*(uxx)def_Nx; // calculate lattice linear index
un[c] = load3(n, u); // load velocity from lattice point
}
return trilinear3(pn, un); // perform trilinear interpolation
} // interpolate_u()
)+R(float calculate_Q_cached(const float3 u0, const float3 u1, const float3 u2, const float3 u3, const float3 u4, const float3 u5) { // Q-criterion
const float duxdx=u0.x-u1.x, duydx=u0.y-u1.y, duzdx=u0.z-u1.z; // du/dx = (u2-u0)/2
const float duxdy=u2.x-u3.x, duydy=u2.y-u3.y, duzdy=u2.z-u3.z;
const float duxdz=u4.x-u5.x, duydz=u4.y-u5.y, duzdz=u4.z-u5.z;
const float omega_xy=duxdy-duydx, omega_xz=duxdz-duzdx, omega_yz=duydz-duzdy; // antisymmetric tensor, omega_xx = omega_yy = omega_zz = 0
const float s_xx2=duxdx, s_yy2=duydy, s_zz2=duzdz; // s_xx2 = s_xx/2, s_yy2 = s_yy/2, s_zz2 = s_zz/2
const float s_xy=duxdy+duydx, s_xz=duxdz+duzdx, s_yz=duydz+duzdy; // symmetric tensor
const float omega2 = sq(omega_xy)+sq(omega_xz)+sq(omega_yz); // ||omega||_2^2
const float s2 = 2.0f*(sq(s_xx2)+sq(s_yy2)+sq(s_zz2))+sq(s_xy)+sq(s_xz)+sq(s_yz); // ||s||_2^2
return 0.25f*(omega2-s2); // Q = 1/2*(||omega||_2^2-||s||_2^2), addidional factor 1/2 from cental finite differences of velocity
} // calculate_Q_cached()
)+R(float calculate_Q(const uxx n, const global float* u) { // Q-criterion