Skip to content

Commit 93d6c72

Browse files
committed
Unify sine/cosine approximations across backends
1 parent 8ab4403 commit 93d6c72

72 files changed

Lines changed: 812 additions & 340 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ set(INTERNAL_SOURCE_FILES internal/AtmosphereRef.h
120120
internal/Time.cpp
121121
internal/TonemapRef.h
122122
internal/TonemapRef.cpp
123+
internal/Trig.h
123124
internal/UNetFilter.h
124125
internal/UNetFilter.cpp
125126
internal/VectorGPU.h)

internal/AtmosphereRef.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -959,8 +959,8 @@ void Ray::Ref::ShadeSky(const pass_settings_t &ps, const float limit, Span<const
959959

960960
const fvec4 light_dir = {l.dir.dir[0], l.dir.dir[1], l.dir.dir[2], 0.0f};
961961
fvec4 light_col = {l.col[0], l.col[1], l.col[2], 0.0f};
962-
if (l.dir.angle != 0.0f) {
963-
const float radius = tanf(l.dir.angle);
962+
if (l.dir.tan_angle != 0.0f) {
963+
const float radius = l.dir.tan_angle;
964964
light_col *= (PI * radius * radius);
965965
}
966966

internal/Core.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,8 +228,9 @@ struct light_t {
228228
float _unused[9];
229229
} tri;
230230
struct {
231-
float dir[3], angle;
232-
float _unused[8];
231+
float dir[3], cos_angle;
232+
float tan_angle, angle, _unused0[2];
233+
float _unused1[4];
233234
} dir;
234235
};
235236
};

internal/CoreRef.cpp

Lines changed: 135 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,8 @@ fvec4 map_to_cone(float r1, float r2, fvec4 N, float radius) {
704704
theta = 0.5f * PI * (1.0f - 0.5f * (offset.get<0>() / offset.get<1>()));
705705
}
706706

707-
const fvec2 uv = fvec2(radius * r * cosf(theta), radius * r * sinf(theta));
707+
const fvec2 sincos_theta = portable_sincos(theta);
708+
const fvec2 uv = fvec2(radius * r * sincos_theta.get<1>(), radius * r * sincos_theta.get<0>());
708709

709710
fvec4 LT, LB;
710711
create_tbn(normalize(N), LT, LB);
@@ -764,7 +765,7 @@ force_inline bool quadratic(float a, float b, float c, float &t0, float &t1) {
764765
}
765766

766767
force_inline float ngon_rad(const float theta, const float n) {
767-
return cosf(PI / n) / cosf(theta - (2.0f * PI / n) * floorf((n * theta + PI) / (2.0f * PI)));
768+
return portable_cos(PI / n) / portable_cos(theta - (2.0f * PI / n) * floorf((n * theta + PI) / (2.0f * PI)));
768769
}
769770

770771
float approx_atan2(const float y, const float x) { // max error is 0.000004f
@@ -1116,10 +1117,112 @@ force_inline fvec4 slerp(const fvec4 &start, const fvec4 &end, const float perce
11161117
// And multiplying that by percent returns the angle between
11171118
// start and the final result.
11181119
const float theta = acosf(cos_theta) * percent;
1119-
fvec4 relative_vec = safe_normalize(end - start * cos_theta);
1120+
const fvec4 relative_vec = safe_normalize(end - start * cos_theta);
11201121
// Orthonormal basis
11211122
// The final result.
1122-
return start * cosf(theta) + relative_vec * sinf(theta);
1123+
const fvec2 sincos_theta = portable_sincos(theta);
1124+
return start * sincos_theta.get<1>() + relative_vec * sincos_theta.get<0>();
1125+
}
1126+
1127+
//
1128+
// Polynomial approximation of cosine
1129+
// Max error : 6.987e-07
1130+
//
1131+
float portable_cos(float a) {
1132+
// Normalize angle to [0, 1] range, where 1 corresponds to 2*PI
1133+
a = fract(fabs(a) * 0.15915494309189535f);
1134+
1135+
// Select between ranges [0; 0.25), [0.25; 0.75), [0.75; 1.0]
1136+
fvec4 selector = 0.0f;
1137+
selector.set<0>(float(a < 0.25f));
1138+
selector.set<2>(float(a >= 0.75f));
1139+
selector.set<1>(-1.0f + dot(selector, fvec4{1, 0, 1, 0}));
1140+
1141+
// Center around ranges
1142+
fvec4 arg = fvec4{a} - fvec4{0.0f, 0.5f, 1.0f, 0.0f};
1143+
// Squared value gives better precision
1144+
arg *= arg;
1145+
1146+
// Evaluate 5th-degree polynome
1147+
fvec4 res = fmadd(-25.0407296503853054f, arg, 60.1524123580209817f);
1148+
res = fmsub(res, arg, 85.4539888046442542f);
1149+
res = fmadd(res, arg, 64.9393549651994562f);
1150+
res = fmsub(res, arg, 19.7392086060579359f);
1151+
res = fmadd(res, arg, 0.9999999998415476f);
1152+
1153+
// Combine contributions based on selector to get final value
1154+
return dot(res, selector);
1155+
}
1156+
1157+
//
1158+
// Polynomial approximation of sine
1159+
// Max error : 8.482e-07
1160+
//
1161+
float portable_sin(float a) {
1162+
// Normalize angle to [0, 1] range, where 1 corresponds to 2*PI
1163+
a = fract(fabs(a - 1.5707963267948966f) * 0.15915494309189535f);
1164+
1165+
// Select between ranges [0; 0.25), [0.25; 0.75), [0.75; 1.0]
1166+
fvec4 selector = 0.0f;
1167+
selector.set<0>(float(a < 0.25f));
1168+
selector.set<2>(float(a >= 0.75f));
1169+
selector.set<1>(-1.0f + dot(selector, fvec4{1, 0, 1, 0}));
1170+
1171+
// Center around ranges
1172+
fvec4 arg = fvec4{a} - fvec4{0.0f, 0.5f, 1.0f, 0.0f};
1173+
// Squared value gives better precision
1174+
arg *= arg;
1175+
1176+
// Evaluate 5th-degree polynome
1177+
fvec4 res = fmadd(-25.0407296503853054f, arg, 60.1524123580209817f);
1178+
res = fmsub(res, arg, 85.4539888046442542f);
1179+
res = fmadd(res, arg, 64.9393549651994562f);
1180+
res = fmsub(res, arg, 19.7392086060579359f);
1181+
res = fmadd(res, arg, 0.9999999998415476f);
1182+
1183+
// Combine contributions based on selector to get final value
1184+
return dot(res, selector);
1185+
}
1186+
1187+
//
1188+
// Combined approximation of sine/cosine
1189+
// Max error : 8.482e-07
1190+
//
1191+
fvec2 portable_sincos(const float a) {
1192+
// Normalize angle to [0, 1] range, where 1 corresponds to 2*PI
1193+
const float a_cos = fract(fabsf(a) * 0.15915494309189535f);
1194+
const float a_sin = fract(fabsf(a - 1.5707963267948966f) * 0.15915494309189535f);
1195+
1196+
// Select between ranges [0; 0.25), [0.25; 0.75), [0.75; 1.0]
1197+
fvec4 selector_cos = 0.0f, selector_sin = 0.0f;
1198+
selector_cos.set<0>(float(a_cos < 0.25f));
1199+
selector_cos.set<2>(float(a_cos >= 0.75f));
1200+
selector_cos.set<1>(-1.0f + dot(selector_cos, fvec4{1, 0, 1, 0}));
1201+
selector_sin.set<0>(float(a_sin < 0.25f));
1202+
selector_sin.set<2>(float(a_sin >= 0.75f));
1203+
selector_sin.set<1>(-1.0f + dot(selector_sin, fvec4{1, 0, 1, 0}));
1204+
1205+
// Center around ranges
1206+
fvec4 arg_cos = fvec4{a_cos} - fvec4{0.0f, 0.5f, 1.0f, 0.0f};
1207+
fvec4 arg_sin = fvec4{a_sin} - fvec4{0.0f, 0.5f, 1.0f, 0.0f};
1208+
// Squared value gives better precision
1209+
arg_cos *= arg_cos;
1210+
arg_sin *= arg_sin;
1211+
1212+
// Evaluate 5th-degree polynome
1213+
fvec4 res_cos = fmadd(-25.0407296503853054f, arg_cos, 60.1524123580209817f);
1214+
fvec4 res_sin = fmadd(-25.0407296503853054f, arg_sin, 60.1524123580209817f);
1215+
res_cos = fmsub(res_cos, arg_cos, 85.4539888046442542f);
1216+
res_sin = fmsub(res_sin, arg_sin, 85.4539888046442542f);
1217+
res_cos = fmadd(res_cos, arg_cos, 64.9393549651994562f);
1218+
res_sin = fmadd(res_sin, arg_sin, 64.9393549651994562f);
1219+
res_cos = fmsub(res_cos, arg_cos, 19.7392086060579359f);
1220+
res_sin = fmsub(res_sin, arg_sin, 19.7392086060579359f);
1221+
res_cos = fmadd(res_cos, arg_cos, 0.9999999998415476f);
1222+
res_sin = fmadd(res_sin, arg_sin, 0.9999999998415476f);
1223+
1224+
// Combine contributions based on selector to get final value
1225+
return fvec2{dot(res_sin, selector_sin), dot(res_cos, selector_cos)};
11231226
}
11241227

11251228
//
@@ -1224,7 +1327,8 @@ float Ray::Ref::SampleSphericalRectangle(const fvec4 &P, const fvec4 &light_pos,
12241327
if (out_p) {
12251328
// compute cu
12261329
const float au = Xi.get<0>() * area + k;
1227-
const float fu = safe_div((cosf(au) * b0 - b1), sinf(au));
1330+
const fvec2 sincos_au = portable_sincos(au);
1331+
const float fu = safe_div((sincos_au.get<1>() * b0 - b1), sincos_au.get<0>());
12281332
float cu = 1.0f / sqrtf(fu * fu + b0sq) * (fu > 0.0f ? 1.0f : -1.0f);
12291333
cu = clamp(cu, -1.0f, 1.0f);
12301334
// compute xu
@@ -1281,17 +1385,20 @@ float Ray::Ref::SampleSphericalTriangle(const fvec4 &P, const fvec4 &p1, const f
12811385
const float area_S = Xi.get<0>() * area;
12821386

12831387
// Save the sine and cosine of the angle delta
1284-
const float p = sinf(area_S - alpha);
1285-
const float q = cosf(area_S - alpha);
1388+
const fvec2 sincos_area = portable_sincos(area_S - alpha);
1389+
const float p = sincos_area.get<0>();
1390+
const float q = sincos_area.get<1>();
12861391

12871392
// Compute the pair(u; v) that determines sin(beta_s) and cos(beta_s)
1288-
const float u = q - cosf(alpha);
1289-
const float v = p + sinf(alpha) * cosf(c);
1393+
const fvec2 sincos_alpha = portable_sincos(alpha);
1394+
const float u = q - sincos_alpha.get<1>();
1395+
const float v = p + sincos_alpha.get<0>() * portable_cos(c);
12901396

12911397
// Compute the s coordinate as normalized arc length from A to C_s
1292-
const float denom = ((v * p + u * q) * sinf(alpha));
1293-
const float s = safe_div(1.0f, b) *
1294-
portable_acosf(clamp(safe_div(((v * q - u * p) * cosf(alpha) - v), denom), -1.0f, 1.0f));
1398+
const float denom = ((v * p + u * q) * sincos_alpha.get<0>());
1399+
const float s =
1400+
safe_div(1.0f, b) *
1401+
portable_acosf(clamp(safe_div(((v * q - u * p) * sincos_alpha.get<1>() - v), denom), -1.0f, 1.0f));
12951402

12961403
// Compute the third vertex of the sub - triangle
12971404
const fvec4 C_s = slerp(A, C, s);
@@ -1403,8 +1510,9 @@ void Ray::Ref::GeneratePrimaryRays(const camera_t &cam, const rect_t &rect, cons
14031510

14041511
theta += cam.lens_rotation;
14051512

1406-
offset.set<0>(0.5f * r * cosf(theta) / cam.lens_ratio);
1407-
offset.set<1>(0.5f * r * sinf(theta));
1513+
const fvec2 sincos_theta = portable_sincos(theta);
1514+
offset.set<0>(0.5f * r * sincos_theta.get<1>() / cam.lens_ratio);
1515+
offset.set<1>(0.5f * r * sincos_theta.get<0>());
14081516
}
14091517

14101518
const float coc = 0.5f * (cam.focal_length / cam.fstop);
@@ -3263,10 +3371,10 @@ void Ray::Ref::SampleLightSource(const fvec4 &P, const fvec4 &T, const fvec4 &B,
32633371
ls.L = make_fvec3(l.dir.dir);
32643372
ls.area = 0.0f;
32653373
ls.pdf = 1.0f;
3266-
if (l.dir.angle != 0.0f) {
3374+
if (l.dir.tan_angle != 0.0f) {
32673375
const float r1 = rand_light_uv.get<0>(), r2 = rand_light_uv.get<1>();
32683376

3269-
const float radius = tanf(l.dir.angle);
3377+
const float radius = l.dir.tan_angle;
32703378
ls.L = normalize(map_to_cone(r1, r2, ls.L, radius));
32713379
ls.area = PI * radius * radius;
32723380

@@ -3332,8 +3440,10 @@ void Ray::Ref::SampleLightSource(const fvec4 &P, const fvec4 &T, const fvec4 &B,
33323440
theta = 0.5f * PI - 0.25f * PI * (offset.get<0>() / offset.get<1>());
33333441
}
33343442

3335-
offset.set<0>(0.5f * r * cosf(theta));
3336-
offset.set<1>(0.5f * r * sinf(theta));
3443+
const fvec2 sincos_theta = portable_sincos(theta);
3444+
3445+
offset.set<0>(0.5f * r * sincos_theta.get<1>());
3446+
offset.set<1>(0.5f * r * sincos_theta.get<0>());
33373447
}
33383448

33393449
const fvec4 lp = light_pos + light_u * offset.get<0>() + light_v * offset.get<1>();
@@ -3375,7 +3485,8 @@ void Ray::Ref::SampleLightSource(const fvec4 &P, const fvec4 &T, const fvec4 &B,
33753485
fvec4 light_v = cross(light_u, light_dir);
33763486

33773487
const float phi = PI * r1;
3378-
const fvec4 normal = cosf(phi) * light_u + sinf(phi) * light_v;
3488+
const fvec2 sincos_phi = portable_sincos(phi);
3489+
const fvec4 normal = sincos_phi.get<1>() * light_u + sincos_phi.get<0>() * light_v;
33793490

33803491
const fvec4 lp = light_pos + normal * l.line.radius + (r2 - 0.5f) * light_dir * l.line.height;
33813492

@@ -3473,7 +3584,8 @@ void Ray::Ref::SampleLightSource(const fvec4 &P, const fvec4 &T, const fvec4 &B,
34733584
} else {
34743585
// Sample environment as hemishpere
34753586
const float phi = 2 * PI * ry;
3476-
const float cos_phi = cosf(phi), sin_phi = sinf(phi);
3587+
const fvec2 sincos_phi = portable_sincos(phi);
3588+
const float cos_phi = sincos_phi.get<1>(), sin_phi = sincos_phi.get<0>();
34773589

34783590
const float dir = sqrtf(1.0f - rx * rx);
34793591
auto V = fvec4{dir * cos_phi, dir * sin_phi, rx, 0.0f}; // in tangent-space
@@ -3652,7 +3764,7 @@ void Ray::Ref::IntersectAreaLights(Span<const ray_data_t> rays, Span<const light
36523764
} else if (l.type == LIGHT_TYPE_DIR) {
36533765
const fvec4 light_dir = make_fvec3(l.dir.dir);
36543766
const float cos_theta = dot(rd, light_dir);
3655-
if ((inout_inter.v < 0.0f || no_shadow) && cos_theta > cosf(l.dir.angle)) {
3767+
if ((inout_inter.v < 0.0f || no_shadow) && cos_theta > l.dir.cos_angle) {
36563768
inout_inter.v = 0.0f;
36573769
inout_inter.obj_index = -int(light_index) - 1;
36583770
inout_inter.t = 1.0f / cos_theta;
@@ -3898,7 +4010,7 @@ void Ray::Ref::IntersectAreaLights(Span<const ray_data_t> rays, Span<const light
38984010
} else if (l.type == LIGHT_TYPE_DIR) {
38994011
const fvec4 light_dir = make_fvec3(l.dir.dir);
39004012
const float cos_theta = dot(rd, light_dir);
3901-
if ((inout_inter.v < 0.0f || no_shadow) && cos_theta > cosf(l.dir.angle)) {
4013+
if ((inout_inter.v < 0.0f || no_shadow) && cos_theta > l.dir.cos_angle) {
39024014
inout_inter.v = 0.0f;
39034015
inout_inter.obj_index = -int(light_index) - 1;
39044016
inout_inter.t = 1.0f / cos_theta;
@@ -4098,7 +4210,7 @@ void Ray::Ref::IntersectAreaLights(Span<const ray_data_t> rays, Span<const light
40984210
} else if (l.type == LIGHT_TYPE_DIR) {
40994211
const fvec4 light_dir = make_fvec3(l.dir.dir);
41004212
const float cos_theta = dot(rd, light_dir);
4101-
if ((inout_inter.v < 0.0f || no_shadow) && cos_theta > cosf(l.dir.angle)) {
4213+
if ((inout_inter.v < 0.0f || no_shadow) && cos_theta > l.dir.cos_angle) {
41024214
inout_inter.v = 0.0f;
41034215
inout_inter.obj_index = -int(light_index) - 1;
41044216
inout_inter.t = 1.0f / cos_theta;

internal/CoreRef.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,10 @@ force_inline fvec4 tangent_from_world(const fvec4 &T, const fvec4 &B, const fvec
302302
return fvec4{dot(V, T), dot(V, B), dot(V, N), 0.0f};
303303
}
304304

305+
float portable_cos(float a);
306+
float portable_sin(float a);
307+
fvec2 portable_sincos(float a);
308+
305309
fvec2 get_scrambled_2d_rand(const uint32_t dim, const uint32_t seed, const int sample, const uint32_t rand_seq[]);
306310

307311
float SampleSphericalRectangle(const fvec4 &P, const fvec4 &light_pos, const fvec4 &axis_u, const fvec4 &axis_v,

0 commit comments

Comments
 (0)