Skip to content

Commit 7870b07

Browse files
authored
Merge pull request #320 from AlchemyViewer/rye/render-perf
Rendering performance: GLTF bind caching, shadow tunables, framebuffer-blit copies
2 parents c90f302 + bff2bef commit 7870b07

18 files changed

Lines changed: 425 additions & 206 deletions

indra/llmath/llmath.h

Lines changed: 6 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,14 @@ constexpr F64 llabs(const F64 a) noexcept
155155
return std::bit_cast<F64>(std::bit_cast<U64>(a) & 0x7fffffffffffffffull);
156156
}
157157

158-
// Truncates f toward zero and returns the result as S32.
// This hunk shows both the removed form (constexpr, via narrow()) and the
// added form (inline, via std::trunc followed by an S32 cast).
// NOTE(review): constexpr was dropped, presumably because std::trunc is not
// constexpr before C++23 -- confirm no caller needs a compile-time lltrunc.
// NOTE(review): the cast is only well-defined when the truncated value fits
// in S32; narrow() is defined outside this view, so any range checking it
// performed cannot be confirmed from here.
constexpr S32 lltrunc(F32 f)
158+
inline S32 lltrunc(F32 f)
159159
{
160-
return narrow(f);
160+
return (S32)std::trunc(f);
161161
}
162162

163-
// F64 overload: truncates f toward zero and returns the result as S32.
// This hunk shows both the removed form (constexpr, via narrow()) and the
// added form (inline, via std::trunc followed by an S32 cast).
// NOTE(review): an F64 can hold values far outside the S32 range; the cast is
// only well-defined when the truncated value fits in S32 -- confirm callers
// never pass larger magnitudes.
constexpr S32 lltrunc(F64 f)
163+
inline S32 lltrunc(F64 f)
164164
{
165-
return narrow(f);
165+
return (S32)std::trunc(f);
166166
}
167167

168168
inline S32 llfloor(F32 f)
@@ -184,67 +184,19 @@ inline S32 llfloor(F32 f)
184184
#endif
185185
}
186186

187-
188187
// Rounds f up to the nearest integer (ceil) and returns it as S32.
// The cast is only well-defined when the ceil result fits in S32.
inline S32 llceil( F32 f )
189188
{
190189
// This could probably be optimized, but this works.
191190
return (S32)ceil(f);
192191
}
193192

194-
195-
#ifndef BOGUS_ROUND
196-
// Use this round. Does an arithmetic round (0.5 always rounds up)
197193
// Rounds val to the nearest integer and returns it as S32.
// NOTE(review): the two return lines in this hunk are not equivalent.
// lround() rounds halfway cases away from zero, while llfloor(val + 0.5f) was
// documented as "0.5 always rounds up". They disagree for negative halves,
// e.g. -0.5f -> -1 (lround) vs 0 (llfloor), -2.5f -> -3 vs -2.
// Confirm no caller depends on the old round-half-up behavior.
inline S32 ll_round(const F32 val)
198194
{
199-
return llfloor(val + 0.5f);
195+
return (S32)lround(val); // lround returns long; narrowed to S32 here
200196
}
201-
202-
#else // BOGUS_ROUND
203-
// Old ll_round implementation - does banker's round (toward nearest even in the case of a 0.5).
204-
// Not using this because we don't have a consistent implementation on both platforms, use
205-
// llfloor(val + 0.5f), which is consistent on all platforms.
206-
inline S32 ll_round(const F32 val)
207-
{
208-
#if LL_WINDOWS
209-
// Note: assumes that the floating point control word is set to rounding mode (the default)
210-
S32 ret_val;
211-
_asm fld val
212-
_asm fistp ret_val;
213-
return ret_val;
214-
#elif LL_LINUX
215-
// Note: assumes that the floating point control word is set
216-
// to rounding mode (the default)
217-
S32 ret_val;
218-
__asm__ __volatile__( "flds %1 \n\t"
219-
"fistpl %0 \n\t"
220-
: "=m" (ret_val)
221-
: "m" (val) );
222-
return ret_val;
223-
#else
224-
return llfloor(val + 0.5f);
225-
#endif
226-
}
227-
228-
// A fast arithmetic round on intel, from Laurent de Soras http://ldesoras.free.fr
229-
inline int round_int(double x)
230-
{
231-
const float round_to_nearest = 0.5f;
232-
int i;
233-
__asm
234-
{
235-
fld x
236-
fadd st, st (0)
237-
fadd round_to_nearest
238-
fistp i
239-
sar i, 1
240-
}
241-
return (i);
242-
}
243-
#endif // BOGUS_ROUND
244-
245197
// F64 overload: rounds val to the nearest integral value, returned as F64.
// NOTE(review): round() rounds halfway cases away from zero, while the removed
// floor(val + 0.5f) rounded them toward +infinity. They disagree for negative
// halves, e.g. -2.5 -> -3.0 (round) vs -2.0 (floor). Confirm callers accept
// the new behavior.
inline F64 ll_round(const F64 val)
246198
{
247-
return F64(floor(val + 0.5f));
199+
return round(val);
248200
}
249201

250202
inline F32 ll_round( F32 val, F32 nearest )

indra/llrender/llrendertarget.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,86 @@ void LLRenderTarget::flush()
641641
}
642642
}
643643

644+
// Copies a rectangle of `source` into this render target.
//
// source       - render target to read from; must have an FBO.
// srcX0..srcY1 - source rectangle, given as corner coordinates.
// dstX0..dstY1 - destination rectangle, given as corner coordinates.
// mask         - GL buffer bits to copy (e.g. GL_DEPTH_BUFFER_BIT).
// filter       - filter passed to glBlitFramebuffer.
//
// If either target has no FBO, a warning is logged and nothing is copied.
// Two copy paths exist:
//  * depth-only (mask is exactly GL_DEPTH_BUFFER_BIT) with differing mStencil
//    values: the source FBO is bound and glCopyTexSubImage2D copies into this
//    target's texture. This path ignores dstX1, dstY1 and filter, so the copy
//    is never scaled.
//  * otherwise: glBlitFramebuffer from source.mFBO to mFBO.
// Both paths finish by rebinding GL_FRAMEBUFFER to sCurFBO.
void LLRenderTarget::copyContents(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0, S32 dstX1,
645+
S32 dstY1, U32 mask, U32 filter)
646+
{
647+
LL_PROFILE_GPU_ZONE("LLRenderTarget::copyContents");
648+
649+
GLboolean write_depth = mask & GL_DEPTH_BUFFER_BIT ? GL_TRUE : GL_FALSE; // GL_TRUE only when depth is part of the copy
650+
651+
LLGLDepthTest depth(write_depth, write_depth); // scoped object; presumably sets depth test/write state for this call -- TODO confirm
652+
653+
gGL.flush(); // NOTE: runs before the FBO check below, so it happens even on the early-return path
654+
if (!source.mFBO || !mFBO)
655+
{
656+
LL_WARNS() << "Cannot copy framebuffer contents for non FBO render targets." << LL_ENDL;
657+
return;
658+
}
659+
660+
if (mask == GL_DEPTH_BUFFER_BIT && source.mStencil != mStencil)
661+
{
662+
stop_glerror();
663+
664+
glBindFramebuffer(GL_FRAMEBUFFER, source.mFBO); // read from the source FBO
665+
check_framebuffer_status();
666+
gGL.getTexUnit(0)->bind(this, true); // bind this target on texture unit 0 as the copy destination
667+
stop_glerror();
668+
// glCopyTexSubImage2D(target, level, xoffset, yoffset, x, y, width, height):
669+
// xoffset/yoffset are the destination texel offset, x/y the source framebuffer
670+
// origin, and the last two are dimensions (not endpoints).
671+
glCopyTexSubImage2D(LLTexUnit::getInternalType(mUsage), 0, dstX0, dstY0, srcX0, srcY0, srcX1 - srcX0, srcY1 - srcY0);
672+
stop_glerror();
673+
glBindFramebuffer(GL_FRAMEBUFFER, sCurFBO); // restore the previously tracked framebuffer binding
674+
stop_glerror();
675+
}
676+
else
677+
{
678+
glBindFramebuffer(GL_READ_FRAMEBUFFER, source.mFBO);
679+
stop_glerror();
680+
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, mFBO);
681+
stop_glerror();
682+
check_framebuffer_status();
683+
stop_glerror();
684+
glBlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter);
685+
stop_glerror();
686+
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0); // unbind read and draw targets individually...
687+
stop_glerror();
688+
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
689+
stop_glerror();
690+
glBindFramebuffer(GL_FRAMEBUFFER, sCurFBO); // ...then rebind both to sCurFBO
691+
stop_glerror();
692+
}
693+
}
694+
695+
// static
696+
// Blits a rectangle of `source` to framebuffer object 0 (the default framebuffer).
//
// source       - render target to read from; must have an FBO.
// srcX0..srcY1 - source rectangle, given as corner coordinates.
// dstX0..dstY1 - destination rectangle, given as corner coordinates.
// mask         - GL buffer bits to copy.
// filter       - filter passed to glBlitFramebuffer.
//
// If source has no FBO, a warning is logged and nothing is copied.
// On exit GL_FRAMEBUFFER is rebound to sCurFBO.
// NOTE(review): unlike copyContents(), this function does not call
// gGL.flush() before the blit -- confirm that is intentional.
void LLRenderTarget::copyContentsToFramebuffer(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0,
697+
S32 dstX1, S32 dstY1, U32 mask, U32 filter)
698+
{
699+
if (!source.mFBO)
700+
{
701+
LL_WARNS() << "Cannot copy framebuffer contents for non FBO render targets." << LL_ENDL;
702+
return;
703+
}
704+
705+
{
706+
LL_PROFILE_GPU_ZONE("copyContentsToFramebuffer");
707+
GLboolean write_depth = mask & GL_DEPTH_BUFFER_BIT ? GL_TRUE : GL_FALSE; // GL_TRUE only when depth is part of the copy
708+
709+
LLGLDepthTest depth(write_depth, write_depth); // scoped object; presumably sets depth test/write state for this block -- TODO confirm
710+
711+
glBindFramebuffer(GL_READ_FRAMEBUFFER, source.mFBO);
712+
stop_glerror();
713+
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); // draw target is framebuffer 0
714+
stop_glerror();
715+
check_framebuffer_status();
716+
stop_glerror();
717+
glBlitFramebuffer(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, mask, filter);
718+
stop_glerror();
719+
glBindFramebuffer(GL_FRAMEBUFFER, sCurFBO); // restore the previously tracked framebuffer binding
720+
stop_glerror();
721+
}
722+
}
723+
644724
bool LLRenderTarget::isComplete() const
645725
{
646726
return !mTex.empty() || mDepth;

indra/llrender/llrendertarget.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,12 @@ class LLRenderTarget
170170
// asserts that this target is currently bound
171171
void flush();
172172

173+
void copyContents(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0, S32 dstX1, S32 dstY1,
174+
U32 mask, U32 filter);
175+
176+
static void copyContentsToFramebuffer(LLRenderTarget& source, S32 srcX0, S32 srcY0, S32 srcX1, S32 srcY1, S32 dstX0, S32 dstY0,
177+
S32 dstX1, S32 dstY1, U32 mask, U32 filter);
178+
173179
//Returns TRUE if target is ready to be rendered into.
174180
//That is, if the target has been allocated with at least
175181
//one renderable attachment (i.e. color buffer, depth buffer).

indra/newview/app_settings/settings_alchemy.xml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1202,6 +1202,28 @@
12021202
<key>Value</key>
12031203
<integer>0</integer>
12041204
</map>
1205+
<key>RenderAvatarShadowDetail</key>
1206+
<map>
1207+
<key>Comment</key>
1208+
<string>Which avatar passes cast shadows. 0 = opaque only, 1 = opaque + alpha mask, 2 = full (also alpha blend). Lower values speed up crowd scenes by skipping the expensive alpha-blend avatar shadow pass across all cascades.</string>
1209+
<key>Persist</key>
1210+
<integer>1</integer>
1211+
<key>Type</key>
1212+
<string>S32</string>
1213+
<key>Value</key>
1214+
<integer>2</integer>
1215+
</map>
1216+
<key>RenderShadowCullMode</key>
1217+
<map>
1218+
<key>Comment</key>
1219+
<string>How sun shadow cascades are culled. 0 = cull and sort each cascade separately (default). 1 = cull and sort once against a frustum spanning all cascades, sharing the result (less CPU per frame, more GPU vertex work per cascade). Experimental.</string>
1220+
<key>Persist</key>
1221+
<integer>1</integer>
1222+
<key>Type</key>
1223+
<string>S32</string>
1224+
<key>Value</key>
1225+
<integer>0</integer>
1226+
</map>
12051227
<key>RenderBloomHDR</key>
12061228
<map>
12071229
<key>Comment</key>

indra/newview/lldrawpool.cpp

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,8 @@ void LLRenderPass::pushGLTFBatches(U32 type, bool textured)
786786
void LLRenderPass::pushGLTFBatches(U32 type)
787787
{
788788
LL_PROFILE_ZONE_SCOPED_CATEGORY_DRAWPOOL;
789+
LLFetchedGLTFMaterial* lastMat = nullptr;
790+
LLViewerTexture* lastTex = nullptr;
789791
auto* begin = gPipeline.beginRenderMap(type);
790792
auto* end = gPipeline.endRenderMap(type);
791793
for (LLCullResult::drawinfo_iterator i = begin; i != end; )
@@ -794,7 +796,7 @@ void LLRenderPass::pushGLTFBatches(U32 type)
794796
LLDrawInfo& params = **i;
795797
LLCullResult::increment_iterator(i, end);
796798

797-
pushGLTFBatch(params);
799+
pushGLTFBatch(params, lastMat, lastTex);
798800
}
799801
}
800802

@@ -814,16 +816,25 @@ void LLRenderPass::pushUntexturedGLTFBatches(U32 type)
814816
}
815817

816818
// static
817-
void LLRenderPass::pushGLTFBatch(LLDrawInfo& params)
819+
void LLRenderPass::pushGLTFBatch(LLDrawInfo& params, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex)
818820
{
819-
auto& mat = params.mGLTFMaterial;
821+
LLFetchedGLTFMaterial* mat = params.mGLTFMaterial.get();
820822

821-
if (mat.notNull())
823+
if (mat)
822824
{
823-
mat->bind(params.mTexture);
825+
// params.mTexture is the media override (bind() applies it to base color
826+
// and emissive), so it is part of the cache key -- otherwise media faces
827+
// sharing a material would render with a stale base texture.
828+
LLViewerTexture* tex = params.mTexture.get();
829+
if (mat != lastMat || tex != lastTex)
830+
{
831+
mat->bind(params.mTexture);
832+
lastMat = mat;
833+
lastTex = tex;
834+
}
824835
}
825836

826-
LLGLDisable cull_face(mat.notNull() && mat->mDoubleSided ? GL_CULL_FACE : 0);
837+
LLGLDisable cull_face(mat && mat->mDoubleSided ? GL_CULL_FACE : 0);
827838

828839
setup_texture_matrix(params);
829840

@@ -866,6 +877,8 @@ void LLRenderPass::pushRiggedGLTFBatches(U32 type)
866877
const LLVOAvatar* lastAvatar = nullptr;
867878
U64 lastMeshId = 0;
868879
bool skipLastSkin = false;
880+
LLFetchedGLTFMaterial* lastMat = nullptr;
881+
LLViewerTexture* lastTex = nullptr;
869882

870883
auto* begin = gPipeline.beginRenderMap(type);
871884
auto* end = gPipeline.endRenderMap(type);
@@ -875,7 +888,7 @@ void LLRenderPass::pushRiggedGLTFBatches(U32 type)
875888
LLDrawInfo& params = **i;
876889
LLCullResult::increment_iterator(i, end);
877890

878-
pushRiggedGLTFBatch(params, lastAvatar, lastMeshId, skipLastSkin);
891+
pushRiggedGLTFBatch(params, lastAvatar, lastMeshId, skipLastSkin, lastMat, lastTex);
879892
}
880893
}
881894

@@ -900,11 +913,11 @@ void LLRenderPass::pushUntexturedRiggedGLTFBatches(U32 type)
900913

901914

902915
// static
903-
// Pushes a single rigged GLTF draw call.
// Calls uploadMatrixPalette() for the draw's avatar/skin and forwards to
// pushGLTFBatch() only when that call returns true.
// lastAvatar / lastMeshId / skipLastSkin are in-out state handed to
// uploadMatrixPalette(). In the added signature, lastMat / lastTex are in-out
// state handed through to pushGLTFBatch().
void LLRenderPass::pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin)
916+
void LLRenderPass::pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex)
904917
{
905918
if (uploadMatrixPalette(params.mAvatar, params.mSkinInfo, lastAvatar, lastMeshId, skipLastSkin))
906919
{
907-
pushGLTFBatch(params);
920+
pushGLTFBatch(params, lastMat, lastTex);
908921
}
909922
}
910923

indra/newview/lldrawpool.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class LLDrawInfo;
4040
class LLVOAvatar;
4141
class LLGLSLShader;
4242
class LLMeshSkinInfo;
43+
class LLFetchedGLTFMaterial;
4344

4445
class LLDrawPool
4546
{
@@ -376,8 +377,10 @@ class LLRenderPass : public LLDrawPool
376377
void pushUntexturedRiggedGLTFBatches(U32 type);
377378

378379
// push a single GLTF draw call
379-
static void pushGLTFBatch(LLDrawInfo& params);
380-
static void pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin);
380+
// lastMat/lastTex track the most recently bound material+media texture so
381+
// consecutive draws sharing a material skip the redundant LLFetchedGLTFMaterial::bind
382+
static void pushGLTFBatch(LLDrawInfo& params, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex);
383+
static void pushRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin, LLFetchedGLTFMaterial*& lastMat, LLViewerTexture*& lastTex);
381384
static void pushUntexturedGLTFBatch(LLDrawInfo& params);
382385
static void pushUntexturedRiggedGLTFBatch(LLDrawInfo& params, const LLVOAvatar*& lastAvatar, U64& lastMeshId, bool& skipLastSkin);
383386

indra/newview/lldrawpoolavatar.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,18 @@ void LLDrawPoolAvatar::renderShadow(S32 pass)
397397
return;
398398
}
399399

400+
// Optionally skip the costlier avatar shadow passes (alpha blend is the most
401+
// expensive and least visually important; alpha mask next). Default 2 = full.
402+
static LLCachedControl<S32> avatar_shadow_detail(gSavedSettings, "RenderAvatarShadowDetail", 2);
403+
if (pass == SHADOW_PASS_AVATAR_ALPHA_BLEND && avatar_shadow_detail() < 2)
404+
{
405+
return;
406+
}
407+
if (pass == SHADOW_PASS_AVATAR_ALPHA_MASK && avatar_shadow_detail() < 1)
408+
{
409+
return;
410+
}
411+
400412
LLDrawPoolAvatar::sShadowPass = pass;
401413

402414
if (pass == SHADOW_PASS_AVATAR_OPAQUE)

indra/newview/lldrawpoolbump.cpp

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -546,28 +546,51 @@ void LLDrawPoolBump::renderDeferred(S32 pass)
546546
for (int i = 0; i < 2; ++i)
547547
{
548548
bool rigged = i == 1;
549+
550+
U32 type = rigged ? LLRenderPass::PASS_BUMP_RIGGED : LLRenderPass::PASS_BUMP;
551+
LLCullResult::drawinfo_iterator begin = gPipeline.beginRenderMap(type);
552+
LLCullResult::drawinfo_iterator end = gPipeline.endRenderMap(type);
553+
if (begin == end)
554+
{ // no bump geometry in this pass -- skip the shader bind and texture setup
555+
continue;
556+
}
557+
549558
gDeferredBumpProgram.bind(rigged);
550559
diffuse_channel = LLGLSLShader::sCurBoundShaderPtr->enableTexture(LLViewerShaderMgr::DIFFUSE_MAP);
551560
bump_channel = LLGLSLShader::sCurBoundShaderPtr->enableTexture(LLViewerShaderMgr::BUMP_MAP);
552561
gGL.getTexUnit(diffuse_channel)->unbind(LLTexUnit::TT_TEXTURE);
553562
gGL.getTexUnit(bump_channel)->unbind(LLTexUnit::TT_TEXTURE);
554563

555-
U32 type = rigged ? LLRenderPass::PASS_BUMP_RIGGED : LLRenderPass::PASS_BUMP;
556-
LLCullResult::drawinfo_iterator begin = gPipeline.beginRenderMap(type);
557-
LLCullResult::drawinfo_iterator end = gPipeline.endRenderMap(type);
558-
559564
const LLVOAvatar* lastAvatar = nullptr;
560565
U64 lastMeshId = 0;
561566
bool skipLastSkin = false;
562567

568+
// Faces are sorted by bumpmap then texture, so the alpha-mask cutoff and the
569+
// bump-image bind (an image lookup + texture bind) repeat across runs of faces.
570+
// Skip them when unchanged. (bindBumpMap's only side effect, addTextureStats, is
571+
// max-based on the source texture, so skipping a repeat is a no-op there too.)
572+
U8 lastBump = 255;
573+
LLViewerTexture* lastBumpTex = nullptr;
574+
F32 lastAlpha = -1.f;
575+
563576
for (LLCullResult::drawinfo_iterator i = begin; i != end; )
564577
{
565578
LLDrawInfo& params = **i;
566579

567580
LLCullResult::increment_iterator(i, end);
568581

569-
LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(params.mAlphaMaskCutoff);
570-
LLDrawPoolBump::bindBumpMap(params, bump_channel);
582+
if (params.mAlphaMaskCutoff != lastAlpha)
583+
{
584+
lastAlpha = params.mAlphaMaskCutoff;
585+
LLGLSLShader::sCurBoundShaderPtr->setMinimumAlpha(lastAlpha);
586+
}
587+
588+
if (params.mBump != lastBump || params.mTexture.get() != lastBumpTex)
589+
{
590+
lastBump = params.mBump;
591+
lastBumpTex = params.mTexture.get();
592+
LLDrawPoolBump::bindBumpMap(params, bump_channel);
593+
}
571594

572595
if (rigged)
573596
{

0 commit comments

Comments
 (0)