You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
// CUDA graph replay reuses the same per-layer Marlin workspaces.
137
+
// The graph itself does not contain a workspace reset, so enqueue
138
+
// one on the same stream before launch. This is correct but costs
139
+
// decode latency; the intended follow-up is a reusable global
140
+
// zero workspace/lock buffer shared by all Marlin layers.
141
+
model_->reset_runtime_state();
110
142
111
143
auto graph = std::get<0>(result->second.compiled);
112
144
auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
0 commit comments