@@ -1025,11 +1025,7 @@ struct llama_sampler_dist : public llama_sampler_backend {
10251025
10261026 std::mt19937 rng;
10271027
1028- // backend input
1029- struct ggml_tensor * inp_uniform;
1030-
1031- ggml_context_ptr inp_ctx;
1032- ggml_backend_buffer_ptr inp_buf;
1028+ ggml_tensor * inp_uniform;
10331029};
10341030
10351031static const char * llama_sampler_dist_name (const struct llama_sampler * smpl) {
@@ -1138,37 +1134,10 @@ static bool llama_sampler_dist_backend_init(
11381134 ggml_backend_buffer_type_t buft) {
11391135 auto * sctx = (llama_sampler_dist *) smpl->ctx ;
11401136
1141- // allocate inputs
1142- {
1143- ggml_init_params params = {
1144- /* .mem_size =*/ ggml_tensor_overhead (),
1145- /* .mem_buffer =*/ nullptr ,
1146- /* .no_alloc =*/ true ,
1147- };
1148-
1149- sctx->inp_ctx .reset (ggml_init (params));
1150-
1151- // Create the uniform random scalar input tensor. This will be set by
1152- // llama_sampler_dist_backend_set_input after this graph is built.
1153- sctx->inp_uniform = ggml_new_tensor_1d (sctx->inp_ctx .get (), GGML_TYPE_F32, 1 );
1154- ggml_set_name (sctx->inp_uniform , " uniform" );
1155- ggml_set_input (sctx->inp_uniform );
1156-
1157- // Allocate all tensors from our context to the backend
1158- sctx->inp_buf .reset (ggml_backend_alloc_ctx_tensors_from_buft (sctx->inp_ctx .get (), buft));
1159-
1160- ggml_backend_buffer_clear (sctx->inp_buf .get (), 0 );
1161- }
1162-
11631137 const bool res = llama_sampler_backend_support (smpl, buft);
11641138
11651139 sctx->init (res);
11661140
1167- if (!res) {
1168- sctx->inp_ctx .reset (nullptr );
1169- sctx->inp_buf .reset (nullptr );
1170- }
1171-
11721141 return res;
11731142}
11741143
@@ -1178,8 +1147,13 @@ static void llama_sampler_dist_backend_apply(
11781147 struct ggml_cgraph * gf,
11791148 struct llama_sampler_data * data) {
11801149 GGML_UNUSED (gf);
1150+
11811151 auto * sctx = (llama_sampler_dist *) smpl->ctx ;
11821152
1153+ sctx->inp_uniform = ggml_new_tensor_1d (ctx, GGML_TYPE_F32, 1 );
1154+ ggml_set_name (sctx->inp_uniform , " uniform" );
1155+ ggml_set_input (sctx->inp_uniform );
1156+
11831157 struct ggml_tensor * probs = ggml_soft_max (ctx, data->logits );
11841158 ggml_set_name (probs, " dist_probs" );
11851159
@@ -1226,6 +1200,7 @@ static void llama_sampler_dist_backend_apply(
12261200
12271201static void llama_sampler_dist_backend_set_input (struct llama_sampler * smpl) {
12281202 auto * sctx = (llama_sampler_dist *) smpl->ctx ;
1203+
12291204 GGML_ASSERT (sctx->inp_uniform != nullptr );
12301205
12311206 // We sample in double precision and cast to float to match rnd numbers of
@@ -1262,8 +1237,6 @@ struct llama_sampler * llama_sampler_init_dist(uint32_t seed) {
12621237 /* .seed_cur = */ seed_cur,
12631238 /* .rng = */ std::mt19937 (seed_cur),
12641239 /* .inp_uniform = */ nullptr ,
1265- /* .inp_ctx = */ nullptr ,
1266- /* .inp_buf = */ nullptr ,
12671240 }
12681241 );
12691242}
@@ -3461,9 +3434,6 @@ struct llama_sampler_logit_bias : public llama_sampler_backend {
34613434
34623435 struct ggml_tensor * inp_logit_bias;
34633436 struct ggml_tensor * inp_logit_idxs;
3464-
3465- ggml_context_ptr inp_ctx;
3466- ggml_backend_buffer_ptr inp_buf;
34673437};
34683438
34693439static const char * llama_sampler_logit_bias_name (const struct llama_sampler * smpl) {
@@ -3526,6 +3496,16 @@ static void llama_sampler_logit_bias_backend_apply(
35263496 return ;
35273497 }
35283498
3499+ const size_t n = sctx->logit_bias .size ();
3500+
3501+ sctx->inp_logit_bias = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, 1 , n);
3502+ ggml_set_name (sctx->inp_logit_bias , " logit_bias" );
3503+ ggml_set_input (sctx->inp_logit_bias );
3504+
3505+ sctx->inp_logit_idxs = ggml_new_tensor_1d (ctx, GGML_TYPE_I32, n);
3506+ ggml_set_name (sctx->inp_logit_idxs , " logit_idxs" );
3507+ ggml_set_input (sctx->inp_logit_idxs );
3508+
35293509 ggml_tensor * cur = ggml_fill (ctx, data->logits , 0 .0f );
35303510
35313511 cur = ggml_reshape_2d (ctx, cur, 1 , ggml_nelements (cur));
@@ -3562,6 +3542,8 @@ static void llama_sampler_logit_bias_backend_set_input(struct llama_sampler * sm
35623542static bool llama_sampler_logit_bias_backend_init (
35633543 struct llama_sampler * smpl,
35643544 ggml_backend_buffer_type_t buft) {
3545+ GGML_UNUSED (buft);
3546+
35653547 auto * sctx = (llama_sampler_logit_bias *) smpl->ctx ;
35663548
35673549 sctx->init (true );
@@ -3570,29 +3552,6 @@ static bool llama_sampler_logit_bias_backend_init(
35703552 return true ;
35713553 }
35723554
3573- ggml_init_params params = {
3574- /* .mem_size =*/ 2 *ggml_tensor_overhead (),
3575- /* .mem_buffer =*/ nullptr ,
3576- /* .no_alloc =*/ true ,
3577- };
3578-
3579- sctx->inp_ctx .reset (ggml_init (params));
3580-
3581- const size_t n = sctx->logit_bias .size ();
3582-
3583- sctx->inp_logit_bias = ggml_new_tensor_2d (sctx->inp_ctx .get (), GGML_TYPE_F32, 1 , n);
3584- ggml_set_name (sctx->inp_logit_bias , " logit_bias" );
3585- ggml_set_input (sctx->inp_logit_bias );
3586-
3587- sctx->inp_logit_idxs = ggml_new_tensor_1d (sctx->inp_ctx .get (), GGML_TYPE_I32, n);
3588- ggml_set_name (sctx->inp_logit_idxs , " logit_idxs" );
3589- ggml_set_input (sctx->inp_logit_idxs );
3590-
3591- // Allocate all tensors from our context to the backend
3592- sctx->inp_buf .reset (ggml_backend_alloc_ctx_tensors_from_buft (sctx->inp_ctx .get (), buft));
3593-
3594- ggml_backend_buffer_clear (sctx->inp_buf .get (), 0 );
3595-
35963555 return true ;
35973556}
35983557
@@ -3628,8 +3587,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(
36283587 /* .to_search = */ {},
36293588 /* .inp_logit_bias = */ nullptr ,
36303589 /* .inp_logit_idxs = */ nullptr ,
3631- /* .inp_ctx = */ nullptr ,
3632- /* .inp_buf = */ nullptr ,
36333590 }
36343591 );
36353592}
0 commit comments