# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
| 18 | + |
import queue
import random
import threading
import time

import numpy as np
import torch
import torch.multiprocessing as mp
from transformers import PretrainedConfig, PreTrainedModel

from ainode.core.config import AINodeDescriptor
from ainode.core.inference.inference_request import InferenceRequest
from ainode.core.log import Logger
| 31 | + |
# Module-level logger; InferenceRequestPool uses it for per-request debug messages.
logger = Logger()
| 33 | + |
| 34 | + |
class InferenceRequestPool(mp.Process):
    """
    The request pool to handle inference for a specific model.

    The pool is a process. When started, ``run`` launches two daemon threads
    that poll every ``WAITING_INTERVAL_IN_MS`` milliseconds:

    * the *activate* thread moves requests from the waiting queue to the
      running queue after preprocessing their inputs;
    * the *execute* thread takes one running request at a time, performs one
      ``model.generate`` step and either publishes the request on the
      finished queue or puts it back on the waiting queue.

    Both threads exit once ``stop`` has been called.
    """

    FIX_SEED = 2021
    WAITING_INTERVAL_IN_MS = (
        AINodeDescriptor().get_config().get_ain_inference_batch_interval_in_ms()
    )  # How often to check for requests in the waiting/running queue

    def __init__(
        self,
        pool_id: int,
        model: PreTrainedModel,
        config: PretrainedConfig,
        request_queue: mp.Queue,
        result_queue: mp.Queue,
        **pool_kwargs,
    ):
        """
        Create the pool; the worker process is not started here.

        Args:
            pool_id: Identifier of this pool, used in log messages.
            model: Model used for generation; its ``device`` is recorded.
            config: Model configuration, stored as-is.
            request_queue: Queue from which waiting requests are taken (and to
                which unfinished requests are returned).
            result_queue: Queue on which finished requests are published.
            **pool_kwargs: Extra options, stored as-is in ``pool_kwargs``.
        """
        super().__init__()
        self.pool_id = pool_id
        self.model = model
        self.device = self.model.device
        self.config = config
        self.pool_kwargs = pool_kwargs

        # TODO: A scheduler is necessary for better handling following queues
        self._threads = []
        self._waiting_queue = request_queue  # Requests that are waiting to be processed
        self._running_queue = mp.Queue()  # Requests that are currently being processed
        self._finished_queue = result_queue  # Requests that are finished
        self._stop_event = mp.Event()

        # Fix inference seed (in the creating process; ``run`` repeats this
        # inside the worker process).
        self._fix_seed()

    def _fix_seed(self) -> None:
        """Seed the ``random``, ``torch`` and ``numpy`` generators with ``FIX_SEED``."""
        random.seed(self.FIX_SEED)
        torch.manual_seed(self.FIX_SEED)
        np.random.seed(self.FIX_SEED)

    def memory_is_available(self, request):
        """Placeholder for a memory check; not implemented, returns ``None``."""
        # need test with several rounds of dummy data
        pass

    def _activate_requests(self) -> None:
        """
        Move at most one request from the waiting queue to the running queue.

        The request's inputs are preprocessed by its inference pipeline and
        the request is marked as running before it is enqueued. Returns
        immediately when no request is waiting.
        """
        # ``Queue.empty()`` is not reliable across threads/processes, and a
        # blocking ``get()`` could keep this thread alive after ``stop()``;
        # fetch without blocking and treat "nothing there" as a no-op.
        try:
            request: InferenceRequest = self._waiting_queue.get_nowait()
        except queue.Empty:
            return
        # TODO: Check memory size before activating requests
        request.inputs = request.inference_pipeline.preprocess_inputs(request.inputs)
        request.mark_running()
        logger.debug(
            f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is activated with inputs shape {request.inputs.shape}"
        )
        self._running_queue.put(request)

    def _poll_until_stopped(self, action) -> None:
        """
        Call ``action`` once per waiting interval until the stop event is set.

        ``Event.wait`` doubles as the sleep: it returns ``True`` as soon as
        ``stop`` is called, so the loop ends without waiting out the interval
        and without running ``action`` again.
        """
        interval_in_s = self.WAITING_INTERVAL_IN_MS / 1000
        while not self._stop_event.wait(interval_in_s):
            action()

    def _requests_activate_loop(self) -> None:
        """Thread target: periodically activate waiting requests."""
        self._poll_until_stopped(self._activate_requests)

    def _step(self) -> None:
        """
        Run one generation step for at most one running request.

        The step output is written to the request; a finished request is put
        on the finished queue, an unfinished one is returned to the waiting
        queue. Returns immediately when no request is running.
        """
        # Non-blocking fetch for the same reasons as in ``_activate_requests``.
        try:
            request: InferenceRequest = self._running_queue.get_nowait()
        except queue.Empty:
            return
        # TODO: We need a batcher to accelerate the concurrent inference
        # TODO: Check memory size before executing requests
        output = self.model.generate(
            request.inputs,
            max_new_tokens=request.max_new_tokens,
            num_samples=10,
            revin=True,
        )
        # NOTE(review): presumably averages the generated samples of the first
        # output along dim 0 -- confirm against the model's ``generate``.
        request.write_step_output(output[0].mean(dim=0))
        request.inference_pipeline.post_decode()
        if request.is_finished():
            request.inference_pipeline.post_inference()
            logger.debug(
                f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is finished"
            )
            self._finished_queue.put(request)
        else:
            logger.debug(
                f"[Inference][Device-{self.device}][Pool-{self.pool_id}][ID-{request.req_id}] Request is not finished, re-queueing"
            )
            # NOTE(review): a re-queued request passes through
            # ``_activate_requests`` (and its preprocessing) again -- confirm
            # that this is intended by the pipeline.
            self._waiting_queue.put(request)

    def _requests_execute_loop(self) -> None:
        """Thread target: periodically execute one step of a running request."""
        self._poll_until_stopped(self._step)

    def run(self) -> None:
        """
        Process entry point: start the activate/execute threads and wait for them.
        """
        # ``__init__`` runs in the creating process. Seed again here so the
        # fixed seed also applies inside the worker process when the start
        # method does not carry over the parent's RNG state (e.g. "spawn").
        self._fix_seed()

        for loop in (self._requests_activate_loop, self._requests_execute_loop):
            worker = threading.Thread(target=loop, daemon=True)
            self._threads.append(worker)
            worker.start()
        for thread in self._threads:
            thread.join()

    def stop(self) -> None:
        """Signal both polling threads to exit by setting the stop event."""
        self._stop_event.set()
0 commit comments