|
20 | 20 | "AbsmaxPertensorObserver", |
21 | 21 | "AbsMaxTokenWiseActObserver", |
22 | 22 | "AbsmaxPerchannelObserver", |
| 23 | + "MoEAbsmaxPertensorObserver", |
23 | 24 | ] |
24 | 25 |
|
25 | 26 |
|
@@ -217,3 +218,91 @@ def zero_points(self): |
217 | 218 | if self._zero_point is None: |
218 | 219 | self.cal_thresholds() |
219 | 220 | return self._zero_point |
| 221 | + |
| 222 | + |
class MoEAbsmaxPertensorObserver(BaseObserver):
    """Abs-max observer that can exchange statistics with a parent observer.

    On every ``forward`` call with a non-empty input the observer records the
    absolute maximum of the input (one value per leading-dimension row for
    inputs with two or more dimensions, a single value otherwise), never
    letting it drop below the previously recorded maximum.  If a
    ``parent_observer`` was supplied, the new statistics are pushed to it via
    ``parent_observer.update(min, max, step)``; when the input is empty the
    statistics are instead pulled from ``parent_observer.min`` / ``.max``.

    NOTE(review): the parent observer's ``update`` / ``min`` / ``max`` /
    ``step`` contract is defined outside this class; the code here assumes
    ``min``/``max`` are comparable with the locally stored values -- confirm
    against the parent implementation.

    Args:
        layer_name: Optional identifier stored on the instance.
        quant_bits: Bit width forwarded to ``BaseObserver``.
        **kwargs: Only ``parent_observer`` is read; other keys are ignored.
    """

    def __init__(self, layer_name=None, quant_bits=8, **kwargs):
        super(MoEAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits)
        self.layer_name = layer_name
        self._scale = None
        self._zero_point = None
        self._min = None
        # Small positive starting value; it also acts as the lower bound for
        # every abs-max computed in ``_cal_min_max``.
        self._max = torch.tensor(1e-7, dtype=torch.float32)
        self.step = 0
        self.dtype = None
        self.parent_observer = kwargs.get("parent_observer")

    def forward(self, inputs):
        """Record statistics for ``inputs`` and return ``inputs`` unchanged.

        Raises:
            AssertionError: If ``inputs`` is empty and no parent observer is
                configured, because there is nothing to take statistics from.
        """
        self.step += 1
        if self.dtype is None:
            # Remember the dtype of the first observed input; ``scales`` casts
            # the final scale to it.
            self.dtype = inputs.dtype
        if inputs.numel() > 0:
            self._min, self._max = self._cal_min_max(inputs)
            if self.parent_observer is not None:
                self.parent_observer.update(self._min, self._max, self.step)
        else:
            # Explicit raise instead of ``assert`` so the check is not
            # stripped under ``python -O``; the exception type is kept.
            if self.parent_observer is None:
                raise AssertionError(
                    "MoEAbsmaxPertensorObserver received an empty input but "
                    "has no parent_observer to take statistics from"
                )
            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
        return inputs

    def _previous_max_like(self, reference):
        """Return the stored max on ``reference``'s device, usable as a floor.

        If the stored max holds several values whose shape no longer matches
        ``reference`` (e.g. the leading dimension changed between calls), it
        is collapsed to its overall maximum so it can still be broadcast.
        """
        floor = self._max.to(reference.device)
        if floor.numel() > 1 and floor.shape != reference.shape:
            floor = floor.max()
        return floor

    def _cal_min_max(self, inputs):
        """Compute ``(min, max)`` for ``inputs``; ``min`` is always ``0``.

        For inputs with two or more dimensions the max has shape
        ``[inputs.shape[0], 1]``; otherwise it has shape ``[1, 1]``.  The
        result is never smaller than the previously stored maximum.
        """
        abs_inputs = torch.abs(inputs)
        if inputs.dim() >= 2:
            # ``reshape`` rather than ``view``: ``torch.abs`` keeps the
            # strides of a permuted/transposed input, on which ``view`` fails.
            rows = abs_inputs.reshape(abs_inputs.shape[0], -1)
            abs_max_val, _ = torch.max(rows, dim=1, keepdim=True)
            floor = self._previous_max_like(abs_max_val)
            abs_max_val = torch.maximum(abs_max_val, floor.expand_as(abs_max_val))
        else:
            abs_max_val = torch.max(abs_inputs)
            floor = self._previous_max_like(abs_max_val)
            if abs_max_val < floor:
                abs_max_val = floor
            # ``reshape`` pins the result to [1, 1] whether the winner was the
            # fresh 0-dim value or an already two-dimensional stored max.
            abs_max_val = abs_max_val.reshape(1, 1)
        return 0, abs_max_val.to(inputs.device)

    def _update_min_max(self, min, max):
        """Widen the stored range with ``min``/``max`` when both are given."""
        if min is None or max is None:
            return
        if self._min is None or min < self._min:
            self._min = min
        if self._max is None or max > self._max:
            self._max = max

    def cal_thresholds(self):
        """Derive scale (the stored max) and an all-zero zero point, once."""
        if self._scale is None:
            self._scale = self._max
        self._zero_point = torch.zeros_like(self._scale)

    def quant_axis(self):
        """Return quantization axis."""
        return -1

    def scales(self):
        """Return the output scale, cast to the first observed input dtype.

        If this observer never ran but has a parent, the parent's statistics
        and step count are adopted first.

        Raises:
            ValueError: If no calibration step has been recorded.
        """
        if self.step == 0 and self.parent_observer is not None:
            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
            self.step = self.parent_observer.step
        if self.step == 0:
            raise ValueError(
                "MoEAbsmaxPertensorObserver scales must calibrate data first!"
            )
        if self._scale is None:
            self.cal_thresholds()
        if self.dtype is not None:
            self._scale = self._scale.type(self.dtype)
        return self._scale

    def zero_points(self):
        """Return output zero points."""
        if self._zero_point is None:
            self.cal_thresholds()
        return self._zero_point
0 commit comments