Fix quant cache OOM (#494)
flozi00 committed May 30, 2024
1 parent 7d6b1d4 · commit 26e0982
Showing 1 changed file with 1 addition and 2 deletions.
server/lorax_server/models/flash_causal_lm.py: 1 addition & 2 deletions

@@ -813,8 +813,7 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int):

         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
-        cache_dtype = torch.uint8 if fp8_supported else self.dtype
-        dtype_size = torch.tensor([], dtype=cache_dtype).element_size()
+        dtype_size = torch.tensor([], dtype=self.dtype).element_size()
         cache_block_size = BLOCK_SIZE * self.num_kv_heads * self.head_size
         total_cache_size = self.num_layers * cache_block_size * 2 * dtype_size
