FMInference · julian-q · Sep 16, 2023 · Sep 16, 2023
diff --git a/flexgen/compression.py b/flexgen/compression.py
@@ -33,16 +33,16 @@ def allocate(self, shape, dtype, comp_config, pin_memory=None, name=None):
  """Allocate a compressed TorchTensor. Round up the shape to group boundary."""
  assert comp_config.num_bits == 4 and dtype == np.float16
 
- group_size, group_dim = comp_config.group_size, comp_config.group_dim
+ group_size, group_dim, symmetric = comp_config.group_size, comp_config.group_dim, comp_config.symmetric
 
  # Round up
  num_groups = (shape[group_dim] + group_size - 1) // group_size
  data_shape = (
  shape[:group_dim] + (num_groups * (group_size // 2),) + shape[group_dim+1:])
  scale_shape = (
- shape[:group_dim] + (num_groups, 2) + shape[group_dim+1:])
+ shape[:group_dim] + (num_groups, 1 if symmetric else 2) + shape[group_dim+1:])
 
- data = self.base_device.allocate(data_shape, np.uint8, pin_memory=pin_memory)
+ data = self.base_device.allocate(data_shape, np.int8 if symmetric else np.uint8, pin_memory=pin_memory)
  scale = self.base_device.allocate(scale_shape, np.float16, pin_memory=pin_memory)
 
  return TorchTensor(shape, np_dtype_to_torch_dtype[dtype],
@@ -89,7 +89,7 @@ def compress(self, tensor, comp_config):
  group_size, num_bits, group_dim, symmetric = (
  comp_config.group_size, comp_config.num_bits,
  comp_config.group_dim, comp_config.symmetric)
- assert num_bits == 4 and group_size % 2 == 0 and not symmetric
+ assert num_bits == 4 and group_size % 2 == 0
 
  if tensor.device.type == "cpu" and tensor.dtype == torch.float16:
  tensor = tensor.float()
@@ -110,14 +110,21 @@ def compress(self, tensor, comp_config):
  data = tensor.view(new_shape)
 
  # Quantize
- B = 2 ** num_bits - 1
- mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
- mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
-
- scale = B / (mx - mn)
- data = data - mn
- data.mul_(scale)
- data = data.clamp_(0, B).round_().to(torch.uint8)
+ if symmetric:
+ B = 2 ** (num_bits - 1) - 1
+ mn = None
+ mx = torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
+ scale = B / mx
+ data = data * scale
+ data = data.clamp_(-(B + 1), B).round_().to(torch.int8)
+ else:
+ B = 2 ** num_bits - 1
+ mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
+ mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
+ scale = B / (mx - mn)
+ data = data - mn
+ data.mul_(scale)
+ data = data.clamp_(0, B).round_().to(torch.uint8)
 
  # Pack
  left_indices = (
@@ -127,15 +134,19 @@ def compress(self, tensor, comp_config):
  tuple(slice(0, x) for x in data.shape[:group_dim+1]) +
  (slice(1, data.shape[group_dim+1], 2),))
  data = torch.bitwise_or(
- data[left_indices].bitwise_left_shift(4), data[right_indices])
+ data[left_indices].bitwise_left_shift(4),
+ data[right_indices].bitwise_and(0xF))
 
  # Reshape
  data_shape = (
  shape[:group_dim] + (num_groups * (group_size // 2),) + shape[group_dim+1:])
- scale_shape = (
- shape[:group_dim] + (num_groups, 2) + shape[group_dim+1:])
  data = data.view(data_shape)
- scale = torch.cat([scale, mn], dim=group_dim+1).view(scale_shape)
+ scale_shape = (
+ shape[:group_dim] + (num_groups, 1 if symmetric else 2) + shape[group_dim+1:])
+ if symmetric:
+ scale = scale.view(scale_shape)
+ else:
+ scale = torch.cat([scale, mn], dim=group_dim+1).view(scale_shape)
 
  data = TorchTensor.create_from_torch(data, self.base_device)
  scale = TorchTensor.create_from_torch(scale, self.base_device)
@@ -182,12 +193,16 @@ def decompress(self, tensor):
  tuple(slice(0, x) for x in data.shape[:group_dim+1]) +
  (slice(1, data.shape[group_dim+1], 2),))
  data[left_indices] = packed.bitwise_right_shift(4)
- data[right_indices] = packed.bitwise_and(0xF)
+ data[right_indices] = packed.bitwise_and(0xF).bitwise_left_shift(4).bitwise_right_shift(4)
 
  # Dequantize
- scale, mn = scale.data.split(1, dim=group_dim + 1)
- data.div_(scale)
- data.add_(mn)
+ if symmetric:
+ scale = scale.data
+ data = data / scale
+ else:
+ scale, mn = scale.data.split(1, dim=group_dim + 1)
+ data.div_(scale)
+ data.add_(mn)
 
  # Reshape
  unpad_len = (group_size - tensor.shape[group_dim] % group_size) % group_size
@@ -281,7 +296,7 @@ def compress(tensor, config):
  B = 2 ** (num_bits - 1) - 1
  scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
  data = data * scale
- data = data.clamp_(-B, B).round_().to(torch.int8)
+ data = data.clamp_(-(B + 1), B).round_().to(torch.int8)
  return data, scale, original_shape
  else:
  B = 2 ** num_bits - 1

diff --git a/flexgen/flex_opt.py b/flexgen/flex_opt.py
@@ -1205,7 +1205,7 @@ def run_flexgen(args):
  group_dim=0, symmetric=False),
  args.compress_cache,
  CompressionConfig(num_bits=4, group_size=64,
- group_dim=2, symmetric=False))
+ group_dim=2, symmetric=True))
  assert not (args.compress_cache and args.attn_sparsity < 1.0), "Not implemented"
 
  opt_config = get_opt_config(args.model)