1. Table of Contents

  • Implement the im2col+gemm operation: ✅
  • Add an INT8 quantizer to the im2col+gemm operation: ✅
  • Add SmoothQuant to the INT8-quantized operation: ✅
  • Verify the operation on different example inputs: ✅
  • Integrate the im2col+gemm SmoothQuant INT8-quantized layer into the model: ✅
  • Validate the accuracy of the layer:
    • on different example inputs: ✅
    • in the actual data flow of the model: ✅
    • on model accuracy: in progress (Delta only comes back online very late)

2. Implementation of im2col+gemm operation

import math

import torch
import torch.nn as nn
from torch.nn.parameter import Parameter

# assumed to be defined elsewhere in the project; shown here so the snippet is self-contained
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

class MyConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1):
        super(MyConv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        # torch.empty(..., device=DEVICE) keeps self.weight a registered Parameter;
        # calling .to(DEVICE) on a Parameter would return a plain, unregistered tensor
        self.weight = Parameter(torch.empty(out_channels, in_channels, kernel_size, kernel_size, device=DEVICE))

    def forward(self, input):
        h_in, w_in = input.shape[2:]

        h_out = math.floor((h_in + 2*self.padding - self.dilation*(self.kernel_size-1) - 1)/self.stride + 1)
        w_out = math.floor((w_in + 2*self.padding - self.dilation*(self.kernel_size-1) - 1)/self.stride + 1)

        # x: [bs, ksize, num_sliding] (im2col); dilation must be forwarded to unfold
        x = torch.nn.functional.unfold(input, kernel_size=self.kernel_size, dilation=self.dilation, padding=self.padding, stride=self.stride)

        bs = input.shape[0]
        ksize = self.in_channels*self.kernel_size*self.kernel_size
        num_sliding = x.shape[2]

        assert x.shape[1] == ksize

        # x: [bs*num_sliding, ksize]
        x = torch.transpose(x, 1, 2).reshape(-1, ksize)
        weight_flat = self.weight.view(self.out_channels, ksize)

        # (gemm)
        x = torch.mm(x, weight_flat.t())

        x = x.reshape(bs, num_sliding, self.out_channels)
        x = torch.transpose(x, 1, 2)

        x = x.reshape(bs, self.out_channels, h_out, w_out)

        return x
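For reference, the kind of spot check used in Section 5.1 can be reproduced by comparing MyConv2d against torch.nn.functional.conv2d on a random input (a minimal sketch; the shapes and tolerance here are illustrative, not the actual test):

# minimal sketch: MyConv2d should match F.conv2d when fed the same weight
conv = MyConv2d(in_channels=3, out_channels=8, kernel_size=3, stride=1, padding=1)
x = torch.randn(2, 3, 16, 16, device=DEVICE)
with torch.no_grad():
    conv.weight.normal_()  # the weight is uninitialized, so give it some values first
    ref = torch.nn.functional.conv2d(x, conv.weight, stride=1, padding=1)
    out = conv(x)
print(torch.allclose(out, ref, atol=1e-4))  # expected: True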

3. Add an INT8 quantizer to im2col+gemm

class MyInitQuantConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, input_quantizer=None, weight_quantizer=None):
        super(MyInitQuantConv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self._input_quantizer = input_quantizer
        self._weight_quantizer = weight_quantizer

        self.weight = Parameter(torch.empty(out_channels, in_channels, kernel_size, kernel_size, device=DEVICE))

    def forward(self, input):
        h_in, w_in = input.shape[2:]

        h_out = math.floor((h_in + 2*self.padding - self.dilation*(self.kernel_size-1) - 1)/self.stride + 1)
        w_out = math.floor((w_in + 2*self.padding - self.dilation*(self.kernel_size-1) - 1)/self.stride + 1)

        # x: [bs, ksize, num_sliding]
        x = torch.nn.functional.unfold(input, kernel_size=self.kernel_size, dilation=self.dilation, padding=self.padding, stride=self.stride)

        bs = input.shape[0]
        ksize = self.in_channels*self.kernel_size*self.kernel_size
        num_sliding = x.shape[2]

        assert x.shape[1] == ksize

        # x: [bs*num_sliding, ksize] (im2col)
        x = torch.transpose(x, 1, 2).reshape(-1, ksize)

        weight_flat = self.weight.view(self.out_channels, ksize)

        # fake-quantize both gemm operands to INT8 before the matmul
        x = self._input_quantizer(x)
        weight_flat = self._weight_quantizer(weight_flat)

        # (gemm)
        x = torch.mm(x, weight_flat.t())

        x = x.reshape(bs, num_sliding, self.out_channels)
        x = torch.transpose(x, 1, 2)

        x = x.reshape(bs, self.out_channels, h_out, w_out)
        return x
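The quantizers are injected through the constructor, so the class does not fix their implementation. For context, here is a minimal symmetric per-tensor fake quantizer of the kind that could be passed in (an illustrative sketch, not the actual quantizer used):

class FakeQuantINT8(nn.Module):
    """Symmetric per-tensor INT8 fake quantization (quantize, then dequantize)."""
    def forward(self, x):
        amax = x.abs().max().clamp(min=1e-8)                 # avoid a zero scale
        scale = amax / 127.0                                 # map [-amax, amax] onto [-127, 127]
        q = torch.clamp(torch.round(x / scale), -127, 127)
        return q * scale                                     # back to float for the fp32 gemm

layer = MyInitQuantConv2d(3, 8, 3, padding=1,
                          input_quantizer=FakeQuantINT8(),
                          weight_quantizer=FakeQuantINT8())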

4. Add SmoothQuant to the W8A8-quantized im2col+gemm layer

class MyQuantConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, input_quantizer=None, weight_quantizer=None):
        super(MyQuantConv2d, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self._input_quantizer = input_quantizer
        self._weight_quantizer = weight_quantizer

        self.weight = Parameter(torch.empty(out_channels, in_channels, kernel_size, kernel_size, device=DEVICE))

    def forward(self, input):
        h_in, w_in = input.shape[2:]

        h_out = math.floor((h_in + 2*self.padding - self.dilation*(self.kernel_size-1) - 1)/self.stride + 1)
        w_out = math.floor((w_in + 2*self.padding - self.dilation*(self.kernel_size-1) - 1)/self.stride + 1)

        # x: [bs, ksize, num_sliding]
        x = torch.nn.functional.unfold(input, kernel_size=self.kernel_size, dilation=self.dilation, padding=self.padding, stride=self.stride)

        bs = input.shape[0]
        ksize = self.in_channels*self.kernel_size*self.kernel_size
        num_sliding = x.shape[2]

        assert x.shape[1] == ksize

        # x: [bs*num_sliding, ksize] (im2col)
        x = torch.transpose(x, 1, 2).reshape(-1, ksize)

        weight_flat = self.weight.view(self.out_channels, ksize)

        # SmoothQuant: migrate quantization difficulty from activations to weights
        # using per-column scales over the ksize dimension
        tensor_x = x.abs().detach()
        tensor_weight = weight_flat.abs().detach()

        act_scale = torch.max(tensor_x, dim=0)[0]          # [ksize]: max |X_j| per column
        weight_scale = torch.max(tensor_weight, dim=0)[0]  # [ksize]: max |W_j| per column

        # migration strength alpha = 0.5; note that a zero weight_scale (or act_scale)
        # makes this inf/0, which may explain the NaN seen in the head layers (Section 5.2)
        scale = torch.sqrt(act_scale/weight_scale)

        x = x / scale
        weight_flat = weight_flat * scale

        x = self._input_quantizer(x)
        weight_flat = self._weight_quantizer(weight_flat)

        # (gemm)
        x = torch.mm(x, weight_flat.t())

        x = x.reshape(bs, num_sliding, self.out_channels)
        x = torch.transpose(x, 1, 2)

        x = x.reshape(bs, self.out_channels, h_out, w_out)
        return x
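The scale computed above is the α = 0.5 case of the SmoothQuant migration formula. With X the im2col matrix ([bs·num_sliding, ksize]) and W the flattened weight ([out_channels, ksize]):

s_j = \frac{\max_i |X_{ij}|^{\alpha}}{\max_k |W_{kj}|^{1-\alpha}}, \qquad
\alpha = 0.5 \;\Rightarrow\; s_j = \sqrt{\frac{\max_i |X_{ij}|}{\max_k |W_{kj}|}}

Because column j of X is divided by s_j while the matching column of W is multiplied by it, the full-precision product X·Wᵀ is unchanged; only the dynamic range moves from the activations into the weights, where it is easier to quantize.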

5.1 Verify operation through different example inputs

[figure: verification of the im2col+gemm layer on example inputs]

5.2 Verify operation through actual model

[figure: L1 loss against the original model, SmoothQuant + im2col+gemm (left) vs. baseline quantization (right)]

As the left-hand side shows, the L1 loss between the SmoothQuant model and the original model is much lower (about 1/4) than on the right-hand side, which suggests that the combination of im2col+gemm and SmoothQuant works.
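For reference, the L1-loss measurement itself is simple; a sketch of the comparison (model_fp, model_quant, and batch are illustrative names, and the actual model may return richer outputs):

# sketch: mean L1 distance between the original and the quantized model's outputs
with torch.no_grad():
    out_fp = model_fp(batch)      # original full-precision model
    out_q = model_quant(batch)    # model with MyQuantConv2d layers swapped in
    l1 = (out_fp - out_q).abs().mean()
print(l1.item())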

[figure: L1 loss when the head Conv2d layers are also replaced]

However, if we also replace the Conv2d layers in the head part of the model, we see a huge L1 loss and even NaN values. Therefore, we may exclude the head part for now and only replace the Conv2d layers inside backbone_2d.
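A sketch of how that selective replacement could look, assuming the model exposes a backbone_2d submodule as the group name suggests (the recursion and quantizer choice are illustrative; bias handling is omitted, matching the layer above):

def replace_conv2d(module):
    """Recursively swap plain nn.Conv2d layers for MyQuantConv2d."""
    for name, child in module.named_children():
        if isinstance(child, nn.Conv2d) and child.groups == 1:
            new_layer = MyQuantConv2d(child.in_channels, child.out_channels,
                                      child.kernel_size[0], stride=child.stride[0],
                                      padding=child.padding[0], dilation=child.dilation[0],
                                      input_quantizer=FakeQuantINT8(),
                                      weight_quantizer=FakeQuantINT8())
            new_layer.weight.data.copy_(child.weight.data)  # reuse the pretrained weight
            setattr(module, name, new_layer)
        else:
            replace_conv2d(child)

replace_conv2d(model.backbone_2d)  # leave the head untouched for now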

6. What’s next?

  • Rerun the L1-loss tests within the model to see which part (SmoothQuant or the im2col+gemm transformation) brings the greater benefit.
  • Run multiple tests on the 50X input scale while varying the SmoothQuant scaling factor, to see whether other values give better results (lower L1 loss).
  • Get the accuracy results of the model with only the Conv2d layers inside the backbone_2d group replaced.
  • Dig into why merely transforming the quantized convolution into quantized im2col+gemm already decreases the L1 loss so much.