Depthwise Separable Convolution

Introduction

Depthwise separable convolution reduces the memory and math bandwidth requirements for convolution in neural networks. Therefore, it is widely used for neural networks that are intended to run on edge devices.

In this blog post, I would like to briefly discuss depthwise separable convolution and compare its computation cost with ordinary convolution.

Depthwise Separable Convolution

We define $(K, C, R, S)$ as a convolution kernel that has a kernel shape of $(R, S)$, input channels of $C$, and output channels of $K$.

Depthwise separable convolution, sometimes referred to as separable conv, performs a $(1, 1, R, S)$ convolution for each input channel and concatenates all the convolution outputs as the intermediate output, followed by a $(K, C, 1, 1)$ convolution on the intermediate output.

If there is no bias term, ordinary convolution has $K \times C \times R \times S$ parameters, whereas depthwise separable convolution has $C \times R \times S + K \times C$ parameters. If there is a bias term, we need an additional $K$ parameters and $C + K$ parameters for ordinary convolution and depthwise separable convolution, respectively.

Let’s further take a look at the ratio of the number of parameters in ordinary convolution to the number of parameters in depthwise separable convolution. Assuming $R \times S \ll \min(K, C)$ and $1 \ll \min(K, C)$,

$$
\begin{align}
\frac{K \times C \times R \times S + K}{C \times R \times S + K \times C + C + K}
&\approx \frac{K \times C \times R \times S + K}{K \times C + C + K} \\
&= \frac{R \times S + \frac{1}{C}}{1 + \frac{1}{K} + \frac{1}{C}} \\
&\approx R \times S \\
\end{align}
$$

Therefore, depthwise separable convolution could have $R \times S$ times fewer parameters than ordinary convolution.

Convolution vs. Depthwise Separable Convolution

We implemented depthwise separable convolution using basic convolution operators in PyTorch, and measured the number of parameters and MACs for convolution and depthwise separable convolution that have exactly the same input shape and output shape.

conv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import torch
import torch.nn as nn


def count_parameters(module: nn.Module, trainable: bool = True) -> int:
    """Return the number of parameters in ``module``.

    Args:
        module: The PyTorch module whose parameters are counted.
        trainable: When ``True`` (default), count only parameters with
            ``requires_grad=True``; otherwise count all parameters.

    Returns:
        Total number of scalar parameter elements.
    """
    return sum(p.numel() for p in module.parameters()
               if p.requires_grad or not trainable)


def conv_parameters(in_channels, out_channels, kernel_size, bias) -> int:
    """Analytically compute the parameter count of an ordinary 2D convolution.

    Args:
        in_channels: Number of input channels ``C``.
        out_channels: Number of output channels ``K``.
        kernel_size: Spatial kernel shape ``(R, S)``.
        bias: Whether the convolution has a bias term.

    Returns:
        ``K * C * R * S`` plus ``K`` extra parameters when ``bias`` is set.
    """
    weight_count = in_channels * out_channels * kernel_size[0] * kernel_size[1]
    bias_count = out_channels if bias else 0
    return weight_count + bias_count


def separable_conv_parameters(in_channels, out_channels, kernel_size,
                              bias) -> int:
    """Analytically compute the parameter count of a depthwise separable conv.

    The depthwise stage contributes ``C * R * S`` weights and the pointwise
    stage contributes ``K * C`` weights.

    Args:
        in_channels: Number of input channels ``C``.
        out_channels: Number of output channels ``K``.
        kernel_size: Spatial kernel shape ``(R, S)`` of the depthwise stage.
        bias: Whether both stages have bias terms.

    Returns:
        ``C * R * S + K * C`` plus ``C + K`` extra parameters when ``bias``
        is set (one bias vector per stage).
    """
    depthwise_count = in_channels * kernel_size[0] * kernel_size[1]
    pointwise_count = in_channels * out_channels
    total = depthwise_count + pointwise_count
    if bias:
        total += in_channels + out_channels
    return total


class DepthwiseConv2D(nn.Module):
    """Thin wrapper around ``nn.Conv2d`` used as the depthwise stage.

    NOTE(review): ``groups`` defaults to ``1`` here, so this module is only a
    true depthwise convolution when the caller passes
    ``groups=in_channels`` (as ``SeparableConv2D`` does).
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=True,
                 padding_mode='zeros',
                 device=None,
                 dtype=None) -> None:
        super().__init__()
        # Every argument is forwarded verbatim to the underlying Conv2d.
        self.depthwise_conv = nn.Conv2d(in_channels=in_channels,
                                        out_channels=out_channels,
                                        kernel_size=kernel_size,
                                        stride=stride,
                                        padding=padding,
                                        dilation=dilation,
                                        groups=groups,
                                        bias=bias,
                                        padding_mode=padding_mode,
                                        device=device,
                                        dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the wrapped convolution to ``x``."""
        return self.depthwise_conv(x)


class PointwiseConv2D(nn.Module):
    """A 1x1 (pointwise) convolution mixing channels without spatial extent."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 bias=True,
                 device=None,
                 dtype=None) -> None:
        super().__init__()
        # Fixed 1x1 kernel, stride 1, no padding/dilation/grouping: the layer
        # only recombines channels and never changes the spatial size.
        self.pointwise_conv = nn.Conv2d(in_channels=in_channels,
                                        out_channels=out_channels,
                                        kernel_size=(1, 1),
                                        stride=1,
                                        padding=0,
                                        dilation=1,
                                        groups=1,
                                        bias=bias,
                                        padding_mode='zeros',
                                        device=device,
                                        dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the 1x1 convolution to ``x``."""
        return self.pointwise_conv(x)


class SeparableConv2D(nn.Module):
    """Depthwise separable convolution: depthwise stage then pointwise stage.

    The depthwise stage applies one ``(R, S)`` kernel per input channel
    (``groups=in_channels``); the pointwise stage mixes channels with a
    ``1x1`` convolution. The composite maps ``in_channels`` to
    ``out_channels`` with far fewer parameters than an ordinary convolution.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 bias=True,
                 padding_mode='zeros',
                 device=None,
                 dtype=None) -> None:
        super().__init__()
        # Depthwise stage: channel count is preserved and groups=in_channels
        # makes each filter see exactly one input channel.
        self.depthwise_conv = DepthwiseConv2D(in_channels=in_channels,
                                              out_channels=in_channels,
                                              kernel_size=kernel_size,
                                              stride=stride,
                                              padding=padding,
                                              dilation=dilation,
                                              groups=in_channels,
                                              bias=bias,
                                              padding_mode=padding_mode,
                                              device=device,
                                              dtype=dtype)
        # Pointwise stage: 1x1 conv projects in_channels -> out_channels.
        self.pointwise_conv = PointwiseConv2D(in_channels=in_channels,
                                              out_channels=out_channels,
                                              bias=bias,
                                              device=device,
                                              dtype=dtype)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the depthwise stage, then the pointwise stage."""
        return self.pointwise_conv(self.depthwise_conv(x))


if __name__ == "__main__":

    # Experiment configuration shared by both layers so that their input and
    # output shapes match exactly.
    input_size = (128, 128)
    in_channels = 8
    out_channels = 64
    kernel_size = (3, 3)
    bias = True

    conv = nn.Conv2d(in_channels=in_channels,
                     out_channels=out_channels,
                     kernel_size=kernel_size,
                     bias=bias)
    separable_conv = SeparableConv2D(in_channels=in_channels,
                                     out_channels=out_channels,
                                     kernel_size=kernel_size,
                                     bias=bias)

    num_parameters_conv = count_parameters(module=conv)
    num_parameters_separable_conv = count_parameters(module=separable_conv)

    # The measured parameter counts must agree with the analytic formulas.
    assert num_parameters_conv == conv_parameters(in_channels=in_channels,
                                                  out_channels=out_channels,
                                                  kernel_size=kernel_size,
                                                  bias=bias)
    assert num_parameters_separable_conv == separable_conv_parameters(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=kernel_size,
        bias=bias)

    # Both layers must produce identically shaped outputs for the same input.
    random_input = torch.rand((1, in_channels, *input_size))
    assert conv(random_input).shape == separable_conv(random_input).shape

    print(f"Input Size: {input_size}, In Channels: {in_channels}, "
          f"Out Channels: {out_channels}, Kernel Size: {kernel_size}, "
          f"Bias: {bias}.")
    print(f"Number of Parameters for Conv: {num_parameters_conv}.")
    print(f"Number of Parameters for Separable Conv: "
          f"{num_parameters_separable_conv}.")

    # MAC measurement is best-effort: ptflops is an optional dependency.
    # Was a bare `except: pass`, which also silenced genuine bugs (and even
    # KeyboardInterrupt); only the missing-package case is expected here.
    try:
        # pip install ptflops
        from ptflops import get_model_complexity_info
        conv_macs, params = get_model_complexity_info(
            model=conv,
            input_res=(in_channels, *input_size),
            as_strings=True,
            print_per_layer_stat=False,
            verbose=False)
        separable_conv_macs, params = get_model_complexity_info(
            model=separable_conv,
            input_res=(in_channels, *input_size),
            as_strings=True,
            print_per_layer_stat=False,
            verbose=False)
        print(f"Number of MACs for Conv: {conv_macs}.")
        print(f"Number of MACs for Separable Conv: {separable_conv_macs}.")
    except ImportError:
        print("ptflops not installed; skipping MAC measurement.")

We could see that under common conventional settings, depthwise separable convolution uses much fewer parameters and MACs compared to ordinary convolution.

1
2
3
4
5
6
$ python conv.py 
Input Size: (128, 128), In Channels: 8, Out Channels: 64, Kernel Size: (3, 3), Bias: True.
Number of Parameters for Conv: 4672.
Number of Parameters for Separable Conv: 656.
Number of MACs for Conv: 0.07 GMac.
Number of MACs for Separable Conv: 0.01 GMac.

References

Author

Lei Mao

Posted on

11-08-2021

Updated on

11-08-2021

Licensed under


Comments