From cff217278c10bc1d8f3c09449bed00772ebffb9a Mon Sep 17 00:00:00 2001
From: jloveric
Date: Sat, 30 Dec 2023 18:28:49 -0800
Subject: [PATCH] Just some notes

---
 language_interpolation/state_space_network.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/language_interpolation/state_space_network.py b/language_interpolation/state_space_network.py
index d0929f7..e8b83c2 100644
--- a/language_interpolation/state_space_network.py
+++ b/language_interpolation/state_space_network.py
@@ -142,6 +142,8 @@ def __init__(self, args: ModelArgs):
 
         self.in_proj = nn.Linear(args.d_model, args.d_inner * 2, bias=args.bias)
 
+        # kernel_size is d_conv (typically 4); groups == in_channels makes
+        # this a depthwise convolution (one filter per channel)
         self.conv1d = nn.Conv1d(
             in_channels=args.d_inner,
             out_channels=args.d_inner,
@@ -185,7 +187,12 @@ class Mamba, https://github.com/state-spaces/mamba/blob/main/mamba_ssm/modules/m
         (x, res) = x_and_res.split(split_size=[self.args.d_inner, self.args.d_inner], dim=-1)
 
         x = rearrange(x, 'b l d_in -> b d_in l')
-        x = self.conv1d(x)[:, :, :l]
+        # Depthwise convolution over the sequence dimension. Why a convolution
+        # rather than an MLP over the channels? Likely because it needs far
+        # fewer parameters: one d_conv-tap filter per channel. It is also
+        # causal: with padding=d_conv - 1, the output at time t depends only
+        # on inputs at times <= t, so nothing leaks in from future steps.
+        x = self.conv1d(x)[:, :, :l]  # padding makes the raw output l + d_conv - 1 long; trim back to l
         x = rearrange(x, 'b d_in l -> b l d_in')
 
         x = F.silu(x)
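
Aside: a minimal runnable sketch (not from the patch itself) checking the
causality claim in the comments above. The names d_inner, d_conv, and l mirror
the code, but the values here are arbitrary; the Conv1d is built the same way,
the padded output is trimmed back to length l, and causality is verified by
perturbing a single time step:

    import torch
    import torch.nn as nn

    d_inner, d_conv, l = 8, 4, 16
    conv = nn.Conv1d(
        in_channels=d_inner,
        out_channels=d_inner,
        kernel_size=d_conv,
        groups=d_inner,      # groups == in_channels -> depthwise
        padding=d_conv - 1,  # zero-pads both sides by d_conv - 1
    )

    x = torch.randn(1, d_inner, l)  # (batch, channels, length)
    y = conv(x)[:, :, :l]           # raw output has length l + d_conv - 1

    # Perturb time step t; if the conv is causal, outputs before t are unchanged.
    t = l // 2
    x2 = x.clone()
    x2[:, :, t] += 1.0
    y2 = conv(x2)[:, :, :l]
    assert torch.allclose(y[:, :, :t], y2[:, :, :t])
    print("causal: outputs before the perturbed step are unchanged")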