-
Notifications
You must be signed in to change notification settings - Fork 28
/
blockdiag_multiply.py
82 lines (69 loc) · 2.8 KB
/
blockdiag_multiply.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Adapted from https://github.com/HazyResearch/fly/tree/master/src/models/layers
import numpy as np
import torch
from torch.nn import functional as F
from einops import rearrange
def blockdiag_weight_to_dense_weight(weight):
"""
Argumments:
weight: (nblocks, out / nblocks, in / blocks)
Return:
dense_weight: (out / in)
"""
return torch.block_diag(*torch.unbind(weight, dim=0))
def blockdiag_multiply_reference(x, weight):
"""
This implementation is slow but more likely to be correct.
Arguments:
x: (..., n)
weight: (nblocks, q, n / nblocks)
Outputs:
out: (..., nblocks * q)
"""
n = x.shape[-1]
nblocks, q, p = weight.shape
assert nblocks * p == n
x_reshaped = rearrange(x, '... (nblocks p) -> ... nblocks p', nblocks=nblocks)
return rearrange(torch.einsum('...kp, kqp -> ...kq', x_reshaped, weight),
'... nblocks q -> ... (nblocks q)')
class BlockdiagMultiply(torch.autograd.Function):
"""This is a faster implementation, with careful memory copies for the fastest
bmm performance.
The backward pass is also written manually with careful memory copies.
Arguments:
x: (..., n)
weight: (nblocks, q, n / nblocks)
Outputs:
out: (..., nblocks * q)
"""
@staticmethod
@torch.cuda.amp.custom_fwd(cast_inputs=torch.bfloat16)
def forward(ctx, x, weight):
ctx.save_for_backward(x, weight)
batch_shape, n = x.shape[:-1], x.shape[-1]
batch_dim = np.prod(batch_shape)
nblocks, q, p = weight.shape
assert nblocks * p == n
x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
out = torch.empty(batch_dim, nblocks, q, device=x.device, dtype=x.dtype).transpose(0, 1)
out = torch.bmm(x_reshaped, weight.transpose(-1, -2), out=out).transpose(0, 1)
return out.reshape(*batch_shape, nblocks * q)
@staticmethod
@torch.cuda.amp.custom_bwd
def backward(ctx, dout):
x, weight = ctx.saved_tensors
batch_shape, n = x.shape[:-1], x.shape[-1]
batch_dim = np.prod(batch_shape)
nblocks, q, p = weight.shape
assert nblocks * p == n
dx, dweight = None, None
dout_reshaped = dout.reshape(batch_dim, nblocks, q).transpose(0, 1)
if ctx.needs_input_grad[0]:
dx = torch.empty(batch_dim, nblocks, p, device=x.device, dtype=x.dtype)
dx = torch.bmm(dout_reshaped, weight.conj(),
out=dx.transpose(0, 1)).transpose(0, 1).reshape(*batch_shape, n)
if ctx.needs_input_grad[1]:
x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
dweight = torch.bmm(dout_reshaped.transpose(-1, -2), x_reshaped.conj())
return dx, dweight
blockdiag_multiply = BlockdiagMultiply.apply