diff --git a/benchmarks/README.md b/benchmarks/README.md
index 49a57b1b9277..0819b61d189b 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -7,3 +7,4 @@
 
 ## TODO
 - **measure performance ratio**: This increases benchmark time as it requires benchmarking pure PyTorch implementation.
+- **clean up**: Reuse code across benchmarks
\ No newline at end of file
diff --git a/benchmarks/benchmarks/__init__.py b/benchmarks/benchmarks/__init__.py
index e69de29bb2d1..4819e512155f 100644
--- a/benchmarks/benchmarks/__init__.py
+++ b/benchmarks/benchmarks/__init__.py
@@ -0,0 +1,13 @@
+import os
+
+RUN_ALL = bool(int(os.environ.get("PYG_BENCH_RUN_ALL", "0")))
+RUN_CPU = RUN_ALL and bool(int(os.environ.get("PYG_BENCH_RUN_CPU", "0")))
+RUN_CUDA = RUN_ALL and bool(int(os.environ.get("PYG_BENCH_RUN_CUDA", "1")))
+
+devices = []
+
+if RUN_CPU:
+    devices.append("cpu")
+
+if RUN_CUDA:
+    devices.append("cuda")
diff --git a/benchmarks/benchmarks/utils.py b/benchmarks/benchmarks/utils.py
new file mode 100644
index 000000000000..0e7b86c97308
--- /dev/null
+++ b/benchmarks/benchmarks/utils.py
@@ -0,0 +1,312 @@
+import torch
+from torch.utils.benchmark import Timer
+
+import torch_geometric
+from torch_geometric.typing import SparseTensor
+from torch_geometric.utils import (
+    dense_to_sparse,
+    is_sparse,
+    is_torch_sparse_tensor,
+    scatter,
+    softmax,
+    spmm,
+    to_edge_index,
+    to_torch_coo_tensor,
+    to_torch_csc_tensor,
+    to_torch_csr_tensor,
+)
+
+WITH_TORCH_SCATTER = True
+try:
+    import torch_scatter
+except ImportError:
+    WITH_TORCH_SCATTER = False
+
+
+def pytorch_scatter(x, index, dim_size, reduce):
+    if reduce == "min" or reduce == "max":
+        reduce = f"a{reduce}"  # `amin` or `amax`
+    elif reduce == "mul":
+        reduce = "prod"
+    out = x.new_zeros((dim_size, x.size(-1)))
+    include_self = reduce in ["sum", "mean"]
+    index = index.view(-1, 1).expand(-1, x.size(-1))
+    out.scatter_reduce_(0, index, x, reduce, include_self=include_self)
+    return out
+
+
+def own_scatter(x, index, dim_size, reduce):
+    return torch_scatter.scatter(x, index, dim=0, dim_size=dim_size,
+                                 reduce=reduce)
+
+
+def optimized_scatter(x, index, dim_size, reduce):
+    return scatter(x, index, dim=0, dim_size=dim_size, reduce=reduce)
+
+
+def pytorch_index_add(x, index, dim_size, reduce):
+    out = x.new_zeros(dim_size, x.size(-1))
+    out.index_add_(0, index, x)
+    return out
+
+
+def grads_like(x):
+    return torch.ones_like(x, requires_grad=True)
+
+
+class Scatter:
+    param_names = ["f", "reduce", "num_nodes, num_edges", "device"]
+    params = [
+        [pytorch_scatter, own_scatter, optimized_scatter, pytorch_index_add],
+        ["sum", "mean", "min", "max", "mul"],
+        [(4_000, 4_000 * 50), (16_000, 16_000 * 50), (64_000, 64_000 * 50)],
+        ["cuda"],  # TODO: Enable "cpu"
+    ]
+    unit = "us"
+
+    def setup(self, *params):
+        f, reduce, (num_nodes, num_edges), device = params
+
+        if f is own_scatter and not WITH_TORCH_SCATTER:
+            raise NotImplementedError
+
+        if f is pytorch_index_add and reduce != "sum":
+            raise NotImplementedError
+
+        self.globals = {
+            "x": torch.randn(num_edges, 64, device=device, requires_grad=True),
+            "index": torch.randint(num_nodes, (num_edges,), device=device),
+            "dim_size": num_nodes,
+            "reduce": reduce,
+        }
+
+    def track_fwd(self, *params):
+        f, *_ = params
+        t = Timer(
+            stmt=f"{f.__name__}(x, index, dim_size, reduce)",
+            setup=f"from {__name__} import {f.__name__}",
+            globals=self.globals,
+            num_threads=4,
+            label="scatter",
+            sub_label=f.__name__,
+            description=self.globals["reduce"],
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+    def track_bwd(self, *params):
+        f, *_ = params
+        t = Timer(
+            stmt="out.backward(out_grad, retain_graph=True)",
+            setup=(
+                f"from {__name__} import {f.__name__}, grads_like\n"
+                f"out = {f.__name__}(x, index, dim_size, reduce)\n"
+                "out_grad = grads_like(out)"
+            ),
+            globals=self.globals,
+            num_threads=4,
+            label="scatter",
+            sub_label=f.__name__,
+            description=self.globals["reduce"],
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+
+class Sparse:
+    param_names = ["f", "num_nodes, num_edges", "device"]
+    params = [
+        [
+            SparseTensor.from_edge_index,
+            to_torch_coo_tensor,
+            to_torch_csr_tensor,
+            to_torch_csc_tensor,
+        ],
+        [(10_000, 200_000)],
+        ["cuda"],  # TODO: Enable "cpu"
+    ]
+    unit = "us"
+
+    def setup(self, *params):
+        f, (num_nodes, num_edges), device = params
+
+        self.globals = {
+            "f": f,
+            "edge_index": torch.randint(num_nodes, (2, num_edges),
+                                        device=device),
+            "size": num_nodes,
+        }
+
+    def track_fwd(self, *params):
+        f, *_ = params
+        t = Timer(
+            stmt="f(edge_index, None, (size, size))",
+            globals=self.globals,
+            num_threads=4,
+            label="sparse",
+            sub_label=f.__name__,
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+
+class Spmm:
+    param_names = ["layout", "reduce", "num_nodes, num_edges", "device"]
+    params = [
+        [torch.sparse_coo, torch.sparse_csr, torch.sparse_csc],
+        ["sum", "mean"],  # TODO: if not cuda, add ["min", "max"]
+        [(10_000, 200_000)],
+        ["cuda"],  # TODO: Enable "cpu"
+    ]
+    unit = "us"
+
+    def setup(self, *params):
+        layout, reduce, (num_nodes, num_edges), device = params
+        x = torch.randn(num_nodes, 64, device=device, requires_grad=True)
+        edge_index = torch.randint(num_nodes, (2, num_edges), device=device)
+        adj = to_torch_coo_tensor(edge_index,
+                                  size=num_nodes).to_sparse(layout=layout)
+        self.globals = {
+            "adj": adj,
+            "x": x,
+            "reduce": reduce,
+        }
+
+    def track_fwd(self, *params):
+        layout, *_ = params
+        t = Timer(
+            stmt="spmm(adj, x, reduce)",
+            setup="from torch_geometric.utils import spmm",
+            globals=self.globals,
+            num_threads=4,
+            label="spmm",
+            sub_label=layout,
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+    def track_bwd(self, *params):
+        layout, *_ = params
+        t = Timer(
+            stmt="out.backward(out_grad, retain_graph=True)",
+            setup=(
+                "from torch_geometric.utils import spmm\n"
+                f"from {__name__} import grads_like\n"
+                "out = spmm(adj, x, reduce)\n"
+                "out_grad = grads_like(out)"
+            ),
+            globals=self.globals,
+            num_threads=4,
+            label="spmm_bwd",
+            sub_label=layout,
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+
+def trivial_map(src, index, max_index, inclusive):
+    if max_index is None:
+        max_index = max(src.max(), index.max())
+
+    if inclusive:
+        assoc = src.new_empty(max_index + 1)
+    else:
+        assoc = src.new_full((max_index + 1,), -1)
+    assoc[index] = torch.arange(index.numel(), device=index.device)
+    out = assoc[src]
+
+    if inclusive:
+        return out, None
+    else:
+        mask = out != -1
+        return out[mask], mask
+
+
+from torch_geometric.utils.map import map_index
+
+
+class Map:
+    param_names = ["f", "device"]
+    params = [
+        [trivial_map, map_index],
+        ["cpu"],  # TODO: Enable "cuda" if cudf is installed
+    ]
+    unit = "us"
+
+    def setup(self, *params):
+        f, device = params
+        src = torch.randint(0, 100_000_000, (100_000,), device=device)
+        index = src.unique()
+        self.globals = {
+            "f": f,
+            "src": src,
+            "index": index,
+        }
+
+    def track_inclusive(self, *_):
+        t = Timer(
+            stmt="f(src, index, None, True)",
+            globals=self.globals,
+            num_threads=4,
+            label="map",
+            sub_label=" ",
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+    def track_exclusive(self, *_):
+        t = Timer(
+            stmt="f(src, index[:50_000], None, False)",
+            globals=self.globals,
+            num_threads=4,
+            label="map",
+            sub_label=" ",
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+
+def dense_softmax(x, index):
+    x = x.view(x.size(0), -1, x.size(-1))
+    return x.softmax(dim=-1)
+
+
+class Softmax:
+    param_names = ["f", "compile", "num_nodes, num_edges", "device"]
+    params = [
+        [softmax, dense_softmax],
+        [False, True],
+        [(10_000, 200_000)],
+        ["cuda"],  # TODO: Enable "cpu"
+    ]
+    unit = "us"
+
+    def setup(self, *params):
+        f, compile, (num_nodes, num_edges), device = params
+        self.globals = {
+            "f": torch_geometric.compile(f) if compile else f,
+            "x": torch.randn(num_edges, 64, device=device, requires_grad=True),
+            "index": torch.randint(num_nodes, (num_edges,), device=device),
+        }
+
+    def track_fwd(self, *_):
+        t = Timer(
+            stmt="f(x, index)",
+            globals=self.globals.copy(),
+            num_threads=4,
+            label="softmax_fwd",
+            sub_label=" ",
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
+
+    def track_bwd(self, *_):
+        t = Timer(
+            stmt="out.backward(out_grad, retain_graph=True)",
+            setup=(
+                f"from {__name__} import grads_like\n"
+                "out = f(x, index)\n"
+                "out_grad = grads_like(out)"
+            ),
+            globals=self.globals,
+            num_threads=1,
+            label="softmax_bwd",
+            sub_label=" ",
+            description=" ",
+        )
+        m = t.blocked_autorange(min_run_time=1)
+        return m.median * 1_000**2  # us
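
The classes above follow the airspeed-velocity (asv) `track_*` conventions (`params`/`param_names`, a `setup` hook, and `track_*` methods that return a value in `unit`), so each parameter combination can also be exercised by hand. Below is a minimal sketch of driving one `Scatter` configuration directly; the import path `benchmarks.utils` and the availability of a CUDA device are assumptions on my part, not part of this diff.

```python
# Minimal sketch: run one `Scatter` configuration without a benchmark runner.
# Assumes a CUDA device and that the outer `benchmarks/` directory is on
# sys.path, so this file imports as `benchmarks.utils` (the Timer setup
# strings import helpers via `from {__name__} import ...`).
from benchmarks.utils import Scatter, optimized_scatter

bench = Scatter()
params = (optimized_scatter, "sum", (4_000, 4_000 * 50), "cuda")
bench.setup(*params)  # builds the `globals` dict of benchmark inputs
print(f"fwd: {bench.track_fwd(*params):.1f} us")  # median forward time
print(f"bwd: {bench.track_bwd(*params):.1f} us")  # median backward time
```

The same pattern applies to `Sparse`, `Spmm`, `Map`, and `Softmax`; a benchmark runner would instead enumerate the cross product of `params` and skip combinations whose `setup` raises `NotImplementedError`.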