diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a48f3cac5fc..58322136606 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -39,7 +39,7 @@ if(GINKGO_BUILD_EXTLIB_EXAMPLE)
 endif()
 
 if(GINKGO_BUILD_MPI)
-    list(APPEND EXAMPLES_LIST distributed-spmv distributed-solver)
+    list(APPEND EXAMPLES_LIST distributed-spmv distributed-spmv-scaling distributed-solver)
 endif()
 
 find_package(OpenCV QUIET)
diff --git a/examples/distributed-spmv-scaling/CMakeLists.txt b/examples/distributed-spmv-scaling/CMakeLists.txt
new file mode 100644
index 00000000000..b00d44733b2
--- /dev/null
+++ b/examples/distributed-spmv-scaling/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(distributed-spmv-scaling distributed-spmv-scaling.cpp)
+target_link_libraries(distributed-spmv-scaling Ginkgo::ginkgo)
diff --git a/examples/distributed-spmv-scaling/build.sh b/examples/distributed-spmv-scaling/build.sh
new file mode 100755
index 00000000000..f4a66345f00
--- /dev/null
+++ b/examples/distributed-spmv-scaling/build.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# set up script
+if [ $# -ne 1 ]; then
+    echo -e "Usage: $0 GINKGO_BUILD_DIRECTORY"
+    exit 1
+fi
+BUILD_DIR=$1
+THIS_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &>/dev/null && pwd )
+
+source ${THIS_DIR}/../build-setup.sh
+
+# build
+mpic++ -std=c++14 -o ${THIS_DIR}/distributed-spmv-scaling ${THIS_DIR}/distributed-spmv-scaling.cpp \
+    -I${THIS_DIR}/../../include -I${BUILD_DIR}/include \
+    -L${THIS_DIR} ${LINK_FLAGS}
diff --git a/examples/distributed-spmv-scaling/distributed-spmv-scaling.cpp b/examples/distributed-spmv-scaling/distributed-spmv-scaling.cpp
new file mode 100644
index 00000000000..5c9b70905ba
--- /dev/null
+++ b/examples/distributed-spmv-scaling/distributed-spmv-scaling.cpp
@@ -0,0 +1,270 @@
+/*************************************************************
+Copyright (c) 2017-2021, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+// @sect3{Include files}
+
+// This is the main ginkgo header file.
+#include <ginkgo/ginkgo.hpp>
+
+// Add the fstream header to read data from files.
+#include <fstream>
+// Add the C++ iostream header to output information to the console.
+#include <iostream>
+// Add the STL map header for the executor selection
+#include <map>
+// Add the string manipulation header to handle strings.
+#include <string>
+#include <chrono>
+
+
+// Finally, we need the MPI header for MPI_Init and MPI_Finalize
+#include <mpi.h>
+
+
+/**
+ * Generates matrix data for a 2D stencil matrix. If restricted is set to true,
+ * creates a 5-pt stencil, if it is false creates a 9-pt stencil. If
+ * strong_scaling is set to true, creates the same problem size independent of
+ * the number of ranks; if it is false, the problem size grows with the number
+ * of ranks.
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_2d_stencil(
+    const IndexType dp, std::shared_ptr<gko::mpi::communicator> comm,
+    bool restricted, bool strong_scaling)
+{
+    const auto mat_size = strong_scaling ? dp * dp : dp * dp * comm->size();
+    const auto rows_per_rank = gko::ceildiv(mat_size, comm->size());
+    const auto start = rows_per_rank * comm->rank();
+    const auto end = gko::min(rows_per_rank * (comm->rank() + 1), mat_size);
+
+    auto A_data =
+        gko::matrix_data<ValueType, IndexType>(gko::dim<2>{mat_size, mat_size});
+
+    for (IndexType row = start; row < end; row++) {
+        auto i = row / dp;
+        auto j = row % dp;
+        for (IndexType d_i = -1; d_i <= 1; d_i++) {
+            for (IndexType d_j = -1; d_j <= 1; d_j++) {
+                if (!restricted || (d_i == 0 || d_j == 0)) {
+                    auto col = j + d_j + (i + d_i) * dp;
+                    if (col >= 0 && col < mat_size) {
+                        A_data.nonzeros.emplace_back(row, col,
+                                                     gko::one<ValueType>());
+                    }
+                }
+            }
+        }
+    }
+
+    return A_data;
+}
+
+
+/**
+ * Generates matrix data for a 3D stencil matrix. If restricted is set to true,
+ * creates a 7-pt stencil, if it is false creates a 27-pt stencil. If
+ * strong_scaling is set to true, creates the same problem size independent of
+ * the number of ranks; if it is false, the problem size grows with the number
+ * of ranks.
+ */
+template <typename ValueType, typename IndexType>
+gko::matrix_data<ValueType, IndexType> generate_3d_stencil(
+    const IndexType dp, std::shared_ptr<gko::mpi::communicator> comm,
+    bool restricted, bool strong_scaling)
+{
+    const auto mat_size =
+        strong_scaling ? dp * dp * dp : dp * dp * dp * comm->size();
+    const auto rows_per_rank = gko::ceildiv(mat_size, comm->size());
+    const auto start = rows_per_rank * comm->rank();
+    const auto end = gko::min(rows_per_rank * (comm->rank() + 1), mat_size);
+
+    auto A_data =
+        gko::matrix_data<ValueType, IndexType>(gko::dim<2>{mat_size, mat_size});
+
+    for (IndexType row = start; row < end; row++) {
+        auto i = row / (dp * dp);
+        auto j = (row % (dp * dp)) / dp;
+        auto k = row % dp;
+        for (IndexType d_i = -1; d_i <= 1; d_i++) {
+            for (IndexType d_j = -1; d_j <= 1; d_j++) {
+                for (IndexType d_k = -1; d_k <= 1; d_k++) {
+                    if (!restricted ||
+                        ((d_i == 0 && d_j == 0) || (d_i == 0 && d_k == 0) ||
+                         (d_j == 0 && d_k == 0))) {
+                        auto col =
+                            k + d_k + (j + d_j) * dp + (i + d_i) * dp * dp;
+                        if (col >= 0 && col < mat_size) {
+                            A_data.nonzeros.emplace_back(row, col,
+                                                         gko::one<ValueType>());
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return A_data;
+}
+
+
+int main(int argc, char* argv[])
+{
+    const auto fin = gko::mpi::init_finalize(argc, argv);
+    // Use some shortcuts. In Ginkgo, vectors are seen as a gko::matrix::Dense
+    // with one column/one row. The advantage of this concept is that using
+    // multiple vectors is now a natural extension of adding columns/rows as
+    // necessary.
+    using ValueType = double;
+    using GlobalIndexType = gko::distributed::global_index_type;
+    using LocalIndexType = GlobalIndexType;
+    using dist_mtx = gko::distributed::Matrix<ValueType, LocalIndexType>;
+    using dist_vec = gko::distributed::Vector<ValueType, LocalIndexType>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using part_type = gko::distributed::Partition<LocalIndexType>;
+
+    const auto comm = gko::mpi::communicator::create_world();
+    const auto rank = comm->rank();
+    const auto local_rank = comm->local_rank();
+
+    // Print the ginkgo version information.
+    if (rank == 0) {
+        std::cout << gko::version_info::get() << std::endl;
+    }
+
+    if (argc == 2 && (std::string(argv[1]) == "--help")) {
+        if (rank == 0) {
+            std::cerr << "Usage: " << argv[0]
+                      << " [executor] [DISCRETIZATION_POINTS] [2D] "
+                         "[RESTRICT_STENCIL] [STRONG_SCALING]"
+                      << std::endl;
+            std::cerr << "Default values:" << std::endl;
+            std::cerr << "  - executor: reference" << std::endl;
+            std::cerr << "  - DISCRETIZATION_POINTS: 100" << std::endl;
+            std::cerr << "  - 2D: 1" << std::endl;
+            std::cerr << "  - RESTRICT_STENCIL: 0" << std::endl;
+            std::cerr << "  - STRONG_SCALING: 1" << std::endl;
+        }
+        std::exit(-1);
+    }
+
+    const auto executor_string = argc >= 2 ? argv[1] : "reference";
+    std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
+        exec_map{
+            {"omp", [] { return gko::OmpExecutor::create(); }},
+            {"cuda",
+             [local_rank] {
+                 return gko::CudaExecutor::create(
+                     local_rank, gko::ReferenceExecutor::create(), true);
+             }},
+            {"hip",
+             [local_rank] {
+                 return gko::HipExecutor::create(
+                     local_rank, gko::ReferenceExecutor::create(), true);
+             }},
+            {"dpcpp",
+             [local_rank] {
+                 return gko::DpcppExecutor::create(
+                     local_rank, gko::ReferenceExecutor::create());
+             }},
+            {"reference", [] { return gko::ReferenceExecutor::create(); }}};
+
+    // executor where Ginkgo will perform the computation
+    const auto exec = exec_map.at(executor_string)();  // throws if not valid
+
+    const auto dp = argc >= 3 ? atoi(argv[2]) : 100;
+    const bool two_dim = argc >= 4 ? atoi(argv[3]) > 0 : true;
+    const bool restricted = argc >= 5 ? atoi(argv[4]) > 0 : false;
+    const bool strong_scaling = argc >= 6 ? atoi(argv[5]) > 0 : true;
+
+    // Generate matrix data on each rank
+    if (rank == 0) {
+        std::cout << "Generating stencil matrix..." << std::endl;
+    }
+    auto A_data = two_dim ? generate_2d_stencil<ValueType, GlobalIndexType>(
+                                dp, comm, restricted, strong_scaling)
+                          : generate_3d_stencil<ValueType, GlobalIndexType>(
+                                dp, comm, restricted, strong_scaling);
+    const auto mat_size = A_data.size[0];
+    const auto rows_per_rank = mat_size / comm->size();
+
+    // build partition: uniform number of rows per rank
+    gko::Array<GlobalIndexType> ranges_array{
+        exec->get_master(), static_cast<gko::size_type>(comm->size() + 1)};
+    for (int i = 0; i < comm->size(); i++) {
+        ranges_array.get_data()[i] = i * rows_per_rank;
+    }
+    ranges_array.get_data()[comm->size()] = mat_size;
+    auto partition = gko::share(
+        part_type::build_from_contiguous(exec->get_master(), ranges_array));
+
+    // Build global matrix from local matrix data.
+    auto h_A = dist_mtx::create(exec->get_master(), comm);
+    auto A = dist_mtx::create(exec, comm);
+    h_A->read_distributed(A_data, partition);
+    A->copy_from(h_A.get());
+
+    // Set up global vectors for the distributed SpMV
+    if (rank == 0) {
+        std::cout << "Setting up vectors..." << std::endl;
+    }
+    const auto local_size =
+        ranges_array.get_data()[rank + 1] - ranges_array.get_data()[rank];
+    auto x = dist_vec::create(exec, comm, partition, gko::dim<2>{mat_size, 1},
+                              gko::dim<2>{local_size, 1});
+    x->fill(gko::one<ValueType>());
+    auto b = dist_vec::create(exec, comm, partition, gko::dim<2>{mat_size, 1},
+                              gko::dim<2>{local_size, 1});
+    b->fill(gko::one<ValueType>());
+
+    // Do a warmup run
+    if (rank == 0) {
+        std::cout << "Warming up..." << std::endl;
+    }
+    A->apply(lend(x), lend(b));
+
+    // Do and time the actual benchmark runs
+    if (rank == 0) {
+        std::cout << "Running benchmark..." << std::endl;
+    }
+    auto tic = std::chrono::steady_clock::now();
+    for (auto i = 0; i < 100; i++) {
+        A->apply(lend(x), lend(b));
+        exec->synchronize();
+    }
+    auto toc = std::chrono::steady_clock::now();
+
+    if (rank == 0) {
+        std::chrono::duration<double> duration = toc - tic;
+        std::cout << "DURATION: " << duration.count() << "s" << std::endl;
+    }
+}
diff --git a/examples/distributed-spmv-scaling/doc/builds-on b/examples/distributed-spmv-scaling/doc/builds-on
new file mode 100644
index 00000000000..dbf16906746
--- /dev/null
+++ b/examples/distributed-spmv-scaling/doc/builds-on
@@ -0,0 +1 @@
+distributed-spmv
diff --git a/examples/distributed-spmv-scaling/doc/intro.dox b/examples/distributed-spmv-scaling/doc/intro.dox
new file mode 100644
index 00000000000..c8f39263b9e
--- /dev/null
+++ b/examples/distributed-spmv-scaling/doc/intro.dox
@@ -0,0 +1,6 @@
+<a name="Intro"></a>
+<h1>Introduction</h1>
+This example helps you inspect the scaling behaviour of the distributed sparse matrix-vector product (SpMV) in Ginkgo.
+You can select the hardware architecture to run on via the executor argument, choose between a 2D five-point,
+a 2D nine-point, a 3D seven-point, or a 3D 27-point stencil matrix, control the problem size, and select
+whether you want to inspect weak or strong scaling.
diff --git a/examples/distributed-spmv-scaling/doc/kind b/examples/distributed-spmv-scaling/doc/kind
new file mode 100644
index 00000000000..196aa616342
--- /dev/null
+++ b/examples/distributed-spmv-scaling/doc/kind
@@ -0,0 +1 @@
+distributed
diff --git a/examples/distributed-spmv-scaling/doc/results.dox b/examples/distributed-spmv-scaling/doc/results.dox
new file mode 100644
index 00000000000..043176dbea7
--- /dev/null
+++ b/examples/distributed-spmv-scaling/doc/results.dox
@@ -0,0 +1,14 @@
+<h1>Results</h1>
+The following is the expected result:
+
+@code{.cpp}
+
+Generating stencil matrix...
+Setting up vectors...
+Warming up...
+Running benchmark...
+DURATION: 0.0114743s
+
+@endcode
+
+<h3> Comments about programming and debugging </h3>
diff --git a/examples/distributed-spmv-scaling/doc/short-intro b/examples/distributed-spmv-scaling/doc/short-intro
new file mode 100644
index 00000000000..6862e6a495f
--- /dev/null
+++ b/examples/distributed-spmv-scaling/doc/short-intro
@@ -0,0 +1 @@
+The distributed SpMV scaling behaviour example.
diff --git a/examples/distributed-spmv-scaling/doc/tooltip b/examples/distributed-spmv-scaling/doc/tooltip
new file mode 100644
index 00000000000..aade238e62e
--- /dev/null
+++ b/examples/distributed-spmv-scaling/doc/tooltip
@@ -0,0 +1 @@
+Computes and times a distributed sparse matrix-vector product (SpMV).
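
A minimal usage sketch, assuming the example has been built either through the CMake target or with build.sh above, and that mpirun is the available MPI launcher; the build path, rank counts, and parameter values are illustrative assumptions only. The positional arguments follow the order documented in the example's --help output (executor, DISCRETIZATION_POINTS, 2D, RESTRICT_STENCIL, STRONG_SCALING).

@code{.sh}
# Build against an existing Ginkgo build tree (placeholder path).
./build.sh /path/to/ginkgo/build

# Strong scaling: 2D 9-point stencil on a 100x100 grid, 4 ranks, reference executor.
mpirun -n 4 ./distributed-spmv-scaling reference 100 1 0 1

# Weak scaling: restricted 3D 7-point stencil, roughly 50^3 rows per rank, 8 ranks, one GPU per rank.
mpirun -n 8 ./distributed-spmv-scaling cuda 50 0 1 0
@endcode

Each run reports a single DURATION line from rank 0, giving the accumulated time of the 100 timed SpMV applications for that configuration.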