WIP: Refactor Load Balancing #541

Open
wants to merge 42 commits into master
Commits (42)
7a5fa78
Rename SparseDimPartitioner into DynamicPartitioner
squarefk Jun 13, 2022
8c8e975
Clean up optimizePartitionAlongDim
squarefk Jun 13, 2022
ae62b97
Rename computeLocalWorkLoad into setLocalWorkloadByParticles & setLoc…
squarefk Jun 13, 2022
777f942
Simplify optimizePartitionAlongDim
squarefk Jun 14, 2022
416b358
Fix cmake inside benchmark/cajita
squarefk Jun 14, 2022
cfcbeb7
Fix compile error
squarefk Jun 20, 2022
6c80d6b
Format
squarefk Jun 20, 2022
588a524
Add SparseMapDynamicPartitioner and ParticleDynamicPartitioner
squarefk Jun 20, 2022
4273392
Fix compile
squarefk Jun 21, 2022
562547f
Fix tstDynamicPartitioner
squarefk Jun 23, 2022
04f615e
Use base::base
squarefk Jun 28, 2022
d7ed955
Separate test files
squarefk Jun 28, 2022
eca5b0d
Separate class files
squarefk Jun 28, 2022
2120482
Separate performance test files
squarefk Jun 28, 2022
5ab6398
Clean up
squarefk Jun 28, 2022
ceecf4a
Rename setLocalWorkload
squarefk Jun 28, 2022
45559fb
Format
squarefk Jun 28, 2022
3b817f0
Format
squarefk Jun 28, 2022
bfce317
Add WorkloadSetter class
squarefk Jul 7, 2022
57a04bf
Use WorkloadSetter
squarefk Jul 12, 2022
a0d9c55
Fix benchmark compile
squarefk Jul 12, 2022
b266f21
Format
squarefk Jul 12, 2022
cc38788
Add comments
squarefk Jul 12, 2022
e19ad62
Rename optimizePartitionAlongDim into updatePartition
squarefk Jul 21, 2022
7d11253
Fix LAMBDA
squarefk Jul 21, 2022
6ad716f
Rename run into compute
squarefk Jul 21, 2022
2109f2f
Rename setter into measurer
squarefk Jul 21, 2022
8576b8d
Format
squarefk Jul 21, 2022
d39e1fa
Detailed comment
squarefk Jul 21, 2022
2d96eaa
Clean up LAMBDA
squarefk Jul 21, 2022
78e3d78
Rebase master and fix compile
squarefk Jul 21, 2022
1735939
Update cajita/src/Cajita_ParticleDynamicPartitioner.hpp
squarefk Jul 26, 2022
ade851c
Clean up workload tags
squarefk Jul 26, 2022
c6c70ba
Rename _proxy into _copy
squarefk Jul 26, 2022
ec7b0b5
Clean up constructor
squarefk Jul 26, 2022
16cea3e
Format
squarefk Jul 26, 2022
88ed450
Move initializeRecPartition into constructor
squarefk Jul 26, 2022
46ccece
Fix view
squarefk Aug 4, 2022
9b3aeb1
Fix compile
squarefk Aug 4, 2022
d989a81
Format
squarefk Aug 4, 2022
4698e0c
Add comments
squarefk Aug 8, 2022
76578b7
Fix compiling NVCC 11.4
squarefk Sep 6, 2022
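
Taken together, these commits replace SparseDimPartitioner with a DynamicPartitioner whose workload is supplied through a separate measurer object. The following is a minimal usage sketch assembled from the particle-workload benchmark below; the includes, class names, and call signatures are copied from that file, while the wrapper function rebalanceByParticles and its parameters are hypothetical and only illustrate the call order.

#include <Cajita_ParticleDynamicPartitioner.hpp>
#include <Cajita_SparseIndexSpace.hpp>
#include <Kokkos_Core.hpp>
#include <mpi.h>
#include <array>
#include <cstdlib>

// Hypothetical helper illustrating the refactored workflow: construct the
// partitioner, measure the local workload from particle positions, then
// iterate the partition update until it stops changing.
template <class Device, class PositionView>
void rebalanceByParticles( MPI_Comm comm,
                           const std::array<int, 3>& global_num_cell,
                           const PositionView& positions, int num_particles,
                           const std::array<float, 3>& global_low_corner,
                           float dx )
{
    constexpr int cell_num_per_tile_dim = 4;
    constexpr int num_space_dim = 3;
    const int max_optimize_iteration = 10;

    // Partitioner owning the rectilinear partition (formerly SparseDimPartitioner).
    Cajita::DynamicPartitioner<Device, cell_num_per_tile_dim> partitioner(
        comm, global_num_cell, max_optimize_iteration );
    partitioner.initializePartitionByAverage( comm, global_num_cell );

    // Workload measurement is delegated to a measurer object (the
    // WorkloadSetter class renamed by the later commits).
    auto measurer = Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
        cell_num_per_tile_dim, num_space_dim, Device>(
        positions, num_particles, global_low_corner, dx, comm );
    partitioner.setLocalWorkload( &measurer );

    // Prefix-sum the workload, then update the partition along random
    // dimensions until no further change occurs.
    partitioner.computeFullPrefixSum( comm );
    bool is_changed = false;
    for ( int i = 0; i < max_optimize_iteration; ++i )
    {
        partitioner.updatePartition( std::rand() % 3, is_changed );
        if ( !is_changed )
            break;
    }
}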
11 changes: 8 additions & 3 deletions benchmark/cajita/CMakeLists.txt
@@ -12,8 +12,11 @@
add_executable(SparseMapPerformance Cajita_SparseMapPerformance.cpp)
target_link_libraries(SparseMapPerformance Cajita)

add_executable(SparsePartitionerPerformance Cajita_SparsePartitionerPerformance.cpp)
target_link_libraries(SparsePartitionerPerformance Cajita)
add_executable(ParticleDynamicPartitionerPerformance Cajita_ParticleDynamicPartitionerPerformance.cpp)
target_link_libraries(ParticleDynamicPartitionerPerformance Cajita)

add_executable(SparseMapDynamicPartitionerPerformance Cajita_SparseMapDynamicPartitionerPerformance.cpp)
target_link_libraries(SparseMapDynamicPartitionerPerformance Cajita)

add_executable(HaloPerformance Cajita_HaloPerformance.cpp)
target_link_libraries(HaloPerformance Cajita)
@@ -29,7 +32,9 @@ endif()
if(Cabana_ENABLE_TESTING)
add_test(NAME Cajita_SparseMapPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapPerformance sparsemap_output.txt)

add_test(NAME Cajita_SparsePartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} SparsePartitionerPerformance sparsepartitioner_output.txt)
add_test(NAME Cajita_ParticleDynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} ParticleDynamicPartitionerPerformance particledynamicpartitioner_output.txt)

add_test(NAME Cajita_SparseMapDynamicPartitionerPerformance COMMAND ${NONMPI_PRECOMMAND} SparseMapDynamicPartitionerPerformance sparsemapdynamicpartitioner_output.txt)

add_test(NAME Cajita_HaloPerformance COMMAND ${NONMPI_PRECOMMAND} HaloPerformance halo_output.txt)

290 changes: 290 additions & 0 deletions benchmark/cajita/Cajita_ParticleDynamicPartitionerPerformance.cpp
@@ -0,0 +1,290 @@
/****************************************************************************
* Copyright (c) 2018-2022 by the Cabana authors *
* All rights reserved. *
* *
* This file is part of the Cabana library. Cabana is distributed under a *
* BSD 3-clause license. For the licensing terms see the LICENSE file in *
* the top-level directory. *
* *
* SPDX-License-Identifier: BSD-3-Clause *
****************************************************************************/

#include "../Cabana_BenchmarkUtils.hpp"
#include "Cabana_ParticleInit.hpp"

#include <Cajita_ParticleDynamicPartitioner.hpp>
#include <Cajita_SparseIndexSpace.hpp>

#include <Kokkos_Core.hpp>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <mpi.h>

// generate average partitioner
std::array<std::vector<int>, 3>
computeAveragePartition( const int tile_per_dim,
const std::array<int, 3>& ranks_per_dim )
{
std::array<std::vector<int>, 3> rec_partitions;
for ( int d = 0; d < 3; ++d )
{
int ele = tile_per_dim / ranks_per_dim[d];
int part = 0;
for ( int i = 0; i < ranks_per_dim[d]; ++i )
{
rec_partitions[d].push_back( part );
part += ele;
}
rec_partitions[d].push_back( tile_per_dim );
}
return rec_partitions;
}

//---------------------------------------------------------------------------//
// Performance test.
template <class Device>
void performanceTest( std::ostream& stream, MPI_Comm comm,
const std::string& test_prefix,
std::vector<int> problem_sizes,
std::vector<int> num_cells_per_dim )
{
using memory_space = typename Device::memory_space;

// Get comm rank;
int comm_rank;
MPI_Comm_rank( comm, &comm_rank );

// Get comm size;
int comm_size;
MPI_Comm_size( comm, &comm_size );

// Domain size setup
std::array<float, 3> global_low_corner = { 0.0, 0.0, 0.0 };
std::array<float, 3> global_high_corner = { 1.0, 1.0, 1.0 };
constexpr int cell_num_per_tile_dim = 4;
constexpr int cell_bits_per_tile_dim = 2;

// Declare the total number of particles
int num_problem_size = problem_sizes.size();

// Declare the size (cell nums) of the domain
int num_cells_per_dim_size = num_cells_per_dim.size();

// Number of runs in the test loops.
int num_run = 10;

// Basic settings for the partitioner
int max_optimize_iteration = 10;

// Create random sets of particle positions.
using position_type = Kokkos::View<float* [3], memory_space>;
std::vector<position_type> positions( num_problem_size );
for ( int p = 0; p < num_problem_size; ++p )
{
positions[p] = position_type(
Kokkos::ViewAllocateWithoutInitializing( "positions" ),
problem_sizes[p] );
Cabana::createRandomParticles( positions[p], problem_sizes[p],
global_low_corner[0],
global_high_corner[0] );
}

for ( int c = 0; c < num_cells_per_dim_size; ++c )
{
// init the sparse grid domain
std::array<int, 3> global_num_cell = {
num_cells_per_dim[c], num_cells_per_dim[c], num_cells_per_dim[c] };
int num_tiles_per_dim = num_cells_per_dim[c] >> cell_bits_per_tile_dim;

// set up partitioner
Cajita::DynamicPartitioner<Device, cell_num_per_tile_dim> partitioner(
comm, global_num_cell, max_optimize_iteration );
auto ranks_per_dim =
partitioner.ranksPerDimension( comm, global_num_cell );
auto ave_partition =
computeAveragePartition( num_tiles_per_dim, ranks_per_dim );

// Create insertion timers
std::stringstream local_workload_name;
local_workload_name << test_prefix << "compute_local_workload_"
<< "domain_size(cell)_" << num_cells_per_dim[c];
Cabana::Benchmark::Timer local_workload_timer(
local_workload_name.str(), num_problem_size );

std::stringstream prefix_sum_name;
prefix_sum_name << test_prefix << "compute_prefix_sum_"
<< "domain_size(cell)_" << num_cells_per_dim[c];
Cabana::Benchmark::Timer prefix_sum_timer( prefix_sum_name.str(),
num_problem_size );

std::stringstream total_optimize_name;
total_optimize_name << test_prefix << "total_optimize_"
<< "domain_size(cell)_" << num_cells_per_dim[c];
Cabana::Benchmark::Timer total_optimize_timer(
total_optimize_name.str(), num_problem_size );

// loop over all the particle numbers
for ( int p = 0; p < num_problem_size; ++p )
{
// compute the number of particles handled by the current MPI rank
int par_num = problem_sizes[p] / comm_size +
( comm_rank < problem_sizes[p] % comm_size ? 1 : 0 );

auto pos_view = Kokkos::subview(
positions[p], Kokkos::pair<int, int>( 0, par_num ),
Kokkos::pair<int, int>( 0, 3 ) );

// try for num_run times
for ( int t = 0; t < num_run; ++t )
{
// ensure every optimization process starts from the same status
partitioner.initializePartitionByAverage( comm,
global_num_cell );

// compute local workload
local_workload_timer.start( p );
constexpr int cell_num_per_tile_dim = 4;
constexpr int num_space_dim = 3;
auto pws =
Cajita::createParticleDynamicPartitionerWorkloadMeasurer<
cell_num_per_tile_dim, num_space_dim, Device>(
pos_view, par_num, global_low_corner,
1.0f / num_cells_per_dim[c], comm );
partitioner.setLocalWorkload( &pws );
local_workload_timer.stop( p );

// compute prefix sum matrix
prefix_sum_timer.start( p );
partitioner.computeFullPrefixSum( comm );
prefix_sum_timer.stop( p );

// optimization
bool is_changed = false;
// full timer
total_optimize_timer.start( p );
for ( int i = 0; i < max_optimize_iteration; ++i )
{
partitioner.updatePartition( std::rand() % 3, is_changed );
if ( !is_changed )
break;
}
total_optimize_timer.stop( p );
}
}
// Output results
outputResults( stream, "insert_tile_num", problem_sizes,
local_workload_timer, comm );
outputResults( stream, "insert_tile_num", problem_sizes,
prefix_sum_timer, comm );
outputResults( stream, "insert_tile_num", problem_sizes,
total_optimize_timer, comm );
stream << std::flush;
}
}

//---------------------------------------------------------------------------//
// main
int main( int argc, char* argv[] )
{
// Initialize environment
MPI_Init( &argc, &argv );
Kokkos::initialize( argc, argv );

// Check arguments.
if ( argc < 2 )
throw std::runtime_error( "Incorrect number of arguments. \n \
First argument - file name for output \n \
Optional second argument - run size (small or large) \n \
\n \
Example: \n \
$/: ./ParticleDynamicPartitionerPerformance test_results.txt\n" );

// Define run sizes.
std::string run_type = "";
if ( argc > 2 )
run_type = argv[2];
std::vector<int> problem_sizes = { 1000, 10000 };
std::vector<int> num_cells_per_dim = { 32, 64 };
if ( run_type == "large" )
{
problem_sizes = { 1000, 10000, 100000, 1000000 };
num_cells_per_dim = { 32, 64, 128, 256 };
}
std::vector<double> occupy_fraction = { 0.01, 0.1, 0.5, 0.75, 1.0 };

// Get the name of the output file.
std::string filename = argv[1];

// Barrier before continuing.
MPI_Barrier( MPI_COMM_WORLD );

// Get comm rank;
int comm_rank;
MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank );

// Get comm size;
int comm_size;
MPI_Comm_size( MPI_COMM_WORLD, &comm_size );

// Get Cartesian comm
std::array<int, 3> ranks_per_dim;
for ( std::size_t d = 0; d < 3; ++d )
ranks_per_dim[d] = 0;
MPI_Dims_create( comm_size, 3, ranks_per_dim.data() );

// Open the output file on rank 0.
std::fstream file;
if ( 0 == comm_rank )
file.open( filename, std::fstream::out );

// Output problem details.
if ( 0 == comm_rank )
{
file << "\n";
file << "Cajita Sparse Partitioner Performance Benchmark"
<< "\n";
file << "----------------------------------------------"
<< "\n";
file << "MPI Ranks: " << comm_size << "\n";
file << "MPI Cartesian Dim Ranks: (" << ranks_per_dim[0] << ", "
<< ranks_per_dim[1] << ", " << ranks_per_dim[2] << ")\n";
file << "----------------------------------------------"
<< "\n";
file << "\n";
file << std::flush;
}

// Do everything on the default CPU.
using host_exec_space = Kokkos::DefaultHostExecutionSpace;
using host_device_type = host_exec_space::device_type;
// Do everything on the default device with default memory.
using exec_space = Kokkos::DefaultExecutionSpace;
using device_type = exec_space::device_type;

// Don't run twice on the CPU if only the host backend is enabled.
if ( !std::is_same<device_type, host_device_type>{} )
{
performanceTest<device_type>( file, MPI_COMM_WORLD,
"device_particleWL_", problem_sizes,
num_cells_per_dim );
}
performanceTest<host_device_type>( file, MPI_COMM_WORLD, "host_particleWL_",
problem_sizes, num_cells_per_dim );

// Close the output file on rank 0.
file.close();

// Finalize
Kokkos::finalize();
MPI_Finalize();
return 0;
}