diff --git a/thrust/testing/zip_iterator_reduce_by_key.cu b/thrust/testing/zip_iterator_reduce_by_key.cu index e3fc99d66a..9076bcba5c 100644 --- a/thrust/testing/zip_iterator_reduce_by_key.cu +++ b/thrust/testing/zip_iterator_reduce_by_key.cu @@ -69,7 +69,7 @@ struct TestZipIteratorReduceByKey ASSERT_EQUAL(h_data4, d_data4); ASSERT_EQUAL(h_data5, d_data5); } - + // The tests below get miscompiled on Tesla hw for 8b types #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA @@ -118,6 +118,51 @@ struct TestZipIteratorReduceByKey ASSERT_EQUAL(h_data5, d_data5); ASSERT_EQUAL(h_data6, d_data6); } + + // const inputs, see #1527 + { + host_vector h_data3(n, 0.0f); + host_vector h_data4(n, 0); + host_vector h_data5(n, 0); + host_vector h_data6(n, 0.0f); + device_vector d_data3(n, 0.0f); + device_vector d_data4(n, 0); + device_vector d_data5(n, 0); + device_vector d_data6(n, 0.0f); + + // run on host + const T* h_begin1 = thrust::raw_pointer_cast(h_data1.data()); + const T* h_begin2 = thrust::raw_pointer_cast(h_data2.data()); + const float* h_begin3 = thrust::raw_pointer_cast(h_data3.data()); + T* h_begin4 = thrust::raw_pointer_cast(h_data4.data()); + T* h_begin5 = thrust::raw_pointer_cast(h_data5.data()); + float* h_begin6 = thrust::raw_pointer_cast(h_data6.data()); + thrust::reduce_by_key(thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(h_begin1, h_begin2)), + thrust::make_zip_iterator(thrust::make_tuple(h_begin1, h_begin2)) + n, + h_begin3, + thrust::make_zip_iterator(thrust::make_tuple(h_begin4, h_begin5)), + h_begin6); + + // run on device + const T* d_begin1 = thrust::raw_pointer_cast(d_data1.data()); + const T* d_begin2 = thrust::raw_pointer_cast(d_data2.data()); + const float* d_begin3 = thrust::raw_pointer_cast(d_data3.data()); + T* d_begin4 = thrust::raw_pointer_cast(d_data4.data()); + T* d_begin5 = thrust::raw_pointer_cast(d_data5.data()); + float* d_begin6 = thrust::raw_pointer_cast(d_data6.data()); + thrust::reduce_by_key(thrust::device, + thrust::make_zip_iterator(thrust::make_tuple(d_begin1, d_begin2)), + thrust::make_zip_iterator(thrust::make_tuple(d_begin1, d_begin2)) + n, + d_begin3, + thrust::make_zip_iterator(thrust::make_tuple(d_begin4, d_begin5)), + d_begin6); + + ASSERT_EQUAL(h_data3, d_data3); + ASSERT_EQUAL(h_data4, d_data4); + ASSERT_EQUAL(h_data5, d_data5); + ASSERT_EQUAL(h_data6, d_data6); + } } }; VariableUnitTest TestZipIteratorReduceByKeyInstance; diff --git a/thrust/thrust/system/cuda/detail/reduce_by_key.h b/thrust/thrust/system/cuda/detail/reduce_by_key.h index 0f8fec1bec..88928f5c0e 100644 --- a/thrust/thrust/system/cuda/detail/reduce_by_key.h +++ b/thrust/thrust/system/cuda/detail/reduce_by_key.h @@ -688,7 +688,7 @@ namespace __reduce_by_key { } key_type tile_pred_key = (threadIdx.x == 0) - ? keys_load_it[tile_offset - 1] + ? key_type(keys_load_it[tile_offset - 1]) : key_type(); sync_threadblock(); @@ -1057,7 +1057,7 @@ namespace __reduce_by_key { status = cuda_cub::synchronize(policy); cuda_cub::throw_on_error(status, "reduce_by_key: failed to synchronize"); - int num_runs_out = cuda_cub::get_value(policy, d_num_runs_out); + const auto num_runs_out = cuda_cub::get_value(policy, d_num_runs_out); return thrust::make_pair( keys_output + num_runs_out,