Skip to content

Commit 42e56e4

Browse files
authored
Add std::exclusive_scan (#19)
Add `std::exclusive_scan`
1 parent 45f9c12 commit 42e56e4

File tree

6 files changed

+159
-6
lines changed

6 files changed

+159
-6
lines changed

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ Algorithms are added on an as-needed basis. If you need one [open an issue](http
3636
* [transform](https://en.cppreference.com/w/cpp/algorithm/transform)
3737

3838
### `<numeric>`
39+
* [exclusive_scan](https://en.cppreference.com/w/cpp/algorithm/exclusive_scan) (C++17 only)
3940
* [reduce](https://en.cppreference.com/w/cpp/algorithm/reduce)
4041
* [transform_reduce](https://en.cppreference.com/w/cpp/algorithm/transform_reduce) (C++17 only)
4142

@@ -195,6 +196,9 @@ sort(std::execution::par)/real_time 121 ms
195196
transform()/real_time 95.0 ms 94.9 ms 7
196197
transform(poolstl::par)/real_time 17.4 ms 0.037 ms 38
197198
transform(std::execution::par)/real_time 15.3 ms 13.2 ms 45
199+
exclusive_scan()/real_time 33.7 ms 33.7 ms 21
200+
exclusive_scan(poolstl::par)/real_time 11.6 ms 0.095 ms 55
201+
exclusive_scan(std::execution::par)/real_time 19.8 ms 15.3 ms 32
198202
reduce()/real_time 15.2 ms 15.2 ms 46
199203
reduce(poolstl::par)/real_time 4.06 ms 0.044 ms 169
200204
reduce(std::execution::par)/real_time 3.38 ms 3.16 ms 214

benchmark/numeric_bench.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,30 @@
1414
#include "utils.hpp"
1515

1616

17+
////////////////////////////////
18+
19+
template <class ExecPolicy>
20+
void exclusive_scan(benchmark::State& state) {
21+
auto values = iota_vector(arr_length);
22+
std::vector<int> dest(arr_length);
23+
24+
for ([[maybe_unused]] auto _ : state) {
25+
if constexpr (is_policy<ExecPolicy>::value) {
26+
std::exclusive_scan(policy<ExecPolicy>::get(), values.begin(), values.end(), dest.begin(), 0);
27+
} else {
28+
std::exclusive_scan(values.begin(), values.end(), dest.begin(), 0);
29+
}
30+
benchmark::DoNotOptimize(dest);
31+
benchmark::ClobberMemory();
32+
}
33+
}
34+
35+
BENCHMARK(exclusive_scan<seq>)->Name("exclusive_scan()")->UseRealTime();
36+
BENCHMARK(exclusive_scan<poolstl_par>)->Name("exclusive_scan(poolstl::par)")->UseRealTime();
37+
#ifdef POOLSTL_BENCH_STD_PAR
38+
BENCHMARK(exclusive_scan<std_par>)->Name("exclusive_scan(std::execution::par)")->UseRealTime();
39+
#endif
40+
1741
////////////////////////////////
1842

1943
template <class ExecPolicy>

include/poolstl/internal/ttp_impl.hpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,24 @@
1717
namespace poolstl {
1818
namespace internal {
1919

20+
#if POOLSTL_HAVE_CXX17_LIB
21+
/**
 * Call std::apply in parallel.
 *
 * For every element of `args_list` one task is submitted to the policy's pool;
 * the task unpacks that element with std::apply and calls `op` with it.
 *
 * @param policy    execution policy; only `policy.pool()` is used.
 * @param op        callable invoked once per element (copied into each task).
 * @param args_list iterable of tuple-like argument packs.
 * @return one future per submitted task, in submission order. The caller is
 *         responsible for waiting on them.
 */
template <class ExecPolicy, class Op, class ArgContainer>
std::vector<std::future<void>>
parallel_apply(ExecPolicy &&policy, Op op, const ArgContainer& args_list) {
    auto& pool = policy.pool();
    std::vector<std::future<void>> pending;

    for (const auto& packed_args : args_list) {
        // The argument pack is handed to submit() as a task argument rather
        // than captured, so the task receives it when it runs.
        pending.push_back(pool.submit(
            [op](const auto& packed) { std::apply(op, packed); },
            packed_args));
    }

    return pending;
}
36+
#endif
37+
2038
/**
2139
* Chunk a single range.
2240
*/
@@ -26,13 +44,14 @@ namespace poolstl {
2644
std::vector<std::future<
2745
decltype(std::declval<Chunk>()(std::declval<RandIt>(), std::declval<RandIt>()))
2846
>> futures;
29-
auto chunk_size = get_chunk_size(first, last, extra_split_factor * policy.pool().get_num_threads());
47+
auto& task_pool = policy.pool();
48+
auto chunk_size = get_chunk_size(first, last, extra_split_factor * task_pool.get_num_threads());
3049

3150
while (first < last) {
3251
auto iter_chunk_size = get_iter_chunk_size(first, last, chunk_size);
3352
RandIt loop_end = advanced(first, iter_chunk_size);
3453

35-
futures.emplace_back(policy.pool().submit(chunk, first, loop_end));
54+
futures.emplace_back(task_pool.submit(chunk, first, loop_end));
3655

3756
first = loop_end;
3857
}
@@ -54,13 +73,14 @@ namespace poolstl {
5473
std::declval<RandIt1>(),
5574
std::declval<RandIt2>()))
5675
>> futures;
57-
auto chunk_size = get_chunk_size(first1, last1, policy.pool().get_num_threads());
76+
auto& task_pool = policy.pool();
77+
auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads());
5878

5979
while (first1 < last1) {
6080
auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size);
6181
RandIt1 loop_end = advanced(first1, iter_chunk_size);
6282

63-
futures.emplace_back(policy.pool().submit(chunk, first1, loop_end, first2));
83+
futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2));
6484

6585
first1 = loop_end;
6686
std::advance(first2, iter_chunk_size);
@@ -86,13 +106,14 @@ namespace poolstl {
86106
std::declval<RandIt2>(),
87107
std::declval<RandIt3>()))
88108
>> futures;
89-
auto chunk_size = get_chunk_size(first1, last1, policy.pool().get_num_threads());
109+
auto& task_pool = policy.pool();
110+
auto chunk_size = get_chunk_size(first1, last1, task_pool.get_num_threads());
90111

91112
while (first1 < last1) {
92113
auto iter_chunk_size = get_iter_chunk_size(first1, last1, chunk_size);
93114
RandIt1 loop_end = advanced(first1, iter_chunk_size);
94115

95-
futures.emplace_back(policy.pool().submit(chunk, first1, loop_end, first2, first3));
116+
futures.emplace_back(task_pool.submit(chunk, first1, loop_end, first2, first3));
96117

97118
first1 = loop_end;
98119
std::advance(first2, iter_chunk_size);

include/poolstl/numeric

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,74 @@
88
#define POOLSTL_NUMERIC_HPP
99

1010
#include <functional>
11+
#include <tuple>
1112

1213
#include "execution"
1314
#include "internal/ttp_impl.hpp"
1415

1516
namespace std {
1617

18+
#if POOLSTL_HAVE_CXX17_LIB
19+
/**
20+
* NOTE: Iterators are expected to be random access.
21+
* See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan
22+
*/
23+
template <class ExecPolicy, class RandIt1, class RandIt2, class T, class BinaryOp>
24+
poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
25+
exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init, BinaryOp binop) {
26+
if (first == last) {
27+
return dest;
28+
}
29+
30+
// Pass 1: Chunk the input and find the sum of each chunk
31+
auto futures = poolstl::internal::parallel_chunk_for(std::forward<ExecPolicy>(policy), first, last,
32+
[binop](RandIt1 chunk_first, RandIt1 chunk_last) {
33+
auto sum = std::accumulate(chunk_first, chunk_last, T{}, binop);
34+
return std::make_tuple(std::make_pair(chunk_first, chunk_last), sum);
35+
});
36+
37+
std::vector<std::pair<RandIt1, RandIt1>> ranges;
38+
std::vector<T> sums;
39+
40+
for (auto& future : futures) {
41+
auto res = future.get();
42+
ranges.push_back(std::get<0>(res));
43+
sums.push_back(std::get<1>(res));
44+
}
45+
46+
// find initial values for each range
47+
std::exclusive_scan(sums.begin(), sums.end(), sums.begin(), init, binop);
48+
49+
// Pass 2: perform exclusive scan of each chunk, using the sum of previous chunks as init
50+
std::vector<std::tuple<RandIt1, RandIt1, RandIt2, T>> args;
51+
for (std::size_t i = 0; i < sums.size(); ++i) {
52+
auto chunk_first = std::get<0>(ranges[i]);
53+
args.emplace_back(std::make_tuple(
54+
chunk_first, std::get<1>(ranges[i]),
55+
dest + (chunk_first - first),
56+
sums[i]));
57+
}
58+
59+
auto futures2 = poolstl::internal::parallel_apply(std::forward<ExecPolicy>(policy),
60+
[binop](RandIt1 chunk_first, RandIt1 chunk_last, RandIt2 chunk_dest, T chunk_init){
61+
std::exclusive_scan(chunk_first, chunk_last, chunk_dest, chunk_init, binop);
62+
}, args);
63+
64+
poolstl::internal::get_futures(futures2);
65+
return dest + (last - first);
66+
}
67+
68+
/**
69+
* NOTE: Iterators are expected to be random access.
70+
* See std::exclusive_scan https://en.cppreference.com/w/cpp/algorithm/exclusive_scan
71+
*/
72+
template <class ExecPolicy, class RandIt1, class RandIt2, class T>
73+
poolstl::internal::enable_if_par<ExecPolicy, RandIt2>
74+
exclusive_scan(ExecPolicy &&policy, RandIt1 first, RandIt1 last, RandIt2 dest, T init) {
75+
return std::exclusive_scan(std::forward<ExecPolicy>(policy), first, last, dest, init, std::plus<T>());
76+
}
77+
#endif
78+
1779
/**
1880
* NOTE: Iterators are expected to be random access.
1981
* See std::reduce https://en.cppreference.com/w/cpp/algorithm/reduce

include/poolstl/seq_fwd.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ namespace std {
8686
POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(transform)
8787

8888
#if POOLSTL_HAVE_CXX17_LIB
89+
POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(exclusive_scan)
8990
POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(reduce)
9091
POOLSTL_DEFINE_BOTH_SEQ_FWD_AND_PAR_IF(transform_reduce)
9192
#endif

tests/poolstl_test.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,47 @@ TEST_CASE("transform_2", "[alg][algorithm]") {
350350
}
351351
}
352352

353+
#if POOLSTL_HAVE_CXX17_LIB
354+
// Compares the pool-backed exclusive_scan against the par_if(false) overload
// for every configured thread count, array size and two initial values.
TEST_CASE("exclusive_scan", "[alg][algorithm]") {
    for (auto num_threads : test_thread_counts) {
        ttp::task_thread_pool pool(num_threads);

        for (auto num_iters : test_arr_sizes) {
            for (int init : {0, 10}) {
                auto v = iota_vector(num_iters);
                const auto count = v.size();
                std::vector<int> dest1(count);
                std::vector<int> dest2(count);

                auto seq_res = std::exclusive_scan(poolstl::par_if(false), v.cbegin(), v.cend(), dest1.begin(), init);
                auto par_res = std::exclusive_scan(poolstl::par.on(pool), v.cbegin(), v.cend(), dest2.begin(), init);

                // Both overloads must return the same offset into their output...
                REQUIRE((par_res - dest2.begin()) == (seq_res - dest1.begin()));
                // ...and write the same values.
                REQUIRE(dest1 == dest2);

                // Scanning in place (output aliases input) must give the same result.
                std::exclusive_scan(poolstl::par.on(pool), v.begin(), v.end(), v.begin(), init);
                REQUIRE(v == dest2);

                // String concatenation is not commutative, so this checks that
                // the parallel scan keeps operands in their original order.
                {
                    std::vector<std::string> sv;
                    sv.reserve(v.size());
                    for (auto val : v) {
                        sv.emplace_back(std::to_string(val));
                    }

                    const auto str_count = sv.size();
                    std::vector<std::string> sdest1(str_count);
                    std::vector<std::string> sdest2(str_count);

                    std::exclusive_scan(poolstl::par_if(false), sv.cbegin(), sv.cend(), sdest1.begin(), std::to_string(init));
                    std::exclusive_scan(poolstl::par.on(pool), sv.cbegin(), sv.cend(), sdest2.begin(), std::to_string(init));
                    REQUIRE(sdest1 == sdest2);
                }
            }
        }
    }
}
392+
#endif
393+
353394
TEST_CASE("reduce", "[alg][numeric]") {
354395
for (auto num_threads : test_thread_counts) {
355396
ttp::task_thread_pool pool(num_threads);

0 commit comments

Comments
 (0)