// -*- C++ -*-
//===-- parallel_backend_tbb.h --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef _PSTL_PARALLEL_BACKEND_TBB_H
#define _PSTL_PARALLEL_BACKEND_TBB_H

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <type_traits>

#include "parallel_backend_utils.h"

// Bring in minimal required subset of Intel TBB
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>
#include <tbb/parallel_scan.h>
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/task.h>

#if TBB_INTERFACE_VERSION < 10000
# error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported.
#endif

namespace __pstl
{
namespace __tbb_backend
{

//! Raw memory buffer with automatic freeing and no exceptions.
/** Some of our algorithms need to start with a raw memory buffer,
    not an initialized array, because initialization/destruction
    would make the span be at least O(N). */
// tbb::allocator can improve performance in some cases.
template <typename _Tp>
class __buffer
{
    tbb::tbb_allocator<_Tp> _M_allocator;
    _Tp* _M_ptr;
    const std::size_t _M_buf_size;
    __buffer(const __buffer&) = delete;
    void
    operator=(const __buffer&) = delete;

  public:
    //! Try to obtain a buffer of the given size to store objects of _Tp type
    __buffer(std::size_t n) : _M_allocator(), _M_ptr(_M_allocator.allocate(n)), _M_buf_size(n) {}
    //! True if the buffer was successfully obtained, false otherwise.
    operator bool() const { return _M_ptr != nullptr; }
    //! Return pointer to buffer, or nullptr if the buffer could not be obtained.
    _Tp*
    get() const
    {
        return _M_ptr;
    }
    //! Destroy buffer
    ~__buffer() { _M_allocator.deallocate(_M_ptr, _M_buf_size); }
};
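
// Illustrative usage sketch (not part of the header): the typical pattern is to
// request raw storage, fall back to a serial algorithm if allocation failed, and
// construct elements in place only as they are produced:
//
//     __buffer<int> __buf(1000);        // raw, uninitialized storage
//     if (!__buf)
//         /* fall back to a serial algorithm */;
//     else
//         ::new (__buf.get()) int(42);  // construct lazily; destroy before ~__buffer runs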

// Wrapper for tbb::task
inline void
__cancel_execution()
{
#if TBB_INTERFACE_VERSION <= 12000
    tbb::task::self().group()->cancel_group_execution();
#else
    tbb::task::current_context()->cancel_group_execution();
#endif
}

//------------------------------------------------------------------------
// parallel_for
//------------------------------------------------------------------------

template <class _Index, class _RealBody>
class __parallel_for_body
{
  public:
    __parallel_for_body(const _RealBody& __body) : _M_body(__body) {}
    __parallel_for_body(const __parallel_for_body& __body) : _M_body(__body._M_body) {}
    void
    operator()(const tbb::blocked_range<_Index>& __range) const
    {
        _M_body(__range.begin(), __range.end());
    }

  private:
    _RealBody _M_body;
};

//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last)
// wrapper over tbb::parallel_for
template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
{
    tbb::this_task_arena::isolate([=]() {
        tbb::parallel_for(tbb::blocked_range<_Index>(__first, __last), __parallel_for_body<_Index, _Fp>(__f));
    });
}
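
// Illustrative sketch (not part of the header): a "brick" is a callable that
// processes one half-open index subrange serially. For example, zero-filling a
// hypothetical array __a of length __n in parallel:
//
//     __tbb_backend::__parallel_for(std::execution::par, 0, __n,
//                                   [&](int __i, int __j) {
//                                       for (; __i != __j; ++__i)
//                                           __a[__i] = 0;
//                                   });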

//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last),
//! with the partial results combined by __reduction
// wrapper over tbb::parallel_reduce
template <class _ExecutionPolicy, class _Value, class _Index, typename _RealBody, typename _Reduction>
_Value
__parallel_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, const _Value& __identity,
                  const _RealBody& __real_body, const _Reduction& __reduction)
{
    return tbb::this_task_arena::isolate([__first, __last, &__identity, &__real_body, &__reduction]() -> _Value {
        return tbb::parallel_reduce(
            tbb::blocked_range<_Index>(__first, __last), __identity,
            [__real_body](const tbb::blocked_range<_Index>& __r, const _Value& __value) -> _Value {
                return __real_body(__r.begin(), __r.end(), __value);
            },
            __reduction);
    });
}
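
// Illustrative sketch (not part of the header): summing a hypothetical array __a
// with __parallel_reduce. The real body folds a subrange into a running value;
// the reduction combines two partial sums:
//
//     long __total = __tbb_backend::__parallel_reduce(
//         std::execution::par, 0, __n, 0L,
//         [&](int __i, int __j, long __init) {
//             for (; __i != __j; ++__i)
//                 __init += __a[__i];
//             return __init;
//         },
//         std::plus<long>());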

//------------------------------------------------------------------------
// parallel_transform_reduce
//
// Notation:
//      r(i,j,init) returns reduction of init with reduction over [i,j)
//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
//      c(x,y) combines values x and y that were the result of r or u
//------------------------------------------------------------------------
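
// Worked example of the notation above (illustrative only): for a dot product of
// hypothetical arrays __x and __y, take u(i) = __x[i] * __y[i],
// r(i,j,init) = init + sum of __x[k] * __y[k] for k in [i,j), and c(a,b) = a + b.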

template <class _Index, class _Up, class _Tp, class _Cp, class _Rp>
struct __par_trans_red_body
{
    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
    _Up _M_u;
    _Cp _M_combine;
    bool _M_has_sum; // Put last to minimize size of class
    _Tp&
    sum()
    {
        _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
        return *(_Tp*)_M_sum_storage;
    }
    __par_trans_red_body(_Up __u, _Tp __init, _Cp __c, _Rp __r)
        : _M_brick_reduce(__r), _M_u(__u), _M_combine(__c), _M_has_sum(true)
    {
        new (_M_sum_storage) _Tp(__init);
    }

    __par_trans_red_body(__par_trans_red_body& __left, tbb::split)
        : _M_brick_reduce(__left._M_brick_reduce), _M_u(__left._M_u), _M_combine(__left._M_combine), _M_has_sum(false)
    {
    }

    ~__par_trans_red_body()
    {
        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
        if (_M_has_sum)
            sum().~_Tp();
    }

    void
    join(__par_trans_red_body& __rhs)
    {
        sum() = _M_combine(sum(), __rhs.sum());
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range)
    {
        _Index __i = __range.begin();
        _Index __j = __range.end();
        if (!_M_has_sum)
        {
            _PSTL_ASSERT_MSG(__range.size() > 1, "there should be at least 2 elements");
            new (&_M_sum_storage)
                _Tp(_M_combine(_M_u(__i), _M_u(__i + 1))); // The condition i+1 < j is provided by the grain size of 3
            _M_has_sum = true;
            std::advance(__i, 2);
            if (__i == __j)
                return;
        }
        sum() = _M_brick_reduce(__i, __j, sum());
    }
};

template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp>
_Tp
__parallel_transform_reduce(_ExecutionPolicy&&, _Index __first, _Index __last, _Up __u, _Tp __init, _Cp __combine,
                            _Rp __brick_reduce)
{
    __tbb_backend::__par_trans_red_body<_Index, _Up, _Tp, _Cp, _Rp> __body(__u, __init, __combine, __brick_reduce);
    // The grain size of 3 is used in order to provide a minimum of 2 elements for each body
    tbb::this_task_arena::isolate(
        [__first, __last, &__body]() { tbb::parallel_reduce(tbb::blocked_range<_Index>(__first, __last, 3), __body); });
    return __body.sum();
}

//------------------------------------------------------------------------
// parallel_scan
//------------------------------------------------------------------------

template <class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
class __trans_scan_body
{
    alignas(_Tp) char _M_sum_storage[sizeof(_Tp)]; // Holds generalized non-commutative sum when has_sum==true
    _Rp _M_brick_reduce;                           // Most likely to have non-empty layout
    _Up _M_u;
    _Cp _M_combine;
    _Sp _M_scan;
    bool _M_has_sum; // Put last to minimize size of class
  public:
    __trans_scan_body(_Up __u, _Tp __init, _Cp __combine, _Rp __reduce, _Sp __scan)
        : _M_brick_reduce(__reduce), _M_u(__u), _M_combine(__combine), _M_scan(__scan), _M_has_sum(true)
    {
        new (_M_sum_storage) _Tp(__init);
    }

    __trans_scan_body(__trans_scan_body& __b, tbb::split)
        : _M_brick_reduce(__b._M_brick_reduce), _M_u(__b._M_u), _M_combine(__b._M_combine), _M_scan(__b._M_scan),
          _M_has_sum(false)
    {
    }

    ~__trans_scan_body()
    {
        // 17.6.5.12 tells us to not worry about catching exceptions from destructors.
        if (_M_has_sum)
            sum().~_Tp();
    }

    _Tp&
    sum() const
    {
        _PSTL_ASSERT_MSG(_M_has_sum, "sum expected");
        return *const_cast<_Tp*>(reinterpret_cast<_Tp const*>(_M_sum_storage));
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range, tbb::pre_scan_tag)
    {
        _Index __i = __range.begin();
        _Index __j = __range.end();
        if (!_M_has_sum)
        {
            new (&_M_sum_storage) _Tp(_M_u(__i));
            _M_has_sum = true;
            ++__i;
            if (__i == __j)
                return;
        }
        sum() = _M_brick_reduce(__i, __j, sum());
    }

    void
    operator()(const tbb::blocked_range<_Index>& __range, tbb::final_scan_tag)
    {
        sum() = _M_scan(__range.begin(), __range.end(), sum());
    }

    void
    reverse_join(__trans_scan_body& __a)
    {
        if (_M_has_sum)
        {
            sum() = _M_combine(__a.sum(), sum());
        }
        else
        {
            new (&_M_sum_storage) _Tp(__a.sum());
            _M_has_sum = true;
        }
    }

    void
    assign(__trans_scan_body& __b)
    {
        sum() = __b.sum();
    }
};

template <typename _Index>
_Index
__split(_Index __m)
{
    _Index __k = 1;
    while (2 * __k < __m)
        __k *= 2;
    return __k;
}
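
// __split returns the largest power of two strictly less than __m (for __m > 1),
// which balances the recursive halves below; for example, __split(10) == 8 and
// __split(8) == 4.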

//------------------------------------------------------------------------
// __parallel_strict_scan
//------------------------------------------------------------------------

template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
void
__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
{
    if (__m == 1)
        __r[0] = __reduce(__i * __tilesize, __lastsize);
    else
    {
        _Index __k = __split(__m);
        tbb::parallel_invoke(
            [=] { __tbb_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
            [=] {
                __tbb_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
            });
        if (__m == 2 * __k)
            __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
    }
}

template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
void
__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
            _Sp __scan)
{
    if (__m == 1)
        __scan(__i * __tilesize, __lastsize, __initial);
    else
    {
        const _Index __k = __split(__m);
        tbb::parallel_invoke(
            [=] { __tbb_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
            // Assumes that __combine never throws.
            //TODO: Consider adding a requirement for user functors to be constant.
            [=, &__combine] {
                __tbb_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
                                           __combine(__initial, __r[__k - 1]), __combine, __scan);
            });
    }
}
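
// Illustrative walk-through (not part of the header): with 4 tiles and addition,
// __upsweep leaves per-tile reductions and power-of-two prefix combinations in __r;
// e.g. tile sums {3,1,4,1} yield __r = {3, 3+1, 4, 3+1+4+1}. __downsweep then walks
// the same tree, handing each right half __combine(__initial, sum of its left
// siblings) as its initial value, so that every scan(i,len,initial) call receives
// the exclusive prefix sum of its tile.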

// Adapted from Intel(R) Cilk(TM) version from cilkpub.
// Let i:len denote a counted interval of length len starting at i.  s denotes a generalized-sum value.
// Expected actions of the functors are:
//     reduce(i,len) -> s  -- return reduction value of i:len.
//     combine(s1,s2) -> s -- return merged sum
//     apex(s) -- do any processing necessary between reduce and scan.
//     scan(i,len,initial) -- perform scan over i:len starting with initial.
// The initial range 0:n is partitioned into consecutive subranges.
// reduce and scan are each called exactly once per subrange.
// Thus callers can rely upon side effects in reduce.
// combine must not throw an exception.
// apex is called exactly once, after all calls to reduce and before all calls to scan.
// For example, it's useful for allocating a __buffer used by scan but whose size is the sum of all reduction values.
// T must have a trivial constructor and destructor.
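// Illustrative use of the protocol (not part of the header): a parallel copy_if
// can pass reduce(i,len) = number of kept elements in i:len, combine = +,
// apex(total) = allocate the destination once the total count is known, and
// scan(i,len,initial) = copy the kept elements of i:len to the destination
// starting at offset 'initial' (the exclusive prefix count for that subrange).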
template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp, typename _Ap>
void
__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
                       _Ap __apex)
{
    tbb::this_task_arena::isolate([=, &__combine]() {
        if (__n > 1)
        {
            _Index __p = tbb::this_task_arena::max_concurrency();
            const _Index __slack = 4;
            _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
            _Index __m = (__n - 1) / __tilesize;
            __buffer<_Tp> __buf(__m + 1);
            _Tp* __r = __buf.get();
            __tbb_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce,
                                     __combine);

            // When __apex is a no-op and __combine has no side effects, a good optimizer
            // should be able to eliminate all code between here and __apex.
            // Alternatively, provide a default value for __apex that can be
            // recognized by metaprogramming that conditionally executes the following.
            // Combine the per-subtree sums stored by __upsweep (found by repeatedly
            // clearing the lowest set bit of __k) into the grand total.
            std::size_t __k = __m + 1;
            _Tp __t = __r[__k - 1];
            while ((__k &= __k - 1))
                __t = __combine(__r[__k - 1], __t);
            __apex(__combine(__initial, __t));
            __tbb_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
                                       __combine, __scan);
            return;
        }
        // Fewer than 2 elements in sequence, or out of memory. Handle as a single block.
        _Tp __sum = __initial;
        if (__n)
            __sum = __combine(__sum, __reduce(_Index(0), __n));
        __apex(__sum);
        if (__n)
            __scan(_Index(0), __n, __initial);
    });
}

template <class _ExecutionPolicy, class _Index, class _Up, class _Tp, class _Cp, class _Rp, class _Sp>
_Tp
__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
                          _Sp __scan)
{
    __trans_scan_body<_Index, _Up, _Tp, _Cp, _Rp, _Sp> __body(__u, __init, __combine, __brick_reduce, __scan);
    auto __range = tbb::blocked_range<_Index>(0, __n);
    tbb::this_task_arena::isolate([__range, &__body]() { tbb::parallel_scan(__range, __body); });
    return __body.sum();
}
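
// Illustrative sketch (not part of the header): an inclusive prefix sum over a
// hypothetical array __a into __out could pass u(i) = __a[i], combine = +,
// brick_reduce(i,j,s) = s + sum of __a[k] for k in [i,j), and a scan brick that
// writes running totals:
//
//     scan(i, j, s): for (; i != j; ++i) { s += __a[i]; __out[i] = s; } return s;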

//------------------------------------------------------------------------
// parallel_stable_sort
//------------------------------------------------------------------------

//------------------------------------------------------------------------
// stable_sort utilities
//
// These are used by parallel implementations but do not depend on them.
//------------------------------------------------------------------------
#define _PSTL_MERGE_CUT_OFF 2000

template <typename _Func>
class __func_task;
template <typename _Func>
class __root_task;

#if TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::task
{
  public:
    template <typename _Fn>
    __task*
    make_continuation(_Fn&& __f)
    {
        return new (allocate_continuation()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    template <typename _Fn>
    __task*
    make_child_of(__task* __parent, _Fn&& __f)
    {
        return new (__parent->allocate_child()) __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    template <typename _Fn>
    __task*
    make_additional_child_of(tbb::task* __parent, _Fn&& __f)
    {
        return new (tbb::task::allocate_additional_child_of(*__parent))
            __func_task<typename std::decay<_Fn>::type>(std::forward<_Fn>(__f));
    }

    inline void
    recycle_as_continuation()
    {
        tbb::task::recycle_as_continuation();
    }

    inline void
    recycle_as_child_of(__task* __parent)
    {
        tbb::task::recycle_as_child_of(*__parent);
    }

    inline void
    spawn(__task* __t)
    {
        tbb::task::spawn(*__t);
    }

    template <typename _Fn>
    static inline void
    spawn_root_and_wait(__root_task<_Fn>& __root)
    {
        tbb::task::spawn_root_and_wait(*__root._M_task);
    }
};

template <typename _Func>
class __func_task : public __task
{
    _Func _M_func;

    tbb::task*
    execute()
    {
        return _M_func(this);
    }

  public:
    template <typename _Fn>
    __func_task(_Fn&& __f) : _M_func{std::forward<_Fn>(__f)}
    {
    }

    _Func&
    body()
    {
        return _M_func;
    }
};

template <typename _Func>
class __root_task
{
    tbb::task* _M_task;

  public:
    template <typename... Args>
    __root_task(Args&&... args)
        : _M_task{new (tbb::task::allocate_root()) __func_task<_Func>{_Func(std::forward<Args>(args)...)}}
    {
    }

    friend class __task;
    friend class __func_task<_Func>;
};

#else // TBB_INTERFACE_VERSION <= 12000
class __task : public tbb::detail::d1::task
{
  protected:
    tbb::detail::d1::small_object_allocator _M_allocator{};
    tbb::detail::d1::execution_data* _M_execute_data{};
    __task* _M_parent{};
    std::atomic<int> _M_refcount{};
    bool _M_recycle{};

    template <typename _Fn>
    __task*
    allocate_func_task(_Fn&& __f)
    {
        _PSTL_ASSERT(_M_execute_data != nullptr);
        tbb::detail::d1::small_object_allocator __alloc{};
        auto __t =
            __alloc.new_object<__func_task<typename std::decay<_Fn>::type>>(*_M_execute_data, std::forward<_Fn>(__f));
        __t->_M_allocator = __alloc;
        return __t;
    }

  public:
    __task*
    parent()
    {
        return _M_parent;
    }

    void
    set_ref_count(int __n)
    {
        _M_refcount.store(__n, std::memory_order_release);
    }

    template <typename _Fn>
    __task*
    make_continuation(_Fn&& __f)
    {
        auto __t = allocate_func_task(std::forward<_Fn>(__f));
        __t->_M_parent = _M_parent;
        _M_parent = nullptr;
        return __t;
    }

    template <typename _Fn>
    __task*
    make_child_of(__task* __parent, _Fn&& __f)
    {
        auto __t = allocate_func_task(std::forward<_Fn>(__f));
        __t->_M_parent = __parent;
        return __t;
    }

    template <typename _Fn>
    __task*
    make_additional_child_of(__task* __parent, _Fn&& __f)
    {
        auto __t = make_child_of(__parent, std::forward<_Fn>(__f));
        _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
        ++__parent->_M_refcount;
        return __t;
    }

    inline void
    recycle_as_continuation()
    {
        _M_recycle = true;
    }

    inline void
    recycle_as_child_of(__task* __parent)
    {
        _M_recycle = true;
        _M_parent = __parent;
    }

    inline void
    spawn(__task* __t)
    {
        _PSTL_ASSERT(_M_execute_data != nullptr);
        tbb::detail::d1::spawn(*__t, *_M_execute_data->context);
    }

    template <typename _Fn>
    static inline void
    spawn_root_and_wait(__root_task<_Fn>& __root)
    {
        tbb::detail::d1::execute_and_wait(*__root._M_func_task, __root._M_context, __root._M_wait_object,
                                          __root._M_context);
    }

    template <typename _Func>
    friend class __func_task;
};

template <typename _Func>
class __func_task : public __task
{
    _Func _M_func;

    __task*
    execute(tbb::detail::d1::execution_data& __ed) override
    {
        _M_execute_data = &__ed;
        _M_recycle = false;
        __task* __next = _M_func(this);
        return finalize(__next);
    }

    __task*
    cancel(tbb::detail::d1::execution_data&) override
    {
        return finalize(nullptr);
    }

    __task*
    finalize(__task* __next)
    {
        bool __recycle = _M_recycle;
        _M_recycle = false;

        if (__recycle)
        {
            return __next;
        }

        auto __parent = _M_parent;
        auto __alloc = _M_allocator;
        auto __ed = _M_execute_data;

        this->~__func_task();

        _PSTL_ASSERT(__parent != nullptr);
        _PSTL_ASSERT(__parent->_M_refcount.load(std::memory_order_relaxed) > 0);
        if (--__parent->_M_refcount == 0)
        {
            _PSTL_ASSERT(__next == nullptr);
            __alloc.deallocate(this, *__ed);
            return __parent;
        }

        return __next;
    }

    friend class __root_task<_Func>;

  public:
    template <typename _Fn>
    __func_task(_Fn&& __f) : _M_func(std::forward<_Fn>(__f))
    {
    }

    _Func&
    body()
    {
        return _M_func;
    }
};

template <typename _Func>
class __root_task : public __task
{
    __task*
    execute(tbb::detail::d1::execution_data&) override
    {
        _M_wait_object.release();
        return nullptr;
    }

    __task*
    cancel(tbb::detail::d1::execution_data&) override
    {
        _M_wait_object.release();
        return nullptr;
    }

    __func_task<_Func>* _M_func_task{};
    tbb::detail::d1::wait_context _M_wait_object{0};
    tbb::task_group_context _M_context{};

  public:
    template <typename... Args>
    __root_task(Args&&... args) : _M_wait_object{1}
    {
        tbb::detail::d1::small_object_allocator __alloc{};
        _M_func_task = __alloc.new_object<__func_task<_Func>>(_Func(std::forward<Args>(args)...));
        _M_func_task->_M_allocator = __alloc;
        _M_func_task->_M_parent = this;
        _M_refcount.store(1, std::memory_order_relaxed);
    }

    friend class __task;
};
#endif // TBB_INTERFACE_VERSION <= 12000
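
// Both __task implementations above expose the same continuation-passing protocol:
// a task body is a callable __task* (*)(__task* __self) that may allocate a
// continuation, spawn additional children, recycle itself, and return the next
// task to execute (or nullptr). A minimal driving sketch, mirroring the sort and
// merge code below (illustrative only; _MyFunc is a hypothetical body type):
//
//     __root_task<_MyFunc> __root{/* _MyFunc constructor arguments */};
//     __task::spawn_root_and_wait(__root); // blocks until the whole task tree finishes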

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _Cleanup,
          typename _LeafMerge>
class __merge_func
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _ValueType;

    _RandomAccessIterator1 _M_x_beg;
    _RandomAccessIterator2 _M_z_beg;

    _SizeType _M_xs, _M_xe;
    _SizeType _M_ys, _M_ye;
    _SizeType _M_zs;
    _Compare _M_comp;
    _LeafMerge _M_leaf_merge;
    _SizeType _M_nsort; // number of elements to be sorted for the partial_sort algorithm

    static const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;

    bool _root;   // true if this task is the root merging task
    bool _x_orig; // true if the X (or left)  subrange is in the original container; false if in the buffer
    bool _y_orig; // true if the Y (or right) subrange is in the original container; false if in the buffer
    bool _split;  // true if this merge task is a split task for parallel merging; the execution logic differs

    bool
    is_partial() const
    {
        return _M_nsort > 0;
    }

    struct __move_value
    {
        template <typename Iterator1, typename Iterator2>
        void
        operator()(Iterator1 __x, Iterator2 __z)
        {
            *__z = std::move(*__x);
        }
    };

    struct __move_value_construct
    {
        template <typename Iterator1, typename Iterator2>
        void
        operator()(Iterator1 __x, Iterator2 __z)
        {
            ::new (std::addressof(*__z)) _ValueType(std::move(*__x));
        }
    };

    struct __move_range
    {
        template <typename Iterator1, typename Iterator2>
        Iterator2
        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
        {
            if (__last1 - __first1 < __merge_cut_off)
                return std::move(__first1, __last1, __first2);

            auto __n = __last1 - __first1;
            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
                                  std::move(__first1 + __range.begin(), __first1 + __range.end(),
                                            __first2 + __range.begin());
                              });
            return __first2 + __n;
        }
    };

    struct __move_range_construct
    {
        template <typename Iterator1, typename Iterator2>
        Iterator2
        operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
        {
            if (__last1 - __first1 < __merge_cut_off)
            {
                for (; __first1 != __last1; ++__first1, ++__first2)
                    __move_value_construct()(__first1, __first2);
                return __first2;
            }

            auto __n = __last1 - __first1;
            tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                              [__first1, __first2](const tbb::blocked_range<_SizeType>& __range) {
                                  for (auto i = __range.begin(); i != __range.end(); ++i)
                                      __move_value_construct()(__first1 + i, __first2 + i);
                              });
            return __first2 + __n;
        }
    };

    struct __cleanup_range
    {
        template <typename Iterator>
        void
        operator()(Iterator __first, Iterator __last)
        {
            if (__last - __first < __merge_cut_off)
                _Cleanup()(__first, __last);
            else
            {
                auto __n = __last - __first;
                tbb::parallel_for(tbb::blocked_range<_SizeType>(0, __n, __merge_cut_off),
                                  [__first](const tbb::blocked_range<_SizeType>& __range) {
                                      _Cleanup()(__first + __range.begin(), __first + __range.end());
                                  });
            }
        }
    };

  public:
    __merge_func(_SizeType __xs, _SizeType __xe, _SizeType __ys, _SizeType __ye, _SizeType __zs, _Compare __comp,
                 _Cleanup, _LeafMerge __leaf_merge, _SizeType __nsort, _RandomAccessIterator1 __x_beg,
                 _RandomAccessIterator2 __z_beg, bool __x_orig, bool __y_orig, bool __root)
        : _M_x_beg(__x_beg), _M_z_beg(__z_beg), _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs),
          _M_comp(__comp), _M_leaf_merge(__leaf_merge), _M_nsort(__nsort), _root(__root), _x_orig(__x_orig),
          _y_orig(__y_orig), _split(false)
    {
    }

    bool
    is_left(_SizeType __idx) const
    {
        return _M_xs == __idx;
    }

    template <typename IndexType>
    void
    set_odd(IndexType __idx, bool __on_off)
    {
        if (is_left(__idx))
            _x_orig = __on_off;
        else
            _y_orig = __on_off;
    }

    __task*
    operator()(__task* __self);

  private:
    __merge_func*
    parent_merge(__task* __self) const
    {
        return _root ? nullptr : &static_cast<__func_task<__merge_func>*>(__self->parent())->body();
    }
    bool
    x_less_y()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        _PSTL_ASSERT(__nx > 0 && __ny > 0);

        _PSTL_ASSERT(_x_orig == _y_orig);
        _PSTL_ASSERT(!is_partial());

        if (_x_orig)
        {
            _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_comp));
            _PSTL_ASSERT(std::is_sorted(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_comp));
            return !_M_comp(*(_M_x_beg + _M_ys), *(_M_x_beg + _M_xe - 1));
        }

        _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
        _PSTL_ASSERT(std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));
        return !_M_comp(*(_M_z_beg + _M_zs + __nx), *(_M_z_beg + _M_zs + __nx - 1));
    }
    void
    move_x_range()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        _PSTL_ASSERT(__nx > 0 && __ny > 0);

        if (_x_orig)
            __move_range_construct()(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_z_beg + _M_zs);
        else
        {
            __move_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx, _M_x_beg + _M_xs);
            __cleanup_range()(_M_z_beg + _M_zs, _M_z_beg + _M_zs + __nx);
        }

        _x_orig = !_x_orig;
    }
    void
    move_y_range()
    {
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);

        if (_y_orig)
            __move_range_construct()(_M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs + __nx);
        else
        {
            __move_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny, _M_x_beg + _M_ys);
            __cleanup_range()(_M_z_beg + _M_zs + __nx, _M_z_beg + _M_zs + __nx + __ny);
        }

        _y_orig = !_y_orig;
    }
    __task*
    merge_ranges(__task* __self)
    {
        _PSTL_ASSERT(_x_orig == _y_orig); // the two subranges to be merged must lie in the same buffer

        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);
        const auto __n = __nx + __ny;

        // need to merge {x} and {y}
        if (__n > __merge_cut_off)
            return split_merging(__self);

        // merge to the buffer
        if (_x_orig)
        {
            _M_leaf_merge(_M_x_beg + _M_xs, _M_x_beg + _M_xe, _M_x_beg + _M_ys, _M_x_beg + _M_ye, _M_z_beg + _M_zs,
                          _M_comp, __move_value_construct(), __move_value_construct(), __move_range_construct(),
                          __move_range_construct());
            _PSTL_ASSERT(parent_merge(__self)); // not the root merging task
        }
        // merge to the "origin"
        else
        {
            _PSTL_ASSERT(_x_orig == _y_orig);

            _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_comp));
            _PSTL_ASSERT(is_partial() || std::is_sorted(_M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_comp));

            _M_leaf_merge(_M_z_beg + _M_xs, _M_z_beg + _M_xe, _M_z_beg + _M_ys, _M_z_beg + _M_ye, _M_x_beg + _M_zs,
                          _M_comp, __move_value(), __move_value(), __move_range(), __move_range());

            __cleanup_range()(_M_z_beg + _M_xs, _M_z_beg + _M_xe);
            __cleanup_range()(_M_z_beg + _M_ys, _M_z_beg + _M_ye);
        }
        return nullptr;
    }

    __task*
    process_ranges(__task* __self)
    {
        _PSTL_ASSERT(_x_orig == _y_orig);
        _PSTL_ASSERT(!_split);

        auto __p = parent_merge(__self);

        if (!__p)
        { // root merging task

            // optimization for the sort algorithm only: {x} <= {y} already holds
            if (!is_partial() && x_less_y()) // we have a solution
            {
                if (!_x_orig)
                { // we have to move the solution to the origin
                    move_x_range(); // parallel moving
                    move_y_range(); // parallel moving
                }
                return nullptr;
            }
            // else: if we have data in the origin,
            // we have to move the data to the buffer for final merging into the origin.
            if (_x_orig)
            {
                move_x_range(); // parallel moving
                move_y_range(); // parallel moving
            }
            // need to merge {x} and {y}.
            return merge_ranges(__self);
        }
        // else: not the root merging task (parent_merge() != nullptr)
        // optimization for the sort algorithm only: {x} <= {y} already holds
        if (!is_partial() && x_less_y())
        {
            const auto __id_range = _M_zs;
            __p->set_odd(__id_range, _x_orig);
            return nullptr;
        }
        // else: we have to flip the "_x(y)_orig" flag of the parent merging task
        const auto __id_range = _M_zs;
        __p->set_odd(__id_range, !_x_orig);

        return merge_ranges(__self);
    }

    // Split the merge task into two tasks of the same level.
    __task*
    split_merging(__task* __self)
    {
        _PSTL_ASSERT(_x_orig == _y_orig);
        const auto __nx = (_M_xe - _M_xs);
        const auto __ny = (_M_ye - _M_ys);

        _SizeType __xm{};
        _SizeType __ym{};
        if (__nx < __ny)
        {
            __ym = _M_ys + __ny / 2;

            if (_x_orig)
                __xm = std::upper_bound(_M_x_beg + _M_xs, _M_x_beg + _M_xe, *(_M_x_beg + __ym), _M_comp) - _M_x_beg;
            else
                __xm = std::upper_bound(_M_z_beg + _M_xs, _M_z_beg + _M_xe, *(_M_z_beg + __ym), _M_comp) - _M_z_beg;
        }
        else
        {
            __xm = _M_xs + __nx / 2;

            if (_y_orig)
                __ym = std::lower_bound(_M_x_beg + _M_ys, _M_x_beg + _M_ye, *(_M_x_beg + __xm), _M_comp) - _M_x_beg;
            else
                __ym = std::lower_bound(_M_z_beg + _M_ys, _M_z_beg + _M_ye, *(_M_z_beg + __xm), _M_comp) - _M_z_beg;
        }

        auto __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
        __merge_func __right_func(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _Cleanup(), _M_leaf_merge, _M_nsort,
                                  _M_x_beg, _M_z_beg, _x_orig, _y_orig, _root);
        __right_func._split = true;
        auto __merge_task = __self->make_additional_child_of(__self->parent(), std::move(__right_func));
        __self->spawn(__merge_task);
        __self->recycle_as_continuation();

        _M_xe = __xm;
        _M_ye = __ym;
        _split = true;

        return __self;
    }
};

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename __M_Compare, typename _Cleanup,
          typename _LeafMerge>
__task*
__merge_func<_RandomAccessIterator1, _RandomAccessIterator2, __M_Compare, _Cleanup, _LeafMerge>::
operator()(__task* __self)
{
    // a. Split the merge task into two of the same level; special logic that skips
    //    processing (process_ranges) of the adjacent sub-ranges x and y.
    if (_split)
        return merge_ranges(__self);

    // b. General merging of the adjacent sub-ranges x and y (with an optimization for the case {x} <= {y})

    // 1. x and y are in the even buffer
    // 2. x and y are in the odd buffer
    if (_x_orig == _y_orig)
        return process_ranges(__self);

    // 3. x is in the even buffer, y is in the odd buffer
    // 4. x is in the odd buffer, y is in the even buffer
    if (!parent_merge(__self))
    { // root merge task
        if (_x_orig)
            move_x_range();
        else
            move_y_range();
    }
    else
    {
        const _SizeType __nx = (_M_xe - _M_xs);
        const _SizeType __ny = (_M_ye - _M_ys);
        _PSTL_ASSERT(__nx > 0);
        _PSTL_ASSERT(__ny > 0);

        if (__nx < __ny)
            move_x_range();
        else
            move_y_range();
    }

    return process_ranges(__self);
}

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
class __stable_sort_func
{
  public:
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;

  private:
    _RandomAccessIterator1 _M_xs, _M_xe, _M_x_beg;
    _RandomAccessIterator2 _M_zs, _M_z_beg;
    _Compare _M_comp;
    _LeafSort _M_leaf_sort;
    bool _M_root;
    _SizeType _M_nsort; // zero, or the number of elements to be sorted for the partial_sort algorithm

  public:
    __stable_sort_func(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __zs,
                       bool __root, _Compare __comp, _LeafSort __leaf_sort, _SizeType __nsort,
                       _RandomAccessIterator1 __x_beg, _RandomAccessIterator2 __z_beg)
        : _M_xs(__xs), _M_xe(__xe), _M_x_beg(__x_beg), _M_zs(__zs), _M_z_beg(__z_beg), _M_comp(__comp),
          _M_leaf_sort(__leaf_sort), _M_root(__root), _M_nsort(__nsort)
    {
    }

    __task*
    operator()(__task* __self);
};

#define _PSTL_STABLE_SORT_CUT_OFF 500

template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Compare, typename _LeafSort>
__task*
__stable_sort_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, _LeafSort>::operator()(__task* __self)
{
    typedef __merge_func<_RandomAccessIterator1, _RandomAccessIterator2, _Compare, __utils::__serial_destroy,
                         __utils::__serial_move_merge>
        _MergeTaskType;

    const _SizeType __n = _M_xe - _M_xs;
    const _SizeType __nmerge = _M_nsort > 0 ? _M_nsort : __n;
    const _SizeType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
    if (__n <= __sort_cut_off)
    {
        _M_leaf_sort(_M_xs, _M_xe, _M_comp);
        _PSTL_ASSERT(!_M_root);
        return nullptr;
    }

    const _RandomAccessIterator1 __xm = _M_xs + __n / 2;
    const _RandomAccessIterator2 __zm = _M_zs + (__xm - _M_xs);
    _MergeTaskType __m(_MergeTaskType(_M_xs - _M_x_beg, __xm - _M_x_beg, __xm - _M_x_beg, _M_xe - _M_x_beg,
                                      _M_zs - _M_z_beg, _M_comp, __utils::__serial_destroy(),
                                      __utils::__serial_move_merge(__nmerge), _M_nsort, _M_x_beg, _M_z_beg,
                                      /*x_orig*/ true, /*y_orig*/ true, /*root*/ _M_root));
    auto __parent = __self->make_continuation(std::move(__m));
    __parent->set_ref_count(2);
    auto __right = __self->make_child_of(
        __parent, __stable_sort_func(__xm, _M_xe, __zm, false, _M_comp, _M_leaf_sort, _M_nsort, _M_x_beg, _M_z_beg));
    __self->spawn(__right);
    __self->recycle_as_child_of(__parent);
    _M_root = false;
    _M_xe = __xm;

    return __self;
}

template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
void
__parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
                       _LeafSort __leaf_sort, std::size_t __nsort = 0)
{
    tbb::this_task_arena::isolate([=, &__nsort]() {
        // sorting based on a task tree and parallel merge
        typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _ValueType;
        typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType;
        const _DifferenceType __n = __xe - __xs;
        if (__nsort == __n)
            __nsort = 0; // 'partial_sort' becomes 'sort'

        const _DifferenceType __sort_cut_off = _PSTL_STABLE_SORT_CUT_OFF;
        if (__n > __sort_cut_off)
        {
            __buffer<_ValueType> __buf(__n);
            __root_task<__stable_sort_func<_RandomAccessIterator, _ValueType*, _Compare, _LeafSort>> __root{
                __xs, __xe, __buf.get(), true, __comp, __leaf_sort, __nsort, __xs, __buf.get()};
            __task::spawn_root_and_wait(__root);
            return;
        }
        // serial sort
        __leaf_sort(__xs, __xe, __comp);
    });
}
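
// Illustrative sketch (not part of the header): a stable sort of a hypothetical
// std::vector<int> __v, delegating leaf subranges to std::stable_sort:
//
//     __tbb_backend::__parallel_stable_sort(std::execution::par, __v.begin(), __v.end(),
//                                           std::less<int>(),
//                                           [](auto __f, auto __l, auto __c)
//                                           { std::stable_sort(__f, __l, __c); });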

//------------------------------------------------------------------------
// parallel_merge
//------------------------------------------------------------------------
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
          typename _Compare, typename _LeafMerge>
class __merge_func_static
{
    _RandomAccessIterator1 _M_xs, _M_xe;
    _RandomAccessIterator2 _M_ys, _M_ye;
    _RandomAccessIterator3 _M_zs;
    _Compare _M_comp;
    _LeafMerge _M_leaf_merge;

  public:
    __merge_func_static(_RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys,
                        _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
                        _LeafMerge __leaf_merge)
        : _M_xs(__xs), _M_xe(__xe), _M_ys(__ys), _M_ye(__ye), _M_zs(__zs), _M_comp(__comp), _M_leaf_merge(__leaf_merge)
    {
    }

    __task*
    operator()(__task* __self);
};

//TODO: consider usage of parallel_for with a custom blocked_range
template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
          typename __M_Compare, typename _LeafMerge>
__task*
__merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3, __M_Compare, _LeafMerge>::
operator()(__task* __self)
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    const _SizeType __n = (_M_xe - _M_xs) + (_M_ye - _M_ys);
    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
    if (__n <= __merge_cut_off)
    {
        _M_leaf_merge(_M_xs, _M_xe, _M_ys, _M_ye, _M_zs, _M_comp);
        return nullptr;
    }

    _RandomAccessIterator1 __xm;
    _RandomAccessIterator2 __ym;
    if (_M_xe - _M_xs < _M_ye - _M_ys)
    {
        __ym = _M_ys + (_M_ye - _M_ys) / 2;
        __xm = std::upper_bound(_M_xs, _M_xe, *__ym, _M_comp);
    }
    else
    {
        __xm = _M_xs + (_M_xe - _M_xs) / 2;
        __ym = std::lower_bound(_M_ys, _M_ye, *__xm, _M_comp);
    }
    const _RandomAccessIterator3 __zm = _M_zs + ((__xm - _M_xs) + (__ym - _M_ys));
    auto __right = __self->make_additional_child_of(
        __self->parent(), __merge_func_static(__xm, _M_xe, __ym, _M_ye, __zm, _M_comp, _M_leaf_merge));
    __self->spawn(__right);
    __self->recycle_as_continuation();
    _M_xe = __xm;
    _M_ye = __ym;

    return __self;
}

template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
          typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
void
__parallel_merge(_ExecutionPolicy&&, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
                 _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp,
                 _LeafMerge __leaf_merge)
{
    typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType1;
    typedef typename std::iterator_traits<_RandomAccessIterator2>::difference_type _DifferenceType2;
    typedef typename std::common_type<_DifferenceType1, _DifferenceType2>::type _SizeType;
    const _SizeType __n = (__xe - __xs) + (__ye - __ys);
    const _SizeType __merge_cut_off = _PSTL_MERGE_CUT_OFF;
    if (__n <= __merge_cut_off)
    {
        // Fall back on serial merge
        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
    }
    else
    {
        tbb::this_task_arena::isolate([=]() {
            typedef __merge_func_static<_RandomAccessIterator1, _RandomAccessIterator2, _RandomAccessIterator3,
                                        _Compare, _LeafMerge>
                _TaskType;
            __root_task<_TaskType> __root{__xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge};
            __task::spawn_root_and_wait(__root);
        });
    }
}
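
// Illustrative sketch (not part of the header): merging two sorted hypothetical
// vectors __a and __b into __out with a serial leaf merge:
//
//     __tbb_backend::__parallel_merge(std::execution::par, __a.begin(), __a.end(),
//                                     __b.begin(), __b.end(), __out.begin(), std::less<int>(),
//                                     [](auto __f1, auto __l1, auto __f2, auto __l2, auto __o, auto __c)
//                                     { std::merge(__f1, __l1, __f2, __l2, __o, __c); });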

//------------------------------------------------------------------------
// parallel_invoke
//------------------------------------------------------------------------
template <class _ExecutionPolicy, typename _F1, typename _F2>
void
__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
{
    //TODO: a version of tbb::this_task_arena::isolate with a variadic argument pack should be added in the future
    tbb::this_task_arena::isolate([&]() { tbb::parallel_invoke(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); });
}

} // namespace __tbb_backend
} // namespace __pstl

#endif /* _PSTL_PARALLEL_BACKEND_TBB_H */