// -*- C++ -*-

// Copyright (C) 2007-2021 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library.  This library is free
// software; you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation; either version 3, or (at your option) any later
// version.

// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
// <http://www.gnu.org/licenses/>.

/** @file parallel/workstealing.h
 *  @brief Parallelization of embarrassingly parallel execution by
 *  means of work-stealing.
 *
 *  Work stealing is described in
 *
 *  R. D. Blumofe and C. E. Leiserson.
 *  Scheduling multithreaded computations by work stealing.
 *  Journal of the ACM, 46(5):720-748, 1999.
 *
 *  This file is a GNU parallel extension to the Standard C++ Library.
 */

// Written by Felix Putze.

#ifndef _GLIBCXX_PARALLEL_WORKSTEALING_H
#define _GLIBCXX_PARALLEL_WORKSTEALING_H 1

#include <parallel/parallel.h>
#include <parallel/random_number.h>
#include <parallel/compatibility.h>

namespace __gnu_parallel
{

#define _GLIBCXX_JOB_VOLATILE volatile

  /** @brief One __job for a certain thread. */
  template<typename _DifferenceTp>
    struct _Job
    {
      typedef _DifferenceTp _DifferenceType;

      /** @brief First element.
       *
       *  Changed by owning and stealing thread.  By stealing thread,
       *  always incremented. */
      _GLIBCXX_JOB_VOLATILE _DifferenceType _M_first;

      /** @brief Last element.
       *
       *  Changed by owning thread only. */
      _GLIBCXX_JOB_VOLATILE _DifferenceType _M_last;

      /** @brief Number of elements, i.e. @c _M_last-_M_first+1.
       *
       *  Changed by owning thread only. */
      _GLIBCXX_JOB_VOLATILE _DifferenceType _M_load;
    };
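  // For example, a job that still covers elements 10 .. 19 has
  // _M_first == 10, _M_last == 19 and _M_load == 10.  Stealing threads
  // only ever advance _M_first, so a job shrinks from the front.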

  /** @brief Work stealing algorithm for random access iterators.
    *
    *  Uses O(1) additional memory.  Synchronization at job lists is
    *  done with atomic operations.
    *  @param __begin Begin iterator of element sequence.
    *  @param __end End iterator of element sequence.
    *  @param __op User-supplied functor (comparator, predicate, adding
    *  functor, ...).
    *  @param __f Functor to @a process an element with __op (depends on
    *  desired functionality, e.g. for std::for_each(), ...).
    *  @param __r Functor to @a add a single __result to the already
    *  processed elements (depends on functionality).
    *  @param __base Base value for reduction.
    *  @param __output Reference to the variable the final result is
    *  written to.
    *  @param __bound Maximum number of elements processed (e.g. for
    *  std::count_n()).
    *  @return User-supplied functor (that may contain a part of the result).
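    *
    *  A minimal usage sketch (illustrative only, not part of the library):
    *  _My_sum_selector below is a hypothetical stand-in for the library's
    *  internal selector functors; it only has to be callable as
    *  __f(__op, __iter) and to provide the _M_finish_iterator member this
    *  function assigns to.
    *  @code
    *    #include <functional>   // std::plus
    *
    *    struct _My_sum_selector
    *    {
    *      const int* _M_finish_iterator;
    *      int operator()(std::plus<int>&, const int* __i) const
    *      { return *__i; }
    *    };
    *
    *    int __a[4] = { 1, 2, 3, 4 };
    *    std::plus<int> __plus;
    *    _My_sum_selector __sel;
    *    int __sum = 0;
    *    __gnu_parallel::__for_each_template_random_access_workstealing
    *      (__a, __a + 4, __plus, __sel, __plus, 0, __sum, -1);
    *    // Afterwards __sum == 10 (the sum of all elements) and
    *    // __sel._M_finish_iterator == __a + 4.
    *  @endcode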
    */
  template<typename _RAIter,
           typename _Op,
           typename _Fu,
           typename _Red,
           typename _Result>
    _Op
    __for_each_template_random_access_workstealing(_RAIter __begin,
                                                   _RAIter __end, _Op __op,
                                                   _Fu& __f, _Red __r,
                                                   _Result __base,
                                                   _Result& __output,
      typename std::iterator_traits<_RAIter>::difference_type __bound)
    {
      _GLIBCXX_CALL(__end - __begin)

      typedef std::iterator_traits<_RAIter> _TraitsType;
      typedef typename _TraitsType::difference_type _DifferenceType;

      const _Settings& __s = _Settings::get();

      _DifferenceType __chunk_size =
        static_cast<_DifferenceType>(__s.workstealing_chunk_size);

      // How many elements to process in total?
      _DifferenceType __length = (__bound < 0) ? (__end - __begin) : __bound;

      // To avoid false sharing in a cache line.
      const int __stride = (__s.cache_line_size * 10
                            / sizeof(_Job<_DifferenceType>) + 1);

      // Total number of threads currently working.
      _ThreadIndex __busy = 0;

      _Job<_DifferenceType> *__job;

      omp_lock_t __output_lock;
      omp_init_lock(&__output_lock);

      // Write base value to output.
      __output = __base;

      // No more threads than jobs, at least one thread.
      _ThreadIndex __num_threads = __gnu_parallel::max<_ThreadIndex>
        (1, __gnu_parallel::min<_DifferenceType>(__length,
                                                 __get_max_threads()));

#     pragma omp parallel shared(__busy) num_threads(__num_threads)
      {
#       pragma omp single
        {
          __num_threads = omp_get_num_threads();

          // Create job description array.
          __job = new _Job<_DifferenceType>[__num_threads * __stride];
        }

        // Initialization phase.

        // Flag telling whether this thread is doing productive work.
        bool __iam_working = false;

        // Thread id.
        _ThreadIndex __iam = omp_get_thread_num();

        // This thread's job.
        _Job<_DifferenceType>& __my_job = __job[__iam * __stride];

        // Victim thread index, chosen at random (for work stealing).
        _ThreadIndex __victim;

        // Local value for reduction.
        _Result __result = _Result();

        // Number of elements to steal in one attempt.
        _DifferenceType __steal;

        // Every thread has its own random number generator
        // (modulo __num_threads).
        _RandomNumber __rand_gen(__iam, __num_threads);

        // This thread is currently working.
#       pragma omp atomic
        ++__busy;

        __iam_working = true;

        // How many elements per thread?  The last thread gets the rest.
        __my_job._M_first = static_cast<_DifferenceType>
          (__iam * (__length / __num_threads));

        __my_job._M_last = (__iam == (__num_threads - 1)
                            ? (__length - 1)
                            : ((__iam + 1) * (__length / __num_threads) - 1));
        __my_job._M_load = __my_job._M_last - __my_job._M_first + 1;
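        // E.g. with __length == 10 and __num_threads == 4 the initial
        // blocks are [0,1], [2,3], [4,5] and [6,9]: every thread gets
        // __length / __num_threads elements, and the last thread also
        // takes the remainder.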

        // Initialize the result with the _M_first element (to have a base
        // value for the reduction).
        if (__my_job._M_first <= __my_job._M_last)
          {
            // Cannot use the volatile variable directly.
            _DifferenceType __my_first = __my_job._M_first;
            __result = __f(__op, __begin + __my_first);
            ++__my_job._M_first;
            --__my_job._M_load;
          }

        _RAIter __current;

#       pragma omp barrier

        // Actual work phase: work on own or stolen current start.
        while (__busy > 0)
          {
            // Work until no productive thread is left.
#           pragma omp flush(__busy)

            // Thread has own work to do.
            while (__my_job._M_first <= __my_job._M_last)
              {
                // Fetch-and-add call:
                // reserve current job block (size __chunk_size) in my queue.
                _DifferenceType __current_job =
                  __fetch_and_add<_DifferenceType>(&(__my_job._M_first),
                                                   __chunk_size);

                // Update _M_load to keep the three values consistent;
                // _M_first might have been changed in the meantime.
                __my_job._M_load = __my_job._M_last - __my_job._M_first + 1;
                for (_DifferenceType __job_counter = 0;
                     __job_counter < __chunk_size
                       && __current_job <= __my_job._M_last;
                     ++__job_counter)
                  {
                    // Yes: process it!
                    __current = __begin + __current_job;
                    ++__current_job;

                    // Do the actual work.
                    __result = __r(__result, __f(__op, __current));
                  }

#               pragma omp flush(__busy)
              }

            // After reaching this point, a thread's __job list is empty.
            if (__iam_working)
              {
                // This thread no longer has work.
#               pragma omp atomic
                --__busy;

                __iam_working = false;
              }

            _DifferenceType __supposed_first, __supposed_last,
                            __supposed_load;
            do
              {
                // Find a random nonempty deque (not our own) and do a
                // consistency check.
                __yield();
#               pragma omp flush(__busy)
                __victim = __rand_gen();
                __supposed_first = __job[__victim * __stride]._M_first;
                __supposed_last = __job[__victim * __stride]._M_last;
                __supposed_load = __job[__victim * __stride]._M_load;
              }
            while (__busy > 0
                   && ((__supposed_load <= 0)
                       || ((__supposed_first + __supposed_load - 1)
                           != __supposed_last)));
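            // At this point either __busy has dropped to zero, or the three
            // values read above form a consistent snapshot of the victim's
            // job (_M_load matches _M_first and _M_last).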

            if (__busy == 0)
              break;

            if (__supposed_load > 0)
              {
                // The victim still has work left.
                // Number of elements to steal (at least one).
                __steal = (__supposed_load < 2) ? 1 : __supposed_load / 2;

                // Push the __victim's current start forward.
                _DifferenceType __stolen_first =
                  __fetch_and_add<_DifferenceType>
                  (&(__job[__victim * __stride]._M_first), __steal);
                _DifferenceType __stolen_try = (__stolen_first + __steal
                                                - _DifferenceType(1));

                __my_job._M_first = __stolen_first;
                __my_job._M_last = __gnu_parallel::min(__stolen_try,
                                                       __supposed_last);
                __my_job._M_load = __my_job._M_last - __my_job._M_first + 1;
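                // E.g. if the victim's job was [20,29] (load 10), __steal
                // is 5: the victim's _M_first jumps to 25 and this thread
                // takes over [20,24].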

                // Has potential work again.
#               pragma omp atomic
                ++__busy;
                __iam_working = true;

#               pragma omp flush(__busy)
              }
#           pragma omp flush(__busy)
          } // end while __busy > 0
        // Add the accumulated result to the output.
        omp_set_lock(&__output_lock);
        __output = __r(__output, __result);
        omp_unset_lock(&__output_lock);
      }

      delete[] __job;

      // Points one past the last element processed (needed as a return
      // value for some algorithms, e.g. transform).
      __f._M_finish_iterator = __begin + __length;

      omp_destroy_lock(&__output_lock);

      return __op;
    }
} // end namespace

#endif /* _GLIBCXX_PARALLEL_WORKSTEALING_H */