| 1 |
// -*- C++ -*- |
| 2 |
|
| 3 |
// Copyright (C) 2007-2021 Free Software Foundation, Inc. |
| 4 |
// |
| 5 |
// This file is part of the GNU ISO C++ Library. This library is free |
| 6 |
// software; you can redistribute it and/or modify it under the terms |
| 7 |
// of the GNU General Public License as published by the Free Software |
| 8 |
// Foundation; either version 3, or (at your option) any later |
| 9 |
// version. |
| 10 |
|
| 11 |
// This library is distributed in the hope that it will be useful, but |
| 12 |
// WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 |
// General Public License for more details. |
| 15 |
|
| 16 |
// Under Section 7 of GPL version 3, you are granted additional |
| 17 |
// permissions described in the GCC Runtime Library Exception, version |
| 18 |
// 3.1, as published by the Free Software Foundation. |
| 19 |
|
| 20 |
// You should have received a copy of the GNU General Public License and |
| 21 |
// a copy of the GCC Runtime Library Exception along with this program; |
| 22 |
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| 23 |
// <http://www.gnu.org/licenses/>. |
| 24 |
|
| 25 |
/** @file parallel/settings.h |
| 26 |
* @brief Runtime settings and tuning parameters, heuristics to decide |
| 27 |
* whether to use parallelized algorithms. |
| 28 |
* |
| 29 |
* This file is a GNU parallel extension to the Standard C++ Library. |
| 30 |
* |
| 31 |
* @section parallelization_decision Deciding whether to run an algorithm in parallel. |
| 32 |
* |
| 33 |
* There are several ways the user can switch on and off the parallel |
| 34 |
* execution of an algorithm, both at compile- and run-time. |
| 35 |
* |
| 36 |
* Only sequential execution can be forced at compile-time. This |
| 37 |
* reduces code size and protects code parts that have |
| 38 |
* non-thread-safe side effects. |
| 39 |
* |
| 40 |
* Ultimately, forcing parallel execution at compile-time makes |
| 41 |
* sense. Often, the sequential algorithm implementation is used as |
| 42 |
* a subroutine, so no reduction in code size can be achieved. Also, |
| 43 |
* the machine the program is run on might have only one processor |
| 44 |
* core, so to avoid overhead, the algorithm is executed |
| 45 |
* sequentially. |
| 46 |
* |
| 47 |
* To force sequential execution of an algorithm ultimately at |
| 48 |
* compile-time, the user must add the tag |
| 49 |
* gnu_parallel::sequential_tag() to the end of the parameter list, |
| 50 |
* e. g. |
| 51 |
* |
| 52 |
* \code |
| 53 |
* std::sort(__v.begin(), __v.end(), __gnu_parallel::sequential_tag()); |
| 54 |
* \endcode |
| 55 |
* |
| 56 |
* This is compatible with all overloaded algorithm variants. No |
| 57 |
* additional code will be instantiated, at all. The same holds for |
| 58 |
* most algorithm calls with iterators not providing random access. |
| 59 |
* |
| 60 |
* If the algorithm call is not forced to be executed sequentially |
| 61 |
* at compile-time, the decision is made at run-time. |
| 62 |
* The global variable __gnu_parallel::_Settings::algorithm_strategy |
| 63 |
* is checked. It is a tristate variable corresponding to: |
| 64 |
* - a. force_sequential, meaning the sequential algorithm is executed. |
| 65 |
* - b. force_parallel, meaning the parallel algorithm is executed. |
| 66 |
* - c. heuristic |
| 67 |
* |
| 68 |
* For heuristic, the parallel algorithm implementation is called |
| 69 |
* only if the input size is sufficiently large. For most |
| 70 |
* algorithms, the input size is the (combined) length of the input |
| 71 |
* sequence(__s). The threshold can be set by the user, individually |
| 72 |
* for each algorithm. The according variables are called |
| 73 |
* gnu_parallel::_Settings::[algorithm]_minimal_n . |
| 74 |
* |
| 75 |
* For some of the algorithms, there are even more tuning options, |
| 76 |
* e. g. the ability to choose from multiple algorithm variants. See |
| 77 |
* below for details. |
| 78 |
*/ |
| 79 |
|
| 80 |
// Written by Johannes Singler and Felix Putze. |
| 81 |
|
| 82 |
#ifndef _GLIBCXX_PARALLEL_SETTINGS_H |
| 83 |
#define _GLIBCXX_PARALLEL_SETTINGS_H 1 |
| 84 |
|
| 85 |
#include <parallel/types.h> |
| 86 |
|
| 87 |
/** |
| 88 |
* @brief Determine at compile(?)-time if the parallel variant of an |
| 89 |
* algorithm should be called. |
| 90 |
* @param __c A condition that is convertible to bool that is overruled by |
| 91 |
* __gnu_parallel::_Settings::algorithm_strategy. Usually a decision |
| 92 |
* based on the input size. |
| 93 |
*/ |
| 94 |
#define _GLIBCXX_PARALLEL_CONDITION(__c) \ |
| 95 |
(__gnu_parallel::_Settings::get().algorithm_strategy \ |
| 96 |
!= __gnu_parallel::force_sequential \ |
| 97 |
&& ((__gnu_parallel::__get_max_threads() > 1 && (__c)) \ |
| 98 |
|| __gnu_parallel::_Settings::get().algorithm_strategy \ |
| 99 |
== __gnu_parallel::force_parallel)) |
| 100 |
|
| 101 |
/* |
| 102 |
inline bool |
| 103 |
parallel_condition(bool __c) |
| 104 |
{ |
| 105 |
bool ret = false; |
| 106 |
const _Settings& __s = _Settings::get(); |
| 107 |
if (__s.algorithm_strategy != force_seqential) |
| 108 |
{ |
| 109 |
if (__s.algorithm_strategy == force_parallel) |
| 110 |
ret = true; |
| 111 |
else |
| 112 |
ret = __get_max_threads() > 1 && __c; |
| 113 |
} |
| 114 |
return ret; |
| 115 |
} |
| 116 |
*/ |
| 117 |
|
| 118 |
namespace __gnu_parallel |
| 119 |
{ |
| 120 |
/// class _Settings |
| 121 |
/// Run-time settings for the parallel mode including all tunable parameters. |
| 122 |
struct _Settings |
| 123 |
{ |
| 124 |
_AlgorithmStrategy algorithm_strategy; |
| 125 |
|
| 126 |
_SortAlgorithm sort_algorithm; |
| 127 |
_PartialSumAlgorithm partial_sum_algorithm; |
| 128 |
_MultiwayMergeAlgorithm multiway_merge_algorithm; |
| 129 |
_FindAlgorithm find_algorithm; |
| 130 |
|
| 131 |
_SplittingAlgorithm sort_splitting; |
| 132 |
_SplittingAlgorithm merge_splitting; |
| 133 |
_SplittingAlgorithm multiway_merge_splitting; |
| 134 |
|
| 135 |
// Per-algorithm settings. |
| 136 |
|
| 137 |
/// Minimal input size for accumulate. |
| 138 |
_SequenceIndex accumulate_minimal_n; |
| 139 |
|
| 140 |
/// Minimal input size for adjacent_difference. |
| 141 |
unsigned int adjacent_difference_minimal_n; |
| 142 |
|
| 143 |
/// Minimal input size for count and count_if. |
| 144 |
_SequenceIndex count_minimal_n; |
| 145 |
|
| 146 |
/// Minimal input size for fill. |
| 147 |
_SequenceIndex fill_minimal_n; |
| 148 |
|
| 149 |
/// Block size increase factor for find. |
| 150 |
double find_increasing_factor; |
| 151 |
|
| 152 |
/// Initial block size for find. |
| 153 |
_SequenceIndex find_initial_block_size; |
| 154 |
|
| 155 |
/// Maximal block size for find. |
| 156 |
_SequenceIndex find_maximum_block_size; |
| 157 |
|
| 158 |
/// Start with looking for this many elements sequentially, for find. |
| 159 |
_SequenceIndex find_sequential_search_size; |
| 160 |
|
| 161 |
/// Minimal input size for for_each. |
| 162 |
_SequenceIndex for_each_minimal_n; |
| 163 |
|
| 164 |
/// Minimal input size for generate. |
| 165 |
_SequenceIndex generate_minimal_n; |
| 166 |
|
| 167 |
/// Minimal input size for max_element. |
| 168 |
_SequenceIndex max_element_minimal_n; |
| 169 |
|
| 170 |
/// Minimal input size for merge. |
| 171 |
_SequenceIndex merge_minimal_n; |
| 172 |
|
| 173 |
/// Oversampling factor for merge. |
| 174 |
unsigned int merge_oversampling; |
| 175 |
|
| 176 |
/// Minimal input size for min_element. |
| 177 |
_SequenceIndex min_element_minimal_n; |
| 178 |
|
| 179 |
/// Minimal input size for multiway_merge. |
| 180 |
_SequenceIndex multiway_merge_minimal_n; |
| 181 |
|
| 182 |
/// Oversampling factor for multiway_merge. |
| 183 |
int multiway_merge_minimal_k; |
| 184 |
|
| 185 |
/// Oversampling factor for multiway_merge. |
| 186 |
unsigned int multiway_merge_oversampling; |
| 187 |
|
| 188 |
/// Minimal input size for nth_element. |
| 189 |
_SequenceIndex nth_element_minimal_n; |
| 190 |
|
| 191 |
/// Chunk size for partition. |
| 192 |
_SequenceIndex partition_chunk_size; |
| 193 |
|
| 194 |
/// Chunk size for partition, relative to input size. If > 0.0, |
| 195 |
/// this value overrides partition_chunk_size. |
| 196 |
double partition_chunk_share; |
| 197 |
|
| 198 |
/// Minimal input size for partition. |
| 199 |
_SequenceIndex partition_minimal_n; |
| 200 |
|
| 201 |
/// Minimal input size for partial_sort. |
| 202 |
_SequenceIndex partial_sort_minimal_n; |
| 203 |
|
| 204 |
/// Ratio for partial_sum. Assume "sum and write result" to be |
| 205 |
/// this factor slower than just "sum". |
| 206 |
float partial_sum_dilation; |
| 207 |
|
| 208 |
/// Minimal input size for partial_sum. |
| 209 |
unsigned int partial_sum_minimal_n; |
| 210 |
|
| 211 |
/// Minimal input size for random_shuffle. |
| 212 |
unsigned int random_shuffle_minimal_n; |
| 213 |
|
| 214 |
/// Minimal input size for replace and replace_if. |
| 215 |
_SequenceIndex replace_minimal_n; |
| 216 |
|
| 217 |
/// Minimal input size for set_difference. |
| 218 |
_SequenceIndex set_difference_minimal_n; |
| 219 |
|
| 220 |
/// Minimal input size for set_intersection. |
| 221 |
_SequenceIndex set_intersection_minimal_n; |
| 222 |
|
| 223 |
/// Minimal input size for set_symmetric_difference. |
| 224 |
_SequenceIndex set_symmetric_difference_minimal_n; |
| 225 |
|
| 226 |
/// Minimal input size for set_union. |
| 227 |
_SequenceIndex set_union_minimal_n; |
| 228 |
|
| 229 |
/// Minimal input size for parallel sorting. |
| 230 |
_SequenceIndex sort_minimal_n; |
| 231 |
|
| 232 |
/// Oversampling factor for parallel std::sort (MWMS). |
| 233 |
unsigned int sort_mwms_oversampling; |
| 234 |
|
| 235 |
/// Such many samples to take to find a good pivot (quicksort). |
| 236 |
unsigned int sort_qs_num_samples_preset; |
| 237 |
|
| 238 |
/// Maximal subsequence __length to switch to unbalanced __base case. |
| 239 |
/// Applies to std::sort with dynamically load-balanced quicksort. |
| 240 |
_SequenceIndex sort_qsb_base_case_maximal_n; |
| 241 |
|
| 242 |
/// Minimal input size for parallel std::transform. |
| 243 |
_SequenceIndex transform_minimal_n; |
| 244 |
|
| 245 |
/// Minimal input size for unique_copy. |
| 246 |
_SequenceIndex unique_copy_minimal_n; |
| 247 |
|
| 248 |
_SequenceIndex workstealing_chunk_size; |
| 249 |
|
| 250 |
// Hardware dependent tuning parameters. |
| 251 |
|
| 252 |
/// size of the L1 cache in bytes (underestimation). |
| 253 |
unsigned long long L1_cache_size; |
| 254 |
|
| 255 |
/// size of the L2 cache in bytes (underestimation). |
| 256 |
unsigned long long L2_cache_size; |
| 257 |
|
| 258 |
/// size of the Translation Lookaside Buffer (underestimation). |
| 259 |
unsigned int TLB_size; |
| 260 |
|
| 261 |
/// Overestimation of cache line size. Used to avoid false |
| 262 |
/// sharing, i.e. elements of different threads are at least this |
| 263 |
/// amount apart. |
| 264 |
unsigned int cache_line_size; |
| 265 |
|
| 266 |
// Statistics. |
| 267 |
|
| 268 |
/// The number of stolen ranges in load-balanced quicksort. |
| 269 |
_SequenceIndex qsb_steals; |
| 270 |
|
| 271 |
/// Minimal input size for search and search_n. |
| 272 |
_SequenceIndex search_minimal_n; |
| 273 |
|
| 274 |
/// Block size scale-down factor with respect to current position. |
| 275 |
float find_scale_factor; |
| 276 |
|
| 277 |
/// Get the global settings. |
| 278 |
_GLIBCXX_CONST static const _Settings& |
| 279 |
get() throw(); |
| 280 |
|
| 281 |
/// Set the global settings. |
| 282 |
static void |
| 283 |
set(_Settings&) throw(); |
| 284 |
|
| 285 |
explicit |
| 286 |
_Settings() : |
| 287 |
algorithm_strategy(heuristic), |
| 288 |
sort_algorithm(MWMS), |
| 289 |
partial_sum_algorithm(LINEAR), |
| 290 |
multiway_merge_algorithm(LOSER_TREE), |
| 291 |
find_algorithm(CONSTANT_SIZE_BLOCKS), |
| 292 |
sort_splitting(EXACT), |
| 293 |
merge_splitting(EXACT), |
| 294 |
multiway_merge_splitting(EXACT), |
| 295 |
accumulate_minimal_n(1000), |
| 296 |
adjacent_difference_minimal_n(1000), |
| 297 |
count_minimal_n(1000), |
| 298 |
fill_minimal_n(1000), |
| 299 |
find_increasing_factor(2.0), |
| 300 |
find_initial_block_size(256), |
| 301 |
find_maximum_block_size(8192), |
| 302 |
find_sequential_search_size(256), |
| 303 |
for_each_minimal_n(1000), |
| 304 |
generate_minimal_n(1000), |
| 305 |
max_element_minimal_n(1000), |
| 306 |
merge_minimal_n(1000), |
| 307 |
merge_oversampling(10), |
| 308 |
min_element_minimal_n(1000), |
| 309 |
multiway_merge_minimal_n(1000), |
| 310 |
multiway_merge_minimal_k(2), multiway_merge_oversampling(10), |
| 311 |
nth_element_minimal_n(1000), |
| 312 |
partition_chunk_size(1000), |
| 313 |
partition_chunk_share(0.0), |
| 314 |
partition_minimal_n(1000), |
| 315 |
partial_sort_minimal_n(1000), |
| 316 |
partial_sum_dilation(1.0f), |
| 317 |
partial_sum_minimal_n(1000), |
| 318 |
random_shuffle_minimal_n(1000), |
| 319 |
replace_minimal_n(1000), |
| 320 |
set_difference_minimal_n(1000), |
| 321 |
set_intersection_minimal_n(1000), |
| 322 |
set_symmetric_difference_minimal_n(1000), |
| 323 |
set_union_minimal_n(1000), |
| 324 |
sort_minimal_n(1000), |
| 325 |
sort_mwms_oversampling(10), |
| 326 |
sort_qs_num_samples_preset(100), |
| 327 |
sort_qsb_base_case_maximal_n(100), |
| 328 |
transform_minimal_n(1000), |
| 329 |
unique_copy_minimal_n(10000), |
| 330 |
workstealing_chunk_size(100), |
| 331 |
L1_cache_size(16 << 10), |
| 332 |
L2_cache_size(256 << 10), |
| 333 |
TLB_size(128), |
| 334 |
cache_line_size(64), |
| 335 |
qsb_steals(0), |
| 336 |
search_minimal_n(1000), |
| 337 |
find_scale_factor(0.01f) |
| 338 |
{ } |
| 339 |
}; |
| 340 |
} |
| 341 |
|
| 342 |
#endif /* _GLIBCXX_PARALLEL_SETTINGS_H */ |