| 1 | // -*- C++ -*- | 
 
 
 
 
 | 2 |  | 
 
 
 
 
 | 3 | // Copyright (C) 2007-2021 Free Software Foundation, Inc. | 
 
 
 
 
 | 4 | // | 
 
 
 
 
 | 5 | // This file is part of the GNU ISO C++ Library.  This library is free | 
 
 
 
 
 | 6 | // software; you can redistribute it and/or modify it under the terms | 
 
 
 
 
 | 7 | // of the GNU General Public License as published by the Free Software | 
 
 
 
 
 | 8 | // Foundation; either version 3, or (at your option) any later | 
 
 
 
 
 | 9 | // version. | 
 
 
 
 
 | 10 |  | 
 
 
 
 
 | 11 | // This library is distributed in the hope that it will be useful, but | 
 
 
 
 
 | 12 | // WITHOUT ANY WARRANTY; without even the implied warranty of | 
 
 
 
 
 | 13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | 
 
 
 
 
 | 14 | // General Public License for more details. | 
 
 
 
 
 | 15 |  | 
 
 
 
 
 | 16 | // Under Section 7 of GPL version 3, you are granted additional | 
 
 
 
 
 | 17 | // permissions described in the GCC Runtime Library Exception, version | 
 
 
 
 
 | 18 | // 3.1, as published by the Free Software Foundation. | 
 
 
 
 
 | 19 |  | 
 
 
 
 
 | 20 | // You should have received a copy of the GNU General Public License and | 
 
 
 
 
 | 21 | // a copy of the GCC Runtime Library Exception along with this program; | 
 
 
 
 
 | 22 | // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see | 
 
 
 
 
 | 23 | // <http://www.gnu.org/licenses/>. | 
 
 
 
 
 | 24 |  | 
 
 
 
 
 | 25 | /** @file parallel/settings.h | 
 
 
 
 
 | 26 | *  @brief Runtime settings and tuning parameters, heuristics to decide | 
 
 
 
 
 | 27 | *  whether to use parallelized algorithms. | 
 
 
 
 
 | 28 | * | 
 
 
 
 
 | 29 | *  This file is a GNU parallel extension to the Standard C++ Library. | 
 
 
 
 
 | 30 | * | 
 
 
 
 
 | 31 | *  @section parallelization_decision Deciding whether to run an algorithm in parallel. | 
 
 
 
 
 | 32 | * | 
 
 
 
 
 | 33 | *  There are several ways the user can switch on and off the parallel | 
 
 
 
 
 | 34 | *  execution of an algorithm, both at compile- and run-time. | 
 
 
 
 
 | 35 | * | 
 
 
 
 
 | 36 | *  Only sequential execution can be forced at compile-time.  This | 
 
 
 
 
 | 37 | *  reduces code size and protects code parts that have | 
 
 
 
 
 | 38 | *  non-thread-safe side effects. | 
 
 
 
 
 | 39 | * | 
 
 
 
 
 | 40 | *  Ultimately, forcing parallel execution at compile-time makes | 
 
 
 
 
 | 41 | *  sense.  Often, the sequential algorithm implementation is used as | 
 
 
 
 
 | 42 | *  a subroutine, so no reduction in code size can be achieved.  Also, | 
 
 
 
 
 | 43 | *  the machine the program is run on might have only one processor | 
 
 
 
 
 | 44 | *  core, so to avoid overhead, the algorithm is executed | 
 
 
 
 
 | 45 | *  sequentially. | 
 
 
 
 
 | 46 | * | 
 
 
 
 
 | 47 | *  To force sequential execution of an algorithm ultimately at | 
 
 
 
 
 | 48 | *  compile-time, the user must add the tag | 
 
 
 
 
 | 49 | *  gnu_parallel::sequential_tag() to the end of the parameter list, | 
 
 
 
 
 | 50 | *  e. g. | 
 
 
 
 
 | 51 | * | 
 
 
 
 
 | 52 | *  \code | 
 
 
 
 
 | 53 | *  std::sort(__v.begin(), __v.end(), __gnu_parallel::sequential_tag()); | 
 
 
 
 
 | 54 | *  \endcode | 
 
 
 
 
 | 55 | * | 
 
 
 
 
 | 56 | *  This is compatible with all overloaded algorithm variants.  No | 
 
 
 
 
 | 57 | *  additional code will be instantiated, at all.  The same holds for | 
 
 
 
 
 | 58 | *  most algorithm calls with iterators not providing random access. | 
 
 
 
 
 | 59 | * | 
 
 
 
 
 | 60 | *  If the algorithm call is not forced to be executed sequentially | 
 
 
 
 
 | 61 | *  at compile-time, the decision is made at run-time. | 
 
 
 
 
 | 62 | *  The global variable __gnu_parallel::_Settings::algorithm_strategy | 
 
 
 
 
 | 63 | *  is checked. It is a tristate variable corresponding to: | 
 
 
 
 
 | 64 | *    - a. force_sequential, meaning the sequential algorithm is executed. | 
 
 
 
 
 | 65 | *    - b. force_parallel, meaning the parallel algorithm is executed. | 
 
 
 
 
 | 66 | *    - c. heuristic | 
 
 
 
 
 | 67 | * | 
 
 
 
 
 | 68 | *  For heuristic, the parallel algorithm implementation is called | 
 
 
 
 
 | 69 | *  only if the input size is sufficiently large.  For most | 
 
 
 
 
 | 70 | *  algorithms, the input size is the (combined) length of the input | 
 
 
 
 
 | 71 | *  sequence(__s).  The threshold can be set by the user, individually | 
 
 
 
 
 | 72 | *  for each algorithm.  The according variables are called | 
 
 
 
 
 | 73 | *  gnu_parallel::_Settings::[algorithm]_minimal_n . | 
 
 
 
 
 | 74 | * | 
 
 
 
 
 | 75 | *  For some of the algorithms, there are even more tuning options, | 
 
 
 
 
 | 76 | *  e. g. the ability to choose from multiple algorithm variants.  See | 
 
 
 
 
 | 77 | *  below for details. | 
 
 
 
 
 | 78 | */ | 
 
 
 
 
 | 79 |  | 
 
 
 
 
 | 80 | // Written by Johannes Singler and Felix Putze. | 
 
 
 
 
 | 81 |  | 
 
 
 
 
 | 82 | #ifndef _GLIBCXX_PARALLEL_SETTINGS_H | 
 
 
 
 
 | 83 | #define _GLIBCXX_PARALLEL_SETTINGS_H 1 | 
 
 
 
 
 | 84 |  | 
 
 
 
 
 | 85 | #include <parallel/types.h> | 
 
 
 
 
 | 86 |  | 
 
 
 
 
 | 87 | /** | 
 
 
 
 
 | 88 | * @brief Determine at compile(?)-time if the parallel variant of an | 
 
 
 
 
 | 89 | * algorithm should be called. | 
 
 
 
 
 | 90 | * @param __c A condition that is convertible to bool that is overruled by | 
 
 
 
 
 | 91 | * __gnu_parallel::_Settings::algorithm_strategy. Usually a decision | 
 
 
 
 
 | 92 | * based on the input size. | 
 
 
 
 
 | 93 | */ | 
 
 
 
 
 | 94 | #define _GLIBCXX_PARALLEL_CONDITION(__c) \ | 
 
 
 
 
 | 95 | (__gnu_parallel::_Settings::get().algorithm_strategy \ | 
 
 
 
 
 | 96 | != __gnu_parallel::force_sequential \ | 
 
 
 
 
 | 97 | && ((__gnu_parallel::__get_max_threads() > 1 && (__c)) \ | 
 
 
 
 
 | 98 | || __gnu_parallel::_Settings::get().algorithm_strategy \ | 
 
 
 
 
 | 99 | == __gnu_parallel::force_parallel)) | 
 
 
 
 
 | 100 |  | 
 
 
 
 
 | 101 | /* | 
 
 
 
 
 | 102 | inline bool | 
 
 
 
 
 | 103 | parallel_condition(bool __c) | 
 
 
 
 
 | 104 | { | 
 
 
 
 
 | 105 | bool ret = false; | 
 
 
 
 
 | 106 | const _Settings& __s = _Settings::get(); | 
 
 
 
 
 | 107 | if (__s.algorithm_strategy != force_seqential) | 
 
 
 
 
 | 108 | { | 
 
 
 
 
 | 109 | if (__s.algorithm_strategy == force_parallel) | 
 
 
 
 
 | 110 | ret = true; | 
 
 
 
 
 | 111 | else | 
 
 
 
 
 | 112 | ret = __get_max_threads() > 1 && __c; | 
 
 
 
 
 | 113 | } | 
 
 
 
 
 | 114 | return ret; | 
 
 
 
 
 | 115 | } | 
 
 
 
 
 | 116 | */ | 
 
 
 
 
 | 117 |  | 
 
 
 
 
 | 118 | namespace __gnu_parallel | 
 
 
 
 
 | 119 | { | 
 
 
 
 
 | 120 | /// class _Settings | 
 
 
 
 
 | 121 | /// Run-time settings for the parallel mode including all tunable parameters. | 
 
 
 
 
 | 122 | struct _Settings | 
 
 
 
 
 | 123 | { | 
 
 
 
 
 | 124 | _AlgorithmStrategy          algorithm_strategy; | 
 
 
 
 
 | 125 |  | 
 
 
 
 
 | 126 | _SortAlgorithm              sort_algorithm; | 
 
 
 
 
 | 127 | _PartialSumAlgorithm        partial_sum_algorithm; | 
 
 
 
 
 | 128 | _MultiwayMergeAlgorithm     multiway_merge_algorithm; | 
 
 
 
 
 | 129 | _FindAlgorithm              find_algorithm; | 
 
 
 
 
 | 130 |  | 
 
 
 
 
 | 131 | _SplittingAlgorithm         sort_splitting; | 
 
 
 
 
 | 132 | _SplittingAlgorithm         merge_splitting; | 
 
 
 
 
 | 133 | _SplittingAlgorithm         multiway_merge_splitting; | 
 
 
 
 
 | 134 |  | 
 
 
 
 
 | 135 | // Per-algorithm settings. | 
 
 
 
 
 | 136 |  | 
 
 
 
 
 | 137 | /// Minimal input size for accumulate. | 
 
 
 
 
 | 138 | _SequenceIndex              accumulate_minimal_n; | 
 
 
 
 
 | 139 |  | 
 
 
 
 
 | 140 | /// Minimal input size for adjacent_difference. | 
 
 
 
 
 | 141 | unsigned int                adjacent_difference_minimal_n; | 
 
 
 
 
 | 142 |  | 
 
 
 
 
 | 143 | /// Minimal input size for count and count_if. | 
 
 
 
 
 | 144 | _SequenceIndex              count_minimal_n; | 
 
 
 
 
 | 145 |  | 
 
 
 
 
 | 146 | /// Minimal input size for fill. | 
 
 
 
 
 | 147 | _SequenceIndex              fill_minimal_n; | 
 
 
 
 
 | 148 |  | 
 
 
 
 
 | 149 | /// Block size increase factor for find. | 
 
 
 
 
 | 150 | double                      find_increasing_factor; | 
 
 
 
 
 | 151 |  | 
 
 
 
 
 | 152 | /// Initial block size for find. | 
 
 
 
 
 | 153 | _SequenceIndex              find_initial_block_size; | 
 
 
 
 
 | 154 |  | 
 
 
 
 
 | 155 | /// Maximal block size for find. | 
 
 
 
 
 | 156 | _SequenceIndex              find_maximum_block_size; | 
 
 
 
 
 | 157 |  | 
 
 
 
 
 | 158 | /// Start with looking for this many elements sequentially, for find. | 
 
 
 
 
 | 159 | _SequenceIndex              find_sequential_search_size; | 
 
 
 
 
 | 160 |  | 
 
 
 
 
 | 161 | /// Minimal input size for for_each. | 
 
 
 
 
 | 162 | _SequenceIndex              for_each_minimal_n; | 
 
 
 
 
 | 163 |  | 
 
 
 
 
 | 164 | /// Minimal input size for generate. | 
 
 
 
 
 | 165 | _SequenceIndex              generate_minimal_n; | 
 
 
 
 
 | 166 |  | 
 
 
 
 
 | 167 | /// Minimal input size for max_element. | 
 
 
 
 
 | 168 | _SequenceIndex              max_element_minimal_n; | 
 
 
 
 
 | 169 |  | 
 
 
 
 
 | 170 | /// Minimal input size for merge. | 
 
 
 
 
 | 171 | _SequenceIndex              merge_minimal_n; | 
 
 
 
 
 | 172 |  | 
 
 
 
 
 | 173 | /// Oversampling factor for merge. | 
 
 
 
 
 | 174 | unsigned int                merge_oversampling; | 
 
 
 
 
 | 175 |  | 
 
 
 
 
 | 176 | /// Minimal input size for min_element. | 
 
 
 
 
 | 177 | _SequenceIndex              min_element_minimal_n; | 
 
 
 
 
 | 178 |  | 
 
 
 
 
 | 179 | /// Minimal input size for multiway_merge. | 
 
 
 
 
 | 180 | _SequenceIndex              multiway_merge_minimal_n; | 
 
 
 
 
 | 181 |  | 
 
 
 
 
 | 182 | /// Oversampling factor for multiway_merge. | 
 
 
 
 
 | 183 | int                         multiway_merge_minimal_k; | 
 
 
 
 
 | 184 |  | 
 
 
 
 
 | 185 | /// Oversampling factor for multiway_merge. | 
 
 
 
 
 | 186 | unsigned int                multiway_merge_oversampling; | 
 
 
 
 
 | 187 |  | 
 
 
 
 
 | 188 | /// Minimal input size for nth_element. | 
 
 
 
 
 | 189 | _SequenceIndex              nth_element_minimal_n; | 
 
 
 
 
 | 190 |  | 
 
 
 
 
 | 191 | /// Chunk size for partition. | 
 
 
 
 
 | 192 | _SequenceIndex              partition_chunk_size; | 
 
 
 
 
 | 193 |  | 
 
 
 
 
 | 194 | /// Chunk size for partition, relative to input size.  If > 0.0, | 
 
 
 
 
 | 195 | /// this value overrides partition_chunk_size. | 
 
 
 
 
 | 196 | double                      partition_chunk_share; | 
 
 
 
 
 | 197 |  | 
 
 
 
 
 | 198 | /// Minimal input size for partition. | 
 
 
 
 
 | 199 | _SequenceIndex              partition_minimal_n; | 
 
 
 
 
 | 200 |  | 
 
 
 
 
 | 201 | /// Minimal input size for partial_sort. | 
 
 
 
 
 | 202 | _SequenceIndex              partial_sort_minimal_n; | 
 
 
 
 
 | 203 |  | 
 
 
 
 
 | 204 | /// Ratio for partial_sum. Assume "sum and write result" to be | 
 
 
 
 
 | 205 | /// this factor slower than just "sum". | 
 
 
 
 
 | 206 | float                       partial_sum_dilation; | 
 
 
 
 
 | 207 |  | 
 
 
 
 
 | 208 | /// Minimal input size for partial_sum. | 
 
 
 
 
 | 209 | unsigned int                partial_sum_minimal_n; | 
 
 
 
 
 | 210 |  | 
 
 
 
 
 | 211 | /// Minimal input size for random_shuffle. | 
 
 
 
 
 | 212 | unsigned int                random_shuffle_minimal_n; | 
 
 
 
 
 | 213 |  | 
 
 
 
 
 | 214 | /// Minimal input size for replace and replace_if. | 
 
 
 
 
 | 215 | _SequenceIndex              replace_minimal_n; | 
 
 
 
 
 | 216 |  | 
 
 
 
 
 | 217 | /// Minimal input size for set_difference. | 
 
 
 
 
 | 218 | _SequenceIndex              set_difference_minimal_n; | 
 
 
 
 
 | 219 |  | 
 
 
 
 
 | 220 | /// Minimal input size for set_intersection. | 
 
 
 
 
 | 221 | _SequenceIndex              set_intersection_minimal_n; | 
 
 
 
 
 | 222 |  | 
 
 
 
 
 | 223 | /// Minimal input size for set_symmetric_difference. | 
 
 
 
 
 | 224 | _SequenceIndex              set_symmetric_difference_minimal_n; | 
 
 
 
 
 | 225 |  | 
 
 
 
 
 | 226 | /// Minimal input size for set_union. | 
 
 
 
 
 | 227 | _SequenceIndex              set_union_minimal_n; | 
 
 
 
 
 | 228 |  | 
 
 
 
 
 | 229 | /// Minimal input size for parallel sorting. | 
 
 
 
 
 | 230 | _SequenceIndex              sort_minimal_n; | 
 
 
 
 
 | 231 |  | 
 
 
 
 
 | 232 | /// Oversampling factor for parallel std::sort (MWMS). | 
 
 
 
 
 | 233 | unsigned int                sort_mwms_oversampling; | 
 
 
 
 
 | 234 |  | 
 
 
 
 
 | 235 | /// Such many samples to take to find a good pivot (quicksort). | 
 
 
 
 
 | 236 | unsigned int                sort_qs_num_samples_preset; | 
 
 
 
 
 | 237 |  | 
 
 
 
 
 | 238 | /// Maximal subsequence __length to switch to unbalanced __base case. | 
 
 
 
 
 | 239 | /// Applies to std::sort with dynamically load-balanced quicksort. | 
 
 
 
 
 | 240 | _SequenceIndex              sort_qsb_base_case_maximal_n; | 
 
 
 
 
 | 241 |  | 
 
 
 
 
 | 242 | /// Minimal input size for parallel std::transform. | 
 
 
 
 
 | 243 | _SequenceIndex              transform_minimal_n; | 
 
 
 
 
 | 244 |  | 
 
 
 
 
 | 245 | /// Minimal input size for unique_copy. | 
 
 
 
 
 | 246 | _SequenceIndex              unique_copy_minimal_n; | 
 
 
 
 
 | 247 |  | 
 
 
 
 
 | 248 | _SequenceIndex              workstealing_chunk_size; | 
 
 
 
 
 | 249 |  | 
 
 
 
 
 | 250 | // Hardware dependent tuning parameters. | 
 
 
 
 
 | 251 |  | 
 
 
 
 
 | 252 | /// size of the L1 cache in bytes (underestimation). | 
 
 
 
 
 | 253 | unsigned long long          L1_cache_size; | 
 
 
 
 
 | 254 |  | 
 
 
 
 
 | 255 | /// size of the L2 cache in bytes (underestimation). | 
 
 
 
 
 | 256 | unsigned long long          L2_cache_size; | 
 
 
 
 
 | 257 |  | 
 
 
 
 
 | 258 | /// size of the Translation Lookaside Buffer (underestimation). | 
 
 
 
 
 | 259 | unsigned int                TLB_size; | 
 
 
 
 
 | 260 |  | 
 
 
 
 
 | 261 | /// Overestimation of cache line size.  Used to avoid false | 
 
 
 
 
 | 262 | /// sharing, i.e. elements of different threads are at least this | 
 
 
 
 
 | 263 | /// amount apart. | 
 
 
 
 
 | 264 | unsigned int                cache_line_size; | 
 
 
 
 
 | 265 |  | 
 
 
 
 
 | 266 | // Statistics. | 
 
 
 
 
 | 267 |  | 
 
 
 
 
 | 268 | /// The number of stolen ranges in load-balanced quicksort. | 
 
 
 
 
 | 269 | _SequenceIndex              qsb_steals; | 
 
 
 
 
 | 270 |  | 
 
 
 
 
 | 271 | /// Minimal input size for search and search_n. | 
 
 
 
 
 | 272 | _SequenceIndex              search_minimal_n; | 
 
 
 
 
 | 273 |  | 
 
 
 
 
 | 274 | /// Block size scale-down factor with respect to current position. | 
 
 
 
 
 | 275 | float                       find_scale_factor; | 
 
 
 
 
 | 276 |  | 
 
 
 
 
 | 277 | /// Get the global settings. | 
 
 
 
 
 | 278 | _GLIBCXX_CONST static const _Settings& | 
 
 
 
 
 | 279 | get() throw(); | 
 
 
 
 
 | 280 |  | 
 
 
 
 
 | 281 | /// Set the global settings. | 
 
 
 
 
 | 282 | static void | 
 
 
 
 
 | 283 | set(_Settings&) throw(); | 
 
 
 
 
 | 284 |  | 
 
 
 
 
 | 285 | explicit | 
 
 
 
 
 | 286 | _Settings() : | 
 
 
 
 
 | 287 | algorithm_strategy(heuristic), | 
 
 
 
 
 | 288 | sort_algorithm(MWMS), | 
 
 
 
 
 | 289 | partial_sum_algorithm(LINEAR), | 
 
 
 
 
 | 290 | multiway_merge_algorithm(LOSER_TREE), | 
 
 
 
 
 | 291 | find_algorithm(CONSTANT_SIZE_BLOCKS), | 
 
 
 
 
 | 292 | sort_splitting(EXACT), | 
 
 
 
 
 | 293 | merge_splitting(EXACT), | 
 
 
 
 
 | 294 | multiway_merge_splitting(EXACT), | 
 
 
 
 
 | 295 | accumulate_minimal_n(1000), | 
 
 
 
 
 | 296 | adjacent_difference_minimal_n(1000), | 
 
 
 
 
 | 297 | count_minimal_n(1000), | 
 
 
 
 
 | 298 | fill_minimal_n(1000), | 
 
 
 
 
 | 299 | find_increasing_factor(2.0), | 
 
 
 
 
 | 300 | find_initial_block_size(256), | 
 
 
 
 
 | 301 | find_maximum_block_size(8192), | 
 
 
 
 
 | 302 | find_sequential_search_size(256), | 
 
 
 
 
 | 303 | for_each_minimal_n(1000), | 
 
 
 
 
 | 304 | generate_minimal_n(1000), | 
 
 
 
 
 | 305 | max_element_minimal_n(1000), | 
 
 
 
 
 | 306 | merge_minimal_n(1000), | 
 
 
 
 
 | 307 | merge_oversampling(10), | 
 
 
 
 
 | 308 | min_element_minimal_n(1000), | 
 
 
 
 
 | 309 | multiway_merge_minimal_n(1000), | 
 
 
 
 
 | 310 | multiway_merge_minimal_k(2), multiway_merge_oversampling(10), | 
 
 
 
 
 | 311 | nth_element_minimal_n(1000), | 
 
 
 
 
 | 312 | partition_chunk_size(1000), | 
 
 
 
 
 | 313 | partition_chunk_share(0.0), | 
 
 
 
 
 | 314 | partition_minimal_n(1000), | 
 
 
 
 
 | 315 | partial_sort_minimal_n(1000), | 
 
 
 
 
 | 316 | partial_sum_dilation(1.0f), | 
 
 
 
 
 | 317 | partial_sum_minimal_n(1000), | 
 
 
 
 
 | 318 | random_shuffle_minimal_n(1000), | 
 
 
 
 
 | 319 | replace_minimal_n(1000), | 
 
 
 
 
 | 320 | set_difference_minimal_n(1000), | 
 
 
 
 
 | 321 | set_intersection_minimal_n(1000), | 
 
 
 
 
 | 322 | set_symmetric_difference_minimal_n(1000), | 
 
 
 
 
 | 323 | set_union_minimal_n(1000), | 
 
 
 
 
 | 324 | sort_minimal_n(1000), | 
 
 
 
 
 | 325 | sort_mwms_oversampling(10), | 
 
 
 
 
 | 326 | sort_qs_num_samples_preset(100), | 
 
 
 
 
 | 327 | sort_qsb_base_case_maximal_n(100), | 
 
 
 
 
 | 328 | transform_minimal_n(1000), | 
 
 
 
 
 | 329 | unique_copy_minimal_n(10000), | 
 
 
 
 
 | 330 | workstealing_chunk_size(100), | 
 
 
 
 
 | 331 | L1_cache_size(16 << 10), | 
 
 
 
 
 | 332 | L2_cache_size(256 << 10), | 
 
 
 
 
 | 333 | TLB_size(128), | 
 
 
 
 
 | 334 | cache_line_size(64), | 
 
 
 
 
 | 335 | qsb_steals(0), | 
 
 
 
 
 | 336 | search_minimal_n(1000), | 
 
 
 
 
 | 337 | find_scale_factor(0.01f) | 
 
 
 
 
 | 338 | { } | 
 
 
 
 
 | 339 | }; | 
 
 
 
 
 | 340 | } | 
 
 
 
 
 | 341 |  | 
 
 
 
 
 | 342 | #endif /* _GLIBCXX_PARALLEL_SETTINGS_H */ |