libstdc++
workstealing.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 
3 // Copyright (C) 2007-2014 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 3, or (at your option) any later
9 // version.
10 
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file parallel/workstealing.h
26  * @brief Parallelization of embarrassingly parallel execution by
27  * means of work-stealing.
28  *
29  * Work stealing is described in
30  *
31  * R. D. Blumofe and C. E. Leiserson.
32  * Scheduling multithreaded computations by work stealing.
33  * Journal of the ACM, 46(5):720–748, 1999.
34  *
35  * This file is a GNU parallel extension to the Standard C++ Library.
36  */
37 
38 // Written by Felix Putze.
39 
40 #ifndef _GLIBCXX_PARALLEL_WORKSTEALING_H
41 #define _GLIBCXX_PARALLEL_WORKSTEALING_H 1
42 
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
45 #include <parallel/compatibility.h>
46 
47 namespace __gnu_parallel
48 {
49 
50 #define _GLIBCXX_JOB_VOLATILE volatile
51 
52  /** @brief One __job for a certain thread. */
53  template<typename _DifferenceTp>
54  struct _Job
55  {
56  typedef _DifferenceTp _DifferenceType;
57 
58  /** @brief First element.
59  *
60  * Changed by owning and stealing thread. By stealing thread,
61  * always incremented. */
62  _GLIBCXX_JOB_VOLATILE _DifferenceType _M_first;
63 
64  /** @brief Last element.
65  *
66  * Changed by owning thread only. */
67  _GLIBCXX_JOB_VOLATILE _DifferenceType _M_last;
68 
69  /** @brief Number of elements, i.e. @c _M_last-_M_first+1.
70  *
71  * Changed by owning thread only. */
72  _GLIBCXX_JOB_VOLATILE _DifferenceType _M_load;
73  };
74 
75  /** @brief Work stealing algorithm for random access iterators.
76  *
77  * Uses O(1) additional memory. Synchronization at job lists is
78  * done with atomic operations.
79  * @param __begin Begin iterator of element sequence.
80  * @param __end End iterator of element sequence.
81  * @param __op User-supplied functor (comparator, predicate, adding
82  * functor, ...).
83  * @param __f Functor to @a process an element with __op (depends on
84  * desired functionality, e. g. for std::for_each(), ...).
85  * @param __r Functor to @a add a single __result to the already
86  * processed elements (depends on functionality).
87  * @param __base Base value for reduction.
88  * @param __output Pointer to position where final result is written to
89  * @param __bound Maximum number of elements processed (e. g. for
90  * std::count_n()).
91  * @return User-supplied functor (that may contain a part of the result).
92  */
93  template<typename _RAIter,
94  typename _Op,
95  typename _Fu,
96  typename _Red,
97  typename _Result>
98  _Op
100  _RAIter __end, _Op __op,
101  _Fu& __f, _Red __r,
102  _Result __base,
103  _Result& __output,
104  typename std::iterator_traits<_RAIter>::difference_type __bound)
105  {
106  _GLIBCXX_CALL(__end - __begin)
107 
108  typedef std::iterator_traits<_RAIter> _TraitsType;
109  typedef typename _TraitsType::difference_type _DifferenceType;
110 
111  const _Settings& __s = _Settings::get();
112 
113  _DifferenceType __chunk_size =
114  static_cast<_DifferenceType>(__s.workstealing_chunk_size);
115 
116  // How many jobs?
117  _DifferenceType __length = (__bound < 0) ? (__end - __begin) : __bound;
118 
119  // To avoid false sharing in a cache line.
120  const int __stride = (__s.cache_line_size * 10
121  / sizeof(_Job<_DifferenceType>) + 1);
122 
123  // Total number of threads currently working.
124  _ThreadIndex __busy = 0;
125 
126  _Job<_DifferenceType> *__job;
127 
128  omp_lock_t __output_lock;
129  omp_init_lock(&__output_lock);
130 
131  // Write base value to output.
132  __output = __base;
133 
134  // No more threads than jobs, at least one thread.
135  _ThreadIndex __num_threads = __gnu_parallel::max<_ThreadIndex>
136  (1, __gnu_parallel::min<_DifferenceType>(__length,
137  __get_max_threads()));
138 
139 # pragma omp parallel shared(__busy) num_threads(__num_threads)
140  {
141 # pragma omp single
142  {
143  __num_threads = omp_get_num_threads();
144 
145  // Create job description array.
146  __job = new _Job<_DifferenceType>[__num_threads * __stride];
147  }
148 
149  // Initialization phase.
150 
151  // Flags for every thread if it is doing productive work.
152  bool __iam_working = false;
153 
154  // Thread id.
155  _ThreadIndex __iam = omp_get_thread_num();
156 
157  // This job.
158  _Job<_DifferenceType>& __my_job = __job[__iam * __stride];
159 
160  // Random number (for work stealing).
161  _ThreadIndex __victim;
162 
163  // Local value for reduction.
164  _Result __result = _Result();
165 
166  // Number of elements to steal in one attempt.
167  _DifferenceType __steal;
168 
169  // Every thread has its own random number generator
170  // (modulo __num_threads).
171  _RandomNumber __rand_gen(__iam, __num_threads);
172 
173  // This thread is currently working.
174 # pragma omp atomic
175  ++__busy;
176 
177  __iam_working = true;
178 
179  // How many jobs per thread? last thread gets the rest.
180  __my_job._M_first = static_cast<_DifferenceType>
181  (__iam * (__length / __num_threads));
182 
183  __my_job._M_last = (__iam == (__num_threads - 1)
184  ? (__length - 1)
185  : ((__iam + 1) * (__length / __num_threads) - 1));
186  __my_job._M_load = __my_job._M_last - __my_job._M_first + 1;
187 
188  // Init result with _M_first value (to have a base value for reduction)
189  if (__my_job._M_first <= __my_job._M_last)
190  {
191  // Cannot use volatile variable directly.
192  _DifferenceType __my_first = __my_job._M_first;
193  __result = __f(__op, __begin + __my_first);
194  ++__my_job._M_first;
195  --__my_job._M_load;
196  }
197 
198  _RAIter __current;
199 
200 # pragma omp barrier
201 
202  // Actual work phase
203  // Work on own or stolen current start
204  while (__busy > 0)
205  {
206  // Work until no productive thread left.
207 # pragma omp flush(__busy)
208 
209  // Thread has own work to do
210  while (__my_job._M_first <= __my_job._M_last)
211  {
212  // fetch-and-add call
213  // Reserve current job block (size __chunk_size) in my queue.
214  _DifferenceType __current_job =
215  __fetch_and_add<_DifferenceType>(&(__my_job._M_first),
216  __chunk_size);
217 
218  // Update _M_load, to make the three values consistent,
219  // _M_first might have been changed in the meantime
220  __my_job._M_load = __my_job._M_last - __my_job._M_first + 1;
221  for (_DifferenceType __job_counter = 0;
222  __job_counter < __chunk_size
223  && __current_job <= __my_job._M_last;
224  ++__job_counter)
225  {
226  // Yes: process it!
227  __current = __begin + __current_job;
228  ++__current_job;
229 
230  // Do actual work.
231  __result = __r(__result, __f(__op, __current));
232  }
233 
234 # pragma omp flush(__busy)
235  }
236 
237  // After reaching this point, a thread's __job list is empty.
238  if (__iam_working)
239  {
240  // This thread no longer has work.
241 # pragma omp atomic
242  --__busy;
243 
244  __iam_working = false;
245  }
246 
247  _DifferenceType __supposed_first, __supposed_last,
248  __supposed_load;
249  do
250  {
251  // Find random nonempty deque (not own), do consistency check.
252  __yield();
253 # pragma omp flush(__busy)
254  __victim = __rand_gen();
255  __supposed_first = __job[__victim * __stride]._M_first;
256  __supposed_last = __job[__victim * __stride]._M_last;
257  __supposed_load = __job[__victim * __stride]._M_load;
258  }
259  while (__busy > 0
260  && ((__supposed_load <= 0)
261  || ((__supposed_first + __supposed_load - 1)
262  != __supposed_last)));
263 
264  if (__busy == 0)
265  break;
266 
267  if (__supposed_load > 0)
268  {
269  // Has work and work to do.
270  // Number of elements to steal (at least one).
271  __steal = (__supposed_load < 2) ? 1 : __supposed_load / 2;
272 
273  // Push __victim's current start forward.
274  _DifferenceType __stolen_first =
275  __fetch_and_add<_DifferenceType>
276  (&(__job[__victim * __stride]._M_first), __steal);
277  _DifferenceType __stolen_try = (__stolen_first + __steal
278  - _DifferenceType(1));
279 
280  __my_job._M_first = __stolen_first;
281  __my_job._M_last = __gnu_parallel::min(__stolen_try,
282  __supposed_last);
283  __my_job._M_load = __my_job._M_last - __my_job._M_first + 1;
284 
285  // Has potential work again.
286 # pragma omp atomic
287  ++__busy;
288  __iam_working = true;
289 
290 # pragma omp flush(__busy)
291  }
292 # pragma omp flush(__busy)
293  } // end while __busy > 0
294  // Add accumulated result to output.
295  omp_set_lock(&__output_lock);
296  __output = __r(__output, __result);
297  omp_unset_lock(&__output_lock);
298  }
299 
300  delete[] __job;
301 
302  // Points to last element processed (needed as return value for
303  // some algorithms like transform)
304  __f._M_finish_iterator = __begin + __length;
305 
306  omp_destroy_lock(&__output_lock);
307 
308  return __op;
309  }
310 } // end namespace
311 
312 #endif /* _GLIBCXX_PARALLEL_WORKSTEALING_H */
static const _Settings & get()
Get the global settings.
Compatibility layer, mostly concerned with atomic operations.
uint16_t _ThreadIndex
Unsigned integer to index a thread number. The maximum thread number (for each processor) must fit in...
Definition: types.h:123
class _Settings Run-time settings for the parallel mode including all tunable parameters.
Definition: settings.h:123
unsigned int cache_line_size
Overestimation of cache line size. Used to avoid false sharing, i.e. elements of different threads ar...
Definition: settings.h:265
void __yield()
Yield control to another thread, without waiting for the end of the time slice.
volatile _DifferenceType _M_load
Number of elements, i.e. _M_last-_M_first+1.
Definition: workstealing.h:72
_Op __for_each_template_random_access_workstealing(_RAIter __begin, _RAIter __end, _Op __op, _Fu &__f, _Red __r, _Result __base, _Result &__output, typename std::iterator_traits< _RAIter >::difference_type __bound)
Work stealing algorithm for random access iterators.
Definition: workstealing.h:99
volatile _DifferenceType _M_last
Last element.
Definition: workstealing.h:67
volatile _DifferenceType _M_first
First element.
Definition: workstealing.h:62
Random number generator based on the Mersenne twister. This file is a GNU parallel extension to the S...
#define _GLIBCXX_CALL(__n)
Macro to produce log message when entering a function.
One __job for a certain thread.
Definition: workstealing.h:54
End-user include file. Provides advanced settings and tuning options. This file is a GNU parallel ext...
Random number generator, based on the Mersenne twister.
Definition: random_number.h:42
const _Tp & min(const _Tp &__a, const _Tp &__b)
Equivalent to std::min.
_Siter_base< _Iterator >::iterator_type __base(_Iterator __it)
Definition: functions.h:558