Skip to content

Commit 7d521fd

Browse files
committed
Revert "Revert "Feature/faster ad tls v3""
This reverts commit 883d781.
1 parent ca2d77f commit 7d521fd

18 files changed

+373
-162
lines changed

Jenkinsfile

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ pipeline {
4949
skipDefaultCheckout()
5050
preserveStashes(buildCount: 7)
5151
}
52+
environment {
53+
STAN_NUM_THREADS = '4'
54+
}
5255
stages {
5356
stage('Kill previous builds') {
5457
when {
@@ -159,6 +162,21 @@ pipeline {
159162
}
160163
post { always { retry(3) { deleteDir() } } }
161164
}
165+
stage('Windows Threading tests') {
166+
agent { label 'windows' }
167+
steps {
168+
deleteDirWin()
169+
unstash 'MathSetup'
170+
bat "echo CXX=${env.CXX} -Werror > make/local"
171+
bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
172+
runTestsWin("test/unit -f thread")
173+
runTestsWin("test/unit -f map_rect")
174+
}
175+
}
176+
}
177+
}
178+
stage('Always-run tests part 2') {
179+
parallel {
162180
stage('Full unit with GPU') {
163181
agent { label "gpu" }
164182
steps {
@@ -172,19 +190,6 @@ pipeline {
172190
}
173191
post { always { retry(3) { deleteDir() } } }
174192
}
175-
stage('Windows Headers & Unit') {
176-
agent { label 'windows' }
177-
steps {
178-
deleteDirWin()
179-
unstash 'MathSetup'
180-
bat "make -j${env.PARALLEL} test-headers"
181-
runTestsWin("test/unit")
182-
}
183-
}
184-
}
185-
}
186-
stage('Always-run tests part 2') {
187-
parallel {
188193
stage('Distribution tests') {
189194
agent { label "distribution-tests" }
190195
steps {
@@ -212,19 +217,6 @@ pipeline {
212217
}
213218
}
214219
}
215-
stage('Threading tests') {
216-
agent any
217-
steps {
218-
deleteDir()
219-
unstash 'MathSetup'
220-
sh "echo CXX=${env.CXX} -Werror > make/local"
221-
sh "echo CPPFLAGS+=-DSTAN_THREADS >> make/local"
222-
runTests("test/unit -f thread")
223-
sh "find . -name *_test.xml | xargs rm"
224-
runTests("test/unit -f map_rect")
225-
}
226-
post { always { retry(3) { deleteDir() } } }
227-
}
228220
}
229221
}
230222
stage('Additional merge tests') {
@@ -236,7 +228,7 @@ pipeline {
236228
deleteDir()
237229
unstash 'MathSetup'
238230
sh "echo CXX=${GCC} >> make/local"
239-
sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
231+
sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
240232
runTests("test/unit")
241233
}
242234
post { always { retry(3) { deleteDir() } } }
@@ -252,6 +244,15 @@ pipeline {
252244
}
253245
post { always { retry(3) { deleteDir() } } }
254246
}
247+
stage('Windows Headers & Unit') {
248+
agent { label 'windows' }
249+
steps {
250+
deleteDirWin()
251+
unstash 'MathSetup'
252+
bat "make -j${env.PARALLEL} test-headers"
253+
runTestsWin("test/unit")
254+
}
255+
}
255256
}
256257
}
257258
stage('Upstream tests') {

stan/math/prim/mat/functor/map_rect.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#ifndef STAN_MATH_PRIM_MAT_FUNCTOR_MAP_RECT_HPP
22
#define STAN_MATH_PRIM_MAT_FUNCTOR_MAP_RECT_HPP
33

4+
#include <stan/math/prim/scal/meta/return_type.hpp>
45
#include <stan/math/prim/arr/err/check_matching_sizes.hpp>
56
#include <stan/math/prim/mat/fun/dims.hpp>
67
#include <stan/math/prim/mat/fun/typedefs.hpp>

stan/math/prim/mat/functor/map_rect_concurrent.hpp

Lines changed: 3 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,13 @@
33

44
#include <stan/math/prim/mat/fun/typedefs.hpp>
55

6-
#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
7-
#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
6+
#include <stan/math/prim/scal/meta/return_type.hpp>
87
#include <stan/math/prim/scal/err/invalid_argument.hpp>
98
#include <boost/lexical_cast.hpp>
109

10+
#include <cstdlib>
1111
#include <vector>
1212
#include <thread>
13-
#include <future>
14-
#include <cstdlib>
1513

1614
namespace stan {
1715
namespace math {
@@ -77,72 +75,7 @@ map_rect_concurrent(
7775
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
7876
job_params,
7977
const std::vector<std::vector<double>>& x_r,
80-
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
81-
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
82-
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;
83-
84-
const int num_jobs = job_params.size();
85-
const vector_d shared_params_dbl = value_of(shared_params);
86-
std::vector<std::future<std::vector<matrix_d>>> futures;
87-
88-
auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
89-
const int end = start + size;
90-
std::vector<matrix_d> chunk_f_out;
91-
chunk_f_out.reserve(size);
92-
for (int i = start; i != end; i++)
93-
chunk_f_out.push_back(ReduceF()(
94-
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
95-
return chunk_f_out;
96-
};
97-
98-
int num_threads = get_num_threads(num_jobs);
99-
int num_jobs_per_thread = num_jobs / num_threads;
100-
futures.emplace_back(
101-
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));
102-
103-
#ifdef STAN_THREADS
104-
if (num_threads > 1) {
105-
const int num_big_threads = num_jobs % num_threads;
106-
const int first_big_thread = num_threads - num_big_threads;
107-
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
108-
i < num_threads; ++i, job_start += job_size) {
109-
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
110-
: num_jobs_per_thread;
111-
futures.emplace_back(
112-
std::async(std::launch::async, execute_chunk, job_start, job_size));
113-
}
114-
}
115-
#endif
116-
117-
// collect results
118-
std::vector<int> world_f_out;
119-
world_f_out.reserve(num_jobs);
120-
matrix_d world_output(0, 0);
121-
122-
int offset = 0;
123-
for (std::size_t i = 0; i < futures.size(); ++i) {
124-
const std::vector<matrix_d>& chunk_result = futures[i].get();
125-
if (i == 0)
126-
world_output.resize(chunk_result[0].rows(),
127-
num_jobs * chunk_result[0].cols());
128-
129-
for (const auto& job_result : chunk_result) {
130-
const int num_job_outputs = job_result.cols();
131-
world_f_out.push_back(num_job_outputs);
132-
133-
if (world_output.cols() < offset + num_job_outputs)
134-
world_output.conservativeResize(Eigen::NoChange,
135-
2 * (offset + num_job_outputs));
136-
137-
world_output.block(0, offset, world_output.rows(), num_job_outputs)
138-
= job_result;
139-
140-
offset += num_job_outputs;
141-
}
142-
}
143-
CombineF combine(shared_params, job_params);
144-
return combine(world_output, world_f_out);
145-
}
78+
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
14679

14780
} // namespace internal
14881
} // namespace math

stan/math/rev/core.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <stan/math/rev/core/build_vari_array.hpp>
66
#include <stan/math/rev/core/chainable_alloc.hpp>
77
#include <stan/math/rev/core/chainablestack.hpp>
8+
#include <stan/math/rev/core/init_chainablestack.hpp>
89
#include <stan/math/rev/core/ddv_vari.hpp>
910
#include <stan/math/rev/core/dv_vari.hpp>
1011
#include <stan/math/rev/core/dvd_vari.hpp>

stan/math/rev/core/autodiffstackstorage.hpp

Lines changed: 98 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,40 +9,75 @@ namespace math {
99

1010
/**
1111
* Provides a thread_local singleton if needed. Read warnings below!
12-
* For performance reasons the singleton is a global static for the
13-
* case of no threading which is returned by a function. This design
14-
* should allow the compiler to apply necessary inlining to get
15-
* maximal performance. However, this design suffers from "the static
16-
* init order fiasco"[0]. Anywhere this is used, we must be
17-
* absolutely positive that it doesn't matter when the singleton will
18-
* get initialized relative to other static variables. In exchange,
19-
* we get a more performant singleton pattern for the non-threading
20-
* case. In the threading case we use the defacto standard C++11
21-
* singleton pattern relying on a function wrapping a static local
22-
* variable. This standard pattern is expected to be well supported
23-
* by the major compilers (as its standard), but it does incur some
24-
* performance penalty. There has been some discussion on this; see
25-
* [1] and [2] and the discussions those PRs link to as well.
12+
* With STAN_THREADS defined, the singleton is a thread_local static pointer
13+
* for performance reasons. When STAN_THREADS is not set, we have the old
14+
* static AD stack in the instance_ field because we saw odd performance
15+
* issues on the Mac Pro[4]. The rest of this commentary is specifically
16+
* talking about the design choices in the STAN_THREADS=true case.
17+
* When a TLS is used then initialization with
18+
* a constant expression is required for fast access to the TLS. As
19+
* the AD storage struct is non-POD it must be initialized as a
20+
* dynamic expression such that compilers will wrap any access to the
21+
* TLS by a TLS wrapper function which causes a significant
22+
* slow-down. A pointer to the AD storage instance can be initialized
23+
* to a compile-time constant expression of nullptr. In this case the
24+
* compiler avoids the use of a TLS wrapper function. Furthermore we
25+
* use the __thread keyword on compilers which support it. The
26+
* __thread keyword is a compiler-specific (gcc, clang, Intel)
27+
* extension which requires initialization with a compile time
28+
* constant expression. The C++11 keyword thread_local does allow for
29+
* constant and dynamic initialization of the TLS. Thus, only the
30+
* __thread keyword gurantees that constant initialization and it's
31+
* implied speedup, is used.
2632
*
27-
* These are thread_local only if the user asks for it with
28-
* -DSTAN_THREADS. This is primarily because Apple clang compilers
29-
* before 2016 don't support thread_local and the additional
30-
* performance cost. We have proposed removing support for those[3],
31-
* and at that time we should evaluate the performance of a switch to
32-
* thread_local. If there is no loss in performance, we can remove
33-
* this ifdef.
33+
* The initialzation of the AD instance is handled by the lifetime of
34+
* a AutodiffStackSingleton object. More specifically, the first
35+
* instance of the AutodiffStackSingleton object will initialize the
36+
* AD instance and take ownership. Thus whenever the first instance of
37+
* the AutodiffStackSingleton object gets destructed, the AD tape will
38+
* be destructed as well. Within stan-math the initialization of the
39+
* AD instance for the main thread of the program is handled by
40+
* instantiating once the singleton once in the
41+
* init_chainablestack.hpp file. Whenever STAN_THREADS is defined then
42+
* all created child threads must call the init() method of the AD
43+
* singleton in order to initialize the TLS if child threads want to
44+
* perform AD operations (the initialization in the main process is
45+
* already taken care of in any case).
46+
*
47+
* The design of a globally held (optionally TLS) pointer, which is
48+
* globally initialized, allows the compiler to apply necessary
49+
* inlining to get maximal performance. However, the design suffers
50+
* from "the static init order fiasco"[0]. Whenever the static init
51+
* order fiasco occurs, the C++ client of the library may call the
52+
* init method as needed to ensure proper initialization order. In
53+
* exchange, we get a more performant singleton pattern with automatic
54+
* initialization of the AD stack for the main thread. There has been
55+
* some discussion on earlier designs using the Mayer singleton
56+
* approach; see [1] and [2] and the discussions those PRs link to as
57+
* well.
3458
*
3559
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
3660
* [1] https://github.com/stan-dev/math/pull/840
3761
* [2] https://github.com/stan-dev/math/pull/826
3862
* [3]
3963
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
64+
* [4] https://github.com/stan-dev/math/pull/1135
4065
*/
4166
template <typename ChainableT, typename ChainableAllocT>
4267
struct AutodiffStackSingleton {
4368
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
4469
AutodiffStackSingleton_t;
4570

71+
AutodiffStackSingleton() : own_instance_(init()) {}
72+
~AutodiffStackSingleton() {
73+
#ifdef STAN_THREADS
74+
if (own_instance_) {
75+
delete instance_;
76+
instance_ = nullptr;
77+
}
78+
#endif
79+
}
80+
4681
struct AutodiffStackStorage {
4782
AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;
4883

@@ -57,28 +92,60 @@ struct AutodiffStackSingleton {
5792
std::vector<size_t> nested_var_alloc_stack_starts_;
5893
};
5994

60-
AutodiffStackSingleton() = delete;
6195
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
6296
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;
6397

64-
static inline AutodiffStackStorage &instance() {
98+
static constexpr inline AutodiffStackStorage &instance() {
99+
return
65100
#ifdef STAN_THREADS
66-
thread_local static AutodiffStackStorage instance_;
101+
*
67102
#endif
68-
return instance_;
103+
instance_;
69104
}
70105

71-
#ifndef STAN_THREADS
72-
73106
private:
74-
static AutodiffStackStorage instance_;
107+
static bool init() {
108+
#ifdef STAN_THREADS
109+
if (!instance_) {
110+
instance_ = new AutodiffStackStorage();
111+
return true;
112+
}
113+
#endif
114+
return false;
115+
}
116+
117+
static
118+
#ifdef STAN_THREADS
119+
#ifdef __GNUC__
120+
__thread
121+
#else
122+
thread_local
123+
#endif
75124
#endif
125+
AutodiffStackStorage
126+
#ifdef STAN_THREADS
127+
*
128+
#endif
129+
instance_;
130+
131+
bool own_instance_;
76132
};
77133

78-
#ifndef STAN_THREADS
79134
template <typename ChainableT, typename ChainableAllocT>
80-
typename AutodiffStackSingleton<ChainableT,
81-
ChainableAllocT>::AutodiffStackStorage
135+
#ifdef STAN_THREADS
136+
#ifdef __GNUC__
137+
__thread
138+
#else
139+
thread_local
140+
#endif
141+
#endif
142+
typename AutodiffStackSingleton<ChainableT,
143+
ChainableAllocT>::AutodiffStackStorage
144+
145+
#ifdef STAN_THREADS
146+
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
147+
= nullptr;
148+
#else
82149
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83150
#endif
84151

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#ifndef STAN_MATH_REV_CORE_INIT_CHAINABLESTACK_HPP
2+
#define STAN_MATH_REV_CORE_INIT_CHAINABLESTACK_HPP
3+
4+
#include <stan/math/rev/core/chainablestack.hpp>
5+
6+
namespace stan {
7+
namespace math {
8+
namespace {
9+
10+
ChainableStack global_stack_instance_init;
11+
}
12+
} // namespace math
13+
} // namespace stan
14+
#endif

stan/math/rev/mat.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
6969
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
7070
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
71+
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
7172
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>
7273

7374
#endif

0 commit comments

Comments
 (0)