Skip to content

Commit 883d781

Browse files
authored
Revert "Feature/faster ad tls v3"
1 parent 8cf3554 commit 883d781

18 files changed

+162
-373
lines changed

Jenkinsfile

Lines changed: 27 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,6 @@ pipeline {
4949
skipDefaultCheckout()
5050
preserveStashes(buildCount: 7)
5151
}
52-
environment {
53-
STAN_NUM_THREADS = '4'
54-
}
5552
stages {
5653
stage('Kill previous builds') {
5754
when {
@@ -162,21 +159,6 @@ pipeline {
162159
}
163160
post { always { retry(3) { deleteDir() } } }
164161
}
165-
stage('Windows Threading tests') {
166-
agent { label 'windows' }
167-
steps {
168-
deleteDirWin()
169-
unstash 'MathSetup'
170-
bat "echo CXX=${env.CXX} -Werror > make/local"
171-
bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
172-
runTestsWin("test/unit -f thread")
173-
runTestsWin("test/unit -f map_rect")
174-
}
175-
}
176-
}
177-
}
178-
stage('Always-run tests part 2') {
179-
parallel {
180162
stage('Full unit with GPU') {
181163
agent { label "gpu" }
182164
steps {
@@ -190,6 +172,19 @@ pipeline {
190172
}
191173
post { always { retry(3) { deleteDir() } } }
192174
}
175+
stage('Windows Headers & Unit') {
176+
agent { label 'windows' }
177+
steps {
178+
deleteDirWin()
179+
unstash 'MathSetup'
180+
bat "make -j${env.PARALLEL} test-headers"
181+
runTestsWin("test/unit")
182+
}
183+
}
184+
}
185+
}
186+
stage('Always-run tests part 2') {
187+
parallel {
193188
stage('Distribution tests') {
194189
agent { label "distribution-tests" }
195190
steps {
@@ -217,6 +212,19 @@ pipeline {
217212
}
218213
}
219214
}
215+
stage('Threading tests') {
216+
agent any
217+
steps {
218+
deleteDir()
219+
unstash 'MathSetup'
220+
sh "echo CXX=${env.CXX} -Werror > make/local"
221+
sh "echo CPPFLAGS+=-DSTAN_THREADS >> make/local"
222+
runTests("test/unit -f thread")
223+
sh "find . -name *_test.xml | xargs rm"
224+
runTests("test/unit -f map_rect")
225+
}
226+
post { always { retry(3) { deleteDir() } } }
227+
}
220228
}
221229
}
222230
stage('Additional merge tests') {
@@ -228,7 +236,7 @@ pipeline {
228236
deleteDir()
229237
unstash 'MathSetup'
230238
sh "echo CXX=${GCC} >> make/local"
231-
sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
239+
sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
232240
runTests("test/unit")
233241
}
234242
post { always { retry(3) { deleteDir() } } }
@@ -244,15 +252,6 @@ pipeline {
244252
}
245253
post { always { retry(3) { deleteDir() } } }
246254
}
247-
stage('Windows Headers & Unit') {
248-
agent { label 'windows' }
249-
steps {
250-
deleteDirWin()
251-
unstash 'MathSetup'
252-
bat "make -j${env.PARALLEL} test-headers"
253-
runTestsWin("test/unit")
254-
}
255-
}
256255
}
257256
}
258257
stage('Upstream tests') {

stan/math/prim/mat/functor/map_rect.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#ifndef STAN_MATH_PRIM_MAT_FUNCTOR_MAP_RECT_HPP
22
#define STAN_MATH_PRIM_MAT_FUNCTOR_MAP_RECT_HPP
33

4-
#include <stan/math/prim/scal/meta/return_type.hpp>
54
#include <stan/math/prim/arr/err/check_matching_sizes.hpp>
65
#include <stan/math/prim/mat/fun/dims.hpp>
76
#include <stan/math/prim/mat/fun/typedefs.hpp>

stan/math/prim/mat/functor/map_rect_concurrent.hpp

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33

44
#include <stan/math/prim/mat/fun/typedefs.hpp>
55

6-
#include <stan/math/prim/scal/meta/return_type.hpp>
6+
#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
7+
#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
78
#include <stan/math/prim/scal/err/invalid_argument.hpp>
89
#include <boost/lexical_cast.hpp>
910

10-
#include <cstdlib>
1111
#include <vector>
1212
#include <thread>
13+
#include <future>
14+
#include <cstdlib>
1315

1416
namespace stan {
1517
namespace math {
@@ -75,7 +77,72 @@ map_rect_concurrent(
7577
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
7678
job_params,
7779
const std::vector<std::vector<double>>& x_r,
78-
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
80+
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
81+
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
82+
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;
83+
84+
const int num_jobs = job_params.size();
85+
const vector_d shared_params_dbl = value_of(shared_params);
86+
std::vector<std::future<std::vector<matrix_d>>> futures;
87+
88+
auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
89+
const int end = start + size;
90+
std::vector<matrix_d> chunk_f_out;
91+
chunk_f_out.reserve(size);
92+
for (int i = start; i != end; i++)
93+
chunk_f_out.push_back(ReduceF()(
94+
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
95+
return chunk_f_out;
96+
};
97+
98+
int num_threads = get_num_threads(num_jobs);
99+
int num_jobs_per_thread = num_jobs / num_threads;
100+
futures.emplace_back(
101+
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));
102+
103+
#ifdef STAN_THREADS
104+
if (num_threads > 1) {
105+
const int num_big_threads = num_jobs % num_threads;
106+
const int first_big_thread = num_threads - num_big_threads;
107+
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
108+
i < num_threads; ++i, job_start += job_size) {
109+
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
110+
: num_jobs_per_thread;
111+
futures.emplace_back(
112+
std::async(std::launch::async, execute_chunk, job_start, job_size));
113+
}
114+
}
115+
#endif
116+
117+
// collect results
118+
std::vector<int> world_f_out;
119+
world_f_out.reserve(num_jobs);
120+
matrix_d world_output(0, 0);
121+
122+
int offset = 0;
123+
for (std::size_t i = 0; i < futures.size(); ++i) {
124+
const std::vector<matrix_d>& chunk_result = futures[i].get();
125+
if (i == 0)
126+
world_output.resize(chunk_result[0].rows(),
127+
num_jobs * chunk_result[0].cols());
128+
129+
for (const auto& job_result : chunk_result) {
130+
const int num_job_outputs = job_result.cols();
131+
world_f_out.push_back(num_job_outputs);
132+
133+
if (world_output.cols() < offset + num_job_outputs)
134+
world_output.conservativeResize(Eigen::NoChange,
135+
2 * (offset + num_job_outputs));
136+
137+
world_output.block(0, offset, world_output.rows(), num_job_outputs)
138+
= job_result;
139+
140+
offset += num_job_outputs;
141+
}
142+
}
143+
CombineF combine(shared_params, job_params);
144+
return combine(world_output, world_f_out);
145+
}
79146

80147
} // namespace internal
81148
} // namespace math

stan/math/rev/core.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include <stan/math/rev/core/build_vari_array.hpp>
66
#include <stan/math/rev/core/chainable_alloc.hpp>
77
#include <stan/math/rev/core/chainablestack.hpp>
8-
#include <stan/math/rev/core/init_chainablestack.hpp>
98
#include <stan/math/rev/core/ddv_vari.hpp>
109
#include <stan/math/rev/core/dv_vari.hpp>
1110
#include <stan/math/rev/core/dvd_vari.hpp>

stan/math/rev/core/autodiffstackstorage.hpp

Lines changed: 31 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -9,75 +9,40 @@ namespace math {
99

1010
/**
1111
* Provides a thread_local singleton if needed. Read warnings below!
12-
* With STAN_THREADS defined, the singleton is a thread_local static pointer
13-
* for performance reasons. When STAN_THREADS is not set, we have the old
14-
* static AD stack in the instance_ field because we saw odd performance
15-
* issues on the Mac Pro[4]. The rest of this commentary is specifically
16-
* talking about the design choices in the STAN_THREADS=true case.
17-
* When a TLS is used then initialization with
18-
* a constant expression is required for fast access to the TLS. As
19-
* the AD storage struct is non-POD it must be initialized as a
20-
* dynamic expression such that compilers will wrap any access to the
21-
* TLS by a TLS wrapper function which causes a significant
22-
* slow-down. A pointer to the AD storage instance can be initialized
23-
* to a compile-time constant expression of nullptr. In this case the
24-
* compiler avoids the use of a TLS wrapper function. Furthermore we
25-
* use the __thread keyword on compilers which support it. The
26-
* __thread keyword is a compiler-specific (gcc, clang, Intel)
27-
* extension which requires initialization with a compile time
28-
* constant expression. The C++11 keyword thread_local does allow for
29-
* constant and dynamic initialization of the TLS. Thus, only the
30-
* __thread keyword gurantees that constant initialization and it's
31-
* implied speedup, is used.
12+
* For performance reasons the singleton is a global static for the
13+
* case of no threading which is returned by a function. This design
14+
* should allow the compiler to apply necessary inlining to get
15+
* maximal performance. However, this design suffers from "the static
16+
* init order fiasco"[0]. Anywhere this is used, we must be
17+
* absolutely positive that it doesn't matter when the singleton will
18+
* get initialized relative to other static variables. In exchange,
19+
* we get a more performant singleton pattern for the non-threading
20+
* case. In the threading case we use the defacto standard C++11
21+
* singleton pattern relying on a function wrapping a static local
22+
* variable. This standard pattern is expected to be well supported
23+
* by the major compilers (as its standard), but it does incur some
24+
* performance penalty. There has been some discussion on this; see
25+
* [1] and [2] and the discussions those PRs link to as well.
3226
*
33-
* The initialzation of the AD instance is handled by the lifetime of
34-
* a AutodiffStackSingleton object. More specifically, the first
35-
* instance of the AutodiffStackSingleton object will initialize the
36-
* AD instance and take ownership. Thus whenever the first instance of
37-
* the AutodiffStackSingleton object gets destructed, the AD tape will
38-
* be destructed as well. Within stan-math the initialization of the
39-
* AD instance for the main thread of the program is handled by
40-
* instantiating once the singleton once in the
41-
* init_chainablestack.hpp file. Whenever STAN_THREADS is defined then
42-
* all created child threads must call the init() method of the AD
43-
* singleton in order to initialize the TLS if child threads want to
44-
* perform AD operations (the initialization in the main process is
45-
* already taken care of in any case).
46-
*
47-
* The design of a globally held (optionally TLS) pointer, which is
48-
* globally initialized, allows the compiler to apply necessary
49-
* inlining to get maximal performance. However, the design suffers
50-
* from "the static init order fiasco"[0]. Whenever the static init
51-
* order fiasco occurs, the C++ client of the library may call the
52-
* init method as needed to ensure proper initialization order. In
53-
* exchange, we get a more performant singleton pattern with automatic
54-
* initialization of the AD stack for the main thread. There has been
55-
* some discussion on earlier designs using the Mayer singleton
56-
* approach; see [1] and [2] and the discussions those PRs link to as
57-
* well.
27+
* These are thread_local only if the user asks for it with
28+
* -DSTAN_THREADS. This is primarily because Apple clang compilers
29+
* before 2016 don't support thread_local and the additional
30+
* performance cost. We have proposed removing support for those[3],
31+
* and at that time we should evaluate the performance of a switch to
32+
* thread_local. If there is no loss in performance, we can remove
33+
* this ifdef.
5834
*
5935
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
6036
* [1] https://github.com/stan-dev/math/pull/840
6137
* [2] https://github.com/stan-dev/math/pull/826
6238
* [3]
6339
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
64-
* [4] https://github.com/stan-dev/math/pull/1135
6540
*/
6641
template <typename ChainableT, typename ChainableAllocT>
6742
struct AutodiffStackSingleton {
6843
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
6944
AutodiffStackSingleton_t;
7045

71-
AutodiffStackSingleton() : own_instance_(init()) {}
72-
~AutodiffStackSingleton() {
73-
#ifdef STAN_THREADS
74-
if (own_instance_) {
75-
delete instance_;
76-
instance_ = nullptr;
77-
}
78-
#endif
79-
}
80-
8146
struct AutodiffStackStorage {
8247
AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;
8348

@@ -92,60 +57,28 @@ struct AutodiffStackSingleton {
9257
std::vector<size_t> nested_var_alloc_stack_starts_;
9358
};
9459

60+
AutodiffStackSingleton() = delete;
9561
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
9662
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;
9763

98-
static constexpr inline AutodiffStackStorage &instance() {
99-
return
64+
static inline AutodiffStackStorage &instance() {
10065
#ifdef STAN_THREADS
101-
*
66+
thread_local static AutodiffStackStorage instance_;
10267
#endif
103-
instance_;
68+
return instance_;
10469
}
10570

106-
private:
107-
static bool init() {
108-
#ifdef STAN_THREADS
109-
if (!instance_) {
110-
instance_ = new AutodiffStackStorage();
111-
return true;
112-
}
113-
#endif
114-
return false;
115-
}
71+
#ifndef STAN_THREADS
11672

117-
static
118-
#ifdef STAN_THREADS
119-
#ifdef __GNUC__
120-
__thread
121-
#else
122-
thread_local
123-
#endif
124-
#endif
125-
AutodiffStackStorage
126-
#ifdef STAN_THREADS
127-
*
73+
private:
74+
static AutodiffStackStorage instance_;
12875
#endif
129-
instance_;
130-
131-
bool own_instance_;
13276
};
13377

78+
#ifndef STAN_THREADS
13479
template <typename ChainableT, typename ChainableAllocT>
135-
#ifdef STAN_THREADS
136-
#ifdef __GNUC__
137-
__thread
138-
#else
139-
thread_local
140-
#endif
141-
#endif
142-
typename AutodiffStackSingleton<ChainableT,
143-
ChainableAllocT>::AutodiffStackStorage
144-
145-
#ifdef STAN_THREADS
146-
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
147-
= nullptr;
148-
#else
80+
typename AutodiffStackSingleton<ChainableT,
81+
ChainableAllocT>::AutodiffStackStorage
14982
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
15083
#endif
15184

stan/math/rev/core/init_chainablestack.hpp

Lines changed: 0 additions & 14 deletions
This file was deleted.

stan/math/rev/mat.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@
6868
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
6969
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
7070
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
71-
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
7271
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>
7372

7473
#endif

0 commit comments

Comments
 (0)