Skip to content

Commit db1a002

Browse files
committed
Revert "Merge pull request #1212 from stan-dev/feature/faster-ad-tls-v5"
This reverts commit 5e76d67, reversing changes made to ec52b8b.
1 parent dd45af4 commit db1a002

18 files changed

+145
-362
lines changed

Jenkinsfile

Lines changed: 1 addition & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,6 @@ pipeline {
4949
skipDefaultCheckout()
5050
preserveStashes(buildCount: 7)
5151
}
52-
environment {
53-
STAN_NUM_THREADS = '4'
54-
}
5552
stages {
5653
stage('Kill previous builds') {
5754
when {
@@ -228,17 +225,6 @@ pipeline {
228225
runTestsWin("test/unit")
229226
}
230227
}
231-
stage('Windows Threading') {
232-
agent { label 'windows' }
233-
steps {
234-
deleteDirWin()
235-
unstash 'MathSetup'
236-
bat "echo CXX=${env.CXX} -Werror > make/local"
237-
bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
238-
runTestsWin("test/unit -f thread")
239-
runTestsWin("test/unit -f map_rect")
240-
}
241-
}
242228
}
243229
}
244230
stage('Additional merge tests') {
@@ -250,7 +236,7 @@ pipeline {
250236
deleteDir()
251237
unstash 'MathSetup'
252238
sh "echo CXX=${GCC} >> make/local"
253-
sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
239+
sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
254240
runTests("test/unit")
255241
}
256242
post { always { retry(3) { deleteDir() } } }

stan/math/prim/mat/functor/map_rect.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#include <stan/math/prim/arr/err/check_matching_sizes.hpp>
55
#include <stan/math/prim/mat/fun/dims.hpp>
66
#include <stan/math/prim/mat/fun/typedefs.hpp>
7-
#include <stan/math/prim/scal/meta/return_type.hpp>
87

98
#define STAN_REGISTER_MAP_RECT(CALLID, FUNCTOR)
109

stan/math/prim/mat/functor/map_rect_concurrent.hpp

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33

44
#include <stan/math/prim/mat/fun/typedefs.hpp>
55

6-
#include <stan/math/prim/scal/meta/return_type.hpp>
6+
#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
7+
#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
78
#include <stan/math/prim/scal/err/invalid_argument.hpp>
89
#include <boost/lexical_cast.hpp>
910

10-
#include <cstdlib>
1111
#include <vector>
1212
#include <thread>
13+
#include <future>
14+
#include <cstdlib>
1315

1416
namespace stan {
1517
namespace math {
@@ -75,7 +77,72 @@ map_rect_concurrent(
7577
const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
7678
job_params,
7779
const std::vector<std::vector<double>>& x_r,
78-
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
80+
const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
81+
typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
82+
typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;
83+
84+
const int num_jobs = job_params.size();
85+
const vector_d shared_params_dbl = value_of(shared_params);
86+
std::vector<std::future<std::vector<matrix_d>>> futures;
87+
88+
auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
89+
const int end = start + size;
90+
std::vector<matrix_d> chunk_f_out;
91+
chunk_f_out.reserve(size);
92+
for (int i = start; i != end; i++)
93+
chunk_f_out.push_back(ReduceF()(
94+
shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
95+
return chunk_f_out;
96+
};
97+
98+
int num_threads = get_num_threads(num_jobs);
99+
int num_jobs_per_thread = num_jobs / num_threads;
100+
futures.emplace_back(
101+
std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));
102+
103+
#ifdef STAN_THREADS
104+
if (num_threads > 1) {
105+
const int num_big_threads = num_jobs % num_threads;
106+
const int first_big_thread = num_threads - num_big_threads;
107+
for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
108+
i < num_threads; ++i, job_start += job_size) {
109+
job_size = i >= first_big_thread ? num_jobs_per_thread + 1
110+
: num_jobs_per_thread;
111+
futures.emplace_back(
112+
std::async(std::launch::async, execute_chunk, job_start, job_size));
113+
}
114+
}
115+
#endif
116+
117+
// collect results
118+
std::vector<int> world_f_out;
119+
world_f_out.reserve(num_jobs);
120+
matrix_d world_output(0, 0);
121+
122+
int offset = 0;
123+
for (std::size_t i = 0; i < futures.size(); ++i) {
124+
const std::vector<matrix_d>& chunk_result = futures[i].get();
125+
if (i == 0)
126+
world_output.resize(chunk_result[0].rows(),
127+
num_jobs * chunk_result[0].cols());
128+
129+
for (const auto& job_result : chunk_result) {
130+
const int num_job_outputs = job_result.cols();
131+
world_f_out.push_back(num_job_outputs);
132+
133+
if (world_output.cols() < offset + num_job_outputs)
134+
world_output.conservativeResize(Eigen::NoChange,
135+
2 * (offset + num_job_outputs));
136+
137+
world_output.block(0, offset, world_output.rows(), num_job_outputs)
138+
= job_result;
139+
140+
offset += num_job_outputs;
141+
}
142+
}
143+
CombineF combine(shared_params, job_params);
144+
return combine(world_output, world_f_out);
145+
}
79146

80147
} // namespace internal
81148
} // namespace math

stan/math/rev/core.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#include <stan/math/rev/core/build_vari_array.hpp>
66
#include <stan/math/rev/core/chainable_alloc.hpp>
77
#include <stan/math/rev/core/chainablestack.hpp>
8-
#include <stan/math/rev/core/init_chainablestack.hpp>
98
#include <stan/math/rev/core/ddv_vari.hpp>
109
#include <stan/math/rev/core/dv_vari.hpp>
1110
#include <stan/math/rev/core/dvd_vari.hpp>

stan/math/rev/core/autodiffstackstorage.hpp

Lines changed: 37 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -7,98 +7,42 @@
77
namespace stan {
88
namespace math {
99

10-
// Internal macro used to modify global pointer definition to the
11-
// global AD instance.
12-
#ifdef STAN_THREADS
13-
// Whenever STAN_THREADS is set a TLS keyword is used. For reasons
14-
// explained below we use the GNU compiler extension __thread if
15-
// supported by the compiler while the generic thread_local C++11
16-
// keyword is used otherwise.
17-
#ifdef __GNUC__
18-
#define STAN_THREADS_DEF __thread
19-
#else
20-
#define STAN_THREADS_DEF thread_local
21-
#endif
22-
#else
23-
// In case STAN_THREADS is not set, then no modifier is needed.
24-
#define STAN_THREADS_DEF
25-
#endif
26-
2710
/**
28-
* This struct always provides access to the autodiff stack using
29-
* the singleton pattern. Read warnings below!
30-
*
31-
* The singleton <code>instance_</code> is a global static pointer,
32-
* which is thread local (TLS) if the STAN_THREADS preprocess variable
33-
* is defined.
11+
* Provides a thread_local singleton if needed. Read warnings below!
12+
* For performance reasons the singleton is a global static for the
13+
* case of no threading which is returned by a function. This design
14+
* should allow the compiler to apply necessary inlining to get
15+
* maximal performance. However, this design suffers from "the static
16+
* init order fiasco"[0]. Anywhere this is used, we must be
17+
* absolutely positive that it doesn't matter when the singleton will
18+
* get initialized relative to other static variables. In exchange,
19+
* we get a more performant singleton pattern for the non-threading
20+
* case. In the threading case we use the defacto standard C++11
21+
* singleton pattern relying on a function wrapping a static local
22+
* variable. This standard pattern is expected to be well supported
23+
* by the major compilers (as its standard), but it does incur some
24+
* performance penalty. There has been some discussion on this; see
25+
* [1] and [2] and the discussions those PRs link to as well.
3426
*
35-
* The use of a pointer is motivated by performance reasons for the
36-
* threading case. When a TLS is used, initialization with a constant
37-
* expression at compile time is required for fast access to the
38-
* TLS. As the autodiff storage struct is non-POD, its initialization
39-
* is a dynamic expression at compile time. These dynamic expressions
40-
* are wrapped, in the TLS case, by a TLS wrapper function which slows
41-
* down its access. Using a pointer instead allows to initialize at
42-
* compile time to <code>nullptr</code>, which is a compile time
43-
* constant. In this case, the compiler avoids the use of a TLS
44-
* wrapper function.
45-
*
46-
* For performance reasons we use the __thread keyword on compilers
47-
* which support it. The __thread keyword is a GNU compiler-specific
48-
* (gcc, clang, Intel) extension which requires initialization with a
49-
* compile time constant expression. The C++11 keyword thread_local
50-
* does allow for constant and dynamic initialization of the
51-
* TLS. Thus, only the __thread keyword gurantees that constant
52-
* initialization and it's implied speedup, is used.
53-
*
54-
* The initialzation of the AD instance at run-time is handled by the
55-
* lifetime of a AutodiffStackSingleton object. More specifically, the
56-
* first instance of the AutodiffStackSingleton object will initialize
57-
* the AD instance and take ownership (it is the only one instance
58-
* with the private member own_instance_ being true). Thus, whenever
59-
* the first instance of the AutodiffStackSingleton object gets
60-
* destructed, the AD tape will be destructed as well. Within
61-
* stan-math the initialization of the AD instance for the main thread
62-
* of the program is handled by instantiating the singleton once in
63-
* the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
64-
* then all created child threads must instantiate a
65-
* AutodiffStackSingleton object within the child thread before
66-
* accessing the AD system in order to initialize the TLS AD tape
67-
* within the child thread.
68-
*
69-
* The design of a globally held (optionally TLS) pointer, which is
70-
* globally initialized, allows the compiler to apply necessary
71-
* inlining to get maximal performance. However, the design suffers
72-
* from "the static init order fiasco"[0]. Whenever the static init
73-
* order fiasco occurs, the C++ client of the library may instantiate
74-
* a AutodiffStackSingleton object at the adequate code position prior
75-
* to any AD tape access to ensure proper initialization order. In
76-
* exchange, we get a more performant singleton pattern with automatic
77-
* initialization of the AD stack for the main thread. There has been
78-
* some discussion on earlier designs using the Mayer singleton
79-
* approach; see [1] and [2] and the discussions those PRs link to as
80-
* well.
27+
* These are thread_local only if the user asks for it with
28+
* -DSTAN_THREADS. This is primarily because Apple clang compilers
29+
* before 2016 don't support thread_local and the additional
30+
* performance cost. We have proposed removing support for those[3],
31+
* and at that time we should evaluate the performance of a switch to
32+
* thread_local. If there is no loss in performance, we can remove
33+
* this ifdef.
8134
*
8235
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
8336
* [1] https://github.com/stan-dev/math/pull/840
8437
* [2] https://github.com/stan-dev/math/pull/826
8538
* [3]
8639
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
87-
* [4] https://github.com/stan-dev/math/pull/1135
8840
*/
8941
template <typename ChainableT, typename ChainableAllocT>
9042
struct AutodiffStackSingleton {
9143
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
9244
AutodiffStackSingleton_t;
9345

94-
AutodiffStackSingleton() : own_instance_(init()) {}
95-
~AutodiffStackSingleton() {
96-
if (own_instance_) {
97-
delete instance_;
98-
instance_ = nullptr;
99-
}
100-
}
101-
10246
struct AutodiffStackStorage {
10347
AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;
10448

@@ -113,32 +57,30 @@ struct AutodiffStackSingleton {
11357
std::vector<size_t> nested_var_alloc_stack_starts_;
11458
};
11559

60+
AutodiffStackSingleton() = delete;
11661
explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
11762
AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;
11863

119-
static inline constexpr AutodiffStackStorage &instance() {
120-
return *instance_;
64+
static inline AutodiffStackStorage &instance() {
65+
#ifdef STAN_THREADS
66+
thread_local static AutodiffStackStorage instance_;
67+
#endif
68+
return instance_;
12169
}
12270

123-
private:
124-
static bool init() {
125-
if (!instance_) {
126-
instance_ = new AutodiffStackStorage();
127-
return true;
128-
}
129-
return false;
130-
}
71+
#ifndef STAN_THREADS
13172

132-
static STAN_THREADS_DEF AutodiffStackStorage *instance_;
133-
const bool own_instance_;
73+
private:
74+
static AutodiffStackStorage instance_;
75+
#endif
13476
};
13577

78+
#ifndef STAN_THREADS
13679
template <typename ChainableT, typename ChainableAllocT>
137-
STAN_THREADS_DEF
138-
typename AutodiffStackSingleton<ChainableT,
139-
ChainableAllocT>::AutodiffStackStorage
140-
*AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
141-
= nullptr;
80+
typename AutodiffStackSingleton<ChainableT,
81+
ChainableAllocT>::AutodiffStackStorage
82+
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83+
#endif
14284

14385
} // namespace math
14486
} // namespace stan

stan/math/rev/core/init_chainablestack.hpp

Lines changed: 0 additions & 18 deletions
This file was deleted.

stan/math/rev/mat.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@
6868
#include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
6969
#include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
7070
#include <stan/math/rev/mat/functor/integrate_dae.hpp>
71-
#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
7271
#include <stan/math/rev/mat/functor/map_rect_reduce.hpp>
7372

7473
#endif

0 commit comments

Comments
 (0)