7
7
namespace stan {
8
8
namespace math {
9
9
10
- // Internal macro used to modify global pointer definition to the
11
- // global AD instance.
12
- #ifdef STAN_THREADS
13
- // Whenever STAN_THREADS is set a TLS keyword is used. For reasons
14
- // explained below we use the GNU compiler extension __thread if
15
- // supported by the compiler while the generic thread_local C++11
16
- // keyword is used otherwise.
17
- #ifdef __GNUC__
18
- #define STAN_THREADS_DEF __thread
19
- #else
20
- #define STAN_THREADS_DEF thread_local
21
- #endif
22
- #else
23
- // In case STAN_THREADS is not set, then no modifier is needed.
24
- #define STAN_THREADS_DEF
25
- #endif
26
-
27
10
/* *
28
- * This struct always provides access to the autodiff stack using
29
- * the singleton pattern. Read warnings below!
30
- *
31
- * The singleton <code>instance_</code> is a global static pointer,
32
- * which is thread local (TLS) if the STAN_THREADS preprocessor variable
33
- * is defined.
11
+ * Provides a thread_local singleton if needed. Read warnings below!
12
+ * For performance reasons the singleton is a global static for the
13
+ * case of no threading which is returned by a function. This design
14
+ * should allow the compiler to apply necessary inlining to get
15
+ * maximal performance. However, this design suffers from "the static
16
+ * init order fiasco"[0]. Anywhere this is used, we must be
17
+ * absolutely positive that it doesn't matter when the singleton will
18
+ * get initialized relative to other static variables. In exchange,
19
+ * we get a more performant singleton pattern for the non-threading
20
+ * case. In the threading case we use the de facto standard C++11
21
+ * singleton pattern relying on a function wrapping a static local
22
+ * variable. This standard pattern is expected to be well supported
23
+ * by the major compilers (as it is standard), but it does incur some
24
+ * performance penalty. There has been some discussion on this; see
25
+ * [1] and [2] and the discussions those PRs link to as well.
34
26
*
35
- * The use of a pointer is motivated by performance reasons for the
36
- * threading case. When a TLS is used, initialization with a constant
37
- * expression at compile time is required for fast access to the
38
- * TLS. As the autodiff storage struct is non-POD, its initialization
39
- * is a dynamic expression at compile time. These dynamic expressions
40
- * are wrapped, in the TLS case, by a TLS wrapper function which slows
41
- * down its access. Using a pointer instead allows to initialize at
42
- * compile time to <code>nullptr</code>, which is a compile time
43
- * constant. In this case, the compiler avoids the use of a TLS
44
- * wrapper function.
45
- *
46
- * For performance reasons we use the __thread keyword on compilers
47
- * which support it. The __thread keyword is a GNU compiler-specific
48
- * (gcc, clang, Intel) extension which requires initialization with a
49
- * compile time constant expression. The C++11 keyword thread_local
50
- * does allow for constant and dynamic initialization of the
51
- * TLS. Thus, only the __thread keyword guarantees that constant
52
- * initialization, and its implied speedup, is used.
53
- *
54
- * The initialization of the AD instance at run-time is handled by the
55
- * lifetime of an AutodiffStackSingleton object. More specifically, the
56
- * first instance of the AutodiffStackSingleton object will initialize
57
- * the AD instance and take ownership (it is the only instance
58
- * with the private member own_instance_ being true). Thus, whenever
59
- * the first instance of the AutodiffStackSingleton object gets
60
- * destructed, the AD tape will be destructed as well. Within
61
- * stan-math the initialization of the AD instance for the main thread
62
- * of the program is handled by instantiating the singleton once in
63
- * the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
64
- * then all created child threads must instantiate an
65
- * AutodiffStackSingleton object within the child thread before
66
- * accessing the AD system in order to initialize the TLS AD tape
67
- * within the child thread.
68
- *
69
- * The design of a globally held (optionally TLS) pointer, which is
70
- * globally initialized, allows the compiler to apply necessary
71
- * inlining to get maximal performance. However, the design suffers
72
- * from "the static init order fiasco"[0]. Whenever the static init
73
- * order fiasco occurs, the C++ client of the library may instantiate
74
- * an AutodiffStackSingleton object at the adequate code position prior
75
- * to any AD tape access to ensure proper initialization order. In
76
- * exchange, we get a more performant singleton pattern with automatic
77
- * initialization of the AD stack for the main thread. There has been
78
- * some discussion on earlier designs using the Meyers singleton
79
- * approach; see [1] and [2] and the discussions those PRs link to as
80
- * well.
27
+ * These are thread_local only if the user asks for it with
28
+ * -DSTAN_THREADS. This is primarily because Apple clang compilers
29
+ * before 2016 don't support thread_local and because of the additional
30
+ * performance cost. We have proposed removing support for those[3],
31
+ * and at that time we should evaluate the performance of a switch to
32
+ * thread_local. If there is no loss in performance, we can remove
33
+ * this ifdef.
81
34
*
82
35
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
83
36
* [1] https://github.com/stan-dev/math/pull/840
84
37
* [2] https://github.com/stan-dev/math/pull/826
85
38
* [3]
86
39
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
87
- * [4] https://github.com/stan-dev/math/pull/1135
88
40
*/
89
41
template <typename ChainableT, typename ChainableAllocT>
90
42
struct AutodiffStackSingleton {
91
43
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
92
44
AutodiffStackSingleton_t;
93
45
94
- AutodiffStackSingleton () : own_instance_(init()) {}
95
- ~AutodiffStackSingleton () {
96
- if (own_instance_) {
97
- delete instance_;
98
- instance_ = nullptr ;
99
- }
100
- }
101
-
102
46
struct AutodiffStackStorage {
103
47
AutodiffStackStorage &operator =(const AutodiffStackStorage &) = delete ;
104
48
@@ -113,32 +57,30 @@ struct AutodiffStackSingleton {
113
57
std::vector<size_t > nested_var_alloc_stack_starts_;
114
58
};
115
59
60
+ AutodiffStackSingleton () = delete ;
116
61
explicit AutodiffStackSingleton (AutodiffStackSingleton_t const &) = delete;
117
62
AutodiffStackSingleton &operator =(const AutodiffStackSingleton_t &) = delete ;
118
63
119
- static inline constexpr AutodiffStackStorage &instance () {
120
- return *instance_;
64
+ static inline AutodiffStackStorage &instance () {
65
+ #ifdef STAN_THREADS
66
+ thread_local static AutodiffStackStorage instance_;
67
+ #endif
68
+ return instance_;
121
69
}
122
70
123
- private:
124
- static bool init () {
125
- if (!instance_) {
126
- instance_ = new AutodiffStackStorage ();
127
- return true ;
128
- }
129
- return false ;
130
- }
71
+ #ifndef STAN_THREADS
131
72
132
- static STAN_THREADS_DEF AutodiffStackStorage *instance_;
133
- const bool own_instance_;
73
+ private:
74
+ static AutodiffStackStorage instance_;
75
+ #endif
134
76
};
135
77
78
+ #ifndef STAN_THREADS
136
79
template <typename ChainableT, typename ChainableAllocT>
137
- STAN_THREADS_DEF
138
- typename AutodiffStackSingleton<ChainableT,
139
- ChainableAllocT>::AutodiffStackStorage
140
- *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
141
- = nullptr ;
80
+ typename AutodiffStackSingleton<ChainableT,
81
+ ChainableAllocT>::AutodiffStackStorage
82
+ AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83
+ #endif
142
84
143
85
} // namespace math
144
86
} // namespace stan
0 commit comments