@@ -9,75 +9,40 @@ namespace math {
9
9
10
10
/* *
11
11
* Provides a thread_local singleton if needed. Read warnings below!
12
- * With STAN_THREADS defined, the singleton is a thread_local static pointer
13
- * for performance reasons. When STAN_THREADS is not set, we have the old
14
- * static AD stack in the instance_ field because we saw odd performance
15
- * issues on the Mac Pro[4]. The rest of this commentary is specifically
16
- * talking about the design choices in the STAN_THREADS=true case.
17
- * When a TLS is used then initialization with
18
- * a constant expression is required for fast access to the TLS. As
19
- * the AD storage struct is non-POD it must be initialized as a
20
- * dynamic expression such that compilers will wrap any access to the
21
- * TLS by a TLS wrapper function which causes a significant
22
- * slow-down. A pointer to the AD storage instance can be initialized
23
- * to a compile-time constant expression of nullptr. In this case the
24
- * compiler avoids the use of a TLS wrapper function. Furthermore we
25
- * use the __thread keyword on compilers which support it. The
26
- * __thread keyword is a compiler-specific (gcc, clang, Intel)
27
- * extension which requires initialization with a compile time
28
- * constant expression. The C++11 keyword thread_local does allow for
29
- * constant and dynamic initialization of the TLS. Thus, only the
30
- * __thread keyword gurantees that constant initialization and it's
31
- * implied speedup, is used.
12
+ * For performance reasons the singleton is a global static for the
13
+ * case of no threading which is returned by a function. This design
14
+ * should allow the compiler to apply necessary inlining to get
15
+ * maximal performance. However, this design suffers from "the static
16
+ * init order fiasco"[0]. Anywhere this is used, we must be
17
+ * absolutely positive that it doesn't matter when the singleton will
18
+ * get initialized relative to other static variables. In exchange,
19
+ * we get a more performant singleton pattern for the non-threading
20
+ * case. In the threading case we use the de facto standard C++11
21
+ * singleton pattern relying on a function wrapping a static local
22
+ * variable. This standard pattern is expected to be well supported
23
+ * by the major compilers (as it's standard), but it does incur some
24
+ * performance penalty. There has been some discussion on this; see
25
+ * [1] and [2] and the discussions those PRs link to as well.
32
26
*
33
- * The initialzation of the AD instance is handled by the lifetime of
34
- * a AutodiffStackSingleton object. More specifically, the first
35
- * instance of the AutodiffStackSingleton object will initialize the
36
- * AD instance and take ownership. Thus whenever the first instance of
37
- * the AutodiffStackSingleton object gets destructed, the AD tape will
38
- * be destructed as well. Within stan-math the initialization of the
39
- * AD instance for the main thread of the program is handled by
40
- * instantiating once the singleton once in the
41
- * init_chainablestack.hpp file. Whenever STAN_THREADS is defined then
42
- * all created child threads must call the init() method of the AD
43
- * singleton in order to initialize the TLS if child threads want to
44
- * perform AD operations (the initialization in the main process is
45
- * already taken care of in any case).
46
- *
47
- * The design of a globally held (optionally TLS) pointer, which is
48
- * globally initialized, allows the compiler to apply necessary
49
- * inlining to get maximal performance. However, the design suffers
50
- * from "the static init order fiasco"[0]. Whenever the static init
51
- * order fiasco occurs, the C++ client of the library may call the
52
- * init method as needed to ensure proper initialization order. In
53
- * exchange, we get a more performant singleton pattern with automatic
54
- * initialization of the AD stack for the main thread. There has been
55
- * some discussion on earlier designs using the Mayer singleton
56
- * approach; see [1] and [2] and the discussions those PRs link to as
57
- * well.
27
+ * These are thread_local only if the user asks for it with
28
+ * -DSTAN_THREADS. This is primarily because Apple clang compilers
29
+ * before 2016 don't support thread_local and because of the additional
30
+ * performance cost. We have proposed removing support for those[3],
31
+ * and at that time we should evaluate the performance of a switch to
32
+ * thread_local. If there is no loss in performance, we can remove
33
+ * this ifdef.
58
34
*
59
35
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
60
36
* [1] https://github.com/stan-dev/math/pull/840
61
37
* [2] https://github.com/stan-dev/math/pull/826
62
38
* [3]
63
39
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
64
- * [4] https://github.com/stan-dev/math/pull/1135
65
40
*/
66
41
template <typename ChainableT, typename ChainableAllocT>
67
42
struct AutodiffStackSingleton {
68
43
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
69
44
AutodiffStackSingleton_t;
70
45
71
- AutodiffStackSingleton () : own_instance_(init()) {}
72
- ~AutodiffStackSingleton () {
73
- #ifdef STAN_THREADS
74
- if (own_instance_) {
75
- delete instance_;
76
- instance_ = nullptr ;
77
- }
78
- #endif
79
- }
80
-
81
46
struct AutodiffStackStorage {
82
47
AutodiffStackStorage &operator =(const AutodiffStackStorage &) = delete ;
83
48
@@ -92,60 +57,28 @@ struct AutodiffStackSingleton {
92
57
std::vector<size_t > nested_var_alloc_stack_starts_;
93
58
};
94
59
60
+ AutodiffStackSingleton () = delete ;
95
61
explicit AutodiffStackSingleton (AutodiffStackSingleton_t const &) = delete;
96
62
AutodiffStackSingleton &operator =(const AutodiffStackSingleton_t &) = delete ;
97
63
98
- static constexpr inline AutodiffStackStorage &instance () {
99
- return
64
+ static inline AutodiffStackStorage &instance () {
100
65
#ifdef STAN_THREADS
101
- *
66
+ thread_local static AutodiffStackStorage instance_;
102
67
#endif
103
- instance_;
68
+ return instance_;
104
69
}
105
70
106
- private:
107
- static bool init () {
108
- #ifdef STAN_THREADS
109
- if (!instance_) {
110
- instance_ = new AutodiffStackStorage ();
111
- return true ;
112
- }
113
- #endif
114
- return false ;
115
- }
71
+ #ifndef STAN_THREADS
116
72
117
- static
118
- #ifdef STAN_THREADS
119
- #ifdef __GNUC__
120
- __thread
121
- #else
122
- thread_local
123
- #endif
124
- #endif
125
- AutodiffStackStorage
126
- #ifdef STAN_THREADS
127
- *
73
+ private:
74
+ static AutodiffStackStorage instance_;
128
75
#endif
129
- instance_;
130
-
131
- bool own_instance_;
132
76
};
133
77
78
+ #ifndef STAN_THREADS
134
79
template <typename ChainableT, typename ChainableAllocT>
135
- #ifdef STAN_THREADS
136
- #ifdef __GNUC__
137
- __thread
138
- #else
139
- thread_local
140
- #endif
141
- #endif
142
- typename AutodiffStackSingleton<ChainableT,
143
- ChainableAllocT>::AutodiffStackStorage
144
-
145
- #ifdef STAN_THREADS
146
- *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
147
- = nullptr ;
148
- #else
80
+ typename AutodiffStackSingleton<ChainableT,
81
+ ChainableAllocT>::AutodiffStackStorage
149
82
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
150
83
#endif
151
84
0 commit comments