@@ -9,40 +9,75 @@ namespace math {
9
9
10
10
/* *
11
11
* Provides a thread_local singleton if needed. Read warnings below!
12
- * For performance reasons the singleton is a global static for the
13
- * case of no threading which is returned by a function. This design
14
- * should allow the compiler to apply necessary inlining to get
15
- * maximal performance. However, this design suffers from "the static
16
- * init order fiasco"[0]. Anywhere this is used, we must be
17
- * absolutely positive that it doesn't matter when the singleton will
18
- * get initialized relative to other static variables. In exchange,
19
- * we get a more performant singleton pattern for the non-threading
20
- * case. In the threading case we use the defacto standard C++11
21
- * singleton pattern relying on a function wrapping a static local
22
- * variable. This standard pattern is expected to be well supported
23
- * by the major compilers (as its standard), but it does incur some
24
- * performance penalty. There has been some discussion on this; see
25
- * [1] and [2] and the discussions those PRs link to as well.
12
+ * With STAN_THREADS defined, the singleton is a thread_local static pointer
13
+ * for performance reasons. When STAN_THREADS is not set, we have the old
14
+ * static AD stack in the instance_ field because we saw odd performance
15
+ * issues on the Mac Pro[4]. The rest of this commentary is specifically
16
+ * talking about the design choices in the STAN_THREADS=true case.
17
+ * When a TLS is used then initialization with
18
+ * a constant expression is required for fast access to the TLS. As
19
+ * the AD storage struct is non-POD it must be initialized as a
20
+ * dynamic expression such that compilers will wrap any access to the
21
+ * TLS by a TLS wrapper function which causes a significant
22
+ * slow-down. A pointer to the AD storage instance can be initialized
23
+ * to a compile-time constant expression of nullptr. In this case the
24
+ * compiler avoids the use of a TLS wrapper function. Furthermore we
25
+ * use the __thread keyword on compilers which support it. The
26
+ * __thread keyword is a compiler-specific (gcc, clang, Intel)
27
+ * extension which requires initialization with a compile time
28
+ * constant expression. The C++11 keyword thread_local does allow for
29
+ * constant and dynamic initialization of the TLS. Thus, only the
30
+ * __thread keyword guarantees that constant initialization, and its
31
+ * implied speedup, is used.
26
32
*
27
- * These are thread_local only if the user asks for it with
28
- * -DSTAN_THREADS. This is primarily because Apple clang compilers
29
- * before 2016 don't support thread_local and the additional
30
- * performance cost. We have proposed removing support for those[3],
31
- * and at that time we should evaluate the performance of a switch to
32
- * thread_local. If there is no loss in performance, we can remove
33
- * this ifdef.
33
+ * The initialization of the AD instance is handled by the lifetime of
34
+ * an AutodiffStackSingleton object. More specifically, the first
35
+ * instance of the AutodiffStackSingleton object will initialize the
36
+ * AD instance and take ownership. Thus whenever the first instance of
37
+ * the AutodiffStackSingleton object gets destructed, the AD tape will
38
+ * be destructed as well. Within stan-math the initialization of the
39
+ * AD instance for the main thread of the program is handled by
40
+ * instantiating the singleton once in the
41
+ * init_chainablestack.hpp file. Whenever STAN_THREADS is defined then
42
+ * all created child threads must call the init() method of the AD
43
+ * singleton in order to initialize the TLS if child threads want to
44
+ * perform AD operations (the initialization in the main process is
45
+ * already taken care of in any case).
46
+ *
47
+ * The design of a globally held (optionally TLS) pointer, which is
48
+ * globally initialized, allows the compiler to apply necessary
49
+ * inlining to get maximal performance. However, the design suffers
50
+ * from "the static init order fiasco"[0]. Whenever the static init
51
+ * order fiasco occurs, the C++ client of the library may call the
52
+ * init method as needed to ensure proper initialization order. In
53
+ * exchange, we get a more performant singleton pattern with automatic
54
+ * initialization of the AD stack for the main thread. There has been
55
+ * some discussion on earlier designs using the Meyers singleton
56
+ * approach; see [1] and [2] and the discussions those PRs link to as
57
+ * well.
34
58
*
35
59
* [0] https://isocpp.org/wiki/faq/ctors#static-init-order
36
60
* [1] https://github.com/stan-dev/math/pull/840
37
61
* [2] https://github.com/stan-dev/math/pull/826
38
62
* [3]
39
63
* http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
64
+ * [4] https://github.com/stan-dev/math/pull/1135
40
65
*/
41
66
template <typename ChainableT, typename ChainableAllocT>
42
67
struct AutodiffStackSingleton {
43
68
typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
44
69
AutodiffStackSingleton_t;
45
70
71
+ AutodiffStackSingleton () : own_instance_(init()) {}
72
+ ~AutodiffStackSingleton () {
73
+ #ifdef STAN_THREADS
74
+ if (own_instance_) {
75
+ delete instance_;
76
+ instance_ = nullptr ;
77
+ }
78
+ #endif
79
+ }
80
+
46
81
struct AutodiffStackStorage {
47
82
AutodiffStackStorage &operator =(const AutodiffStackStorage &) = delete ;
48
83
@@ -57,28 +92,60 @@ struct AutodiffStackSingleton {
57
92
std::vector<size_t > nested_var_alloc_stack_starts_;
58
93
};
59
94
60
- AutodiffStackSingleton () = delete ;
61
95
explicit AutodiffStackSingleton (AutodiffStackSingleton_t const &) = delete;
62
96
AutodiffStackSingleton &operator =(const AutodiffStackSingleton_t &) = delete ;
63
97
64
- static inline AutodiffStackStorage &instance () {
98
+ static constexpr inline AutodiffStackStorage &instance () {
99
+ return
65
100
#ifdef STAN_THREADS
66
- thread_local static AutodiffStackStorage instance_;
101
+ *
67
102
#endif
68
- return instance_;
103
+ instance_;
69
104
}
70
105
71
- #ifndef STAN_THREADS
72
-
73
106
private:
74
- static AutodiffStackStorage instance_;
107
+ static bool init () {
108
+ #ifdef STAN_THREADS
109
+ if (!instance_) {
110
+ instance_ = new AutodiffStackStorage ();
111
+ return true ;
112
+ }
113
+ #endif
114
+ return false ;
115
+ }
116
+
117
+ static
118
+ #ifdef STAN_THREADS
119
+ #ifdef __GNUC__
120
+ __thread
121
+ #else
122
+ thread_local
123
+ #endif
75
124
#endif
125
+ AutodiffStackStorage
126
+ #ifdef STAN_THREADS
127
+ *
128
+ #endif
129
+ instance_;
130
+
131
+ bool own_instance_;
76
132
};
77
133
78
- #ifndef STAN_THREADS
79
134
template <typename ChainableT, typename ChainableAllocT>
80
- typename AutodiffStackSingleton<ChainableT,
81
- ChainableAllocT>::AutodiffStackStorage
135
+ #ifdef STAN_THREADS
136
+ #ifdef __GNUC__
137
+ __thread
138
+ #else
139
+ thread_local
140
+ #endif
141
+ #endif
142
+ typename AutodiffStackSingleton<ChainableT,
143
+ ChainableAllocT>::AutodiffStackStorage
144
+
145
+ #ifdef STAN_THREADS
146
+ *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
147
+ = nullptr ;
148
+ #else
82
149
AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83
150
#endif
84
151
0 commit comments