stan-dev
diff --git a/‎Jenkinsfile‎
Lines changed: 28 additions & 27 deletions b/‎Jenkinsfile‎
Lines changed: 28 additions & 27 deletions
diff --git a/‎stan/math/prim/mat/functor/map_rect.hpp‎
Lines changed: 1 addition & 0 deletions b/‎stan/math/prim/mat/functor/map_rect.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎stan/math/prim/mat/functor/map_rect_concurrent.hpp‎
Lines changed: 3 additions & 70 deletions b/‎stan/math/prim/mat/functor/map_rect_concurrent.hpp‎
Lines changed: 3 additions & 70 deletions
diff --git a/‎stan/math/rev/core.hpp‎
Lines changed: 1 addition & 0 deletions b/‎stan/math/rev/core.hpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎stan/math/rev/core/autodiffstackstorage.hpp‎
Lines changed: 98 additions & 31 deletions b/‎stan/math/rev/core/autodiffstackstorage.hpp‎
Lines changed: 98 additions & 31 deletions
diff --git a/‎stan/math/rev/core/init_chainablestack.hpp‎
Lines changed: 14 additions & 0 deletions b/‎stan/math/rev/core/init_chainablestack.hpp‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎stan/math/rev/mat.hpp‎
Lines changed: 1 addition & 0 deletions b/‎stan/math/rev/mat.hpp‎
Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,9 @@ pipeline {
         skipDefaultCheckout()
         preserveStashes(buildCount: 7)
     }
+    environment {
+        STAN_NUM_THREADS = '4'
+    }
     stages {
         stage('Kill previous builds') {
             when {
@@ -159,6 +162,21 @@ pipeline {
                     }
                     post { always { retry(3) { deleteDir() } } }
                 }
+                stage('Windows Threading tests') {
+                    agent { label 'windows' }
+                    steps {
+                        deleteDirWin()
+                        unstash 'MathSetup'
+                        bat "echo CXX=${env.CXX} -Werror > make/local"
+                        bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
+                        runTestsWin("test/unit -f thread")
+                        runTestsWin("test/unit -f map_rect")
+                    }
+                }
+            }
+        }
+        stage('Always-run tests part 2') {
+            parallel {
                 stage('Full unit with GPU') {
                     agent { label "gpu" }
                     steps {
@@ -172,19 +190,6 @@ pipeline {
                     }
                     post { always { retry(3) { deleteDir() } } }
                 }
-                stage('Windows Headers & Unit') {
-                    agent { label 'windows' }
-                    steps {
-                        deleteDirWin()
-                        unstash 'MathSetup'
-                        bat "make -j${env.PARALLEL} test-headers"
-                        runTestsWin("test/unit")
-                    }
-                }
-            }
-        }
-        stage('Always-run tests part 2') {
-            parallel {
                 stage('Distribution tests') {
                     agent { label "distribution-tests" }
                     steps {
@@ -212,19 +217,6 @@ pipeline {
                             }
                     }
                 }
-                stage('Threading tests') {
-                    agent any
-                    steps {
-                        deleteDir()
-                        unstash 'MathSetup'
-                        sh "echo CXX=${env.CXX} -Werror > make/local"
-                        sh "echo CPPFLAGS+=-DSTAN_THREADS >> make/local"
-                        runTests("test/unit -f thread")
-                        sh "find . -name *_test.xml | xargs rm"
-                        runTests("test/unit -f map_rect")
-                    }
-                    post { always { retry(3) { deleteDir() } } }
-                }
             }
         }
         stage('Additional merge tests') {
@@ -236,7 +228,7 @@ pipeline {
                         deleteDir()
                         unstash 'MathSetup'
                         sh "echo CXX=${GCC} >> make/local"
-                        sh "echo CPPFLAGS=-DSTAN_THREADS >> make/local"
+                        sh "echo CXXFLAGS=-DSTAN_THREADS >> make/local"
                         runTests("test/unit")
                     }
                     post { always { retry(3) { deleteDir() } } }
@@ -252,6 +244,15 @@ pipeline {
                     }
                     post { always { retry(3) { deleteDir() } } }
                 }
+                stage('Windows Headers & Unit') {
+                    agent { label 'windows' }
+                    steps {
+                        deleteDirWin()
+                        unstash 'MathSetup'
+                        bat "make -j${env.PARALLEL} test-headers"
+                        runTestsWin("test/unit")
+                    }
+                }
             }
         }
         stage('Upstream tests') {
 
@@ -1,6 +1,7 @@
 #ifndef STAN_MATH_PRIM_MAT_FUNCTOR_MAP_RECT_HPP
 #define STAN_MATH_PRIM_MAT_FUNCTOR_MAP_RECT_HPP
 
+#include <stan/math/prim/scal/meta/return_type.hpp>
 #include <stan/math/prim/arr/err/check_matching_sizes.hpp>
 #include <stan/math/prim/mat/fun/dims.hpp>
 #include <stan/math/prim/mat/fun/typedefs.hpp>
 
@@ -3,15 +3,13 @@
 
 #include <stan/math/prim/mat/fun/typedefs.hpp>
 
-#include <stan/math/prim/mat/functor/map_rect_reduce.hpp>
-#include <stan/math/prim/mat/functor/map_rect_combine.hpp>
+#include <stan/math/prim/scal/meta/return_type.hpp>
 #include <stan/math/prim/scal/err/invalid_argument.hpp>
 #include <boost/lexical_cast.hpp>
 
+#include <cstdlib>
 #include <vector>
 #include <thread>
-#include <future>
-#include <cstdlib>
 
 namespace stan {
 namespace math {
@@ -77,72 +75,7 @@ map_rect_concurrent(
     const std::vector<Eigen::Matrix<T_job_param, Eigen::Dynamic, 1>>&
         job_params,
     const std::vector<std::vector<double>>& x_r,
-    const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr) {
-  typedef map_rect_reduce<F, T_shared_param, T_job_param> ReduceF;
-  typedef map_rect_combine<F, T_shared_param, T_job_param> CombineF;
-
-  const int num_jobs = job_params.size();
-  const vector_d shared_params_dbl = value_of(shared_params);
-  std::vector<std::future<std::vector<matrix_d>>> futures;
-
-  auto execute_chunk = [&](int start, int size) -> std::vector<matrix_d> {
-    const int end = start + size;
-    std::vector<matrix_d> chunk_f_out;
-    chunk_f_out.reserve(size);
-    for (int i = start; i != end; i++)
-      chunk_f_out.push_back(ReduceF()(
-          shared_params_dbl, value_of(job_params[i]), x_r[i], x_i[i], msgs));
-    return chunk_f_out;
-  };
-
-  int num_threads = get_num_threads(num_jobs);
-  int num_jobs_per_thread = num_jobs / num_threads;
-  futures.emplace_back(
-      std::async(std::launch::deferred, execute_chunk, 0, num_jobs_per_thread));
-
-#ifdef STAN_THREADS
-  if (num_threads > 1) {
-    const int num_big_threads = num_jobs % num_threads;
-    const int first_big_thread = num_threads - num_big_threads;
-    for (int i = 1, job_start = num_jobs_per_thread, job_size = 0;
-         i < num_threads; ++i, job_start += job_size) {
-      job_size = i >= first_big_thread ? num_jobs_per_thread + 1
-                                       : num_jobs_per_thread;
-      futures.emplace_back(
-          std::async(std::launch::async, execute_chunk, job_start, job_size));
-    }
-  }
-#endif
-
-  // collect results
-  std::vector<int> world_f_out;
-  world_f_out.reserve(num_jobs);
-  matrix_d world_output(0, 0);
-
-  int offset = 0;
-  for (std::size_t i = 0; i < futures.size(); ++i) {
-    const std::vector<matrix_d>& chunk_result = futures[i].get();
-    if (i == 0)
-      world_output.resize(chunk_result[0].rows(),
-                          num_jobs * chunk_result[0].cols());
-
-    for (const auto& job_result : chunk_result) {
-      const int num_job_outputs = job_result.cols();
-      world_f_out.push_back(num_job_outputs);
-
-      if (world_output.cols() < offset + num_job_outputs)
-        world_output.conservativeResize(Eigen::NoChange,
-                                        2 * (offset + num_job_outputs));
-
-      world_output.block(0, offset, world_output.rows(), num_job_outputs)
-          = job_result;
-
-      offset += num_job_outputs;
-    }
-  }
-  CombineF combine(shared_params, job_params);
-  return combine(world_output, world_f_out);
-}
+    const std::vector<std::vector<int>>& x_i, std::ostream* msgs = nullptr);
 
 }  // namespace internal
 }  // namespace math
 
@@ -5,6 +5,7 @@
 #include <stan/math/rev/core/build_vari_array.hpp>
 #include <stan/math/rev/core/chainable_alloc.hpp>
 #include <stan/math/rev/core/chainablestack.hpp>
+#include <stan/math/rev/core/init_chainablestack.hpp>
 #include <stan/math/rev/core/ddv_vari.hpp>
 #include <stan/math/rev/core/dv_vari.hpp>
 #include <stan/math/rev/core/dvd_vari.hpp>
 
@@ -9,40 +9,75 @@ namespace math {
 
 /**
  * Provides a thread_local singleton if needed. Read warnings below!
- * For performance reasons the singleton is a global static for the
- * case of no threading which is returned by a function. This design
- * should allow the compiler to apply necessary inlining to get
- * maximal performance. However, this design suffers from "the static
- * init order fiasco"[0].  Anywhere this is used, we must be
- * absolutely positive that it doesn't matter when the singleton will
- * get initialized relative to other static variables.  In exchange,
- * we get a more performant singleton pattern for the non-threading
- * case. In the threading case we use the defacto standard C++11
- * singleton pattern relying on a function wrapping a static local
- * variable. This standard pattern is expected to be well supported
- * by the major compilers (as its standard), but it does incur some
- * performance penalty.  There has been some discussion on this; see
- * [1] and [2] and the discussions those PRs link to as well.
+ * With STAN_THREADS defined, the singleton is a thread_local static pointer
+ * for performance reasons. When STAN_THREADS is not set, we have the old
+ * static AD stack in the instance_ field because we saw odd performance
+ * issues on the Mac Pro[4]. The rest of this commentary is specifically
+ * talking about the design choices in the STAN_THREADS=true case.
+ * When a TLS is used then initialization with
+ * a constant expression is required for fast access to the TLS. As
+ * the AD storage struct is non-POD it must be initialized as a
+ * dynamic expression such that compilers will wrap any access to the
+ * TLS by a TLS wrapper function which causes a significant
+ * slow-down. A pointer to the AD storage instance can be initialized
+ * to a compile-time constant expression of nullptr. In this case the
+ * compiler avoids the use of a TLS wrapper function. Furthermore we
+ * use the __thread keyword on compilers which support it. The
+ * __thread keyword is a compiler-specific (gcc, clang, Intel)
+ * extension which requires initialization with a compile time
+ * constant expression. The C++11 keyword thread_local does allow for
+ * constant and dynamic initialization of the TLS. Thus, only the
+ * __thread keyword guarantees that constant initialization, and its
+ * implied speedup, is used.
  *
- * These are thread_local only if the user asks for it with
- * -DSTAN_THREADS. This is primarily because Apple clang compilers
- * before 2016 don't support thread_local and the additional
- * performance cost. We have proposed removing support for those[3],
- * and at that time we should evaluate the performance of a switch to
- * thread_local.  If there is no loss in performance, we can remove
- * this ifdef.
+ * The initialization of the AD instance is handled by the lifetime of
+ * an AutodiffStackSingleton object. More specifically, the first
+ * instance of the AutodiffStackSingleton object will initialize the
+ * AD instance and take ownership. Thus whenever the first instance of
+ * the AutodiffStackSingleton object gets destructed, the AD tape will
+ * be destructed as well.  Within stan-math the initialization of the
+ * AD instance for the main thread of the program is handled by
+ * instantiating the singleton once in the
+ * init_chainablestack.hpp file. Whenever STAN_THREADS is defined then
+ * all created child threads must instantiate an AutodiffStackSingleton
+ * object (its constructor calls the private init() method) in order to
+ * initialize the TLS if child threads want to perform AD operations
+ * (the initialization in the main thread is already taken care of in
+ * any case).
+ *
+ * The design of a globally held (optionally TLS) pointer, which is
+ * globally initialized, allows the compiler to apply necessary
+ * inlining to get maximal performance. However, the design suffers
+ * from "the static init order fiasco"[0]. Whenever the static init
+ * order fiasco occurs, the C++ client of the library may call the
+ * init method as needed to ensure proper initialization order. In
+ * exchange, we get a more performant singleton pattern with automatic
+ * initialization of the AD stack for the main thread. There has been
+ * some discussion on earlier designs using the Meyers singleton
+ * approach; see [1] and [2] and the discussions those PRs link to as
+ * well.
  *
  * [0] https://isocpp.org/wiki/faq/ctors#static-init-order
  * [1] https://github.com/stan-dev/math/pull/840
  * [2] https://github.com/stan-dev/math/pull/826
  * [3]
  * http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
+ * [4] https://github.com/stan-dev/math/pull/1135
  */
 template <typename ChainableT, typename ChainableAllocT>
 struct AutodiffStackSingleton {
   typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
       AutodiffStackSingleton_t;
 
+  AutodiffStackSingleton() : own_instance_(init()) {}
+  ~AutodiffStackSingleton() {
+#ifdef STAN_THREADS
+    if (own_instance_) {
+      delete instance_;
+      instance_ = nullptr;
+    }
+#endif
+  }
+
   struct AutodiffStackStorage {
     AutodiffStackStorage &operator=(const AutodiffStackStorage &) = delete;
 
@@ -57,28 +92,60 @@ struct AutodiffStackSingleton {
     std::vector<size_t> nested_var_alloc_stack_starts_;
   };
 
-  AutodiffStackSingleton() = delete;
   explicit AutodiffStackSingleton(AutodiffStackSingleton_t const &) = delete;
   AutodiffStackSingleton &operator=(const AutodiffStackSingleton_t &) = delete;
 
-  static inline AutodiffStackStorage &instance() {
+  static constexpr inline AutodiffStackStorage &instance() {
+    return
 #ifdef STAN_THREADS
-    thread_local static AutodiffStackStorage instance_;
+        *
 #endif
-    return instance_;
+        instance_;
   }
 
-#ifndef STAN_THREADS
-
  private:
-  static AutodiffStackStorage instance_;
+  static bool init() {
+#ifdef STAN_THREADS
+    if (!instance_) {
+      instance_ = new AutodiffStackStorage();
+      return true;
+    }
+#endif
+    return false;
+  }
+
+  static
+#ifdef STAN_THREADS
+#ifdef __GNUC__
+      __thread
+#else
+      thread_local
+#endif
 #endif
+      AutodiffStackStorage
+#ifdef STAN_THREADS
+          *
+#endif
+              instance_;
+
+  bool own_instance_;
 };
 
-#ifndef STAN_THREADS
 template <typename ChainableT, typename ChainableAllocT>
-typename AutodiffStackSingleton<ChainableT,
-                                ChainableAllocT>::AutodiffStackStorage
+#ifdef STAN_THREADS
+#ifdef __GNUC__
+__thread
+#else
+thread_local
+#endif
+#endif
+    typename AutodiffStackSingleton<ChainableT,
+                                    ChainableAllocT>::AutodiffStackStorage
+
+#ifdef STAN_THREADS
+        *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
+    = nullptr;
+#else
     AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
 #endif
 
 
@@ -0,0 +1,14 @@
+#ifndef STAN_MATH_REV_CORE_INIT_CHAINABLESTACK_HPP
+#define STAN_MATH_REV_CORE_INIT_CHAINABLESTACK_HPP
+
+#include <stan/math/rev/core/chainablestack.hpp>
+
+namespace stan {
+namespace math {
+namespace {
+
+ChainableStack global_stack_instance_init;
+}
+}  // namespace math
+}  // namespace stan
+#endif
@@ -68,6 +68,7 @@
 #include <stan/math/rev/mat/functor/integrate_ode_adams.hpp>
 #include <stan/math/rev/mat/functor/integrate_ode_bdf.hpp>
 #include <stan/math/rev/mat/functor/integrate_dae.hpp>
+#include <stan/math/rev/mat/functor/map_rect_concurrent.hpp>
 #include <stan/math/rev/mat/functor/map_rect_reduce.hpp>
 
 #endif