mutex_benchmark.cc

// Copyright 2017 The Abseil Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"

namespace {

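// Measures the raw cost of an absl::Mutex lock/unlock pair around an empty
// critical section, both single-threaded and with one thread per CPU.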
void BM_Mutex(benchmark::State& state) {
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    absl::MutexLock lock(mu);
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();

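// Busy-waits for approximately `ns` nanoseconds as measured by the cycle
// clock. Incrementing *data (kept live via DoNotOptimize) gives the spin
// loop a side effect the compiler cannot elide.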
static void DelayNs(int64_t ns, int* data) {
  int64_t end = absl::base_internal::CycleClock::Now() +
                ns * absl::base_internal::CycleClock::Frequency() / 1e9;
  while (absl::base_internal::CycleClock::Now() < end) {
    ++(*data);
    benchmark::DoNotOptimize(*data);
  }
}

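// Minimal RAII lock guard templated on the mutex type, so the contended
// benchmarks below can be instantiated uniformly for absl::Mutex,
// absl::base_internal::SpinLock, and std::mutex.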
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : mu_(mu) { mu_->Lock(); }
  ~RaiiLocker() { mu_->Unlock(); }

 private:
  MutexType* mu_;
};

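// std::mutex spells its operations lock()/unlock() rather than
// Lock()/Unlock(), hence the dedicated specialization.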
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }

 private:
  std::mutex* mu_;
};

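// Illustrative usage (hypothetical names, not part of any benchmark):
//
//   static std::mutex m;
//   void Touch(int* p) {
//     RaiiLocker<std::mutex> guard(&m);  // locks here...
//     ++*p;
//   }  // ...unlocks when 'guard' goes out of scope
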
// RAII object to change the Mutex priority of the running thread.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};

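// Benchmarks the cost of enqueueing as a waiter on a heavily contended Mutex,
// with the waiter queue deliberately kept at its maximum length.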
void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated
  // priority. This benchmarks the performance impact of the presence of a
  // low priority waiter when a higher priority waiter adds itself to the
  // queue (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};
    std::atomic<int> blocked_threads{0};
    std::atomic<bool> thread_has_mutex{false};
  };
  static Shared* shared = new Shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex,
  // not Abseil synchronization primitives. If at some point the benchmark
  // library merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the
    // threads complete their benchmark loop together before any of the
    // threads exit the loop. So, we need to remove ourselves from the
    // "looping threads" counter here before potentially blocking on that
    // barrier. Otherwise, another thread spinning above might wait forever
    // for this thread to block on the mutex while we in fact are waiting
    // to exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}

BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);

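// Measures lock acquisition under configurable contention. Arg 0 ("cs_ns")
// is the critical-section length in nanoseconds; arg 1 ("num_prios") is the
// number of distinct Mutex priorities assigned round-robin to the threads.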
template <typename MutexType>
void BM_Contended(benchmark::State& state) {
  int priority = state.thread_index() % state.range(1);
  ScopedThreadMutexPriority priority_setter(priority);

  struct Shared {
    MutexType mu;
    int data = 0;
  };
  static auto* shared = new Shared;
  int local = 0;
  for (auto _ : state) {
    // Here we model both local work outside of the critical section as well
    // as some work inside of the critical section. The idea is to capture
    // more or less realistic contention levels.
    // If contention is too low, the benchmark won't measure anything useful.
    // If contention is unrealistically high, the benchmark will favor
    // bad mutex implementations that block and otherwise distract threads
    // from the mutex and shared state for as much as possible.
    // To achieve this, the amount of local work is multiplied by the number
    // of threads so that the ratio between local work and the critical
    // section stays approximately constant regardless of the thread count.
    DelayNs(100 * state.threads(), &local);
    RaiiLocker<MutexType> locker(&shared->mu);
    DelayNs(state.range(0), &shared->data);
  }
}

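// Registers a common sweep over thread counts and critical-section lengths
// for the BM_Contended instantiations below; only the absl::Mutex
// instantiation also varies the number of Mutex priorities.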
void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
                        bool do_test_priorities) {
  const int max_num_priorities = do_test_priorities ? 2 : 1;
  bm->UseRealTime()
      // ThreadPerCpu poorly handles non-power-of-two CPU counts.
      ->Threads(1)
      ->Threads(2)
      ->Threads(4)
      ->Threads(6)
      ->Threads(8)
      ->Threads(12)
      ->Threads(16)
      ->Threads(24)
      ->Threads(32)
      ->Threads(48)
      ->Threads(64)
      ->Threads(96)
      ->Threads(128)
      ->Threads(192)
      ->Threads(256)
      ->ArgNames({"cs_ns", "num_prios"});
  // Some empirically chosen amounts of work in the critical section:
  // 1 ns is low contention, 2000 ns is high contention, with a few values
  // in between.
  for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
    for (int num_priorities = 1; num_priorities <= max_num_priorities;
         num_priorities++) {
      bm->ArgPair(critical_section_ns, num_priorities);
    }
  }
}

BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

// Measure the overhead of conditions on mutex release (when they must be
// evaluated). Mutex has (some) support for equivalence classes, allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      // The capture-free lambda converts to a plain function pointer, which
      // is what absl::Condition's (function, argument) constructor expects.
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

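  // Setting each class's value to zero satisfies the Condition the waiters
  // are blocked on, allowing the pool threads to wake and exit.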
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}

// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);

}  // namespace