123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310 |
- // Copyright 2017 The Abseil Authors.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // https://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
#include <atomic>
#include <cstdint>
#include <limits>
#include <mutex>  // NOLINT(build/c++11)
#include <vector>

#include "absl/base/config.h"
#include "absl/base/internal/cycleclock.h"
#include "absl/base/internal/raw_logging.h"
#include "absl/base/internal/spinlock.h"
#include "absl/base/internal/thread_identity.h"
#include "absl/synchronization/blocking_counter.h"
#include "absl/synchronization/internal/create_thread_identity.h"
#include "absl/synchronization/internal/per_thread_sem.h"
#include "absl/synchronization/internal/thread_pool.h"
#include "absl/synchronization/mutex.h"
#include "benchmark/benchmark.h"
- namespace {
// Baseline: cost of one absl::Mutex acquire/release cycle. With Threads(1)
// this measures the uncontended fast path; ThreadPerCpu adds contention.
void BM_Mutex(benchmark::State& state) {
  // Heap-allocated and intentionally leaked so every benchmark thread shares
  // one Mutex for the lifetime of the process.
  static absl::Mutex* mu = new absl::Mutex;
  for (auto _ : state) {
    absl::MutexLock lock(mu);
  }
}
BENCHMARK(BM_Mutex)->UseRealTime()->Threads(1)->ThreadPerCpu();
- static void DelayNs(int64_t ns, int* data) {
- int64_t end = absl::base_internal::CycleClock::Now() +
- ns * absl::base_internal::CycleClock::Frequency() / 1e9;
- while (absl::base_internal::CycleClock::Now() < end) {
- ++(*data);
- benchmark::DoNotOptimize(*data);
- }
- }
// Generic scoped-lock helper: acquires the mutex on construction and
// releases it on destruction. Works with any type exposing Lock()/Unlock()
// (absl::Mutex, absl::base_internal::SpinLock, ...).
template <typename MutexType>
class RaiiLocker {
 public:
  explicit RaiiLocker(MutexType* mu) : held_(mu) { held_->Lock(); }
  ~RaiiLocker() { held_->Unlock(); }

 private:
  MutexType* held_;  // not owned; must outlive this locker
};
// Specialization for std::mutex, which uses the lower-case lock()/unlock()
// member names rather than the Abseil Lock()/Unlock() convention.
template <>
class RaiiLocker<std::mutex> {
 public:
  explicit RaiiLocker(std::mutex* mu) : mu_(mu) { mu_->lock(); }
  ~RaiiLocker() { mu_->unlock(); }

 private:
  std::mutex* mu_;  // not owned; must outlive this locker
};
// RAII object to change the Mutex priority of the running thread.
//
// Only the priority that absl::Mutex uses for its wait-queue ordering is
// overridden; the OS scheduler priority of the thread is untouched.
class ScopedThreadMutexPriority {
 public:
  explicit ScopedThreadMutexPriority(int priority) {
    absl::base_internal::ThreadIdentity* identity =
        absl::synchronization_internal::GetOrCreateCurrentThreadIdentity();
    identity->per_thread_synch.priority = priority;
    // Bump next_priority_read_cycles to the infinite future so that the
    // implementation doesn't re-read the thread's actual scheduler priority
    // and replace our temporary scoped priority.
    identity->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::max();
  }
  ~ScopedThreadMutexPriority() {
    // Reset the "next priority read time" back to the infinite past so that
    // the next time the Mutex implementation wants to know this thread's
    // priority, it re-reads it from the OS instead of using our overridden
    // priority.
    absl::synchronization_internal::GetOrCreateCurrentThreadIdentity()
        ->per_thread_synch.next_priority_read_cycles =
        std::numeric_limits<int64_t>::min();
  }
};
// Benchmarks the slow (queueing) acquisition path of a highly contended
// absl::Mutex: each iteration the current owner waits until every other
// looping thread is enqueued before releasing, so the wait queue stays at
// maximal length.
void BM_MutexEnqueue(benchmark::State& state) {
  // In the "multiple priorities" variant of the benchmark, one of the
  // threads runs with Mutex priority 0 while the rest run at elevated priority.
  // This benchmarks the performance impact of the presence of a low priority
  // waiter when a higher priority waiter adds itself of the queue
  // (b/175224064).
  //
  // NOTE: The actual scheduler priority is not modified in this benchmark:
  // all of the threads get CPU slices with the same priority. Only the
  // Mutex queueing behavior is modified.
  const bool multiple_priorities = state.range(0);
  ScopedThreadMutexPriority priority_setter(
      (multiple_priorities && state.thread_index() != 0) ? 1 : 0);

  struct Shared {
    absl::Mutex mu;
    std::atomic<int> looping_threads{0};   // threads inside the KeepRunning loop
    std::atomic<int> blocked_threads{0};   // threads blocked in Abseil sync code
    std::atomic<bool> thread_has_mutex{false};  // some thread currently owns mu
  };
  static Shared* shared = new Shared;

  // Set up 'blocked_threads' to count how many threads are currently blocked
  // in Abseil synchronization code.
  //
  // NOTE: Blocking done within the Google Benchmark library itself (e.g.
  // the barrier which synchronizes threads entering and exiting the benchmark
  // loop) does _not_ get registered in this counter. This is because Google
  // Benchmark uses its own synchronization primitives based on std::mutex, not
  // Abseil synchronization primitives. If at some point the benchmark library
  // merges into Abseil, this code may break.
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      &shared->blocked_threads);

  // The benchmark framework may run several iterations in the same process,
  // reusing the same static-initialized 'shared' object. Given the semantics
  // of the members, here, we expect everything to be reset to zero by the
  // end of any iteration. Assert that's the case, just to be sure.
  ABSL_RAW_CHECK(
      shared->looping_threads.load(std::memory_order_relaxed) == 0 &&
          shared->blocked_threads.load(std::memory_order_relaxed) == 0 &&
          !shared->thread_has_mutex.load(std::memory_order_relaxed),
      "Shared state isn't zeroed at start of benchmark iteration");

  static constexpr int kBatchSize = 1000;
  while (state.KeepRunningBatch(kBatchSize)) {
    shared->looping_threads.fetch_add(1);
    for (int i = 0; i < kBatchSize; i++) {
      {
        absl::MutexLock l(&shared->mu);
        shared->thread_has_mutex.store(true, std::memory_order_relaxed);
        // Spin until all other threads are either out of the benchmark loop
        // or blocked on the mutex. This ensures that the mutex queue is kept
        // at its maximal length to benchmark the performance of queueing on
        // a highly contended mutex.
        while (shared->looping_threads.load(std::memory_order_relaxed) -
                   shared->blocked_threads.load(std::memory_order_relaxed) !=
               1) {
        }
        shared->thread_has_mutex.store(false);
      }
      // Spin until some other thread has acquired the mutex before we block
      // again. This ensures that we always go through the slow (queueing)
      // acquisition path rather than reacquiring the mutex we just released.
      while (!shared->thread_has_mutex.load(std::memory_order_relaxed) &&
             shared->looping_threads.load(std::memory_order_relaxed) > 1) {
      }
    }
    // The benchmark framework uses a barrier to ensure that all of the threads
    // complete their benchmark loop together before any of the threads exit
    // the loop. So, we need to remove ourselves from the "looping threads"
    // counter here before potentially blocking on that barrier. Otherwise,
    // another thread spinning above might wait forever for this thread to
    // block on the mutex while we in fact are waiting to exit.
    shared->looping_threads.fetch_add(-1);
  }
  absl::synchronization_internal::PerThreadSem::SetThreadBlockedCounter(
      nullptr);
}
// Arg(false)/Arg(true) select the single- vs. multiple-priority variants.
BENCHMARK(BM_MutexEnqueue)
    ->Threads(4)
    ->Threads(64)
    ->Threads(128)
    ->Threads(512)
    ->ArgName("multiple_priorities")
    ->Arg(false)
    ->Arg(true);
- template <typename MutexType>
- void BM_Contended(benchmark::State& state) {
- int priority = state.thread_index() % state.range(1);
- ScopedThreadMutexPriority priority_setter(priority);
- struct Shared {
- MutexType mu;
- int data = 0;
- };
- static auto* shared = new Shared;
- int local = 0;
- for (auto _ : state) {
- // Here we model both local work outside of the critical section as well as
- // some work inside of the critical section. The idea is to capture some
- // more or less realisitic contention levels.
- // If contention is too low, the benchmark won't measure anything useful.
- // If contention is unrealistically high, the benchmark will favor
- // bad mutex implementations that block and otherwise distract threads
- // from the mutex and shared state for as much as possible.
- // To achieve this amount of local work is multiplied by number of threads
- // to keep ratio between local work and critical section approximately
- // equal regardless of number of threads.
- DelayNs(100 * state.threads(), &local);
- RaiiLocker<MutexType> locker(&shared->mu);
- DelayNs(state.range(0), &shared->data);
- }
- }
- void SetupBenchmarkArgs(benchmark::internal::Benchmark* bm,
- bool do_test_priorities) {
- const int max_num_priorities = do_test_priorities ? 2 : 1;
- bm->UseRealTime()
- // ThreadPerCpu poorly handles non-power-of-two CPU counts.
- ->Threads(1)
- ->Threads(2)
- ->Threads(4)
- ->Threads(6)
- ->Threads(8)
- ->Threads(12)
- ->Threads(16)
- ->Threads(24)
- ->Threads(32)
- ->Threads(48)
- ->Threads(64)
- ->Threads(96)
- ->Threads(128)
- ->Threads(192)
- ->Threads(256)
- ->ArgNames({"cs_ns", "num_prios"});
- // Some empirically chosen amounts of work in critical section.
- // 1 is low contention, 2000 is high contention and few values in between.
- for (int critical_section_ns : {1, 20, 50, 200, 2000}) {
- for (int num_priorities = 1; num_priorities <= max_num_priorities;
- num_priorities++) {
- bm->ArgPair(critical_section_ns, num_priorities);
- }
- }
- }
// absl::Mutex is the only lock type here whose queueing honors priorities,
// so it is the only instantiation run with multiple priority classes.
BENCHMARK_TEMPLATE(BM_Contended, absl::Mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/true);
    });

BENCHMARK_TEMPLATE(BM_Contended, absl::base_internal::SpinLock)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });

BENCHMARK_TEMPLATE(BM_Contended, std::mutex)
    ->Apply([](benchmark::internal::Benchmark* bm) {
      SetupBenchmarkArgs(bm, /*do_test_priorities=*/false);
    });
// Measure the overhead of conditions on mutex release (when they must be
// evaluated). Mutex has (some) support for equivalence classes allowing
// Conditions with the same function/argument to potentially not be multiply
// evaluated.
//
// num_classes==0 is used for the special case of every waiter being distinct.
void BM_ConditionWaiters(benchmark::State& state) {
  int num_classes = state.range(0);
  int num_waiters = state.range(1);

  struct Helper {
    // Blocks until *p == 0; `init` signals that the waiter has started.
    static void Waiter(absl::BlockingCounter* init, absl::Mutex* m, int* p) {
      init->DecrementCount();
      m->LockWhen(absl::Condition(
          static_cast<bool (*)(int*)>([](int* v) { return *v == 0; }), p));
      m->Unlock();
    }
  };

  if (num_classes == 0) {
    // No equivalence classes.
    num_classes = num_waiters;
  }

  absl::BlockingCounter init(num_waiters);
  absl::Mutex mu;
  // One flag per equivalence class; set to 0 below to release the waiters.
  std::vector<int> equivalence_classes(num_classes, 1);

  // Must be declared last to be destroyed first.
  absl::synchronization_internal::ThreadPool pool(num_waiters);

  for (int i = 0; i < num_waiters; i++) {
    // Mutex considers Conditions with the same function and argument
    // to be equivalent.
    pool.Schedule([&, i] {
      Helper::Waiter(&init, &mu, &equivalence_classes[i % num_classes]);
    });
  }
  // Wait until all waiter threads have started before timing.
  init.Wait();

  for (auto _ : state) {
    mu.Lock();
    mu.Unlock();  // Each unlock requires Condition evaluation for our waiters.
  }

  // Release every waiter so the pool can drain before destruction.
  mu.Lock();
  for (int i = 0; i < num_classes; i++) {
    equivalence_classes[i] = 0;
  }
  mu.Unlock();
}
// Some configurations have higher thread limits than others.
#if defined(__linux__) && !defined(ABSL_HAVE_THREAD_SANITIZER)
constexpr int kMaxConditionWaiters = 8192;
#else
constexpr int kMaxConditionWaiters = 1024;
#endif
// Sweeps num_classes over [0, 2] and num_waiters over [1, kMaxConditionWaiters].
BENCHMARK(BM_ConditionWaiters)->RangePair(0, 2, 1, kMaxConditionWaiters);
- } // namespace
|