// benchmarks/malloc_benchmark.cpp - android_bionic - Gitiles

 /*
  * Copyright (C) 2019 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *  * Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  *  * Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the
  *    distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */

 #include <malloc.h>
 #include <unistd.h>

 #include <algorithm>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
 #include <random>
 #include <thread>
 #include <vector>

 #include <benchmark/benchmark.h>
 #include "ScopedDecayTimeRestorer.h"
 #include "util.h"

 #if defined(__BIONIC__)

 // Measures the cost of one mallopt() purge request.
 //
 // Every iteration first builds up freed memory with the timer paused: for each
 // entry in `sizes` it allocates at least two pages worth of blocks, hands each
 // block to MakeAllocationResident(), and then frees them all. Only the
 // mallopt(purge_value, 0) call that follows is timed.
 //
 // Parameters:
 //   state        Benchmark state that drives the iteration loop.
 //   purge_value  mallopt() option issued in the timed section; the benchmarks
 //                in this file pass M_PURGE or M_PURGE_ALL.
 //
 // If any malloc() fails, the blocks allocated so far are freed, the run is
 // reported through state.SkipWithError(), and the function returns.
 static void RunMalloptPurge(benchmark::State& state, int purge_value) {
   // NOTE(review): presumably restores the decay time changed below when this
   // function returns -- confirm against ScopedDecayTimeRestorer.h.
   ScopedDecayTimeRestorer restorer;

   static size_t sizes[] = {8, 16, 32, 64, 128, 1024, 4096, 16384, 65536, 131072, 1048576};
   static int pagesize = getpagesize();
   const size_t min_bytes_per_size = 2 * static_cast<size_t>(pagesize);
   mallopt(M_DECAY_TIME, 1);
   // Issue one purge up front, before any timed iteration runs.
   mallopt(M_PURGE_ALL, 0);
   for (auto _ : state) {
     state.PauseTiming();
     std::vector<void*> ptrs;
     for (auto size : sizes) {
       // Allocate at least two pages worth of the allocations.
       for (size_t allocated = 0; allocated < min_bytes_per_size; allocated += size) {
         void* ptr = malloc(size);
         if (ptr == nullptr) {
           // Never hand a null pointer to MakeAllocationResident(). Give back
           // what was allocated so far and leave: a ranged-for benchmark loop
           // keeps iterating after SkipWithError() unless it is exited
           // explicitly.
           for (auto allocated_ptr : ptrs) {
             free(allocated_ptr);
           }
           state.SkipWithError("Failed to allocate memory");
           return;
         }
         MakeAllocationResident(ptr, size, pagesize);
         ptrs.push_back(ptr);
       }
     }
     // Free the memory, which should leave many of the pages resident until
     // the purge call.
     for (auto ptr : ptrs) {
       free(ptr);
     }
     ptrs.clear();
     state.ResumeTiming();

     // The only timed statement of the iteration.
     mallopt(purge_value, 0);
   }
 }

 // Measures malloc()/free() throughput for a single allocation size while
 // several threads allocate and free at the same time.
 //
 // For every benchmark iteration `num_threads` workers are created with the
 // timer paused; the timed section releases them all at once and waits for
 // them to finish. The reported byte count is
 // kMaxBytes * kMinRounds * num_threads per iteration.
 //
 // Parameters:
 //   state        Benchmark state that drives the iteration loop.
 //   size         Bytes requested by every malloc() call; must be non-zero.
 //   num_threads  Number of worker threads; must not exceed kMaxThreads.
 //
 // Arguments outside those limits are reported through state.SkipWithError()
 // and nothing is run.
 static void RunThreadsThroughput(benchmark::State& state, size_t size, size_t num_threads) {
   constexpr size_t kMaxBytes = 1 << 24;
   constexpr size_t kMaxThreads = 8;
   constexpr size_t kMinRounds = 4;

   // `size` is used as a divisor and the thread id as a shift count below, so
   // reject values outside the supported range instead of running into
   // undefined behavior.
   if (size == 0 || num_threads > kMaxThreads) {
     state.SkipWithError("Invalid allocation size or thread count");
     return;
   }

   const size_t max_alloc_counts = kMaxBytes / size;
   std::mutex m;
   bool ready = false;
   std::condition_variable cv;
   // Reserve once so that creating the workers never reallocates the vector.
   std::vector<std::thread> threads;
   threads.reserve(num_threads);

   // The goal is to create malloc/free interleaving patterns across threads.
   // The bytes processed by each thread will be the same. The difference is the
   // patterns. Here's an example:
   //
   // A: Allocation
   // D: Deallocation
   //
   //   T1    T2    T3
   //   A     A     A
   //   A     A     D
   //   A     D     A
   //   A     D     D
   //   D     A     A
   //   D     A     D
   //   D     D     A
   //   D     D     D
   //
   // To do this, `alloc_counts` and `alloc_rounds` will be adjusted according to
   // the thread id.
   auto thread_task = [&](size_t id) {
     {
       std::unique_lock lock(m);
       // Wait until all threads are created.
       cv.wait(lock, [&] { return ready; });
     }

     // Halve the allocations per round and double the rounds for each
     // successive thread id.
     const size_t alloc_counts = (max_alloc_counts >> id);
     const size_t alloc_rounds = (kMinRounds << id);

     // Owned by unique_ptr so the array is always released. Plain new[] is used
     // on purpose: it leaves the slots uninitialized, whereas make_unique would
     // value-initialize the whole array inside the timed section.
     std::unique_ptr<void*[]> mem_pool(new void*[alloc_counts]);
     void** const pool_begin = mem_pool.get();
     void** const pool_end = pool_begin + alloc_counts;

     // Use a fixed seed to reduce the noise between different rounds of the
     // benchmark. The engine is re-created for every round.
     constexpr unsigned kSeed = 33529;

     for (size_t round = 0; round < alloc_rounds; ++round) {
       for (void** slot = pool_begin; slot != pool_end; ++slot) {
         *slot = malloc(size);
       }

       // Free in a different order than the blocks were allocated in.
       std::shuffle(pool_begin, pool_end, std::default_random_engine(kSeed));

       for (void** slot = pool_begin; slot != pool_end; ++slot) {
         free(*slot);
       }
     }
   };

   for (auto _ : state) {
     state.PauseTiming();
     // Don't need to acquire the lock because no thread is created.
     ready = false;

     for (size_t i = 0; i < num_threads; ++i) {
       threads.emplace_back(thread_task, i);
     }

     state.ResumeTiming();

     {
       std::lock_guard lock(m);
       ready = true;
     }

     cv.notify_all();

     for (auto& thread : threads) {
       thread.join();
     }
     // Keeps the reserved capacity for the next iteration.
     threads.clear();
   }

   const size_t threads_bytes_processed = kMaxBytes * kMinRounds * num_threads;
   state.SetBytesProcessed(threads_bytes_processed * static_cast<size_t>(state.iterations()));
 }

 // Benchmark entry point: runs RunMalloptPurge() with M_PURGE as the mallopt()
 // option issued in the timed section.
 static void BM_mallopt_purge(benchmark::State& state) {
   const int purge_option = M_PURGE;
   RunMalloptPurge(state, purge_option);
 }
 BIONIC_BENCHMARK(BM_mallopt_purge);

 // Benchmark entry point: runs RunMalloptPurge() with M_PURGE_ALL as the
 // mallopt() option issued in the timed section.
 static void BM_mallopt_purge_all(benchmark::State& state) {
   const int purge_option = M_PURGE_ALL;
   RunMalloptPurge(state, purge_option);
 }
 BIONIC_BENCHMARK(BM_mallopt_purge_all);

 // Defines the function BM_malloc_threads_throughput_<SIZE>_<NUM_THREADS>,
 // which forwards to RunThreadsThroughput(state, SIZE, NUM_THREADS), and passes
 // that function to BIONIC_BENCHMARK. SIZE and NUM_THREADS are pasted into the
 // function name, so they must be plain tokens that form a valid identifier.
 //
 // Note that this will only test a single size class at a time so that we can
 // observe the impact of contention more often.
 #define BM_MALLOC_THREADS_THROUGHPUT(SIZE, NUM_THREADS)                                      \
   static void BM_malloc_threads_throughput_##SIZE##_##NUM_THREADS(benchmark::State& state) { \
     RunThreadsThroughput(state, SIZE, NUM_THREADS);                                          \
   }                                                                                          \
   BIONIC_BENCHMARK(BM_malloc_threads_throughput_##SIZE##_##NUM_THREADS);

 // Instantiate the throughput benchmark for allocation sizes of 64, 512 and
 // 8192 bytes, each with 2, 4 and 8 threads. There are three block categories
 // in Scudo; one size is chosen from each category.
 BM_MALLOC_THREADS_THROUGHPUT(64, 2);
 BM_MALLOC_THREADS_THROUGHPUT(64, 4);
 BM_MALLOC_THREADS_THROUGHPUT(64, 8);
 BM_MALLOC_THREADS_THROUGHPUT(512, 2);
 BM_MALLOC_THREADS_THROUGHPUT(512, 4);
 BM_MALLOC_THREADS_THROUGHPUT(512, 8);
 BM_MALLOC_THREADS_THROUGHPUT(8192, 2);
 BM_MALLOC_THREADS_THROUGHPUT(8192, 4);
 BM_MALLOC_THREADS_THROUGHPUT(8192, 8);

 #endif
	/*
	* Copyright (C) 2019 The Android Open Source Project
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without
	* modification, are permitted provided that the following conditions
	* are met:
	* * Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright
	* notice, this list of conditions and the following disclaimer in
	* the documentation and/or other materials provided with the
	* distribution.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	* SUCH DAMAGE.
	*/

	#include <malloc.h>
	#include <unistd.h>

	#include <condition_variable>
	#include <mutex>
	#include <random>
	#include <thread>
	#include <vector>

	#include <benchmark/benchmark.h>
	#include "ScopedDecayTimeRestorer.h"
	#include "util.h"

	#if defined(__BIONIC__)

	// Measures the cost of one mallopt() purge request.
	//
	// Every iteration first builds up freed memory with the timer paused: for each
	// entry in `sizes` it allocates at least two pages worth of blocks, hands each
	// block to MakeAllocationResident(), and then frees them all. Only the
	// mallopt(purge_value, 0) call that follows is timed.
	//
	// Parameters:
	//   state        Benchmark state that drives the iteration loop.
	//   purge_value  mallopt() option issued in the timed section; the benchmarks
	//                in this file pass M_PURGE or M_PURGE_ALL.
	//
	// If any malloc() fails, the blocks allocated so far are freed, the run is
	// reported through state.SkipWithError(), and the function returns.
	static void RunMalloptPurge(benchmark::State& state, int purge_value) {
	  // NOTE(review): presumably restores the decay time changed below when this
	  // function returns -- confirm against ScopedDecayTimeRestorer.h.
	  ScopedDecayTimeRestorer restorer;

	  static size_t sizes[] = {8, 16, 32, 64, 128, 1024, 4096, 16384, 65536, 131072, 1048576};
	  static int pagesize = getpagesize();
	  const size_t min_bytes_per_size = 2 * static_cast<size_t>(pagesize);
	  mallopt(M_DECAY_TIME, 1);
	  // Issue one purge up front, before any timed iteration runs.
	  mallopt(M_PURGE_ALL, 0);
	  for (auto _ : state) {
	    state.PauseTiming();
	    std::vector<void*> ptrs;
	    for (auto size : sizes) {
	      // Allocate at least two pages worth of the allocations.
	      for (size_t allocated = 0; allocated < min_bytes_per_size; allocated += size) {
	        void* ptr = malloc(size);
	        if (ptr == nullptr) {
	          // Never hand a null pointer to MakeAllocationResident(). Give back
	          // what was allocated so far and leave: a ranged-for benchmark loop
	          // keeps iterating after SkipWithError() unless it is exited
	          // explicitly.
	          for (auto allocated_ptr : ptrs) {
	            free(allocated_ptr);
	          }
	          state.SkipWithError("Failed to allocate memory");
	          return;
	        }
	        MakeAllocationResident(ptr, size, pagesize);
	        ptrs.push_back(ptr);
	      }
	    }
	    // Free the memory, which should leave many of the pages resident until
	    // the purge call.
	    for (auto ptr : ptrs) {
	      free(ptr);
	    }
	    ptrs.clear();
	    state.ResumeTiming();

	    // The only timed statement of the iteration.
	    mallopt(purge_value, 0);
	  }
	}

	// Measures malloc()/free() throughput for a single allocation size while
	// several threads allocate and free at the same time.
	//
	// For every benchmark iteration `num_threads` workers are created with the
	// timer paused; the timed section releases them all at once and waits for
	// them to finish. The reported byte count is
	// kMaxBytes * kMinRounds * num_threads per iteration.
	//
	// Parameters:
	//   state        Benchmark state that drives the iteration loop.
	//   size         Bytes requested by every malloc() call; must be non-zero.
	//   num_threads  Number of worker threads; must not exceed kMaxThreads.
	//
	// Arguments outside those limits are reported through state.SkipWithError()
	// and nothing is run.
	static void RunThreadsThroughput(benchmark::State& state, size_t size, size_t num_threads) {
	  constexpr size_t kMaxBytes = 1 << 24;
	  constexpr size_t kMaxThreads = 8;
	  constexpr size_t kMinRounds = 4;

	  // `size` is used as a divisor and the thread id as a shift count below, so
	  // reject values outside the supported range instead of running into
	  // undefined behavior.
	  if (size == 0 || num_threads > kMaxThreads) {
	    state.SkipWithError("Invalid allocation size or thread count");
	    return;
	  }

	  const size_t max_alloc_counts = kMaxBytes / size;
	  std::mutex m;
	  bool ready = false;
	  std::condition_variable cv;
	  // Reserve once so that creating the workers never reallocates the vector.
	  std::vector<std::thread> threads;
	  threads.reserve(num_threads);

	  // The goal is to create malloc/free interleaving patterns across threads.
	  // The bytes processed by each thread will be the same. The difference is the
	  // patterns. Here's an example:
	  //
	  // A: Allocation
	  // D: Deallocation
	  //
	  //   T1    T2    T3
	  //   A     A     A
	  //   A     A     D
	  //   A     D     A
	  //   A     D     D
	  //   D     A     A
	  //   D     A     D
	  //   D     D     A
	  //   D     D     D
	  //
	  // To do this, `alloc_counts` and `alloc_rounds` will be adjusted according to
	  // the thread id.
	  auto thread_task = [&](size_t id) {
	    {
	      std::unique_lock lock(m);
	      // Wait until all threads are created.
	      cv.wait(lock, [&] { return ready; });
	    }

	    // Halve the allocations per round and double the rounds for each
	    // successive thread id.
	    const size_t alloc_counts = (max_alloc_counts >> id);
	    const size_t alloc_rounds = (kMinRounds << id);

	    // Owned by unique_ptr so the array is always released. Plain new[] is used
	    // on purpose: it leaves the slots uninitialized, whereas make_unique would
	    // value-initialize the whole array inside the timed section.
	    std::unique_ptr<void*[]> mem_pool(new void*[alloc_counts]);
	    void** const pool_begin = mem_pool.get();
	    void** const pool_end = pool_begin + alloc_counts;

	    // Use a fixed seed to reduce the noise between different rounds of the
	    // benchmark. The engine is re-created for every round.
	    constexpr unsigned kSeed = 33529;

	    for (size_t round = 0; round < alloc_rounds; ++round) {
	      for (void** slot = pool_begin; slot != pool_end; ++slot) {
	        *slot = malloc(size);
	      }

	      // Free in a different order than the blocks were allocated in.
	      std::shuffle(pool_begin, pool_end, std::default_random_engine(kSeed));

	      for (void** slot = pool_begin; slot != pool_end; ++slot) {
	        free(*slot);
	      }
	    }
	  };

	  for (auto _ : state) {
	    state.PauseTiming();
	    // Don't need to acquire the lock because no thread is created.
	    ready = false;

	    for (size_t i = 0; i < num_threads; ++i) {
	      threads.emplace_back(thread_task, i);
	    }

	    state.ResumeTiming();

	    {
	      std::lock_guard lock(m);
	      ready = true;
	    }

	    cv.notify_all();

	    for (auto& thread : threads) {
	      thread.join();
	    }
	    // Keeps the reserved capacity for the next iteration.
	    threads.clear();
	  }

	  const size_t threads_bytes_processed = kMaxBytes * kMinRounds * num_threads;
	  state.SetBytesProcessed(threads_bytes_processed * static_cast<size_t>(state.iterations()));
	}

	// Benchmark entry point: runs RunMalloptPurge() with M_PURGE as the mallopt()
	// option issued in the timed section.
	static void BM_mallopt_purge(benchmark::State& state) {
	  const int purge_option = M_PURGE;
	  RunMalloptPurge(state, purge_option);
	}
	BIONIC_BENCHMARK(BM_mallopt_purge);

	// Benchmark entry point: runs RunMalloptPurge() with M_PURGE_ALL as the
	// mallopt() option issued in the timed section.
	static void BM_mallopt_purge_all(benchmark::State& state) {
	  const int purge_option = M_PURGE_ALL;
	  RunMalloptPurge(state, purge_option);
	}
	BIONIC_BENCHMARK(BM_mallopt_purge_all);

	// Defines the function BM_malloc_threads_throughput_<SIZE>_<NUM_THREADS>,
	// which forwards to RunThreadsThroughput(state, SIZE, NUM_THREADS), and passes
	// that function to BIONIC_BENCHMARK. SIZE and NUM_THREADS are pasted into the
	// function name, so they must be plain tokens that form a valid identifier.
	//
	// Note that this will only test a single size class at a time so that we can
	// observe the impact of contention more often.
	#define BM_MALLOC_THREADS_THROUGHPUT(SIZE, NUM_THREADS) \
	static void BM_malloc_threads_throughput_##SIZE##_##NUM_THREADS(benchmark::State& state) { \
	RunThreadsThroughput(state, SIZE, NUM_THREADS); \
	} \
	BIONIC_BENCHMARK(BM_malloc_threads_throughput_##SIZE##_##NUM_THREADS);

	// Instantiate the throughput benchmark for allocation sizes of 64, 512 and
	// 8192 bytes, each with 2, 4 and 8 threads. There are three block categories
	// in Scudo; one size is chosen from each category.
	BM_MALLOC_THREADS_THROUGHPUT(64, 2);
	BM_MALLOC_THREADS_THROUGHPUT(64, 4);
	BM_MALLOC_THREADS_THROUGHPUT(64, 8);
	BM_MALLOC_THREADS_THROUGHPUT(512, 2);
	BM_MALLOC_THREADS_THROUGHPUT(512, 4);
	BM_MALLOC_THREADS_THROUGHPUT(512, 8);
	BM_MALLOC_THREADS_THROUGHPUT(8192, 2);
	BM_MALLOC_THREADS_THROUGHPUT(8192, 4);
	BM_MALLOC_THREADS_THROUGHPUT(8192, 8);

	#endif