티스토리 뷰
no optimize
Google Benchmark에서 최적화를 막는 구문(DoNotOptimize, ClobberMemory)은 아래와 같이 구현되어 있다.
// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  asm volatile("" : : "r,m"(value) : "memory");
}
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#if defined(__clang__)
  asm volatile("" : "+r,m"(value) : : "memory");
#else
  asm volatile("" : "+m,r"(value) : : "memory");
#endif
}
// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
  asm volatile("" : : : "memory");
}
각 부분의 의미는 inline asm 문법을 알아야 이해할 수 있다.
그냥 믿고 쓰는 게 나을 듯하다.
asm volatile("" ::: "memory");
https://stackoverflow.com/questions/14950614/working-of-asm-volatile-memory
https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h
// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  asm volatile("" : : "r,m"(value) : "memory");
}
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#if defined(__clang__)
  asm volatile("" : "+r,m"(value) : : "memory");
#else
  asm volatile("" : "+m,r"(value) : : "memory");
#endif
}
// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
  asm volatile("" : : : "memory");
}
#elif defined(_MSC_VER)
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
  _ReadWriteBarrier();
}
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
#else
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
}
// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
#endif
_MSC_VER(MSVC)용 구현은 inline asm을 쓰지 않고 따로 작성되어 있다.
// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
// Overload taking `value` by const reference: hands `value` to an empty
// inline-asm statement so the compiler has to treat it as used.
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
// The template string is empty, so the statement itself contains no
// instructions. Operand lists are `outputs : inputs : clobbers`: there are
// no outputs, `value` is an input whose constraint alternatives "r,m" allow
// a register or a memory operand, and the "memory" clobber tells the
// compiler the statement may access memory beyond its listed operands.
asm volatile("" : : "r,m"(value) : "memory");
}
// Overload taking `value` by non-const reference. Here `value` is a
// read-write ("+") operand of the empty asm statement, so the compiler must
// assume the statement may also modify it.
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
// The two branches differ only in the order of the constraint alternatives
// (register first for Clang, memory first otherwise).
// NOTE(review): the reason for the different order is not visible in this
// excerpt -- presumably a per-compiler codegen preference; confirm upstream.
#if defined(__clang__)
asm volatile("" : "+r,m"(value) : : "memory");
#else
asm volatile("" : "+m,r"(value) : : "memory");
#endif
}
// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
//
// Implemented as an empty asm statement with no operands and only a "memory"
// clobber; the template string is empty, so the statement itself contributes
// no instructions.
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
asm volatile("" : : : "memory");
}
#elif defined(_MSC_VER)
// Variant selected when BENCHMARK_HAS_NO_INLINE_ASSEMBLY is defined and the
// compiler defines _MSC_VER; it uses no inline asm.
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
// Views `value` as a `char const volatile` and passes its address to
// internal::UseCharPointer (defined outside this excerpt; presumably an
// opaque sink the optimizer cannot see through -- TODO confirm).
internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
// MSVC compiler intrinsic, not declared in this excerpt.
// NOTE(review): expected to keep the compiler from reordering memory
// accesses across this point -- confirm against the MSVC documentation.
_ReadWriteBarrier();
}
MSVC에서 테스트 (먼저 DoNotOptimize를 주석 처리한 경우)
#include <benchmark/benchmark.h>
#include <string>
#include <vector>
#pragma comment ( lib, "Shlwapi.lib" )
#if _DEBUG
#pragma comment(lib, "benchmarkd.lib")
#pragma comment(lib, "benchmark_maind.lib")
#else
#pragma comment(lib, "benchmark.lib")
#pragma comment(lib, "benchmark_main.lib")
#endif
// Registers benchmark `x` via BENCHMARK and chains three Arg() values
// (8, 512, 8192) onto the registration.
// NOTE(review): not used by the code visible in this snippet.
#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)
// Baseline benchmark with an (almost) empty loop body, used to observe the
// effect of benchmark::DoNotOptimize.
// Each iteration only evaluates state.iterations() and discards the result;
// the DoNotOptimize wrapper is intentionally left commented out in this form.
// The output quoted in this post shows 0.340 ns per iteration for this form
// and 2.05 ns once the call is wrapped in benchmark::DoNotOptimize.
void BM_empty(benchmark::State& state) {
for (auto _ : state) {
/*benchmark::DoNotOptimize*/(state.iterations());
}
}
// Register BM_empty with the benchmark runner.
BENCHMARK(BM_empty);
Run on (12 X 3194 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x6)
L1 Instruction 64 KiB (x6)
L2 Unified 512 KiB (x6)
L3 Unified 8192 KiB (x2)
-----------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------
BM_empty 0.340 ns 0.344 ns 1000000000
benchmark::DoNotOptimize(state.iterations());
Run on (12 X 3194 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x6)
L1 Instruction 64 KiB (x6)
L2 Unified 512 KiB (x6)
L3 Unified 8192 KiB (x2)
-----------------------------------------------------
Benchmark Time CPU Iterations
-----------------------------------------------------
BM_empty 2.05 ns 2.04 ns 344615385
댓글
공지사항
최근에 올라온 글
최근에 달린 댓글
- Total
- Today
- Yesterday
링크