티스토리 뷰

카테고리 없음

no optimize

newpolaris 2020. 1. 31. 16:02

no optimize

google benchmark에서 컴파일러 최적화를 막는 구문(DoNotOptimize, ClobberMemory)은 아래와 같이 구성된다.

// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  asm volatile("" : : "r,m"(value) : "memory");
}

template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#if defined(__clang__)
  asm volatile("" : "+r,m"(value) : : "memory");
#else
  asm volatile("" : "+m,r"(value) : : "memory");
#endif
}

// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
  asm volatile("" : : : "memory");
}

각 부분의 의미는 inline asm 문법을 알아야 이해할 수 있다.

그냥 믿고 쓰는 게 나을 듯하다.

asm volatile("" ::: "memory");

https://stackoverflow.com/questions/14950614/working-of-asm-volatile-memory

https://github.com/google/benchmark/blob/master/include/benchmark/benchmark.h

// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  asm volatile("" : : "r,m"(value) : "memory");
}

template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#if defined(__clang__)
  asm volatile("" : "+r,m"(value) : : "memory");
#else
  asm volatile("" : "+m,r"(value) : : "memory");
#endif
}

// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
  asm volatile("" : : : "memory");
}
#elif defined(_MSC_VER)
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
  _ReadWriteBarrier();
}

inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() { _ReadWriteBarrier(); }
#else
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
}
// FIXME Add ClobberMemory() for non-gnu and non-msvc compilers
#endif

MSVC(_MSC_VER)는 inline asm 대신 internal::UseCharPointer와 _ReadWriteBarrier()를 사용하는 별도 구현을 쓴다.

// The DoNotOptimize(...) function can be used to prevent a value or
// expression from being optimized away by the compiler. This function is
// intended to add little to no overhead.
// See: https://youtu.be/nXaxk27zwlk?t=2441
#ifndef BENCHMARK_HAS_NO_INLINE_ASSEMBLY
// Const-reference overload of DoNotOptimize: hands `value` to an empty asm
// statement as an input so the value is treated as used.
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  // Extended-asm layout is  template : outputs : inputs : clobbers.
  //   ""            - empty instruction template (no instructions emitted)
  //   (no outputs)
  //   "r,m"(value)  - `value` is an input operand; the two comma-separated
  //                   alternatives allow a register ("r") or memory ("m")
  //   "memory"      - memory is listed as clobbered
  asm volatile("" : : "r,m"(value) : "memory");
}

// Non-const overload of DoNotOptimize: `value` is passed to an empty asm
// statement as a read-write ("+") output operand, with "memory" listed as
// clobbered. The operand may live in a register ("r") or in memory ("m").
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp& value) {
#if defined(__clang__)
  // clang: register alternative is listed first.
  asm volatile("" : "+r,m"(value) : : "memory");
#else
  // Other compilers: memory alternative is listed first.
  // NOTE(review): the reason for the differing alternative order is not
  // visible here - confirm against the upstream project history.
  asm volatile("" : "+m,r"(value) : : "memory");
#endif
}

// Force the compiler to flush pending writes to global memory. Acts as an
// effective read/write barrier
inline BENCHMARK_ALWAYS_INLINE void ClobberMemory() {
  // Empty instruction template with no operands; only the "memory" clobber
  // is specified.
  asm volatile("" : : : "memory");
}
#elif defined(_MSC_VER)
// MSVC (_MSC_VER) variant of DoNotOptimize, used instead of the inline-asm
// version. Only a const-reference overload is shown for this compiler.
template <class Tp>
inline BENCHMARK_ALWAYS_INLINE void DoNotOptimize(Tp const& value) {
  // Pass the address of `value`, viewed as `char const volatile`, to
  // internal::UseCharPointer (defined elsewhere).
  internal::UseCharPointer(&reinterpret_cast<char const volatile&>(value));
  // MSVC compiler intrinsic; ClobberMemory() for MSVC is built from the same
  // call.
  _ReadWriteBarrier();
}

MSVC에서 테스트

#include <benchmark/benchmark.h>
#include <string>
#include <vector>

#pragma comment ( lib, "Shlwapi.lib" )
#if _DEBUG
#pragma comment(lib, "benchmarkd.lib")
#pragma comment(lib, "benchmark_maind.lib")
#else
#pragma comment(lib, "benchmark.lib")
#pragma comment(lib, "benchmark_main.lib")
#endif

#define BASIC_BENCHMARK_TEST(x) BENCHMARK(x)->Arg(8)->Arg(512)->Arg(8192)

// Baseline benchmark: each iteration only calls state.iterations() and
// discards the result. The DoNotOptimize wrapper is deliberately commented
// out here; removing the comment markers gives the second variant whose
// timing is compared against this one.
void BM_empty(benchmark::State& state) {
  for (auto _ : state) {
    /*benchmark::DoNotOptimize*/(state.iterations());
  }
}
BENCHMARK(BM_empty);
Run on (12 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 64 KiB (x6)
  L2 Unified 512 KiB (x6)
  L3 Unified 8192 KiB (x2)
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
BM_empty        0.340 ns        0.344 ns   1000000000
루프 본문을 benchmark::DoNotOptimize(state.iterations()); 로 바꾼 경우:
Run on (12 X 3194 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x6)
  L1 Instruction 64 KiB (x6)
  L2 Unified 512 KiB (x6)
  L3 Unified 8192 KiB (x2)
-----------------------------------------------------
Benchmark           Time             CPU   Iterations
-----------------------------------------------------
BM_empty         2.05 ns         2.04 ns    344615385
댓글
공지사항
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크