From 494729b9f0bb4d75775307c3577ffadfa0466af0 Mon Sep 17 00:00:00 2001 From: Mwsxy Date: Fri, 7 Jan 2022 22:10:31 +0800 Subject: [PATCH 1/2] perf. AOS to SOA. try eigen. --- CMakeLists.txt | 11 +++ run.sh | 3 + work.cpp | 185 +++++++++++++++++++++++++++++++++++++++++++++++++ work_eigen.cpp | 111 +++++++++++++++++++++++++++++ 4 files changed, 310 insertions(+) create mode 100644 work.cpp create mode 100644 work_eigen.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..0c4af75 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,3 +7,14 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) +# target_compile_options(main PUBLIC -fopenmp -ffast-math -march=native) +add_executable(work work.cpp) +target_compile_options(work PUBLIC -fopenmp -ffast-math -march=native) +# target_compile_options(work PUBLIC -fopenmp -march=native) + +find_package(Eigen3 REQUIRED) +add_executable(work_eigen work_eigen.cpp) +target_include_directories(work_eigen PUBLIC ${EIGEN3_INCLUDE_DIR}) +target_compile_options(work_eigen PUBLIC -march=native) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/run.sh b/run.sh index 99e6ef6..4fc0566 100755 --- a/run.sh +++ b/run.sh @@ -2,4 +2,7 @@ set -e cmake -B build cmake --build build +echo "------baseline------" build/main +echo "------ my ------" +build/work diff --git a/work.cpp b/work.cpp new file mode 100644 index 0000000..ca152d1 --- /dev/null +++ b/work.cpp @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include +#include + +float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} + +struct Star { + float px, py, pz; + float vx, vy, vz; + float mass; +}; + +constexpr int NSTAR = 48; +struct StarSOA +{ + float data[7][NSTAR]; + inline float* operator[] (size_t pos) { return data[pos];} + inline const float* operator[] (const size_t pos) const { return &(*data[pos]);} +}; +StarSOA stars2; +std::vector stars; + +float G = 0.001; +float eps = 0.001; +float dt = 0.01; + + + +void init() { + for (int i = 0; i < 48; i++) { + stars.push_back({ + frand(), frand(), frand(), + frand(), frand(), frand(), + frand() + 1, + }); + } +} + +void step() { + for (auto &star: stars) { + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + d2 *= sqrt(d2); + star.vx += dx * other.mass * G * dt / d2; + star.vy += dy * other.mass * G * dt / d2; + star.vz += dz * other.mass * G * dt / d2; + } + } + for (auto &star: stars) { + star.px += star.vx * dt; + star.py += star.vy * dt; + star.pz += star.vz * dt; + } +} + + +float calc() { + float energy = 0; + for (auto &star: stars) { + float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; + energy += star.mass * v2 / 2; + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + energy -= other.mass * star.mass * G / sqrt(d2) / 2; + } + } + return energy; +} +void init2() { + // for (int i=0; i=0; j--) { + // stars2[j][i] = frand(); + // } + // stars2[6][i] = frand() + 1; + // } + for (int i=0; i(&y); // evil floating point bit level hacking + i = 0x5f3759df - (i >> 1); + y = *reinterpret_cast(&i); + y = y * (threehalfs - (x2 * y * y)); // 1st iteration + y = y * (threehalfs - (x2 * y * y)); // 2nd iteration, this can be removed + + return y; +} + +void step2() { + auto other = stars2; + for (int i=0; i +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + init2(); + printf("original:\n"); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (int i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + + printf("my:\n"); + printf("Initial energy: %f\n", calc2()); + auto dt2 = benchmark([&] { + for (int i = 0; i < 100000; i++) + step2(); + }); + printf("Final energy: %f\n", calc2()); + printf("Time elapsed: %ld ms\n", dt2); + return 0; +} \ No newline at end of file diff --git a/work_eigen.cpp b/work_eigen.cpp new file mode 100644 index 0000000..fe9d953 --- /dev/null +++ b/work_eigen.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include + +float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} + +// struct Star { +// float px, py, pz; +// float vx, vy, vz; +// float mass; +// }; + +// std::vector stars; + + +constexpr int NSTAR = 48; +using Mat3 = Eigen::Matrix; +Mat3 points, velocity; +Eigen::Matrix mass; + + + +void init() { + for (int i = 0; i < NSTAR; i++) { + points(0, i) = frand(); + points(1, i) = frand(); + points(2, i) = frand(); + + velocity(0, i) = frand(); + velocity(1, i) = frand(); + velocity(2, i) = frand(); + + mass(i) = frand() + 1; + } +} + +float G = 0.001; +float eps = 0.001; +float dt = 0.01; + +void step() { + for (int i = 0; i < NSTAR; i++) { + auto star = points.col(i); + + for (int j = 0; j < NSTAR; j++) { + auto d = points.col(j) - points.col(i); + float d2 = d.squaredNorm()+eps; + d2 *= std::sqrt(d2); + velocity.col(i) += d * mass(j) * G * dt / d2; + } + } + points += velocity * dt; + + // for (auto &star: stars) { + // for (auto &other: stars) { + // float dx = other.px - star.px; + // float dy = other.py - star.py; + // float dz = other.pz - star.pz; + // float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + // d2 *= sqrt(d2); + // star.vx += dx * other.mass * G * dt / d2; + // star.vy += dy * other.mass * G * dt / d2; + // star.vz += dz * other.mass * G * dt / d2; + // } + // } + // for (auto &star: stars) { + // star.px += star.vx * dt; + // star.py += star.vy * dt; + // star.pz += star.vz * dt; + // } +} + +float calc() { + float energy = 0; + for (int i = 0; i < NSTAR; i++) { + float v2 = velocity.col(i).squaredNorm()+eps; + energy += mass(i) * v2 / 2; + for (int j = 0; j < NSTAR; j++) { + auto d = points.col(j) - points.col(i); + float d2 = d.norm()+eps; + energy -= mass(j) * mass(i) * G / d2 /2 ; + } + } + return energy; +} + +template +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (int i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + return 0; +} From 20bb62eed059dabe5a9efc0fcfc935d3f5d486b5 Mon Sep 17 00:00:00 2001 From: Mwsxy Date: Sat, 8 Jan 2022 00:08:53 +0800 Subject: [PATCH 2/2] work2.cpp use xmmintrin.h optimize. --- CMakeLists.txt | 11 +-- run.sh | 2 + work.cpp | 24 +++-- work2.cpp | 238 +++++++++++++++++++++++++++++++++++++++++++++++++ work_eigen.cpp | 111 ----------------------- 5 files changed, 261 insertions(+), 125 deletions(-) create mode 100644 work2.cpp delete mode 100644 work_eigen.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c4af75..7f84d72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,14 +7,11 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) -# target_compile_options(main PUBLIC -fopenmp -ffast-math -march=native) +# target_compile_options(main PUBLIC -ffast-math -march=native) add_executable(work work.cpp) -target_compile_options(work PUBLIC -fopenmp -ffast-math -march=native) -# target_compile_options(work PUBLIC -fopenmp -march=native) +target_compile_options(work PUBLIC -ffast-math -march=native) -find_package(Eigen3 REQUIRED) -add_executable(work_eigen work_eigen.cpp) -target_include_directories(work_eigen PUBLIC ${EIGEN3_INCLUDE_DIR}) -target_compile_options(work_eigen PUBLIC -march=native) +add_executable(work2 work2.cpp) +target_compile_options(work2 PUBLIC -ffast-math -march=native) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/run.sh b/run.sh index 4fc0566..965d7fb 100755 --- a/run.sh +++ b/run.sh @@ -6,3 +6,5 @@ echo "------baseline------" build/main echo "------ my ------" build/work +echo "------ my2 ------" +build/work2 \ No newline at end of file diff --git a/work.cpp b/work.cpp index ca152d1..cdc86b8 100644 --- a/work.cpp +++ b/work.cpp @@ -120,11 +120,21 @@ void step2() { float dy = other[1][j] - stars2[1][i]; float dz = other[2][j] - stars2[2][i]; float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - // d2 = 1./(std::sqrt(d2)*d2); // 292 ms - d2 = Q_rsqrt(d2)/d2; // 273ms - vx += dx * other[6][j] * (G * dt) * d2; - vy += dy * other[6][j] * (G * dt) * d2; - vz += dz * other[6][j] * (G * dt) * d2; + if (true) + { + d2 *= std::sqrt(d2); // 292 ms + vx += dx * other[6][j] * (G * dt) / d2; + vy += dy * other[6][j] * (G * dt) / d2; + vz += dz * other[6][j] * (G * dt) / d2; + } + else { + // d2 = 1./(std::sqrt(d2)*d2); // 292 ms + d2 = Q_rsqrt(d2)/d2; // 273ms + vx += dx * other[6][j] * (G * dt) * d2; + vy += dy * other[6][j] * (G * dt) * d2; + vz += dz * other[6][j] * (G * dt) * d2; + } + } stars2[3][i] += vx; stars2[4][i] += vy; @@ -146,7 +156,7 @@ float calc2() { float dy = stars2[1][j] - stars2[1][i]; float dz = stars2[2][j] - stars2[2][i]; float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= stars2[6][j] * stars2[6][j] * G / sqrt(d2) / 2; + energy -= stars2[6][j] * stars2[6][i] * G / sqrt(d2) / 2; } } return energy; @@ -180,6 +190,6 @@ int main() { step2(); }); printf("Final energy: %f\n", calc2()); - printf("Time elapsed: %ld ms\n", dt2); + printf("Time elapsed: %ld ms\n", dt2); return 0; } \ No newline at end of file diff --git a/work2.cpp b/work2.cpp new file mode 100644 index 0000000..dd19dc8 --- /dev/null +++ b/work2.cpp @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} + +struct Star { + float px, py, pz; + float vx, vy, vz; + float mass; +}; + +constexpr int NSTAR = 48; +struct StarSOA +{ + float data[7][NSTAR]; + inline float* operator[] (size_t pos) { return data[pos];} + inline const float* operator[] (const size_t pos) const { return &(*data[pos]);} +}; +StarSOA stars2; +std::vector stars; + +float G = 0.001; +float eps = 0.001; +float dt = 0.01; + + + +void init() { + for (int i = 0; i < 48; i++) { + stars.push_back({ + frand(), frand(), frand(), + frand(), frand(), frand(), + frand() + 1, + }); + } +} + +void step() { + for (auto &star: stars) { + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + d2 *= sqrt(d2); + star.vx += dx * other.mass * G * dt / d2; + star.vy += dy * other.mass * G * dt / d2; + star.vz += dz * other.mass * G * dt / d2; + } + } + for (auto &star: stars) { + star.px += star.vx * dt; + star.py += star.vy * dt; + star.pz += star.vz * dt; + } +} + + +float calc() { + float energy = 0; + for (auto &star: stars) { + float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; + energy += star.mass * v2 / 2; + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + energy -= other.mass * star.mass * G / sqrt(d2) / 2; + } + } + return energy; +} +void init2() { + // for (int i=0; i=0; j--) { + // stars2[j][i] = frand(); + // } + // stars2[6][i] = frand() + 1; + // } + for (int i=0; i(&y); // evil floating point bit level hacking + i = 0x5f3759df - (i >> 1); + y = *reinterpret_cast(&i); + y = y * (threehalfs - (x2 * y * y)); // 1st iteration + y = y * (threehalfs - (x2 * y * y)); // 2nd iteration, this can be removed + + return y; +} + + +// #define HAVE_AVX512 +#ifdef HAVE_AVX512 +#define NSTEP 16 +#define LOAD_PS(_P) _mm512_load_ps(_P) +#define STORE_PS(_P, _F) _mm512_store_ps(_P, _F) +#define SET1_PS(_F) _mm512_set1_ps(_F) +#define MUL_PS(_a, _b) _mm512_mul_ps(_a, _b) +#define SUB_PS(_a, _b) _mm512_sub_ps(_a, _b) +#define ADD_PS(_a, _b) _mm512_add_ps(_a, _b) +#define DIV_PS(_a, _b) _mm512_div_ps(_a, _b) +#define RSQRT_PS(_a) _mm512_rsqrt14_ps(_a) +#else +#define NSTEP 8 +#define LOAD_PS(_P) _mm256_load_ps(_P) +#define STORE_PS(_P, _F) _mm256_store_ps(_P, _F) +#define SET1_PS(_F) _mm256_set1_ps(_F) +#define MUL_PS(_a, _b) _mm256_mul_ps(_a, _b) +#define SUB_PS(_a, _b) _mm256_sub_ps(_a, _b) +#define ADD_PS(_a, _b) _mm256_add_ps(_a, _b) +#define DIV_PS(_a, _b) _mm256_div_ps(_a, _b) +#define RSQRT_PS(_a) _mm256_rsqrt_ps(_a) +#endif + +void step2() { + auto other = stars2; + for (int i=0; i +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + init2(); + printf("original:\n"); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (int i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + + printf("my:\n"); + printf("Initial energy: %f\n", calc2()); + auto dt2 = benchmark([&] { + for (int i = 0; i < 100000; i++) + step2(); + }); + printf("Final energy: %f\n", calc2()); + printf("Time elapsed: %ld ms\n", dt2); + return 0; +} \ No newline at end of file diff --git a/work_eigen.cpp b/work_eigen.cpp deleted file mode 100644 index fe9d953..0000000 --- a/work_eigen.cpp +++ /dev/null @@ -1,111 +0,0 @@ -#include -#include -#include -#include -#include -#include - -float frand() { - return (float)rand() / RAND_MAX * 2 - 1; -} - -// struct Star { -// float px, py, pz; -// float vx, vy, vz; -// float mass; -// }; - -// std::vector stars; - - -constexpr int NSTAR = 48; -using Mat3 = Eigen::Matrix; -Mat3 points, velocity; -Eigen::Matrix mass; - - - -void init() { - for (int i = 0; i < NSTAR; i++) { - points(0, i) = frand(); - points(1, i) = frand(); - points(2, i) = frand(); - - velocity(0, i) = frand(); - velocity(1, i) = frand(); - velocity(2, i) = frand(); - - mass(i) = frand() + 1; - } -} - -float G = 0.001; -float eps = 0.001; -float dt = 0.01; - -void step() { - for (int i = 0; i < NSTAR; i++) { - auto star = points.col(i); - - for (int j = 0; j < NSTAR; j++) { - auto d = points.col(j) - points.col(i); - float d2 = d.squaredNorm()+eps; - d2 *= std::sqrt(d2); - velocity.col(i) += d * mass(j) * G * dt / d2; - } - } - points += velocity * dt; - - // for (auto &star: stars) { - // for (auto &other: stars) { - // float dx = other.px - star.px; - // float dy = other.py - star.py; - // float dz = other.pz - star.pz; - // float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - // d2 *= sqrt(d2); - // star.vx += dx * other.mass * G * dt / d2; - // star.vy += dy * other.mass * G * dt / d2; - // star.vz += dz * other.mass * G * dt / d2; - // } - // } - // for (auto &star: stars) { - // star.px += star.vx * dt; - // star.py += star.vy * dt; - // star.pz += star.vz * dt; - // } -} - -float calc() { - float energy = 0; - for (int i = 0; i < NSTAR; i++) { - float v2 = velocity.col(i).squaredNorm()+eps; - energy += mass(i) * v2 / 2; - for (int j = 0; j < NSTAR; j++) { - auto d = points.col(j) - points.col(i); - float d2 = d.norm()+eps; - energy -= mass(j) * mass(i) * G / d2 /2 ; - } - } - return energy; -} - -template -long benchmark(Func const &func) { - auto t0 = std::chrono::steady_clock::now(); - func(); - auto t1 = std::chrono::steady_clock::now(); - auto dt = std::chrono::duration_cast(t1 - t0); - return dt.count(); -} - -int main() { - init(); - printf("Initial energy: %f\n", calc()); - auto dt = benchmark([&] { - for (int i = 0; i < 100000; i++) - step(); - }); - printf("Final energy: %f\n", calc()); - printf("Time elapsed: %ld ms\n", dt); - return 0; -}