diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..7f84d72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,3 +7,11 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) +# target_compile_options(main PUBLIC -ffast-math -march=native) +add_executable(work work.cpp) +target_compile_options(work PUBLIC -ffast-math -march=native) + +add_executable(work2 work2.cpp) +target_compile_options(work2 PUBLIC -ffast-math -march=native) + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/run.sh b/run.sh index 99e6ef6..965d7fb 100755 --- a/run.sh +++ b/run.sh @@ -2,4 +2,9 @@ set -e cmake -B build cmake --build build +echo "------baseline------" build/main +echo "------ my ------" +build/work +echo "------ my2 ------" +build/work2 \ No newline at end of file diff --git a/work.cpp b/work.cpp new file mode 100644 index 0000000..cdc86b8 --- /dev/null +++ b/work.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include +#include + +float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} + +struct Star { + float px, py, pz; + float vx, vy, vz; + float mass; +}; + +constexpr int NSTAR = 48; +struct StarSOA +{ + float data[7][NSTAR]; + inline float* operator[] (size_t pos) { return data[pos];} + inline const float* operator[] (const size_t pos) const { return &(*data[pos]);} +}; +StarSOA stars2; +std::vector stars; + +float G = 0.001; +float eps = 0.001; +float dt = 0.01; + + + +void init() { + for (int i = 0; i < 48; i++) { + stars.push_back({ + frand(), frand(), frand(), + frand(), frand(), frand(), + frand() + 1, + }); + } +} + +void step() { + for (auto &star: stars) { + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + d2 *= sqrt(d2); + star.vx += dx * other.mass * G * dt / d2; + star.vy += dy * other.mass * G * dt / d2; + star.vz += dz * other.mass * G * dt / d2; + } + } + for (auto &star: stars) { + star.px += star.vx * dt; + star.py += star.vy * dt; + star.pz += star.vz * dt; + } +} + + +float calc() { + float energy = 0; + for (auto &star: stars) { + float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; + energy += star.mass * v2 / 2; + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + energy -= other.mass * star.mass * G / sqrt(d2) / 2; + } + } + return energy; +} +void init2() { + // for (int i=0; i=0; j--) { + // stars2[j][i] = frand(); + // } + // stars2[6][i] = frand() + 1; + // } + for (int i=0; i(&y); // evil floating point bit level hacking + i = 0x5f3759df - (i >> 1); + y = *reinterpret_cast(&i); + y = y * (threehalfs - (x2 * y * y)); // 1st iteration + y = y * (threehalfs - (x2 * y * y)); // 2nd iteration, this can be removed + + return y; +} + +void step2() { + auto other = stars2; + for (int i=0; i +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + init2(); + printf("original:\n"); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (int i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + + printf("my:\n"); + printf("Initial energy: %f\n", calc2()); + auto dt2 = benchmark([&] { + for (int i = 0; i < 100000; i++) + step2(); + }); + printf("Final energy: %f\n", calc2()); + printf("Time elapsed: %ld ms\n", dt2); + return 0; +} \ No newline at end of file diff --git a/work2.cpp b/work2.cpp new file mode 100644 index 0000000..dd19dc8 --- /dev/null +++ b/work2.cpp @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} + +struct Star { + float px, py, pz; + float vx, vy, vz; + float mass; +}; + +constexpr int NSTAR = 48; +struct StarSOA +{ + float data[7][NSTAR]; + inline float* operator[] (size_t pos) { return data[pos];} + inline const float* operator[] (const size_t pos) const { return &(*data[pos]);} +}; +StarSOA stars2; +std::vector stars; + +float G = 0.001; +float eps = 0.001; +float dt = 0.01; + + + +void init() { + for (int i = 0; i < 48; i++) { + stars.push_back({ + frand(), frand(), frand(), + frand(), frand(), frand(), + frand() + 1, + }); + } +} + +void step() { + for (auto &star: stars) { + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + d2 *= sqrt(d2); + star.vx += dx * other.mass * G * dt / d2; + star.vy += dy * other.mass * G * dt / d2; + star.vz += dz * other.mass * G * dt / d2; + } + } + for (auto &star: stars) { + star.px += star.vx * dt; + star.py += star.vy * dt; + star.pz += star.vz * dt; + } +} + + +float calc() { + float energy = 0; + for (auto &star: stars) { + float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; + energy += star.mass * v2 / 2; + for (auto &other: stars) { + float dx = other.px - star.px; + float dy = other.py - star.py; + float dz = other.pz - star.pz; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + energy -= other.mass * star.mass * G / sqrt(d2) / 2; + } + } + return energy; +} +void init2() { + // for (int i=0; i=0; j--) { + // stars2[j][i] = frand(); + // } + // stars2[6][i] = frand() + 1; + // } + for (int i=0; i(&y); // evil floating point bit level hacking + i = 0x5f3759df - (i >> 1); + y = *reinterpret_cast(&i); + y = y * (threehalfs - (x2 * y * y)); // 1st iteration + y = y * (threehalfs - (x2 * y * y)); // 2nd iteration, this can be removed + + return y; +} + + +// #define HAVE_AVX512 +#ifdef HAVE_AVX512 +#define NSTEP 16 +#define LOAD_PS(_P) _mm512_load_ps(_P) +#define STORE_PS(_P, _F) _mm512_store_ps(_P, _F) +#define SET1_PS(_F) _mm512_set1_ps(_F) +#define MUL_PS(_a, _b) _mm512_mul_ps(_a, _b) +#define SUB_PS(_a, _b) _mm512_sub_ps(_a, _b) +#define ADD_PS(_a, _b) _mm512_add_ps(_a, _b) +#define DIV_PS(_a, _b) _mm512_div_ps(_a, _b) +#define RSQRT_PS(_a) _mm512_rsqrt14_ps(_a) +#else +#define NSTEP 8 +#define LOAD_PS(_P) _mm256_load_ps(_P) +#define STORE_PS(_P, _F) _mm256_store_ps(_P, _F) +#define SET1_PS(_F) _mm256_set1_ps(_F) +#define MUL_PS(_a, _b) _mm256_mul_ps(_a, _b) +#define SUB_PS(_a, _b) _mm256_sub_ps(_a, _b) +#define ADD_PS(_a, _b) _mm256_add_ps(_a, _b) +#define DIV_PS(_a, _b) _mm256_div_ps(_a, _b) +#define RSQRT_PS(_a) _mm256_rsqrt_ps(_a) +#endif + +void step2() { + auto other = stars2; + for (int i=0; i +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + init2(); + printf("original:\n"); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (int i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + + printf("my:\n"); + printf("Initial energy: %f\n", calc2()); + auto dt2 = benchmark([&] { + for (int i = 0; i < 100000; i++) + step2(); + }); + printf("Final energy: %f\n", calc2()); + printf("Time elapsed: %ld ms\n", dt2); + return 0; +} \ No newline at end of file