From faf2aaa0bbe000419ac8529ff1a35f47d9c65796 Mon Sep 17 00:00:00 2001 From: dgi <1097190201@qq.com> Date: Mon, 7 Mar 2022 01:53:49 +0800 Subject: [PATCH] add some speed-up tricks --- CMakeLists.txt | 2 + main.cpp | 244 +++++++++++++++++++++++++++++++------------------ score.txt | 88 +++++++++++++++--- 3 files changed, 233 insertions(+), 101 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..4ac7422 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,10 @@ cmake_minimum_required(VERSION 3.12) project(hellocmake LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) +SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} -O3 -Wall -g -ggdb") if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() add_executable(main main.cpp) +target_compile_options(main PUBLIC -ffast-math -march=native) diff --git a/main.cpp b/main.cpp index cf6369b..6c516d2 100644 --- a/main.cpp +++ b/main.cpp @@ -1,88 +1,156 @@ -#include -#include -#include -#include -#include - -float frand() { - return (float)rand() / RAND_MAX * 2 - 1; -} - -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; -}; - -std::vector stars; - -void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); - } -} - -float G = 0.001; -float eps = 0.001; -float dt = 0.01; - -void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; - } - } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; - } -} - -float calc() { - float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; - } - } - return energy; -} - -template -long benchmark(Func const &func) { - auto t0 = std::chrono::steady_clock::now(); - func(); - auto t1 = std::chrono::steady_clock::now(); - auto dt = std::chrono::duration_cast(t1 - t0); - return dt.count(); -} - -int main() { - init(); - printf("Initial energy: %f\n", calc()); - auto dt = benchmark([&] { - for (int i = 0; i < 100000; i++) - step(); - }); - printf("Final energy: %f\n", calc()); - printf("Time elapsed: %ld ms\n", dt); - return 0; -} +#include +#include +#include +#include +#include +#include + +inline float frand() { + return (float)rand() / RAND_MAX * 2 - 1; +} +// AoS:Array of struct +//struct Star { +// float px, py, pz; +// float vx, vy, vz; +// float mass; +//}; +// SoA:struct of array +const int N = 48; +template +struct Star { + std::array px,py,pz; + std::array vx,vy,vz,mass; +}; + +//std::vector stars; +Star stars; + +void init() { +// for (size_t i = 0; i < 48; i++) { +// stars.push_back({ +// frand(), frand(), frand(), +// frand(), frand(), frand(), +// frand() + 1, +// }); +// } +#pragma GCC unroll 4 + for(size_t i = 0;i < 48;i++){ + stars.px[i] = frand(); + stars.py[i] = frand(); + stars.pz[i] = frand(); + stars.vx[i] = frand(); + stars.vy[i] = frand(); + stars.vz[i] = frand(); + stars.mass[i] = frand() + 1; + } + +} + +const float G = 0.001; +const float eps = 0.001; +const float dt = 0.01; + +void step() { +// for (auto &star: stars) { +// for (auto &other: stars) { +// float dx = other.px - star.px; +// float dy = other.py - star.py; +// float dz = other.pz - star.pz; +// float d2 = dx * dx + dy * dy + dz * dz + eps * eps; +// d2 *= sqrt(d2); +// float inve_d2 = 1.0 / d2; +//// star.vx += dx * other.mass * G * dt / d2; +//// star.vy += dy * other.mass * G * dt / d2; +//// star.vz += dz * other.mass * G * dt / d2; +// star.vx += dx * other.mass * G * dt * inve_d2; +// star.vy += dy * other.mass * G * dt * inve_d2; +// star.vz += dz * other.mass * G * dt * inve_d2; +// } +// } +float gt = G * dt; +#pragma GCC unroll 4 + for(size_t i = 0;i < N;i++){ + for(size_t j = 0;j < N;j++){ + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + eps * eps; + d2 *= std::sqrt(d2); + float inve_d2 = gt / d2; +// star.vx += dx * other.mass * G * dt / d2; +// star.vy += dy * other.mass * G * dt / d2; +// star.vz += dz * other.mass * G * dt / d2; + stars.vx[i] += dx * stars.mass[j] * inve_d2; + stars.vy[i] += dy * stars.mass[j] * inve_d2; + stars.vz[i] += dz * stars.mass[j] * inve_d2; + } + } + + +// for (auto &star: stars) { +// star.px += star.vx * dt; +// star.py += star.vy * dt; +// star.pz += star.vz * dt; +// } +#pragma GCC unroll 4 + for(size_t k = 0;k < N;k++){ + stars.px[k] += stars.vx[k] * dt; + stars.py[k] += stars.vy[k] * dt; + stars.pz[k] += stars.vz[k] * dt; + } + + +} + +float calc() { + float energy = 0; +// for (auto &star: stars) { +// float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; +//// energy += star.mass * v2 / 2; +// energy += star.mass * v2 * 0.5; +// for (auto &other: stars) { +// float dx = other.px - star.px; +// float dy = other.py - star.py; +// float dz = other.pz - star.pz; +// float d2 = dx * dx + dy * dy + dz * dz + eps * eps; +//// energy -= other.mass * star.mass * G / sqrt(d2) / 2; +// energy -= other.mass * star.mass * G / sqrt(d2) * 0.5; +// } +// } +// return energy; +const float eps2 = eps * eps; +#pragma GCC unroll 4 + for(size_t i = 0;i < N;i++){ + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += stars.mass[i] * v2 * 0.5; + for(size_t j = 0;j < N;j++) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + eps2; + energy -= stars.mass[j] * stars.mass[i] * G / std::sqrt(d2) * 0.5; + } + } + return energy; +} + +template +long benchmark(Func const &func) { + auto t0 = std::chrono::steady_clock::now(); + func(); + auto t1 = std::chrono::steady_clock::now(); + auto dt = std::chrono::duration_cast(t1 - t0); + return dt.count(); +} + +int main() { + init(); + printf("Initial energy: %f\n", calc()); + auto dt = benchmark([&] { + for (size_t i = 0; i < 100000; i++) + step(); + }); + printf("Final energy: %f\n", calc()); + printf("Time elapsed: %ld ms\n", dt); + return 0; +} diff --git a/score.txt b/score.txt index 32c9a28..e416a28 100644 --- a/score.txt +++ b/score.txt @@ -1,13 +1,75 @@ -目前 (01/12) 同学们作业在老师电脑上的跑分 (毫秒): - -原版: 1107 -#1: 113 -#2: 116 -#3: 65 -#4: 125 -#5: 243 -#6: 114 -#11: 114 -#12: 161 -#13: 83 -#14: 121 +原始版本 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 6285 ms + +-o1 优化 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2659 ms + +-o2 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2558 ms + +-o3 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2560 ms + +索引int变成size_t +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2560 ms + +inline frand函数 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2593 ms + +将/2改成*0.5 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2585 ms + + +将公共除法变成乘法 +Initial energy: -13.414000 +Final energy: -13.356842 +Time elapsed: 2030 ms + + +-ffast-math 选项定义了预处理器宏 __FAST_MATH__, 指示编译不必遵循 IEEE 和 ISO 的浮点运算标准 +-march=native,GCC会自动检测你的CPU支持的指令集。 + Initial energy: -13.414000 + Final energy: -13.356841 + Time elapsed: 1606 ms + + +SoA 阵列结构 struct of array +Initial energy: -13.414000 +Final energy: -13.356841 +Time elapsed: 1630 ms + +sqrt前+std:: 使用模板完成传入参数匹配 +Initial energy: -13.414000 +Final energy: -13.356841 +Time elapsed: 1220 ms + +#pragma GCC unroll 4 +Initial energy: -13.414000 +Final energy: -13.356841 +Time elapsed: 1212 ms + +G * dt / d2 提取到循环外面计算 +Initial energy: -13.414000 +Final energy: -13.356841 +Time elapsed: 1096 ms + + +final result: +6285 ms / 1096 ms = 5.734 + + +