Skip to content

Instantly share code, notes, and snippets.

@tejainece
Created April 29, 2024 00:09
Show Gist options
  • Save tejainece/306e92a29dbcb63fa42ebcdd1d57f03f to your computer and use it in GitHub Desktop.
Save tejainece/306e92a29dbcb63fa42ebcdd1d57f03f to your computer and use it in GitHub Desktop.
/// @brief Multi-threaded AVX negation: writes `0 - inp[i]` to `out[i]` for
///        every `i` in `[0, nel)`.
///
/// The range is split into contiguous chunks whose length is a multiple of
/// `simdSize<I>()` elements, one chunk per `std::async` task, using at most
/// `std::thread::hardware_concurrency()` tasks. Each task processes whole
/// 256-bit vectors and finishes any remainder with scalar code, so no task
/// ever touches memory at or beyond index `nel`.
///
/// Only instantiable for `I = float`: the `_mm256_*_ps` intrinsics take
/// `float` pointers.
///
/// @param out Destination buffer of at least `nel` elements. No alignment is
///            required. May be equal to `inp` (every element is read before
///            it is written and chunks are disjoint).
/// @param inp Source buffer of at least `nel` elements. No alignment required.
/// @param nel Number of elements to process; 0 is a no-op.
/// @return Always `nullptr`.
template <typename I>
const char *tcNegSlow(I *out, const I *inp, uint64_t nel) {
  constexpr size_t laneSize = simdSize<I>();

  if (nel == 0) {
    return nullptr;
  }

  // hardware_concurrency() is permitted to return 0 when the value cannot be
  // determined; fall back to a single task instead of dividing by zero.
  const uint64_t concurrency =
      std::max(uint64_t(std::thread::hardware_concurrency()), uint64_t(1));

  // Ceiling division written so that it cannot overflow for large nel.
  const uint64_t totalLanes = nel / laneSize + (nel % laneSize != 0 ? 1 : 0);
  // totalLanes >= 1 here, so lanesPerThread >= 1 and the chunk is non-empty.
  const uint64_t lanesPerThread = (totalLanes + concurrency - 1) / concurrency;
  const uint64_t chunk = lanesPerThread * laneSize;

  // Only launch tasks that actually have elements to process; the loop below
  // runs at most `concurrency` times.
  std::vector<std::future<void>> futures;
  futures.reserve(concurrency);
  for (uint64_t start = 0; start < nel; start += chunk) {
    // Clamp the final chunk so it never extends past the caller's buffers.
    const uint64_t last = std::min(start + chunk, nel);
    futures.push_back(std::async(
        std::launch::async,
        [start, last, out, inp]() {
          // Number of floats held by one __m256 register; the vector loop
          // strides by this regardless of how chunks were partitioned.
          constexpr uint64_t floatsPerVec = sizeof(__m256) / sizeof(float);

          uint64_t pos = start;
          for (; pos + floatsPerVec <= last; pos += floatsPerVec) {
            // Unaligned load/store: the caller's pointers are not guaranteed
            // to be 32-byte aligned.
            const __m256 v = _mm256_loadu_ps(inp + pos);
            _mm256_storeu_ps(out + pos, _mm256_sub_ps(_mm256_setzero_ps(), v));
          }
          // Scalar tail. Uses `0 - x` (not `-x`) so the result matches the
          // vector path bit-for-bit, including for x == +0.0.
          for (; pos < last; ++pos) {
            out[pos] = I(0) - inp[pos];
          }
        }
    ));
  }

  for (auto &task : futures) {
    task.wait();
  }
  return nullptr;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment