1#ifndef VECCORE_BACKEND_STD_SIMD_H
2#define VECCORE_BACKEND_STD_SIMD_H
4#if __cplusplus >= 202002L && defined(__has_include)
5#if __has_include(<experimental/simd>)
6#define VECCORE_ENABLE_STD_SIMD
10#ifdef VECCORE_ENABLE_STD_SIMD
12#include <experimental/simd>
// TypeTraits specialization for std::experimental::simd_mask<T, Abi>.
// NOTE(review): the closing brace of this struct is not visible in this excerpt.
16template <
typename T,
class Abi>
17struct TypeTraits<std::experimental::simd_mask<T, Abi>> {
// Index type: whatever the mask reports as its associated simd_type.
18 using IndexType =
typename std::experimental::simd_mask<T, Abi>::simd_type;
// Scalar type: the mask's own value_type.
19 using ScalarType =
typename std::experimental::simd_mask<T, Abi>::value_type;
// Lane count, taken from the simd type with the same T and Abi.
20 static constexpr size_t Size = std::experimental::simd<T, Abi>::size();
// TypeTraits specialization for std::experimental::simd<T, Abi>.
// NOTE(review): the closing brace of this struct is not visible in this excerpt.
23template <
typename T,
class Abi>
24struct TypeTraits<std::experimental::simd<T, Abi>> {
// Mask type: the simd type's nested mask_type.
25 using MaskType =
typename std::experimental::simd<T, Abi>::mask_type;
// Index type: the simd type itself, so indices use the same element type T
// as the data (the Gather/Scatter code in this file converts them to integers).
26 using IndexType =
typename std::experimental::simd<T, Abi>;
// Scalar type: the simd type's value_type.
27 using ScalarType =
typename std::experimental::simd<T, Abi>::value_type;
// Lane count of the simd type.
28 static constexpr size_t Size = std::experimental::simd<T, Abi>::size();
// Backend class template: maps each element type to std::experimental::simd
// with the ABI tag given by the Abi template parameter.
// Real_s is not declared in the visible lines; it must come from elsewhere.
// NOTE(review): no access specifier is visible after `class SIMD {`; members of a
// `class` default to private, so confirm the full file has `public:` here.
// NOTE(review): the closing brace of this class is not visible in this excerpt.
33template <
class Abi>
class SIMD {
// Floating-point vector types.
35 using Real_v = std::experimental::simd<Real_s, Abi>;
36 using Float_v = std::experimental::simd<float, Abi>;
37 using Double_v = std::experimental::simd<double, Abi>;
// Signed integer vector types.
39 using Int_v = std::experimental::simd<int, Abi>;
40 using Int16_v = std::experimental::simd<int16_t, Abi>;
41 using Int32_v = std::experimental::simd<int32_t, Abi>;
42 using Int64_v = std::experimental::simd<int64_t, Abi>;
// Unsigned integer vector types.
44 using UInt_v = std::experimental::simd<unsigned int, Abi>;
45 using UInt16_v = std::experimental::simd<uint16_t, Abi>;
46 using UInt32_v = std::experimental::simd<uint32_t, Abi>;
47 using UInt64_v = std::experimental::simd<uint64_t, Abi>;
// Same set of alias names as the SIMD<Abi> aliases above, but bound to
// std::experimental::native_simd<...> instead of simd<..., Abi>.
// NOTE(review): the declaration enclosing these aliases is not visible in this
// excerpt — presumably a separate class or specialization; confirm in the full file.
// Floating-point vector types.
52 using Real_v = std::experimental::native_simd<Real_s>;
53 using Float_v = std::experimental::native_simd<float>;
54 using Double_v = std::experimental::native_simd<double>;
// Signed integer vector types.
56 using Int_v = std::experimental::native_simd<int>;
57 using Int16_v = std::experimental::native_simd<int16_t>;
58 using Int32_v = std::experimental::native_simd<int32_t>;
59 using Int64_v = std::experimental::native_simd<int64_t>;
// Unsigned integer vector types.
61 using UInt_v = std::experimental::native_simd<unsigned int>;
62 using UInt16_v = std::experimental::native_simd<uint16_t>;
63 using UInt32_v = std::experimental::native_simd<uint32_t>;
64 using UInt64_v = std::experimental::native_simd<uint64_t>;
// SIMD backend instantiated with the scalar ABI tag.
67using SIMDScalar = SIMD<std::experimental::simd_abi::scalar>;
// SIMD backend instantiated with the fixed_size<N> ABI tag.
// NOTE(review): N is not declared in the visible lines — presumably an alias-template
// parameter on a line missing from this excerpt; confirm.
70using SIMDVector = SIMD<std::experimental::simd_abi::fixed_size<N>>;
// MaskEmpty overload for std::experimental::simd_mask<T, Abi>; the mask is taken by value.
// Only the loop header is visible in this excerpt: it walks the lanes 0 .. mask.size()-1.
// Presumably the function reports whether no lane is set — confirm against the full file.
// NOTE(review): `i` is an int while mask.size() is unsigned (the other lane loops in
// this file use size_t), so the loop condition is a signed/unsigned comparison.
74template <
typename T,
class Abi>
75bool MaskEmpty(std::experimental::simd_mask<T, Abi> mask) {
76 for (
int i = 0; i < mask.size(); ++i)
// MaskFull overload for std::experimental::simd_mask<T, Abi>; the mask is taken by value.
// Only the loop header is visible in this excerpt: it walks the lanes 0 .. mask.size()-1.
// Presumably the function reports whether every lane is set — confirm against the full file.
// NOTE(review): `i` is an int while mask.size() is unsigned (the other lane loops in
// this file use size_t), so the loop condition is a signed/unsigned comparison.
82template <
typename T,
class Abi>
83bool MaskFull(std::experimental::simd_mask<T, Abi> mask) {
84 for (
int i = 0; i < mask.size(); ++i)
// Per-lane element access for std::experimental::simd<T, Abi>.
// NOTE(review): the closing brace of this struct is not visible in this excerpt.
90template <
typename T,
class Abi>
91struct IndexingImplementation<std::experimental::simd<T, Abi>> {
92 using V = std::experimental::simd<T, Abi>;
// Returns lane i of v as a T. No range check on i is performed here.
94 static inline T
Get(
const V &v,
size_t i) {
return v[i]; }
// Writes val into lane i of v. No range check on i is performed here.
95 static inline void Set(V &v,
size_t i, T
const val) { v[i] = val; }
// Per-lane element access for std::experimental::simd_mask<T, Abi>.
// NOTE(review): the closing brace of this struct is not visible in this excerpt.
98template <
typename T,
class Abi>
99struct IndexingImplementation<std::experimental::simd_mask<T, Abi>> {
100 using V = std::experimental::simd_mask<T, Abi>;
// Returns lane i of the mask; the declared return type is T, so the lane
// value is converted to T on return. No range check on i is performed here.
102 static inline T
Get(
const V &v,
size_t i) {
return v[i]; }
// Sets lane i of the mask from val; `!!val` collapses val to a boolean first.
// No range check on i is performed here.
103 static inline void Set(V &v,
size_t i, T
const val) { v[i] = !!val; }
// Memory load/store for std::experimental::simd<T, Abi>, written as per-lane loops.
// The pointed-to element type S defaults to T but may differ.
// NOTE(review): the closing braces of the functions and struct are not visible in this excerpt.
106template <
typename T,
class Abi>
107struct LoadStoreImplementation<std::experimental::simd<T, Abi>> {
108 using V = std::experimental::simd<T, Abi>;
// Load: loops over all V::size() lanes. The loop body is not visible in this
// excerpt — presumably it fills v[i] from ptr[i]; confirm against the full file.
110 template <
typename S = T>
static inline void Load(V &v, S
const *ptr) {
111 for (
size_t i = 0; i < V::size(); ++i)
// Store: writes every lane of v to ptr[0 .. V::size()-1], converting each
// lane to S with static_cast.
115 template <
typename S = T>
static inline void Store(V
const &v, S *ptr) {
116 for (
size_t i = 0; i < V::size(); ++i)
117 ptr[i] =
static_cast<S
>(v[i]);
// Memory load/store for std::experimental::simd_mask<T, Abi>, written as per-lane loops.
// The pointed-to element type S defaults to T but may differ.
// NOTE(review): the closing braces of the functions and struct are not visible in this excerpt.
121template <
typename T,
class Abi>
122struct LoadStoreImplementation<std::experimental::simd_mask<T, Abi>> {
123 using V =
typename std::experimental::simd_mask<T, Abi>;
// Load: loops over all V::size() lanes. The loop body is not visible in this
// excerpt — presumably it sets each mask lane from ptr[i]; confirm against the full file.
125 template <
typename S = T>
static inline void Load(V &v, S
const *ptr) {
126 for (
size_t i = 0; i < V::size(); ++i)
// Store: writes every mask lane to ptr[0 .. V::size()-1], converting each
// lane value to S with static_cast.
130 template <
typename S = T>
static inline void Store(V
const &v, S *ptr) {
131 for (
size_t i = 0; i < V::size(); ++i)
132 ptr[i] =
static_cast<S
>(v[i]);
// Masked assignment helpers for std::experimental::simd<T, Abi>, built on where().
// NOTE(review): the closing braces of the functions and struct are not visible in this excerpt.
136template <
typename T,
class Abi>
137struct MaskingImplementation<std::experimental::simd<T, Abi>> {
138 using V =
typename std::experimental::simd<T, Abi>;
139 using M =
typename std::experimental::simd<T, Abi>::mask_type;
// Assign: masked assignment of src into dst via where(mask, dst).
141 static inline void Assign(V &dst, M
const &mask, V
const &src) {
142 where(mask, dst) = src;
// Blend: masked assignment of src1 under mask, then of src2 under the negated mask.
// NOTE(review): the rest of the parameter list (the declaration of src2) is not
// visible in this excerpt.
145 static inline void Blend(V &dst, M
const &mask, V
const &src1,
147 where(mask, dst) = src1;
148 where(!mask, dst) = src2;
// Gather/scatter for std::experimental::simd<float, Abi>.
// The index vector has the same type as the data vector (float lanes) and is
// converted to int lanes with static_simd_cast before the per-lane loop.
// NOTE(review): the template header declaring Abi, both loop bodies, and the closing
// braces are not visible in this excerpt.
153struct GatherScatterImplementation<std::experimental::simd<float, Abi>> {
154 using V =
typename std::experimental::simd<float, Abi>;
// Gather: loops over all V::size() lanes; presumably reads ptr[ii[i]] into v[i] —
// confirm against the full file.
156 template <
typename S =
float>
157 static inline void Gather(V &v, S
const *ptr, V
const &idx) {
158 auto ii = std::experimental::static_simd_cast<int, float, Abi>(idx);
159 for (
size_t i = 0; i < V::size(); ++i)
// Scatter: loops over all V::size() lanes; presumably writes v[i] to ptr[ii[i]] —
// confirm against the full file.
163 template <
typename S =
float>
164 static inline void Scatter(V
const &v, S *ptr, V
const &idx) {
165 auto ii = std::experimental::static_simd_cast<int, float, Abi>(idx);
166 for (
size_t i = 0; i < V::size(); ++i)
// Gather/scatter for std::experimental::simd<double, Abi>.
// The index vector has the same type as the data vector (double lanes) and is
// converted to int64_t lanes with static_simd_cast before the per-lane loop.
// NOTE(review): the template header declaring Abi, both loop bodies, and the closing
// braces are not visible in this excerpt.
172struct GatherScatterImplementation<std::experimental::simd<double, Abi>> {
173 using V =
typename std::experimental::simd<double, Abi>;
// Gather: loops over all V::size() lanes; presumably reads ptr[ii[i]] into v[i] —
// confirm against the full file.
175 template <
typename S =
double>
176 static inline void Gather(V &v, S
const *ptr, V
const &idx) {
177 auto ii = std::experimental::static_simd_cast<int64_t, double, Abi>(idx);
178 for (
size_t i = 0; i < V::size(); ++i)
// Scatter: loops over all V::size() lanes; presumably writes v[i] to ptr[ii[i]] —
// confirm against the full file.
182 template <
typename S =
double>
183 static inline void Scatter(V
const &v, S *ptr, V
const &idx) {
184 auto ii = std::experimental::static_simd_cast<int64_t, double, Abi>(idx);
185 for (
size_t i = 0; i < V::size(); ++i)
VECCORE_ATT_HOST_DEVICE bool MaskEmpty(const M &mask)
VECCORE_ATT_HOST_DEVICE bool MaskFull(const M &mask)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Scatter(T const &v, S *ptr, Index< T > const &idx)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Gather(T &v, S const *ptr, Index< T > const &idx)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Set(T &v, size_t i, Scalar< T > const val)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE Scalar< T > Get(const T &v, size_t i)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Load(T &v, S const *ptr)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Store(T const &v, S *ptr)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Blend(T &dst, Mask< T > const &mask, T const &src1, T const &src2)
VECCORE_FORCE_INLINE static VECCORE_ATT_HOST_DEVICE void Assign(T &dst, Mask< T > const &mask, T const &src)
static constexpr size_t Size