1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
use crate::{common::*, intrinsics::*, macros::*, vec4f::Vec4f};
use derive_more::{From, Into};
use std::ops::{Add, Div, Mul, Neg, Sub};

use super::Vec8fBase;

/// Eight `f32` lanes stored in a single 256-bit `__m256` value.
///
/// `#[repr(transparent)]` gives the wrapper the same layout as the raw `__m256`, and the
/// `From`/`Into` derives (from `derive_more`) provide conversions to and from it.
#[repr(transparent)]
#[derive(Clone, Copy, From, Into)]
pub struct Vec8f(__m256);

impl super::Vec8fBase for Vec8f {
    /// Builds a vector from eight scalars; `v0` lands in the lowest lane and `v7` in the highest.
    #[inline]
    fn new(v0: f32, v1: f32, v2: f32, v3: f32, v4: f32, v5: f32, v6: f32, v7: f32) -> Self {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let lanes = unsafe { _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7) };
        Self(lanes)
    }

    /// Concatenates two 4-lane vectors: `low` fills lanes 0..=3 and `high` fills lanes 4..=7.
    #[inline]
    fn join(low: Vec4f, high: Vec4f) -> Self {
        let upper: __m128 = high.into();
        let lower: __m128 = low.into();
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let joined = unsafe { _mm256_set_m128(upper, lower) };
        Self(joined)
    }

    /// Returns lanes 0..=3 as a `Vec4f`.
    #[inline]
    fn low(self) -> Vec4f {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let lower = unsafe { _mm256_castps256_ps128(self.0) };
        lower.into()
    }

    /// Returns lanes 4..=7 as a `Vec4f`.
    #[inline]
    fn high(self) -> Vec4f {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let upper = unsafe { _mm256_extractf128_ps::<1>(self.0) };
        upper.into()
    }

    /// Loads eight consecutive `f32` values starting at `addr` using an aligned load.
    ///
    /// # Safety
    ///
    /// `addr` must be valid for reading eight `f32` values and must satisfy the 32-byte
    /// alignment that `_mm256_load_ps` requires.
    #[inline]
    unsafe fn load_ptr_aligned(addr: *const f32) -> Self {
        Self(_mm256_load_ps(addr))
    }

    /// Writes all eight lanes to `addr` using an aligned store.
    ///
    /// # Safety
    ///
    /// `addr` must be valid for writing eight `f32` values and must satisfy the 32-byte
    /// alignment that `_mm256_store_ps` requires.
    #[inline]
    unsafe fn store_ptr_aligned(self, addr: *mut f32) {
        _mm256_store_ps(addr, self.0)
    }

    /// Writes all eight lanes to `addr` using a non-temporal (streaming) store.
    ///
    /// # Safety
    ///
    /// `addr` must be valid for writing eight `f32` values and must satisfy the 32-byte
    /// alignment that `_mm256_stream_ps` requires.
    #[inline]
    unsafe fn store_ptr_non_temporal(self, addr: *mut f32) {
        _mm256_stream_ps(addr, self.0);
    }
}

impl SIMDBase<8> for Vec8f {
    type Underlying = __m256;
    type Element = f32;

    /// Fills every lane with `value`.
    #[inline]
    fn broadcast(value: f32) -> Self {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let splat = unsafe { _mm256_set1_ps(value) };
        Self(splat)
    }

    /// Loads eight consecutive `f32` values starting at `addr`; no alignment is required.
    ///
    /// # Safety
    ///
    /// `addr` must be valid for reading eight `f32` values.
    #[inline]
    unsafe fn load_ptr(addr: *const f32) -> Self {
        Self(_mm256_loadu_ps(addr))
    }

    /// Writes all eight lanes to `addr`; no alignment is required.
    ///
    /// # Safety
    ///
    /// `addr` must be valid for writing eight `f32` values.
    #[inline]
    unsafe fn store_ptr(self, addr: *mut Self::Element) {
        _mm256_storeu_ps(addr, self.0)
    }

    /// Adds all eight lanes together.
    ///
    /// The lower and upper halves are added lane by lane first, then the resulting
    /// four-lane vector is reduced with its own `sum`.
    #[inline]
    fn sum(self) -> Self::Element {
        let folded = self.low() + self.high();
        folded.sum()
    }
}

impl Default for Vec8f {
    #[inline]
    fn default() -> Self {
        // SAFETY: the `cfg_if!` in `vec8f/mod.rs` guarantees the intrinsic is available.
        unsafe { _mm256_setzero_ps() }.into()
    }
}

impl Neg for Vec8f {
    type Output = Self;

    #[inline]
    fn neg(self) -> Self::Output {
        // SAFETY: the `cfg_if!` in `vec8f/mod.rs` guarantees the intrinsic is available.
        unsafe { _mm256_xor_ps(self.0, _mm256_set1_ps(-0.0)) }.into()
    }
}

impl PartialEq for Vec8f {
    /// Returns `true` only when every lane of `self` equals the matching lane of `other`.
    ///
    /// Lanes are compared as floating-point values: `0.0` equals `-0.0`, and a NaN lane never
    /// compares equal (not even to itself), so a vector containing NaN is unequal to everything.
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        // SAFETY: the `cfg_if!` in `vec8f/mod.rs` guarantees the intrinsic is available.
        unsafe {
            // Predicate 4 is `_CMP_NEQ_UQ`: a lane becomes all-ones when the operands differ
            // or are unordered (NaN), and all-zeros when they are equal.
            let differing = _mm256_cmp_ps::<4>(self.0, other.0);
            // `testz` returns 1 only if no lane of the mask has its sign bit set, i.e. no lane
            // differs. Testing an "equal" mask for non-zero would accept vectors that match in
            // just one lane.
            _mm256_testz_ps(differing, differing) != 0
        }
    }
}

#[cfg(target_feature = "fma")]
impl crate::common::SIMDFusedCalc for Vec8f {
    /// Computes `self * b + c` per lane with a single fused instruction.
    #[inline]
    fn mul_add(self, b: Self, c: Self) -> Self {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let fused = unsafe { _mm256_fmadd_ps(self.0, b.0, c.0) };
        Self(fused)
    }

    /// Computes `self * b - c` per lane with a single fused instruction.
    #[inline]
    fn mul_sub(self, b: Self, c: Self) -> Self {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let fused = unsafe { _mm256_fmsub_ps(self.0, b.0, c.0) };
        Self(fused)
    }

    /// Computes `-(self * b) + c` per lane with a single fused instruction.
    #[inline]
    fn nmul_add(self, b: Self, c: Self) -> Self {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let fused = unsafe { _mm256_fnmadd_ps(self.0, b.0, c.0) };
        Self(fused)
    }

    /// Computes `-(self * b) - c` per lane with a single fused instruction.
    #[inline]
    fn nmul_sub(self, b: Self, c: Self) -> Self {
        // SAFETY: availability of this intrinsic is guaranteed by the `cfg_if!` in `vec8f/mod.rs`.
        let fused = unsafe { _mm256_fnmsub_ps(self.0, b.0, c.0) };
        Self(fused)
    }
}

// Compiled only when the `fma` target feature is NOT enabled, as the counterpart of the
// `SIMDFusedCalc` impl above. The body is empty, so everything comes from the trait itself.
// NOTE(review): presumably `SIMDFusedCalcFallback` supplies non-fused versions of the fused
// operations — confirm against its definition in `crate::common`.
#[cfg(not(target_feature = "fma"))]
impl crate::common::SIMDFusedCalcFallback for Vec8f {}

// Arithmetic operators for `Vec8f`, one macro invocation per operator. Each invocation names
// the `std::ops` trait, its method, and the AVX intrinsic to use.
// NOTE(review): `vec_impl_binary_op!` is defined outside this file; presumably it expands to
// the corresponding lane-wise operator impl — confirm against the macro definition.
vec_impl_binary_op!(Vec8f, Add, add, _mm256_add_ps);
vec_impl_binary_op!(Vec8f, Sub, sub, _mm256_sub_ps);
vec_impl_binary_op!(Vec8f, Mul, mul, _mm256_mul_ps);
vec_impl_binary_op!(Vec8f, Div, div, _mm256_div_ps);

impl SIMDFloat for Vec8f {
    /// Rounds every lane to the nearest integral value.
    ///
    /// The rounding mode is fixed to `_MM_FROUND_TO_NEAREST_INT` and floating-point exceptions
    /// are suppressed via `_MM_FROUND_NO_EXC`, so the result does not depend on the current
    /// MXCSR rounding mode.
    // `#[inline]` matches every other method in this file and lets this thin wrapper be
    // inlined across crate boundaries.
    #[inline]
    fn round(self) -> Self {
        // SAFETY: the `cfg_if!` in `vec8f/mod.rs` guarantees the intrinsic is available.
        unsafe { _mm256_round_ps(self.0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) }.into()
    }
}