diff options
author | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-06-15 21:45:00 +0000 |
---|---|---|
committer | Android Build Coastguard Worker <android-build-coastguard-worker@google.com> | 2022-06-15 21:45:00 +0000 |
commit | beb2d6b25405f6231f2177e825ebe1b47bbe4e7c (patch) | |
tree | 7d9f824e8baa7032c8f8bf51f965345a27a21cef | |
parent | 5cc7723d3b10b618b1a0679849536eb7060379b6 (diff) | |
parent | 84737ef36d3bd9c929b801fdaa22c8e27899648e (diff) | |
download | ppv-lite86-aml_tz3_314012070.tar.gz |
Snap for 8730993 from 84737ef36d3bd9c929b801fdaa22c8e27899648e to mainline-tzdata3-releaseaml_tz3_314012070aml_tz3_314012050aml_tz3_314012010aml_tz3_313110000aml_tz3_312511020aml_tz3_312511010aml_tz3_312410020aml_tz3_312410010android12-mainline-tzdata3-releaseaml_tz3_314012010
Change-Id: Id5e435b4c7bc07f749f888569e184c874f7a279a
-rw-r--r-- | .cargo_vcs_info.json | 7 | ||||
-rw-r--r-- | Android.bp | 29 | ||||
-rw-r--r-- | CHANGELOG.md | 10 | ||||
-rw-r--r-- | Cargo.toml | 11 | ||||
-rw-r--r-- | Cargo.toml.orig | 2 | ||||
-rw-r--r-- | METADATA | 10 | ||||
-rw-r--r-- | TEST_MAPPING | 98 | ||||
-rw-r--r-- | cargo2android.json | 10 | ||||
-rw-r--r-- | src/generic.rs | 122 | ||||
-rw-r--r-- | src/soft.rs | 81 | ||||
-rw-r--r-- | src/types.rs | 22 | ||||
-rw-r--r-- | src/x86_64/mod.rs | 30 | ||||
-rw-r--r-- | src/x86_64/sse2.rs | 338 |
13 files changed, 237 insertions, 533 deletions
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json index e6ee0e5..adb1fc4 100644 --- a/.cargo_vcs_info.json +++ b/.cargo_vcs_info.json @@ -1,6 +1,5 @@ { "git": { - "sha1": "4b1e1d655d05c9da29aa833ce705feedb3da760b" - }, - "path_in_vcs": "utils-simd/ppv-lite86" -}
\ No newline at end of file + "sha1": "3012849c2d9c50228a780031e7c200b193a6b4fa" + } +} @@ -1,5 +1,4 @@ -// This file is generated by cargo2android.py --config cargo2android.json. -// Do not modify this file as changes will be overridden on upgrade. +// This file is generated by cargo2android.py --device --run --dependencies --tests. package { default_applicable_licenses: ["external_rust_crates_ppv-lite86_license"], @@ -41,35 +40,33 @@ rust_library { name: "libppv_lite86", host_supported: true, crate_name: "ppv_lite86", - cargo_env_compat: true, - cargo_pkg_version: "0.2.16", srcs: ["src/lib.rs"], edition: "2018", features: [ "default", "std", ], - apex_available: [ - "//apex_available:platform", - "com.android.virt", - ], } -rust_test { - name: "ppv-lite86_test_src_lib", - host_supported: true, +rust_defaults { + name: "ppv-lite86_defaults", crate_name: "ppv_lite86", - cargo_env_compat: true, - cargo_pkg_version: "0.2.16", srcs: ["src/lib.rs"], test_suites: ["general-tests"], auto_gen_config: true, - test_options: { - unit_test: true, - }, edition: "2018", features: [ "default", "std", ], } + +rust_test_host { + name: "ppv-lite86_host_test_src_lib", + defaults: ["ppv-lite86_defaults"], +} + +rust_test { + name: "ppv-lite86_device_test_src_lib", + defaults: ["ppv-lite86_defaults"], +} diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 6e34be3..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,10 +0,0 @@ -# Changelog -All notable changes to this project will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [0.2.16] -### Added -- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86. -- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays. @@ -3,16 +3,17 @@ # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies -# to registry (e.g., crates.io) dependencies. +# to registry (e.g., crates.io) dependencies # -# If you are reading this file be aware that the original Cargo.toml -# will likely look very different (and much more reasonable). -# See Cargo.toml.orig for the original contents. +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) [package] edition = "2018" name = "ppv-lite86" -version = "0.2.16" +version = "0.2.10" authors = ["The CryptoCorrosion Contributors"] description = "Implementation of the crypto-simd API for x86" keywords = ["crypto", "simd", "x86"] diff --git a/Cargo.toml.orig b/Cargo.toml.orig index b457f54..8f3fb52 100644 --- a/Cargo.toml.orig +++ b/Cargo.toml.orig @@ -1,6 +1,6 @@ [package] name = "ppv-lite86" -version = "0.2.16" +version = "0.2.10" authors = ["The CryptoCorrosion Contributors"] edition = "2018" license = "MIT/Apache-2.0" @@ -7,13 +7,13 @@ third_party { } url { type: ARCHIVE - value: "https://static.crates.io/crates/ppv-lite86/ppv-lite86-0.2.16.crate" + value: "https://static.crates.io/crates/ppv-lite86/ppv-lite86-0.2.10.crate" } - version: "0.2.16" + version: "0.2.10" license_type: NOTICE last_upgrade_date { - year: 2022 - month: 3 - day: 1 + year: 2020 + month: 11 + day: 2 } } diff --git a/TEST_MAPPING b/TEST_MAPPING index f0d135a..a45c687 100644 --- a/TEST_MAPPING +++ b/TEST_MAPPING @@ -1,102 +1,8 @@ -// Generated by update_crate_tests.py for tests that depend on this crate. +// Generated by cargo2android.py for tests in Android.bp { - "imports": [ - { - "path": "external/rust/crates/base64" - }, - { - "path": "external/rust/crates/cast" - }, - { - "path": "external/rust/crates/crc32fast" - }, - { - "path": "external/rust/crates/crossbeam-deque" - }, - { - "path": "external/rust/crates/crossbeam-epoch" - }, - { - "path": "external/rust/crates/crossbeam-queue" - }, - { - "path": "external/rust/crates/crossbeam-utils" - }, - { - "path": "external/rust/crates/mio" - }, - { - "path": "external/rust/crates/quickcheck" - }, - { - "path": "external/rust/crates/rand_chacha" - }, - { - "path": "external/rust/crates/regex" - }, - { - "path": "external/rust/crates/ryu" - }, - { - "path": "external/rust/crates/tokio" - } - ], "presubmit": [ { - "name": "ZipFuseTest" - }, - { - "name": "apkdmverity.test" - }, - { - "name": "authfs_device_test_src_lib" - }, - { - "name": "keystore2_test" - }, - { - "name": "keystore2_test_utils_test" - }, - { - "name": "legacykeystore_test" - }, - { - "name": "microdroid_manager_test" - }, - { - "name": "ppv-lite86_test_src_lib" - }, - { - "name": "virtualizationservice_device_test" - } - ], - "presubmit-rust": [ - { - "name": "ZipFuseTest" - }, - { - "name": "apkdmverity.test" - }, - { - "name": "authfs_device_test_src_lib" - }, - { - "name": "keystore2_test" - }, - { - "name": "keystore2_test_utils_test" - }, - { - "name": "legacykeystore_test" - }, - { - "name": "microdroid_manager_test" - }, - { - "name": "ppv-lite86_test_src_lib" - }, - { - "name": "virtualizationservice_device_test" + "name": "ppv-lite86_device_test_src_lib" } ] } diff --git a/cargo2android.json b/cargo2android.json deleted file mode 100644 index ac56e26..0000000 --- a/cargo2android.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "apex-available": [ - "//apex_available:platform", - "com.android.virt" - ], - "dependencies": true, - "device": true, - "run": true, - "tests": true -}
\ No newline at end of file diff --git a/src/generic.rs b/src/generic.rs index add6c48..f0e83d9 100644 --- a/src/generic.rs +++ b/src/generic.rs @@ -11,38 +11,38 @@ pub union vec128_storage { q: [u64; 2], } impl From<[u32; 4]> for vec128_storage { - #[inline(always)] + #[inline] fn from(d: [u32; 4]) -> Self { Self { d } } } impl From<vec128_storage> for [u32; 4] { - #[inline(always)] + #[inline] fn from(d: vec128_storage) -> Self { unsafe { d.d } } } impl From<[u64; 2]> for vec128_storage { - #[inline(always)] + #[inline] fn from(q: [u64; 2]) -> Self { Self { q } } } impl From<vec128_storage> for [u64; 2] { - #[inline(always)] + #[inline] fn from(q: vec128_storage) -> Self { unsafe { q.q } } } impl Default for vec128_storage { - #[inline(always)] + #[inline] fn default() -> Self { Self { q: [0, 0] } } } impl Eq for vec128_storage {} impl PartialEq<vec128_storage> for vec128_storage { - #[inline(always)] + #[inline] fn eq(&self, rhs: &Self) -> bool { unsafe { self.q == rhs.q } } @@ -62,21 +62,13 @@ impl vec256_storage { } } impl From<vec256_storage> for [u64; 4] { - #[inline(always)] + #[inline] fn from(q: vec256_storage) -> Self { let [a, b]: [u64; 2] = q.v128[0].into(); let [c, d]: [u64; 2] = q.v128[1].into(); [a, b, c, d] } } -impl From<[u64; 4]> for vec256_storage { - #[inline(always)] - fn from([a, b, c, d]: [u64; 4]) -> Self { - Self { - v128: [[a, b].into(), [c, d].into()], - } - } -} #[derive(Clone, Copy, PartialEq, Eq, Default)] pub struct vec512_storage { v128: [vec128_storage; 4], @@ -92,7 +84,6 @@ impl vec512_storage { } } -#[inline(always)] fn dmap<T, F>(t: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -126,7 +117,6 @@ where unsafe { T::unpack(d) } } -#[inline(always)] fn qmap<T, F>(t: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -140,7 +130,6 @@ where unsafe { T::unpack(q) } } -#[inline(always)] fn qmap2<T, F>(a: T, b: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -156,17 +145,14 @@ where unsafe { T::unpack(q) } } -#[inline(always)] fn o_of_q(q: [u64; 2]) -> u128 { u128::from(q[0]) | (u128::from(q[1]) << 64) } -#[inline(always)] fn q_of_o(o: u128) -> [u64; 2] { [o as u64, (o >> 64) as u64] } -#[inline(always)] fn omap<T, F>(a: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -178,7 +164,6 @@ where unsafe { T::unpack(o) } } -#[inline(always)] fn omap2<T, F>(a: T, b: T, f: F) -> T where T: Store<vec128_storage> + Into<vec128_storage>, @@ -262,39 +247,39 @@ macro_rules! impl_bitops { } impl Swap64 for $vec { - #[inline(always)] + #[inline] fn swap1(self) -> Self { qmap(self, |x| { ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1) }) } - #[inline(always)] + #[inline] fn swap2(self) -> Self { qmap(self, |x| { ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2) }) } - #[inline(always)] + #[inline] fn swap4(self) -> Self { qmap(self, |x| { ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4) }) } - #[inline(always)] + #[inline] fn swap8(self) -> Self { qmap(self, |x| { ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8) }) } - #[inline(always)] + #[inline] fn swap16(self) -> Self { dmap(self, |x| x.rotate_left(16)) } - #[inline(always)] + #[inline] fn swap32(self) -> Self { qmap(self, |x| x.rotate_left(32)) } - #[inline(always)] + #[inline] fn swap64(self) -> Self { omap(self, |x| (x << 64) | (x >> 64)) } @@ -306,83 +291,82 @@ impl_bitops!(u64x2_generic); impl_bitops!(u128x1_generic); impl RotateEachWord32 for u32x4_generic { - #[inline(always)] + #[inline] fn rotate_each_word_right7(self) -> Self { dmap(self, |x| x.rotate_right(7)) } - #[inline(always)] + #[inline] fn rotate_each_word_right8(self) -> Self { dmap(self, |x| x.rotate_right(8)) } - #[inline(always)] + #[inline] fn rotate_each_word_right11(self) -> Self { dmap(self, |x| x.rotate_right(11)) } - #[inline(always)] + #[inline] fn rotate_each_word_right12(self) -> Self { dmap(self, |x| x.rotate_right(12)) } - #[inline(always)] + #[inline] fn rotate_each_word_right16(self) -> Self { dmap(self, |x| x.rotate_right(16)) } - #[inline(always)] + #[inline] fn rotate_each_word_right20(self) -> Self { dmap(self, |x| x.rotate_right(20)) } - #[inline(always)] + #[inline] fn rotate_each_word_right24(self) -> Self { dmap(self, |x| x.rotate_right(24)) } - #[inline(always)] + #[inline] fn rotate_each_word_right25(self) -> Self { dmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord32 for u64x2_generic { - #[inline(always)] + #[inline] fn rotate_each_word_right7(self) -> Self { qmap(self, |x| x.rotate_right(7)) } - #[inline(always)] + #[inline] fn rotate_each_word_right8(self) -> Self { qmap(self, |x| x.rotate_right(8)) } - #[inline(always)] + #[inline] fn rotate_each_word_right11(self) -> Self { qmap(self, |x| x.rotate_right(11)) } - #[inline(always)] + #[inline] fn rotate_each_word_right12(self) -> Self { qmap(self, |x| x.rotate_right(12)) } - #[inline(always)] + #[inline] fn rotate_each_word_right16(self) -> Self { qmap(self, |x| x.rotate_right(16)) } - #[inline(always)] + #[inline] fn rotate_each_word_right20(self) -> Self { qmap(self, |x| x.rotate_right(20)) } - #[inline(always)] + #[inline] fn rotate_each_word_right24(self) -> Self { qmap(self, |x| x.rotate_right(24)) } - #[inline(always)] + #[inline] fn rotate_each_word_right25(self) -> Self { qmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord64 for u64x2_generic { - #[inline(always)] + #[inline] fn rotate_each_word_right32(self) -> Self { qmap(self, |x| x.rotate_right(32)) } } // workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web) -#[inline(always)] fn rotate_u128_right(x: u128, i: u32) -> u128 { (x >> i) | (x << (128 - i)) } @@ -393,41 +377,41 @@ fn test_rotate_u128() { } impl RotateEachWord32 for u128x1_generic { - #[inline(always)] + #[inline] fn rotate_each_word_right7(self) -> Self { Self([rotate_u128_right(self.0[0], 7)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right8(self) -> Self { Self([rotate_u128_right(self.0[0], 8)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right11(self) -> Self { Self([rotate_u128_right(self.0[0], 11)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right12(self) -> Self { Self([rotate_u128_right(self.0[0], 12)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right16(self) -> Self { Self([rotate_u128_right(self.0[0], 16)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right20(self) -> Self { Self([rotate_u128_right(self.0[0], 20)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right24(self) -> Self { Self([rotate_u128_right(self.0[0], 24)]) } - #[inline(always)] + #[inline] fn rotate_each_word_right25(self) -> Self { Self([rotate_u128_right(self.0[0], 25)]) } } impl RotateEachWord64 for u128x1_generic { - #[inline(always)] + #[inline] fn rotate_each_word_right32(self) -> Self { Self([rotate_u128_right(self.0[0], 32)]) } @@ -446,7 +430,7 @@ impl Machine for GenericMachine { type u32x4x4 = u32x4x4_generic; type u64x2x4 = u64x2x4_generic; type u128x4 = u128x4_generic; - #[inline(always)] + #[inline] unsafe fn instance() -> Self { Self } @@ -623,22 +607,6 @@ pub type u32x4x4_generic = x4<u32x4_generic>; pub type u64x2x4_generic = x4<u64x2_generic>; pub type u128x4_generic = x4<u128x1_generic>; -impl Vector<[u32; 16]> for u32x4x4_generic { - fn to_scalars(self) -> [u32; 16] { - let [a, b, c, d] = self.0; - let a = a.0; - let b = b.0; - let c = c.0; - let d = d.0; - [ - a[0], a[1], a[2], a[3], // - b[0], b[1], b[2], b[3], // - c[0], c[1], c[2], c[3], // - d[0], d[1], d[2], d[3], // - ] - } -} - impl MultiLane<[u32; 4]> for u32x4_generic { #[inline(always)] fn to_lanes(self) -> [u32; 4] { @@ -779,7 +747,7 @@ impl u128x4<GenericMachine> for u128x4_generic {} #[macro_export] macro_rules! dispatch { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline(always)] + #[inline] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -796,7 +764,7 @@ macro_rules! dispatch { #[macro_export] macro_rules! dispatch_light128 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline(always)] + #[inline] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -813,7 +781,7 @@ macro_rules! dispatch_light128 { #[macro_export] macro_rules! dispatch_light256 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline(always)] + #[inline] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -830,7 +798,7 @@ macro_rules! dispatch_light256 { #[macro_export] macro_rules! dispatch_light512 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline(always)] + #[inline] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] diff --git a/src/soft.rs b/src/soft.rs index 0ae390c..8976c48 100644 --- a/src/soft.rs +++ b/src/soft.rs @@ -175,50 +175,26 @@ impl<W: BSwap + Copy, G> BSwap for x2<W, G> { impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - let input = input.split_at(input.len() / 2); + let input = input.split_at(16); x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - let input = input.split_at(input.len() / 2); - x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)]) + x2::unsafe_read_le(input).bswap() } #[inline(always)] fn write_le(self, out: &mut [u8]) { - let out = out.split_at_mut(out.len() / 2); + let out = out.split_at_mut(16); self.0[0].write_le(out.0); self.0[1].write_le(out.1); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - let out = out.split_at_mut(out.len() / 2); + let out = out.split_at_mut(16); self.0[0].write_be(out.0); self.0[1].write_be(out.1); } } -impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> { - #[inline(always)] - fn shuffle_lane_words2301(self) -> Self { - Self::new([ - self.0[0].shuffle_lane_words2301(), - self.0[1].shuffle_lane_words2301(), - ]) - } - #[inline(always)] - fn shuffle_lane_words1230(self) -> Self { - Self::new([ - self.0[0].shuffle_lane_words1230(), - self.0[1].shuffle_lane_words1230(), - ]) - } - #[inline(always)] - fn shuffle_lane_words3012(self) -> Self { - Self::new([ - self.0[0].shuffle_lane_words3012(), - self.0[1].shuffle_lane_words3012(), - ]) - } -} #[derive(Copy, Clone, Default)] #[allow(non_camel_case_types)] @@ -334,20 +310,6 @@ impl<W: Copy> Vec4<W> for x4<W> { self } } -impl<W: Copy> Vec4Ext<W> for x4<W> { - #[inline(always)] - fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) - where - Self: Sized, - { - ( - x4([a.0[0], b.0[0], c.0[0], d.0[0]]), - x4([a.0[1], b.0[1], c.0[1], d.0[1]]), - x4([a.0[2], b.0[2], c.0[2], d.0[2]]), - x4([a.0[3], b.0[3], c.0[3], d.0[3]]), - ) - } -} impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> { #[inline(always)] unsafe fn unpack(p: vec512_storage) -> Self { @@ -406,39 +368,30 @@ impl<W: BSwap + Copy> BSwap for x4<W> { impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - let n = input.len() / 4; x4([ - W::unsafe_read_le(&input[..n]), - W::unsafe_read_le(&input[n..n * 2]), - W::unsafe_read_le(&input[n * 2..n * 3]), - W::unsafe_read_le(&input[n * 3..]), + W::unsafe_read_le(&input[0..16]), + W::unsafe_read_le(&input[16..32]), + W::unsafe_read_le(&input[32..48]), + W::unsafe_read_le(&input[48..64]), ]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - let n = input.len() / 4; - x4([ - W::unsafe_read_be(&input[..n]), - W::unsafe_read_be(&input[n..n * 2]), - W::unsafe_read_be(&input[n * 2..n * 3]), - W::unsafe_read_be(&input[n * 3..]), - ]) + x4::unsafe_read_le(input).bswap() } #[inline(always)] fn write_le(self, out: &mut [u8]) { - let n = out.len() / 4; - self.0[0].write_le(&mut out[..n]); - self.0[1].write_le(&mut out[n..n * 2]); - self.0[2].write_le(&mut out[n * 2..n * 3]); - self.0[3].write_le(&mut out[n * 3..]); + self.0[0].write_le(&mut out[0..16]); + self.0[1].write_le(&mut out[16..32]); + self.0[2].write_le(&mut out[32..48]); + self.0[3].write_le(&mut out[48..64]); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - let n = out.len() / 4; - self.0[0].write_be(&mut out[..n]); - self.0[1].write_be(&mut out[n..n * 2]); - self.0[2].write_be(&mut out[n * 2..n * 3]); - self.0[3].write_be(&mut out[n * 3..]); + self.0[0].write_be(&mut out[0..16]); + self.0[1].write_be(&mut out[16..32]); + self.0[2].write_be(&mut out[32..48]); + self.0[3].write_be(&mut out[48..64]); } } impl<W: Copy + LaneWords4> LaneWords4 for x4<W> { diff --git a/src/types.rs b/src/types.rs index f9f3bf1..a282670 100644 --- a/src/types.rs +++ b/src/types.rs @@ -71,17 +71,6 @@ pub trait Vec4<W> { fn extract(self, i: u32) -> W; fn insert(self, w: W, i: u32) -> Self; } -/// Vec4 functions which may not be implemented yet for all Vec4 types. -/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage, -/// import Vec4Ext only together with Vec4, and don't qualify its methods. -pub trait Vec4Ext<W> { - fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) - where - Self: Sized; -} -pub trait Vector<T> { - fn to_scalars(self) -> T; -} // TODO: multiples of 4 should inherit this /// A vector composed of four words; depending on their size, operations may cross lanes. @@ -123,7 +112,12 @@ pub trait u32x4<M: Machine>: { } pub trait u64x2<M: Machine>: - BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage> + BitOps64 + + Store<vec128_storage> + + ArithOps + + Vec2<u64> + + MultiLane<[u64; 2]> + + Into<vec128_storage> { } pub trait u128x1<M: Machine>: @@ -138,7 +132,6 @@ pub trait u32x4x2<M: Machine>: + MultiLane<[M::u32x4; 2]> + ArithOps + Into<vec256_storage> - + StoreBytes { } pub trait u64x2x2<M: Machine>: @@ -176,13 +169,10 @@ pub trait u32x4x4<M: Machine>: BitOps32 + Store<vec512_storage> + Vec4<M::u32x4> - + Vec4Ext<M::u32x4> - + Vector<[u32; 16]> + MultiLane<[M::u32x4; 4]> + ArithOps + LaneWords4 + Into<vec512_storage> - + StoreBytes { } pub trait u64x2x4<M: Machine>: diff --git a/src/x86_64/mod.rs b/src/x86_64/mod.rs index 937732d..d7455d0 100644 --- a/src/x86_64/mod.rs +++ b/src/x86_64/mod.rs @@ -79,7 +79,7 @@ where type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>; type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>; - type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>; + type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>; type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>; type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>; type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>; @@ -119,16 +119,16 @@ impl Store<vec128_storage> for vec128_storage { p } } -impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { +impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { #[inline(always)] - fn from(x: &'a vec128_storage) -> Self { - unsafe { &x.u32x4 } + fn into(self) -> &'a [u32; 4] { + unsafe { &self.u32x4 } } } -impl From<[u32; 4]> for vec128_storage { +impl Into<vec128_storage> for [u32; 4] { #[inline(always)] - fn from(u32x4: [u32; 4]) -> Self { - vec128_storage { u32x4 } + fn into(self) -> vec128_storage { + vec128_storage { u32x4: self } } } impl Default for vec128_storage { @@ -154,10 +154,10 @@ pub union vec256_storage { sse2: [vec128_storage; 2], avx: __m256i, } -impl From<[u64; 4]> for vec256_storage { +impl Into<vec256_storage> for [u64; 4] { #[inline(always)] - fn from(u64x4: [u64; 4]) -> Self { - vec256_storage { u64x4 } + fn into(self) -> vec256_storage { + vec256_storage { u64x4: self } } } impl Default for vec256_storage { @@ -167,11 +167,9 @@ impl Default for vec256_storage { } } impl vec256_storage { - #[inline(always)] pub fn new128(xs: [vec128_storage; 2]) -> Self { Self { sse2: xs } } - #[inline(always)] pub fn split128(self) -> [vec128_storage; 2] { unsafe { self.sse2 } } @@ -202,11 +200,9 @@ impl Default for vec512_storage { } } impl vec512_storage { - #[inline(always)] pub fn new128(xs: [vec128_storage; 4]) -> Self { Self { sse2: xs } } - #[inline(always)] pub fn split128(self) -> [vec128_storage; 4] { unsafe { self.sse2 } } @@ -221,10 +217,10 @@ impl PartialEq for vec512_storage { macro_rules! impl_into { ($storage:ident, $array:ty, $name:ident) => { - impl From<$storage> for $array { + impl Into<$array> for $storage { #[inline(always)] - fn from(vec: $storage) -> Self { - unsafe { vec.$name } + fn into(self) -> $array { + unsafe { self.$name } } } }; diff --git a/src/x86_64/sse2.rs b/src/x86_64/sse2.rs index 97197a4..bf0063f 100644 --- a/src/x86_64/sse2.rs +++ b/src/x86_64/sse2.rs @@ -189,21 +189,21 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> { rotr_32!(rotate_each_word_right7, 7); rotr_32_s3!( rotate_each_word_right8, - 0x0c0f_0e0d_080b_0a09, - 0x0407_0605_0003_0201 + 0x0c0f0e0d_080b0a09, + 0x04070605_00030201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); rotr_32_s3!( rotate_each_word_right16, - 0x0d0c_0f0e_0908_0b0a, - 0x0504_0706_0100_0302 + 0x0d0c0f0e_09080b0a, + 0x05040706_01000302 ); rotr_32!(rotate_each_word_right20, 20); rotr_32_s3!( rotate_each_word_right24, - 0x0e0d_0c0f_0a09_080b, - 0x0605_0407_0201_0003 + 0x0e0d0c0f_0a09080b, + 0x06050407_02010003 ); rotr_32!(rotate_each_word_right25, 25); } @@ -880,13 +880,6 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>; #[allow(non_camel_case_types)] pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>; -impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> { - #[inline(always)] - fn to_scalars(self) -> [u32; 16] { - unsafe { core::mem::transmute(self) } - } -} - impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI> where u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap, @@ -990,8 +983,6 @@ where Machine86<S3, S4, NI>: Machine, u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>, u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>, - u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>, - u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>, { } impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI> @@ -1013,6 +1004,14 @@ where { } +impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI> +where + u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap, + Avx2Machine<NI>: Machine, + u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>, + u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>, +{ +} impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI> where u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap, @@ -1375,78 +1374,65 @@ mod test { pub mod avx2 { #![allow(non_camel_case_types)] - use crate::soft::{x2, x4}; + use crate::soft::x4; use crate::types::*; - use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; + use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2}; use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; use core::arch::x86_64::*; use core::marker::PhantomData; use core::ops::*; #[derive(Copy, Clone)] - pub struct u32x4x2_avx2<NI> { - x: __m256i, + pub struct u32x4x4_avx2<NI> { + x: [__m256i; 2], ni: PhantomData<NI>, } - impl<NI> u32x4x2_avx2<NI> { + impl<NI> u32x4x4_avx2<NI> { #[inline(always)] - fn new(x: __m256i) -> Self { + fn new(x: [__m256i; 2]) -> Self { Self { x, ni: PhantomData } } } - impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {} - impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> { - #[inline(always)] - unsafe fn unpack(p: vec256_storage) -> Self { - Self::new(p.avx) - } - } - impl<NI> StoreBytes for u32x4x2_avx2<NI> { - #[inline(always)] - unsafe fn unsafe_read_le(input: &[u8]) -> Self { - assert_eq!(input.len(), 32); - Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) - } + impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {} + impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> { #[inline(always)] - unsafe fn unsafe_read_be(input: &[u8]) -> Self { - Self::unsafe_read_le(input).bswap() - } - #[inline(always)] - fn write_le(self, out: &mut [u8]) { - unsafe { - assert_eq!(out.len(), 32); - _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) - } - } - #[inline(always)] - fn write_be(self, out: &mut [u8]) { - self.bswap().write_le(out) + unsafe fn unpack(p: vec512_storage) -> Self { + Self::new([p.avx[0].avx, p.avx[1].avx]) } } - impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> { + impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { #[inline(always)] - fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] { + fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { unsafe { [ - u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), ] } } #[inline(always)] - fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self { - Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) + fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { + Self::new(unsafe { + [ + _mm256_setr_m128i(x[0].x, x[1].x), + _mm256_setr_m128i(x[2].x, x[3].x), + ] + }) } } - impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> { + impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { #[inline(always)] fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { unsafe { match i { - 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), - 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), + 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), + 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), + 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), + 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), _ => panic!(), } } @@ -1455,21 +1441,55 @@ pub mod avx2 { fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { Self::new(unsafe { match i { - 0 => _mm256_inserti128_si256(self.x, w.x, 0), - 1 => _mm256_inserti128_si256(self.x, w.x, 1), + 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]], + 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]], + 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)], + 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)], _ => panic!(), } }) } } - impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {} - impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {} + impl<NI> LaneWords4 for u32x4x4_avx2<NI> { + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi32(self.x[0], 0b1001_0011), + _mm256_shuffle_epi32(self.x[1], 0b1001_0011), + ] + }) + } + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi32(self.x[0], 0b0100_1110), + _mm256_shuffle_epi32(self.x[1], 0b0100_1110), + ] + }) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new(unsafe { + [ + _mm256_shuffle_epi32(self.x[0], 0b0011_1001), + _mm256_shuffle_epi32(self.x[1], 0b0011_1001), + ] + }) + } + } + impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {} + impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {} macro_rules! shuf_lane_bytes { ($name:ident, $k0:expr, $k1:expr) => { #[inline(always)] fn $name(self) -> Self { Self::new(unsafe { - _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) + [ + _mm256_shuffle_epi8(self.x[0], _mm256_set_epi64x($k0, $k1, $k0, $k1)), + _mm256_shuffle_epi8(self.x[1], _mm256_set_epi64x($k0, $k1, $k0, $k1)), + ] }) } }; @@ -1479,41 +1499,52 @@ pub mod avx2 { #[inline(always)] fn $name(self) -> Self { Self::new(unsafe { - _mm256_or_si256( - _mm256_srli_epi32(self.x, $i as i32), - _mm256_slli_epi32(self.x, 32 - $i as i32), - ) + [ + _mm256_or_si256( + _mm256_srli_epi32(self.x[0], $i as i32), + _mm256_slli_epi32(self.x[0], 32 - $i as i32), + ), + _mm256_or_si256( + _mm256_srli_epi32(self.x[1], $i as i32), + _mm256_slli_epi32(self.x[1], 32 - $i as i32), + ), + ] }) } }; } - impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> { + impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> { rotr_32!(rotate_each_word_right7, 7); shuf_lane_bytes!( rotate_each_word_right8, - 0x0c0f_0e0d_080b_0a09, - 0x0407_0605_0003_0201 + 0x0c0f0e0d_080b0a09, + 0x04070605_00030201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); shuf_lane_bytes!( rotate_each_word_right16, - 0x0d0c_0f0e_0908_0b0a, - 0x0504_0706_0100_0302 + 0x0d0c0f0e_09080b0a, + 0x05040706_01000302 ); rotr_32!(rotate_each_word_right20, 20); shuf_lane_bytes!( rotate_each_word_right24, - 0x0e0d_0c0f_0a09_080b, - 0x0605_0407_0201_0003 + 0x0e0d0c0f_0a09080b, + 0x06050407_02010003 ); rotr_32!(rotate_each_word_right25, 25); } - impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {} - impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage { + impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {} + impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage { #[inline(always)] - fn from(x: u32x4x2_avx2<NI>) -> Self { - Self { avx: x.x } + fn from(x: u32x4x4_avx2<NI>) -> Self { + Self { + avx: [ + vec256_storage { avx: x.x[0] }, + vec256_storage { avx: x.x[1] }, + ], + } } } @@ -1530,172 +1561,55 @@ pub mod avx2 { } }; } - impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); - impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); - impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); - impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); + impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor); + impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor); + impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand); + impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add); - macro_rules! impl_bitop { + macro_rules! impl_bitop_x2 { ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { impl<NI> $Op for $vec<NI> { type Output = Self; #[inline(always)] fn $op_fn(self, rhs: Self) -> Self::Output { - Self::new(unsafe { $impl_fn(self.x, rhs.x) }) + Self::new(unsafe { + [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])] + }) } } }; } - impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); - impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); - impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); - impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); - impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); + impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256); + impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256); + impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256); + impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256); + impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32); - impl<NI> Not for u32x4x2_avx2<NI> { + impl<NI> Not for u32x4x4_avx2<NI> { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { unsafe { let f = _mm256_set1_epi8(-0x7f); - Self::new(f) ^ self + Self::new([f, f]) ^ self } } } - impl<NI> BSwap for u32x4x2_avx2<NI> { + impl<NI> BSwap for u32x4x4_avx2<NI> { shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); } - impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI> + impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> where NI: Copy, { #[inline(always)] - fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self { - Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) - } - } - - impl<NI> LaneWords4 for u32x4x2_avx2<NI> { - #[inline(always)] - fn shuffle_lane_words1230(self) -> Self { - Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) - } - #[inline(always)] - fn shuffle_lane_words2301(self) -> Self { - Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) - } - #[inline(always)] - fn shuffle_lane_words3012(self) -> Self { - Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) - } - } - - /////////////////////////////////////////////////////////////////////////////////////////// - - pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>; - impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {} - - impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> { - #[inline(always)] - unsafe fn unpack(p: vec512_storage) -> Self { - Self::new([ - u32x4x2_avx2::unpack(p.avx[0]), - u32x4x2_avx2::unpack(p.avx[1]), - ]) - } - } - impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> { - #[inline(always)] - fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] { - let [a, b] = self.0[0].to_lanes(); - let [c, d] = self.0[1].to_lanes(); - [a, b, c, d] - } - #[inline(always)] - fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self { - let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); - let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); - Self::new([ab, cd]) - } - } - impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { - #[inline(always)] - fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> { - match i { - 0 => self.0[0].extract(0), - 1 => self.0[0].extract(1), - 2 => self.0[1].extract(0), - 3 => self.0[1].extract(1), - _ => panic!(), - } - } - #[inline(always)] - fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self { - Self::new(match i { - 0 | 1 => [self.0[0].insert(w, i), self.0[1]], - 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], - _ => panic!(), - }) - } - } - impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> { - #[inline(always)] - fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { - /* - * a00:a01 a10:a11 - * b00:b01 b10:b11 - * c00:c01 c10:c11 - * d00:d01 d10:d11 - * => - * a00:b00 c00:d00 - * a01:b01 c01:d01 - * a10:b10 c10:d10 - * a11:b11 c11:d11 - */ - unsafe { - let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); - let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); - let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); - let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); - let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); - let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); - let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); - let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); - ( - Self::new([ab00, cd00]), - Self::new([ab01, cd01]), - Self::new([ab10, cd10]), - Self::new([ab11, cd11]), - ) - } - } - } - impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> { - #[inline(always)] - fn to_scalars(self) -> [u32; 16] { - unsafe { core::mem::transmute(self) } - } - } - impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage { - #[inline(always)] - fn from(x: u32x4x4_avx2<NI>) -> Self { - Self { - avx: [ - vec256_storage { avx: x.0[0].x }, - vec256_storage { avx: x.0[1].x }, - ], - } - } - } - impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> { - #[inline(always)] fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self { Self::new(unsafe { [ - u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), - u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), + _mm256_setr_m128i(x.0[0].x, x.0[1].x), + _mm256_setr_m128i(x.0[2].x, x.0[3].x), ] }) } |