diff --git a/.gitignore b/.gitignore index 04a4061d35..c16125ed70 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ Cargo.lock .*.swp -/target -tags +target +tags \ No newline at end of file diff --git a/.travis.yml b/.travis.yml index f97609447c..95be95f59f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,8 @@ matrix: - os: osx env: TARGET=x86_64-apple-darwin NO_ADD=1 script: ci/run.sh - - install: true + - env: DOCUMENTATION + install: true script: ci/dox.sh - env: RUSTFMT=On TARGET=x86_64-unknown-linux-gnu NO_ADD=1 script: | diff --git a/Cargo.toml b/Cargo.toml index cd4560406b..b64b2f8e67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,15 +2,17 @@ name = "stdsimd" version = "0.0.3" authors = ["Andrew Gallant "] -description = "Experiments for adding SIMD support to Rust's standard library." +description = "SIMD support in Rust's standard library." documentation = "https://docs.rs/stdsimd" homepage = "https://github.com/BurntSushi/stdsimd" repository = "https://github.com/BurntSushi/stdsimd" readme = "README.md" keywords = ["std", "simd", "intrinsics"] -categories = ["hardware-support", "no-std"] +categories = ["hardware-support"] license = "MIT/Apache-2.0" +[workspace] + [badges] travis-ci = { repository = "BurntSushi/stdsimd" } appveyor = { repository = "BurntSushi/stdsimd" } @@ -18,6 +20,9 @@ is-it-maintained-issue-resolution = { repository = "BurntSushi/stdsimd" } is-it-maintained-open-issues = { repository = "BurntSushi/stdsimd" } maintenance = { status = "experimental" } +[dependencies] +coresimd = { version = "0.0.3", path = "coresimd/" } + [profile.release] debug = true opt-level = 3 @@ -26,15 +31,9 @@ opt-level = 3 debug = true opt-level = 3 -[dev-dependencies] -stdsimd-test = { version = "0.*", path = "stdsimd-test" } -cupid = "0.5.0" - [features] -std = [] - -# Internal-only: denies all warnings. 
-strict = [] -# Internal-only: enables only those intrinsics supported by Intel's +# Internal-usage only: denies all warnings. +strict = [ "coresimd/strict" ] +# Internal-usage only: enables only those intrinsics supported by Intel's # Software Development Environment (SDE). -intel_sde = [] \ No newline at end of file +intel_sde = [ "coresimd/intel_sde" ] diff --git a/ci/dox.sh b/ci/dox.sh index b8998bb15e..3bcb7bcf9f 100755 --- a/ci/dox.sh +++ b/ci/dox.sh @@ -22,7 +22,10 @@ dox() { rm -rf target/doc/$arch mkdir target/doc/$arch - rustdoc --target $target -o target/doc/$arch src/lib.rs --crate-name stdsimd + cargo clean + cargo build --target $target + + rustdoc --target $target -o target/doc/$arch src/lib.rs --crate-name stdsimd --library-path target/$target/debug/deps } dox i686 i686-unknown-linux-gnu diff --git a/ci/run.sh b/ci/run.sh index e2326b2d9b..51734e5fce 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -16,14 +16,15 @@ case ${TARGET} in esac FEATURES="strict,$FEATURES" -FEATURES_STD="$std,${FEATURES}" echo "RUSTFLAGS=${RUSTFLAGS}" echo "FEATURES=${FEATURES}" echo "OBJDUMP=${OBJDUMP}" -cargo test --target $TARGET --features $FEATURES --verbose -- --nocapture -cargo test --release --target $TARGET --features $FEATURES --verbose -- --nocapture +cargo_test() { + cmd="cargo test --all --target=$TARGET --features $FEATURES --verbose $1 -- --nocapture $2" + $cmd +} -cargo test --target $TARGET --features $FEATURES_STD --verbose -- --nocapture -cargo test --release --target $TARGET --features $FEATURES_STD --verbose -- --nocapture +cargo_test +cargo_test "--release" diff --git a/coresimd/Cargo.toml b/coresimd/Cargo.toml new file mode 100644 index 0000000000..52415468b9 --- /dev/null +++ b/coresimd/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "coresimd" +version = "0.0.3" +authors = ["Andrew Gallant "] +description = "SIMD support in Rust's core library." 
+documentation = "https://docs.rs/stdsimd" +homepage = "https://github.com/BurntSushi/stdsimd" +repository = "https://github.com/BurntSushi/stdsimd" +readme = "README.md" +keywords = ["core", "simd", "intrinsics"] +categories = ["hardware-support", "no-std"] +license = "MIT/Apache-2.0" + +[badges] +travis-ci = { repository = "BurntSushi/stdsimd" } +appveyor = { repository = "BurntSushi/stdsimd" } +is-it-maintained-issue-resolution = { repository = "BurntSushi/stdsimd" } +is-it-maintained-open-issues = { repository = "BurntSushi/stdsimd" } +maintenance = { status = "experimental" } + +[dev-dependencies] +cupid = "0.5.0" +stdsimd-test = { version = "0.*", path = "../stdsimd-test" } + +[features] +# Internal-usage only: denies all warnings. +strict = [] +# Internal-usage only: enables only those intrinsics supported by Intel's +# Software Development Environment (SDE). +intel_sde = [] \ No newline at end of file diff --git a/coresimd/LICENSE-APACHE b/coresimd/LICENSE-APACHE new file mode 120000 index 0000000000..965b606f33 --- /dev/null +++ b/coresimd/LICENSE-APACHE @@ -0,0 +1 @@ +../LICENSE-APACHE \ No newline at end of file diff --git a/coresimd/LICENSE-MIT b/coresimd/LICENSE-MIT new file mode 120000 index 0000000000..76219eb72e --- /dev/null +++ b/coresimd/LICENSE-MIT @@ -0,0 +1 @@ +../LICENSE-MIT \ No newline at end of file diff --git a/coresimd/README.md b/coresimd/README.md new file mode 120000 index 0000000000..32d46ee883 --- /dev/null +++ b/coresimd/README.md @@ -0,0 +1 @@ +../README.md \ No newline at end of file diff --git a/coresimd/rustfmt.toml b/coresimd/rustfmt.toml new file mode 120000 index 0000000000..39f97b043b --- /dev/null +++ b/coresimd/rustfmt.toml @@ -0,0 +1 @@ +../rustfmt.toml \ No newline at end of file diff --git a/src/aarch64/mod.rs b/coresimd/src/aarch64/mod.rs similarity index 100% rename from src/aarch64/mod.rs rename to coresimd/src/aarch64/mod.rs diff --git a/src/aarch64/neon.rs 
b/coresimd/src/aarch64/neon.rs similarity index 100% rename from src/aarch64/neon.rs rename to coresimd/src/aarch64/neon.rs diff --git a/src/aarch64/v8.rs b/coresimd/src/aarch64/v8.rs similarity index 100% rename from src/aarch64/v8.rs rename to coresimd/src/aarch64/v8.rs diff --git a/src/arm/mod.rs b/coresimd/src/arm/mod.rs similarity index 100% rename from src/arm/mod.rs rename to coresimd/src/arm/mod.rs diff --git a/src/arm/neon.rs b/coresimd/src/arm/neon.rs similarity index 100% rename from src/arm/neon.rs rename to coresimd/src/arm/neon.rs diff --git a/src/arm/v6.rs b/coresimd/src/arm/v6.rs similarity index 100% rename from src/arm/v6.rs rename to coresimd/src/arm/v6.rs diff --git a/src/arm/v7.rs b/coresimd/src/arm/v7.rs similarity index 100% rename from src/arm/v7.rs rename to coresimd/src/arm/v7.rs diff --git a/coresimd/src/lib.rs b/coresimd/src/lib.rs new file mode 100644 index 0000000000..f0ce9e8178 --- /dev/null +++ b/coresimd/src/lib.rs @@ -0,0 +1,231 @@ +//! SIMD support +//! +//! This crate provides the fundamentals of supporting SIMD in Rust. This crate +//! should compile on all platforms and provide `simd` and `vendor` modules at +//! the top-level. The `simd` module contains *portable vector types* which +//! should work across all platforms and be implemented in the most efficient +//! manner possible for the platform at hand. The `vendor` module contains +//! vendor intrinsics that operate over these SIMD types, typically +//! corresponding to a particular CPU instruction +//! +//! ```rust +//! extern crate coresimd as stdsimd; +//! use stdsimd::simd::u32x4; +//! +//! fn main() { +//! let a = u32x4::new(1, 2, 3, 4); +//! let b = u32x4::splat(10); +//! assert_eq!(a + b, u32x4::new(11, 12, 13, 14)); +//! } +//! ``` +//! +//! > **Note**: This crate is *nightly only* at the moment, and requires a +//! > nightly rust toolchain to compile. +//! +//! This documentation is only for one particular architecture, you can find +//! others at: +//! +//! 
* [i686](https://rust-lang-nursery.github.io/stdsimd/i686/stdsimd/) +//! * [`x86_64`](https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/) +//! * [arm](https://rust-lang-nursery.github.io/stdsimd/arm/stdsimd/) +//! * [aarch64](https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/) +//! +//! ## Portability +//! +//! The `simd` module and its types should be portable to all platforms. The +//! runtime characteristics of these types may vary per platform and per CPU +//! feature enabled, but they should always have the most optimized +//! implementation for the target at hand. +//! +//! The `vendor` module provides no portability guarantees. The `vendor` module +//! is per CPU architecture currently and provides intrinsics corresponding to +//! functions for that particular CPU architecture. Note that the functions +//! provided in this module are intended to correspond to CPU instructions and +//! have no runtime support for whether you CPU actually supports the +//! instruction. +//! +//! CPU target feature detection is done via the `cfg_feature_enabled!` macro +//! at runtime. This macro will detect at runtime whether the specified feature +//! is available or not, returning true or false depending on the current CPU. +//! +//! ``` +//! #![feature(cfg_target_feature)] +//! +//! #[macro_use] +//! extern crate coresimd as stdsimd; +//! +//! fn main() { +//! if cfg_feature_enabled!("avx2") { +//! println!("avx2 intrinsics will work"); +//! } else { +//! println!("avx2 intrinsics will not work"); +//! // undefined behavior: may generate a `SIGILL`. +//! } +//! } +//! ``` +//! +//! After verifying that a specified feature is available, use `target_feature` +//! to enable a given feature and use the desired intrinsic. +//! +//! ```ignore +//! # #![feature(cfg_target_feature)] +//! # #![feature(target_feature)] +//! # #[macro_use] +//! # extern crate coresimd as stdsimd; +//! # fn main() { +//! # if cfg_feature_enabled!("avx2") { +//! 
// avx2 specific code may be used in this function +//! #[target_feature = "+avx2"] +//! fn and_256() { +//! // avx2 feature specific intrinsics will work here! +//! use stdsimd::vendor::{__m256i, _mm256_and_si256}; +//! +//! let a = __m256i::splat(5); +//! let b = __m256i::splat(3); +//! +//! let got = unsafe { _mm256_and_si256(a, b) }; +//! +//! assert_eq!(got, __m256i::splat(1)); +//! } +//! # and_256(); +//! # } +//! # } +//! ``` +//! +//! # Status +//! +//! This crate is intended for eventual inclusion into the standard library, +//! but some work and experimentation is needed to get there! First and +//! foremost you can help out by kicking the tires on this crate and seeing if +//! it works for your use case! Next up you can help us fill out the [vendor +//! intrinsics][vendor] to ensure that we've got all the SIMD support +//! necessary. +//! +//! The language support and status of SIMD is also still a little up in the +//! air right now, you may be interested in a few issues along these lines: +//! +//! * [Overal tracking issue for SIMD support][simd_tracking_issue] +//! * [`cfg_target_feature` tracking issue][cfg_target_feature_issue] +//! * [SIMD types currently not sound][simd_soundness_bug] +//! * [`#[target_feature]` improvements][target_feature_impr] +//! +//! [vendor]: https://github.com/rust-lang-nursery/stdsimd/issues/40 +//! [simd_tracking_issue]: https://github.com/rust-lang/rust/issues/27731 +//! [cfg_target_feature_issue]: https://github.com/rust-lang/rust/issues/29717 +//! [simd_soundness_bug]: https://github.com/rust-lang/rust/issues/44367 +//! 
[target_feature_impr]: https://github.com/rust-lang/rust/issues/44839 + +#![cfg_attr(feature = "strict", deny(warnings))] +#![allow(dead_code)] +#![allow(unused_features)] +#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, + simd_ffi, target_feature, cfg_target_feature, i128_type, asm, + const_atomic_usize_new, stmt_expr_attributes)] +#![cfg_attr(test, feature(proc_macro, test, repr_align, attr_literals))] +#![cfg_attr(feature = "cargo-clippy", + allow(inline_always, too_many_arguments, cast_sign_loss, + cast_lossless, cast_possible_wrap, + cast_possible_truncation, cast_precision_loss, + shadow_reuse, cyclomatic_complexity, similar_names, + many_single_char_names))] +#![no_std] + +#[cfg(test)] +#[macro_use] +extern crate std; + +#[cfg(test)] +extern crate stdsimd_test; + +#[cfg(test)] +extern crate test; + +/// Platform independent SIMD vector types and operations. +pub mod simd { + pub use v128::*; + pub use v256::*; + pub use v512::*; + pub use v64::*; +} + +/// Platform dependent vendor intrinsics. +pub mod vendor { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub use x86::*; + + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + pub use arm::*; + + #[cfg(target_arch = "aarch64")] + pub use aarch64::*; + + // FIXME: rust does not expose the nvptx and nvptx64 targets yet + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", + target_arch = "arm", target_arch = "aarch64")))] + pub use nvptx::*; + + #[cfg( + // x86/x86_64: + any(target_arch = "x86", target_arch = "x86_64") + )] + pub use runtime::{__unstable_detect_feature, __Feature}; +} + +#[cfg( + // x86/x86_64: + any(target_arch = "x86", target_arch = "x86_64") +)] +#[macro_use] +mod runtime; + +#[macro_use] +mod macros; +mod simd_llvm; +mod v128; +mod v256; +mod v512; +mod v64; + +/// 32-bit wide vector tpyes +mod v32 { + use simd_llvm::*; + + define_ty! { i16x2, i16, i16 } + define_impl! { i16x2, i16, 2, i16x2, x0, x1 } + define_ty! 
{ u16x2, u16, u16 } + define_impl! { u16x2, u16, 2, i16x2, x0, x1 } + + define_ty! { i8x4, i8, i8, i8, i8 } + define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 } + define_ty! { u8x4, u8, u8, u8, u8 } + define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 } + + define_casts!( + (i16x2, i64x2, as_i64x2), + (u16x2, i64x2, as_i64x2), + (i8x4, i32x4, as_i32x4), + (u8x4, i32x4, as_i32x4) + ); +} + +/// 16-bit wide vector tpyes +mod v16 { + use simd_llvm::*; + + define_ty! { i8x2, i8, i8 } + define_impl! { i8x2, i8, 2, i8x2, x0, x1 } + define_ty! { u8x2, u8, u8 } + define_impl! { u8x2, u8, 2, i8x2, x0, x1 } + + define_casts!((i8x2, i64x2, as_i64x2), (u8x2, i64x2, as_i64x2)); +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +mod x86; + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +mod arm; +#[cfg(target_arch = "aarch64")] +mod aarch64; + +mod nvptx; diff --git a/src/macros.rs b/coresimd/src/macros.rs similarity index 100% rename from src/macros.rs rename to coresimd/src/macros.rs diff --git a/src/nvptx/mod.rs b/coresimd/src/nvptx/mod.rs similarity index 100% rename from src/nvptx/mod.rs rename to coresimd/src/nvptx/mod.rs diff --git a/coresimd/src/runtime/bit.rs b/coresimd/src/runtime/bit.rs new file mode 100644 index 0000000000..42483e5225 --- /dev/null +++ b/coresimd/src/runtime/bit.rs @@ -0,0 +1,11 @@ +//! Bit manipulation utilities + +/// Sets the `bit` of `x`. +pub const fn set(x: usize, bit: u32) -> usize { + x | 1 << bit +} + +/// Tests the `bit` of `x`. +pub const fn test(x: usize, bit: u32) -> bool { + x & (1 << bit) != 0 +} diff --git a/coresimd/src/runtime/cache.rs b/coresimd/src/runtime/cache.rs new file mode 100644 index 0000000000..bb247fb531 --- /dev/null +++ b/coresimd/src/runtime/cache.rs @@ -0,0 +1,31 @@ +//! Cache of run-time feature detection + +use core::sync::atomic::{AtomicUsize, Ordering}; +use core::usize; + +use super::bit; + +/// This global variable is a bitset used to cache the features supported by +/// the +/// CPU. 
+static CACHE: AtomicUsize = AtomicUsize::new(usize::MAX); + +/// Test the `bit` of the storage. If the storage has not been initialized, +/// initializes it with the result of `f()`. +/// +/// On its first invocation, it detects the CPU features and caches them in the +/// `FEATURES` global variable as an `AtomicUsize`. +/// +/// It uses the `__Feature` variant to index into this variable as a bitset. If +/// the bit is set, the feature is enabled, and otherwise it is disabled. +/// +/// PLEASE: do not use this, it is an implementation detail subject to change. +pub fn test(bit: u32, f: F) -> bool +where + F: FnOnce() -> usize, +{ + if CACHE.load(Ordering::Relaxed) == usize::MAX { + CACHE.store(f(), Ordering::Relaxed); + } + bit::test(CACHE.load(Ordering::Relaxed), bit) +} diff --git a/coresimd/src/runtime/macros.rs b/coresimd/src/runtime/macros.rs new file mode 100644 index 0000000000..e8278bb295 --- /dev/null +++ b/coresimd/src/runtime/macros.rs @@ -0,0 +1,39 @@ +//! Run-time feature detection macros. + +/// Is a feature supported by the host CPU? +/// +/// This macro performs run-time feature detection. It returns true if the host +/// CPU in which the binary is running on supports a particular feature. +#[macro_export] +macro_rules! cfg_feature_enabled { + ($name:tt) => ( + { + #[cfg(target_feature = $name)] + { + true + } + #[cfg(not(target_feature = $name))] + { + __unstable_detect_feature!($name) + } + } + ) +} + +/// In all unsupported architectures using the macro is an error +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", + target_arch = "arm", target_arch = "aarch64")))] +#[macro_export] +#[doc(hidden)] +macro_rules! 
__unstable_detect_feature { + ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) }; +} + +#[cfg(test)] +mod tests { + #[cfg(target_arch = "x86_64")] + #[test] + fn test_macros() { + assert!(cfg_feature_enabled!("sse")); + } +} diff --git a/coresimd/src/runtime/mod.rs b/coresimd/src/runtime/mod.rs new file mode 100644 index 0000000000..6ad497f7df --- /dev/null +++ b/coresimd/src/runtime/mod.rs @@ -0,0 +1,17 @@ +//! Run-time feature detection +mod cache; +mod bit; + +#[macro_use] +mod macros; + +#[macro_use] +mod x86; +pub use self::x86::__Feature; +use self::x86::detect_features; + +/// Performs run-time feature detection. +#[doc(hidden)] +pub fn __unstable_detect_feature(x: __Feature) -> bool { + cache::test(x as u32, detect_features) +} diff --git a/src/runtime/x86.rs b/coresimd/src/runtime/x86.rs similarity index 85% rename from src/runtime/x86.rs rename to coresimd/src/runtime/x86.rs index a9d88cf8c1..54dda2eeca 100644 --- a/src/runtime/x86.rs +++ b/coresimd/src/runtime/x86.rs @@ -364,8 +364,7 @@ pub fn detect_features() -> usize { enable(extended_features_ebx, 5, __Feature::avx2); // For AVX-512 the OS also needs to support saving/restoring - // the - // extended state, only then we enable AVX-512 support: + // the extended state, only then we enable AVX-512 support: if os_avx512_support { enable(extended_features_ebx, 16, __Feature::avx512f); enable(extended_features_ebx, 17, __Feature::avx512dq); @@ -384,8 +383,8 @@ pub fn detect_features() -> usize { } } - // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = - // 1) + // Processor Extended State Enumeration Sub-leaf + // (EAX = 0DH, ECX = 1) if max_basic_leaf >= 0xd { let CpuidResult { eax: proc_extended_state1_eax, @@ -417,9 +416,10 @@ pub fn detect_features() -> usize { #[cfg(test)] mod tests { - #[cfg(feature = "std")] + extern crate cupid; + #[test] - fn runtime_detection_x86_nocapture() { + fn dump() { println!("sse: {:?}", cfg_feature_enabled!("sse")); println!("sse2: 
{:?}", cfg_feature_enabled!("sse2")); println!("sse3: {:?}", cfg_feature_enabled!("sse3")); @@ -435,10 +435,10 @@ mod tests { println!("avx512bw {:?}", cfg_feature_enabled!("avx512bw")); println!("avx512dq {:?}", cfg_feature_enabled!("avx512dq")); println!("avx512vl {:?}", cfg_feature_enabled!("avx512vl")); - println!("avx512ifma {:?}", cfg_feature_enabled!("avx512ifma")); - println!("avx512vbmi {:?}", cfg_feature_enabled!("avx512vbmi")); + println!("avx512_ifma {:?}", cfg_feature_enabled!("avx512ifma")); + println!("avx512_vbmi {:?}", cfg_feature_enabled!("avx512vbmi")); println!( - "avx512vpopcntdq {:?}", + "avx512_vpopcntdq {:?}", cfg_feature_enabled!("avx512vpopcntdq") ); println!("fma: {:?}", cfg_feature_enabled!("fma")); @@ -453,4 +453,54 @@ mod tests { println!("xsaves {:?}", cfg_feature_enabled!("xsaves")); println!("xsavec {:?}", cfg_feature_enabled!("xsavec")); } + + #[test] + fn compare_with_cupid() { + let information = cupid::master().unwrap(); + assert_eq!(cfg_feature_enabled!("sse"), information.sse()); + assert_eq!(cfg_feature_enabled!("sse2"), information.sse2()); + assert_eq!(cfg_feature_enabled!("sse3"), information.sse3()); + assert_eq!(cfg_feature_enabled!("ssse3"), information.ssse3()); + assert_eq!(cfg_feature_enabled!("sse4.1"), information.sse4_1()); + assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2()); + assert_eq!(cfg_feature_enabled!("avx"), information.avx()); + assert_eq!(cfg_feature_enabled!("avx2"), information.avx2()); + assert_eq!(cfg_feature_enabled!("avx512f"), information.avx512f()); + assert_eq!(cfg_feature_enabled!("avx512cd"), information.avx512cd()); + assert_eq!(cfg_feature_enabled!("avx512er"), information.avx512er()); + assert_eq!(cfg_feature_enabled!("avx512pf"), information.avx512pf()); + assert_eq!(cfg_feature_enabled!("avx512bw"), information.avx512bw()); + assert_eq!(cfg_feature_enabled!("avx512dq"), information.avx512dq()); + assert_eq!(cfg_feature_enabled!("avx512vl"), information.avx512vl()); + 
assert_eq!( + cfg_feature_enabled!("avx512ifma"), + information.avx512_ifma() + ); + assert_eq!( + cfg_feature_enabled!("avx512vbmi"), + information.avx512_vbmi() + ); + assert_eq!( + cfg_feature_enabled!("avx512vpopcntdq"), + information.avx512_vpopcntdq() + ); + assert_eq!(cfg_feature_enabled!("fma"), information.fma()); + assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1()); + assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2()); + assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt()); + assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a()); + assert_eq!(cfg_feature_enabled!("abm"), information.lzcnt()); + assert_eq!(cfg_feature_enabled!("tbm"), information.tbm()); + assert_eq!(cfg_feature_enabled!("lzcnt"), information.lzcnt()); + assert_eq!(cfg_feature_enabled!("xsave"), information.xsave()); + assert_eq!(cfg_feature_enabled!("xsaveopt"), information.xsaveopt()); + assert_eq!( + cfg_feature_enabled!("xsavec"), + information.xsavec_and_xrstor() + ); + assert_eq!( + cfg_feature_enabled!("xsaves"), + information.xsaves_xrstors_and_ia32_xss() + ); + } } diff --git a/src/simd_llvm.rs b/coresimd/src/simd_llvm.rs similarity index 100% rename from src/simd_llvm.rs rename to coresimd/src/simd_llvm.rs diff --git a/src/v128.rs b/coresimd/src/v128.rs similarity index 100% rename from src/v128.rs rename to coresimd/src/v128.rs diff --git a/src/v256.rs b/coresimd/src/v256.rs similarity index 100% rename from src/v256.rs rename to coresimd/src/v256.rs diff --git a/src/v512.rs b/coresimd/src/v512.rs similarity index 100% rename from src/v512.rs rename to coresimd/src/v512.rs diff --git a/src/v64.rs b/coresimd/src/v64.rs similarity index 100% rename from src/v64.rs rename to coresimd/src/v64.rs diff --git a/src/x86/i386/eflags.rs b/coresimd/src/x86/i386/eflags.rs similarity index 100% rename from src/x86/i386/eflags.rs rename to coresimd/src/x86/i386/eflags.rs diff --git a/src/x86/i386/mod.rs b/coresimd/src/x86/i386/mod.rs similarity index 
100% rename from src/x86/i386/mod.rs rename to coresimd/src/x86/i386/mod.rs diff --git a/src/x86/i586/abm.rs b/coresimd/src/x86/i586/abm.rs similarity index 98% rename from src/x86/i586/abm.rs rename to coresimd/src/x86/i586/abm.rs index a912bdded1..52dc991a84 100644 --- a/src/x86/i586/abm.rs +++ b/coresimd/src/x86/i586/abm.rs @@ -15,8 +15,7 @@ //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wikipedia_bmi]: -//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. -//! 28Advanced_Bit_Manipulation.29 +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/src/x86/i586/avx.rs b/coresimd/src/x86/i586/avx.rs similarity index 99% rename from src/x86/i586/avx.rs rename to coresimd/src/x86/i586/avx.rs index aa1e6fe99b..7df5fd6a2e 100644 --- a/src/x86/i586/avx.rs +++ b/coresimd/src/x86/i586/avx.rs @@ -326,9 +326,9 @@ pub unsafe fn _mm256_div_pd(a: f64x4, b: f64x4) -> f64x4 { /// - `0x02`: Round up, toward positive infinity. /// - `0x03`: Truncate the values. /// -/// For a complete list of options, check the LLVM docs: +/// For a complete list of options, check [the LLVM docs][llvm_docs]. 
/// -/// https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 #[inline(always)] #[target_feature = "+avx"] #[cfg_attr(test, assert_instr(vroundpd, b = 0x3))] diff --git a/src/x86/i586/avx2.rs b/coresimd/src/x86/i586/avx2.rs similarity index 92% rename from src/x86/i586/avx2.rs rename to coresimd/src/x86/i586/avx2.rs index f7a6fb894a..7bf376b95f 100644 --- a/src/x86/i586/avx2.rs +++ b/coresimd/src/x86/i586/avx2.rs @@ -695,400 +695,496 @@ pub unsafe fn _mm256_hsubs_epi16(a: i16x16, b: i16x16) -> i16x16 { phsubsw(a, b) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] -pub unsafe fn _mm_i32gather_epi32(slice: *const i32, offsets: i32x4, scale: i8) -> i32x4 { +pub unsafe fn _mm_i32gather_epi32( + slice: *const i32, offsets: i32x4, scale: i8 +) -> i32x4 { macro_rules! call { ($imm8:expr) => (pgatherdd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] -pub unsafe fn _mm_mask_i32gather_epi32(src: i32x4, slice: *const i32, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4 { +pub unsafe fn _mm_mask_i32gather_epi32( + src: i32x4, slice: *const i32, offsets: i32x4, mask: i32x4, scale: i8 +) -> i32x4 { macro_rules! call { ($imm8:expr) => (pgatherdd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] -pub unsafe fn _mm256_i32gather_epi32(slice: *const i32, offsets: i32x8, scale: i8) -> i32x8 { +pub unsafe fn _mm256_i32gather_epi32( + slice: *const i32, offsets: i32x8, scale: i8 +) -> i32x8 { macro_rules! call { ($imm8:expr) => (vpgatherdd(i32x8::splat(0), slice as *const i8, offsets, i32x8::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] -pub unsafe fn _mm256_mask_i32gather_epi32(src: i32x8, slice: *const i32, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8 { +pub unsafe fn _mm256_mask_i32gather_epi32( + src: i32x8, slice: *const i32, offsets: i32x8, mask: i32x8, scale: i8 +) -> i32x8 { macro_rules! 
call { ($imm8:expr) => (vpgatherdd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] -pub unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: i32x4, scale: i8) -> f32x4 { +pub unsafe fn _mm_i32gather_ps( + slice: *const f32, offsets: i32x4, scale: i8 +) -> f32x4 { macro_rules! call { ($imm8:expr) => (pgatherdps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] -pub unsafe fn _mm_mask_i32gather_ps(src: f32x4, slice: *const f32, offsets: i32x4, mask: f32x4, scale: i8) -> f32x4 { +pub unsafe fn _mm_mask_i32gather_ps( + src: f32x4, slice: *const f32, offsets: i32x4, mask: f32x4, scale: i8 +) -> f32x4 { macro_rules! call { ($imm8:expr) => (pgatherdps(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] -pub unsafe fn _mm256_i32gather_ps(slice: *const f32, offsets: i32x8, scale: i8) -> f32x8 { +pub unsafe fn _mm256_i32gather_ps( + slice: *const f32, offsets: i32x8, scale: i8 +) -> f32x8 { macro_rules! call { ($imm8:expr) => (vpgatherdps(f32x8::splat(0.0), slice as *const i8, offsets, f32x8::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] -pub unsafe fn _mm256_mask_i32gather_ps(src: f32x8, slice: *const f32, offsets: i32x8, mask: f32x8, scale: i8) -> f32x8 { +pub unsafe fn _mm256_mask_i32gather_ps( + src: f32x8, slice: *const f32, offsets: i32x8, mask: f32x8, scale: i8 +) -> f32x8 { macro_rules! call { ($imm8:expr) => (vpgatherdps(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] -pub unsafe fn _mm_i32gather_epi64(slice: *const i64, offsets: i32x4, scale: i8) -> i64x2 { +pub unsafe fn _mm_i32gather_epi64( + slice: *const i64, offsets: i32x4, scale: i8 +) -> i64x2 { macro_rules! 
call { ($imm8:expr) => (pgatherdq(i64x2::splat(0), slice as *const i8, offsets, i64x2::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] -pub unsafe fn _mm_mask_i32gather_epi64(src: i64x2, slice: *const i64, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2 { +pub unsafe fn _mm_mask_i32gather_epi64( + src: i64x2, slice: *const i64, offsets: i32x4, mask: i64x2, scale: i8 +) -> i64x2 { macro_rules! call { ($imm8:expr) => (pgatherdq(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] -pub unsafe fn _mm256_i32gather_epi64(slice: *const i64, offsets: i32x4, scale: i8) -> i64x4 { +pub unsafe fn _mm256_i32gather_epi64( + slice: *const i64, offsets: i32x4, scale: i8 +) -> i64x4 { macro_rules! call { ($imm8:expr) => (vpgatherdq(i64x4::splat(0), slice as *const i8, offsets, i64x4::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] -pub unsafe fn _mm256_mask_i32gather_epi64(src: i64x4, slice: *const i64, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4 { +pub unsafe fn _mm256_mask_i32gather_epi64( + src: i64x4, slice: *const i64, offsets: i32x4, mask: i64x4, scale: i8 +) -> i64x4 { macro_rules! call { ($imm8:expr) => (vpgatherdq(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] -pub unsafe fn _mm_i32gather_pd(slice: *const f64, offsets: i32x4, scale: i8) -> f64x2 { +pub unsafe fn _mm_i32gather_pd( + slice: *const f64, offsets: i32x4, scale: i8 +) -> f64x2 { macro_rules! call { ($imm8:expr) => (pgatherdpd(f64x2::splat(0.0), slice as *const i8, offsets, f64x2::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] -pub unsafe fn _mm_mask_i32gather_pd(src: f64x2, slice: *const f64, offsets: i32x4, mask: f64x2, scale: i8) -> f64x2 { +pub unsafe fn _mm_mask_i32gather_pd( + src: f64x2, slice: *const f64, offsets: i32x4, mask: f64x2, scale: i8 +) -> f64x2 { macro_rules! 
call { ($imm8:expr) => (pgatherdpd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] -pub unsafe fn _mm256_i32gather_pd(slice: *const f64, offsets: i32x4, scale: i8) -> f64x4 { +pub unsafe fn _mm256_i32gather_pd( + slice: *const f64, offsets: i32x4, scale: i8 +) -> f64x4 { macro_rules! call { ($imm8:expr) => (vpgatherdpd(f64x4::splat(0.0), slice as *const i8, offsets, f64x4::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] -pub unsafe fn _mm256_mask_i32gather_pd(src: f64x4, slice: *const f64, offsets: i32x4, mask: f64x4, scale: i8) -> f64x4 { +pub unsafe fn _mm256_mask_i32gather_pd( + src: f64x4, slice: *const f64, offsets: i32x4, mask: f64x4, scale: i8 +) -> f64x4 { macro_rules! call { ($imm8:expr) => (vpgatherdpd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] -pub unsafe fn _mm_i64gather_epi32(slice: *const i32, offsets: i64x2, scale: i8) -> i32x4 { +pub unsafe fn _mm_i64gather_epi32( + slice: *const i32, offsets: i64x2, scale: i8 +) -> i32x4 { macro_rules! call { ($imm8:expr) => (pgatherqd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] -pub unsafe fn _mm_mask_i64gather_epi32(src: i32x4, slice: *const i32, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4 { +pub unsafe fn _mm_mask_i64gather_epi32( + src: i32x4, slice: *const i32, offsets: i64x2, mask: i32x4, scale: i8 +) -> i32x4 { macro_rules! call { ($imm8:expr) => (pgatherqd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] -pub unsafe fn _mm256_i64gather_epi32(slice: *const i32, offsets: i64x4, scale: i8) -> i32x4 { +pub unsafe fn _mm256_i64gather_epi32( + slice: *const i32, offsets: i64x4, scale: i8 +) -> i32x4 { macro_rules! 
call { ($imm8:expr) => (vpgatherqd(i32x4::splat(0), slice as *const i8, offsets, i32x4::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] -pub unsafe fn _mm256_mask_i64gather_epi32(src: i32x4, slice: *const i32, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4 { +pub unsafe fn _mm256_mask_i64gather_epi32( + src: i32x4, slice: *const i32, offsets: i64x4, mask: i32x4, scale: i8 +) -> i32x4 { macro_rules! call { ($imm8:expr) => (vpgatherqd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] -pub unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: i64x2, scale: i8) -> f32x4 { +pub unsafe fn _mm_i64gather_ps( + slice: *const f32, offsets: i64x2, scale: i8 +) -> f32x4 { macro_rules! call { ($imm8:expr) => (pgatherqps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] -pub unsafe fn _mm_mask_i64gather_ps(src: f32x4, slice: *const f32, offsets: i64x2, mask: f32x4, scale: i8) -> f32x4 { +pub unsafe fn _mm_mask_i64gather_ps( + src: f32x4, slice: *const f32, offsets: i64x2, mask: f32x4, scale: i8 +) -> f32x4 { macro_rules! call { ($imm8:expr) => (pgatherqps(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] -pub unsafe fn _mm256_i64gather_ps(slice: *const f32, offsets: i64x4, scale: i8) -> f32x4 { +pub unsafe fn _mm256_i64gather_ps( + slice: *const f32, offsets: i64x4, scale: i8 +) -> f32x4 { macro_rules! call { ($imm8:expr) => (vpgatherqps(f32x4::splat(0.0), slice as *const i8, offsets, f32x4::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] -pub unsafe fn _mm256_mask_i64gather_ps(src: f32x4, slice: *const f32, offsets: i64x4, mask: f32x4, scale: i8) -> f32x4 { +pub unsafe fn _mm256_mask_i64gather_ps( + src: f32x4, slice: *const f32, offsets: i64x4, mask: f32x4, scale: i8 +) -> f32x4 { macro_rules! 
call { ($imm8:expr) => (vpgatherqps(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] -pub unsafe fn _mm_i64gather_epi64(slice: *const i64, offsets: i64x2, scale: i8) -> i64x2 { +pub unsafe fn _mm_i64gather_epi64( + slice: *const i64, offsets: i64x2, scale: i8 +) -> i64x2 { macro_rules! call { ($imm8:expr) => (pgatherqq(i64x2::splat(0), slice as *const i8, offsets, i64x2::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] -pub unsafe fn _mm_mask_i64gather_epi64(src: i64x2, slice: *const i64, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2 { +pub unsafe fn _mm_mask_i64gather_epi64( + src: i64x2, slice: *const i64, offsets: i64x2, mask: i64x2, scale: i8 +) -> i64x2 { macro_rules! call { ($imm8:expr) => (pgatherqq(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] -pub unsafe fn _mm256_i64gather_epi64(slice: *const i64, offsets: i64x4, scale: i8) -> i64x4 { +pub unsafe fn _mm256_i64gather_epi64( + slice: *const i64, offsets: i64x4, scale: i8 +) -> i64x4 { macro_rules! call { ($imm8:expr) => (vpgatherqq(i64x4::splat(0), slice as *const i8, offsets, i64x4::splat(-1), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] -pub unsafe fn _mm256_mask_i64gather_epi64(src: i64x4, slice: *const i64, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4 { +pub unsafe fn _mm256_mask_i64gather_epi64( + src: i64x4, slice: *const i64, offsets: i64x4, mask: i64x4, scale: i8 +) -> i64x4 { macro_rules! call { ($imm8:expr) => (vpgatherqq(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] -pub unsafe fn _mm_i64gather_pd(slice: *const f64, offsets: i64x2, scale: i8) -> f64x2 { +pub unsafe fn _mm_i64gather_pd( + slice: *const f64, offsets: i64x2, scale: i8 +) -> f64x2 { macro_rules! 
call { ($imm8:expr) => (pgatherqpd(f64x2::splat(0.0), slice as *const i8, offsets, f64x2::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] -pub unsafe fn _mm_mask_i64gather_pd(src: f64x2, slice: *const f64, offsets: i64x2, mask: f64x2, scale: i8) -> f64x2 { +pub unsafe fn _mm_mask_i64gather_pd( + src: f64x2, slice: *const f64, offsets: i64x2, mask: f64x2, scale: i8 +) -> f64x2 { macro_rules! call { ($imm8:expr) => (pgatherqpd(src, slice as *const i8, offsets, mask, $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. #[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] -pub unsafe fn _mm256_i64gather_pd(slice: *const f64, offsets: i64x4, scale: i8) -> f64x4 { +pub unsafe fn _mm256_i64gather_pd( + slice: *const f64, offsets: i64x4, scale: i8 +) -> f64x4 { macro_rules! call { ($imm8:expr) => (vpgatherqpd(f64x4::splat(0.0), slice as *const i8, offsets, f64x4::splat(-1.0), $imm8)) } constify_imm8!(scale, call) } -/// Return values from `slice` at offsets determined by `offsets * scale`, where +/// Return values from `slice` at offsets determined by `offsets * scale`, +/// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
#[inline(always)] #[target_feature = "+avx2"] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] -pub unsafe fn _mm256_mask_i64gather_pd(src: f64x4, slice: *const f64, offsets: i64x4, mask: f64x4, scale: i8) -> f64x4 { +pub unsafe fn _mm256_mask_i64gather_pd( + src: f64x4, slice: *const f64, offsets: i64x4, mask: f64x4, scale: i8 +) -> f64x4 { macro_rules! call { ($imm8:expr) => (vpgatherqpd(src, slice as *const i8, offsets, mask, $imm8)) } @@ -1578,7 +1674,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: u8x32, b: u8x32) -> u8x32 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2015,7 +2111,7 @@ pub unsafe fn _mm256_subs_epu8(a: u8x32, b: u8x32) -> u8x32 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2064,7 +2160,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: i8x32, b: i8x32) -> i8x32 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2112,7 +2208,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: i8x32, b: i8x32) -> i8x32 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2156,7 +2252,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: i16x16, b: i16x16) -> i16x16 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate 
coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2200,7 +2296,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: i16x16, b: i16x16) -> i16x16 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2239,7 +2335,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: i32x8, b: i32x8) -> i32x8 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2278,7 +2374,7 @@ pub unsafe fn _mm256_unpacklo_epi32(a: i32x8, b: i32x8) -> i32x8 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2317,7 +2413,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: i64x4, b: i64x4) -> i64x4 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("avx2") { @@ -2531,37 +2627,69 @@ extern "C" { #[link_name = "llvm.x86.avx2.permd"] fn permd(a: u32x8, b: u32x8) -> u32x8; #[link_name = "llvm.x86.avx2.gather.d.d"] - fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4; + fn pgatherdd( + src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8 + ) -> i32x4; #[link_name = "llvm.x86.avx2.gather.d.d.256"] - fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8; + fn vpgatherdd( + src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8 + ) -> i32x8; 
#[link_name = "llvm.x86.avx2.gather.d.q"] - fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2; + fn pgatherdq( + src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8 + ) -> i64x2; #[link_name = "llvm.x86.avx2.gather.d.q.256"] - fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4; + fn vpgatherdq( + src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8 + ) -> i64x4; #[link_name = "llvm.x86.avx2.gather.q.d"] - fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4; + fn pgatherqd( + src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8 + ) -> i32x4; #[link_name = "llvm.x86.avx2.gather.q.d.256"] - fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4; + fn vpgatherqd( + src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8 + ) -> i32x4; #[link_name = "llvm.x86.avx2.gather.q.q"] - fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2; + fn pgatherqq( + src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8 + ) -> i64x2; #[link_name = "llvm.x86.avx2.gather.q.q.256"] - fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4; + fn vpgatherqq( + src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8 + ) -> i64x4; #[link_name = "llvm.x86.avx2.gather.d.pd"] - fn pgatherdpd (src: f64x2, slice: *const i8, offsets: i32x4, mask: f64x2, scale: i8) -> f64x2; + fn pgatherdpd( + src: f64x2, slice: *const i8, offsets: i32x4, mask: f64x2, scale: i8 + ) -> f64x2; #[link_name = "llvm.x86.avx2.gather.d.pd.256"] - fn vpgatherdpd (src: f64x4, slice: *const i8, offsets: i32x4, mask: f64x4, scale: i8) -> f64x4; + fn vpgatherdpd( + src: f64x4, slice: *const i8, offsets: i32x4, mask: f64x4, scale: i8 + ) -> f64x4; #[link_name = "llvm.x86.avx2.gather.q.pd"] - fn pgatherqpd 
(src: f64x2, slice: *const i8, offsets: i64x2, mask: f64x2, scale: i8) -> f64x2; + fn pgatherqpd( + src: f64x2, slice: *const i8, offsets: i64x2, mask: f64x2, scale: i8 + ) -> f64x2; #[link_name = "llvm.x86.avx2.gather.q.pd.256"] - fn vpgatherqpd (src: f64x4, slice: *const i8, offsets: i64x4, mask: f64x4, scale: i8) -> f64x4; + fn vpgatherqpd( + src: f64x4, slice: *const i8, offsets: i64x4, mask: f64x4, scale: i8 + ) -> f64x4; #[link_name = "llvm.x86.avx2.gather.d.ps"] - fn pgatherdps (src: f32x4, slice: *const i8, offsets: i32x4, mask: f32x4, scale: i8) -> f32x4; + fn pgatherdps( + src: f32x4, slice: *const i8, offsets: i32x4, mask: f32x4, scale: i8 + ) -> f32x4; #[link_name = "llvm.x86.avx2.gather.d.ps.256"] - fn vpgatherdps (src: f32x8, slice: *const i8, offsets: i32x8, mask: f32x8, scale: i8) -> f32x8; + fn vpgatherdps( + src: f32x8, slice: *const i8, offsets: i32x8, mask: f32x8, scale: i8 + ) -> f32x8; #[link_name = "llvm.x86.avx2.gather.q.ps"] - fn pgatherqps (src: f32x4, slice: *const i8, offsets: i64x2, mask: f32x4, scale: i8) -> f32x4; + fn pgatherqps( + src: f32x4, slice: *const i8, offsets: i64x2, mask: f32x4, scale: i8 + ) -> f32x4; #[link_name = "llvm.x86.avx2.gather.q.ps.256"] - fn vpgatherqps (src: f32x4, slice: *const i8, offsets: i64x4, mask: f32x4, scale: i8) -> f32x4; + fn vpgatherqps( + src: f32x4, slice: *const i8, offsets: i64x4, mask: f32x4, scale: i8 + ) -> f32x4; } @@ -3998,7 +4126,11 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_i32gather_epi32(arr.as_ptr(), i32x4::new(0, 16, 32, 48), 4); + let r = avx2::_mm_i32gather_epi32( + arr.as_ptr(), + i32x4::new(0, 16, 32, 48), + 4, + ); assert_eq!(r, i32x4::new(0, 16, 32, 48)); } @@ -4009,10 +4141,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_mask_i32gather_epi32(i32x4::splat(256), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - i32x4::new(-1, -1, -1, 0), - 4); + let r = 
avx2::_mm_mask_i32gather_epi32( + i32x4::splat(256), + arr.as_ptr(), + i32x4::new(0, 16, 64, 96), + i32x4::new(-1, -1, -1, 0), + 4, + ); assert_eq!(r, i32x4::new(0, 16, 64, 256)); } @@ -4023,7 +4158,11 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_i32gather_epi32(arr.as_ptr(), i32x8::new(0, 16, 32, 48, 1, 2, 3, 4), 4); + let r = avx2::_mm256_i32gather_epi32( + arr.as_ptr(), + i32x8::new(0, 16, 32, 48, 1, 2, 3, 4), + 4, + ); assert_eq!(r, i32x8::new(0, 16, 32, 48, 1, 2, 3, 4)); } @@ -4034,10 +4173,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_mask_i32gather_epi32(i32x8::splat(256), arr.as_ptr(), - i32x8::new(0, 16, 64, 96, 0, 0, 0, 0), - i32x8::new(-1, -1, -1, 0, 0, 0, 0, 0), - 4); + let r = avx2::_mm256_mask_i32gather_epi32( + i32x8::splat(256), + arr.as_ptr(), + i32x8::new(0, 16, 64, 96, 0, 0, 0, 0), + i32x8::new(-1, -1, -1, 0, 0, 0, 0, 0), + 4, + ); assert_eq!(r, i32x8::new(0, 16, 64, 256, 256, 256, 256, 256)); } @@ -4050,7 +4192,8 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm_i32gather_ps(arr.as_ptr(), i32x4::new(0, 16, 32, 48), 4); + let r = + avx2::_mm_i32gather_ps(arr.as_ptr(), i32x4::new(0, 16, 32, 48), 4); assert_eq!(r, f32x4::new(0.0, 16.0, 32.0, 48.0)); } @@ -4063,10 +4206,13 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm_mask_i32gather_ps(f32x4::splat(256.0), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - f32x4::new(-1.0, -1.0, -1.0, 0.0), - 4); + let r = avx2::_mm_mask_i32gather_ps( + f32x4::splat(256.0), + arr.as_ptr(), + i32x4::new(0, 16, 64, 96), + f32x4::new(-1.0, -1.0, -1.0, 0.0), + 4, + ); assert_eq!(r, f32x4::new(0.0, 16.0, 64.0, 256.0)); } @@ -4079,7 +4225,11 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_i32gather_ps(arr.as_ptr(), i32x8::new(0, 16, 32, 48, 1, 2, 3, 4), 4); + let r = 
avx2::_mm256_i32gather_ps( + arr.as_ptr(), + i32x8::new(0, 16, 32, 48, 1, 2, 3, 4), + 4, + ); assert_eq!(r, f32x8::new(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0)); } @@ -4092,11 +4242,17 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_mask_i32gather_ps(f32x8::splat(256.0), arr.as_ptr(), - i32x8::new(0, 16, 64, 96, 0, 0, 0, 0), - f32x8::new(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), - 4); - assert_eq!(r, f32x8::new(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0)); + let r = avx2::_mm256_mask_i32gather_ps( + f32x8::splat(256.0), + arr.as_ptr(), + i32x8::new(0, 16, 64, 96, 0, 0, 0, 0), + f32x8::new(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0), + 4, + ); + assert_eq!( + r, + f32x8::new(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0) + ); } @@ -4107,7 +4263,11 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_i32gather_epi64(arr.as_ptr(), i32x4::new(0, 16, 0, 0), 8); + let r = avx2::_mm_i32gather_epi64( + arr.as_ptr(), + i32x4::new(0, 16, 0, 0), + 8, + ); assert_eq!(r, i64x2::new(0, 16)); } @@ -4118,10 +4278,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_mask_i32gather_epi64(i64x2::splat(256), arr.as_ptr(), - i32x4::new(16, 16, 16, 16), - i64x2::new(-1, 0), - 8); + let r = avx2::_mm_mask_i32gather_epi64( + i64x2::splat(256), + arr.as_ptr(), + i32x4::new(16, 16, 16, 16), + i64x2::new(-1, 0), + 8, + ); assert_eq!(r, i64x2::new(16, 256)); } @@ -4132,7 +4295,11 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_i32gather_epi64(arr.as_ptr(), i32x4::new(0, 16, 32, 48), 8); + let r = avx2::_mm256_i32gather_epi64( + arr.as_ptr(), + i32x4::new(0, 16, 32, 48), + 8, + ); assert_eq!(r, i64x4::new(0, 16, 32, 48)); } @@ -4143,10 +4310,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = 
avx2::_mm256_mask_i32gather_epi64(i64x4::splat(256), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - i64x4::new(-1, -1, -1, 0), - 8); + let r = avx2::_mm256_mask_i32gather_epi64( + i64x4::splat(256), + arr.as_ptr(), + i32x4::new(0, 16, 64, 96), + i64x4::new(-1, -1, -1, 0), + 8, + ); assert_eq!(r, i64x4::new(0, 16, 64, 256)); } @@ -4159,7 +4329,8 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm_i32gather_pd(arr.as_ptr(), i32x4::new(0, 16, 0, 0), 8); + let r = + avx2::_mm_i32gather_pd(arr.as_ptr(), i32x4::new(0, 16, 0, 0), 8); assert_eq!(r, f64x2::new(0.0, 16.0)); } @@ -4172,10 +4343,13 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm_mask_i32gather_pd(f64x2::splat(256.0), arr.as_ptr(), - i32x4::new(16, 16, 16, 16), - f64x2::new(-1.0, 0.0), - 8); + let r = avx2::_mm_mask_i32gather_pd( + f64x2::splat(256.0), + arr.as_ptr(), + i32x4::new(16, 16, 16, 16), + f64x2::new(-1.0, 0.0), + 8, + ); assert_eq!(r, f64x2::new(16.0, 256.0)); } @@ -4188,7 +4362,11 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_i32gather_pd(arr.as_ptr(), i32x4::new(0, 16, 32, 48), 8); + let r = avx2::_mm256_i32gather_pd( + arr.as_ptr(), + i32x4::new(0, 16, 32, 48), + 8, + ); assert_eq!(r, f64x4::new(0.0, 16.0, 32.0, 48.0)); } @@ -4201,10 +4379,13 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_mask_i32gather_pd(f64x4::splat(256.0), arr.as_ptr(), - i32x4::new(0, 16, 64, 96), - f64x4::new(-1.0, -1.0, -1.0, 0.0), - 8); + let r = avx2::_mm256_mask_i32gather_pd( + f64x4::splat(256.0), + arr.as_ptr(), + i32x4::new(0, 16, 64, 96), + f64x4::new(-1.0, -1.0, -1.0, 0.0), + 8, + ); assert_eq!(r, f64x4::new(0.0, 16.0, 64.0, 256.0)); } @@ -4226,10 +4407,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm_mask_i64gather_epi32(i32x4::splat(256), arr.as_ptr(), - i64x2::new(0, 
16), - i32x4::new(-1, 0, -1, 0), - 4); + let r = avx2::_mm_mask_i64gather_epi32( + i32x4::splat(256), + arr.as_ptr(), + i64x2::new(0, 16), + i32x4::new(-1, 0, -1, 0), + 4, + ); assert_eq!(r, i32x4::new(0, 256, 0, 0)); } @@ -4240,7 +4424,11 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_i64gather_epi32(arr.as_ptr(), i64x4::new(0, 16, 32, 48), 4); + let r = avx2::_mm256_i64gather_epi32( + arr.as_ptr(), + i64x4::new(0, 16, 32, 48), + 4, + ); assert_eq!(r, i32x4::new(0, 16, 32, 48)); } @@ -4251,10 +4439,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 4 is word-addressing - let r = avx2::_mm256_mask_i64gather_epi32(i32x4::splat(256), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - i32x4::new(-1, -1, -1, 0), - 4); + let r = avx2::_mm256_mask_i64gather_epi32( + i32x4::splat(256), + arr.as_ptr(), + i64x4::new(0, 16, 64, 96), + i32x4::new(-1, -1, -1, 0), + 4, + ); assert_eq!(r, i32x4::new(0, 16, 64, 256)); } @@ -4280,10 +4471,13 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm_mask_i64gather_ps(f32x4::splat(256.0), arr.as_ptr(), - i64x2::new(0, 16), - f32x4::new(-1.0, 0.0, -1.0, 0.0), - 4); + let r = avx2::_mm_mask_i64gather_ps( + f32x4::splat(256.0), + arr.as_ptr(), + i64x2::new(0, 16), + f32x4::new(-1.0, 0.0, -1.0, 0.0), + 4, + ); assert_eq!(r, f32x4::new(0.0, 256.0, 0.0, 0.0)); } @@ -4296,7 +4490,11 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_i64gather_ps(arr.as_ptr(), i64x4::new(0, 16, 32, 48), 4); + let r = avx2::_mm256_i64gather_ps( + arr.as_ptr(), + i64x4::new(0, 16, 32, 48), + 4, + ); assert_eq!(r, f32x4::new(0.0, 16.0, 32.0, 48.0)); } @@ -4309,10 +4507,13 @@ mod tests { j += 1.0; } // A multiplier of 4 is word-addressing for f32s - let r = avx2::_mm256_mask_i64gather_ps(f32x4::splat(256.0), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - f32x4::new(-1.0, -1.0, -1.0, 0.0), - 4); + let r = 
avx2::_mm256_mask_i64gather_ps( + f32x4::splat(256.0), + arr.as_ptr(), + i64x4::new(0, 16, 64, 96), + f32x4::new(-1.0, -1.0, -1.0, 0.0), + 4, + ); assert_eq!(r, f32x4::new(0.0, 16.0, 64.0, 256.0)); } @@ -4335,10 +4536,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm_mask_i64gather_epi64(i64x2::splat(256), arr.as_ptr(), - i64x2::new(16, 16), - i64x2::new(-1, 0), - 8); + let r = avx2::_mm_mask_i64gather_epi64( + i64x2::splat(256), + arr.as_ptr(), + i64x2::new(16, 16), + i64x2::new(-1, 0), + 8, + ); assert_eq!(r, i64x2::new(16, 256)); } @@ -4349,7 +4553,11 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_i64gather_epi64(arr.as_ptr(), i64x4::new(0, 16, 32, 48), 8); + let r = avx2::_mm256_i64gather_epi64( + arr.as_ptr(), + i64x4::new(0, 16, 32, 48), + 8, + ); assert_eq!(r, i64x4::new(0, 16, 32, 48)); } @@ -4360,10 +4568,13 @@ mod tests { arr[i as usize] = i; } // A multiplier of 8 is word-addressing for i64s - let r = avx2::_mm256_mask_i64gather_epi64(i64x4::splat(256), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - i64x4::new(-1, -1, -1, 0), - 8); + let r = avx2::_mm256_mask_i64gather_epi64( + i64x4::splat(256), + arr.as_ptr(), + i64x4::new(0, 16, 64, 96), + i64x4::new(-1, -1, -1, 0), + 8, + ); assert_eq!(r, i64x4::new(0, 16, 64, 256)); } @@ -4389,10 +4600,13 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm_mask_i64gather_pd(f64x2::splat(256.0), arr.as_ptr(), - i64x2::new(16, 16), - f64x2::new(-1.0, 0.0), - 8); + let r = avx2::_mm_mask_i64gather_pd( + f64x2::splat(256.0), + arr.as_ptr(), + i64x2::new(16, 16), + f64x2::new(-1.0, 0.0), + 8, + ); assert_eq!(r, f64x2::new(16.0, 256.0)); } @@ -4405,7 +4619,11 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_i64gather_pd(arr.as_ptr(), i64x4::new(0, 16, 32, 48), 8); + let r = avx2::_mm256_i64gather_pd( + 
arr.as_ptr(), + i64x4::new(0, 16, 32, 48), + 8, + ); assert_eq!(r, f64x4::new(0.0, 16.0, 32.0, 48.0)); } @@ -4418,10 +4636,13 @@ mod tests { j += 1.0; } // A multiplier of 8 is word-addressing for f64s - let r = avx2::_mm256_mask_i64gather_pd(f64x4::splat(256.0), arr.as_ptr(), - i64x4::new(0, 16, 64, 96), - f64x4::new(-1.0, -1.0, -1.0, 0.0), - 8); + let r = avx2::_mm256_mask_i64gather_pd( + f64x4::splat(256.0), + arr.as_ptr(), + i64x4::new(0, 16, 64, 96), + f64x4::new(-1.0, -1.0, -1.0, 0.0), + 8, + ); assert_eq!(r, f64x4::new(0.0, 16.0, 64.0, 256.0)); } diff --git a/src/x86/i586/bmi.rs b/coresimd/src/x86/i586/bmi.rs similarity index 98% rename from src/x86/i586/bmi.rs rename to coresimd/src/x86/i586/bmi.rs index 1b71896dec..5f00a7c67f 100644 --- a/src/x86/i586/bmi.rs +++ b/coresimd/src/x86/i586/bmi.rs @@ -7,10 +7,7 @@ //! available. //! //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf -//! [wikipedia_bmi]: -//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. -//! 28Advanced_Bit_Manipulation.29 - +//! [wikipedia_bmi]: https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/src/x86/i586/bmi2.rs b/coresimd/src/x86/i586/bmi2.rs similarity index 99% rename from src/x86/i586/bmi2.rs rename to coresimd/src/x86/i586/bmi2.rs index 0e1c81a88a..f32778063a 100644 --- a/src/x86/i586/bmi2.rs +++ b/coresimd/src/x86/i586/bmi2.rs @@ -8,8 +8,7 @@ //! //! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf //! [wikipedia_bmi]: -//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. -//! 28Advanced_Bit_Manipulation.29 +//! 
https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/src/x86/i586/cpuid.rs b/coresimd/src/x86/i586/cpuid.rs similarity index 100% rename from src/x86/i586/cpuid.rs rename to coresimd/src/x86/i586/cpuid.rs diff --git a/src/x86/i586/mod.rs b/coresimd/src/x86/i586/mod.rs similarity index 100% rename from src/x86/i586/mod.rs rename to coresimd/src/x86/i586/mod.rs diff --git a/src/x86/i586/sse.rs b/coresimd/src/x86/i586/sse.rs similarity index 99% rename from src/x86/i586/sse.rs rename to coresimd/src/x86/i586/sse.rs index 6891d4ea64..f5533f5f78 100644 --- a/src/x86/i586/sse.rs +++ b/coresimd/src/x86/i586/sse.rs @@ -6,7 +6,6 @@ use core::ptr; use simd_llvm::simd_shuffle4; use v128::*; use v64::f32x2; -use x86::c_void; #[cfg(test)] use stdsimd_test::assert_instr; @@ -885,7 +884,7 @@ pub unsafe fn _mm_movemask_ps(a: f32x4) -> i32 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # // The real main function /// # fn main() { @@ -937,7 +936,7 @@ pub unsafe fn _mm_loadh_pi(a: f32x4, p: *const f32) -> f32x4 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # // The real main function /// # fn main() { @@ -1572,7 +1571,7 @@ pub const _MM_HINT_NTA: i8 = 0; #[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))] #[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))] #[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))] -pub unsafe fn _mm_prefetch(p: *const c_void, strategy: i8) { +pub unsafe fn _mm_prefetch(p: *const u8, strategy: i8) { // The `strategy` must be a compile-time constant, so we use a short form // of `constify_imm8!` for now. 
// We use the `llvm.prefetch` instrinsic with `rw` = 0 (read), and @@ -1684,7 +1683,7 @@ extern "C" { #[link_name = "llvm.x86.sse.ldmxcsr"] fn ldmxcsr(p: *const i8); #[link_name = "llvm.prefetch"] - fn prefetch(p: *const c_void, rw: i32, loc: i32, ty: i32); + fn prefetch(p: *const u8, rw: i32, loc: i32, ty: i32); #[link_name = "llvm.x86.sse.cmp.ss"] fn cmpss(a: f32x4, b: f32x4, imm8: i8) -> f32x4; } diff --git a/src/x86/i586/sse2.rs b/coresimd/src/x86/i586/sse2.rs similarity index 99% rename from src/x86/i586/sse2.rs rename to coresimd/src/x86/i586/sse2.rs index a31c88ed22..19978f5400 100644 --- a/src/x86/i586/sse2.rs +++ b/coresimd/src/x86/i586/sse2.rs @@ -8,7 +8,6 @@ use core::ptr; use simd_llvm::{simd_cast, simd_shuffle16, simd_shuffle2, simd_shuffle4, simd_shuffle8}; -use x86::c_void; use x86::__m128i; use v128::*; use v64::*; @@ -29,7 +28,7 @@ pub unsafe fn _mm_pause() { #[inline(always)] #[target_feature = "+sse2"] #[cfg_attr(test, assert_instr(clflush))] -pub unsafe fn _mm_clflush(p: *mut c_void) { +pub unsafe fn _mm_clflush(p: *mut u8) { clflush(p) } @@ -1989,7 +1988,7 @@ extern "C" { #[link_name = "llvm.x86.sse2.pause"] fn pause(); #[link_name = "llvm.x86.sse2.clflush"] - fn clflush(p: *mut c_void); + fn clflush(p: *mut u8); #[link_name = "llvm.x86.sse2.lfence"] fn lfence(); #[link_name = "llvm.x86.sse2.mfence"] @@ -2142,7 +2141,6 @@ extern "C" { #[cfg(test)] mod tests { - use super::c_void; use stdsimd_test::simd_test; use test::black_box; // Used to inhibit constant-folding. 
@@ -2158,7 +2156,7 @@ mod tests { #[simd_test = "sse2"] unsafe fn _mm_clflush() { let x = 0; - sse2::_mm_clflush(&x as *const _ as *mut c_void); + sse2::_mm_clflush(&x as *const _ as *mut u8); } #[simd_test = "sse2"] diff --git a/src/x86/i586/sse3.rs b/coresimd/src/x86/i586/sse3.rs similarity index 100% rename from src/x86/i586/sse3.rs rename to coresimd/src/x86/i586/sse3.rs diff --git a/src/x86/i586/sse41.rs b/coresimd/src/x86/i586/sse41.rs similarity index 99% rename from src/x86/i586/sse41.rs rename to coresimd/src/x86/i586/sse41.rs index a87fc35708..7e51a9c3e9 100644 --- a/src/x86/i586/sse41.rs +++ b/coresimd/src/x86/i586/sse41.rs @@ -504,7 +504,7 @@ pub unsafe fn _mm_ceil_ss(a: f32x4, b: f32x4) -> f32x4 { /// Rounding is done according to the rounding parameter, which can be one of: /// /// ``` -/// use stdsimd::vendor; +/// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: /// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); @@ -533,7 +533,7 @@ pub unsafe fn _mm_round_pd(a: f64x2, rounding: i32) -> f64x2 { /// Rounding is done according to the rounding parameter, which can be one of: /// /// ``` -/// use stdsimd::vendor; +/// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: /// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); @@ -564,7 +564,7 @@ pub unsafe fn _mm_round_ps(a: f32x4, rounding: i32) -> f32x4 { /// Rounding is done according to the rounding parameter, which can be one of: /// /// ``` -/// use stdsimd::vendor; +/// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: /// (vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); @@ -595,7 +595,7 @@ pub unsafe fn _mm_round_sd(a: f64x2, b: f64x2, rounding: i32) -> f64x2 { /// Rounding is done according to the rounding parameter, which can be one of: /// /// ``` -/// use stdsimd::vendor; +/// use coresimd::vendor; /// /// // round to nearest, and suppress exceptions: /// 
(vendor::_MM_FROUND_TO_NEAREST_INT |vendor::_MM_FROUND_NO_EXC); diff --git a/src/x86/i586/sse42.rs b/coresimd/src/x86/i586/sse42.rs similarity index 99% rename from src/x86/i586/sse42.rs rename to coresimd/src/x86/i586/sse42.rs index e3bc7b7757..ce3bcdbaaa 100644 --- a/src/x86/i586/sse42.rs +++ b/coresimd/src/x86/i586/sse42.rs @@ -96,7 +96,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("sse4.2") { @@ -139,7 +139,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("sse4.2") { @@ -180,7 +180,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("sse4.2") { @@ -219,7 +219,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i8) -> u8x16 { /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("sse4.2") { @@ -392,7 +392,7 @@ pub unsafe fn _mm_cmpestrm( /// # #![feature(cfg_target_feature)] /// # #![feature(target_feature)] /// # -/// # #[macro_use] extern crate stdsimd; +/// # #[macro_use] extern crate coresimd as stdsimd; /// # /// # fn main() { /// # if cfg_feature_enabled!("sse4.2") { diff --git a/src/x86/i586/ssse3.rs 
b/coresimd/src/x86/i586/ssse3.rs similarity index 100% rename from src/x86/i586/ssse3.rs rename to coresimd/src/x86/i586/ssse3.rs diff --git a/src/x86/i586/tbm.rs b/coresimd/src/x86/i586/tbm.rs similarity index 99% rename from src/x86/i586/tbm.rs rename to coresimd/src/x86/i586/tbm.rs index b0d6253296..f42a382abf 100644 --- a/src/x86/i586/tbm.rs +++ b/coresimd/src/x86/i586/tbm.rs @@ -8,8 +8,7 @@ //! //! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf //! [wikipedia_bmi]: -//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_. -//! 28Advanced_Bit_Manipulation.29 +//! https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#ABM_.28Advanced_Bit_Manipulation.29 #[cfg(test)] use stdsimd_test::assert_instr; diff --git a/src/x86/i586/xsave.rs b/coresimd/src/x86/i586/xsave.rs similarity index 91% rename from src/x86/i586/xsave.rs rename to coresimd/src/x86/i586/xsave.rs index 45c567fba5..10003388a5 100644 --- a/src/x86/i586/xsave.rs +++ b/coresimd/src/x86/i586/xsave.rs @@ -5,14 +5,12 @@ #[cfg(test)] use stdsimd_test::assert_instr; -use x86::c_void; - #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.xsave"] fn xsave(p: *mut i8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xrstor"] - fn xrstor(p: *const c_void, hi: i32, lo: i32) -> (); + fn xrstor(p: *const u8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xsetbv"] fn xsetbv(v: i32, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xgetbv"] @@ -20,7 +18,7 @@ extern "C" { #[link_name = "llvm.x86.xsave64"] fn xsave64(p: *mut i8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xrstor64"] - fn xrstor64(p: *const c_void, hi: i32, lo: i32) -> (); + fn xrstor64(p: *const u8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xsaveopt"] fn xsaveopt(p: *mut i8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xsaveopt64"] @@ -34,9 +32,9 @@ extern "C" { #[link_name = "llvm.x86.xsaves64"] fn xsaves64(p: *mut i8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xrstors"] - fn 
xrstors(p: *const c_void, hi: i32, lo: i32) -> (); + fn xrstors(p: *const u8, hi: i32, lo: i32) -> (); #[link_name = "llvm.x86.xrstors64"] - fn xrstors64(p: *const c_void, hi: i32, lo: i32) -> (); + fn xrstors64(p: *const u8, hi: i32, lo: i32) -> (); } /// Perform a full or partial save of the enabled processor states to memory at @@ -50,7 +48,7 @@ extern "C" { #[inline(always)] #[target_feature = "+xsave"] #[cfg_attr(test, assert_instr(xsave))] -pub unsafe fn _xsave(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) -> () { xsave(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -63,7 +61,7 @@ pub unsafe fn _xsave(mem_addr: *mut c_void, save_mask: u64) -> () { #[inline(always)] #[target_feature = "+xsave"] #[cfg_attr(test, assert_instr(xrstor))] -pub unsafe fn _xrstor(mem_addr: *const c_void, rs_mask: u64) -> () { +pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) -> () { xrstor(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32); } @@ -104,7 +102,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { #[target_feature = "+xsave"] #[cfg_attr(test, assert_instr(xsave64))] #[cfg(not(target_arch = "x86"))] -pub unsafe fn _xsave64(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) -> () { xsave64(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -118,7 +116,7 @@ pub unsafe fn _xsave64(mem_addr: *mut c_void, save_mask: u64) -> () { #[target_feature = "+xsave"] #[cfg_attr(test, assert_instr(xrstor64))] #[cfg(not(target_arch = "x86"))] -pub unsafe fn _xrstor64(mem_addr: *const c_void, rs_mask: u64) -> () { +pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) -> () { xrstor64(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32); } @@ -132,7 +130,7 @@ pub unsafe fn _xrstor64(mem_addr: *const c_void, rs_mask: u64) -> () { #[inline(always)] #[target_feature = "+xsave,+xsaveopt"] #[cfg_attr(test, assert_instr(xsaveopt))] -pub 
unsafe fn _xsaveopt(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) -> () { xsaveopt(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -147,7 +145,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut c_void, save_mask: u64) -> () { #[target_feature = "+xsave,+xsaveopt"] #[cfg_attr(test, assert_instr(xsaveopt64))] #[cfg(not(target_arch = "x86"))] -pub unsafe fn _xsaveopt64(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) -> () { xsaveopt64( mem_addr as *mut i8, (save_mask >> 32) as i32, @@ -164,7 +162,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut c_void, save_mask: u64) -> () { #[inline(always)] #[target_feature = "+xsave,+xsavec"] #[cfg_attr(test, assert_instr(xsavec))] -pub unsafe fn _xsavec(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) -> () { xsavec(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -178,7 +176,7 @@ pub unsafe fn _xsavec(mem_addr: *mut c_void, save_mask: u64) -> () { #[target_feature = "+xsave,+xsavec"] #[cfg_attr(test, assert_instr(xsavec64))] #[cfg(not(target_arch = "x86"))] -pub unsafe fn _xsavec64(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) -> () { xsavec64(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -192,7 +190,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut c_void, save_mask: u64) -> () { #[inline(always)] #[target_feature = "+xsave,+xsaves"] #[cfg_attr(test, assert_instr(xsaves))] -pub unsafe fn _xsaves(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) -> () { xsaves(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -207,7 +205,7 @@ pub unsafe fn _xsaves(mem_addr: *mut c_void, save_mask: u64) -> () { #[target_feature = "+xsave,+xsaves"] #[cfg_attr(test, 
assert_instr(xsaves64))] #[cfg(not(target_arch = "x86"))] -pub unsafe fn _xsaves64(mem_addr: *mut c_void, save_mask: u64) -> () { +pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) -> () { xsaves64(mem_addr as *mut i8, (save_mask >> 32) as i32, save_mask as i32); } @@ -223,7 +221,7 @@ pub unsafe fn _xsaves64(mem_addr: *mut c_void, save_mask: u64) -> () { #[inline(always)] #[target_feature = "+xsave,+xsaves"] #[cfg_attr(test, assert_instr(xrstors))] -pub unsafe fn _xrstors(mem_addr: *const c_void, rs_mask: u64) -> () { +pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) -> () { xrstors(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32); } @@ -240,13 +238,12 @@ pub unsafe fn _xrstors(mem_addr: *const c_void, rs_mask: u64) -> () { #[target_feature = "+xsave,+xsaves"] #[cfg_attr(test, assert_instr(xrstors64))] #[cfg(not(target_arch = "x86"))] -pub unsafe fn _xrstors64(mem_addr: *const c_void, rs_mask: u64) -> () { +pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) -> () { xrstors64(mem_addr, (rs_mask >> 32) as i32, rs_mask as i32); } #[cfg(test)] mod tests { - use x86::c_void; use x86::i586::xsave; use stdsimd_test::simd_test; use std::fmt; @@ -260,8 +257,8 @@ mod tests { fn new() -> Buffer { Buffer { data: [0; 1024] } } - fn ptr(&mut self) -> *mut c_void { - &mut self.data[0] as *mut _ as *mut c_void + fn ptr(&mut self) -> *mut u8 { + &mut self.data[0] as *mut _ as *mut u8 } } diff --git a/src/x86/i686/mod.rs b/coresimd/src/x86/i686/mod.rs similarity index 100% rename from src/x86/i686/mod.rs rename to coresimd/src/x86/i686/mod.rs diff --git a/src/x86/i686/sse2.rs b/coresimd/src/x86/i686/sse2.rs similarity index 100% rename from src/x86/i686/sse2.rs rename to coresimd/src/x86/i686/sse2.rs diff --git a/src/x86/i686/sse41.rs b/coresimd/src/x86/i686/sse41.rs similarity index 100% rename from src/x86/i686/sse41.rs rename to coresimd/src/x86/i686/sse41.rs diff --git a/src/x86/i686/sse42.rs b/coresimd/src/x86/i686/sse42.rs similarity index 100% 
rename from src/x86/i686/sse42.rs rename to coresimd/src/x86/i686/sse42.rs diff --git a/src/x86/macros.rs b/coresimd/src/x86/macros.rs similarity index 100% rename from src/x86/macros.rs rename to coresimd/src/x86/macros.rs diff --git a/src/x86/mod.rs b/coresimd/src/x86/mod.rs similarity index 74% rename from src/x86/mod.rs rename to coresimd/src/x86/mod.rs index 11811b328f..fcbcdead03 100644 --- a/src/x86/mod.rs +++ b/coresimd/src/x86/mod.rs @@ -32,17 +32,3 @@ pub type __m128i = ::v128::i8x16; /// 256-bit wide signed integer vector type #[allow(non_camel_case_types)] pub type __m256i = ::v256::i8x32; - - -/// `C`'s `void` type. -#[cfg(not(feature = "std"))] -#[allow(non_camel_case_types)] -#[repr(u8)] -pub enum c_void { - #[doc(hidden)] __variant1, - #[doc(hidden)] __variant2, -} - -// FIXME: we should not depend on std for this -#[cfg(feature = "std")] -use std::os::raw::c_void; diff --git a/src/x86/x86_64/mod.rs b/coresimd/src/x86/x86_64/mod.rs similarity index 100% rename from src/x86/x86_64/mod.rs rename to coresimd/src/x86/x86_64/mod.rs diff --git a/src/x86/x86_64/sse.rs b/coresimd/src/x86/x86_64/sse.rs similarity index 100% rename from src/x86/x86_64/sse.rs rename to coresimd/src/x86/x86_64/sse.rs diff --git a/src/x86/x86_64/sse2.rs b/coresimd/src/x86/x86_64/sse2.rs similarity index 100% rename from src/x86/x86_64/sse2.rs rename to coresimd/src/x86/x86_64/sse2.rs diff --git a/src/x86/x86_64/sse42.rs b/coresimd/src/x86/x86_64/sse42.rs similarity index 100% rename from src/x86/x86_64/sse42.rs rename to coresimd/src/x86/x86_64/sse42.rs diff --git a/src/lib.rs b/src/lib.rs index 1d9345b995..37fe7b6f59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -104,139 +104,41 @@ //! The language support and status of SIMD is also still a little up in the //! air right now, you may be interested in a few issues along these lines: //! -//! * [Overal tracking issue for SIMD support] -//! (https://github.com/rust-lang/rust/issues/27731) -//! 
* [`cfg_target_feature` tracking issue] -//! (https://github.com/rust-lang/rust/issues/29717) -//! * [SIMD types currently not sound] -//! (https://github.com/rust-lang/rust/issues/44367) -//! * [`#[target_feature]` improvements] -//! (https://github.com/rust-lang/rust/issues/44839) +//! * [Overall tracking issue for SIMD support][simd_tracking_issue] +//! * [`cfg_target_feature` tracking issue][cfg_target_feature_issue] +//! * [SIMD types currently not sound][simd_soundness_bug] +//! * [`#[target_feature]` improvements][target_feature_impr] //! //! [vendor]: https://github.com/rust-lang-nursery/stdsimd/issues/40 +//! [simd_tracking_issue]: https://github.com/rust-lang/rust/issues/27731 +//! [cfg_target_feature_issue]: https://github.com/rust-lang/rust/issues/29717 +//! [simd_soundness_bug]: https://github.com/rust-lang/rust/issues/44367 +//! [target_feature_impr]: https://github.com/rust-lang/rust/issues/44839 -#![cfg_attr(feature = "strict", deny(warnings))] -#![allow(dead_code)] -#![allow(unused_features)] -#![feature(const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, - simd_ffi, target_feature, cfg_target_feature, i128_type, asm, - const_atomic_usize_new, stmt_expr_attributes)] -#![cfg_attr(test, feature(proc_macro, test, repr_align, attr_literals))] -#![cfg_attr(feature = "cargo-clippy", - allow(inline_always, too_many_arguments, cast_sign_loss, - cast_lossless, cast_possible_wrap, - cast_possible_truncation, cast_precision_loss, - shadow_reuse, cyclomatic_complexity, similar_names, - doc_markdown, many_single_char_names))] -#![no_std] +#![feature(macro_reexport, const_fn, const_atomic_usize_new)] -#[cfg(any(feature = "std", test))] -#[macro_use] -extern crate std; - -#[cfg(test)] -extern crate stdsimd_test; - -#[cfg(test)] -extern crate test; +/// We re-export run-time feature detection for those
architectures that have +/// support for it in `core`: +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[macro_reexport(cfg_feature_enabled, __unstable_detect_feature)] +extern crate coresimd; -/// Platform independent SIMD vector types and operations. -pub mod simd { - pub use v128::*; - pub use v256::*; - pub use v512::*; - pub use v64::*; -} +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +extern crate coresimd; /// Platform dependent vendor intrinsics. pub mod vendor { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub use x86::*; + pub use coresimd::vendor::*; - #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] - pub use arm::*; - - #[cfg(target_arch = "aarch64")] - pub use aarch64::*; - - // FIXME: rust does not expose the nvptx and nvptx64 targets yet - #[cfg(not(any(target_arch = "x86", target_arch = "x86_64", - target_arch = "arm", target_arch = "aarch64")))] - pub use nvptx::*; + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + pub use super::runtime::{__unstable_detect_feature, __Feature}; +} - #[cfg(any( - // x86/x86_64: - any(target_arch = "x86", target_arch = "x86_64"), - // linux + std + (arm|aarch64): - all(target_os = "linux", - feature = "std", - any(target_arch = "arm", target_arch = "aarch64")) - ))] - pub use runtime::{__unstable_detect_feature, __Feature} +/// Platform independent SIMD vector types and operations. +pub mod simd { + pub use coresimd::simd::*; } -#[cfg(any( - // x86/x86_64: - any(target_arch = "x86", target_arch = "x86_64"), - // linux + std + (arm|aarch64): - all(target_os = "linux", - feature = "std", - any(target_arch = "arm", target_arch = "aarch64")) -))] +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] #[macro_use] mod runtime; - -#[macro_use] -mod macros; -mod simd_llvm; -mod v128; -mod v256; -mod v512; -mod v64; - -/// 32-bit wide vector tpyes -mod v32 { - use simd_llvm::*; - - define_ty! { i16x2, i16, i16 } - define_impl!
{ i16x2, i16, 2, i16x2, x0, x1 } - define_ty! { u16x2, u16, u16 } - define_impl! { u16x2, u16, 2, i16x2, x0, x1 } - - define_ty! { i8x4, i8, i8, i8, i8 } - define_impl! { i8x4, i8, 4, i8x4, x0, x1, x2, x3 } - define_ty! { u8x4, u8, u8, u8, u8 } - define_impl! { u8x4, u8, 4, i8x4, x0, x1, x2, x3 } - - define_casts!( - (i16x2, i64x2, as_i64x2), - (u16x2, i64x2, as_i64x2), - (i8x4, i32x4, as_i32x4), - (u8x4, i32x4, as_i32x4) - ); -} - -/// 16-bit wide vector tpyes -mod v16 { - use simd_llvm::*; - - define_ty! { i8x2, i8, i8 } - define_impl! { i8x2, i8, 2, i8x2, x0, x1 } - define_ty! { u8x2, u8, u8 } - define_impl! { u8x2, u8, 2, i8x2, x0, x1 } - - define_casts!((i8x2, i64x2, as_i64x2), (u8x2, i64x2, as_i64x2)); -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -mod x86; - -#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] -mod arm; -#[cfg(target_arch = "aarch64")] -mod aarch64; - -// FIXME: rust does not expose the nvptx and nvptx64 targets yet -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", - target_arch = "arm", target_arch = "aarch64")))] -mod nvptx; diff --git a/src/runtime/cache.rs b/src/runtime/cache.rs index bb247fb531..6aab8add41 100644 --- a/src/runtime/cache.rs +++ b/src/runtime/cache.rs @@ -1,7 +1,7 @@ //! Cache of run-time feature detection -use core::sync::atomic::{AtomicUsize, Ordering}; -use core::usize; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::usize; use super::bit; diff --git a/src/runtime/linux/cpuinfo.rs b/src/runtime/linux/cpuinfo.rs index 8e4c8e066b..777be3de56 100644 --- a/src/runtime/linux/cpuinfo.rs +++ b/src/runtime/linux/cpuinfo.rs @@ -28,6 +28,7 @@ impl<'a> CpuInfoField<'a> { } } /// Does the field exist? 
+ #[cfg(test)] pub fn exists(&self) -> bool { self.0.is_some() } @@ -61,17 +62,19 @@ impl CpuInfo { pub fn field(&self, field: &str) -> CpuInfoField { for l in self.raw.lines() { if l.trim().starts_with(field) { - return CpuInfoField(l.split(": ").skip(1).next()); + return CpuInfoField::new(l.split(": ").skip(1).next()); } } CpuInfoField(None) } /// Returns the `raw` contents of `/proc/cpuinfo` + #[cfg(test)] fn raw(&self) -> &String { &self.raw } + #[cfg(test)] fn from_str(other: &str) -> Result { Ok(CpuInfo { raw: String::from(other), @@ -148,7 +151,8 @@ power management: assert!(!cpuinfo.field("flags").has("avx")); } - const ARM_CORTEX_A53: &str = r"Processor : AArch64 Processor rev 3 (aarch64) + const ARM_CORTEX_A53: &str = + r"Processor : AArch64 Processor rev 3 (aarch64) processor : 0 processor : 1 processor : 2 diff --git a/src/runtime/mod.rs b/src/runtime/mod.rs index a48b7b20ce..2ab5e246c1 100644 --- a/src/runtime/mod.rs +++ b/src/runtime/mod.rs @@ -5,31 +5,23 @@ mod bit; #[macro_use] mod macros; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[macro_use] -mod x86; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub use self::x86::__Feature; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -use self::x86::detect_features; - -#[cfg(all(target_arch = "arm", target_os = "linux", feature = "std"))] +#[cfg(all(target_arch = "arm", target_os = "linux"))] #[macro_use] mod arm; -#[cfg(all(target_arch = "arm", target_os = "linux", feature = "std"))] +#[cfg(all(target_arch = "arm", target_os = "linux"))] pub use self::arm::__Feature; -#[cfg(all(target_arch = "aarch64", target_os = "linux", feature = "std"))] +#[cfg(all(target_arch = "aarch64", target_os = "linux"))] #[macro_use] mod aarch64; -#[cfg(all(target_arch = "aarch64", target_os = "linux", feature = "std"))] +#[cfg(all(target_arch = "aarch64", target_os = "linux"))] pub use self::aarch64::__Feature; -#[cfg(all(feature = "std", target_os = "linux", +#[cfg(all(target_os = 
"linux", any(target_arch = "arm", target_arch = "aarch64")))] mod linux; -#[cfg(all(feature = "std", target_os = "linux", +#[cfg(all(target_os = "linux", any(target_arch = "arm", target_arch = "aarch64")))] pub use self::linux::detect_features; diff --git a/stdsimd-test/assert-instr-macro/Cargo.toml b/stdsimd-test/assert-instr-macro/Cargo.toml index ba66015381..6f224d1f40 100644 --- a/stdsimd-test/assert-instr-macro/Cargo.toml +++ b/stdsimd-test/assert-instr-macro/Cargo.toml @@ -5,6 +5,7 @@ authors = ["Alex Crichton "] [lib] proc-macro = true +test = false [dependencies] proc-macro2 = { version = "0.1", features = ["unstable"] } diff --git a/stdsimd-test/simd-test-macro/Cargo.toml b/stdsimd-test/simd-test-macro/Cargo.toml index b4b860d3b1..0bb9be7e7a 100644 --- a/stdsimd-test/simd-test-macro/Cargo.toml +++ b/stdsimd-test/simd-test-macro/Cargo.toml @@ -5,6 +5,7 @@ authors = ["Alex Crichton "] [lib] proc-macro = true +test = false [dependencies] proc-macro2 = { version = "0.1", features = ["unstable"] } diff --git a/stdsimd-test/src/lib.rs b/stdsimd-test/src/lib.rs index ce52ea5d27..bd497b6cd5 100644 --- a/stdsimd-test/src/lib.rs +++ b/stdsimd-test/src/lib.rs @@ -5,6 +5,8 @@ //! assertions about the disassembly of a function. 
#![feature(proc_macro)] +#![cfg_attr(feature = "cargo-clippy", + allow(missing_docs_in_private_items, print_stdout))] extern crate assert_instr_macro; extern crate backtrace; @@ -71,9 +73,10 @@ fn disassemble_myself() -> HashMap> { ); assert!(output.status.success()); - parse_otool(&str::from_utf8(&output.stdout).expect("stdout not utf8")) + parse_otool(str::from_utf8(&output.stdout).expect("stdout not utf8")) } else { - let objdump = env::var("OBJDUMP").unwrap_or("objdump".to_string()); + let objdump = + env::var("OBJDUMP").unwrap_or_else(|_| "objdump".to_string()); let output = Command::new(objdump) .arg("--disassemble") .arg(&me) @@ -86,21 +89,18 @@ fn disassemble_myself() -> HashMap> { ); assert!(output.status.success()); - parse_objdump( - &str::from_utf8(&output.stdout).expect("stdout not utf8"), - ) + parse_objdump(str::from_utf8(&output.stdout).expect("stdout not utf8")) } } fn parse_objdump(output: &str) -> HashMap> { let mut lines = output.lines(); - let expected_len = if cfg!(target_arch = "arm") { - 8 - } else if cfg!(target_arch = "aarch64") { - 8 - } else { - 2 - }; + let expected_len = + if cfg!(target_arch = "arm") || cfg!(target_arch = "aarch64") { + 8 + } else { + 2 + }; for line in output.lines().take(100) { println!("{}", line); @@ -112,7 +112,8 @@ fn parse_objdump(output: &str) -> HashMap> { if !header.ends_with(">:") { continue; } - let start = header.find("<").unwrap(); + let start = header.find('<') + .expect(&format!("\"<\" not found in symbol pattern of the form \"$hex_addr <$name>\": {}", header)); let symbol = &header[start + 1..header.len() - 2]; let mut instructions = Vec::new(); @@ -136,13 +137,13 @@ fn parse_objdump(output: &str) -> HashMap> { } ret.entry(normalize(symbol)) - .or_insert(Vec::new()) + .or_insert_with(Vec::new) .push(Function { instrs: instructions, }); } - return ret; + ret } fn parse_otool(output: &str) -> HashMap> { @@ -154,13 +155,9 @@ fn parse_otool(output: &str) -> HashMap> { let mut ret = HashMap::new(); let mut 
cached_header = None; - loop { - let header = match cached_header.take().or_else(|| lines.next()) { - Some(header) => header, - None => break, - }; + while let Some(header) = cached_header.take().or_else(|| lines.next()) { // symbols should start with `$symbol:` - if !header.ends_with(":") { + if !header.ends_with(':') { continue; } // strip the leading underscore and the trailing colon @@ -168,7 +165,7 @@ fn parse_otool(output: &str) -> HashMap> { let mut instructions = Vec::new(); while let Some(instruction) = lines.next() { - if instruction.ends_with(":") { + if instruction.ends_with(':') { cached_header = Some(instruction); break; } @@ -184,13 +181,13 @@ fn parse_otool(output: &str) -> HashMap> { } ret.entry(normalize(symbol)) - .or_insert(Vec::new()) + .or_insert_with(Vec::new) .push(Function { instrs: instructions, }); } - return ret; + ret } fn parse_dumpbin(output: &str) -> HashMap> { @@ -202,13 +199,9 @@ fn parse_dumpbin(output: &str) -> HashMap> { let mut ret = HashMap::new(); let mut cached_header = None; - loop { - let header = match cached_header.take().or_else(|| lines.next()) { - Some(header) => header, - None => break, - }; + while let Some(header) = cached_header.take().or_else(|| lines.next()) { // symbols should start with `$symbol:` - if !header.ends_with(":") { + if !header.ends_with(':') { continue; } // strip the trailing colon @@ -239,13 +232,13 @@ fn parse_dumpbin(output: &str) -> HashMap> { } ret.entry(normalize(symbol)) - .or_insert(Vec::new()) + .or_insert_with(Vec::new) .push(Function { instrs: instructions, }); } - return ret; + ret } fn normalize(symbol: &str) -> String { @@ -268,9 +261,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { sym = name.name().and_then(|s| s.as_str()).map(normalize); }); - let functions = match sym.as_ref().and_then(|s| DISASSEMBLY.get(s)) { - Some(s) => s, - None => { + let functions = + if let Some(s) = sym.as_ref().and_then(|s| DISASSEMBLY.get(s)) { + s + } else { if let Some(sym) = sym { 
println!("assumed symbol name: `{}`", sym); } @@ -279,8 +273,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { println!("\t- {}", f); } panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname); - } - }; + }; assert_eq!(functions.len(), 1); let function = &functions[0]; @@ -288,7 +281,7 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { // Look for `expected` as the first part of any instruction in this // function, returning if we do indeed find it. let mut found = false; - for instr in function.instrs.iter() { + for instr in &function.instrs { // Gets the first instruction, e.g. tzcntl in tzcntl %rax,%rax if let Some(part) = instr.parts.get(0) { // Truncates the instruction with the length of the expected @@ -308,13 +301,13 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) { // Help debug by printing out the found disassembly, and then panic as we // didn't find the instruction. - println!("disassembly for {}: ", sym.as_ref().unwrap()); + println!("disassembly for {}: ", sym.as_ref().expect("symbol not found")); for (i, instr) in function.instrs.iter().enumerate() { print!("\t{:2}: ", i); - for part in instr.parts.iter() { + for part in &instr.parts { print!("{} ", part); } - println!(""); + println!(); } if !found { diff --git a/tests/cpu-detection.rs b/tests/cpu-detection.rs index 9bbb17500f..b272b1e442 100644 --- a/tests/cpu-detection.rs +++ b/tests/cpu-detection.rs @@ -2,53 +2,21 @@ #![cfg_attr(feature = "strict", deny(warnings))] #![cfg_attr(feature = "cargo-clippy", allow(option_unwrap_used))] -extern crate cupid; - +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] #[macro_use] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] extern crate stdsimd; #[test] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -fn works() { - let information = cupid::master().unwrap(); - assert_eq!(cfg_feature_enabled!("sse"), information.sse()); - assert_eq!(cfg_feature_enabled!("sse2"), information.sse2()); 
- assert_eq!(cfg_feature_enabled!("sse3"), information.sse3()); - assert_eq!(cfg_feature_enabled!("ssse3"), information.ssse3()); - assert_eq!(cfg_feature_enabled!("sse4.1"), information.sse4_1()); - assert_eq!(cfg_feature_enabled!("sse4.2"), information.sse4_2()); - assert_eq!(cfg_feature_enabled!("avx"), information.avx()); - assert_eq!(cfg_feature_enabled!("avx2"), information.avx2()); - assert_eq!(cfg_feature_enabled!("avx512f"), information.avx512f()); - assert_eq!(cfg_feature_enabled!("avx512cd"), information.avx512cd()); - assert_eq!(cfg_feature_enabled!("avx512er"), information.avx512er()); - assert_eq!(cfg_feature_enabled!("avx512pf"), information.avx512pf()); - assert_eq!(cfg_feature_enabled!("avx512bw"), information.avx512bw()); - assert_eq!(cfg_feature_enabled!("avx512dq"), information.avx512dq()); - assert_eq!(cfg_feature_enabled!("avx512vl"), information.avx512vl()); - assert_eq!(cfg_feature_enabled!("avx512ifma"), information.avx512_ifma()); - assert_eq!(cfg_feature_enabled!("avx512vbmi"), information.avx512_vbmi()); - assert_eq!( - cfg_feature_enabled!("avx512vpopcntdq"), - information.avx512_vpopcntdq() - ); - assert_eq!(cfg_feature_enabled!("fma"), information.fma()); - assert_eq!(cfg_feature_enabled!("bmi"), information.bmi1()); - assert_eq!(cfg_feature_enabled!("bmi2"), information.bmi2()); - assert_eq!(cfg_feature_enabled!("popcnt"), information.popcnt()); - assert_eq!(cfg_feature_enabled!("sse4a"), information.sse4a()); - assert_eq!(cfg_feature_enabled!("abm"), information.lzcnt()); - assert_eq!(cfg_feature_enabled!("tbm"), information.tbm()); - assert_eq!(cfg_feature_enabled!("lzcnt"), information.lzcnt()); - assert_eq!(cfg_feature_enabled!("xsave"), information.xsave()); - assert_eq!(cfg_feature_enabled!("xsaveopt"), information.xsaveopt()); - assert_eq!( - cfg_feature_enabled!("xsavec"), - information.xsavec_and_xrstor() - ); - assert_eq!( - cfg_feature_enabled!("xsavec"), - information.xsaves_xrstors_and_ia32_xss() - ); 
+#[cfg(all(target_arch = "arm", target_os = "linux"))] +fn arm_linux() { + println!("neon: {}", cfg_feature_enabled!("neon")); + println!("pmull: {}", cfg_feature_enabled!("pmull")); +} + +#[test] +#[cfg(all(target_arch = "aarch64", target_os = "linux"))] +fn aarch64_linux() { + println!("neon: {}", cfg_feature_enabled!("neon")); + println!("asimd: {}", cfg_feature_enabled!("asimd")); + println!("pmull: {}", cfg_feature_enabled!("pmull")); }