
Commit 1175903

Merge pull request #2 from swfsql/avoid-ci-errors
Avoid ci errors
2 parents 4722a99 + 557687c commit 1175903
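
Judging from the diffs below, the merge bumps the gemm dependency, silences or fixes newer clippy lints, drops the denormal-control helpers, and reworks the cfg-gated Device impls so the crate builds cleanly in CI.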

File tree

6 files changed: +42 -54 lines

  dfdx-core/Cargo.toml
  dfdx-core/src/data/collate.rs
  dfdx-core/src/lib.rs
  dfdx-core/src/tensor/gradients.rs
  dfdx-core/src/tensor_ops/utilities/device.rs
  dfdx/examples/12-mnist.rs


dfdx-core/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ num-traits = { workspace = true }
 safetensors = { workspace = true, optional = true }
 memmap2 = { workspace = true, optional = true }
 half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
-gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
+gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
 rayon = { version = "1.7.0", optional = true }
 libm = { workspace = true }
 wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }

dfdx-core/src/data/collate.rs

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ impl<A, B> Collate for Vec<(A, B)> {
 impl<'a, A, B> Collate for Vec<&'a (A, B)> {
     type Collated = (Vec<&'a A>, Vec<&'a B>);
     fn collated(self) -> Self::Collated {
+        #[allow(clippy::map_identity)]
         self.into_iter().map(|(a, b)| (a, b)).unzip()
     }
 }
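
The `#[allow(clippy::map_identity)]` is needed because the closure only looks like an identity map: pattern-matching each `&(A, B)` item against `(a, b)` reborrows the fields, turning `&(A, B)` into `(&A, &B)` before `unzip` splits them. A minimal standalone sketch of the same pattern (the `collate` free function and `main` below are illustrative, not from the crate):

// The closure changes the item type from `&(A, B)` to `(&A, &B)`,
// even though it reads like an identity map; hence the allow.
fn collate<'a, A, B>(pairs: Vec<&'a (A, B)>) -> (Vec<&'a A>, Vec<&'a B>) {
    #[allow(clippy::map_identity)]
    pairs.into_iter().map(|(a, b)| (a, b)).unzip()
}

fn main() {
    let data = [(1, 'x'), (2, 'y')];
    let (nums, chars) = collate(data.iter().collect());
    assert_eq!(nums, vec![&1, &2]);
    assert_eq!(chars, vec![&'x', &'y']);
}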

dfdx-core/src/lib.rs

Lines changed: 0 additions & 38 deletions
@@ -128,44 +128,6 @@ pub mod prelude {
     pub use crate::tensor_ops::*;
 }
 
-/// Sets a CPU `sse` flag to flush denormal floating point numbers to zero. The opposite of this is [keep_denormals()].
-///
-/// Some resources:
-/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
-/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
-pub fn flush_denormals_to_zero() {
-    #[cfg(all(target_arch = "x86", target_feature = "sse"))]
-    {
-        use std::arch::x86::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-    {
-        use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
-    }
-}
-
-/// Sets a CPU flag to keep denormal floating point numbers. The opposite of this is [flush_denormals_to_zero()].
-///
-/// Some resources:
-/// 1. [Effects of Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/the-effects-of-using-flush-to-zero-mode?lang=en)
-/// 2. [When to use Flush-To-Zero mode](https://developer.arm.com/documentation/dui0473/c/neon-and-vfp-programming/when-to-use-flush-to-zero-mode?lang=en)
-pub fn keep_denormals() {
-    #[cfg(all(target_arch = "x86", target_feature = "sse"))]
-    {
-        use std::arch::x86::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
-    }
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-    {
-        use std::arch::x86_64::{_MM_FLUSH_ZERO_OFF, _MM_SET_FLUSH_ZERO_MODE};
-        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF) }
-    }
-}
-
 #[cfg(test)]
 pub(crate) mod tests {
     pub use num_traits::{Float, NumCast, Zero};
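
The removed `flush_denormals_to_zero()` and `keep_denormals()` helpers wrapped the SSE flush-to-zero control flag. A rough sketch, assuming a caller still wants that behavior after the removal, is to call the same `std::arch` intrinsics directly (nothing below is provided by dfdx anymore; the 32-bit x86 variant is analogous via `std::arch::x86`):

// A local stand-in for the removed helper, using the same intrinsics
// the deleted code used.
fn flush_denormals_to_zero() {
    #[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
    {
        use std::arch::x86_64::{_MM_FLUSH_ZERO_ON, _MM_SET_FLUSH_ZERO_MODE};
        unsafe { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) }
    }
}

fn main() {
    // Enable flush-to-zero before hot numeric loops, as the old
    // dfdx MNIST example did.
    flush_denormals_to_zero();
}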

dfdx-core/src/tensor/gradients.rs

Lines changed: 1 addition & 1 deletion
@@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
     #[inline]
     pub(crate) fn many_and_ref<L: Shape, R: Shape>(
         &mut self,
-        ls: &Vec<impl Tensorlike<L, E, D>>,
+        ls: &[impl Tensorlike<L, E, D>],
         r: &impl Tensorlike<R, E, D>,
     ) -> (Vec<&mut D::Vec>, &D::Vec) {
         for i in 0..ls.len() {
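
Changing `ls: &Vec<impl Tensorlike<...>>` to `ls: &[impl Tensorlike<...>]` is the standard fix for clippy's `ptr_arg` lint, and it is backwards compatible for callers because `&Vec<T>` deref-coerces to `&[T]`. A small sketch of that coercion (the `sum_lens` function is illustrative, not from the crate):

// Taking a slice instead of `&Vec` accepts vectors, arrays, and slices alike.
fn sum_lens<T>(ls: &[Vec<T>]) -> usize {
    ls.iter().map(|v| v.len()).sum()
}

fn main() {
    let tensors = vec![vec![1, 2, 3], vec![4, 5]];
    // `&tensors` is `&Vec<Vec<i32>>`, which deref-coerces to `&[Vec<i32>]`.
    assert_eq!(sum_lens(&tensors), 5);
}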

dfdx-core/src/tensor_ops/utilities/device.rs

Lines changed: 39 additions & 11 deletions
@@ -114,33 +114,61 @@ pub trait Device<E: Dtype>:
     + crate::tensor_ops::axpy::AxpyKernel<E>
 
     // conv1d
-    + super::super::conv1d::Conv1DKernel<E>
+    + NonCudnnCuda<E>
+{
+}
+
+#[cfg(feature = "cudnn")]
+pub trait NonCudnnCuda<E: Dtype> {}
+
+#[cfg(not(feature = "cudnn"))]
+pub trait NonCudnnCuda<E: Dtype>:
+    // conv1d
+    super::super::conv1d::Conv1DKernel<E>
 {
 }
 
 #[cfg(feature = "f16")]
-impl Device<f16> for crate::tensor::Cpu {}
-#[cfg(feature = "f16")]
-impl Device<AMP<f16>> for crate::tensor::Cpu {}
+mod f16_ {
+    use super::*;
+    impl Device<f16> for crate::tensor::Cpu {}
+    impl NonCudnnCuda<f16> for crate::tensor::Cpu {}
+    impl Device<AMP<f16>> for crate::tensor::Cpu {}
+    impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cpu {}
+}
 impl Device<f32> for crate::tensor::Cpu {}
+impl NonCudnnCuda<f32> for crate::tensor::Cpu {}
 impl Device<f64> for crate::tensor::Cpu {}
+impl NonCudnnCuda<f64> for crate::tensor::Cpu {}
 
 #[cfg(all(feature = "cuda", feature = "f16"))]
-impl Device<f16> for crate::tensor::Cuda {}
-#[cfg(all(feature = "cuda", feature = "f16"))]
-impl Device<AMP<f16>> for crate::tensor::Cuda {}
-#[cfg(feature = "cuda")]
-impl Device<f32> for crate::tensor::Cuda {}
+mod cuda_f16 {
+    use super::*;
+    impl Device<f16> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f16> for crate::tensor::Cuda {}
+    impl Device<AMP<f16>> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cuda {}
+}
 #[cfg(feature = "cuda")]
-impl Device<f64> for crate::tensor::Cuda {}
+mod cuda {
+    use super::*;
+    impl Device<f32> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f32> for crate::tensor::Cuda {}
+    impl Device<f64> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f64> for crate::tensor::Cuda {}
+}
 
 // TODO: How can we implement this for f16 when WGSL doesn't support f16 yet?
 // #[cfg(all(feature = "webgpu", feature = "f16"))]
 // impl Device<f16> for crate::tensor::Webgpu {}
 // #[cfg(all(feature = "webgpu", feature = "f16"))]
 // impl Device<AMP<f16>> for crate::tensor::Webgpu {}
 #[cfg(feature = "webgpu")]
-impl Device<f32> for crate::tensor::Webgpu {}
+mod webgpu {
+    use super::*;
+    impl Device<f32> for crate::tensor::Webgpu {}
+    impl NonCudnnCuda<f32> for crate::tensor::Webgpu {}
+}
 
 // TODO: How can we implement this for f64 when WGSL doesn't support f64 yet?
 // #[cfg(feature = "webgpu")]
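
The new `NonCudnnCuda<E>` supertrait lets the `Device<E>` bound stay fixed while the conv1d requirement is toggled by the `cudnn` feature: with `cudnn` enabled the trait is empty, otherwise it carries the `Conv1DKernel<E>` bound. A stripped-down sketch of the same cfg-gated supertrait pattern (all names here are simplified stand-ins, not the real dfdx items, and the `cudnn` feature would be declared in Cargo.toml):

// Device always requires NonCudnnCuda; what NonCudnnCuda requires
// depends on the feature flag.
trait Conv1DKernel {}

#[cfg(feature = "cudnn")]
trait NonCudnnCuda {}

#[cfg(not(feature = "cudnn"))]
trait NonCudnnCuda: Conv1DKernel {}

trait Device: NonCudnnCuda {}

struct Cpu;
impl Conv1DKernel for Cpu {}
impl NonCudnnCuda for Cpu {}
impl Device for Cpu {}

fn main() {
    // With `cudnn` off, `Device` transitively requires `Conv1DKernel`;
    // with `cudnn` on, that bound disappears without touching `Device`.
    fn assert_device<D: Device>(_d: &D) {}
    assert_device(&Cpu);
}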

dfdx/examples/12-mnist.rs

Lines changed: 0 additions & 3 deletions
@@ -62,9 +62,6 @@ type Mlp = (
 const BATCH_SIZE: usize = 32;
 
 fn main() {
-    // ftz substantially improves performance
-    dfdx::flush_denormals_to_zero();
-
     let mnist_path = std::env::args()
         .nth(1)
         .unwrap_or_else(|| "./datasets/MNIST/raw".to_string());
