Improve performance of spsc_queue and stream. #44963

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged: 3 commits, Oct 11, 2017
2 changes: 2 additions & 0 deletions src/libstd/lib.rs
@@ -244,6 +244,7 @@
 #![feature(allow_internal_unstable)]
 #![feature(align_offset)]
 #![feature(asm)]
+#![feature(attr_literals)]
 #![feature(box_syntax)]
 #![feature(cfg_target_has_atomic)]
 #![feature(cfg_target_thread_local)]
@@ -290,6 +291,7 @@
 #![feature(prelude_import)]
 #![feature(rand)]
 #![feature(raw)]
+#![feature(repr_align)]
 #![feature(repr_simd)]
 #![feature(rustc_attrs)]
 #![cfg_attr(not(stage0), feature(rustc_const_unstable))]
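For context on these two gates: attr_literals permitted literal arguments such as 64 inside attributes, and repr_align gated the #[repr(align(N))] attribute itself; both were nightly-only when this PR landed and have since been stabilized. A minimal sketch of the gated syntax (editorial, not part of the diff):

// Editorial sketch: what the two feature gates above enable.
// On a 2017-era nightly both gates were required; on modern stable Rust,
// #[repr(align(N))] compiles without any feature attributes.
#![feature(attr_literals)] // literal arguments in attributes, e.g. align(64)
#![feature(repr_align)]    // the #[repr(align(N))] attribute itself

#[repr(align(64))]
struct Padded(u8);

fn main() {
    assert_eq!(std::mem::align_of::<Padded>(), 64);
    assert_eq!(std::mem::size_of::<Padded>(), 64);
}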
37 changes: 37 additions & 0 deletions src/libstd/sync/mpsc/cache_aligned.rs
@@ -0,0 +1,37 @@
+// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use ops::{Deref, DerefMut};
+
+#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(align(64))]
+pub(super) struct Aligner;
+
+#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub(super) struct CacheAligned<T>(pub T, pub Aligner);

Review comment (Contributor): Nit: I think this can be just

#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(align(64))]
pub(super) struct CacheAligned<T>(pub T);


+impl<T> Deref for CacheAligned<T> {
+    type Target = T;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl<T> DerefMut for CacheAligned<T> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl<T> CacheAligned<T> {
+    pub(super) fn new(t: T) -> Self {
+        CacheAligned(t, Aligner)
+    }
+}
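A standalone layout probe (an editorial sketch, not part of the diff) showing what the wrapper achieves, and that the reviewer's suggested simplification yields the same layout: a zero-sized field with 64-byte alignment forces the containing struct up to the size and alignment of a full cache line, so two adjacent wrapped fields can never share one:

// Editorial sketch: compare the PR's wrapper with the reviewer's suggestion.
use std::mem::{align_of, size_of};

#[derive(Copy, Clone, Default)]
#[repr(align(64))]
struct Aligner; // zero-sized, but 64-byte aligned

// The PR's form: padding comes from the aligned zero-sized field.
struct CacheAligned<T>(pub T, pub Aligner);

// The reviewer's suggested form: align the wrapper type directly.
#[repr(align(64))]
struct CacheAligned2<T>(pub T);

fn main() {
    // Both wrappers round a small payload up to one full 64-byte cache line.
    assert_eq!(align_of::<CacheAligned<usize>>(), 64);
    assert_eq!(size_of::<CacheAligned<usize>>(), 64);
    assert_eq!(align_of::<CacheAligned2<usize>>(), 64);
    assert_eq!(size_of::<CacheAligned2<usize>>(), 64);
}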
2 changes: 2 additions & 0 deletions src/libstd/sync/mpsc/mod.rs
@@ -297,6 +297,8 @@ mod sync;
 mod mpsc_queue;
 mod spsc_queue;
 
+mod cache_aligned;
+
 /// The receiving half of Rust's [`channel`][] (or [`sync_channel`]) type.
 /// This half can only be owned by one thread.
 ///
153 changes: 90 additions & 63 deletions src/libstd/sync/mpsc/spsc_queue.rs
@@ -22,51 +22,71 @@ use core::cell::UnsafeCell;
 
 use sync::atomic::{AtomicPtr, AtomicUsize, Ordering};
 
+use super::cache_aligned::CacheAligned;
+
 // Node within the linked list queue of messages to send
 struct Node<T> {
     // FIXME: this could be an uninitialized T if we're careful enough, and
     // that would reduce memory usage (and be a bit faster).
     // is it worth it?
     value: Option<T>, // nullable for re-use of nodes
+    cached: bool, // This node goes into the node cache
     next: AtomicPtr<Node<T>>, // next node in the queue
 }
 
 /// The single-producer single-consumer queue. This structure is not cloneable,
 /// but it can be safely shared in an Arc if it is guaranteed that there
 /// is only one popper and one pusher touching the queue at any one point in
 /// time.
-pub struct Queue<T> {
+pub struct Queue<T, ProducerAddition=(), ConsumerAddition=()> {
     // consumer fields
+    consumer: CacheAligned<Consumer<T, ConsumerAddition>>,
+
+    // producer fields
+    producer: CacheAligned<Producer<T, ProducerAddition>>,
+}
+
+struct Consumer<T, Addition> {
     tail: UnsafeCell<*mut Node<T>>, // where to pop from
     tail_prev: AtomicPtr<Node<T>>, // where to pop from
+    cache_bound: usize, // maximum cache size
+    cached_nodes: AtomicUsize, // number of nodes marked as cachable
+    addition: Addition,
+}
 
-    // producer fields
+struct Producer<T, Addition> {
     head: UnsafeCell<*mut Node<T>>, // where to push to
     first: UnsafeCell<*mut Node<T>>, // where to get new nodes from
     tail_copy: UnsafeCell<*mut Node<T>>, // between first/tail
 
-    // Cache maintenance fields. Additions and subtractions are stored
-    // separately in order to allow them to use nonatomic addition/subtraction.
-    cache_bound: usize,
-    cache_additions: AtomicUsize,
-    cache_subtractions: AtomicUsize,
+    addition: Addition,
 }
 
-unsafe impl<T: Send> Send for Queue<T> { }
+unsafe impl<T: Send, P: Send + Sync, C: Send + Sync> Send for Queue<T, P, C> { }
 
-unsafe impl<T: Send> Sync for Queue<T> { }
+unsafe impl<T: Send, P: Send + Sync, C: Send + Sync> Sync for Queue<T, P, C> { }
 
 impl<T> Node<T> {
     fn new() -> *mut Node<T> {
         Box::into_raw(box Node {
             value: None,
+            cached: false,
             next: AtomicPtr::new(ptr::null_mut::<Node<T>>()),
         })
     }
 }
 
-impl<T> Queue<T> {
-    /// Creates a new queue.
+impl<T, ProducerAddition, ConsumerAddition> Queue<T, ProducerAddition, ConsumerAddition> {
+
+    /// Creates a new queue, with the given additional elements stored in the
+    /// producer and consumer portions of the queue.
     ///
+    /// Due to the performance implications of cache contention,
+    /// we wish to keep fields used mainly by the producer on a separate cache
+    /// line from those used by the consumer.
+    /// Since cache lines are usually 64 bytes, it is unreasonably expensive to
+    /// allocate one for small fields, so we allow users to insert additional
+    /// fields into the cache lines already allocated by this queue for the
+    /// producer and consumer.
+    ///
     /// This is unsafe as the type system doesn't enforce a single
     /// consumer-producer relationship. It also allows the consumer to `pop`
@@ -83,19 +103,28 @@ impl<T> Queue<T> {
     /// cache (if desired). If the value is 0, then the cache has
     /// no bound. Otherwise, the cache will never grow larger than
     /// `bound` (although the queue itself could be much larger).
-    pub unsafe fn new(bound: usize) -> Queue<T> {
+    pub unsafe fn with_additions(
+        bound: usize,
+        producer_addition: ProducerAddition,
+        consumer_addition: ConsumerAddition,
+    ) -> Self {
         let n1 = Node::new();
         let n2 = Node::new();
         (*n1).next.store(n2, Ordering::Relaxed);
         Queue {
-            tail: UnsafeCell::new(n2),
-            tail_prev: AtomicPtr::new(n1),
-            head: UnsafeCell::new(n2),
-            first: UnsafeCell::new(n1),
-            tail_copy: UnsafeCell::new(n1),
-            cache_bound: bound,
-            cache_additions: AtomicUsize::new(0),
-            cache_subtractions: AtomicUsize::new(0),
+            consumer: CacheAligned::new(Consumer {
+                tail: UnsafeCell::new(n2),
+                tail_prev: AtomicPtr::new(n1),
+                cache_bound: bound,
+                cached_nodes: AtomicUsize::new(0),
+                addition: consumer_addition
+            }),
+            producer: CacheAligned::new(Producer {
+                head: UnsafeCell::new(n2),
+                first: UnsafeCell::new(n1),
+                tail_copy: UnsafeCell::new(n1),
+                addition: producer_addition
+            }),
         }
     }

@@ -109,35 +138,25 @@ impl<T> Queue<T> {
             assert!((*n).value.is_none());
             (*n).value = Some(t);
             (*n).next.store(ptr::null_mut(), Ordering::Relaxed);
-            (**self.head.get()).next.store(n, Ordering::Release);
-            *self.head.get() = n;
+            (**self.producer.head.get()).next.store(n, Ordering::Release);
+            *(&self.producer.head).get() = n;
         }
     }
 
     unsafe fn alloc(&self) -> *mut Node<T> {
         // First try to see if we can consume the 'first' node for our uses.
-        // We try to avoid as many atomic instructions as possible here, so
-        // the addition to cache_subtractions is not atomic (plus we're the
-        // only one subtracting from the cache).
-        if *self.first.get() != *self.tail_copy.get() {
-            if self.cache_bound > 0 {
-                let b = self.cache_subtractions.load(Ordering::Relaxed);
-                self.cache_subtractions.store(b + 1, Ordering::Relaxed);
-            }
-            let ret = *self.first.get();
-            *self.first.get() = (*ret).next.load(Ordering::Relaxed);
+        if *self.producer.first.get() != *self.producer.tail_copy.get() {
+            let ret = *self.producer.first.get();
+            *self.producer.0.first.get() = (*ret).next.load(Ordering::Relaxed);
             return ret;
         }
         // If the above fails, then update our copy of the tail and try
         // again.
-        *self.tail_copy.get() = self.tail_prev.load(Ordering::Acquire);
-        if *self.first.get() != *self.tail_copy.get() {
-            if self.cache_bound > 0 {
-                let b = self.cache_subtractions.load(Ordering::Relaxed);
-                self.cache_subtractions.store(b + 1, Ordering::Relaxed);
-            }
-            let ret = *self.first.get();
-            *self.first.get() = (*ret).next.load(Ordering::Relaxed);
+        *self.producer.0.tail_copy.get() =
+            self.consumer.tail_prev.load(Ordering::Acquire);
+        if *self.producer.first.get() != *self.producer.tail_copy.get() {
+            let ret = *self.producer.first.get();
+            *self.producer.0.first.get() = (*ret).next.load(Ordering::Relaxed);
             return ret;
         }
         // If all of that fails, then we have to allocate a new node
@@ -153,27 +172,27 @@ impl<T> Queue<T> {
             // sentinel from where we should start popping from. Hence, look at
             // tail's next field and see if we can use it. If we do a pop, then
             // the current tail node is a candidate for going into the cache.
-            let tail = *self.tail.get();
+            let tail = *self.consumer.tail.get();
             let next = (*tail).next.load(Ordering::Acquire);
             if next.is_null() { return None }
             assert!((*next).value.is_some());
             let ret = (*next).value.take();
 
-            *self.tail.get() = next;
-            if self.cache_bound == 0 {
-                self.tail_prev.store(tail, Ordering::Release);
+            *self.consumer.0.tail.get() = next;
+            if self.consumer.cache_bound == 0 {
+                self.consumer.tail_prev.store(tail, Ordering::Release);
             } else {
-                // FIXME: this is dubious with overflow.
-                let additions = self.cache_additions.load(Ordering::Relaxed);
-                let subtractions = self.cache_subtractions.load(Ordering::Relaxed);
-                let size = additions - subtractions;
-
-                if size < self.cache_bound {
-                    self.tail_prev.store(tail, Ordering::Release);
-                    self.cache_additions.store(additions + 1, Ordering::Relaxed);
+                let cached_nodes = self.consumer.cached_nodes.load(Ordering::Relaxed);
+                if cached_nodes < self.consumer.cache_bound && !(*tail).cached {
+                    self.consumer.cached_nodes.store(cached_nodes, Ordering::Relaxed);
+                    (*tail).cached = true;
+                }
+
+                if (*tail).cached {
+                    self.consumer.tail_prev.store(tail, Ordering::Release);
                 } else {
-                    (*self.tail_prev.load(Ordering::Relaxed))
-                        .next.store(next, Ordering::Relaxed);
+                    (*self.consumer.tail_prev.load(Ordering::Relaxed))
+                        .next.store(next, Ordering::Relaxed);
                     // We have successfully erased all references to 'tail', so
                     // now we can safely drop it.
                     let _: Box<Node<T>> = Box::from_raw(tail);
@@ -194,17 +213,25 @@ impl<T> Queue<T> {
         // This is essentially the same as above with all the popping bits
         // stripped out.
         unsafe {
-            let tail = *self.tail.get();
+            let tail = *self.consumer.tail.get();
             let next = (*tail).next.load(Ordering::Acquire);
             if next.is_null() { None } else { (*next).value.as_mut() }
         }
     }
+
+    pub fn producer_addition(&self) -> &ProducerAddition {
+        &self.producer.addition
+    }
+
+    pub fn consumer_addition(&self) -> &ConsumerAddition {
+        &self.consumer.addition
+    }
 }
 
-impl<T> Drop for Queue<T> {
+impl<T, ProducerAddition, ConsumerAddition> Drop for Queue<T, ProducerAddition, ConsumerAddition> {
     fn drop(&mut self) {
         unsafe {
-            let mut cur = *self.first.get();
+            let mut cur = *self.producer.first.get();
             while !cur.is_null() {
                 let next = (*cur).next.load(Ordering::Relaxed);
                 let _n: Box<Node<T>> = Box::from_raw(cur);
@@ -224,7 +251,7 @@ mod tests {
     #[test]
     fn smoke() {
         unsafe {
-            let queue = Queue::new(0);
+            let queue = Queue::with_additions(0, (), ());
             queue.push(1);
             queue.push(2);
             assert_eq!(queue.pop(), Some(1));
@@ -241,7 +268,7 @@
     #[test]
     fn peek() {
         unsafe {
-            let queue = Queue::new(0);
+            let queue = Queue::with_additions(0, (), ());
             queue.push(vec![1]);
 
             // Ensure the borrowchecker works
@@ -264,7 +291,7 @@
     #[test]
     fn drop_full() {
         unsafe {
-            let q: Queue<Box<_>> = Queue::new(0);
+            let q: Queue<Box<_>> = Queue::with_additions(0, (), ());
             q.push(box 1);
             q.push(box 2);
         }
@@ -273,7 +300,7 @@
     #[test]
     fn smoke_bound() {
         unsafe {
-            let q = Queue::new(0);
+            let q = Queue::with_additions(0, (), ());
             q.push(1);
             q.push(2);
             assert_eq!(q.pop(), Some(1));
@@ -295,7 +322,7 @@
     }
 
     unsafe fn stress_bound(bound: usize) {
-        let q = Arc::new(Queue::new(bound));
+        let q = Arc::new(Queue::with_additions(bound, (), ()));
 
         let (tx, rx) = channel();
         let q2 = q.clone();
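To see how the reshaped API fits together, a usage sketch (editorial: Queue is private to std::sync::mpsc, so code like this only compiles inside that module, and the AtomicUsize additions are hypothetical stand-ins, not the state that stream.rs actually embeds):

// Editorial sketch: each endpoint carries one extra counter, stored on the
// cache line already allocated for that endpoint's fields.
use sync::atomic::{AtomicUsize, Ordering};

fn demo() {
    unsafe {
        // Node cache bounded at 128; producer and consumer each get an
        // addition, retrieved later through the accessor methods.
        let q = Queue::with_additions(128, AtomicUsize::new(0), AtomicUsize::new(0));

        q.push(1);
        q.producer_addition().fetch_add(1, Ordering::Relaxed);

        assert_eq!(q.pop(), Some(1));
        q.consumer_addition().fetch_add(1, Ordering::Relaxed);
    }
}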