Skip to content

Commit f411852

Browse files
committed
Refactor Windows stdio and remove stdin double buffering
1 parent cc20ed6 commit f411852

File tree

2 files changed

+178
-116
lines changed

2 files changed

+178
-116
lines changed

src/libstd/sys/windows/process.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -252,9 +252,9 @@ impl Stdio {
252252
// should still be unavailable so propagate the
253253
// INVALID_HANDLE_VALUE.
254254
Stdio::Inherit => {
255-
match stdio::get(stdio_id) {
255+
match stdio::get_handle(stdio_id) {
256256
Ok(io) => {
257-
let io = Handle::new(io.handle());
257+
let io = Handle::new(io);
258258
let ret = io.duplicate(0, true,
259259
c::DUPLICATE_SAME_ACCESS);
260260
io.into_raw();

src/libstd/sys/windows/stdio.rs

+176-114
Original file line numberDiff line numberDiff line change
@@ -1,131 +1,226 @@
11
#![unstable(issue = "0", feature = "windows_stdio")]
22

3+
use cell::Cell;
34
use cmp;
4-
use io::{self, Cursor};
5+
use io;
56
use ptr;
67
use str;
7-
use sync::Mutex;
88
use sys::c;
99
use sys::cvt;
1010
use sys::handle::Handle;
1111

12-
pub enum Output {
13-
Console(c::HANDLE),
14-
Pipe(c::HANDLE),
15-
}
16-
12+
// Don't cache handles but get them fresh for every read/write. This allows us to track changes to
13+
// the value over time (such as if a process calls `SetStdHandle` while it's running). See #40490.
1714
pub struct Stdin {
18-
utf8: Mutex<io::Cursor<Vec<u8>>>,
15+
high_surrogate: Cell<u16>,
1916
}
2017
pub struct Stdout;
2118
pub struct Stderr;
2219

23-
pub fn get(handle: c::DWORD) -> io::Result<Output> {
24-
let handle = unsafe { c::GetStdHandle(handle) };
20+
// Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see
21+
// #13304 for details).
22+
//
23+
// From MSDN (2011): "The storage for this buffer is allocated from a shared heap for the
24+
// process that is 64 KB in size. The maximum size of the buffer will depend on heap usage."
25+
//
26+
// We choose the cap at 8 KiB because libuv does the same, and it seems to be acceptable so far.
27+
const MAX_BUFFER_SIZE: usize = 8192;
28+
29+
// The standard buffer size of BufReader for Stdin should be able to hold 3x more bytes than there
30+
// are `u16`'s in MAX_BUFFER_SIZE. This ensures the read data can always be completely decoded from
31+
// UTF-16 to UTF-8.
32+
pub const STDIN_BUF_SIZE: usize = MAX_BUFFER_SIZE / 2 * 3;
33+
34+
pub fn get_handle(handle_id: c::DWORD) -> io::Result<c::HANDLE> {
35+
let handle = unsafe { c::GetStdHandle(handle_id) };
2536
if handle == c::INVALID_HANDLE_VALUE {
2637
Err(io::Error::last_os_error())
2738
} else if handle.is_null() {
2839
Err(io::Error::from_raw_os_error(c::ERROR_INVALID_HANDLE as i32))
2940
} else {
30-
let mut out = 0;
31-
match unsafe { c::GetConsoleMode(handle, &mut out) } {
32-
0 => Ok(Output::Pipe(handle)),
33-
_ => Ok(Output::Console(handle)),
34-
}
41+
Ok(handle)
3542
}
3643
}
3744

38-
fn write(handle: c::DWORD, data: &[u8]) -> io::Result<usize> {
39-
let handle = match get(handle)? {
40-
Output::Console(c) => c,
41-
Output::Pipe(p) => {
42-
let handle = Handle::new(p);
43-
let ret = handle.write(data);
44-
handle.into_raw();
45-
return ret
46-
}
47-
};
45+
fn is_console(handle: c::HANDLE) -> bool {
46+
// `GetConsoleMode` will return false (0) if this is a pipe (we don't care about the reported
47+
// mode). This will only detect Windows Console, not other terminals connected to a pipe like
48+
// MSYS, which is exactly what we need, as only Windows Console needs a conversion to UTF-16.
49+
let mut mode = 0;
50+
unsafe { c::GetConsoleMode(handle, &mut mode) != 0 }
51+
}
4852

49-
// As with stdin on windows, stdout often can't handle writes of large
50-
// sizes. For an example, see #14940. For this reason, don't try to
51-
// write the entire output buffer on windows.
52-
//
53-
// For some other references, it appears that this problem has been
54-
// encountered by others [1] [2]. We choose the number 8K just because
55-
// libuv does the same.
53+
fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
54+
let handle = get_handle(handle_id)?;
55+
if !is_console(handle) {
56+
let handle = Handle::new(handle);
57+
let ret = handle.write(data);
58+
handle.into_raw(); // Don't close the handle
59+
return ret;
60+
}
61+
62+
// As the console is meant for presenting text, we assume bytes of `data` come from a string
63+
// and are encoded as UTF-8, which needs to be encoded as UTF-16.
5664
//
57-
// [1]: https://tahoe-lafs.org/trac/tahoe-lafs/ticket/1232
58-
// [2]: http://www.mail-archive.com/[email protected]/msg00661.html
59-
const OUT_MAX: usize = 8192;
60-
let len = cmp::min(data.len(), OUT_MAX);
65+
// If the data is not valid UTF-8 we write out as many bytes as are valid.
66+
// Only when there are no valid bytes (which will happen on the next call), return an error.
67+
let len = cmp::min(data.len(), MAX_BUFFER_SIZE);
6168
let utf8 = match str::from_utf8(&data[..len]) {
6269
Ok(s) => s,
63-
Err(ref e) if e.valid_up_to() == 0 => return Err(invalid_encoding()),
70+
Err(ref e) if e.valid_up_to() == 0 => {
71+
return Err(io::Error::new(io::ErrorKind::InvalidData,
72+
"Windows stdio in console mode does not support non-UTF-8 byte sequences; \
73+
see https://github.com/rust-lang/rust/issues/23344"))
74+
},
6475
Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
6576
};
6677
let utf16 = utf8.encode_utf16().collect::<Vec<u16>>();
78+
79+
let mut written = write_u16s(handle, &utf16)?;
80+
81+
// Figure out how many bytes of the UTF-8 input were written out as UTF-16.
82+
if written >= utf16.len() {
83+
Ok(utf8.len())
84+
} else {
85+
// Make sure we didn't end up writing only half of a surrogate pair (even though the chance
86+
// is tiny). Because it is not possible for user code to re-slice `data` in such a way that
87+
// a missing surrogate can be produced (and also because of the UTF-8 validation above),
88+
// write the missing surrogate out now.
89+
// Buffering it would mean we have to lie about the number of bytes written.
90+
let first_char_remaining = utf16[written];
91+
if first_char_remaining >= 0xDC00 && first_char_remaining <= 0xDFFF { // low surrogate
92+
// We just hope this works, and give up otherwise. The slice holds exactly the one missing `u16`.
93+
let _ = write_u16s(handle, &utf16[written..written + 1]);
94+
written += 1;
95+
}
96+
// Calculate the number of bytes of `utf8` that were actually written.
97+
let mut count = 0;
98+
for ch in utf16[..written].iter() {
99+
count += match ch {
100+
0x0000 ..= 0x007F => 1,
101+
0x0080 ..= 0x07FF => 2,
102+
0xDC00 ..= 0xDFFF => 1, // Low surrogate. We already counted 3 bytes for the high surrogate.
103+
_ => 3,
104+
};
105+
}
106+
Ok(count)
107+
}
108+
}
109+
110+
fn write_u16s(handle: c::HANDLE, data: &[u16]) -> io::Result<usize> {
67111
let mut written = 0;
68112
cvt(unsafe {
69113
c::WriteConsoleW(handle,
70-
utf16.as_ptr() as c::LPCVOID,
71-
utf16.len() as u32,
114+
data.as_ptr() as c::LPCVOID,
115+
data.len() as u32,
72116
&mut written,
73117
ptr::null_mut())
74118
})?;
75-
76-
// FIXME if this only partially writes the utf16 buffer then we need to
77-
// figure out how many bytes of `data` were actually written
78-
assert_eq!(written as usize, utf16.len());
79-
Ok(utf8.len())
119+
Ok(written as usize)
80120
}
81121

82122
impl Stdin {
83123
pub fn new() -> io::Result<Stdin> {
84-
Ok(Stdin {
85-
utf8: Mutex::new(Cursor::new(Vec::new())),
86-
})
124+
Ok(Stdin { high_surrogate: Cell::new(0) })
87125
}
88126

89127
pub fn read(&self, buf: &mut [u8]) -> io::Result<usize> {
90-
let handle = match get(c::STD_INPUT_HANDLE)? {
91-
Output::Console(c) => c,
92-
Output::Pipe(p) => {
93-
let handle = Handle::new(p);
94-
let ret = handle.read(buf);
95-
handle.into_raw();
96-
return ret
97-
}
128+
let handle = get_handle(c::STD_INPUT_HANDLE)?;
129+
if !is_console(handle) {
130+
let handle = Handle::new(handle);
131+
let ret = handle.read(buf);
132+
handle.into_raw(); // Don't close the handle
133+
return ret;
134+
}
135+
136+
if buf.len() == 0 {
137+
return Ok(0);
138+
} else if buf.len() < 4 {
139+
return Err(io::Error::new(io::ErrorKind::InvalidInput,
140+
"Windows stdin in console mode does not support a buffer too small to \
141+
guarantee holding one arbitrary UTF-8 character (4 bytes)"))
142+
}
143+
144+
let mut utf16_buf = [0u16; MAX_BUFFER_SIZE / 2];
145+
// In the worst case, a UTF-8 string can take 3 bytes for every `u16` of a UTF-16 string. So
146+
// we can read at most a third of `buf.len()` chars and uphold the guarantee no data gets
147+
// lost.
148+
let amount = cmp::min(buf.len() / 3, utf16_buf.len());
149+
let read = self.read_u16s_fixup_surrogates(handle, &mut utf16_buf, amount)?;
150+
let utf16 = &utf16_buf[..read];
151+
152+
// FIXME: it would be nice if we could directly decode into the buffer instead of doing an
153+
// allocation.
154+
let data = match String::from_utf16(&utf16) {
155+
Ok(utf8) => utf8.into_bytes(),
156+
Err(..) => {
157+
// We can't really do any better than forget all data and return an error.
158+
return Err(io::Error::new(io::ErrorKind::InvalidData,
159+
"Windows stdin in console mode does not support non-UTF-16 input; \
160+
encountered unpaired surrogate"))
161+
},
98162
};
99-
let mut utf8 = self.utf8.lock().unwrap();
100-
// Read more if the buffer is empty
101-
if utf8.position() as usize == utf8.get_ref().len() {
102-
let mut utf16 = vec![0u16; 0x1000];
103-
let mut num = 0;
104-
let mut input_control = readconsole_input_control(CTRL_Z_MASK);
105-
cvt(unsafe {
106-
c::ReadConsoleW(handle,
107-
utf16.as_mut_ptr() as c::LPVOID,
108-
utf16.len() as u32,
109-
&mut num,
110-
&mut input_control as c::PCONSOLE_READCONSOLE_CONTROL)
111-
})?;
112-
utf16.truncate(num as usize);
113-
// FIXME: what to do about this data that has already been read?
114-
let mut data = match String::from_utf16(&utf16) {
115-
Ok(utf8) => utf8.into_bytes(),
116-
Err(..) => return Err(invalid_encoding()),
117-
};
118-
if let Some(&last_byte) = data.last() {
119-
if last_byte == CTRL_Z {
120-
data.pop();
121-
}
163+
buf[..data.len()].copy_from_slice(&data); // `copy_from_slice` panics unless both lengths match
164+
Ok(data.len())
165+
}
166+
167+
// We assume that if the last `u16` is an unpaired surrogate they got sliced apart by our
168+
// buffer size, and keep it around for the next read hoping to put them together.
169+
// This is a best effort, and may not work if we are not the only reader on Stdin.
170+
pub fn read_u16s_fixup_surrogates(&self, handle: c::HANDLE, buf: &mut [u16], mut amount: usize)
171+
-> io::Result<usize>
172+
{
173+
// Insert possibly remaining unpaired surrogate from last read.
174+
let mut start = 0;
175+
if self.high_surrogate.get() != 0 {
176+
buf[0] = self.high_surrogate.replace(0);
177+
start = 1;
178+
if amount == 1 {
179+
// Special case: `Stdin::read` guarantees we can always read at least one new `u16`
180+
// and combine it with an unpaired surrogate, because the UTF-8 buffer is at least
181+
// 4 bytes.
182+
amount = 2;
183+
}
184+
}
185+
let mut amount = read_u16s(handle, &mut buf[start..amount])? + start;
186+
187+
if amount > 0 {
188+
let last_char = buf[amount - 1];
189+
if last_char >= 0xD800 && last_char <= 0xDBFF { // high surrogate
190+
self.high_surrogate.set(last_char);
191+
amount -= 1;
122192
}
123-
*utf8 = Cursor::new(data);
124193
}
194+
Ok(amount)
195+
}
196+
}
197+
198+
fn read_u16s(handle: c::HANDLE, buf: &mut [u16]) -> io::Result<usize> {
199+
// Configure the `pInputControl` parameter to not only return on `\r\n` but also Ctrl-Z, the
200+
// traditional DOS method to indicate end of character stream / user input (SUB).
201+
// See #38274 and https://stackoverflow.com/questions/43836040/win-api-readconsole.
202+
const CTRL_Z: u16 = 0x1A;
203+
const CTRL_Z_MASK: c::ULONG = 1 << CTRL_Z;
204+
let mut input_control = c::CONSOLE_READCONSOLE_CONTROL {
205+
nLength: ::mem::size_of::<c::CONSOLE_READCONSOLE_CONTROL>() as c::ULONG,
206+
nInitialChars: 0,
207+
dwCtrlWakeupMask: CTRL_Z_MASK,
208+
dwControlKeyState: 0,
209+
};
210+
211+
let mut amount = 0;
212+
cvt(unsafe {
213+
c::ReadConsoleW(handle,
214+
buf.as_mut_ptr() as c::LPVOID,
215+
buf.len() as u32,
216+
&mut amount,
217+
&mut input_control as c::PCONSOLE_READCONSOLE_CONTROL)
218+
})?;
125219

126-
// MemReader shouldn't error here since we just filled it
127-
utf8.read(buf)
220+
if amount > 0 && buf[amount as usize - 1] == CTRL_Z {
221+
amount -= 1;
128222
}
223+
Ok(amount as usize)
129224
}
130225

131226
impl Stdout {
@@ -156,43 +251,10 @@ impl Stderr {
156251
}
157252
}
158253

159-
impl Output {
160-
pub fn handle(&self) -> c::HANDLE {
161-
match *self {
162-
Output::Console(c) => c,
163-
Output::Pipe(c) => c,
164-
}
165-
}
166-
}
167-
168-
fn invalid_encoding() -> io::Error {
169-
io::Error::new(io::ErrorKind::InvalidData,
170-
"Windows stdio in console mode does not support non-UTF-8 byte sequences; \
171-
see https://github.com/rust-lang/rust/issues/23344")
172-
}
173-
174-
fn readconsole_input_control(wakeup_mask: c::ULONG) -> c::CONSOLE_READCONSOLE_CONTROL {
175-
c::CONSOLE_READCONSOLE_CONTROL {
176-
nLength: ::mem::size_of::<c::CONSOLE_READCONSOLE_CONTROL>() as c::ULONG,
177-
nInitialChars: 0,
178-
dwCtrlWakeupMask: wakeup_mask,
179-
dwControlKeyState: 0,
180-
}
181-
}
182-
183-
const CTRL_Z: u8 = 0x1A;
184-
const CTRL_Z_MASK: c::ULONG = 0x4000000; //1 << 0x1A
185-
186254
pub fn is_ebadf(err: &io::Error) -> bool {
187255
err.raw_os_error() == Some(c::ERROR_INVALID_HANDLE as i32)
188256
}
189257

190-
// The default buffer capacity is 64k, but apparently windows
191-
// doesn't like 64k reads on stdin. See #13304 for details, but the
192-
// idea is that on windows we use a slightly smaller buffer that's
193-
// been seen to be acceptable.
194-
pub const STDIN_BUF_SIZE: usize = 8 * 1024;
195-
196258
pub fn panic_output() -> Option<impl io::Write> {
197259
io::stderr_raw().ok()
198260
}

0 commit comments

Comments
 (0)