Skip to content

Commit c6b82c7

Browse files
committed
Deprecate str::from_utf8_lossy
Use `String::from_utf8_lossy` instead [breaking-change]
1 parent 1900abd commit c6b82c7

File tree

11 files changed

+258
-256
lines changed

11 files changed

+258
-256
lines changed

src/libcollections/str.rs

+3 -195
Original file line number | Diff line number | Diff line change
@@ -402,131 +402,10 @@ macro_rules! utf8_acc_cont_byte(
402402
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
403403
)
404404

405-
static TAG_CONT_U8: u8 = 128u8;
406-
407-
/// Converts a vector of bytes to a new utf-8 string.
408-
/// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
409-
///
410-
/// # Example
411-
///
412-
/// ```rust
413-
/// let input = b"Hello \xF0\x90\x80World";
414-
/// let output = std::str::from_utf8_lossy(input);
415-
/// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
416-
/// ```
405+
/// Deprecated. Use `String::from_utf8_lossy`.
406+
#[deprecated = "Replaced by String::from_utf8_lossy"]
417407
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
418-
if is_utf8(v) {
419-
return Slice(unsafe { mem::transmute(v) })
420-
}
421-
422-
static REPLACEMENT: &'static [u8] = b"\xEF\xBF\xBD"; // U+FFFD in UTF-8
423-
let mut i = 0;
424-
let total = v.len();
425-
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
426-
unsafe { *xs.unsafe_ref(i) }
427-
}
428-
fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
429-
if i >= total {
430-
0
431-
} else {
432-
unsafe_get(xs, i)
433-
}
434-
}
435-
436-
let mut res = String::with_capacity(total);
437-
438-
if i > 0 {
439-
unsafe {
440-
res.push_bytes(v.slice_to(i))
441-
};
442-
}
443-
444-
// subseqidx is the index of the first byte of the subsequence we're looking at.
445-
// It's used to copy a bunch of contiguous good codepoints at once instead of copying
446-
// them one by one.
447-
let mut subseqidx = 0;
448-
449-
while i < total {
450-
let i_ = i;
451-
let byte = unsafe_get(v, i);
452-
i += 1;
453-
454-
macro_rules! error(() => ({
455-
unsafe {
456-
if subseqidx != i_ {
457-
res.push_bytes(v.slice(subseqidx, i_));
458-
}
459-
subseqidx = i;
460-
res.push_bytes(REPLACEMENT);
461-
}
462-
}))
463-
464-
if byte < 128u8 {
465-
// subseqidx handles this
466-
} else {
467-
let w = utf8_char_width(byte);
468-
469-
match w {
470-
2 => {
471-
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
472-
error!();
473-
continue;
474-
}
475-
i += 1;
476-
}
477-
3 => {
478-
match (byte, safe_get(v, i, total)) {
479-
(0xE0 , 0xA0 .. 0xBF) => (),
480-
(0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
481-
(0xED , 0x80 .. 0x9F) => (),
482-
(0xEE .. 0xEF, 0x80 .. 0xBF) => (),
483-
_ => {
484-
error!();
485-
continue;
486-
}
487-
}
488-
i += 1;
489-
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
490-
error!();
491-
continue;
492-
}
493-
i += 1;
494-
}
495-
4 => {
496-
match (byte, safe_get(v, i, total)) {
497-
(0xF0 , 0x90 .. 0xBF) => (),
498-
(0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
499-
(0xF4 , 0x80 .. 0x8F) => (),
500-
_ => {
501-
error!();
502-
continue;
503-
}
504-
}
505-
i += 1;
506-
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
507-
error!();
508-
continue;
509-
}
510-
i += 1;
511-
if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
512-
error!();
513-
continue;
514-
}
515-
i += 1;
516-
}
517-
_ => {
518-
error!();
519-
continue;
520-
}
521-
}
522-
}
523-
}
524-
if subseqidx < total {
525-
unsafe {
526-
res.push_bytes(v.slice(subseqidx, total))
527-
};
528-
}
529-
Owned(res.into_string())
408+
String::from_utf8_lossy(v)
530409
}
531410

532411
/*
@@ -2052,41 +1931,6 @@ String::from_str("\u1111\u1171\u11b6"));
20521931
assert_eq!(from_utf8(xs), None);
20531932
}
20541933

2055-
#[test]
2056-
fn test_str_from_utf8_lossy() {
2057-
let xs = b"hello";
2058-
assert_eq!(from_utf8_lossy(xs), Slice("hello"));
2059-
2060-
let xs = "ศไทย中华Việt Nam".as_bytes();
2061-
assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
2062-
2063-
let xs = b"Hello\xC2 There\xFF Goodbye";
2064-
assert_eq!(from_utf8_lossy(xs), Owned(String::from_str("Hello\uFFFD There\uFFFD Goodbye")));
2065-
2066-
let xs = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
2067-
assert_eq!(from_utf8_lossy(xs),
2068-
Owned(String::from_str("Hello\uFFFD\uFFFD There\uFFFD Goodbye")));
2069-
2070-
let xs = b"\xF5foo\xF5\x80bar";
2071-
assert_eq!(from_utf8_lossy(xs), Owned(String::from_str("\uFFFDfoo\uFFFD\uFFFDbar")));
2072-
2073-
let xs = b"\xF1foo\xF1\x80bar\xF1\x80\x80baz";
2074-
assert_eq!(from_utf8_lossy(xs), Owned(String::from_str("\uFFFDfoo\uFFFDbar\uFFFDbaz")));
2075-
2076-
let xs = b"\xF4foo\xF4\x80bar\xF4\xBFbaz";
2077-
assert_eq!(from_utf8_lossy(xs),
2078-
Owned(String::from_str("\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz")));
2079-
2080-
let xs = b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar";
2081-
assert_eq!(from_utf8_lossy(xs), Owned(String::from_str("\uFFFD\uFFFD\uFFFD\uFFFD\
2082-
foo\U00010000bar")));
2083-
2084-
// surrogates
2085-
let xs = b"\xED\xA0\x80foo\xED\xBF\xBFbar";
2086-
assert_eq!(from_utf8_lossy(xs), Owned(String::from_str("\uFFFD\uFFFD\uFFFDfoo\
2087-
\uFFFD\uFFFD\uFFFDbar")));
2088-
}
2089-
20901934
#[test]
20911935
fn test_maybe_owned_traits() {
20921936
let s = Slice("abcde");
@@ -2296,42 +2140,6 @@ mod bench {
22962140
});
22972141
}
22982142

2299-
#[bench]
2300-
fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
2301-
let s = b"Hello there, the quick brown fox jumped over the lazy dog! \
2302-
Lorem ipsum dolor sit amet, consectetur. ";
2303-
2304-
assert_eq!(100, s.len());
2305-
b.iter(|| {
2306-
let _ = from_utf8_lossy(s);
2307-
});
2308-
}
2309-
2310-
#[bench]
2311-
fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
2312-
let s = "𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰".as_bytes();
2313-
assert_eq!(100, s.len());
2314-
b.iter(|| {
2315-
let _ = from_utf8_lossy(s);
2316-
});
2317-
}
2318-
2319-
#[bench]
2320-
fn from_utf8_lossy_invalid(b: &mut Bencher) {
2321-
let s = b"Hello\xC0\x80 There\xE6\x83 Goodbye";
2322-
b.iter(|| {
2323-
let _ = from_utf8_lossy(s);
2324-
});
2325-
}
2326-
2327-
#[bench]
2328-
fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
2329-
let s = Vec::from_elem(100, 0xF5u8);
2330-
b.iter(|| {
2331-
let _ = from_utf8_lossy(s.as_slice());
2332-
});
2333-
}
2334-
23352143
#[bench]
23362144
fn bench_connect(b: &mut Bencher) {
23372145
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";

0 commit comments

Comments
 (0)