bytes, strings: add Lines, SplitSeq, SplitAfterSeq, FieldsSeq, FieldsFuncSeq

aimuz · aimuz · commit a29818a17e20 · 2024-05-21T22:37:53.000+08:00
Fixes #61901.
diff --git a/api/next/61901.txt b/api/next/61901.txt
@@ -0,0 +1,10 @@
+pkg bytes, func FieldsFuncSeq([]uint8, func(int32) bool) iter.Seq[[]uint8] #61901
+pkg bytes, func FieldsSeq([]uint8) iter.Seq[[]uint8] #61901
+pkg bytes, func Lines([]uint8) iter.Seq[[]uint8] #61901
+pkg bytes, func SplitAfterSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
+pkg bytes, func SplitSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
+pkg strings, func FieldsFuncSeq(string, func(int32) bool) iter.Seq[string] #61901
+pkg strings, func FieldsSeq(string) iter.Seq[string] #61901
+pkg strings, func Lines(string) iter.Seq[string] #61901
+pkg strings, func SplitAfterSeq(string, string) iter.Seq[string] #61901
+pkg strings, func SplitSeq(string, string) iter.Seq[string] #61901
diff --git a/doc/next/6-stdlib/3-iter.md b/doc/next/6-stdlib/3-iter.md
@@ -27,3 +27,29 @@ The [maps] package adds several functions that work with iterators:
 - [Values](/pkg/maps#Values) returns an iterator over values in m.
 - [Insert](/pkg/maps#Insert) adds the key-value pairs from seq to m.
 - [Collect](/pkg/maps#Collect) collects key-value pairs from seq into a new map and returns it.
+
+The [bytes] package adds several functions that work with iterators:
+- [Lines](/pkg/bytes#Lines) returns an iterator over the
+  newline-terminated lines in the byte slice s.
+- [SplitSeq](/pkg/bytes#SplitSeq) returns an iterator over
+  all subslices of s separated by sep.
+- [SplitAfterSeq](/pkg/bytes#SplitAfterSeq) returns an iterator
+  over subslices of s split after each instance of sep.
+- [FieldsSeq](/pkg/bytes#FieldsSeq) returns an iterator over
+  subslices of s split around runs of whitespace characters,
+  as defined by unicode.IsSpace.
+- [FieldsFuncSeq](/pkg/bytes#FieldsFuncSeq) returns an iterator
+  over subslices of s split around runs of Unicode code points satisfying f(c).
+
+The [strings] package adds several functions that work with iterators:
+- [Lines](/pkg/strings#Lines) returns an iterator over
+  the newline-terminated lines in the string s.
+- [SplitSeq](/pkg/strings#SplitSeq) returns an iterator over
+  all substrings of s separated by sep.
+- [SplitAfterSeq](/pkg/strings#SplitAfterSeq) returns an iterator
+  over substrings of s split after each instance of sep.
+- [FieldsSeq](/pkg/strings#FieldsSeq) returns an iterator over
+  substrings of s split around runs of whitespace characters,
+  as defined by unicode.IsSpace.
+- [FieldsFuncSeq](/pkg/strings#FieldsFuncSeq) returns an iterator
+  over substrings of s split around runs of Unicode code points satisfying f(c).
diff --git a/doc/next/6-stdlib/99-minor/bytes/61901.md b/doc/next/6-stdlib/99-minor/bytes/61901.md
@@ -0,0 +1 @@
+<!-- see ../../3-iter.md -->
diff --git a/doc/next/6-stdlib/99-minor/strings/61901.md b/doc/next/6-stdlib/99-minor/strings/61901.md
@@ -0,0 +1 @@
+<!-- see ../../3-iter.md -->
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
@@ -8,6 +8,7 @@ package bytes
 
 import (
 	"internal/bytealg"
+	"iter"
 	"unicode"
 	"unicode/utf8"
 )
@@ -50,6 +51,20 @@ func explode(s []byte, n int) [][]byte {
 	return a[0:na]
 }
 
+// explodeSeq returns an iterator over the UTF-8 sequences in s, each yielded as a subslice of s.
+func explodeSeq(s []byte) iter.Seq[[]byte] {
+	return func(yield func([]byte) bool) {
+		s := s // shadow s so that every run of the iterator starts from the full input
+		for len(s) > 0 {
+			_, size := utf8.DecodeRune(s)
+			if !yield(s[:size]) { // NOTE(review): capacity is not limited (no s[:size:size]), so a caller's append can overwrite the bytes that follow
+				return
+			}
+			s = s[size:]
+		}
+	}
+}
+
 // Count counts the number of non-overlapping instances of sep in s.
 // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
 func Count(s, sep []byte) int {
@@ -318,6 +333,28 @@ func LastIndexAny(s []byte, chars string) int {
 	return -1
 }
 
+// Lines returns an iterator over the newline-terminated lines in the byte slice s.
+// The lines yielded by the iterator are subslices of s and include their terminating newlines.
+// If s is empty, the iterator yields no lines at all.
+// If s does not end in a newline, the final yielded line will not end in a newline.
+func Lines(s []byte) iter.Seq[[]byte] {
+	return func(yield func([]byte) bool) {
+		s := s // shadow s so that every run of the iterator starts from the full input
+		for len(s) > 0 {
+			var line []byte
+			if i := IndexByte(s, '\n'); i >= 0 {
+				line, s = s[:i+1], s[i+1:]
+			} else {
+				line, s = s, nil // no newline left: the rest of s is the final line
+			}
+			if !yield(line) { // NOTE(review): line is not capacity-limited, so a caller's append can overwrite the next line
+				return
+			}
+		}
+		return // NOTE(review): redundant, the function ends here anyway
+	}
+}
+
 // Generic split: splits after each instance of sep,
 // including sepSave bytes of sep in the subslices.
 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
@@ -350,6 +387,29 @@ func genSplit(s, sep []byte, sepSave, n int) [][]byte {
 	return a[:i+1]
 }
 
+// splitSeq is SplitSeq or SplitAfterSeq, configured by how many
+// bytes of sep to include in the results (none or all).
+func splitSeq(s, sep []byte, sepSave int) iter.Seq[[]byte] {
+	if len(sep) == 0 {
+		return explodeSeq(s) // an empty separator is delegated to explodeSeq
+	}
+	return func(yield func([]byte) bool) {
+		s := s // shadow s so that every run of the iterator starts from the full input
+		for {
+			i := Index(s, sep)
+			if i < 0 {
+				break
+			}
+			frag := s[:i+sepSave] // NOTE(review): not capacity-limited (no s[:n:n]), so a caller's append can overwrite s
+			if !yield(frag) {
+				return
+			}
+			s = s[i+len(sep):]
+		}
+		yield(s) // whatever follows the last separator, possibly empty
+	}
+}
+
 // SplitN slices s into subslices separated by sep and returns a slice of
 // the subslices between those separators.
 // If sep is empty, SplitN splits after each UTF-8 sequence.
@@ -390,6 +450,20 @@ func SplitAfter(s, sep []byte) [][]byte {
 	return genSplit(s, sep, len(sep), -1)
 }
 
+// SplitSeq returns an iterator over all subslices of s separated by sep.
+// The iterator yields the same subslices that would be returned by Split(s, sep),
+// but without constructing the slice that holds them.
+func SplitSeq(s, sep []byte) iter.Seq[[]byte] {
+	return splitSeq(s, sep, 0) // sepSave 0: sep itself is left out of the results
+}
+
+// SplitAfterSeq returns an iterator over subslices of s split after each instance of sep.
+// The iterator yields the same subslices that would be returned by SplitAfter(s, sep),
+// but without constructing the slice that holds them.
+func SplitAfterSeq(s, sep []byte) iter.Seq[[]byte] {
+	return splitSeq(s, sep, len(sep)) // sepSave len(sep): each piece keeps the sep that ends it
+}
+
 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
 
 // Fields interprets s as a sequence of UTF-8-encoded code points.
@@ -446,6 +520,40 @@ func Fields(s []byte) [][]byte {
 	return a
 }
 
+// FieldsSeq returns an iterator over subslices of s split around runs of
+// whitespace characters, as defined by unicode.IsSpace.
+// The iterator yields the same subslices that would be returned by Fields(s),
+// but without constructing the slice that holds them.
+func FieldsSeq(s []byte) iter.Seq[[]byte] {
+	return func(yield func([]byte) bool) {
+		s := s // NOTE(review): s is never reassigned below, so this shadow copy has no effect
+		start := -1
+		for i := 0; i < len(s); {
+			size := 1
+			r := rune(s[i])
+			isSpace := asciiSpace[s[i]] != 0 // table lookup settles single-byte (ASCII) input
+			if r >= utf8.RuneSelf {
+				r, size = utf8.DecodeRune(s[i:]) // multi-byte rune: decode it and ask unicode.IsSpace instead
+				isSpace = unicode.IsSpace(r)
+			}
+			if isSpace {
+				if start >= 0 {
+					if !yield(s[start:i]) { // whitespace closes the field that began at start
+						return
+					}
+					start = -1
+				}
+			} else if start < 0 { // start < 0 means no field is open, so this rune begins one
+				start = i
+			}
+			i += size
+		}
+		if start >= 0 {
+			yield(s[start:]) // the input ended inside a field
+		}
+	}
+}
+
 // FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
 // It splits the slice s at each run of code points c satisfying f(c) and
 // returns a slice of subslices of s. If all code points in s satisfy f(c), or
@@ -500,6 +608,38 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
 	return a
 }
 
+// FieldsFuncSeq returns an iterator over subslices of s split around runs of
+// Unicode code points satisfying f(c).
+// The iterator yields the same subslices that would be returned by FieldsFunc(s, f),
+// but without constructing the slice that holds them.
+func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] {
+	return func(yield func([]byte) bool) {
+		s := s // NOTE(review): s is never reassigned below, so this shadow copy has no effect
+		start := -1
+		for i := 0; i < len(s); {
+			size := 1
+			r := rune(s[i])
+			if r >= utf8.RuneSelf {
+				r, size = utf8.DecodeRune(s[i:]) // multi-byte rune: decode it to learn its value and width
+			}
+			if f(r) {
+				if start >= 0 {
+					if !yield(s[start:i]) { // a separator rune closes the field that began at start
+						return
+					}
+					start = -1
+				}
+			} else if start < 0 { // start < 0 means no field is open, so this rune begins one
+				start = i
+			}
+			i += size
+		}
+		if start >= 0 {
+			yield(s[start:]) // the input ended inside a field
+		}
+	}
+}
+
 // Join concatenates the elements of s to create a new byte slice. The separator
 // sep is placed between elements in the resulting slice.
 func Join(s [][]byte, sep []byte) []byte {
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go
@@ -8,6 +8,7 @@ import (
 	. "bytes"
 	"fmt"
 	"internal/testenv"
+	"iter"
 	"math"
 	"math/rand"
 	"reflect"
@@ -769,6 +770,22 @@ func BenchmarkCountSingle(b *testing.B) {
 	})
 }
 
+var LinesTest = []string{ // inputs that TestLines splits into lines and joins back together
+	"abc\nabc\n",
+	"abc\r\nabc", // last line has no terminating newline
+	"abc\r\n",
+	"abc\n",
+}
+
+func TestLines(t *testing.T) {
+	for _, s := range LinesTest { // each input is split into lines and joined back together
+		result := Join(collect(t, Lines([]byte(s))), []byte(""))
+		if string(result) != s { // NOTE(review): only proves the pieces re-join to s, not where the lines were split
+			t.Errorf(`Join(collect(Lines(%q)), "") = %q`, s, result)
+		}
+	}
+}
+
 type SplitTest struct {
 	s   string
 	sep string
@@ -812,6 +829,14 @@ func TestSplit(t *testing.T) {
 			t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, result, tt.a)
 			continue
 		}
+
+		if tt.n < 0 {
+			result2 := sliceOfString(collect(t, SplitSeq([]byte(tt.s), []byte(tt.sep))))
+			if !eq(result2, tt.a) {
+				t.Errorf(`collect(SplitSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
+			}
+		}
+
 		if tt.n == 0 || len(a) == 0 {
 			continue
 		}
@@ -871,6 +896,13 @@ func TestSplitAfter(t *testing.T) {
 			continue
 		}
 
+		if tt.n < 0 {
+			result2 := sliceOfString(collect(t, SplitAfterSeq([]byte(tt.s), []byte(tt.sep))))
+			if !eq(result2, tt.a) {
+				t.Errorf(`collect(SplitAfterSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
+			}
+		}
+
 		if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
 			t.Errorf("last appended result was %s; want %s", x, want)
 		}
@@ -924,6 +956,11 @@ func TestFields(t *testing.T) {
 			continue
 		}
 
+		result2 := sliceOfString(collect(t, FieldsSeq([]byte(tt.s))))
+		if !eq(result2, tt.a) {
+			t.Errorf(`collect(FieldsSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
+		}
+
 		if string(b) != tt.s {
 			t.Errorf("slice changed to %s; want %s", string(b), tt.s)
 		}
@@ -966,6 +1003,11 @@ func TestFieldsFunc(t *testing.T) {
 			t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a)
 		}
 
+		result2 := sliceOfString(collect(t, FieldsFuncSeq([]byte(tt.s), pred)))
+		if !eq(result2, tt.a) {
+			t.Errorf(`collect(FieldsFuncSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
+		}
+
 		if string(b) != tt.s {
 			t.Errorf("slice changed to %s; want %s", b, tt.s)
 		}
@@ -2278,3 +2320,20 @@ func TestClone(t *testing.T) {
 		}
 	}
 }
+
+func collect[T []byte | string](t *testing.T, seq iter.Seq[T]) []T {
+	out := collect1(seq)
+	out1 := collect1(seq) // second pass: re-running the iterator must produce the same values
+	if !reflect.DeepEqual(out, out1) {
+		t.Fatalf("inconsistent seq:\n%s\n%s", out, out1)
+	}
+	return out // both passes agreed, so the first result is returned
+}
+
+func collect1[T []byte | string](seq iter.Seq[T]) []T {
+	var out []T // stays nil when seq yields nothing
+	for x := range seq {
+		out = append(out, x) // stores the yielded value as is, without copying its contents
+	}
+	return out
+}
diff --git a/src/strings/strings.go b/src/strings/strings.go
diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go