Skip to content

Commit a29818a

Browse files
committed
bytes, strings: add Lines, SplitSeq, SplitAfterSeq, FieldsSeq, FieldsFuncSeq
Fixes #61901.
1 parent 22344e1 commit a29818a

File tree

8 files changed

+431
-0
lines changed

8 files changed

+431
-0
lines changed

api/next/61901.txt

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
pkg bytes, func FieldsFuncSeq([]uint8, func(int32) bool) iter.Seq[[]uint8] #61901
2+
pkg bytes, func FieldsSeq([]uint8) iter.Seq[[]uint8] #61901
3+
pkg bytes, func Lines([]uint8) iter.Seq[[]uint8] #61901
4+
pkg bytes, func SplitAfterSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
5+
pkg bytes, func SplitSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
6+
pkg strings, func FieldsFuncSeq(string, func(int32) bool) iter.Seq[string] #61901
7+
pkg strings, func FieldsSeq(string) iter.Seq[string] #61901
8+
pkg strings, func Lines(string) iter.Seq[string] #61901
9+
pkg strings, func SplitAfterSeq(string, string) iter.Seq[string] #61901
10+
pkg strings, func SplitSeq(string, string) iter.Seq[string] #61901

doc/next/6-stdlib/3-iter.md

+26
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,29 @@ The [maps] package adds several functions that work with iterators:
2727
- [Values](/pkg/maps#Values) returns an iterator over values in m.
2828
- [Insert](/pkg/maps#Insert) adds the key-value pairs from seq to m.
2929
- [Collect](/pkg/maps#Collect) collects key-value pairs from seq into a new map and returns it.
30+
31+
The [bytes] package adds several functions that work with iterators:
32+
- [Lines](/pkg/bytes#Lines) returns an iterator over the
33+
newline-terminated lines in the byte slice s.
34+
- [SplitSeq](/pkg/bytes#SplitSeq) returns an iterator over
35+
all subslices of s separated by sep.
36+
- [SplitAfterSeq](/pkg/bytes#SplitAfterSeq) returns an iterator
37+
over subslices of s split after each instance of sep.
38+
- [FieldsSeq](/pkg/bytes#FieldsSeq) returns an iterator over
39+
subslices of s split around runs of whitespace characters,
40+
as defined by unicode.IsSpace.
41+
- [FieldsFuncSeq](/pkg/bytes#FieldsFuncSeq) returns an iterator
42+
over subslices of s split around runs of Unicode code points satisfying f(c).
43+
44+
The [strings] package adds several functions that work with iterators:
45+
- [Lines](/pkg/strings#Lines) returns an iterator over
46+
the newline-terminated lines in the string s.
47+
- [SplitSeq](/pkg/strings#SplitSeq) returns an iterator over
48+
all substrings of s separated by sep.
49+
- [SplitAfterSeq](/pkg/strings#SplitAfterSeq) returns an iterator
50+
over substrings of s split after each instance of sep.
51+
- [FieldsSeq](/pkg/strings#FieldsSeq) returns an iterator over
52+
substrings of s split around runs of whitespace characters,
53+
as defined by unicode.IsSpace.
54+
- [FieldsFuncSeq](/pkg/strings#FieldsFuncSeq) returns an iterator
55+
over substrings of s split around runs of Unicode code points satisfying f(c).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<!-- see ../../3-iter.md -->
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<!-- see ../../3-iter.md -->

src/bytes/bytes.go

+140
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package bytes
88

99
import (
1010
"internal/bytealg"
11+
"iter"
1112
"unicode"
1213
"unicode/utf8"
1314
)
@@ -50,6 +51,20 @@ func explode(s []byte, n int) [][]byte {
5051
return a[0:na]
5152
}
5253

54+
// explodeSeq returns an iterator over the runes in s.
55+
func explodeSeq(s []byte) iter.Seq[[]byte] {
56+
return func(yield func([]byte) bool) {
57+
s := s
58+
for len(s) > 0 {
59+
_, size := utf8.DecodeRune(s)
60+
if !yield(s[:size]) {
61+
return
62+
}
63+
s = s[size:]
64+
}
65+
}
66+
}
67+
5368
// Count counts the number of non-overlapping instances of sep in s.
5469
// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
5570
func Count(s, sep []byte) int {
@@ -318,6 +333,28 @@ func LastIndexAny(s []byte, chars string) int {
318333
return -1
319334
}
320335

336+
// Lines returns an iterator over the newline-terminated lines in the byte slice s.
337+
// The lines yielded by the iterator include their terminating newlines.
338+
// If s is empty, the iterator yields no lines at all.
339+
// If s does not end in a newline, the final yielded line will not end in a newline.
340+
func Lines(s []byte) iter.Seq[[]byte] {
341+
return func(yield func([]byte) bool) {
342+
s := s
343+
for len(s) > 0 {
344+
var line []byte
345+
if i := IndexByte(s, '\n'); i >= 0 {
346+
line, s = s[:i+1], s[i+1:]
347+
} else {
348+
line, s = s, nil
349+
}
350+
if !yield(line) {
351+
return
352+
}
353+
}
354+
return
355+
}
356+
}
357+
321358
// Generic split: splits after each instance of sep,
322359
// including sepSave bytes of sep in the subslices.
323360
func genSplit(s, sep []byte, sepSave, n int) [][]byte {
@@ -350,6 +387,29 @@ func genSplit(s, sep []byte, sepSave, n int) [][]byte {
350387
return a[:i+1]
351388
}
352389

390+
// splitSeq is SplitSeq or SplitAfterSeq, configured by how many
391+
// bytes of sep to include in the results (none or all).
392+
func splitSeq(s, sep []byte, sepSave int) iter.Seq[[]byte] {
393+
if len(sep) == 0 {
394+
return explodeSeq(s)
395+
}
396+
return func(yield func([]byte) bool) {
397+
s := s
398+
for {
399+
i := Index(s, sep)
400+
if i < 0 {
401+
break
402+
}
403+
frag := s[:i+sepSave]
404+
if !yield(frag) {
405+
return
406+
}
407+
s = s[i+len(sep):]
408+
}
409+
yield(s)
410+
}
411+
}
412+
353413
// SplitN slices s into subslices separated by sep and returns a slice of
354414
// the subslices between those separators.
355415
// If sep is empty, SplitN splits after each UTF-8 sequence.
@@ -390,6 +450,20 @@ func SplitAfter(s, sep []byte) [][]byte {
390450
return genSplit(s, sep, len(sep), -1)
391451
}
392452

453+
// SplitSeq returns an iterator over all substrings of s separated by sep.
454+
// The iterator yields the same strings that would be returned by Split(s, sep),
455+
// but without constructing the slice.
456+
func SplitSeq(s, sep []byte) iter.Seq[[]byte] {
457+
return splitSeq(s, sep, 0)
458+
}
459+
460+
// SplitAfterSeq returns an iterator over substrings of s split after each instance of sep.
461+
// The iterator yields the same strings that would be returned by SplitAfter(s, sep),
462+
// but without constructing the slice.
463+
func SplitAfterSeq(s, sep []byte) iter.Seq[[]byte] {
464+
return splitSeq(s, sep, len(sep))
465+
}
466+
393467
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
394468

395469
// Fields interprets s as a sequence of UTF-8-encoded code points.
@@ -446,6 +520,40 @@ func Fields(s []byte) [][]byte {
446520
return a
447521
}
448522

523+
// FieldsSeq returns an iterator over substrings of s split around runs of
524+
// whitespace characters, as defined by unicode.IsSpace.
525+
// The iterator yields the same strings that would be returned by Fields(s),
526+
// but without constructing the slice.
527+
func FieldsSeq(s []byte) iter.Seq[[]byte] {
528+
return func(yield func([]byte) bool) {
529+
s := s
530+
start := -1
531+
for i := 0; i < len(s); {
532+
size := 1
533+
r := rune(s[i])
534+
isSpace := asciiSpace[s[i]] != 0
535+
if r >= utf8.RuneSelf {
536+
r, size = utf8.DecodeRune(s[i:])
537+
isSpace = unicode.IsSpace(r)
538+
}
539+
if isSpace {
540+
if start >= 0 {
541+
if !yield(s[start:i]) {
542+
return
543+
}
544+
start = -1
545+
}
546+
} else if start < 0 {
547+
start = i
548+
}
549+
i += size
550+
}
551+
if start >= 0 {
552+
yield(s[start:])
553+
}
554+
}
555+
}
556+
449557
// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
450558
// It splits the slice s at each run of code points c satisfying f(c) and
451559
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
@@ -500,6 +608,38 @@ func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
500608
return a
501609
}
502610

611+
// FieldsFuncSeq returns an iterator over substrings of s split around runs of
612+
// Unicode code points satisfying f(c).
613+
// The iterator yields the same strings that would be returned by FieldsFunc(s),
614+
// but without constructing the slice.
615+
func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] {
616+
return func(yield func([]byte) bool) {
617+
s := s
618+
start := -1
619+
for i := 0; i < len(s); {
620+
size := 1
621+
r := rune(s[i])
622+
if r >= utf8.RuneSelf {
623+
r, size = utf8.DecodeRune(s[i:])
624+
}
625+
if f(r) {
626+
if start >= 0 {
627+
if !yield(s[start:i]) {
628+
return
629+
}
630+
start = -1
631+
}
632+
} else if start < 0 {
633+
start = i
634+
}
635+
i += size
636+
}
637+
if start >= 0 {
638+
yield(s[start:])
639+
}
640+
}
641+
}
642+
503643
// Join concatenates the elements of s to create a new byte slice. The separator
504644
// sep is placed between elements in the resulting slice.
505645
func Join(s [][]byte, sep []byte) []byte {

src/bytes/bytes_test.go

+59
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
. "bytes"
99
"fmt"
1010
"internal/testenv"
11+
"iter"
1112
"math"
1213
"math/rand"
1314
"reflect"
@@ -769,6 +770,22 @@ func BenchmarkCountSingle(b *testing.B) {
769770
})
770771
}
771772

773+
var LinesTest = []string{
774+
"abc\nabc\n",
775+
"abc\r\nabc",
776+
"abc\r\n",
777+
"abc\n",
778+
}
779+
780+
func TestLines(t *testing.T) {
781+
for _, s := range LinesTest {
782+
result := Join(collect(t, Lines([]byte(s))), []byte(""))
783+
if string(result) != s {
784+
t.Errorf(`Join(collect(Lines(%q)), "") = %q`, s, result)
785+
}
786+
}
787+
}
788+
772789
type SplitTest struct {
773790
s string
774791
sep string
@@ -812,6 +829,14 @@ func TestSplit(t *testing.T) {
812829
t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, result, tt.a)
813830
continue
814831
}
832+
833+
if tt.n < 0 {
834+
result2 := sliceOfString(collect(t, SplitSeq([]byte(tt.s), []byte(tt.sep))))
835+
if !eq(result2, tt.a) {
836+
t.Errorf(`collect(SplitSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
837+
}
838+
}
839+
815840
if tt.n == 0 || len(a) == 0 {
816841
continue
817842
}
@@ -871,6 +896,13 @@ func TestSplitAfter(t *testing.T) {
871896
continue
872897
}
873898

899+
if tt.n < 0 {
900+
result2 := sliceOfString(collect(t, SplitAfterSeq([]byte(tt.s), []byte(tt.sep))))
901+
if !eq(result2, tt.a) {
902+
t.Errorf(`collect(SplitAfterSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, result2, tt.a)
903+
}
904+
}
905+
874906
if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
875907
t.Errorf("last appended result was %s; want %s", x, want)
876908
}
@@ -924,6 +956,11 @@ func TestFields(t *testing.T) {
924956
continue
925957
}
926958

959+
result2 := sliceOfString(collect(t, FieldsSeq([]byte(tt.s))))
960+
if !eq(result2, tt.a) {
961+
t.Errorf(`collect(FieldsSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
962+
}
963+
927964
if string(b) != tt.s {
928965
t.Errorf("slice changed to %s; want %s", string(b), tt.s)
929966
}
@@ -966,6 +1003,11 @@ func TestFieldsFunc(t *testing.T) {
9661003
t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a)
9671004
}
9681005

1006+
result2 := sliceOfString(collect(t, FieldsFuncSeq([]byte(tt.s), pred)))
1007+
if !eq(result2, tt.a) {
1008+
t.Errorf(`collect(FieldsFuncSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
1009+
}
1010+
9691011
if string(b) != tt.s {
9701012
t.Errorf("slice changed to %s; want %s", b, tt.s)
9711013
}
@@ -2278,3 +2320,20 @@ func TestClone(t *testing.T) {
22782320
}
22792321
}
22802322
}
2323+
2324+
func collect[T []byte | string](t *testing.T, seq iter.Seq[T]) []T {
2325+
out := collect1(seq)
2326+
out1 := collect1(seq)
2327+
if !reflect.DeepEqual(out, out1) {
2328+
t.Fatalf("inconsistent seq:\n%s\n%s", out, out1)
2329+
}
2330+
return out
2331+
}
2332+
2333+
func collect1[T []byte | string](seq iter.Seq[T]) []T {
2334+
var out []T
2335+
for x := range seq {
2336+
out = append(out, x)
2337+
}
2338+
return out
2339+
}

0 commit comments

Comments
 (0)