Skip to content

bytes, strings: add Lines, SplitSeq, SplitAfterSeq, FieldsSeq, FieldsFuncSeq #67543

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions api/next/61901.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
pkg bytes, func FieldsFuncSeq([]uint8, func(int32) bool) iter.Seq[[]uint8] #61901
pkg bytes, func FieldsSeq([]uint8) iter.Seq[[]uint8] #61901
pkg bytes, func Lines([]uint8) iter.Seq[[]uint8] #61901
pkg bytes, func SplitAfterSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
pkg bytes, func SplitSeq([]uint8, []uint8) iter.Seq[[]uint8] #61901
pkg strings, func FieldsFuncSeq(string, func(int32) bool) iter.Seq[string] #61901
pkg strings, func FieldsSeq(string) iter.Seq[string] #61901
pkg strings, func Lines(string) iter.Seq[string] #61901
pkg strings, func SplitAfterSeq(string, string) iter.Seq[string] #61901
pkg strings, func SplitSeq(string, string) iter.Seq[string] #61901
12 changes: 12 additions & 0 deletions doc/next/6-stdlib/99-minor/bytes/61901.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
The [bytes] package adds several functions that work with iterators:
- [Lines] returns an iterator over the
newline-terminated lines in the byte slice s.
- [SplitSeq] returns an iterator over
all subslices of s separated by sep.
- [SplitAfterSeq] returns an iterator
over subslices of s split after each instance of sep.
- [FieldsSeq] returns an iterator over
subslices of s split around runs of whitespace characters,
as defined by [unicode.IsSpace].
- [FieldsFuncSeq] returns an iterator
over subslices of s split around runs of Unicode code points satisfying f(c).
12 changes: 12 additions & 0 deletions doc/next/6-stdlib/99-minor/strings/61901.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
The [strings] package adds several functions that work with iterators:
- [Lines] returns an iterator over
the newline-terminated lines in the string s.
- [SplitSeq] returns an iterator over
all substrings of s separated by sep.
- [SplitAfterSeq] returns an iterator
over substrings of s split after each instance of sep.
- [FieldsSeq] returns an iterator over
substrings of s split around runs of whitespace characters,
as defined by [unicode.IsSpace].
- [FieldsFuncSeq] returns an iterator
over substrings of s split around runs of Unicode code points satisfying f(c).
57 changes: 57 additions & 0 deletions src/bytes/bytes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
. "bytes"
"fmt"
"internal/testenv"
"iter"
"math"
"math/rand"
"slices"
Expand All @@ -26,6 +27,37 @@ func sliceOfString(s [][]byte) []string {
return result
}

// collect drains seq twice and fails the test if the two passes disagree,
// which checks that the iterator can be ranged over more than once.
// It returns the elements produced by the first pass.
func collect(t *testing.T, seq iter.Seq[[]byte]) [][]byte {
	// Attribute failures to the calling test's line, not to this helper.
	t.Helper()
	out := slices.Collect(seq)
	out1 := slices.Collect(seq)
	// []byte is not a comparable type, so the passes are compared as strings.
	if !slices.Equal(sliceOfString(out), sliceOfString(out1)) {
		t.Fatalf("inconsistent seq:\n%s\n%s", out, out1)
	}
	return out
}

// LinesTest is one table entry for TestLines.
type LinesTest struct {
	// a is the input; TestLines converts it to []byte before calling Lines.
	a string
	// b lists the lines the iterator is expected to yield, in order.
	b []string
}

// linesTests covers LF and CRLF line endings, leading and consecutive
// newlines, input with and without a final newline, and empty input.
var linesTests = []LinesTest{
	{a: "abc\nabc\n", b: []string{"abc\n", "abc\n"}},
	{a: "abc\r\nabc", b: []string{"abc\r\n", "abc"}},
	{a: "abc\r\n", b: []string{"abc\r\n"}},
	{a: "\nabc", b: []string{"\n", "abc"}},
	{a: "\nabc\n\n", b: []string{"\n", "abc\n", "\n"}},
	// Edge cases promised by the Lines documentation.
	{a: "", b: nil},                // empty input yields no lines at all
	{a: "abc", b: []string{"abc"}}, // a single unterminated line is yielded as is
	{a: "\n", b: []string{"\n"}},   // a lone newline is one (empty) line
}

// TestLines runs Lines over every entry of linesTests, collects the yielded
// lines, and compares them (as strings) with the expected list.
func TestLines(t *testing.T) {
	for i := range linesTests {
		tc := &linesTests[i]
		seq := Lines([]byte(tc.a))
		got := sliceOfString(slices.Collect(seq))
		if slices.Equal(got, tc.b) {
			continue
		}
		t.Errorf(`slices.Collect(Lines(%q)) = %q; want %q`, tc.a, got, tc.b)
	}
}

// For ease of reading, the test cases use strings that are converted to byte
// slices before invoking the functions.

Expand Down Expand Up @@ -800,6 +832,14 @@ func TestSplit(t *testing.T) {
t.Errorf(`Split(%q, %q, %d) = %v; want %v`, tt.s, tt.sep, tt.n, result, tt.a)
continue
}

if tt.n < 0 {
b := sliceOfString(slices.Collect(SplitSeq([]byte(tt.s), []byte(tt.sep))))
if !slices.Equal(b, tt.a) {
t.Errorf(`collect(SplitSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, b, tt.a)
}
}

if tt.n == 0 || len(a) == 0 {
continue
}
Expand Down Expand Up @@ -859,6 +899,13 @@ func TestSplitAfter(t *testing.T) {
continue
}

if tt.n < 0 {
b := sliceOfString(slices.Collect(SplitAfterSeq([]byte(tt.s), []byte(tt.sep))))
if !slices.Equal(b, tt.a) {
t.Errorf(`collect(SplitAfterSeq(%q, %q)) = %v; want %v`, tt.s, tt.sep, b, tt.a)
}
}

if want := tt.a[len(tt.a)-1] + "z"; string(x) != want {
t.Errorf("last appended result was %s; want %s", x, want)
}
Expand Down Expand Up @@ -912,6 +959,11 @@ func TestFields(t *testing.T) {
continue
}

result2 := sliceOfString(collect(t, FieldsSeq([]byte(tt.s))))
if !slices.Equal(result2, tt.a) {
t.Errorf(`collect(FieldsSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
}

if string(b) != tt.s {
t.Errorf("slice changed to %s; want %s", string(b), tt.s)
}
Expand Down Expand Up @@ -954,6 +1006,11 @@ func TestFieldsFunc(t *testing.T) {
t.Errorf("FieldsFunc(%q) = %v, want %v", tt.s, a, tt.a)
}

result2 := sliceOfString(collect(t, FieldsFuncSeq([]byte(tt.s), pred)))
if !slices.Equal(result2, tt.a) {
t.Errorf(`collect(FieldsFuncSeq(%q)) = %v; want %v`, tt.s, result2, tt.a)
}

if string(b) != tt.s {
t.Errorf("slice changed to %s; want %s", b, tt.s)
}
Expand Down
148 changes: 148 additions & 0 deletions src/bytes/iter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package bytes

import (
"iter"
"unicode"
"unicode/utf8"
)

// Lines returns an iterator over the newline-terminated lines in the byte slice s.
// The lines yielded by the iterator include their terminating newlines.
// If s is empty, the iterator yields no lines at all.
// If s does not end in a newline, the final yielded line will not end in a newline.
// Each yielded line is a subslice of s whose capacity is clipped to its length,
// so appending to a line cannot overwrite the bytes that follow it in s.
// The iterator may be ranged over more than once; every pass starts again
// from the beginning of s.
func Lines(s []byte) iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		// Advance a per-call cursor instead of the captured s, so that a
		// later pass does not resume where an earlier one stopped.
		rest := s
		for len(rest) > 0 {
			var line []byte
			if i := IndexByte(rest, '\n'); i >= 0 {
				line, rest = rest[:i+1], rest[i+1:]
			} else {
				// No newline left: the remainder is the final line.
				line, rest = rest, nil
			}
			if !yield(line[:len(line):len(line)]) {
				return
			}
		}
	}
}

// explodeSeq returns an iterator over the runes in s.
func explodeSeq(s []byte) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
for len(s) > 0 {
_, size := utf8.DecodeRune(s)
if !yield(s[:size:size]) {
return
}
s = s[size:]
}
}
}

// splitSeq is SplitSeq or SplitAfterSeq, configured by how many
// bytes of sep to include in the results (none or all).
// An empty sep delegates to explodeSeq. Otherwise the iterator yields the
// fragment before each occurrence of sep (plus sepSave bytes of it) and
// finally whatever follows the last occurrence, which may be empty.
// Every yielded subslice has its capacity clipped to its length.
// For a non-empty sep the iterator keeps its own cursor and never modifies
// the captured s, so every pass over it starts from the beginning.
func splitSeq(s, sep []byte, sepSave int) iter.Seq[[]byte] {
	if len(sep) == 0 {
		return explodeSeq(s)
	}
	return func(yield func([]byte) bool) {
		// Per-call cursor: advancing the captured s instead would make a
		// second pass yield only the stale final fragment.
		rest := s
		for {
			i := Index(rest, sep)
			if i < 0 {
				break
			}
			frag := rest[:i+sepSave]
			if !yield(frag[:len(frag):len(frag)]) {
				return
			}
			rest = rest[i+len(sep):]
		}
		// The tail after the last separator is always yielded.
		yield(rest[:len(rest):len(rest)])
	}
}

// SplitSeq returns an iterator over all subslices of s separated by sep.
// The iterator yields the same subslices that would be returned by Split(s, sep),
// but without constructing the slice that holds them.
// Callers should treat the returned iterator as single-use.
func SplitSeq(s, sep []byte) iter.Seq[[]byte] {
	// A sepSave of 0 keeps no separator bytes in the yielded subslices.
	return splitSeq(s, sep, 0)
}

// SplitAfterSeq returns an iterator over subslices of s split after each instance of sep.
// The iterator yields the same subslices that would be returned by SplitAfter(s, sep),
// but without constructing the slice that holds them.
// Callers should treat the returned iterator as single-use.
func SplitAfterSeq(s, sep []byte) iter.Seq[[]byte] {
	// A sepSave of len(sep) keeps the whole separator at the end of each yielded subslice.
	return splitSeq(s, sep, len(sep))
}

// FieldsSeq returns an iterator over the subslices of s that lie between
// runs of whitespace characters, as defined by unicode.IsSpace.
// The iterator yields the same subslices that would be returned by Fields(s),
// one at a time and without constructing the result slice.
// Every yielded subslice has its capacity clipped to its length.
func FieldsSeq(s []byte) iter.Seq[[]byte] {
	return func(yield func([]byte) bool) {
		fieldStart := -1 // start of the field being scanned; negative between fields
		for pos, width := 0, 0; pos < len(s); pos += width {
			c := s[pos]
			width = 1
			// Table lookup for the first byte; replaced below when that
			// byte begins a multi-byte sequence.
			space := asciiSpace[c] != 0
			if c >= utf8.RuneSelf {
				var r rune
				r, width = utf8.DecodeRune(s[pos:])
				space = unicode.IsSpace(r)
			}
			switch {
			case !space:
				if fieldStart < 0 {
					fieldStart = pos
				}
			case fieldStart >= 0:
				// Whitespace closes the open field.
				if !yield(s[fieldStart:pos:pos]) {
					return
				}
				fieldStart = -1
			}
		}
		// A field that runs to the end of s has no closing whitespace.
		if fieldStart >= 0 {
			yield(s[fieldStart:len(s):len(s)])
		}
	}
}

// FieldsFuncSeq returns an iterator over substrings of s split around runs of
// Unicode code points satisfying f(c).
// The iterator yields the same strings that would be returned by FieldsFunc(s),
// but without constructing the slice.
func FieldsFuncSeq(s []byte, f func(rune) bool) iter.Seq[[]byte] {
return func(yield func([]byte) bool) {
start := -1
for i := 0; i < len(s); {
size := 1
r := rune(s[i])
if r >= utf8.RuneSelf {
r, size = utf8.DecodeRune(s[i:])
}
if f(r) {
if start >= 0 {
if !yield(s[start:i:i]) {
return
}
start = -1
}
} else if start < 0 {
start = i
}
i += size
}
if start >= 0 {
yield(s[start:len(s):len(s)])
}
}
}
Loading