|
1 |
| -// run |
| 1 | +// run -gcflags=-d=maymorestack=runtime.mayMoreStackMove |
2 | 2 |
|
3 | 3 | // Copyright 2024 The Go Authors. All rights reserved.
|
4 | 4 | // Use of this source code is governed by a BSD-style
|
|
7 | 7 | package main
|
8 | 8 |
|
9 | 9 | import (
|
10 |
| - "bufio" |
11 |
| - "fmt" |
12 |
| - "io" |
13 | 10 | "iter"
|
14 |
| - "math/rand" |
15 |
| - "os" |
16 |
| - "strings" |
17 |
| - "unicode" |
18 | 11 | )
|
19 | 12 |
|
20 |
| -// WordReader is the struct that implements io.Reader |
21 |
| -type WordReader struct { |
22 |
| - scanner *bufio.Scanner |
23 |
| -} |
24 |
| - |
25 |
| -// NewWordReader creates a new WordReader from an io.Reader |
26 |
| -func NewWordReader(r io.Reader) *WordReader { |
27 |
| - scanner := bufio.NewScanner(r) |
28 |
| - scanner.Split(bufio.ScanWords) |
29 |
| - return &WordReader{ |
30 |
| - scanner: scanner, |
31 |
| - } |
32 |
| -} |
33 |
| - |
34 |
| -// Read reads data from the input stream and returns a single lowercase word at a time |
35 |
| -func (wr *WordReader) Read(p []byte) (n int, err error) { |
36 |
| - if !wr.scanner.Scan() { |
37 |
| - if err := wr.scanner.Err(); err != nil { |
38 |
| - return 0, err |
39 |
| - } |
40 |
| - return 0, io.EOF |
41 |
| - } |
42 |
| - word := wr.scanner.Text() |
43 |
| - cleanedWord := removeNonAlphabetic(word) |
44 |
| - if len(cleanedWord) == 0 { |
45 |
| - return wr.Read(p) |
46 |
| - } |
47 |
| - n = copy(p, []byte(cleanedWord)) |
48 |
| - return n, nil |
49 |
| -} |
50 |
| - |
51 |
| -// All returns an iterator allowing the caller to iterate over the WordReader using for/range. |
52 |
| -func (wr *WordReader) All() iter.Seq[string] { |
53 |
| - word := make([]byte, 1024) |
54 |
| - return func(yield func(string) bool) { |
55 |
| - var err error |
56 |
| - var n int |
57 |
| - for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) { |
58 |
| - if !yield(string(word[:n])) { |
| 13 | +func All() iter.Seq[int] { |
| 14 | + return func(yield func(int) bool) { |
| 15 | + for i := 0; i < 10; i++ { |
| 16 | + if !yield(i) { |
59 | 17 | return
|
60 | 18 | }
|
61 | 19 | }
|
62 |
| - if err != io.EOF { |
63 |
| - fmt.Fprintf(os.Stderr, "error reading word: %v\n", err) |
64 |
| - } |
65 | 20 | }
|
66 | 21 | }
|
67 | 22 |
|
68 |
| -// removeNonAlphabetic removes non-alphabetic characters from a word using strings.Map |
69 |
| -func removeNonAlphabetic(word string) string { |
70 |
| - return strings.Map(func(r rune) rune { |
71 |
| - if unicode.IsLetter(r) { |
72 |
| - return unicode.ToLower(r) |
73 |
| - } |
74 |
| - return -1 |
75 |
| - }, word) |
76 |
| -} |
77 |
| - |
78 |
| -// ProbabilisticSkipper determines if an item should be retained with probability 1/(1<<n) |
79 |
| -type ProbabilisticSkipper struct { |
80 |
| - n int |
81 |
| - counter uint64 |
82 |
| - bitmask uint64 |
| 23 | +type S struct { |
| 24 | + round int |
83 | 25 | }
|
84 | 26 |
|
85 |
| -// NewProbabilisticSkipper initializes the ProbabilisticSkipper |
86 |
| -func NewProbabilisticSkipper(n int) *ProbabilisticSkipper { |
87 |
| - pr := &ProbabilisticSkipper{n: n} |
88 |
| - pr.refreshCounter() |
89 |
| - return pr |
| 27 | +func NewS(round int) *S { |
| 28 | + s := &S{round: round} |
| 29 | + return s |
90 | 30 | }
|
91 | 31 |
|
92 |
| -// check panics if pr.n is not the expected value |
93 |
| -func (pr *ProbabilisticSkipper) check(n int) { |
94 |
| - if pr.n != n { |
95 |
| - panic(fmt.Sprintf("check: pr.n != n %d != %d", pr.n, n)) |
| 32 | +func (s *S) check(round int) { |
| 33 | + if s.round != round { |
| 34 | + panic("bad round") |
96 | 35 | }
|
97 | 36 | }
|
98 | 37 |
|
99 |
| -// refreshCounter refreshes the counter with a new random value |
100 |
| -func (pr *ProbabilisticSkipper) refreshCounter() { |
101 |
| - if pr.n == 0 { |
102 |
| - pr.bitmask = ^uint64(0) // All bits set to 1 |
103 |
| - } else { |
104 |
| - pr.bitmask = rand.Uint64() |
105 |
| - for i := 0; i < pr.n-1; i++ { |
106 |
| - pr.bitmask &= rand.Uint64() |
107 |
| - } |
108 |
| - } |
109 |
| - pr.counter = 64 |
110 |
| -} |
111 |
| - |
112 |
| -// ShouldSkip returns true with probability 1/(1<<n) |
113 |
| -func (pr *ProbabilisticSkipper) ShouldSkip() bool { |
114 |
| - remove := pr.bitmask&1 == 0 |
115 |
| - pr.bitmask >>= 1 |
116 |
| - pr.counter-- |
117 |
| - if pr.counter == 0 { |
118 |
| - pr.refreshCounter() |
119 |
| - } |
120 |
| - return remove |
121 |
| -} |
122 |
| - |
123 |
| -// EstimateUniqueWordsIter estimates the number of unique words using a probabilistic counting method |
124 |
| -func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int { |
125 |
| - wordReader := NewWordReader(reader) |
126 |
| - words := make(map[string]struct{}, memorySize) |
127 |
| - |
| 38 | +func f() { |
128 | 39 | rounds := 0
|
129 |
| - roundRemover := NewProbabilisticSkipper(1) |
130 |
| - wordSkipper := NewProbabilisticSkipper(rounds) |
131 |
| - wordSkipper.check(rounds) |
132 |
| - |
133 |
| - for word := range wordReader.All() { |
134 |
| - wordSkipper.check(rounds) |
135 |
| - if wordSkipper.ShouldSkip() { |
136 |
| - delete(words, word) |
137 |
| - } else { |
138 |
| - words[word] = struct{}{} |
139 |
| - |
140 |
| - if len(words) >= memorySize { |
141 |
| - rounds++ |
| 40 | + s := NewS(rounds) |
| 41 | + s.check(rounds) |
142 | 42 |
|
143 |
| - wordSkipper = NewProbabilisticSkipper(rounds) |
144 |
| - for w := range words { |
145 |
| - if roundRemover.ShouldSkip() { |
146 |
| - delete(words, w) |
147 |
| - } |
148 |
| - } |
149 |
| - } |
150 |
| - } |
151 |
| - wordSkipper.check(rounds) |
152 |
| - } |
153 |
| - |
154 |
| - if len(words) == 0 { |
155 |
| - return 0 |
| 43 | + for range All() { |
| 44 | + s.check(rounds) |
| 45 | + rounds++ |
| 46 | + s = NewS(rounds) |
| 47 | + s.check(rounds) |
156 | 48 | }
|
157 |
| - |
158 |
| - invProbability := 1 << rounds |
159 |
| - estimatedUniqueWords := len(words) * invProbability |
160 |
| - return estimatedUniqueWords |
161 | 49 | }
|
162 | 50 |
|
163 | 51 | func main() {
|
164 |
| - input := "Hello, world! This is a test. Hello, world, hello!" |
165 |
| - expectedUniqueWords := 6 // "hello", "world", "this", "is", "a", "test" (but "hello" and "world" are repeated) |
166 |
| - memorySize := 6 |
167 |
| - |
168 |
| - reader := strings.NewReader(input) |
169 |
| - estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize) |
170 |
| - if estimatedUniqueWords != expectedUniqueWords { |
171 |
| - // ... |
172 |
| - } |
| 52 | + f() |
173 | 53 | }
|
0 commit comments