Skip to content

Commit 16abfe5

Browse files
committed
Implement Michael's suggestions
1 parent 85352c2 commit 16abfe5

File tree

1 file changed

+99
-69
lines changed

1 file changed

+99
-69
lines changed

stdlib/public/core/StringGraphemeBreaking.swift

+99-69
Original file line numberDiff line numberDiff line change
@@ -99,24 +99,14 @@ extension _StringGuts {
9999
@usableFromInline @inline(never)
100100
@_effects(releasenone)
101101
internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
102-
let nextIdx: Int
103-
104102
if _slowPath(isForeign) {
105-
nextIdx = nextBoundary(startingAt: i) {
106-
let scalars = String.UnicodeScalarView(self)
107-
let idx = String.Index(_encodedOffset: $0)
108-
109-
let scalar = scalars[idx]
110-
let nextIdx = scalars.index(after: idx)
103+
return _foreignOpaqueCharacterStride(startingAt: i)
104+
}
111105

112-
return (scalar, nextIdx._encodedOffset)
113-
}
114-
} else {
115-
nextIdx = withFastUTF8 { utf8 in
116-
nextBoundary(startingAt: i) {
117-
let (scalar, len) = _decodeScalar(utf8, startingAt: $0)
118-
return (scalar, $0 &+ len)
119-
}
106+
let nextIdx = withFastUTF8 { utf8 in
107+
nextBoundary(startingAt: i) {
108+
let (scalar, len) = _decodeScalar(utf8, startingAt: $0)
109+
return (scalar, $0 &+ len)
120110
}
121111
}
122112

@@ -126,45 +116,83 @@ extension _StringGuts {
126116
@usableFromInline @inline(never)
127117
@_effects(releasenone)
128118
internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
129-
let previousIdx: Int
130-
131119
if _slowPath(isForeign) {
132-
previousIdx = previousBoundary(endingAt: i) {
133-
let scalars = String.UnicodeScalarView(self)
134-
let idx = String.Index(_encodedOffset: $0)
135-
136-
let previousIdx = scalars.index(before: idx)
137-
let scalar = scalars[previousIdx]
120+
return _foreignOpaqueCharacterStride(endingAt: i)
121+
}
138122

139-
return (scalar, previousIdx._encodedOffset)
140-
}
141-
} else {
142-
previousIdx = withFastUTF8 { utf8 in
143-
previousBoundary(endingAt: i) {
144-
let (scalar, len) = _decodeScalar(utf8, endingAt: $0)
145-
return (scalar, $0 &- len)
146-
}
123+
let previousIdx = withFastUTF8 { utf8 in
124+
previousBoundary(endingAt: i) {
125+
let (scalar, len) = _decodeScalar(utf8, endingAt: $0)
126+
return (scalar, $0 &- len)
147127
}
148128
}
149129

150130
return i &- previousIdx
151131
}
152-
}
153132

154-
internal struct _GraphemeBreakingState {
155-
var isBackwards: Bool = false
156-
var isInEmojiSequence: Bool = false
157-
var shouldBreakRI: Bool = false
133+
@inline(never)
134+
@_effects(releasenone)
135+
private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int {
136+
#if _runtime(_ObjC)
137+
_internalInvariant(isForeign)
138+
139+
let nextIdx = nextBoundary(startingAt: i) {
140+
let scalars = String.UnicodeScalarView(self)
141+
let idx = String.Index(_encodedOffset: $0)
142+
143+
let scalar = scalars[idx]
144+
let nextIdx = scalars.index(after: idx)
158145

159-
static func forward() -> _GraphemeBreakingState {
160-
_GraphemeBreakingState()
146+
return (scalar, nextIdx._encodedOffset)
147+
}
148+
149+
return nextIdx &- i
150+
#else
151+
fatalError("No foreign strings on Linux in this version of Swift")
152+
#endif
161153
}
162154

163-
static func backward() -> _GraphemeBreakingState {
164-
_GraphemeBreakingState(isBackwards: true)
155+
@inline(never)
156+
@_effects(releasenone)
157+
private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int {
158+
#if _runtime(_ObjC)
159+
_internalInvariant(isForeign)
160+
161+
let previousIdx = previousBoundary(endingAt: i) {
162+
let scalars = String.UnicodeScalarView(self)
163+
let idx = String.Index(_encodedOffset: $0)
164+
165+
let previousIdx = scalars.index(before: idx)
166+
let scalar = scalars[previousIdx]
167+
168+
return (scalar, previousIdx._encodedOffset)
169+
}
170+
171+
return i &- previousIdx
172+
#else
173+
fatalError("No foreign strings on Linux in this version of Swift")
174+
#endif
165175
}
166176
}
167177

178+
internal struct _GraphemeBreakingState {
179+
// When walking forwards in a string, we need to know whether or not we've
180+
// entered an emoji sequence to be able to eventually break after all of the
181+
// emoji's various extenders and zero width joiners. This bit allows us to
182+
// keep track of whether or not we're still in an emoji sequence when deciding
183+
// to break.
184+
var isInEmojiSequence: Bool = false
185+
186+
// When walking forward in a string, we must not break inside emoji flag
187+
// sequences. Emoji flag sequences are composed of 2 regional indicators, so
188+
// when we see our first (.regionalIndicator, .regionalIndicator) decision,
189+
// we need to know to return false in this case. However, if the next scalar
190+
// is another regional indicator, we reach the same decision rule, but in this
191+
// case we actually need to break because there's a boundary between emoji
192+
// flag sequences.
193+
var shouldBreakRI: Bool = false
194+
}
195+
168196
extension _StringGuts {
169197
// Returns the stride of the next grapheme cluster at the previous boundary
170198
// offset.
@@ -173,7 +201,7 @@ extension _StringGuts {
173201
nextScalar: (Int) -> (Unicode.Scalar, end: Int)
174202
) -> Int {
175203
_internalInvariant(index != endIndex._encodedOffset)
176-
var state = _GraphemeBreakingState.forward()
204+
var state = _GraphemeBreakingState()
177205
var index = index
178206

179207
while true {
@@ -201,7 +229,7 @@ extension _StringGuts {
201229
previousScalar: (Int) -> (Unicode.Scalar, start: Int)
202230
) -> Int {
203231
_internalInvariant(index != startIndex._encodedOffset)
204-
var state = _GraphemeBreakingState.backward()
232+
var state = _GraphemeBreakingState()
205233
var index = index
206234

207235
while true {
@@ -214,7 +242,13 @@ extension _StringGuts {
214242

215243
let (scalar1, _) = previousScalar(index)
216244

217-
if shouldBreak(scalar1, between: scalar2, &state, index) {
245+
if shouldBreak(
246+
scalar1,
247+
between: scalar2,
248+
&state,
249+
index,
250+
isBackwards: true
251+
) {
218252
break
219253
}
220254
}
@@ -233,7 +267,8 @@ extension _StringGuts {
233267
_ scalar1: Unicode.Scalar,
234268
between scalar2: Unicode.Scalar,
235269
_ state: inout _GraphemeBreakingState,
236-
_ index: Int
270+
_ index: Int,
271+
isBackwards: Bool = false
237272
) -> Bool {
238273
// GB3
239274
if scalar1.value == 0xD, scalar2.value == 0xA {
@@ -315,24 +350,23 @@ extension _StringGuts {
315350

316351
// GB11
317352
case (.zwj, .extendedPictographic):
318-
if state.isBackwards {
319-
checkIfInEmojiSequence(&state, index)
353+
if isBackwards {
354+
return !checkIfInEmojiSequence(index)
320355
}
321356

322-
if state.isInEmojiSequence {
323-
return false
324-
} else {
325-
return true
326-
}
357+
return !state.isInEmojiSequence
327358

328359
// GB12 & GB13
329360
case (.regionalIndicator, .regionalIndicator):
330-
if state.isBackwards {
331-
countRIs(&state, index)
361+
if isBackwards {
362+
return countRIs(index)
332363
}
333364

334-
state.shouldBreakRI.toggle()
335-
return !state.shouldBreakRI
365+
defer {
366+
state.shouldBreakRI.toggle()
367+
}
368+
369+
return state.shouldBreakRI
336370

337371
// GB999
338372
default:
@@ -384,14 +418,12 @@ extension _StringGuts {
384418
// know that we are in an emoji sequence so our initial
385419
// break question is answered as NO.
386420
internal func checkIfInEmojiSequence(
387-
_ state: inout _GraphemeBreakingState,
388421
_ index: Int
389-
) {
422+
) -> Bool {
390423
var emojiIdx = String.Index(_encodedOffset: index)
391424

392425
guard emojiIdx != startIndex else {
393-
state.isInEmojiSequence = false
394-
return
426+
return false
395427
}
396428

397429
let scalars = String.UnicodeScalarView(self)
@@ -407,13 +439,13 @@ extension _StringGuts {
407439
case .extend:
408440
continue
409441
case .extendedPictographic:
410-
state.isInEmojiSequence = true
411-
return
442+
return true
412443
default:
413-
state.isInEmojiSequence = false
414-
return
444+
return false
415445
}
416446
}
447+
448+
return false
417449
}
418450

419451
// When walking backwards, it's impossible to know whether we break when we
@@ -447,14 +479,12 @@ extension _StringGuts {
447479
// | = Not a .regionalIndicator. riCount = 1 which is odd, so break
448480
// the last two .regionalIndicators.
449481
internal func countRIs(
450-
_ state: inout _GraphemeBreakingState,
451482
_ index: Int
452-
) {
483+
) -> Bool {
453484
var riIdx = String.Index(_encodedOffset: index)
454485

455486
guard riIdx != startIndex else {
456-
state.shouldBreakRI = false
457-
return
487+
return false
458488
}
459489

460490
var riCount = 0
@@ -475,6 +505,6 @@ extension _StringGuts {
475505
riCount += 1
476506
}
477507

478-
state.shouldBreakRI = riCount & 1 != 0
508+
return riCount & 1 != 0
479509
}
480510
}

0 commit comments

Comments
 (0)