Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ DerivedData/
.swiftpm/config/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
.idea
.idea
.index-build
*.out
2 changes: 1 addition & 1 deletion Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ let package = Package(
.target(name: "Generation", dependencies: ["Tokenizers", "TensorUtils"]),
.target(name: "Models", dependencies: ["Tokenizers", "Generation", "TensorUtils"]),
.testTarget(name: "TokenizersTests", dependencies: ["Tokenizers", "Models", "Hub"], resources: [.process("Resources"), .process("Vocabs")]),
.testTarget(name: "HubTests", dependencies: ["Hub"]),
.testTarget(name: "HubTests", dependencies: ["Hub", .product(name: "Jinja", package: "Jinja")]),
.testTarget(name: "PreTokenizerTests", dependencies: ["Tokenizers", "Hub"]),
.testTarget(name: "TensorUtilsTests", dependencies: ["TensorUtils", "Models", "Hub"], resources: [.process("Resources")]),
.testTarget(name: "NormalizerTests", dependencies: ["Tokenizers", "Hub"]),
Expand Down
242 changes: 242 additions & 0 deletions Sources/Hub/BinaryDistinct.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
//
// BinaryDistinct.swift
// swift-transformers
//
// Created by Piotr Kowalczuk on 06.03.25.
//

import Foundation

/// A string that is compared, hashed and ordered by its exact UTF-16 code units.
///
/// `BinaryDistinctString` helps to overcome limitations of both `String` and `NSString`: the former
/// compares strings using Unicode canonical equivalence, and the latter is not `Sendable`. For more
/// reference see [Modifying-and-Comparing-Strings](https://developer.apple.com/documentation/swift/string#Modifying-and-Comparing-Strings).
public struct BinaryDistinctString: Equatable, Hashable, Sendable, Comparable, CustomStringConvertible, ExpressibleByStringLiteral {
    /// The UTF-16 code units that make up the string.
    public let value: [UInt16]

    /// The contents as an `NSString` built from ``value``.
    public var nsString: NSString {
        String(utf16CodeUnits: value, count: value.count) as NSString
    }

    /// The contents as a Swift `String`.
    public var string: String {
        String(nsString)
    }

    /// The number of `Character`s in ``string``. This is not the number of code units in ``value``.
    public var count: Int {
        string.count
    }

    /// Satisfies ``CustomStringConvertible`` protocol.
    public var description: String {
        string
    }

    /// Creates a string that stores the given UTF-16 code units as they are.
    public init(_ bytes: [UInt16]) {
        value = bytes
    }

    /// Creates a string from the UTF-16 code units of `str`.
    public init(_ str: NSString) {
        self.init(str as String)
    }

    /// Creates a string from the UTF-16 code units of `str`.
    public init(_ str: String) {
        // Read the UTF-16 view directly instead of bridging through NSString and
        // flattening an intermediate array of Characters.
        value = Array(str.utf16)
    }

    /// Creates a string holding the code units of a single character.
    public init(_ character: BinaryDistinctCharacter) {
        value = character.bytes
    }

    /// Creates a string by concatenating the code units of `characters` in order.
    public init(_ characters: [BinaryDistinctCharacter]) {
        value = characters.flatMap(\.bytes)
    }

    /// Satisfies ``ExpressibleByStringLiteral`` protocol.
    public init(stringLiteral value: String) {
        self.init(value)
    }

    /// Two strings are equal only when their code units are identical.
    public static func == (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> Bool {
        lhs.value == rhs.value
    }

    /// Orders strings lexicographically by code unit.
    public static func < (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> Bool {
        lhs.value.lexicographicallyPrecedes(rhs.value)
    }

    /// Returns the concatenation of the code units of `lhs` and `rhs`.
    public static func + (lhs: BinaryDistinctString, rhs: BinaryDistinctString) -> BinaryDistinctString {
        BinaryDistinctString(lhs.value + rhs.value)
    }

    /// Returns `true` when the code units of this string begin with those of `prefix`.
    public func hasPrefix(_ prefix: BinaryDistinctString) -> Bool {
        // `starts(with:)` is already false when `prefix` is longer than `value`.
        value.starts(with: prefix.value)
    }

    /// Returns `true` when the code units of this string end with those of `suffix`.
    public func hasSuffix(_ suffix: BinaryDistinctString) -> Bool {
        // `suffix(_:)` returns at most `value.count` elements, so a longer `suffix` never matches.
        value.suffix(suffix.value.count).elementsEqual(suffix.value)
    }

    /// Returns a copy produced by lowercasing ``string`` with `String.lowercased()`.
    public func lowercased() -> BinaryDistinctString {
        BinaryDistinctString(string.lowercased())
    }

    /// Returns a copy in which every occurrence of `of` is replaced by `with`.
    ///
    /// - Parameters:
    ///   - of: The string to search for.
    ///   - with: The string substituted for each match.
    public func replacingOccurrences(of: Self, with: Self) -> BinaryDistinctString {
        // `.literal` asks Foundation for exact, character-by-character matching. The default
        // search treats canonically equivalent sequences as matches, which would contradict
        // the code-unit semantics used by `==`, `hasPrefix` and `hasSuffix` on this type.
        BinaryDistinctString(string.replacingOccurrences(of: of.string, with: with.string, options: .literal))
    }
}

public extension BinaryDistinctString {
    /// Positions are plain integer offsets.
    typealias Index = Int

    /// The offset of the first position, always `0`.
    var startIndex: Index { 0 }

    /// The offset one past the last position; equal to `count`.
    var endIndex: Index { count }

    /// Returns the offset that lies `distance` positions away from `i`.
    ///
    /// Stops the program with "Index out of bounds" when the result would fall outside `0...count`.
    func index(_ i: Index, offsetBy distance: Int) -> Index {
        let newIndex = i + distance
        guard newIndex >= 0, newIndex <= count else {
            fatalError("Index out of bounds")
        }
        return newIndex
    }

    /// Returns the offset that lies `distance` positions away from `i`, or `nil` when getting
    /// there would move past `limit`.
    ///
    /// - Parameters:
    ///   - i: The starting offset.
    ///   - distance: How far to move; negative values move towards the start.
    ///   - limit: An upper bound when `distance` is non-negative, a lower bound when it is negative.
    func index(_ i: Index, offsetBy distance: Int, limitedBy limit: Index) -> Index? {
        let newIndex = i + distance
        if distance < 0 {
            // Moving backwards: the limit is a floor, not a ceiling.
            return newIndex >= limit ? newIndex : nil
        }
        return newIndex <= limit ? newIndex : nil
    }
}

extension BinaryDistinctString: Sequence {
    /// Returns an iterator that walks the `Character`s of ``string`` and wraps each one in a
    /// `BinaryDistinctCharacter`.
    public func makeIterator() -> AnyIterator<BinaryDistinctCharacter> {
        // Iteration is delegated to the native Swift String iterator.
        var characters = string.makeIterator()
        return AnyIterator {
            characters.next().map { BinaryDistinctCharacter($0) }
        }
    }
}

public extension BinaryDistinctString {
    /// Returns the part of the string that starts at character offset `bounds.lowerBound` and
    /// runs to the end.
    ///
    /// Offsets count `Character`s, the same unit as `count`. Stops the program with
    /// "Index out of bounds" when the offset lies outside `0...count`.
    subscript(bounds: PartialRangeFrom<Int>) -> BinaryDistinctString {
        // The upper bound must be expressed in the same unit the range subscript validates
        // against (characters), not in UTF-16 code units.
        let characterCount = count
        guard bounds.lowerBound <= characterCount else {
            fatalError("Index out of bounds")
        }
        return self[bounds.lowerBound..<characterCount]
    }

    /// Returns the slice covering the characters at offsets `bounds`, cut on grapheme cluster
    /// boundaries.
    ///
    /// Offsets count `Character`s, the same unit as `count`. Stops the program with
    /// "Index out of bounds" when `bounds` is not contained in `0..<count`.
    subscript(bounds: Range<Int>) -> BinaryDistinctString {
        let text = string

        // Walk to the requested character boundaries; `limitedBy` doubles as the bounds check.
        guard bounds.lowerBound >= 0,
              let lower = text.index(text.startIndex, offsetBy: bounds.lowerBound, limitedBy: text.endIndex),
              let upper = text.index(lower, offsetBy: bounds.count, limitedBy: text.endIndex)
        else {
            fatalError("Index out of bounds")
        }

        // `value` holds UTF-16 code units, so the boundaries are translated into UTF-16
        // offsets (not UTF-8 lengths) before slicing the stored units.
        let units = text.utf16
        let firstUnit = units.distance(from: units.startIndex, to: lower)
        let endUnit = units.distance(from: units.startIndex, to: upper)

        return BinaryDistinctString(Array(value[firstUnit..<endUnit]))
    }
}

public extension Dictionary where Key == BinaryDistinctString {
    /// Merges another dictionary keyed by `BinaryDistinctString` into this one.
    ///
    /// - Parameters:
    ///   - other: The entries to merge in.
    ///   - strategy: Receives the existing and the incoming value for a key present on both
    ///     sides and returns the value to keep. By default the incoming value wins.
    mutating func merge(_ other: [BinaryDistinctString: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        merge(other, uniquingKeysWith: strategy)
    }

    /// Merges a `[String: Value]` dictionary into this one, converting each key to a
    /// `BinaryDistinctString`.
    ///
    /// - Parameters:
    ///   - other: The entries to merge in.
    ///   - strategy: Receives the existing and the incoming value for a key present on both
    ///     sides and returns the value to keep. By default the incoming value wins.
    mutating func merge(_ other: [String: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        // Feed the converted pairs straight into the merge; no intermediate dictionary is
        // needed, and any key collision is resolved by `strategy`.
        merge(other.lazy.map { (BinaryDistinctString($0.key), $0.value) }, uniquingKeysWith: strategy)
    }

    /// Merges a `[NSString: Value]` dictionary into this one, converting each key to a
    /// `BinaryDistinctString`.
    ///
    /// - Parameters:
    ///   - other: The entries to merge in.
    ///   - strategy: Receives the existing and the incoming value for a key present on both
    ///     sides and returns the value to keep. By default the incoming value wins.
    mutating func merge(_ other: [NSString: Value], strategy: (Value, Value) -> Value = { _, new in new }) {
        merge(other.lazy.map { (BinaryDistinctString($0.key), $0.value) }, uniquingKeysWith: strategy)
    }

    /// Returns a copy of this dictionary with the entries of a `[String: Value]` dictionary merged in.
    func merging(_ other: [String: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var merged = self
        merged.merge(other, strategy: strategy)
        return merged
    }

    /// Returns a copy of this dictionary with the entries of another `BinaryDistinctString`-keyed
    /// dictionary merged in.
    func merging(_ other: [BinaryDistinctString: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var merged = self
        merged.merge(other, strategy: strategy)
        return merged
    }

    /// Returns a copy of this dictionary with the entries of a `[NSString: Value]` dictionary merged in.
    func merging(_ other: [NSString: Value], strategy: (Value, Value) -> Value = { _, new in new }) -> Self {
        var merged = self
        merged.merge(other, strategy: strategy)
        return merged
    }
}

/// A marker protocol that refines `ExpressibleByStringLiteral` and declares no requirements of its own.
public protocol StringConvertible: ExpressibleByStringLiteral { }

// Conformances declared here: `BinaryDistinctString`, `String` and `NSString`. Each body is empty.
extension BinaryDistinctString: StringConvertible { }
extension String: StringConvertible { }
extension NSString: StringConvertible { }

/// A single character stored as its exact UTF-16 code units.
///
/// Two values are equal only when their code units are identical.
public struct BinaryDistinctCharacter: Equatable, Hashable, CustomStringConvertible, ExpressibleByStringLiteral {
    /// The UTF-16 code units of the character.
    let bytes: [UInt16]

    /// Creates a value from the UTF-16 code units of `character`.
    public init(_ character: Character) {
        bytes = Array(character.utf16)
    }

    /// Creates a value from the UTF-16 code units of `string`.
    public init(_ string: String) {
        bytes = Array(string.utf16)
    }

    /// Creates a value from the UTF-16 code units of `nsString`.
    public init(_ nsString: NSString) {
        self.init(nsString as String)
    }

    /// Creates a value that stores the given UTF-16 code units as they are.
    public init(bytes: [UInt16]) {
        self.bytes = bytes
    }

    /// Satisfies ``ExpressibleByStringLiteral`` protocol.
    public init(stringLiteral value: String) {
        self.init(value)
    }

    /// The code units decoded as a `String`, or `nil` when they are not well-formed UTF-16
    /// (for example an unpaired surrogate).
    var stringValue: String? {
        // `String(utf16CodeUnits:count:)` never fails, so well-formedness is checked
        // explicitly; `transcode` reports `true` when it meets an encoding error.
        let isIllFormed = transcode(
            bytes.makeIterator(),
            from: UTF16.self,
            to: UTF32.self,
            stoppingOnError: true,
            into: { _ in }
        )
        guard !isIllFormed else { return nil }
        return String(utf16CodeUnits: bytes, count: bytes.count)
    }

    /// A debugging description showing the decoded text, when available, and every code unit in hex.
    public var description: String {
        let hexUnits = bytes.map { String(format: "0x%02X", $0) }
        if let str = stringValue {
            return "BinaryDistinctCharacter('\(str)', bytes: \(hexUnits))"
        }
        // The stored units are UTF-16, so the message names that encoding.
        return "BinaryDistinctCharacter(invalid UTF-16, bytes: \(hexUnits))"
    }

    /// Two characters are equal only when their code units are identical.
    public static func == (lhs: BinaryDistinctCharacter, rhs: BinaryDistinctCharacter) -> Bool {
        lhs.bytes == rhs.bytes
    }
}
Loading
Loading