diff --git a/utils/gen-unicode-data/.gitignore b/utils/gen-unicode-data/.gitignore new file mode 100644 index 0000000000000..59e29472d021b --- /dev/null +++ b/utils/gen-unicode-data/.gitignore @@ -0,0 +1,8 @@ +.DS_Store +/.build +/Packages +/*.xcodeproj +xcuserdata/ +DerivedData/ +.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata +Package.resolved diff --git a/utils/gen-unicode-data/Package.swift b/utils/gen-unicode-data/Package.swift new file mode 100644 index 0000000000000..f070cd30d9269 --- /dev/null +++ b/utils/gen-unicode-data/Package.swift @@ -0,0 +1,14 @@ +// swift-tools-version:5.4 + +import PackageDescription + +let package = Package( + name: "GenUnicodeData", + platforms: [.macOS(.v10_15)], + targets: [ + .target( + name: "GenUtils", + dependencies: [] + ) + ] +) diff --git a/utils/gen-unicode-data/Sources/GenUtils/BitArray.swift b/utils/gen-unicode-data/Sources/GenUtils/BitArray.swift new file mode 100644 index 0000000000000..0501a473165fe --- /dev/null +++ b/utils/gen-unicode-data/Sources/GenUtils/BitArray.swift @@ -0,0 +1,41 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021 Apple Inc. 
// and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// A fixed-size array of bits packed into 64-bit words.
//
// Bit `n` is stored in `words[n / 64]` at bit position `n % 64`, counting from
// the least significant bit.
public struct BitArray {
  // Backing storage: `(size + 63) / 64` words, all zero after initialization.
  public var words: [UInt64]

  // Number of addressable bits. Stored as a 16-bit value, so a bit array can
  // hold at most `UInt16.max` bits.
  public var size: UInt16

  // Creates a bit array with `size` cleared bits.
  //
  // `size` must be in `0 ... UInt16.max`; anything else is a programmer error
  // and stops execution with a descriptive message.
  public init(size: Int) {
    precondition(
      size >= 0 && size <= Int(UInt16.max),
      "BitArray size \(size) is outside 0 ... \(UInt16.max)"
    )

    self.words = .init(repeating: 0, count: (size + 63) / 64)
    self.size = UInt16(size)
  }

  // Reads or writes a single bit. `bit` must be in `0 ..< size`.
  public subscript(_ bit: Int) -> Bool {
    get {
      let mask = checkedMask(for: bit)
      return words[bit / 64] & mask != 0
    }

    set {
      let mask = checkedMask(for: bit)

      if newValue {
        words[bit / 64] |= mask
      } else {
        words[bit / 64] &= ~mask
      }
    }
  }

  // Sets `bit` and reports whether it was previously clear.
  //
  // Returns `true` when the bit was newly set and `false` when it had already
  // been set. `bit` must be in `0 ..< size`.
  @discardableResult
  public mutating func insert(_ bit: Int) -> Bool {
    let mask = checkedMask(for: bit)
    let wasClear = words[bit / 64] & mask == 0

    words[bit / 64] |= mask

    return wasClear
  }

  // Validates `bit` and returns the mask selecting it inside its word.
  //
  // Without this check a small negative index produces an empty mask (the
  // operation silently does nothing) and an index in the unused tail of the
  // last word is accepted even though it lies past `size`.
  private func checkedMask(for bit: Int) -> UInt64 {
    precondition(
      bit >= 0 && bit < Int(size),
      "BitArray index \(bit) is outside 0 ..< \(size)"
    )

    return 1 << UInt64(bit % 64)
  }
}

// ---- patch boundary: new file utils/gen-unicode-data/Sources/GenUtils/Emit.swift ----
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc.
// and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Appends a C array definition for `collection` to `result`.
//
// The emitted text has the form
//
//   static const <type> <name>[<count>] = {
//     <elements produced by `formatter`, laid out by formatCollection>
//   };
//
// followed by a blank line. `type` and `name` are inserted verbatim.
public func emitCollection<C: Collection>(
  _ collection: C,
  name: String,
  type: String,
  into result: inout String,
  formatter: (C.Element) -> String
) {
  result += """
  static const \(type) \(name)[\(collection.count)] = {

  """

  formatCollection(collection, into: &result, using: formatter)

  result += "\n};\n\n"
}

// Appends a C array definition for a collection of fixed-width integers.
//
// The element type is spelled `__swift_uint<bitWidth>_t` (always the unsigned
// spelling, whatever the signedness of `C.Element`) and every element is
// written as an uppercase hexadecimal literal with a `0x` prefix.
public func emitCollection<C: Collection>(
  _ collection: C,
  name: String,
  into result: inout String
) where C.Element: FixedWidthInteger {
  // Same layout as the general overload; only the type spelling and the
  // element formatter are fixed here.
  emitCollection(
    collection,
    name: name,
    type: "__swift_uint\(C.Element.bitWidth)_t",
    into: &result
  ) {
    "0x\(String($0, radix: 16, uppercase: true))"
  }
}

// Emits an abstract minimal perfect hash function into C arrays.
// The tables are written in a fixed order: the bit array sizes, the bit
// arrays themselves, then the rank tables. Every table name starts with
// `name`.
public func emitMph(_ mph: Mph, name: String, into result: inout String) {
  emitMphSizes(mph, name, into: &result)
  emitMphBitarrays(mph, name, into: &result)
  emitMphRanks(mph, name, into: &result)
}

// Emits `<name>_sizes`: one 16-bit entry per bit array holding its `size`,
// written as an uppercase hexadecimal literal.
func emitMphSizes(_ mph: Mph, _ name: String, into result: inout String) {
  let hexSize: (BitArray) -> String = { bitArray in
    "0x" + String(bitArray.size, radix: 16, uppercase: true)
  }

  emitCollection(
    mph.bitArrays,
    name: name + "_sizes",
    type: "__swift_uint16_t",
    into: &result,
    formatter: hexSize
  )
}

// Emits the words of every bit array as `<name>_keys<i>`, followed by
// `<name>_keys`, a table that lists those arrays by name.
func emitMphBitarrays(_ mph: Mph, _ name: String, into result: inout String) {
  let tableName = name + "_keys"

  // One array of 64-bit words per bit array.
  for level in mph.bitArrays.indices {
    emitCollection(
      mph.bitArrays[level].words,
      name: tableName + String(level),
      into: &result
    )
  }

  // The table referring to each of the arrays emitted above.
  emitCollection(
    mph.bitArrays.indices,
    name: tableName,
    type: "__swift_uint64_t * const",
    into: &result,
    formatter: { tableName + String($0) }
  )
}

// Emits every rank array as `<name>_ranks<i>`, followed by `<name>_ranks`, a
// table that lists those arrays by name.
func emitMphRanks(_ mph: Mph, _ name: String, into result: inout String) {
  let tableName = name + "_ranks"

  // One array of 16-bit ranks per bit array.
  for level in mph.ranks.indices {
    emitCollection(
      mph.ranks[level],
      name: tableName + String(level),
      into: &result
    )
  }

  // The table referring to each of the arrays emitted above.
  emitCollection(
    mph.ranks.indices,
    name: tableName,
    type: "__swift_uint16_t * const",
    into: &result,
    formatter: { tableName + String($0) }
  )
}

// ---- patch boundary: new file utils/gen-unicode-data/Sources/GenUtils/Files.swift ----
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc.
// and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import Foundation

// Returns the contents of the file at `path`, decoded as UTF-8.
//
// The encoding is stated explicitly so that reading agrees with `write`,
// which always encodes as UTF-8. Any failure is fatal: execution stops with
// the error's localized description.
public func readFile(_ path: String) -> String {
  do {
    return try String(contentsOfFile: path, encoding: .utf8)
  } catch {
    fatalError(error.localizedDescription)
  }
}

// Writes `data` to the file at `path`, encoded as UTF-8.
//
// The write is not atomic. Any failure is fatal: execution stops with the
// error's localized description.
public func write(_ data: String, to path: String) {
  do {
    try data.write(toFile: path, atomically: false, encoding: .utf8)
  } catch {
    fatalError(error.localizedDescription)
  }
}

// ---- patch boundary: new file utils/gen-unicode-data/Sources/GenUtils/Flatten.swift ----
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Takes an unflattened array of scalar ranges and some Equatable property and
// attempts to merge ranges who share the same Equatable property. E.g:
//
//   0x0 ... 0xA  = .control
//   0xB ... 0xB  = .control
//   0xC ... 0x1F = .control
//
// into:
//
//   0x0 ... 0x1F = .control
//
// The input is first sorted by lower bound. A range is merged into the
// previous result only when it carries an equal property and starts exactly
// one past the previous upper bound; overlapping ranges and ranges separated
// by a gap are kept as separate entries.
public func flatten<T: Equatable>(
  _ unflattened: [(ClosedRange<UInt32>, T)]
) -> [(ClosedRange<UInt32>, T)] {
  var result: [(ClosedRange<UInt32>, T)] = []

  for elt in unflattened.sorted(by: { $0.0.lowerBound < $1.0.lowerBound }) {
    guard let last = result.last, last.1 == elt.1 else {
      result.append(elt)
      continue
    }

    // Nothing can follow a range that already ends at UInt32.max; checking
    // that first keeps `upperBound + 1` from overflowing.
    let isAdjacent = last.0.upperBound < UInt32.max
      && elt.0.lowerBound == last.0.upperBound + 1

    if isAdjacent {
      result[result.count - 1].0 = last.0.lowerBound ... elt.0.upperBound
    } else {
      result.append(elt)
    }
  }

  return result
}

// Takes an unflattened array of scalars and some Equatable property and
// attempts to merge scalars into ranges who share the same Equatable
// property. E.g:
//
//   0x9 = .control
//   0xA = .control
//   0xB = .control
//   0xC = .control
//
// into:
//
//   0x9 ... 0xC = .control
public func flatten<T: Equatable>(
  _ unflattened: [(UInt32, T)]
) -> [(ClosedRange<UInt32>, T)] {
  // A scalar is a one-element range, so the range overload does the merging.
  let singletons: [(ClosedRange<UInt32>, T)] = unflattened.map {
    ($0.0 ... $0.0, $0.1)
  }

  return flatten(singletons)
}

// ---- patch boundary: new file utils/gen-unicode-data/Sources/GenUtils/Formatting.swift ----
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc.
// and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Given a collection, format it into a string within 80 columns and fitting as
// many elements in a row as possible.
//
// Every row is indented by two spaces, every element is followed by a comma,
// and elements on the same row are separated by one space. Nothing is
// appended for an empty collection and no newline is written after the last
// row. An element that is too wide for a row on its own is still written; it
// simply occupies a row by itself.
public func formatCollection<C: Collection>(
  _ c: C,
  into result: inout String,
  using handler: (C.Element) -> String
) {
  let indentation = "  "
  let maxColumns = 80

  // Our row length always starts at 2 for the initial indentation.
  var rowLength = indentation.count
  var isFirstElement = true

  for element in c {
    let string = handler(element)

    if isFirstElement {
      // The first element always opens the first row. Wrapping here, as an
      // oversized first element would otherwise do, only produces a row that
      // contains nothing but the indentation.
      result += indentation
      isFirstElement = false
    } else if rowLength + string.count + 1 > maxColumns {
      // The element plus its comma does not fit: start a new row.
      result += "\n" + indentation
      rowLength = indentation.count
    } else {
      result += " "
    }

    result += "\(string),"

    // string.count + , + space
    rowLength += string.count + 1 + 1
  }
}

// ---- patch boundary: new file utils/gen-unicode-data/Sources/GenUtils/Hashing.swift ----
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc.
// and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Hashes `key` into the range `0 ..< n`.
//
// `n` is folded into the upper bits of the key before hashing, so the same
// key hashes differently for different table sizes. `seed` must fit in 32
// bits and `n` must not be zero.
func hash(_ key: UInt64, _ n: UInt64, seed: UInt64) -> UInt64 {
  let mixedKey = (n << 32) | key
  let digest = murmur3(mixedKey, seed: UInt32(seed))

  return UInt64(digest) % n
}

// Mixes one 32-bit block: multiply, rotate left by 15, multiply again. All
// arithmetic wraps on overflow.
func scramble(_ key: UInt32) -> UInt32 {
  let multiplied = key &* 0xCC9E2D51
  let rotated = (multiplied << 15) | (multiplied >> 17)

  return rotated &* 0x1B873593
}

// 32-bit hash of a 64-bit key, processed as two 32-bit blocks (low half
// first) and finished with an avalanche step.
func murmur3(_ key: UInt64, seed: UInt32) -> UInt32 {
  let blocks = [
    UInt32(truncatingIfNeeded: key),
    UInt32(truncatingIfNeeded: key >> 32),
  ]

  var state = seed

  for block in blocks {
    state ^= scramble(block)
    state = (state << 13) | (state >> 19)
    state = state &* 5 &+ 0xE6546B64
  }

  // NOTE(review): 8 is presumably the key length in bytes, which MurmurHash3
  // mixes in before finalization - confirm against the consumer of these
  // tables.
  state ^= 8

  let avalanche: [(shift: UInt32, multiplier: UInt32)] = [
    (16, 0x85EBCA6B),
    (13, 0xC2B2AE35),
  ]

  for step in avalanche {
    state ^= state >> step.shift
    state &*= step.multiplier
  }

  state ^= state >> 16

  return state
}

// ---- patch boundary: new file utils/gen-unicode-data/Sources/GenUtils/Mph.swift ----
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc.
// and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// A minimal perfect hash function stored as a cascade of bit arrays.
//
// Construction proceeds level by level. At each level the keys that are still
// unplaced are hashed into a fresh bit array, using the level number as the
// hash seed. A key that lands on a bit of its own keeps that bit; keys that
// share a bit all move on to the next level. `ranks` holds sampled running
// counts of set bits so that a set bit can be converted into a dense index.
public struct Mph {
  // One bit array per level, in construction order.
  public var bitArrays: [BitArray] = []

  // One rank array per level. Entry `k` is the number of set bits that come
  // before word `8 * k` of that level, counting every earlier level as well
  // (the running count is never reset between levels).
  public var ranks: [[UInt16]] = []

  // Builds the function for `keys`.
  //
  // Each level's bit array has `max(64, Int(gamma * <keys left>))` bits.
  // `keys` must not contain duplicates.
  init(gamma: Double, keys: [UInt64]) {
    // Equal keys hash to the same bit at every level, so they would be
    // retried forever and the loop below would never finish.
    precondition(
      Set(keys).count == keys.count,
      "Mph requires unique keys; duplicate keys collide at every level"
    )

    var redoKeys: [UInt64] = keys
    var i: UInt64 = 0

    repeat {
      let size = Swift.max(64, Int(gamma * Double(redoKeys.count)))
      var a = BitArray(size: size)
      var collide: Set<Int> = []

      // First pass: claim a bit for every key and remember each bit that is
      // claimed more than once.
      for key in redoKeys {
        let idx = Int(hash(key, UInt64(size), seed: i))

        if !collide.contains(idx), !a.insert(idx) {
          collide.insert(idx)
        }
      }

      // Second pass: release the contested bits and queue their keys for the
      // next level.
      var tmpRedo: [UInt64] = []

      for key in redoKeys {
        let idx = Int(hash(key, UInt64(size), seed: i))

        if collide.contains(idx) {
          a[idx] = false
          tmpRedo.append(key)
        }
      }

      bitArrays.append(a)
      redoKeys = tmpRedo
      i += 1
    } while !redoKeys.isEmpty

    computeRanks()
  }

  // Appends one rank array per bit array to `ranks`.
  //
  // A sample of the running set-bit count is recorded before every 8th word
  // (every 512 bits). The count is 16 bits wide, so more than `UInt16.max`
  // set bits in total overflows and stops execution.
  mutating func computeRanks() {
    var pop: UInt16 = 0

    for bitArray in bitArrays {
      var rank: [UInt16] = []

      for (i, v) in bitArray.words.enumerated() {
        if i % 8 == 0 {
          rank.append(pop)
        }

        pop += UInt16(v.nonzeroBitCount)
      }

      ranks.append(rank)
    }
  }

  // Returns the index assigned to `key`, or -1 when no level has a set bit
  // at the key's hashed position.
  //
  // The index is the number of set bits that precede the key's bit, counted
  // across all levels. No copy of the keys is stored, so only the bit itself
  // is checked: a key that was never part of the construction set can still
  // hit a set bit and receive some other key's index.
  public func index(for key: UInt64) -> Int {
    for (i, b) in bitArrays.enumerated() {
      let idx = Int(hash(key, UInt64(b.size), seed: UInt64(i)))

      guard b[idx] else {
        continue
      }

      // Start from the nearest sample at or before the key's word ...
      var rank = ranks[i][idx / 512]
      let word = idx / 64

      // ... add the whole words between that sample and the key's word ...
      for j in (word & ~7) ..< word {
        rank += UInt16(b.words[j].nonzeroBitCount)
      }

      // ... and finally the bits below the key's bit inside its own word.
      let offset = idx % 64

      if offset > 0 {
        rank += UInt16((b.words[word] << (64 - offset)).nonzeroBitCount)
      }

      return Int(rank)
    }

    return -1
  }
}


// Builds a minimal perfect hash function for `keys` with a gamma of 1, i.e.
// each level gets one bit per remaining key (and never fewer than 64 bits).
// `keys` must not contain duplicates.
public func mph(for keys: [UInt64]) -> Mph {
  Mph(gamma: 1, keys: keys)
}