Skip to content

[utils] Add a Unicode data generator util package #39213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions utils/gen-unicode-data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.DS_Store
/.build
/Packages
/*.xcodeproj
xcuserdata/
DerivedData/
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
Package.resolved
14 changes: 14 additions & 0 deletions utils/gen-unicode-data/Package.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// swift-tools-version:5.4

import PackageDescription

// The package's only target; it declares no dependencies.
let genUtilsTarget: Target = .target(
  name: "GenUtils",
  dependencies: []
)

// Manifest for the GenUnicodeData package, which requires macOS 10.15 or later.
let package = Package(
  name: "GenUnicodeData",
  platforms: [.macOS(.v10_15)],
  targets: [genUtilsTarget]
)
41 changes: 41 additions & 0 deletions utils/gen-unicode-data/Sources/GenUtils/BitArray.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// A fixed-capacity sequence of bits packed into 64-bit words.
//
// Bit `i` lives in `words[i / 64]` at bit position `i % 64`.
public struct BitArray {
  // Backing storage, one word per 64 bits.
  public var words: [UInt64]

  // The number of bits requested at initialization.
  public var size: UInt16

  // Creates a bit array with every bit cleared and enough words to hold `size`
  // bits. Traps if `size` is not representable as a `UInt16`.
  public init(size: Int) {
    let wordCount = (size + 63) / 64

    words = [UInt64](repeating: 0, count: wordCount)
    self.size = UInt16(size)
  }

  // Reads or writes a single bit.
  public subscript(_ bit: Int) -> Bool {
    get {
      let (word, mask) = Self.location(of: bit)
      return words[word] & mask != 0
    }

    set {
      let (word, mask) = Self.location(of: bit)

      guard newValue else {
        words[word] &= ~mask
        return
      }

      words[word] |= mask
    }
  }

  // Sets `bit` and returns `true` if it was previously clear, `false` if it
  // was already set.
  public mutating func insert(_ bit: Int) -> Bool {
    let (word, mask) = Self.location(of: bit)
    let wasClear = words[word] & mask == 0

    words[word] |= mask
    return wasClear
  }

  // Splits a bit index into the index of its word and the mask selecting it
  // within that word.
  private static func location(of bit: Int) -> (word: Int, mask: UInt64) {
    return (bit / 64, (1 as UInt64) << (bit % 64))
  }
}
102 changes: 102 additions & 0 deletions utils/gen-unicode-data/Sources/GenUtils/Emit.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Appends a C array definition named `name` with element type `type` to
// `result`. The array is declared `static const`, sized by `collection.count`,
// and each element is rendered by `formatter`.
public func emitCollection<C: Collection>(
  _ collection: C,
  name: String,
  type: String,
  into result: inout String,
  formatter: (C.Element) -> String
) {
  // Declaration line, opening the initializer list.
  let header = "static const \(type) \(name)[\(collection.count)] = {\n"
  result.append(header)

  formatCollection(collection, into: &result, using: formatter)

  // Close the initializer list and leave a blank line after the definition.
  let footer = "\n};\n\n"
  result.append(footer)
}

// Appends a C array definition named `name` to `result` for a collection of
// fixed-width integers. The element type is `__swift_uint<N>_t`, where `<N>`
// is the bit width of the Swift element type, and every element is written as
// an uppercase hexadecimal literal.
public func emitCollection<C: Collection>(
  _ collection: C,
  name: String,
  into result: inout String
) where C.Element: FixedWidthInteger {
  let elementType = "__swift_uint\(C.Element.bitWidth)_t"

  // The general overload produces the declaration, body and terminator.
  emitCollection(
    collection,
    name: name,
    type: elementType,
    into: &result,
    formatter: { value in
      "0x" + String(value, radix: 16, uppercase: true)
    }
  )
}

// Emits an abstract minimal perfect hash function into C arrays: first the
// bit array sizes, then the bit arrays themselves, then the ranks.
public func emitMph(_ mph: Mph, name: String, into result: inout String) {
  // The stages run in this order, each appending to `result`.
  let stages: [(Mph, String, inout String) -> Void] = [
    emitMphSizes,
    emitMphBitarrays,
    emitMphRanks
  ]

  for emitStage in stages {
    emitStage(mph, name, &result)
  }
}

// Emits `<name>_sizes`: one `__swift_uint16_t` entry per bit array in `mph`,
// holding that bit array's `size` as an uppercase hexadecimal literal.
func emitMphSizes(_ mph: Mph, _ name: String, into result: inout String) {
  let arrayName = "\(name)_sizes"

  emitCollection(
    mph.bitArrays,
    name: arrayName,
    type: "__swift_uint16_t",
    into: &result,
    formatter: { bitArray in
      "0x" + String(bitArray.size, radix: 16, uppercase: true)
    }
  )
}

// Emits the words of every bit array in `mph` as `<name>_keys<i>`, followed by
// `<name>_keys`, an array whose elements are the names of those arrays.
func emitMphBitarrays(_ mph: Mph, _ name: String, into result: inout String) {
  // One array per bit array, numbered from zero in iteration order.
  var position = 0

  for bitArray in mph.bitArrays {
    emitCollection(
      bitArray.words,
      name: "\(name)_keys\(position)",
      into: &result
    )

    position += 1
  }

  // The table referring to each of the arrays emitted above.
  emitCollection(
    mph.bitArrays.indices,
    name: "\(name)_keys",
    type: "__swift_uint64_t * const",
    into: &result,
    formatter: { index in
      "\(name)_keys\(index)"
    }
  )
}

// Emits every rank collection in `mph` as `<name>_ranks<i>`, followed by
// `<name>_ranks`, an array whose elements are the names of those arrays.
func emitMphRanks(_ mph: Mph, _ name: String, into result: inout String) {
  // One array per rank collection, numbered from zero in iteration order.
  var position = 0

  for rank in mph.ranks {
    emitCollection(rank, name: "\(name)_ranks\(position)", into: &result)

    position += 1
  }

  // The table referring to each of the arrays emitted above.
  emitCollection(
    mph.ranks.indices,
    name: "\(name)_ranks",
    type: "__swift_uint16_t * const",
    into: &result,
    formatter: { index in
      "\(name)_ranks\(index)"
    }
  )
}
29 changes: 29 additions & 0 deletions utils/gen-unicode-data/Sources/GenUtils/Files.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import Foundation

// Returns the contents of the file at `path`, decoded as UTF-8.
//
// The encoding is stated explicitly rather than left to Foundation to guess,
// which matches the encoding `write(_:to:)` uses. Any failure to read or
// decode the file is fatal.
public func readFile(_ path: String) -> String {
  do {
    return try String(contentsOfFile: path, encoding: .utf8)
  } catch {
    fatalError(error.localizedDescription)
  }
}

// Writes `data` to the file at `path` as UTF-8, without going through an
// intermediate file. Any failure to write is fatal.
public func write(_ data: String, to path: String) {
  let outcome: Result<Void, Error> = Result {
    try data.write(toFile: path, atomically: false, encoding: .utf8)
  }

  if case .failure(let failure) = outcome {
    fatalError(failure.localizedDescription)
  }
}
75 changes: 75 additions & 0 deletions utils/gen-unicode-data/Sources/GenUtils/Flatten.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Takes an unflattened array of scalar ranges and some Equatable property and
// attempts to merge ranges who share the same Equatable property. E.g:
//
//     0x0 ... 0xA  = .control
//     0xB ... 0xB  = .control
//     0xC ... 0x1F = .control
//
// into:
//
//     0x0 ... 0x1F = .control
//
// The input is sorted by lower bound first, so it may be given in any order.
// A range is merged into the previous one only when it carries an equal
// property and starts exactly one past the previous range's upper bound;
// anything else (a gap, an overlap, a different property) starts a new entry.
public func flatten<T: Equatable>(
  _ unflattened: [(ClosedRange<UInt32>, T)]
) -> [(ClosedRange<UInt32>, T)] {
  var result: [(ClosedRange<UInt32>, T)] = []

  for elt in unflattened.sorted(by: { $0.0.lowerBound < $1.0.lowerBound }) {
    // Adjacency is tested as `lowerBound - 1 == upperBound`, guarded against a
    // lower bound of zero, so that a previous range ending at `UInt32.max`
    // cannot overflow the comparison.
    if let last = result.last,
       last.1 == elt.1,
       elt.0.lowerBound != 0,
       elt.0.lowerBound - 1 == last.0.upperBound {
      result[result.count - 1].0 = last.0.lowerBound ... elt.0.upperBound
    } else {
      result.append(elt)
    }
  }

  return result
}

// Takes an unflattened array of scalars and some Equatable property and
// attempts to merge scalars into ranges who share the same Equatable
// property. E.g:
//
//     0x9 = .control
//     0xA = .control
//     0xB = .control
//     0xC = .control
//
// into:
//
//     0x9 ... 0xC = .control
//
// The input is sorted by scalar value first, so it may be given in any order.
// A scalar extends the previous range only when it carries an equal property
// and is exactly one past that range's upper bound; anything else (a gap, a
// repeated scalar, a different property) starts a new single-scalar range.
public func flatten<T: Equatable>(
  _ unflattened: [(UInt32, T)]
) -> [(ClosedRange<UInt32>, T)] {
  var result: [(ClosedRange<UInt32>, T)] = []

  for (scalar, property) in unflattened.sorted(by: { $0.0 < $1.0 }) {
    // Adjacency is tested as `scalar - 1 == upperBound`, guarded against a
    // scalar of zero, so that a previous range ending at `UInt32.max` cannot
    // overflow the comparison.
    if let last = result.last,
       last.1 == property,
       scalar != 0,
       scalar - 1 == last.0.upperBound {
      result[result.count - 1].0 = last.0.lowerBound ... scalar
    } else {
      result.append((scalar ... scalar, property))
    }
  }

  return result
}
43 changes: 43 additions & 0 deletions utils/gen-unicode-data/Sources/GenUtils/Formatting.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Given a collection, format it into a string within 80 columns and fitting as
// many elements in a row as possible.
//
// Every element is rendered with `handler` and followed by a comma. Elements
// on the same row are separated by a single space, and every row starts with
// two spaces of indentation. Nothing is appended for an empty collection, and
// no newline is appended after the last row.
//
// An element that is too wide to fit in a row on its own is still emitted, on
// a row by itself.
public func formatCollection<C: Collection>(
  _ c: C,
  into result: inout String,
  using handler: (C.Element) -> String
) {
  let indentation = "  "
  let maxRowLength = 80

  // Our row length always starts at the width of the initial indentation.
  var rowLength = indentation.count
  var isFirstElement = true

  for element in c {
    let string = handler(element)

    if isFirstElement {
      // Never wrap before the first element: that would leave a row holding
      // nothing but indentation.
      result += indentation
      isFirstElement = false
    } else if rowLength + string.count + 1 > maxRowLength {
      // The element and its comma do not fit; start a new row.
      result += "\n" + indentation
      rowLength = indentation.count
    } else {
      result += " "
    }

    result += "\(string),"

    // string.count + , + space
    rowLength += string.count + 1 + 1
  }
}
50 changes: 50 additions & 0 deletions utils/gen-unicode-data/Sources/GenUtils/Hashing.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Hashes `key` into the range `0 ..< n`.
//
// `n` is folded into the upper half of the value being hashed, and the 32-bit
// murmur3 result is reduced modulo `n`. Traps if `seed` does not fit in a
// `UInt32` or if `n` is zero.
func hash(_ key: UInt64, _ n: UInt64, seed: UInt64) -> UInt64 {
  let combined = (n << 32) | key
  let digest = murmur3(combined, seed: UInt32(seed))

  return UInt64(digest) % n
}

// Mixes a single 32-bit word: a wrapping multiply, a left rotation by 15 bits,
// then a second wrapping multiply.
func scramble(_ key: UInt32) -> UInt32 {
  let multiplied = key &* 0xCC9E2D51
  let rotated = (multiplied << 15) | (multiplied >> 17)

  return rotated &* 0x1B873593
}

// Computes a 32-bit murmur3-style hash of the 64-bit `key`, starting from
// `seed`. The key is consumed as two 32-bit words, low word first.
func murmur3(_ key: UInt64, seed: UInt32) -> UInt32 {
  var state = seed

  // Mix in the low 32 bits of the key, then the high 32 bits.
  for shift in stride(from: 0, to: 64, by: 32) {
    let word = UInt32(truncatingIfNeeded: key >> shift)

    state ^= scramble(word)
    state = (state << 13) | (state >> 19)
    state = state &* 5 &+ 0xE6546B64
  }

  // NOTE(review): 8 is presumably the key length in bytes, as in murmur3's
  // finalization step -- confirm against the consumer of these tables.
  state ^= 8

  // Final avalanche.
  state ^= state >> 16
  state &*= 0x85EBCA6B
  state ^= state >> 13
  state &*= 0xC2B2AE35
  state ^= state >> 16

  return state
}
Loading