Skip to content

Use custom hash table as node lookup table #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions Sources/SwiftSyntax/RawSyntax.swift
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ fileprivate enum RawSyntaxData {
/// Represents the raw tree structure underlying the syntax tree. These nodes
/// have no notion of identity and only provide structure to the tree. They
/// are immutable and can be freely shared between syntax nodes.
struct RawSyntax {
final class RawSyntax {
fileprivate let data: RawSyntaxData
let presence: SourcePresence

Expand Down Expand Up @@ -124,6 +124,13 @@ struct RawSyntax {
}
}).value
}

/// Creates a copy of `other`.
init(_ other: RawSyntax) {
self.data = other.data
self.presence = other.presence
self.id = other.id
}

init(kind: SyntaxKind, layout: [RawSyntax?], presence: SourcePresence,
id: SyntaxNodeId? = nil) {
Expand Down Expand Up @@ -336,7 +343,7 @@ extension RawSyntax {

extension RawSyntax: Codable {
/// Creates a RawSyntax from the provided Foundation Decoder.
init(from decoder: Decoder) throws {
convenience init(from decoder: Decoder) throws {
let container = try decoder.container(keyedBy: CodingKeys.self)
let id = try container.decodeIfPresent(SyntaxNodeId.self, forKey: .id)
let omitted = try container.decodeIfPresent(Bool.self, forKey: .omitted) ?? false
Expand All @@ -352,7 +359,7 @@ extension RawSyntax: Codable {
guard let lookupNode = lookupFunc(id) else {
throw IncrementalDecodingError.nodeLookupFailed(id)
}
self = lookupNode
self.init(lookupNode)
return
}

Expand Down
25 changes: 18 additions & 7 deletions Sources/SwiftSyntax/SwiftSyntax.swift
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,24 @@ public enum SerializationFormat {
case byteTree
}

// For WeakLookupTable.
extension RawSyntax : Identifiable {}

/// Deserializes the syntax tree from its serialized form to an object tree in
/// Swift. To deserialize incrementally transferred syntax trees, the same
/// instance of the deserializer must be used for all subsequent
/// deserializations.
public final class SyntaxTreeDeserializer {
// FIXME: This lookup table just accumulates nodes, we should invalidate nodes
// that are no longer used at some point and remove them from the table

/// Syntax nodes that have already been parsed and are able to be reused if
/// they were omitted in an incremental syntax tree transfer
private var nodeLookupTable: [SyntaxNodeId: RawSyntax] = [:]
private var nodeLookupTable: WeakLookupTable<RawSyntax> = .init()

/// Keep a strong reference to the syntax tree that contains the nodes in the
/// `nodeLookupTable`. Because `nodeLookupTable` only holds a weak reference
/// to the RawSyntax nodes, all retired `RawSyntax` nodes will be deallocated
/// once we set a new tree. The weak references in `nodeLookupTable` will then
/// become `nil` and the slot will be reused to refer another node.
private var nodeLookupTree: RawSyntax? = nil

/// The IDs of the nodes that were reused as part of incremental syntax
/// parsing during the last deserialization
Expand All @@ -70,19 +77,23 @@ public final class SyntaxTreeDeserializer {
let decoder = JSONDecoder()
decoder.userInfo[.rawSyntaxDecodedCallback] = self.addToLookupTable
decoder.userInfo[.omittedNodeLookupFunction] = self.lookupNode
return try decoder.decode(RawSyntax.self, from: data)
let tree = try decoder.decode(RawSyntax.self, from: data)
self.nodeLookupTree = tree
return tree
}

/// Deserialize the given data as a ByteTree encoded syntax tree
private func deserializeByteTree(_ data: Data) throws -> RawSyntax {
var userInfo: [ByteTreeUserInfoKey: Any] = [:]
userInfo[.rawSyntaxDecodedCallback] = self.addToLookupTable
userInfo[.omittedNodeLookupFunction] = self.lookupNode
return try ByteTreeReader.read(RawSyntax.self, from: data,
let tree = try ByteTreeReader.read(RawSyntax.self, from: data,
userInfo: &userInfo) {
(version: ByteTreeProtocolVersion) in
return version.major == 1
}
self.nodeLookupTree = tree
return tree
}

/// Decode a serialized form of SourceFileSyntax to a syntax tree.
Expand Down Expand Up @@ -116,7 +127,7 @@ public final class SyntaxTreeDeserializer {
}

private func addToLookupTable(_ node: RawSyntax) {
nodeLookupTable[node.id] = node
nodeLookupTable.insert(node)
}
}

Expand Down
228 changes: 228 additions & 0 deletions Sources/SwiftSyntax/WeakLookupTable.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
//===----------- WeakLookupTable.swift - Swift Syntax Library -------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// This file provides lookup table with weak reference.
//===----------------------------------------------------------------------===//

/// Protocol for self-identifiable object
protocol Identifiable {
associatedtype Identifier : Hashable
var id: Identifier { get }
}

private struct WeakReference<T: AnyObject>: ExpressibleByNilLiteral {
weak var value: T?
init(_ value: T?) {
self.value = value
}
init(nilLiteral: ()) {
self.value = nil
}
}

/// 'Set' like container providing lookup table for objects with identifier.
/// This doesn't take ownership of the objects. Instead, the objects are
/// held by weak references.
///
/// References are stored in a hash table with simple open adressing. Because
/// of weak reference, unlike normal open addressing, erased bucket are simply
/// turned into 'nil'.
class WeakLookupTable<Element: Identifiable & AnyObject> {

/// Storage for the hash table.
private var buckets: UnsafeMutablePointer<WeakReference<Element>>
private var bucketCount: Int

/// Estimated count of inserted values. This is greater than or equal to
/// the number of actually occupied buckets.
/// i.e. estimatedCount >= _countOccupiedBuckets()
private var estimatedCount: Int

init(capacity: Int = 0) {
bucketCount = WeakLookupTable<Element>._bucketCount(for: capacity)
buckets = .allocate(capacity: bucketCount)
buckets.initialize(repeating: nil, count: bucketCount)
estimatedCount = 0
}

deinit {
buckets.deinitialize(count: bucketCount)
buckets.deallocate()
}

/// Constant max load factor for hash table.
private static var maxLoadFactor: Double {
@inline(__always) get {
return 0.75
}
}

/// Minimal number of bucket count enough to hold specified capacity of
/// objects with taking max load factor into account.
private static func _minimalBucketCount(for capacity: Int) -> Int {
return Int((Double(capacity) / maxLoadFactor).rounded(.up))
}

/// Number of bucket count to allocate to hold specified number of objects.
/// This is the next power of 2 greater than or equal to
/// '_minimalBucketCount(for: capacity)'
private static func _bucketCount(for capacity: Int,
from current: Int = 2) -> Int {
// Bucket count must always be power of 2.
precondition((current & (current - 1)) == 0)
// Minimum is 2 to guarantee at least 1 hole.
precondition(current >= 2)

let minimalBucketCount = _minimalBucketCount(for: capacity)

// Make sure it's representable. If 'minimalBucketCount' here is over
// 0x4000_..., the bucket count must be 0x8000_... thus overflows.
precondition(minimalBucketCount <= (Int.max >> 1) + 1)

var bucketCount = current
while bucketCount < minimalBucketCount {
// '&*=' for performance. Guaranteed by above 'precondition()'.
bucketCount &*= 2
}
return bucketCount
}

private var _bucketMask: Int {
@inline(__always) get {
// '&-' for performance. We know 'bucketCount >= 2'.
return bucketCount &- 1
}
}

@inline(__always)
private func _idealBucket(for id: Element.Identifier) -> Int {
return id.hashValue & _bucketMask
}

/// Finds the bucket where the object with the specified id should be stored
/// to.
private
func _findHole(_ id: Element.Identifier) -> (pos: Int, alreadyExists: Bool) {
var bucket = _idealBucket(for: id)

// Starting from the ideal bucket for the id, search an available bucket,
// or the bucket holding the id.
while true {
guard let obj = buckets[bucket].value else {
return (bucket, false)
}
if obj.id == id {
return (bucket, true)
}
// '&+' for performance. 'bucketCount' is 0x4000_... or below.
bucket = (bucket &+ 1) & _bucketMask
}
}

/// Reserves enough space to store the specified number of elements. Returns
/// true if resizing happened.
func reserveCapacity(_ requiredCapacity: Int) -> Bool {
let requiredBucketCount = WeakLookupTable<Element>
._bucketCount(for: requiredCapacity, from: bucketCount)
if (bucketCount >= requiredBucketCount) {
return false
}

// Slow path. Resizing.
let oldBuckets = buckets
let oldBucketRange = buckets ..< buckets.advanced(by: bucketCount)

bucketCount = requiredBucketCount
buckets = .allocate(capacity: requiredBucketCount)
buckets.initialize(repeating: nil, count: requiredBucketCount)

// Move all nodes from the old buffer.
for oldBucket in oldBucketRange {
if let id = oldBucket.pointee.value?.id {
let newBucket = buckets.advanced(by: _findHole(id).pos)
newBucket.moveAssign(from: oldBucket, count: 1)
} else {
oldBucket.deinitialize(count: 1)
}
}

oldBuckets.deallocate()

return true
}

/// Count the actual number of occupied buckets.
@inline(__always)
private func _countOccupiedBuckets() -> Int {
var count = 0
for i in 0 ..< bucketCount where buckets[i].value != nil {
// '&+=' for performance. 'bucketCount' is 0x4000_... or below.
count &+= 1
}
return count
}

/// Reserves enough space to store a single new object. Returns true if
/// resizing happened.
private func _ensurePlusOneCapacity() -> Bool {
// '&+' for performance. 'estimatedCount' is always less than 'bucketCount'
// which is 0x4000_... or below.
Copy link
Member

@ahoppen ahoppen Sep 21, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not true. estimatedCount is always less than buckecCount / maxLoadFactor (it ends up giving the same guarantees for a maxLoadFactor > 0.5 but still.

Consider the following example:

class Foo: Identifiable {
  let id: Int = 1
}

let lookupTable = WeakLookupTable<Foo>()
lookupTable.reserveCapacity(32)

while true {
  let newValue = Foo()
  lookupTable.insert(newValue)
  print(lookupTable.estimatedCount) /* need to make estimatedCount public for this test */
}

With each insert estimatedCount gets increased until _minimalBucketCount(for: estimatedCount + 1), i.e. (estimatedCount + 1) / maxLoadFactor is greater than bucketCount at which point the slow path is taken and the lookup table realises that all the previous elements have been released and recounts.

This might influence some of the other comments as well. I haven't looked at them in detail.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

estimatedCount is always less than buckecCount / maxLoadFactor (it ends up giving the same guarantees for a maxLoadFactor > 0.5 but still.

I believe this is "estimatedCount is alway less than or equal to bucketCount * maxLoadFactor". So, 'estimatedCount' is always less than 'bucketCount' is true because maxLoadFactor is less than 1.

Either way, what we want to justify here is that estimatedCount + 1 doesn't overflow. i.e. estimatedCount < Int.max, right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yes. I see. Sorry for the confusion. I somehow assumed that bucketCount is 32 in the example above, but its 64. You're right and it definitely doesn't overflow.

if bucketCount >= WeakLookupTable<Element>
._minimalBucketCount(for: estimatedCount &+ 1) {
return false
}

// Slow path.
estimatedCount = _countOccupiedBuckets()
// '&+' for performance. We know 'estimatedCount' derived by
// '_countOccupiedBuckets()' is equal to or less than previous
// 'estimatedCount'.
return reserveCapacity(estimatedCount &+ 1)
}

/// Inserts the given object into the table.
@discardableResult
func insert(_ obj: Element) -> Bool {
var (pos, alreadyExists) = _findHole(obj.id)
if alreadyExists {
return false
}

if /*resized=*/_ensurePlusOneCapacity() {
pos = _findHole(obj.id).pos
}
buckets[pos].value = obj
// '&+=' for performance. '_ensurePlusOneCapacity()' ensures it's safe.
estimatedCount &+= 1
return true
}

/// Get a object with specified id. Returns 'nil' if the object hasn't been
/// insert()-ed or it's already been freed.
subscript(id: Element.Identifier) -> Element? {
// Since we don't fill the bucket when the object is freed (because we don't
// know), we can't stop iteration at a hole. So in the worst case (i.e. if
// the object doesn't exist in the table), full linear search is needed.
// However, since we assume the object exists and hasn't been freed yet,
// we expect it's stored near the 'idealBucket' anyway.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could think about adding a tombstone property to WeakReference so that we can detect when we found a hole.
I.e.
WeakReference starts of as being {occupied: false, value: nil}. Once a value is set it becomes {occupied: true, value: someValue}. When someValue is freed it becomes {occupied: true, value: nil}. That way we can stop the search whenever we reach a WeakReference with occupied == false and thus avoid the full linear search.

However, in our use case, as you mentioned, we always expect to find the element, I think its worth it to make the trade-off to save an extra bit/byte for a linear worst-case complexity that we should never encounter in practice.

TL;DR: No need to change anything unless we are planning to merge this into corelibs-foundation.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I don't intend to merge this into corelibs-foundation. :)
NSHashTable has very different API anyway.

let idealBucket = _idealBucket(for: id)
var bucket = idealBucket
repeat {
if let obj = buckets[bucket].value, obj.id == id {
return obj
}
// '&+' for performance. 'bucketCount' is 0x4000_... or below.
bucket = (bucket &+ 1) & _bucketMask
} while bucket != idealBucket

return nil
}
}
28 changes: 17 additions & 11 deletions Tests/LinuxMain.swift
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import XCTest
import SwiftSyntaxTest

XCTMain([
testCase(AbsolutePositionTestCase.allTests),
testCase(DecodeSyntaxTestCase.allTests),
testCase(DiagnosticTestCase.allTests),
testCase(LazyCachingTestCase.allTests),
testCase(ParseFileTestCase.allTests),
testCase(SyntaxChildrenAPITestCase.allTests),
testCase(SyntaxCollectionsAPITestCase.allTests),
testCase(SyntaxFactoryAPITestCase.allTests),
testCase(SyntaxVisitorTestCase.allTests),
])
XCTMain({ () -> [XCTestCaseEntry] in
var testCases: [XCTestCaseEntry] = [
testCase(AbsolutePositionTestCase.allTests),
testCase(DecodeSyntaxTestCase.allTests),
testCase(DiagnosticTestCase.allTests),
testCase(LazyCachingTestCase.allTests),
testCase(ParseFileTestCase.allTests),
testCase(SyntaxChildrenAPITestCase.allTests),
testCase(SyntaxCollectionsAPITestCase.allTests),
testCase(SyntaxFactoryAPITestCase.allTests),
testCase(SyntaxVisitorTestCase.allTests),
]
#if DEBUG
testCases.append(testCase(WeakLookupTableTestCase.allTests))
#endif
return testCases
}())
Loading