EDIT: I removed the original question and replaced it with the following smaller and clearer formulation. I should probably also have mentioned SIMD, vectorization and closure parameters in the title ... Anyway:
//==============================================================================
// This little program is a question about the Swift optimizer.
//
// It has a test that measures the time it takes to compute the sum of a million
// random values of a given Testable type.
//
// The program performs the test for SIMD float4x4 and a custom 4x4 matrix type.
//
// The custom 4x4 float matrix type is built up from a generic V4<T> type, like
// this: V4<V4<Float>>.
//
// NOTE: The body of V4<T>'s + operator func contains two lines. Use either one
// of them by commenting out the other.
//
// Using one of them lets the optimizer generate code that makes the
// custom 4x4 matrix almost as fast as SIMD float4x4, while using the
// other makes it a couple of hundred times slower.
//
// It would be so great if the optimizer could handle both equally well.
// Perhaps what I'm asking is whether the slow + operator version could
// be vectorized just like the fast version seems to be currently.
//
// Could a future Swift version make that happen, or is it impossible? If it is impossible, why?
//==============================================================================
import Cocoa
import simd
//------------------------------------------------------------------------------
// Testable requirements and defaults:
//------------------------------------------------------------------------------
/// Requirements a type must meet to be benchmarked by `test`.
protocol Testable {
/// Creates a default value; `test` uses it as the starting value of its running sum.
init()
/// Returns one random value of the conforming type.
static func random() -> Self
/// Combines two values of the conforming type; `test` uses it to accumulate its running sum.
func +(lhs: Self, rhs: Self) -> Self
}
extension Testable {
    /// Returns an array of `count` values, each produced by its own call to `Self.random()`.
    ///
    /// - Parameter count: Number of random values to generate.
    /// - Returns: The generated values, in the order they were produced.
    static func random(count: Int) -> [Self] {
        // Form the range first so an invalid `count` fails at the same point as before.
        let slots = 0 ..< count
        var values = [Self]()
        values.reserveCapacity(count)
        for _ in slots {
            values.append(Self.random())
        }
        return values
    }
}
extension Float : Testable {
    /// Returns a random value assembled directly from its bit pattern.
    ///
    /// The pattern combines an exponent field of 127 with 23 random mantissa
    /// bits, which as an IEEE 754 single-precision value lies in [1, 2).
    /// Subtracting 1 moves the result into [0, 1).
    static func random() -> Float {
        let exponentBits = UInt32(127 << 23)
        // Keep only the low 23 bits of the random word: the mantissa field.
        let mantissaBits = arc4random() & 0x7fffff
        let oneToTwo = unsafeBitCast(exponentBits | mantissaBits, Float.self)
        return oneToTwo - 1.0
    }
}
extension float4 : Testable {
    /// Returns a `float4` built from an array of four random `Float` values.
    static func random() -> float4 {
        let lanes = Float.random(4)
        return float4(lanes)
    }
}
extension float4x4 : Testable {
    /// Returns a `float4x4` built from an array of four random `float4` values.
    static func random() -> float4x4 {
        let vectors = float4.random(4)
        return float4x4(vectors)
    }
}
//------------------------------------------------------------------------------
// A generic Testable type with a "static storage" of four elements.
// Note that V4<V4<Float>> makes a (Testable) 4x4 float matrix type.
//------------------------------------------------------------------------------
/// A container of exactly four `T` values held in a tuple.
///
/// `V4` is itself `Testable`, so it nests: `V4<V4<Float>>` holds sixteen floats.
struct V4<T: Testable> : Testable {
    /// The four stored values.
    var elements: (T, T, T, T)

    /// Creates a value from four explicit elements, stored in argument order.
    init(_ e0: T, _ e1: T, _ e2: T, _ e3: T) {
        self.elements = (e0, e1, e2, e3)
    }

    /// Creates a value whose four elements are each `T()`.
    init() {
        self.init(T(), T(), T(), T())
    }

    /// Returns a value whose elements come from four successive `T.random()` calls.
    static func random() -> V4 {
        let r0 = T.random()
        let r1 = T.random()
        let r2 = T.random()
        let r3 = T.random()
        return V4(r0, r1, r2, r3)
    }

    /// Combines `self` and `other` element by element using a caller-supplied function.
    ///
    /// - Parameter other: The value supplying the right-hand element of each pair.
    /// - Parameter transform: Called once per position, in order 0 through 3, with
    ///   this value's element first and `other`'s element second.
    /// - Returns: A `V4` holding the four results.
    func mapWith(other: V4, transform: (T, T) -> T) -> V4 {
        let (a0, a1, a2, a3) = elements
        let (b0, b1, b2, b3) = other.elements
        let c0 = transform(a0, b0)
        let c1 = transform(a1, b1)
        let c2 = transform(a2, b2)
        let c3 = transform(a3, b3)
        return V4(c0, c1, c2, c3)
    }

    /// Adds `self` and `other` element by element using `T`'s `+` directly.
    ///
    /// - Parameter other: The value supplying the right-hand operand of each sum.
    /// - Returns: A `V4` holding the four sums.
    func addedTo(other: V4) -> V4 {
        let (a0, a1, a2, a3) = elements
        let (b0, b1, b2, b3) = other.elements
        return V4(a0 + b0, a1 + b1, a2 + b2, a3 + b3)
    }
}
//------------------------------------------------------------------------------
// The + operator for V4<T> (which includes both V4<V4<Float>> and V4<Float>)
//------------------------------------------------------------------------------
/// Element-wise sum of two `V4<T>` values.
///
/// Exactly one of the two `return` lines in the body should be active at a time.
/// Both forward to a `V4` method that combines corresponding elements:
/// `mapWith` receives `+` as a function argument, while `addedTo` applies `+` directly.
/// The "Slow"/"Fast" markers record the author's measured timings for each variant.
func +<T: Testable>(lhs: V4<T>, rhs: V4<T>) -> V4<T> {
// return lhs.mapWith(rhs, transform: +) // <--- Slow
return lhs.addedTo(rhs) // <--- Fast (almost as fast as SIMD float4x4, so I presume the optimizer is able to vectorize this version.)
}
//------------------------------------------------------------------------------
// The test
//------------------------------------------------------------------------------
/// Times the summation of one million random `T` values and prints the result.
///
/// The summation is repeated ten times over the same input. The running sum is
/// carried across all ten passes, and a hash of its textual form is printed so
/// that the computed sum is actually used. The reported time is the element at
/// index `count / 2` of the sorted pass durations.
///
/// - Parameter _: The type to benchmark; only used to select `T`.
func test<T: Testable>(_: T.Type) {
    let samples = T.random(1_000_000)
    var durations = [Double]()
    var accumulator = T()
    print("\(T.self):")
    for _ in 0 ..< 10 {
        let start = CACurrentMediaTime()
        // The measured work: fold every sample into the running sum.
        for i in samples.indices { accumulator = accumulator + samples[i] }
        let elapsed = CACurrentMediaTime() - start
        durations.append(elapsed)
    }
    // `sort()` is used here in its value-returning form; `durations` itself is untouched.
    let ordered = durations.sort()
    let median = ordered[ordered.count / 2]
    let checksum = "\(accumulator)".hashValue
    let report = String(format: " median time: %.6f s (deadcodeeliminationprevention: %lld)",
                        median,
                        checksum)
    print(report)
}
// Run the benchmark for the SIMD matrix type first, then for the generic V4-based matrix type.
test(float4x4.self)
test(V4<V4<Float>>.self)
This is what it prints when run on my machine with the fast version of the + operator:
float4x4:
median time: 0.003526 s (deadcodeeliminationprevention: 4799450059808157151)
V4<V4<Float>>:
median time: 0.005425 s (deadcodeeliminationprevention: 4799450061309272077)
and when using the slow version:
float4x4:
median time: 0.003302 s (deadcodeeliminationprevention: -4799450061362652137)
V4<V4<Float>>:
median time: 1.279336 s (deadcodeeliminationprevention: 4799450059394151605)