Hi
I am porting some applications to M1 that make extensive use of vDSP. I found in many cases there to be a minimal speed-up, which I put down to Rosetta doing a good job translating SSE instructions into equivalent Neon instructions in the vDSP library.
To try to understand this better, I started profiling various areas of the code and found situations where translated code runs faster than native code. Often the native code is of similar speed or faster, as expected, but there are a notable number of cases where it is not. This is not what I expected.
I include a sample below showing a somewhat contrived and trivial routine that exhibits the effect. I built it using Xcode 12.5.1 in Release with an 11.3 deployment target. The Mac is running macOS 11.6.
On my M1 Mac mini, the Rosetta build takes around 900-1000 µs to run to completion; the native build takes around 1500-1600 µs.
I can make various adjustments to the data size or types of vDSP operations used to find scenarios where native builds are faster, that is not difficult, but it shouldn't be necessary. I can understand why vDSP could perhaps perform similarly across native vs translated runs, but surely it should never be the case that translated code could beat native code by a margin like this. What is going on, and is it expected?
Thanks, Matt
#include <iostream>
#include <sys/types.h>
#include <sys/sysctl.h>
// Query the "sysctl.proc_translated" sysctl to find out whether this process
// is running through Rosetta translation.
//
// Returns the value reported by the sysctl on success (main() treats 1 as
// "translated"), 0 when the sysctl key does not exist (errno == ENOENT),
// and -1 when the query fails for any other reason.
int processIsTranslated() {
    int translated = 0;
    size_t length = sizeof(translated);
    const int status = sysctlbyname("sysctl.proc_translated", &translated, &length, NULL, 0);

    // Success: hand back whatever the kernel wrote into `translated`.
    if (status != -1)
        return translated;

    // Failure: a missing key is treated as "not translated"; anything else
    // is reported as an error.
    return (errno == ENOENT) ? 0 : -1;
}
int main(int argc, const char * argv[])
{
// print translation status
if(processIsTranslated() == 1)
std::cout << "Rosetta" << std::endl;
else
std::cout << "Native" << std::endl;
// size of test
vDSP_Length array_len = 512;
const int iterations = 10000;
// allocate and clear memory
float* buf1_ptr = (float*)malloc(array_len * sizeof(float));
float* buf2_ptr = (float*)malloc(array_len * sizeof(float));
float* buf3_ptr = (float*)malloc(array_len * sizeof(float));
float* buf4_ptr = (float*)malloc(array_len * sizeof(float));
if(!buf1_ptr) return EXIT_FAILURE;
if(!buf2_ptr) return EXIT_FAILURE;
if(!buf3_ptr) return EXIT_FAILURE;
if(!buf4_ptr) return EXIT_FAILURE;
memset(buf1_ptr, 0, array_len * sizeof(float));
memset(buf2_ptr, 0, array_len * sizeof(float));
memset(buf3_ptr, 0, array_len * sizeof(float));
memset(buf4_ptr, 0, array_len * sizeof(float));
// start timer
__uint64_t start_ns = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
// scalar constants
const float scalar1 = 10;
const float scalar2 = 11;
// loop test
for(int i = 0; i < iterations; i++)
{
vDSP_vsadd(buf1_ptr, 1, &scalar1, buf2_ptr, 1, array_len);
vDSP_vsadd(buf1_ptr, 1, &scalar2, buf3_ptr, 1, array_len);
vDSP_vadd(buf2_ptr, 1, buf3_ptr, 1, buf4_ptr, 1, array_len);
}
// report test time
__uint64_t end_ns = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
double time_us = (end_ns - start_ns) / 1000.f;
std::cout << time_us << " us" << std::endl;
// clean up
if(buf1_ptr) free(buf1_ptr);
if(buf2_ptr) free(buf2_ptr);
if(buf3_ptr) free(buf3_ptr);
return 0;
}