Bug in OpenCL implementation of erf-function

Dear all,


I have modified the Hello World example from https://developer.apple.com/library/content/samplecode/OpenCL_Hello_World_Example/Introduction/Intro.html

to illustrate a possible bug in the OpenCL implementation of the erf-function running in double precision on the CPU:


#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>

/* DATA_SIZE: number of elements processed.  DATA_TYPE: host element type; the
 * kernel source spells out "double" itself, so keep the two in sync. */
#define DATA_SIZE (1024)
#define DATA_TYPE double

/* OpenCL C program text compiled at run time; its "square" kernel stores
 * erf(input[i]) in output[i] for every work-item index i below count. */
const char *KernelSource = "\n"  /* adjacent literals concatenate; no line splices needed */
"#pragma OPENCL EXTENSION cl_khr_fp64 : enable                          \n"  /* request double support */
"__kernel void square(                                                  \n"  /* name looked up by the host */
"   __global double* input,                                             \n"
"   __global double* output,                                            \n"
"   const unsigned int count)                                           \n"
"{                                                                      \n"
"   int i = get_global_id(0);                                           \n"  /* one element per work-item */
"   if(i < count)                                                       \n"  /* skip indices at or past count */
"       output[i] = erf(input[i]);                                      \n"
"}                                                                      \n"
"\n";
/* ------------------------------------------------------------------------ */
int main(int argc, char** argv)
{
    int err;                            /
    
    DATA_TYPE data[DATA_SIZE];          /
    DATA_TYPE results[DATA_SIZE];       /
    unsigned int correct;               /
    size_t global;                      /
    size_t local;                       /
    cl_device_id device_id;             /
    cl_context context;                 /
    cl_command_queue commands;          /
    cl_program program;                 /
    cl_kernel kernel;                   /
  
    cl_mem input;                       /
    cl_mem output;                      /
  
    /
    /
    int i = 0;
    unsigned int count = DATA_SIZE;
    for(i = 0; i < count; i++)
        data[i] = rand() / (DATA_TYPE)RAND_MAX;
  
    /
    /
    int gpu = 0;
    err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to create a device group!\n");
        return EXIT_FAILURE;
    }

    /
    /
    context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
    if (!context)
    {
        printf("Error: Failed to create a compute context!\n");
        return EXIT_FAILURE;
    }
    /
    /
    commands = clCreateCommandQueue(context, device_id, 0, &err);
    if (!commands)
    {
        printf("Error: Failed to create a command commands!\n");
        return EXIT_FAILURE;
    }
    /
    /
    program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
    if (!program)
    {
        printf("Error: Failed to create compute program!\n");
        return EXIT_FAILURE;
    }
    /
    /
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        size_t len;
        char buffer[2048];
        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }
    /
    /
    kernel = clCreateKernel(program, "square", &err);
    if (!kernel || err != CL_SUCCESS)
    {
        printf("Error: Failed to create compute kernel!\n");
        exit(1);
    }
    /
    /
    input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(DATA_TYPE) * count, NULL, NULL);
    output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(DATA_TYPE) * count, NULL, NULL);
    if (!input || !output)
    {
        printf("Error: Failed to allocate device memory!\n");
        exit(1);
    }  
  
    /
    /
    err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(DATA_TYPE) * count, data, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to write to source array!\n");
        exit(1);
    }
    /
    /
    err = 0;
    err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
    err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to set kernel arguments! %d\n", err);
        exit(1);
    }
    /
    /
    err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to retrieve kernel work group info! %d\n", err);
        exit(1);
    }
    /
    /
    /
    global = count;
    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
    if (err)
    {
        printf("Error: Failed to execute kernel!\n");
        return EXIT_FAILURE;
    }
    /
    /
    clFinish(commands);
    /
    /
    err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(DATA_TYPE) * count, results, 0, NULL, NULL );
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        exit(1);
    }
  
    /
    /
    correct = 0;
    for(i = 0; i < count; i++)
    {
        printf("result=%g, correct=%g, diff=%g\n", results[i], erf(data[i]), results[i]-erf(data[i]));
        if(fabs(results[i] - erf(data[i])) < 1e-5)
            correct++;
    }
  
    /
    /
    printf("Computed '%d/%d' correct values!\n", correct, count);
  
    /
    /
    clReleaseMemObject(input);
    clReleaseMemObject(output);
    clReleaseProgram(program);
    clReleaseKernel(kernel);
    clReleaseCommandQueue(commands);
    clReleaseContext(context);
    return 0;
}


System Configuration OS X 10.11.6 MacBook Air (13-inch, Early 2014) Xcode Version 8.0 (8A218a)


The erf-function yields zeros if used with double-type data

result=0, correct=8.83111e-06, diff=-8.83111e-06
result=0, correct=0.147573, diff=-0.147573
result=0, correct=0.714744, diff=-0.714744
...

but produces the correct results if used with float-type data (changes from double to float in lines 14, 21-22).


Interestingly enough, the erfc-function works correctly both for float- and double-type data.


Any help is appreciated.


Best,

Matthias

Replies

I can reproduce this on a 2015 MacBook Pro with Retina display as well, and it may indeed be a bug. I'd encourage you to file a bug report with this sample code attached.

I already filed a bug report (ID 28447418) in September, but it seems to have been completely ignored since then.

Thanks for the report.

This implementation by Apple is absolutely terrible. It's as buggy as anything I've ever seen.


Deprecated functions, example programs that crash; I have yet to find an example program that actually compiles after being downloaded.


Example 1: histogram (after I had to fix paths and libraries)

==============================

Testing Histogram with Buffers

==============================

RGBA 8-bit: verified

Performance Number Time to compute RGBA unorm8 histogram

(in ms, lower is better): 1.84145

RGBA fp32: verified

Performance Number Time to compute RGBA fp32 histogram

(in ms, lower is better): 2.56007

=============================

Testing Histogram with Images

=============================

clBuildProgram() failed.

Log:


Program ended with exit code: 1


example 2 hello world

ld: warning: directory not found for option '-L../../lib'

ld: warning: directory not found for option '-L/Volumes/Swap/Down_Swap/OpenCL_Hello_World_Example/../lib'


example 3

/Applications/Hacks/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.12.sdk/System/Library/Frameworks/OpenCL.framework/Headers/cl.h:1170:1: 'clCreateImage2D' has been explicitly marked deprecated here


Using ffmpeg to bench OpenCL:


clEnqueueNDRangeKernel error 'INVALID WORK GROUP SIZE'

[opencl @ 0x10b214108] Benchmark failed with OpenCL device Intel(R) Core(TM) i7-4980HQ CPU @ 2.80GHz

platform_idx device_idx device_name runtime

0 1 AMD Radeon R9 M370X Compute Engine 5670

0 0 Iris Pro 11348