Why is ARM NEON not faster than ARM ACLE?

hello,

I wrote a method with NEON to accelerate my code.

I don't get the expected result.

It uses only simple instructions (and, or, shift...) on 64-bit values.

No speed gain. Nada :-(

I don't understand... How should I program this method for speed?

Original code:

/*
 * Helper for get_legal_moves: scan toward lower bit indices.
 *
 * Starting from the player's discs, steps `step` bits to the right across
 * squares that are set in `mask`, and returns the square one further step
 * beyond each run that was walked.
 *
 * The run is collected with two single steps followed by two double steps
 * (the double steps are gated on pairs of consecutive `mask` squares), so a
 * run of up to six squares is covered.
 */
static inline unsigned long long rx_scan_down(const unsigned long long p_discs,
                                              const unsigned long long mask,
                                              const int step)
{
    const int stride = 2 * step;
    const unsigned long long pairs = mask & (mask >> step);

    unsigned long long run = (p_discs >> step) & mask;
    run |= (run >> step) & mask;
    run |= (run >> stride) & pairs;
    run |= (run >> stride) & pairs;

    return run >> step;
}

/*
 * Helper for get_legal_moves: scan toward higher bit indices.
 *
 * Mirror image of rx_scan_down, using left shifts instead of right shifts.
 */
static inline unsigned long long rx_scan_up(const unsigned long long p_discs,
                                            const unsigned long long mask,
                                            const int step)
{
    const int stride = 2 * step;
    const unsigned long long pairs = mask & (mask << step);

    unsigned long long run = (p_discs << step) & mask;
    run |= (run << step) & mask;
    run |= (run << stride) & pairs;
    run |= (run << stride) & pairs;

    return run << step;
}

/*
 * Returns a bitboard of the squares reachable from the player's discs
 * (p_discs) by crossing a run of opponent discs (o_discs) in one of eight
 * shift directions, restricted to squares occupied by neither side.
 *
 * Shift steps of 1, 7 and 9 use an opponent mask with bit 0 and bit 7 of every
 * byte cleared; steps of 8 use the full opponent bitboard.
 * NOTE(review): this presumably keeps horizontal/diagonal scans from wrapping
 * between rows of an 8x8 board stored one row per byte -- confirm the layout.
 */
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) {

    const unsigned long long o_inner = o_discs & 0x7E7E7E7E7E7E7E7EULL;

    /* step 1, right shift (labelled "W" in the original) */
    unsigned long long moves = rx_scan_down(p_discs, o_inner, 1);

    /*
     * step 1, left shift (labelled "E" in the original).
     * Carry trick used instead of a shift-based scan: adding the shifted
     * player discs to the inner opponent mask lets the carry ripple through a
     * run of opponent discs and land on the square just past it. Opponent
     * bits left over in the sum are cleared by the final mask below.
     */
    const unsigned long long p_up1 = p_discs << 1;
    moves |= (p_up1 + o_inner) & ~p_up1;

    /* step 8, both ways (labelled "S" and "N"); full opponent mask */
    moves |= rx_scan_down(p_discs, o_discs, 8);
    moves |= rx_scan_up(p_discs, o_discs, 8);

    /* step 7, both ways (labelled "NE" and "SW") */
    moves |= rx_scan_down(p_discs, o_inner, 7);
    moves |= rx_scan_up(p_discs, o_inner, 7);

    /* step 9, both ways (labelled "NW" and "SE") */
    moves |= rx_scan_down(p_discs, o_inner, 9);
    moves |= rx_scan_up(p_discs, o_inner, 9);

    /* Keep only squares that hold no disc of either colour. */
    const unsigned long long occupied = p_discs | o_discs;
    return moves & ~occupied;

}

My NEON code:

    
    // NOTE(review): the enclosing function header is not shown in this paste;
    // p_discs and o_discs are presumably the same 64-bit parameters as in the
    // scalar version -- confirm.
    //
    // Each bitboard is duplicated into both 64-bit lanes so that every step
    // below handles a pair of opposite shift directions at once: lane 0 takes
    // the negative shift count, lane 1 the positive one.
    const uint64x2_t pp_discs = vdupq_n_u64(p_discs);
    const uint64x2_t oo_discs = vdupq_n_u64(o_discs);
    
    // Opponent discs with bit 0 and bit 7 of every byte cleared; used for the
    // shift steps of 1, 7 and 9 below.
    const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL);

    
    // Horizontal directions: shift counts -1 (lane 0) and +1 (lane 1).
    // vshlq_u64 takes a signed per-lane count; per the Arm intrinsics
    // reference a negative count is a right shift.
    static const int64x2_t shift_1 = {-1, 1};
    static const int64x2_t shift_2 = {-2, 2};
    
    // Two single steps across inner opponent discs...
    uint64x2_t
    flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs));

    // ...then two double steps, gated on pairs of consecutive opponent discs.
    uint64x2_t 
    adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1));

    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));

    // One more step past the collected run gives the candidate squares.
    uint64x2_t legals = vshlq_u64(flipped, shift_1);

    // Vertical directions: shift counts -8 and +8, using the full opponent
    // bitboard (no inner mask).
    static const int64x2_t shift_8  = {-8,   8};
    static const int64x2_t shift_16 = {-16, 16};

    flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs));
    
    adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8));
    
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));

    legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8));

    // Diagonal directions: shift counts -7 and +7, inner opponent mask.
    static const int64x2_t shift_7  = {-7,   7};
    static const int64x2_t shift_14 = {-14, 14};

    flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs));

    adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7));

    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));

    legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7));
    
    // Diagonal directions: shift counts -9 and +9, inner opponent mask.
    static const int64x2_t shift_9  = {-9,   9};
    static const int64x2_t shift_18 = {-18, 18};

    flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs));

    adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9));

    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
    flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));

    legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9));
    
    
    // Merge the two lanes back into one scalar bitboard and drop every square
    // that already holds a disc of either side.
    return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs));

}

I even wrote an interleaved version.

I can expect a speed gain, no?

Have you looked at the generated assembler?

How are you measuring the performance?

My last attempt at writing NEON code was also disappointing. Looking at the generated code, in my case it was clear that the NEON version had a significant overhead before and after the NEON instructions to move the data to and from the vector registers; on the other hand, the non-NEON version was better optimised and actually managed to use some vector instructions.

In your case of 64-bit elements the best possible speedup is 2X, while with 32-bit elements (e.g. floats) the best-possible is 4X, and with bytes it is 16X. Based on my experience, I wouldn't try to use NEON in order to get only a max 2X improvement - but I probably would consider it to get a 16X improvement. 4X is borderline.

Is this code for a board game?

Have you looked at the generated assembler? No, how do I do that?

How are you measuring the performance? Using the system time.

2X would be very cool. Yes, it is for a game: Othello.

Have you looked at the generated assembler?

No, how do I do that?

I have to Google it every time I need to do it. You'll find Stack Overflow answers for ancient versions of Xcode, and things have changed a bit. I think the key is the menu item "Product" -> "Perform Action" -> "Assemble (filename)". Or, if it's a small self-contained file, compile it on the command line with clang -S.

2X would be very cool.

The point is that 2X is the limit on how much performance improvement you will get from vectorisation. In reality it will be lower. Most likely there are other things you can do with better results. Does the algorithm have the right complexity order? What does the profiler tell you?

Why is ARM NEON not faster than ARM ACLE?
 
 
Q