Hello,
I rewrote a method with NEON intrinsics to accelerate my code,
but I don't get the expected result.
It uses only simple instructions (AND, OR, shift...) on 64-bit values,
yet there is no speed gain at all. Nada :-(
I don't understand... how should I program this method for speed?
Original code:
/*
 * One directional pass toward lower bit indices (right shifts by `step`).
 * Starting from the player bits, the run is grown through bits of `mask`:
 * two single steps, then two double steps gated by pairs of adjacent mask
 * bits, which covers runs of up to six mask bits. The returned value is the
 * run shifted one more step, i.e. the bit just past the end of each run.
 */
static inline unsigned long long rx_scan_shr(const unsigned long long player, const unsigned long long mask, const unsigned int step) {
    const unsigned int double_step = 2 * step;
    const unsigned long long pairs = mask & (mask >> step);
    unsigned long long run = (player >> step) & mask;
    run |= (run >> step) & mask;
    run |= (run >> double_step) & pairs;
    run |= (run >> double_step) & pairs;
    return run >> step;
}

/*
 * Mirror image of rx_scan_shr: same propagation, but toward higher bit
 * indices (left shifts by `step`).
 */
static inline unsigned long long rx_scan_shl(const unsigned long long player, const unsigned long long mask, const unsigned int step) {
    const unsigned int double_step = 2 * step;
    const unsigned long long pairs = mask & (mask << step);
    unsigned long long run = (player << step) & mask;
    run |= (run << step) & mask;
    run |= (run << double_step) & pairs;
    run |= (run << double_step) & pairs;
    return run << step;
}

/*
 * Returns the bitboard of legal moves for the side owning p_discs against the
 * side owning o_discs.
 *
 * p_discs : bits of the player's discs.
 * o_discs : bits of the opponent's discs.
 *
 * For shifts of 1, 7 and 9 the opponent board is first masked with
 * 0x7E7E7E7E7E7E7E7E (bits 0 and 7 of every byte cleared); shifts of 8 use
 * the unmasked opponent board. Presumably this is an 8x8 board with one byte
 * per row, the mask preventing runs from wrapping between rows -- confirm
 * against the board layout used by the rest of RXBitBoard.
 */
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) {
    const unsigned long long o_inner = o_discs & 0x7E7E7E7E7E7E7E7EULL;
    const unsigned long long occupied = p_discs | o_discs;

    /* shift-by-1 axis, lower bit indices */
    unsigned long long moves = rx_scan_shr(p_discs, o_inner, 1);

    /*
     * shift-by-1 axis, higher bit indices: carry trick instead of the
     * generic pass. Adding the masked opponent bits to (p_discs << 1) lets
     * the carry ripple through each contiguous opponent run that starts
     * right after a player disc and land on the first bit past the run.
     * Bits that are merely the shifted player discs are cleared by
     * "& ~p_shifted"; leftover opponent bits are removed by the final mask.
     */
    const unsigned long long p_shifted = p_discs << 1;
    moves |= (p_shifted + o_inner) & ~p_shifted;

    /* shift-by-8 axis: full opponent board, no masking */
    moves |= rx_scan_shr(p_discs, o_discs, 8);
    moves |= rx_scan_shl(p_discs, o_discs, 8);

    /* shift-by-7 axis */
    moves |= rx_scan_shr(p_discs, o_inner, 7);
    moves |= rx_scan_shl(p_discs, o_inner, 7);

    /* shift-by-9 axis */
    moves |= rx_scan_shr(p_discs, o_inner, 9);
    moves |= rx_scan_shl(p_discs, o_inner, 9);

    /* A move can only be played on a square not already holding a disc. */
    return moves & ~occupied;
}
My NEON code:
// NEON variant of the legal-move computation. Every value is a vector of two
// 64-bit lanes, and each shift uses a per-lane count of the form {-k, +k}.
// NOTE(review): the function header is not part of this excerpt; p_discs and
// o_discs are presumably the same 64-bit arguments as in the scalar version.
// NOTE(review): Arm documents vshlq_u64 with a negative per-lane count as a
// right shift, so lane 0 presumably mirrors the scalar ">> k" pass and lane 1
// the "<< k" pass -- confirm against the intrinsic reference.

// Broadcast the player discs, the opponent discs, and the opponent discs
// masked with 0x7E7E7E7E7E7E7E7E (bits 0 and 7 of each byte cleared) to both lanes.
const uint64x2_t pp_discs = vdupq_n_u64(p_discs);
const uint64x2_t oo_discs = vdupq_n_u64(o_discs);
const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL);
// Horizontal axis: per-lane shift counts -1 / +1 (single step) and -2 / +2 (double step).
static const int64x2_t shift_1 = {-1, 1};
static const int64x2_t shift_2 = {-2, 2};
// Seed the run with opponent discs one step away from a player disc, then extend by one step.
uint64x2_t
flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs));
// Pairs of masked opponent discs that are one step apart; they gate the double-step extensions.
uint64x2_t
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
// One more step past the run gives the candidate squares; this initializes the accumulator.
uint64x2_t legals = vshlq_u64(flipped, shift_1);
// Vertical axis: shift counts -8 / +8, using the unmasked opponent discs.
static const int64x2_t shift_8 = {-8, 8};
static const int64x2_t shift_16 = {-16, 16};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs));
adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8));
// First diagonal axis: shift counts -7 / +7, using the masked opponent discs.
static const int64x2_t shift_7 = {-7, 7};
static const int64x2_t shift_14 = {-14, 14};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs));
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7));
// Second diagonal axis: shift counts -9 / +9, using the masked opponent discs.
static const int64x2_t shift_9 = {-9, 9};
static const int64x2_t shift_18 = {-18, 18};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs));
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9));
// Merge the two lanes into one 64-bit result and drop squares already occupied by either side.
return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs));
}
I even wrote an interleaved version.
I should be able to expect a speed gain, shouldn't I?