I'm trying to do conditional compilation with NEON. I tried the __ARN_NEON macro, but it doesn't work.
Can you help me?
Post
Replies
Boosts
Views
Activity
Hello,
I get this error:
error: accessing build database "/Users/.../Build/Intermediates.noindex/XCBuildData/build.db": disk I/O error
It only happens when I use a custom path for Derived Data.
How can I fix it?
Hi,
Is there an intrinsic that shifts each lane of a vector by the amount given in the corresponding lane of a second (shift-count) vector?
Thanks
hello,
I wrote a method with NEON to accelerate my code.
I don't get the expected result:
it uses only simple instructions (AND, OR, shift) on 64-bit lanes,
but there is no speed gain at all. Nada :-(
I don't understand. How should I write this method to make it faster?
Original code:
/*
 * Builds a 64-bit mask of candidate squares from two 64-bit disc masks.
 *
 * p_discs : bit mask of the discs of the side to move.
 * o_discs : bit mask of the opponent's discs.
 *
 * For each of the eight shift directions (+/-1, +/-8, +/-7, +/-9) the code
 * grows a run of opponent discs starting next to a player disc, then takes
 * the square one step past the run. Squares already occupied by either side
 * are removed from the result before returning.
 *
 * NOTE(review): the layout is presumably an 8x8 board stored one row per
 * byte; the direction labels below are the original author's - confirm
 * against the board mapping used by the rest of the class.
 */
unsigned long long RXBitBoard::get_legal_moves(const unsigned long long p_discs, const unsigned long long o_discs) {
    /* Opponent discs with bit 0 and bit 7 of every byte cleared; used for
     * every direction whose shift is not a multiple of 8. */
    const unsigned long long masked_o = o_discs & 0x7E7E7E7E7E7E7E7EULL;

    unsigned long long run;    /* opponent discs reachable from a player disc */
    unsigned long long pairs;  /* opponent discs whose neighbour is also an opponent disc */

    /* direction W : shift right by 1 */
    run = masked_o & (p_discs >> 1);
    run |= masked_o & (run >> 1);
    pairs = masked_o & (masked_o >> 1);
    run |= pairs & (run >> 2);
    run |= pairs & (run >> 2);
    unsigned long long moves = run >> 1;

    /* direction _E : shift left by 1, done with an addition instead of the
     * shift/and/or ladder. Adding the shifted player discs to the masked
     * opponent discs makes the carry ripple through a contiguous run of
     * opponent discs and land on the bit just past it. Opponent bits that
     * were not consumed by a carry stay set here, but the final mask below
     * clears every occupied square. */
    const unsigned long long shifted_p = p_discs << 1;
    moves |= ~shifted_p & (shifted_p + masked_o);

    /* direction S : shift right by 8 (unmasked opponent discs) */
    run = o_discs & (p_discs >> 8);
    run |= o_discs & (run >> 8);
    pairs = o_discs & (o_discs >> 8);
    run |= pairs & (run >> 16);
    run |= pairs & (run >> 16);
    moves |= run >> 8;

    /* direction N : shift left by 8 (unmasked opponent discs) */
    run = o_discs & (p_discs << 8);
    run |= o_discs & (run << 8);
    pairs = o_discs & (o_discs << 8);
    run |= pairs & (run << 16);
    run |= pairs & (run << 16);
    moves |= run << 8;

    /* direction NE : shift right by 7 */
    run = masked_o & (p_discs >> 7);
    run |= masked_o & (run >> 7);
    pairs = masked_o & (masked_o >> 7);
    run |= pairs & (run >> 14);
    run |= pairs & (run >> 14);
    moves |= run >> 7;

    /* direction SW : shift left by 7 */
    run = masked_o & (p_discs << 7);
    run |= masked_o & (run << 7);
    pairs = masked_o & (masked_o << 7);
    run |= pairs & (run << 14);
    run |= pairs & (run << 14);
    moves |= run << 7;

    /* direction NW : shift right by 9 */
    run = masked_o & (p_discs >> 9);
    run |= masked_o & (run >> 9);
    pairs = masked_o & (masked_o >> 9);
    run |= pairs & (run >> 18);
    run |= pairs & (run >> 18);
    moves |= run >> 9;

    /* direction SE : shift left by 9 */
    run = masked_o & (p_discs << 9);
    run |= masked_o & (run << 9);
    pairs = masked_o & (masked_o << 9);
    run |= pairs & (run << 18);
    run |= pairs & (run << 18);
    moves |= run << 9;

    /* Drop every square that already holds a disc of either colour. */
    const unsigned long long occupied = p_discs | o_discs;
    return moves & ~occupied;
}
My NEON code:
// NEON variant of the move generator. The function header is not part of
// this excerpt; p_discs and o_discs are the same two 64-bit disc masks used
// by the scalar version.
//
// Every vector holds two 64-bit lanes. The shift-count vectors below carry
// a negative count in lane 0 and the matching positive count in lane 1;
// per the Arm documentation for vshlq_u64 (USHL) a negative per-lane count
// shifts right, so each pass handles two opposite directions at once.

// Broadcast the scalar masks into both lanes.
const uint64x2_t pp_discs = vdupq_n_u64(p_discs);
const uint64x2_t oo_discs = vdupq_n_u64(o_discs);
// Opponent discs with bit 0 and bit 7 of every byte cleared, in both lanes.
const uint64x2_t inner_oo_discs = vdupq_n_u64(o_discs & 0x7E7E7E7E7E7E7E7EULL);
// horizontal directions: -1, +1
static const int64x2_t shift_1 = {-1, 1};
static const int64x2_t shift_2 = {-2, 2};
// Grow the run of opponent discs adjacent to a player disc: one step, then
// a second step, then two double steps through pairs of opponent discs.
uint64x2_t
flipped = vandq_u64(vshlq_u64(pp_discs, shift_1), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_1), inner_oo_discs));
uint64x2_t
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_1));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_2), adjacent_oo_discs));
// One more step past the run gives the candidate squares for this axis.
uint64x2_t legals = vshlq_u64(flipped, shift_1);
// vertical directions: -8, +8 (uses the unmasked opponent discs)
static const int64x2_t shift_8 = {-8, 8};
static const int64x2_t shift_16 = {-16, 16};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_8), oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_8), oo_discs));
adjacent_oo_discs = vandq_u64(oo_discs, vshlq_u64(oo_discs, shift_8));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_16), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_8));
// diagonal directions: -7, +7
static const int64x2_t shift_7 = {-7, 7};
static const int64x2_t shift_14 = {-14, 14};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_7), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_7), inner_oo_discs));
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_7));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_14), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_7));
// diagonal directions: -9, +9
static const int64x2_t shift_9 = {-9, 9};
static const int64x2_t shift_18 = {-18, 18};
flipped = vandq_u64(vshlq_u64(pp_discs, shift_9), inner_oo_discs);
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_9), inner_oo_discs));
adjacent_oo_discs = vandq_u64(inner_oo_discs, vshlq_u64(inner_oo_discs, shift_9));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
flipped = vorrq_u64(flipped, vandq_u64(vshlq_u64(flipped, shift_18), adjacent_oo_discs));
legals = vorrq_u64(legals, vshlq_u64(flipped, shift_9));
// Merge the two lanes into one 64-bit mask and clear every square that
// already holds a disc of either side.
return ((vgetq_lane_u64(legals, 0) | vgetq_lane_u64(legals, 1)) & ~(p_discs | o_discs));
}
I even wrote an interleaved version.
I should be able to expect a speed gain, shouldn't I?
Do you know of any software that shows the frequency of the cores in real time?
It seems to me that the MacBook Pro adapts the core frequency according to load and temperature.
Thanks
hi
I don't know if I am posting in the right section. My program crashes without any message in the console, only "error 34 311".
What do these codes mean?
Error -311 smFHBlkDispErr: an error occurred during _sDisposePtr (disposing of the FHeader block).
Error -34 dskFulErr: disk full.
Or is it something else?
thank
hello,
I would like to run benchmarks on P-cores vs. E-cores.
I suspect a difference of more than 50%.
Is it possible to choose which type of core (P or E) a program runs on?
Have a nice day
hi
The question is in the title: do we already know the length of the vectors?
best regards
hi
I want to create a command-line tool for both arm64 and Intel x86-64,
using Xcode 16 on a MacBook Pro M3.
My settings:
and
but my application is compiled only for arm64.
Can someone explain why?