Using SIMD instructions in mapZoom

This commit is contained in:
jewe37 2018-04-21 22:29:42 +02:00
parent 37bb2819d3
commit e6e9487d9e
3 changed files with 270 additions and 3 deletions

111
layers.c
View File

@ -162,7 +162,116 @@ void mapIsland(Layer *l, int * __restrict out, int areaX, int areaZ, int areaWid
}
}
#ifdef __AVX2__
void mapZoom(Layer *l, int* __restrict out, int areaX, int areaZ, int areaWidth, int areaHeight) {
int pWidth = (areaWidth>>1)+2, pHeight = (areaHeight>>1)+1;
__m256i (*selectRand)(__m256i* cs, int ws, __m256i a1, __m256i a2, __m256i a3, __m256i a4) = (l->p->getMap == mapIsland) ? select8Random4 : select8ModeOrRandom;
int newWidth = areaWidth+10&0xFFFFFFFE;//modified to ignore ends
int x, z;
__m256i cs, a, b, a1, b1, toBuf1, toBuf2, aSuf;
__m256i mask1 = _mm256_setr_epi32(0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0), mask2 = _mm256_setr_epi32(0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF);
__m256i shuffle = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
int pX = areaX&0xFFFFFFFE;
__m256i xs = _mm256_set_epi32(pX+14, pX+12, pX+10, pX+8, pX+6, pX+4, pX+2, pX), zs;
__m256i v2 = _mm256_set1_epi32(2), v16 = _mm256_set1_epi32(16);
int* buf = malloc((newWidth+1)*(areaHeight+2|1)*sizeof(*buf));
int* idx = buf;
int* outIdx = out;
//z first!
for (x = 0; x < pWidth-1; x += 8) {
a = _mm256_loadu_si256((__m256i*)(outIdx));//0, 0
b = _mm256_loadu_si256((__m256i*)(outIdx+1));//1, 0
zs = _mm256_set1_epi32(areaZ&0xFFFFFFFE);
for (z = 0; z < pHeight; z++) {
cs = set8ChunkSeeds(l->worldSeed, xs, zs);
outIdx += pWidth;
a1 = _mm256_loadu_si256((__m256i*)(outIdx));//0, 1
b1 = _mm256_loadu_si256((__m256i*)(outIdx+1));//1, 1
toBuf1 = _mm256_permutevar8x32_epi32(select8Random2(&cs, l->worldSeed, a, a1), shuffle);
toBuf2 = _mm256_permutevar8x32_epi32(select8Random2(&cs, l->worldSeed, a, b), shuffle);
aSuf = _mm256_permutevar8x32_epi32(a, shuffle);
_mm256_maskstore_epi32(idx, mask1, aSuf);
_mm256_maskstore_epi32(idx+1, mask1, toBuf2);
_mm256_maskstore_epi32(idx+7, mask2, aSuf);
_mm256_maskstore_epi32(idx+8, mask2, toBuf2);
idx += newWidth;
toBuf2 = _mm256_permutevar8x32_epi32(selectRand(&cs, l->worldSeed, a, b, a1, b1), shuffle);
_mm256_maskstore_epi32(idx, mask1, toBuf1);
_mm256_maskstore_epi32(idx+1, mask1, toBuf2);
_mm256_maskstore_epi32(idx+7, mask2, toBuf1);
_mm256_maskstore_epi32(idx+8, mask2, toBuf2);
idx += newWidth;
a = a1;
b = b1;
zs = _mm256_add_epi32(zs, v2);
}
outIdx += 8-pHeight*pWidth;
idx += 16-pHeight*2*newWidth;
xs = _mm256_add_epi32(xs, v16);
}
for(z = 0; z < areaHeight; z++)
{
memcpy(&out[z*areaWidth], &buf[(z + (areaZ & 1))*newWidth + (areaX & 1)], areaWidth*sizeof(int));
}
free(buf);
}
#elif defined __SSE4_2__
void mapZoom(Layer *l, int* __restrict out, int areaX, int areaZ, int areaWidth, int areaHeight) {
int pWidth = (areaWidth>>1)+2, pHeight = (areaHeight>>1)+1;
__m128i (*selectRand)(__m128i* cs, int ws, __m128i a1, __m128i a2, __m128i a3, __m128i a4) = (l->p->getMap == mapIsland) ? select4Random4 : select4ModeOrRandom;
int newWidth = areaWidth+6&0xFFFFFFFE;//modified to ignore ends
int x, z;
__m128i cs, a, b, a1, b1, toBuf1, toBuf2, aSuf;
__m128i mask1 = _mm_setr_epi32(0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0), mask2 = _mm_setr_epi32(0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF);
int pX = areaX&0xFFFFFFFE;
__m128i xs = _mm_set_epi32(pX+6, pX+4, pX+2, pX), zs;
__m128i v2 = _mm_set1_epi32(2), v8 = _mm_set1_epi32(8);
int* buf = malloc((newWidth+1)*(areaHeight+2|1)*sizeof(*buf));
int* idx = buf;
int* outIdx = out;
//z first!
for (x = 0; x < pWidth-1; x += 4) {
a = _mm_loadu_si128((__m128i_u*)(outIdx));//0, 0
b = _mm_loadu_si128((__m128i_u*)(outIdx+1));//1, 0
zs = _mm_set1_epi32(areaZ&0xFFFFFFFE);
for (z = 0; z < pHeight; z++) {
cs = set4ChunkSeeds(l->worldSeed, xs, zs);
outIdx += pWidth;
a1 = _mm_loadu_si128((__m128i_u*)(outIdx));//0, 1
b1 = _mm_loadu_si128((__m128i_u*)(outIdx+1));//1, 1
toBuf1 = _mm_shuffle_epi32(select4Random2(&cs, l->worldSeed, a, a1), 0xD8);//11011000->3120->1324
toBuf2 = _mm_shuffle_epi32(select4Random2(&cs, l->worldSeed, a, b), 0xD8);
aSuf = _mm_shuffle_epi32(a, 0xD8);
_mm_maskmoveu_si128(aSuf, mask1, (char*)(idx));
_mm_maskmoveu_si128(toBuf2, mask1, (char*)(idx+1));
_mm_maskmoveu_si128(aSuf, mask2, (char*)(idx+3));
_mm_maskmoveu_si128(toBuf2, mask2, (char*)(idx+4));
idx += newWidth;
toBuf2 = _mm_shuffle_epi32(selectRand(&cs, l->worldSeed, a, b, a1, b1), 0xD8);
_mm_maskmoveu_si128(toBuf1, mask1, (char*)(idx));
_mm_maskmoveu_si128(toBuf2, mask1, (char*)(idx+1));
_mm_maskmoveu_si128(toBuf1, mask2, (char*)(idx+3));
_mm_maskmoveu_si128(toBuf2, mask2, (char*)(idx+4));
idx += newWidth;
a = a1;
b = b1;
zs = _mm_add_epi32(zs, v2);
}
outIdx += 4-pHeight*pWidth;
idx += 8-pHeight*2*newWidth;
xs = _mm_add_epi32(xs, v8);
}
for(z = 0; z < areaHeight; z++)
{
memcpy(&out[z*areaWidth], &buf[(z + (areaZ & 1))*newWidth + (areaX & 1)], areaWidth*sizeof(int));
}
free(buf);
}
#else
void mapZoom(Layer *l, int * __restrict out, int areaX, int areaZ, int areaWidth, int areaHeight)
{
int pX = areaX >> 1;
@ -256,7 +365,7 @@ void mapZoom(Layer *l, int * __restrict out, int areaX, int areaZ, int areaWidth
free(buf);
}
#endif
void mapAddIsland(Layer *l, int * __restrict out, int areaX, int areaZ, int areaWidth, int areaHeight)
{

158
layers.h
View File

@ -3,6 +3,19 @@
#include <stdlib.h>
#ifdef __AVX2__
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#warning "Using AVX2 extensions."
#elif defined __SSE4_2__
#include <emmintrin.h>
#include <smmintrin.h>
#warning "Using SSE4.2 extensions."
#else
#warning "Using no SIMD extensions."
#endif
#define STRUCT(S) typedef struct S S; struct S
#define OPT_O2 __attribute__((optimize("O2")))
@ -172,6 +185,150 @@ static inline void setBaseSeed(Layer *layer, long seed)
layer->chunkSeed = 0;
}
#ifdef __AVX2__
static inline __m256i set8ChunkSeeds(int ws, __m256i xs, __m256i zs) {
__m256i out = _mm256_set1_epi32(ws);
__m256i mul = _mm256_set1_epi32(1284865837);
__m256i add = _mm256_set1_epi32(4150755663);
out = _mm256_add_epi32(xs, _mm256_mullo_epi32(out, _mm256_add_epi32(add, _mm256_mullo_epi32(out, mul))));
out = _mm256_add_epi32(zs, _mm256_mullo_epi32(out, _mm256_add_epi32(add, _mm256_mullo_epi32(out, mul))));
out = _mm256_add_epi32(xs, _mm256_mullo_epi32(out, _mm256_add_epi32(add, _mm256_mullo_epi32(out, mul))));
return _mm256_add_epi32(zs, _mm256_mullo_epi32(out, _mm256_add_epi32(add, _mm256_mullo_epi32(out, mul))));
}
static inline __m256i mc8NextInt(__m256i* cs, int ws, int mask) {
__m256i and = _mm256_set1_epi32(mask);
__m256i ret = _mm256_and_si256(and, _mm256_srli_epi32(*cs, 24));
*cs = _mm256_add_epi32(_mm256_set1_epi32(ws), _mm256_mullo_epi32(*cs, _mm256_add_epi32(_mm256_set1_epi32(4150755663), _mm256_mullo_epi32(*cs, _mm256_set1_epi32(1284865837)))));
return _mm256_add_epi32(ret, _mm256_and_si256(and, _mm256_cmpgt_epi32(_mm256_set1_epi32(0), ret)));;
}
static inline __m256i select8Random2(__m256i* cs, int ws, __m256i a1, __m256i a2) {
__m256i cmp = _mm256_cmpeq_epi32(_mm256_set1_epi32(0), mc8NextInt(cs, ws, 0x1));
return _mm256_or_si256(_mm256_and_si256(cmp, a1), _mm256_andnot_si256(cmp, a2));
}
static inline __m256i select8Random4(__m256i* cs, int ws, __m256i a1, __m256i a2, __m256i a3, __m256i a4) {
__m256i val = mc8NextInt(cs, ws, 0x3);
__m256i v2 = _mm256_set1_epi32(2);
__m256i cmp1 = _mm256_cmpeq_epi32(val, _mm256_set1_epi32(0));
__m256i cmp2 = _mm256_cmpeq_epi32(v2, val);
__m256i cmp3 = _mm256_cmpgt_epi32(v2, val);
return _mm256_or_si256(
_mm256_and_si256(cmp3, _mm256_or_si256(_mm256_and_si256(cmp1, a1), _mm256_andnot_si256(cmp1, a2))),
_mm256_andnot_si256(cmp3, _mm256_or_si256(_mm256_and_si256(cmp2, a3), _mm256_andnot_si256(cmp2, a4)))
);
}
static inline __m256i select8ModeOrRandom(__m256i* cs, int ws, __m256i a1, __m256i a2, __m256i a3, __m256i a4) {
__m256i cmp1 = _mm256_cmpeq_epi32(a1, a2);
__m256i cmp2 = _mm256_cmpeq_epi32(a1, a3);
__m256i cmp3 = _mm256_cmpeq_epi32(a1, a4);
__m256i cmp4 = _mm256_cmpeq_epi32(a2, a3);
__m256i cmp5 = _mm256_cmpeq_epi32(a2, a4);
__m256i cmp6 = _mm256_cmpeq_epi32(a3, a4);
__m256i isa1 = _mm256_or_si256(
_mm256_andnot_si256(cmp6, cmp1),
_mm256_or_si256 (
_mm256_andnot_si256(cmp5, cmp2),
_mm256_andnot_si256(cmp4, cmp3)
)
);
__m256i isa2 = _mm256_or_si256(
_mm256_andnot_si256(cmp3, cmp4),
_mm256_andnot_si256(cmp2, cmp5)
);
__m256i isa3 = _mm256_andnot_si256(cmp1, cmp6);
return _mm256_or_si256(
_mm256_andnot_si256(
_mm256_or_si256(
isa1,
_mm256_or_si256(isa2, isa3)
),
select8Random4(cs, ws, a1, a2, a3, a4)
),
_mm256_or_si256(
_mm256_and_si256(isa1, a1),
_mm256_or_si256(
_mm256_and_si256(isa2, a2),
_mm256_and_si256(isa3, a3)
)
)
);
}
#elif defined __SSE4_2__
static inline __m128i set4ChunkSeeds(int ws, __m128i xs, __m128i zs) {
__m128i out = _mm_set1_epi32(ws);
__m128i mul = _mm_set1_epi32(1284865837);
__m128i add = _mm_set1_epi32(4150755663);
out = _mm_add_epi32(xs, _mm_mullo_epi32(out, _mm_add_epi32(add, _mm_mullo_epi32(out, mul))));
out = _mm_add_epi32(zs, _mm_mullo_epi32(out, _mm_add_epi32(add, _mm_mullo_epi32(out, mul))));
out = _mm_add_epi32(xs, _mm_mullo_epi32(out, _mm_add_epi32(add, _mm_mullo_epi32(out, mul))));
return _mm_add_epi32(zs, _mm_mullo_epi32(out, _mm_add_epi32(add, _mm_mullo_epi32(out, mul))));
}
static inline __m128i mc4NextInt(__m128i* cs, int ws, int mask) {
__m128i and = _mm_set1_epi32(mask);
__m128i ret = _mm_and_si128(and, _mm_srli_epi32(*cs, 24));
*cs = _mm_add_epi32( _mm_set1_epi32(ws), _mm_mullo_epi32(*cs, _mm_add_epi32(_mm_set1_epi32(4150755663), _mm_mullo_epi32(*cs, _mm_set1_epi32(1284865837)))));
return _mm_add_epi32(ret, _mm_and_si128(and, _mm_cmplt_epi32(ret, _mm_set1_epi32(0))));;
}
static inline __m128i select4Random2(__m128i* cs, int ws, __m128i a1, __m128i a2) {
__m128i cmp = _mm_cmpeq_epi32(_mm_set1_epi32(0), mc4NextInt(cs, ws, 0x1));
return _mm_or_si128(_mm_and_si128(cmp, a1), _mm_andnot_si128(cmp, a2));
}
static inline __m128i select4Random4(__m128i* cs, int ws, __m128i a1, __m128i a2, __m128i a3, __m128i a4) {
__m128i val = mc4NextInt(cs, ws, 0x3);
__m128i v2 = _mm_set1_epi32(2);
__m128i cmp1 = _mm_cmpeq_epi32(val, _mm_set1_epi32(0));
__m128i cmp2 = _mm_cmpeq_epi32(val, v2);
__m128i cmp3 = _mm_cmplt_epi32(val, v2);
return _mm_or_si128(
_mm_and_si128(cmp3, _mm_or_si128(_mm_and_si128(cmp1, a1), _mm_andnot_si128(cmp1, a2))),
_mm_andnot_si128(cmp3, _mm_or_si128(_mm_and_si128(cmp2, a3), _mm_andnot_si128(cmp2, a4)))
);
}
static inline __m128i select4ModeOrRandom(__m128i* cs, int ws, __m128i a1, __m128i a2, __m128i a3, __m128i a4) {
//((a == b)&(c != d) | (a == c)&(b != d) | (a == d)&(b != c))&a | ((b == c)&(a != d) | (b == d)&(a != c))&b | ((c == d)&(a != b))&c
__m128i cmp1 = _mm_cmpeq_epi32(a1, a2);
__m128i cmp2 = _mm_cmpeq_epi32(a1, a3);
__m128i cmp3 = _mm_cmpeq_epi32(a1, a4);
__m128i cmp4 = _mm_cmpeq_epi32(a2, a3);
__m128i cmp5 = _mm_cmpeq_epi32(a2, a4);
__m128i cmp6 = _mm_cmpeq_epi32(a3, a4);
__m128i isa1 = _mm_or_si128(
_mm_andnot_si128(cmp6, cmp1),
_mm_or_si128 (
_mm_andnot_si128(cmp5, cmp2),
_mm_andnot_si128(cmp4, cmp3)
)
);
__m128i isa2 = _mm_or_si128(
_mm_andnot_si128(cmp3, cmp4),
_mm_andnot_si128(cmp2, cmp5)
);
__m128i isa3 = _mm_andnot_si128(cmp1, cmp6);
return _mm_or_si128(
_mm_andnot_si128(
_mm_or_si128(
isa1,
_mm_or_si128(isa2, isa3)
),
select4Random4(cs, ws, a1, a2, a3, a4)
),
_mm_or_si128(
_mm_and_si128(isa1, a1),
_mm_or_si128(
_mm_and_si128(isa2, a2),
_mm_and_si128(isa3, a3)
)
)
);
}
#else
static inline int selectRandom2(Layer *l, int a1, int a2)
{
int i = mcNextInt(l, 2);
@ -201,6 +358,7 @@ static inline int selectModeOrRandom(Layer *l, int a1, int a2, int a3, int a4)
return rndarg;
}
#endif
// A null layer does nothing, and can be used to apply a layer to existing data.
void mapNull(Layer *l, int * __restrict out, int x, int z, int w, int h);

View File

@ -1,5 +1,5 @@
CC = g++
CFLAGS = -O3 -Wall -fwrapv
CC = gcc
CFLAGS = -O3 -Wall -fwrapv -march=native
LDFLAGS = -lm -pthread
.PHONY : all clean