mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-10 11:02:55 +00:00
Merge commit 'f07ccd6e4fbc5bbfeb94d40e0f14bc527a7d5439' as 'cpp/simdcomp'
This commit is contained in:
182
cpp/simdcomp/scripts/avxpacking.py
Executable file
182
cpp/simdcomp/scripts/avxpacking.py
Executable file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python
|
||||
import sys
|
||||
def howmany(bit):
    """Return the number of integers packed per block.

    The block size is the same for every bit width; ``bit`` is not used.
    """
    values_per_block = 256
    return values_per_block
|
||||
|
||||
def howmanywords(bit):
    """Return how many 256-bit words one block occupies at ``bit`` bits per value.

    The total bit count (``howmany(bit) * bit``) is rounded up to a whole
    number of 256-bit words.

    Floor division is used so the result is an ``int`` on Python 3 as well as
    Python 2; with ``/`` Python 3 would return a float, which would then be
    formatted as e.g. ``1.0`` in the generated C comments.
    """
    return (howmany(bit) * bit + 255) // 256
|
||||
|
||||
def howmanybytes(bit):
    """Return the number of bytes one packed block occupies at ``bit`` bits per value.

    ``howmanywords`` counts 256-bit words, and a 256-bit word is 32 bytes.
    (The previous factor of 16 is the size of a 128-bit word and made the
    "using N bytes" comments in the generated code report half the real size.)
    """
    bytes_per_word = 256 // 8
    return howmanywords(bit) * bytes_per_word
|
||||
|
||||
# Opening banner of the generated C fragment, followed by the function-pointer
# typedefs shared by every packer and unpacker emitted further down.
print("\n".join([
    "\n/** code generated by avxpacking.py starts here **/\n",
    "typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);",
    "typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);",
]))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def plurial(number):
    """Return the plural suffix for ``number``: ``""`` for exactly 1, else ``"s"``.

    Uses ``!=`` rather than the Python-2-only ``<>`` operator, which is a
    syntax error on Python 3.
    """
    if number != 1:
        return "s"
    return ""
|
||||
|
||||
# Bit width 0: nothing is stored, so the generated packer is an empty stub that
# only casts its parameters to void.
print("")
print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {")
print(" (void)compressed;")
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0), plurial(howmany(0))))
print("}")
print("")

# Bit widths 1..32: emit one fully unrolled packer per width.
# All index arithmetic uses floor division (//) and != so the script produces
# the same integers on Python 2 and Python 3; with "/" Python 3 would hand
# floats to range() and print register names such as "w0.0".
for bit in range(1, 33):
    print("")
    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit), bit, howmanywords(bit), howmanybytes(bit)))
    print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit))
    print(" const __m256i * in = (const __m256i *) pin;")
    print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit), plurial(howmanywords(bit))))
    if howmanywords(bit) == 1:
        print(" __m256i w0;")
    else:
        print(" __m256i w0, w1;")
    # tmp is only used when a value straddles two output words, which cannot
    # happen when the width is a power of two.
    if (bit & (bit - 1)) != 0:
        print(" __m256i tmp; /* used to store inputs at word boundary */")
    oldword = 0
    # One iteration per 256-bit input vector (8 x 32-bit integers each).
    for j in range(howmany(bit) // 8):
        # Output word in which the j-th value starts.
        firstword = j * bit // 32
        if firstword > oldword:
            # The previous output word is complete: flush it before reusing
            # its register (w0/w1 alternate by word parity).
            print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword, oldword % 2))
            oldword = firstword
        # Output word in which the j-th value ends.
        secondword = (j * bit + bit - 1) // 32
        firstshift = (j * bit) % 32
        if firstword == secondword:
            if firstshift == 0:
                # First value of a fresh word: plain load, no OR needed.
                print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword % 2, j))
            else:
                print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword % 2, j, firstshift))
        else:
            # The value straddles two words: low bits finish the current
            # word, the remaining high bits start the next one.
            print(" tmp = _mm256_lddqu_si256 (in + {0});".format(j))
            print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword % 2, j, firstshift))
            secondshift = 32 - firstshift
            print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword % 2, j, secondshift))
    # Flush the last output word.
    print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword, secondword % 2))
    print("}")
    print("")
|
||||
|
||||
|
||||
# Bit width 0 (masked variant): empty stub, identical in shape to the
# unmasked one.
print("")
print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {")
print(" (void)compressed;")
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0), plurial(howmany(0))))
print("}")
print("")

# Bit widths 1..32 (masked variant): same layout as the unmasked packers, but
# every loaded input vector is ANDed with a (2**bit - 1) mask first.
# Floor division (//) and != keep the arithmetic integral and the syntax
# valid on Python 3 as well as Python 2.
for bit in range(1, 33):
    print("")
    print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit), bit, howmanywords(bit), howmanybytes(bit)))
    print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit))
    print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit), plurial(howmanywords(bit))))
    if howmanywords(bit) == 1:
        print(" __m256i w0;")
    else:
        print(" __m256i w0, w1;")
    print(" const __m256i * in = (const __m256i *) pin;")
    # A 32-bit width keeps every bit, so no mask constant is emitted.
    if bit < 32:
        print(" const __m256i mask = _mm256_set1_epi32({0});".format((1 << bit) - 1))

    def maskfnc(x):
        """Wrap the C expression ``x`` in an AND with ``mask`` (no-op at 32 bits)."""
        if bit == 32:
            return x
        return " _mm256_and_si256 ( mask, {0}) ".format(x)

    # tmp is only used when a value straddles two output words, which cannot
    # happen when the width is a power of two.
    if (bit & (bit - 1)) != 0:
        print(" __m256i tmp; /* used to store inputs at word boundary */")
    oldword = 0
    # One iteration per 256-bit input vector (8 x 32-bit integers each).
    for j in range(howmany(bit) // 8):
        # Output word in which the j-th value starts.
        firstword = j * bit // 32
        if firstword > oldword:
            # The previous output word is complete: flush it before reusing
            # its register (w0/w1 alternate by word parity).
            print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword, oldword % 2))
            oldword = firstword
        # Output word in which the j-th value ends.
        secondword = (j * bit + bit - 1) // 32
        firstshift = (j * bit) % 32
        loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
        if firstword == secondword:
            if firstshift == 0:
                # First value of a fresh word: plain (masked) load.
                print(" w{0} = {1};".format(firstword % 2, loadstr))
            else:
                print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword % 2, loadstr, firstshift))
        else:
            # The value straddles two words: low bits finish the current
            # word, the remaining high bits start the next one.
            print(" tmp = {0};".format(loadstr))
            print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword % 2, j, firstshift))
            secondshift = 32 - firstshift
            print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword % 2, j, secondshift))
    # Flush the last output word.
    print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword, secondword % 2))
    print("}")
    print("")
|
||||
|
||||
|
||||
# Bit width 0: every value is zero, so the generated unpacker just clears the
# output. memset() takes a size in BYTES and the block holds howmany(0)
# 32-bit integers, so the count is scaled by sizeof(uint32_t); emitting the
# bare element count would clear only a quarter of the block.
print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {")
print(" (void) compressed;")
print(" memset(pout,0,{0} * sizeof(uint32_t));".format(howmany(0)))
print("}")
print("")

# Bit widths 1..32: emit one fully unrolled unpacker per width.
# Floor division (//) and != keep the arithmetic integral and the syntax
# valid on Python 3 as well as Python 2.
for bit in range(1, 33):
    print("")
    print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit), bit, howmanywords(bit), howmanybytes(bit)))
    print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit))
    print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit), plurial(howmanywords(bit))))
    if howmanywords(bit) == 1:
        print(" __m256i w0;")
    else:
        print(" __m256i w0, w1;")
    print(" __m256i * out = (__m256i *) pout;")
    # A 32-bit width keeps every bit, so no mask constant is emitted and the
    # masking template degenerates to the bare expression.
    if bit < 32:
        print(" const __m256i mask = _mm256_set1_epi32({0});".format((1 << bit) - 1))
    maskstr = " _mm256_and_si256 ( mask, {0}) "
    if bit == 32:
        maskstr = " {0} "  # no need
    oldword = 0
    print(" w0 = _mm256_lddqu_si256 (compressed);")
    # One iteration per 256-bit output vector (8 x 32-bit integers each).
    for j in range(howmany(bit) // 8):
        # Input words in which the j-th value starts and ends.
        firstword = j * bit // 32
        secondword = (j * bit + bit - 1) // 32
        if secondword > oldword:
            # First touch of a new input word: load it into the register
            # selected by its parity (w0/w1 alternate).
            print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword % 2, secondword))
            oldword = secondword
        firstshift = (j * bit) % 32
        firstshiftstr = "_mm256_srli_epi32( w{0} , " + str(firstshift) + ") "
        if firstshift == 0:
            firstshiftstr = " w{0} "  # no need
        wfirst = firstshiftstr.format(firstword % 2)
        if firstword == secondword:
            # When the value ends exactly at the top of the word, the right
            # shift already cleared everything above it, so no mask is needed.
            if firstshift + bit != 32:
                wfirst = maskstr.format(wfirst)
            print(" _mm256_storeu_si256(out + {0}, {1});".format(j, wfirst))
        else:
            # The value straddles two words: combine the high bits of the
            # first word with the low bits of the second, then mask.
            secondshift = 32 - firstshift
            wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword + 1) % 2, secondshift)
            wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst, wsecond)
            wfirstorsecond = maskstr.format(wfirstorsecond)
            print(" _mm256_storeu_si256(out + {0},\n {1});".format(j, wfirstorsecond))
    print("}")
    print("")
|
||||
|
||||
|
||||
# Dispatch table of the unmasked packers, one entry per bit width 0..32.
print("static avxpackblockfnc avxfuncPackArr[] = {")
print("\n".join("&avxpackblock{0},".format(width) for width in range(32)))
print("&avxpackblock32")
print("};")

# Dispatch table of the masked packers, one entry per bit width 0..32.
print("static avxpackblockfnc avxfuncPackMaskArr[] = {")
print("\n".join("&avxpackblockmask{0},".format(width) for width in range(32)))
print("&avxpackblockmask32")
print("};")

# Dispatch table of the unpackers, one entry per bit width 0..32.
print("static avxunpackblockfnc avxfuncUnpackArr[] = {")
print("\n".join("&avxunpackblock{0},".format(width) for width in range(32)))
print("&avxunpackblock32")
print("};")

# Closing banner matching the one printed at the start of the script.
print("/** code generated by avxpacking.py ends here **/")
|
||||
152
cpp/simdcomp/scripts/simdfor.py
Executable file
152
cpp/simdcomp/scripts/simdfor.py
Executable file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
|
||||
from math import ceil
|
||||
|
||||
# Placeholder header comment written at the top of the generated C output.
print("\n".join([
    "",
    "/**",
    "* Blablabla",
    "*",
    "*/",
    "",
    "",
]))
|
||||
|
||||
def mask(bit):
    """Return, as decimal text, the integer whose low ``bit`` bits are all set."""
    all_ones = (1 << bit) - 1
    return str(all_ones)
|
||||
|
||||
# Emit the C source of the frame-of-reference (FOR) packers and unpackers:
# ipackFOR0..ipackFOR32 and iunpackFOR0..iunpackFOR32.
# `length` is the number of 128-bit vectors each generated function handles
# on the unpacked side (the loops below emit exactly `length` input loads /
# output stores and assert that count).
# NOTE(review): block nesting here was reconstructed from statement order;
# confirm against the upstream script before relying on exact blank-line
# output.
for length in [32]:
    # Width 0 unpacker: no packed data is read; the offset vector is stored
    # 8 * 4 = 32 times.
    print("""
static __m128i iunpackFOR0(__m128i initOffset, const __m128i * _in , uint32_t * _out) {
__m128i *out = (__m128i*)(_out);
int i;
(void) _in;
for (i = 0; i < 8; ++i) {
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
_mm_store_si128(out++, initOffset);
}

return initOffset;
}

""")
    # Width 0 packer: writes nothing; all parameters are cast to void.
    print("""

static void ipackFOR0(__m128i initOffset , const uint32_t * _in , __m128i * out ) {
(void) initOffset;
(void) _in;
(void) out;
}
""")
    # Packers for widths 1..32, fully unrolled.
    for bit in range(1,33):
        offsetVar = " initOffset";
        print("""
static void ipackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const uint32_t * _in, __m128i * out) {
const __m128i *in = (const __m128i*)(_in);
__m128i OutReg;

""");

        # At 32 bits the input is copied as-is, so the offset is not
        # subtracted and initOffset is only cast to void.
        if (bit != 32):
            print(" __m128i CurrIn = _mm_load_si128(in);");
            print(" __m128i InReg = _mm_sub_epi32(CurrIn, initOffset);");
        else:
            print(" __m128i InReg = _mm_load_si128(in);");
            print(" (void) initOffset;");


        # inwordpointer: bit offset inside the current 32-bit output lane at
        # which the next value starts.
        # valuecounter: number of input vectors emitted so far.
        inwordpointer = 0
        valuecounter = 0
        # One pass per output word; k itself is not used in the body.
        for k in range(ceil((length * bit) / 32)):
            if(valuecounter == length): break
            # x walks the start offsets of the values that begin in this
            # output word.
            for x in range(inwordpointer,32,bit):
                if(x!=0) :
                    print(" OutReg = _mm_or_si128(OutReg, _mm_slli_epi32(InReg, " + str(x) + "));");
                else:
                    print(" OutReg = InReg; ");
                # The value reaches or crosses the end of the output word:
                # store the word and carry the spilled bits into the next.
                if((x+bit>=32) ):
                    # Advance to the first start offset at or past 32; after
                    # subtracting 32 below this is the offset in the next
                    # word, i.e. the number of bits that spilled over.
                    while(inwordpointer<32):
                        inwordpointer += bit
                    print(" _mm_store_si128(out, OutReg);");
                    print("");

                    if(valuecounter + 1 < length):
                        print(" ++out;")
                    inwordpointer -= 32;
                    if(inwordpointer>0):
                        # The shift amount is emitted as the C expression
                        # "<bit> - <spill>" rather than a precomputed number.
                        print(" OutReg = _mm_srli_epi32(InReg, " + str(bit) + " - " + str(inwordpointer) + ");");
                # Load the next input vector unless this was the last one.
                if(valuecounter + 1 < length):
                    print(" ++in;")

                    if (bit != 32):
                        print(" CurrIn = _mm_load_si128(in);");
                        print(" InReg = _mm_sub_epi32(CurrIn, initOffset);");
                    else:
                        print(" InReg = _mm_load_si128(in);");
                    # NOTE(review): placement of this blank-line print inside
                    # the enclosing `if` is reconstructed — confirm.
                    print("");
                valuecounter = valuecounter + 1
                if(valuecounter == length): break
        # Sanity check: exactly `length` input vectors were consumed.
        assert(valuecounter == length)
        # The trailing "" is an adjacent empty literal; it adds nothing.
        print("\n}\n\n""")

    # Unpackers for widths 1..31, fully unrolled (width 32 is emitted
    # separately below).
    for bit in range(1,32):
        offsetVar = " initOffset";
        print("""\n
static __m128i iunpackFOR"""+str(bit)+"""(__m128i """+offsetVar+""", const __m128i* in, uint32_t * _out) {
""");
        print(""" __m128i* out = (__m128i*)(_out);
__m128i InReg = _mm_load_si128(in);
__m128i OutReg;
__m128i tmp;
const __m128i mask = _mm_set1_epi32((1U<<"""+str(bit)+""")-1);

""");

        # The function body is accumulated as text and printed in one go.
        MainText = "";

        MainText += "\n";
        # Same bookkeeping as in the packer loop above: inwordpointer is the
        # bit offset of the next value in the current input word, and
        # valuecounter counts output vectors emitted so far.
        inwordpointer = 0
        valuecounter = 0
        for k in range(ceil((length * bit) / 32)):
            for x in range(inwordpointer,32,bit):
                if(valuecounter == length): break
                if (x > 0):
                    MainText += " tmp = _mm_srli_epi32(InReg," + str(x) +");\n";
                else:
                    MainText += " tmp = InReg;\n";
                # A value that ends below bit 32 has higher bits above it to
                # mask off; otherwise the shift already cleared them.
                if(x+bit<32):
                    MainText += " OutReg = _mm_and_si128(tmp, mask);\n";
                else:
                    MainText += " OutReg = tmp;\n";
                # The value reaches or crosses the end of the input word:
                # fetch the next word and OR in the spilled bits.
                if((x+bit>=32) ):
                    while(inwordpointer<32):
                        inwordpointer += bit
                    if(valuecounter + 1 < length):
                        # No newline after "++in;", so the load below lands
                        # on the same generated line.
                        MainText += " ++in;"
                        MainText += " InReg = _mm_load_si128(in);\n";
                    inwordpointer -= 32;
                    if(inwordpointer>0):
                        MainText += " OutReg = _mm_or_si128(OutReg, _mm_and_si128(_mm_slli_epi32(InReg, " + str(bit) + "-" + str(inwordpointer) + "), mask));\n\n";
                # Always true inside this loop, since bit stops at 31.
                if (bit != 32):
                    MainText += " OutReg = _mm_add_epi32(OutReg, initOffset);\n";
                MainText += " _mm_store_si128(out++, OutReg);\n\n";
                MainText += "";
                valuecounter = valuecounter + 1
                if(valuecounter == length): break
        # Sanity check: exactly `length` output vectors were produced.
        assert(valuecounter == length)
        print(MainText)
        print(" return initOffset;");
        print("\n}\n\n")
    # Width 32 unpacker: copies 128/4 = 32 vectors unchanged and returns the
    # last vector loaded; the generated body does not reference initvalue.
    # NOTE(review): whether this print sits inside or after the `length` loop
    # is reconstructed; with a single-element list both give the same output.
    print("""
static __m128i iunpackFOR32(__m128i initvalue , const __m128i* in, uint32_t * _out) {
__m128i * mout = (__m128i *)_out;
__m128i invec;
size_t k;
for(k = 0; k < 128/4; ++k) {
invec = _mm_load_si128(in++);
_mm_store_si128(mout++, invec);
}
return invec;
}
""")
|
||||
Reference in New Issue
Block a user