Files
tantivy/cpp/simdcomp/scripts/avxpacking.py

183 lines
7.3 KiB
Python
Executable File

#!/usr/bin/env python
import sys
def howmany(bit):
""" how many values are we going to pack? """
return 256
def howmanywords(bit):
return (howmany(bit) * bit + 255)/256
def howmanybytes(bit):
return howmanywords(bit) * 16
print("""
/** code generated by avxpacking.py starts here **/
""")
print("""typedef void (*avxpackblockfnc)(const uint32_t * pin, __m256i * compressed);""")
print("""typedef void (*avxunpackblockfnc)(const __m256i * compressed, uint32_t * pout);""")
def plurial(number):
if(number <> 1):
return "s"
else :
return ""
print("")
print("static void avxpackblock0(const uint32_t * pin, __m256i * compressed) {");
print(" (void)compressed;");
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxpackblock{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
print(" const __m256i * in = (const __m256i *) pin;");
print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */")
oldword = 0
for j in range(howmany(bit)/8):
firstword = j * bit / 32
if(firstword > oldword):
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
oldword = firstword
secondword = (j * bit + bit - 1)/32
firstshift = (j*bit) % 32
if( firstword == secondword):
if(firstshift == 0):
print(" w{0} = _mm256_lddqu_si256 (in + {1});".format(firstword%2,j))
else:
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(_mm256_lddqu_si256 (in + {1}) , {2}));".format(firstword%2,j,firstshift))
else:
print(" tmp = _mm256_lddqu_si256 (in + {0});".format(j))
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
secondshift = 32-firstshift
print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
print("}");
print("")
print("")
print("static void avxpackblockmask0(const uint32_t * pin, __m256i * compressed) {");
print(" (void)compressed;");
print(" (void) pin; /* we consumed {0} 32-bit integer{1} */ ".format(howmany(0),plurial(howmany(0))));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we are going to pack {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxpackblockmask{0}(const uint32_t * pin, __m256i * compressed) {{".format(bit));
print(" /* we are going to touch {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
print(" const __m256i * in = (const __m256i *) pin;");
if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
def maskfnc(x):
if(bit == 32): return x
return " _mm256_and_si256 ( mask, {0}) ".format(x)
if( (bit & (bit-1)) <> 0) : print(" __m256i tmp; /* used to store inputs at word boundary */")
oldword = 0
for j in range(howmany(bit)/8):
firstword = j * bit / 32
if(firstword > oldword):
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(oldword,oldword%2))
oldword = firstword
secondword = (j * bit + bit - 1)/32
firstshift = (j*bit) % 32
loadstr = maskfnc(" _mm256_lddqu_si256 (in + {0}) ".format(j))
if( firstword == secondword):
if(firstshift == 0):
print(" w{0} = {1};".format(firstword%2,loadstr))
else:
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32({1} , {2}));".format(firstword%2,loadstr,firstshift))
else:
print(" tmp = {0};".format(loadstr))
print(" w{0} = _mm256_or_si256(w{0},_mm256_slli_epi32(tmp , {2}));".format(firstword%2,j,firstshift))
secondshift = 32-firstshift
print(" w{0} = _mm256_srli_epi32(tmp,{2});".format(secondword%2,j,secondshift))
print(" _mm256_storeu_si256(compressed + {0}, w{1});".format(secondword,secondword%2))
print("}");
print("")
print("static void avxunpackblock0(const __m256i * compressed, uint32_t * pout) {");
print(" (void) compressed;");
print(" memset(pout,0,{0});".format(howmany(0)));
print("}");
print("")
for bit in range(1,33):
print("")
print("/* we packed {0} {1}-bit values, touching {2} 256-bit words, using {3} bytes */ ".format(howmany(bit),bit,howmanywords(bit),howmanybytes(bit)))
print("static void avxunpackblock{0}(const __m256i * compressed, uint32_t * pout) {{".format(bit));
print(" /* we are going to access {0} 256-bit word{1} */ ".format(howmanywords(bit),plurial(howmanywords(bit))));
if(howmanywords(bit) == 1):
print(" __m256i w0;")
else:
print(" __m256i w0, w1;")
print(" __m256i * out = (__m256i *) pout;");
if(bit < 32): print(" const __m256i mask = _mm256_set1_epi32({0});".format((1<<bit)-1));
maskstr = " _mm256_and_si256 ( mask, {0}) "
if (bit == 32) : maskstr = " {0} " # no need
oldword = 0
print(" w0 = _mm256_lddqu_si256 (compressed);")
for j in range(howmany(bit)/8):
firstword = j * bit / 32
secondword = (j * bit + bit - 1)/32
if(secondword > oldword):
print(" w{0} = _mm256_lddqu_si256 (compressed + {1});".format(secondword%2,secondword))
oldword = secondword
firstshift = (j*bit) % 32
firstshiftstr = "_mm256_srli_epi32( w{0} , "+str(firstshift)+") "
if(firstshift == 0):
firstshiftstr =" w{0} " # no need
wfirst = firstshiftstr.format(firstword%2)
if( firstword == secondword):
if(firstshift + bit <> 32):
wfirst = maskstr.format(wfirst)
print(" _mm256_storeu_si256(out + {0}, {1});".format(j,wfirst))
else:
secondshift = (32-firstshift)
wsecond = "_mm256_slli_epi32( w{0} , {1} ) ".format((firstword+1)%2,secondshift)
wfirstorsecond = " _mm256_or_si256 ({0},{1}) ".format(wfirst,wsecond)
wfirstorsecond = maskstr.format(wfirstorsecond)
print(" _mm256_storeu_si256(out + {0},\n {1});".format(j,wfirstorsecond))
print("}");
print("")
print("static avxpackblockfnc avxfuncPackArr[] = {")
for bit in range(0,32):
print("&avxpackblock{0},".format(bit))
print("&avxpackblock32")
print("};")
print("static avxpackblockfnc avxfuncPackMaskArr[] = {")
for bit in range(0,32):
print("&avxpackblockmask{0},".format(bit))
print("&avxpackblockmask32")
print("};")
print("static avxunpackblockfnc avxfuncUnpackArr[] = {")
for bit in range(0,32):
print("&avxunpackblock{0},".format(bit))
print("&avxunpackblock32")
print("};")
print("/** code generated by avxpacking.py ends here **/")