对于8位除法,可以通过创建幻数表来实现。
见"Hacker's Delight", page 238
签名:
__m128i _mm_div_epi8(__m128i a, __m128i b)
{
__m128i abs_b = _mm_abs_epi8(b);
static const uint16_t magic_number_table[129] =
{
0x0000, 0x0000, 0x8080, 0x5580, 0x4040, 0x3380, 0x2ac0, 0x24c0, 0x2020, 0x1c80, 0x19c0, 0x1760, 0x1560, 0x13c0, 0x1260, 0x1120,
0x1010, 0x0f20, 0x0e40, 0x0d80, 0x0ce0, 0x0c40, 0x0bb0, 0x0b30, 0x0ab0, 0x0a40, 0x09e0, 0x0980, 0x0930, 0x08e0, 0x0890, 0x0850,
0x0808, 0x07d0, 0x0790, 0x0758, 0x0720, 0x06f0, 0x06c0, 0x0698, 0x0670, 0x0640, 0x0620, 0x05f8, 0x05d8, 0x05b8, 0x0598, 0x0578,
0x0558, 0x0540, 0x0520, 0x0508, 0x04f0, 0x04d8, 0x04c0, 0x04b0, 0x0498, 0x0480, 0x0470, 0x0458, 0x0448, 0x0438, 0x0428, 0x0418,
0x0404, 0x03f8, 0x03e8, 0x03d8, 0x03c8, 0x03b8, 0x03ac, 0x03a0, 0x0390, 0x0388, 0x0378, 0x0370, 0x0360, 0x0358, 0x034c, 0x0340,
0x0338, 0x032c, 0x0320, 0x0318, 0x0310, 0x0308, 0x02fc, 0x02f4, 0x02ec, 0x02e4, 0x02dc, 0x02d4, 0x02cc, 0x02c4, 0x02bc, 0x02b4,
0x02ac, 0x02a8, 0x02a0, 0x0298, 0x0290, 0x028c, 0x0284, 0x0280, 0x0278, 0x0274, 0x026c, 0x0268, 0x0260, 0x025c, 0x0258, 0x0250,
0x024c, 0x0248, 0x0240, 0x023c, 0x0238, 0x0234, 0x022c, 0x0228, 0x0224, 0x0220, 0x021c, 0x0218, 0x0214, 0x0210, 0x020c, 0x0208,
0x0202
};
Uint8 load_den[16];
_mm_storeu_si128((__m128i*)load_den, abs_b);
uint16_t mul[16];
for (size_t i = 0; i < 16; i++)
{
uint16_t cur_den = load_den[i];
mul[i] = magic_number_table[cur_den];
}
// for denominator 1, magic number is 0x10080 that 16bit-overflow occurs.
__m128i one = _mm_set1_epi8(1);
__m128i is_one = _mm_cmpeq_epi8(abs_b, one);
// -128/-128 is a special case where magic number does not work.
__m128i v80 = _mm_set1_epi8(0x80);
__m128i is_80_a = _mm_cmpeq_epi8(a, v80);
__m128i is_80_b = _mm_cmpeq_epi8(b, v80);
__m128i is_80 = _mm_and_si128(is_80_a, is_80_b);
// __m128i zero = _mm_setzero_si128();
// __m128i less_a = _mm_cmpgt_epi8(zero, a);
// __m128i less_b = _mm_cmpgt_epi8(zero, b);
// __m128i sign = _mm_xor_si128(less_a, less_b);
__m128i abs_a = _mm_abs_epi8(a);
#if 0
__m128i p = _mm_unpacklo_epi8(abs_a, zero);
__m128i q = _mm_unpackhi_epi8(abs_a, zero);
__m256i c = _mm256_castsi128_si256(p);
c = _mm256_insertf128_si256(c, q, 1);
#else
// Thanks to Peter Cordes
__m256i c = _mm256_cvtepu8_epi16(abs_a);
#endif
__m256i magic = _mm256_loadu_si256((const __m256i*)mul);
__m256i high = _mm256_mulhi_epu16(magic, c);
__m128i v0h = _mm256_extractf128_si256(high, 0);
__m128i v0l = _mm256_extractf128_si256(high, 1);
__m128i res = _mm_packus_epi16(v0h, v0l);
__m128i div = _mm_blendv_epi8(res, abs_a, is_one);
// __m128i neg = _mm_sub_epi8(zero, div);
// __m128i select = _mm_blendv_epi8(div, neg, sign);
__m128i select = _mm_sign_epi8(div, _mm_or_si128(_mm_xor_si128(a, b), one));
return _mm_blendv_epi8(select, one, is_80);
}
无符号:
__m128i _mm_div_epu8(__m128i n, __m128i den)
{
static const uint16_t magic_number_table[256] =
{
0x0001, 0x0000, 0x8000, 0x5580, 0x4000, 0x3340, 0x2ac0, 0x04a0, 0x2000, 0x1c80, 0x19a0, 0x0750, 0x1560, 0x13c0, 0x0250, 0x1120,
0x1000, 0x0f10, 0x0e40, 0x0d80, 0x0cd0, 0x0438, 0x03a8, 0x0328, 0x0ab0, 0x0a40, 0x09e0, 0x0980, 0x0128, 0x00d8, 0x0890, 0x0048,
0x0800, 0x07c8, 0x0788, 0x0758, 0x0720, 0x06f0, 0x06c0, 0x0294, 0x0668, 0x0640, 0x021c, 0x05f8, 0x05d8, 0x01b4, 0x0194, 0x0578,
0x0558, 0x013c, 0x0520, 0x0508, 0x04f0, 0x04d8, 0x04c0, 0x04a8, 0x0094, 0x0480, 0x006c, 0x0458, 0x0448, 0x0034, 0x0024, 0x0014,
0x0400, 0x03f4, 0x03e4, 0x03d4, 0x03c8, 0x03b8, 0x03ac, 0x039c, 0x0390, 0x0384, 0x0378, 0x036c, 0x0360, 0x0354, 0x014a, 0x0340,
0x0334, 0x032c, 0x0320, 0x0318, 0x010e, 0x0304, 0x02fc, 0x02f4, 0x02ec, 0x02e4, 0x02dc, 0x02d4, 0x02cc, 0x02c4, 0x02bc, 0x02b4,
0x02ac, 0x02a4, 0x02a0, 0x0298, 0x0290, 0x028c, 0x0284, 0x007e, 0x0278, 0x0072, 0x026c, 0x0066, 0x0260, 0x025c, 0x0254, 0x0250,
0x004a, 0x0244, 0x0240, 0x023c, 0x0036, 0x0032, 0x022c, 0x0228, 0x0224, 0x001e, 0x001a, 0x0016, 0x0012, 0x000e, 0x000a, 0x0006,
0x0200, 0x00fd, 0x01fc, 0x01f8, 0x01f4, 0x01f0, 0x01ec, 0x01e8, 0x01e4, 0x01e0, 0x01dc, 0x01d8, 0x01d6, 0x01d4, 0x01d0, 0x01cc,
0x01c8, 0x01c4, 0x01c2, 0x01c0, 0x01bc, 0x01b8, 0x01b6, 0x01b4, 0x01b0, 0x01ae, 0x01ac, 0x01a8, 0x01a6, 0x01a4, 0x01a0, 0x019e,
0x019c, 0x0198, 0x0196, 0x0194, 0x0190, 0x018e, 0x018c, 0x018a, 0x0188, 0x0184, 0x0182, 0x0180, 0x017e, 0x017c, 0x017a, 0x0178,
0x0176, 0x0174, 0x0172, 0x0170, 0x016e, 0x016c, 0x016a, 0x0168, 0x0166, 0x0164, 0x0162, 0x0160, 0x015e, 0x015c, 0x015a, 0x0158,
0x0156, 0x0154, 0x0152, 0x0051, 0x0150, 0x014e, 0x014c, 0x014a, 0x0148, 0x0047, 0x0146, 0x0144, 0x0142, 0x0140, 0x003f, 0x013e,
0x013c, 0x013a, 0x0039, 0x0138, 0x0136, 0x0134, 0x0033, 0x0132, 0x0130, 0x002f, 0x012e, 0x012c, 0x012a, 0x0029, 0x0128, 0x0126,
0x0025, 0x0124, 0x0122, 0x0021, 0x0120, 0x001f, 0x011e, 0x011c, 0x001b, 0x011a, 0x0019, 0x0118, 0x0116, 0x0015, 0x0114, 0x0013,
0x0112, 0x0110, 0x000f, 0x010e, 0x000d, 0x010c, 0x000b, 0x010a, 0x0009, 0x0108, 0x0007, 0x0106, 0x0005, 0x0104, 0x0003, 0x0102
};
static const uint16_t shift_table[256] =
{
0x0001, 0x0100, 0x0100, 0x0080, 0x0100, 0x0040, 0x0040, 0x0020, 0x0100, 0x0080, 0x0020, 0x0010, 0x0020, 0x0040, 0x0010, 0x0020,
0x0100, 0x0010, 0x0040, 0x0080, 0x0010, 0x0008, 0x0008, 0x0008, 0x0010, 0x0040, 0x0020, 0x0080, 0x0008, 0x0008, 0x0010, 0x0008,
0x0100, 0x0008, 0x0008, 0x0008, 0x0020, 0x0010, 0x0040, 0x0004, 0x0008, 0x0040, 0x0004, 0x0008, 0x0008, 0x0004, 0x0004, 0x0008,
0x0008, 0x0004, 0x0020, 0x0008, 0x0010, 0x0008, 0x0040, 0x0008, 0x0004, 0x0080, 0x0004, 0x0008, 0x0008, 0x0004, 0x0004, 0x0004,
0x0100, 0x0004, 0x0004, 0x0004, 0x0008, 0x0008, 0x0004, 0x0004, 0x0010, 0x0004, 0x0008, 0x0004, 0x0020, 0x0004, 0x0002, 0x0040,
0x0004, 0x0004, 0x0020, 0x0008, 0x0002, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004, 0x0004,
0x0004, 0x0004, 0x0020, 0x0008, 0x0010, 0x0004, 0x0004, 0x0002, 0x0008, 0x0002, 0x0004, 0x0002, 0x0020, 0x0004, 0x0004, 0x0010,
0x0002, 0x0004, 0x0040, 0x0004, 0x0002, 0x0002, 0x0004, 0x0008, 0x0004, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
0x0100, 0x0001, 0x0004, 0x0008, 0x0004, 0x0010, 0x0004, 0x0008, 0x0004, 0x0020, 0x0004, 0x0008, 0x0002, 0x0004, 0x0010, 0x0004,
0x0008, 0x0004, 0x0002, 0x0040, 0x0004, 0x0008, 0x0002, 0x0004, 0x0010, 0x0002, 0x0004, 0x0008, 0x0002, 0x0004, 0x0020, 0x0002,
0x0004, 0x0008, 0x0002, 0x0004, 0x0010, 0x0002, 0x0004, 0x0002, 0x0008, 0x0004, 0x0002, 0x0080, 0x0002, 0x0004, 0x0002, 0x0008,
0x0002, 0x0004, 0x0002, 0x0010, 0x0002, 0x0004, 0x0002, 0x0008, 0x0002, 0x0004, 0x0002, 0x0020, 0x0002, 0x0004, 0x0002, 0x0008,
0x0002, 0x0004, 0x0002, 0x0001, 0x0010, 0x0002, 0x0004, 0x0002, 0x0008, 0x0001, 0x0002, 0x0004, 0x0002, 0x0040, 0x0001, 0x0002,
0x0004, 0x0002, 0x0001, 0x0008, 0x0002, 0x0004, 0x0001, 0x0002, 0x0010, 0x0001, 0x0002, 0x0004, 0x0002, 0x0001, 0x0008, 0x0002,
0x0001, 0x0004, 0x0002, 0x0001, 0x0020, 0x0001, 0x0002, 0x0004, 0x0001, 0x0002, 0x0001, 0x0008, 0x0002, 0x0001, 0x0004, 0x0001,
0x0002, 0x0010, 0x0001, 0x0002, 0x0001, 0x0004, 0x0001, 0x0002, 0x0001, 0x0008, 0x0001, 0x0002, 0x0001, 0x0004, 0x0001, 0x0002
};
static const uint16_t mask_table[256] =
{
0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000,
0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000,
0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000,
0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000, 0x0000, 0xffff, 0x0000, 0x0000,
0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff,
0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000, 0xffff, 0x0000
};
uint8_t load_den[16];
_mm_storeu_si128((__m128i*)load_den, den);
uint16_t mul[16];
uint16_t mask[16];
uint16_t shift[16];
for (size_t i = 0; i < 16; i++)
{
const uint16_t cur_den = load_den[i];
mul[i] = magic_number_table[cur_den];
mask[i] = mask_table[cur_den];
shift[i] = shift_table[cur_den];
}
#if 0
__m128i a = _mm_unpacklo_epi8(n, _mm_setzero_si128());
__m128i b = _mm_unpackhi_epi8(n, _mm_setzero_si128());
__m256i c = _mm256_castsi128_si256(a);
c = _mm256_insertf128_si256(c, b, 1);
#else
// Thanks to Peter Cordes
__m256i c = _mm256_cvtepu8_epi16(n);
#endif
__m256i magic = _mm256_loadu_si256((const __m256i*)mul);
__m256i high = _mm256_mulhi_epu16(magic, c);
__m256i low = _mm256_mullo_epi16(magic, c);
__m256i low_down = _mm256_srli_epi16(low, 8);
__m256i high_up = _mm256_slli_epi16(high, 8);
__m256i low_high = _mm256_or_si256(low_down, high_up);
__m256i target_up = _mm256_mullo_epi16(c, _mm256_loadu_si256((const __m256i*)shift));
__m256i cal1 = _mm256_sub_epi16(target_up, low_high);
__m256i cal2 = _mm256_srli_epi16(cal1, 1);
__m256i cal3 = _mm256_add_epi16(cal2, low_high);
__m256i cal4 = _mm256_srli_epi16(cal3, 7);
__m256i res = _mm256_blendv_epi8(high, cal4, _mm256_loadu_si256((const __m256i*)mask));
__m128i v0h = _mm256_extractf128_si256(res, 0);
__m128i v0l = _mm256_extractf128_si256(res, 1);
return _mm_packus_epi16(v0h, v0l);
}