Hackers News
Dividing unsigned 8-bit numbers
Step 1: Updating remainder
Unlike in the SSE/AVX2 code, here it is easier to actually shift the dividend
right so that its i-th bit lands at position zero. Isolating the least
significant bit and merging it with the remainder can then be expressed
as a single ternary operation.
remainder <<= 1; t0 = dividend >> i; remainder = remainder | (t0 & 1); // ternary operation
Step 2: Comparison
AVX512 (the AVX512BW subset) supports unsigned byte comparison, whose result is returned as a mask.
Step 3: Conditional operations
This is a straightforward use of masked operations.
Implementation
The actual AVX512 implementation is shown below. Unlike the SSE code, the inner
loop is manually unrolled. Also, there’s no explicit use of the ternary logic
intrinsic function — but examining the assembly code reveals that the compiler
nicely fuses the binary operations.
void avx512_long_div_u8(const uint8_t* A, const uint8_t* B, uint8_t* C, size_t n) { const __m512i one = _mm512_set1_epi8(1); for (size_t i=0; i < n; i += 64) { const __m512i dividend = _mm512_loadu_si512((const __m512*)(&A[i])); const __m512i divisor = _mm512_loadu_si512((const __m512*)(&B[i])); const __m512i dividend_bit7 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 7), one); const __m512i dividend_bit6 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 6), one); const __m512i dividend_bit5 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 5), one); const __m512i dividend_bit4 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 4), one); const __m512i dividend_bit3 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 3), one); const __m512i dividend_bit2 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 2), one); const __m512i dividend_bit1 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 1), one); const __m512i dividend_bit0 = _mm512_and_epi32(_mm512_srli_epi32(dividend, 0), one); __m512i quotient = _mm512_set1_epi32(0); __m512i reminder = dividend_bit7; { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder = _mm512_or_epi32(reminder, dividend_bit6); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi32(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder = _mm512_or_epi32(reminder, dividend_bit5); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi32(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder 
= _mm512_or_epi32(reminder, dividend_bit4); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi32(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder = _mm512_or_epi32(reminder, dividend_bit3); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi32(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder = _mm512_or_epi32(reminder, dividend_bit2); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi32(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder = _mm512_or_epi32(reminder, dividend_bit1); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi32(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } reminder = _mm512_add_epi32(reminder, reminder); reminder = _mm512_or_epi32(reminder, dividend_bit0); { const __mmask64 ge = _mm512_cmpge_epu8_mask(reminder, divisor); reminder = _mm512_mask_sub_epi8(reminder, ge, reminder, divisor); quotient = _mm512_add_epi8(quotient, quotient); quotient = _mm512_mask_add_epi8(quotient, ge, quotient, one); } _mm512_storeu_si512((__m512*)&C[i], quotient); } }