Re: Cost of handling misaligned access

Liste des GroupesRevenir à c arch 
Sujet : Re: Cost of handling misaligned access
De : mitchalsup (at) *nospam* aol.com (MitchAlsup1)
Groupes : comp.arch
Date : 07. Feb 2025, 23:27:03
Autres entêtes
Organisation : Rocksolid Light
Message-ID : <50c7a978b92f0af32ffe1ec06bbfa68f@www.novabbs.org>
References : 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
User-Agent : Rocksolid Light
On Fri, 7 Feb 2025 15:04:23 +0000, Michael S wrote:

Here is my manual code. Handling of the tail is too clever. I did not
have time to simplify. Otherwise, for 250x250 it should perform about
the same as simpler code.
>
#include <stdint.h>
#include <immintrin.h>
>
#if defined(__AVX2__)
/*
 * Returns acc with 1 added to every 32-bit lane in which (key & lock) == 0.
 * _mm256_cmpeq_epi32 yields all-ones (-1) in matching lanes, so subtracting
 * the comparison result increments exactly those lanes.
 */
static inline __m256i foo_tst_count_disjoint(__m256i acc, __m256i key, __m256i lock)
{
  return _mm256_sub_epi32(
      acc,
      _mm256_cmpeq_epi32(_mm256_and_si256(key, lock), _mm256_setzero_si256()));
}
#endif

/*
 * Counts the pairs (y, x) with 0 <= y < li and li <= x < len for which
 * (keylocks[y] & keylocks[x]) == 0.
 *
 * keylocks  array of at least len 32-bit words
 * len       total number of words in keylocks
 * li        split point: words [0, li) are paired against words [li, len)
 *
 * Returns 0 when li <= 0 or li >= len (one of the two groups is empty).
 * The count is accumulated in 32-bit arithmetic and returned as int, so
 * inputs producing more than INT_MAX matching pairs are not supported.
 *
 * When the translation unit is built with AVX2 enabled (__AVX2__ defined) the
 * vectorized path below is used; otherwise a plain scalar double loop computes
 * the same count, so the function also builds for targets without AVX2.
 */
int foo_tst(const uint32_t* keylocks, int len, int li)
{
  if (li >= len || li <= 0)
    return 0;

#if defined(__AVX2__)
  const uint32_t* keyx = &keylocks[li];
  unsigned ni = (unsigned)(len - li);
  __m256i res0 = _mm256_setzero_si256();
  __m256i res1 = _mm256_setzero_si256();
  __m256i res2 = _mm256_setzero_si256();
  __m256i res3 = _mm256_setzero_si256();

  /*
   * Main part: take the upper group 32 words at a time (four vectors held in
   * registers) and test each block against every lower word, broadcast to all
   * eight lanes.
   */
  const uint32_t* keyx_blocks_end = &keyx[ni & ~31u];
  for (; keyx != keyx_blocks_end; keyx += 32) {
    __m256i lock0 = _mm256_loadu_si256((const __m256i*)&keyx[0*8]);
    __m256i lock1 = _mm256_loadu_si256((const __m256i*)&keyx[1*8]);
    __m256i lock2 = _mm256_loadu_si256((const __m256i*)&keyx[2*8]);
    __m256i lock3 = _mm256_loadu_si256((const __m256i*)&keyx[3*8]);
    for (const uint32_t* keyy = keylocks; keyy != &keylocks[li]; ++keyy) {
      /* set1 broadcasts the word without punning the pointer to float*. */
      __m256i lockk = _mm256_set1_epi32((int)*keyy);
      res0 = foo_tst_count_disjoint(res0, lockk, lock0);
      res1 = foo_tst_count_disjoint(res1, lockk, lock1);
      res2 = foo_tst_count_disjoint(res2, lockk, lock2);
      res3 = foo_tst_count_disjoint(res3, lockk, lock3);
    }
  }

  /*
   * Tail: the remaining (ni % 32) upper words. The roles are swapped here:
   * each leftover upper word is broadcast and the lower group is streamed in
   * blocks of 32.
   */
  int res = 0;
  if (ni % 32) {
    uint32_t tmp[32];
    const uint32_t* keyy_last = &keylocks[li & -32];
    if (li % 32) {
      /*
       * Copy the partial last block of the lower group and pad it with
       * all-ones. A padded lane ANDed with a non-zero word is never zero, so
       * padding never contributes to the count.
       */
      for (int k = 0; k < li % 32; ++k)
        tmp[k] = keyy_last[k];
      for (int k = li % 32; k < 32; ++k)
        tmp[k] = (uint32_t)-1;
    }

    const uint32_t* keyx_end = &keyx[ni % 32];
    int nz = 0;
    for (; keyx != keyx_end; ++keyx) {
      if (*keyx == 0) {
        /*
         * A zero word matches every lower word; it must bypass the vector
         * path because the all-ones padding trick only works for non-zero
         * words. Its li matches are added in bulk below.
         */
        nz += 1;
        continue;
      }

      __m256i lockk = _mm256_set1_epi32((int)*keyx);
      for (const uint32_t* keyy = keylocks; keyy != keyy_last; keyy += 32) {
        __m256i lock0 = _mm256_loadu_si256((const __m256i*)&keyy[0*8]);
        __m256i lock1 = _mm256_loadu_si256((const __m256i*)&keyy[1*8]);
        __m256i lock2 = _mm256_loadu_si256((const __m256i*)&keyy[2*8]);
        __m256i lock3 = _mm256_loadu_si256((const __m256i*)&keyy[3*8]);
        res0 = foo_tst_count_disjoint(res0, lockk, lock0);
        res1 = foo_tst_count_disjoint(res1, lockk, lock1);
        res2 = foo_tst_count_disjoint(res2, lockk, lock2);
        res3 = foo_tst_count_disjoint(res3, lockk, lock3);
      }
      if (li % 32) {
        __m256i lock0 = _mm256_loadu_si256((const __m256i*)&tmp[0*8]);
        __m256i lock1 = _mm256_loadu_si256((const __m256i*)&tmp[1*8]);
        __m256i lock2 = _mm256_loadu_si256((const __m256i*)&tmp[2*8]);
        __m256i lock3 = _mm256_loadu_si256((const __m256i*)&tmp[3*8]);
        res0 = foo_tst_count_disjoint(res0, lockk, lock0);
        res1 = foo_tst_count_disjoint(res1, lockk, lock1);
        res2 = foo_tst_count_disjoint(res2, lockk, lock2);
        res3 = foo_tst_count_disjoint(res3, lockk, lock3);
      }
    }
    res = nz * li;
  }

  // fold accumulators
  res0 = _mm256_add_epi32(res0, res2);
  res1 = _mm256_add_epi32(res1, res3);
  res0 = _mm256_add_epi32(res0, res1);
  /* hadd works within each 128-bit half; two rounds leave each half's sum
     in every lane of that half, so lanes 0 and 4 hold the two partial sums. */
  res0 = _mm256_hadd_epi32(res0, res0);
  res0 = _mm256_hadd_epi32(res0, res0);

  res += _mm256_extract_epi32(res0, 0);
  res += _mm256_extract_epi32(res0, 4);
  return res;
#else
  /* Portable fallback: same count, one pair at a time. */
  uint32_t count = 0;
  for (int x = li; x < len; ++x) {
    const uint32_t key = keylocks[x];
    for (int y = 0; y < li; ++y)
      count += (uint32_t)((key & keylocks[y]) == 0);
  }
  return (int)count;
#endif
}
Simple question: how would you port this code to a machine
with a different SIMD instruction set?

Date Sujet#  Auteur
2 Feb 25 * Re: Cost of handling misaligned access112BGB
3 Feb 25 +* Re: Cost of handling misaligned access2MitchAlsup1
3 Feb 25 i`- Re: Cost of handling misaligned access1BGB
3 Feb 25 `* Re: Cost of handling misaligned access109Anton Ertl
3 Feb 25  +* Re: Cost of handling misaligned access11BGB
3 Feb 25  i`* Re: Cost of handling misaligned access10Anton Ertl
3 Feb 25  i +- Re: Cost of handling misaligned access1BGB
3 Feb 25  i `* Re: Cost of handling misaligned access8Thomas Koenig
4 Feb 25  i  `* Re: Cost of handling misaligned access7Anton Ertl
4 Feb 25  i   +* Re: Cost of handling misaligned access5Thomas Koenig
4 Feb 25  i   i`* Re: Cost of handling misaligned access4Anton Ertl
4 Feb 25  i   i +* Re: Cost of handling misaligned access2Thomas Koenig
10 Feb 25  i   i i`- Re: Cost of handling misaligned access1Mike Stump
10 Feb 25  i   i `- Re: Cost of handling misaligned access1Mike Stump
4 Feb 25  i   `- Re: Cost of handling misaligned access1MitchAlsup1
3 Feb 25  +* Re: Cost of handling misaligned access3Thomas Koenig
3 Feb 25  i`* Re: Cost of handling misaligned access2BGB
3 Feb 25  i `- Re: Cost of handling misaligned access1MitchAlsup1
4 Feb 25  +* Re: Cost of handling misaligned access41Anton Ertl
5 Feb 25  i`* Re: Cost of handling misaligned access40Terje Mathisen
5 Feb 25  i +* Re: Cost of handling misaligned access4Anton Ertl
5 Feb 25  i i+* Re: Cost of handling misaligned access2Terje Mathisen
6 Feb 25  i ii`- Re: Cost of handling misaligned access1Anton Ertl
6 Feb 25  i i`- Re: Cost of handling misaligned access1Anton Ertl
5 Feb 25  i `* Re: Cost of handling misaligned access35Michael S
6 Feb 25  i  +* Re: Cost of handling misaligned access32Anton Ertl
6 Feb 25  i  i`* Re: Cost of handling misaligned access31Michael S
6 Feb 25  i  i +* Re: Cost of handling misaligned access2Anton Ertl
6 Feb 25  i  i i`- Re: Cost of handling misaligned access1Michael S
6 Feb 25  i  i `* Re: Cost of handling misaligned access28Terje Mathisen
6 Feb 25  i  i  `* Re: Cost of handling misaligned access27Terje Mathisen
6 Feb 25  i  i   `* Re: Cost of handling misaligned access26Michael S
6 Feb 25  i  i    `* Re: Cost of handling misaligned access25Terje Mathisen
6 Feb 25  i  i     +* Re: Cost of handling misaligned access19Michael S
7 Feb 25  i  i     i`* Re: Cost of handling misaligned access18Terje Mathisen
7 Feb 25  i  i     i `* Re: Cost of handling misaligned access17Michael S
7 Feb 25  i  i     i  `* Re: Cost of handling misaligned access16Terje Mathisen
7 Feb 25  i  i     i   `* Re: Cost of handling misaligned access15Michael S
7 Feb 25  i  i     i    +- Re: Cost of handling misaligned access1Terje Mathisen
7 Feb 25  i  i     i    +* Re: Cost of handling misaligned access3MitchAlsup1
8 Feb 25  i  i     i    i+- Re: Cost of handling misaligned access1Terje Mathisen
8 Feb 25  i  i     i    i`- Re: Cost of handling misaligned access1Michael S
8 Feb 25  i  i     i    `* Re: Cost of handling misaligned access10Anton Ertl
8 Feb 25  i  i     i     +- Re: Cost of handling misaligned access1Terje Mathisen
8 Feb 25  i  i     i     +* Re: Cost of handling misaligned access6Michael S
8 Feb 25  i  i     i     i`* Re: Cost of handling misaligned access5Anton Ertl
8 Feb 25  i  i     i     i +- Re: Cost of handling misaligned access1Michael S
9 Feb 25  i  i     i     i +* Re: Cost of handling misaligned access2Michael S
11 Feb 25  i  i     i     i i`- Re: Cost of handling misaligned access1Michael S
9 Feb 25  i  i     i     i `- Re: Cost of handling misaligned access1Michael S
9 Feb 25  i  i     i     +- Re: Cost of handling misaligned access1Michael S
10 Feb 25  i  i     i     `- Re: Cost of handling misaligned access1Michael S
7 Feb 25  i  i     `* Re: Cost of handling misaligned access5BGB
7 Feb 25  i  i      `* Re: Cost of handling misaligned access4MitchAlsup1
7 Feb 25  i  i       `* Re: Cost of handling misaligned access3BGB
8 Feb 25  i  i        `* Re: Cost of handling misaligned access2Anssi Saari
8 Feb 25  i  i         `- Re: Cost of handling misaligned access1BGB
6 Feb 25  i  `* Re: Cost of handling misaligned access2Terje Mathisen
6 Feb 25  i   `- Re: Cost of handling misaligned access1Michael S
6 Feb 25  +* Re: Cost of handling misaligned access5Waldek Hebisch
6 Feb 25  i+* Re: Cost of handling misaligned access3Anton Ertl
6 Feb 25  ii`* Re: Cost of handling misaligned access2Waldek Hebisch
6 Feb 25  ii `- Re: Cost of handling misaligned access1Anton Ertl
6 Feb 25  i`- Re: Cost of handling misaligned access1Terje Mathisen
13 Feb 25  `* Re: Cost of handling misaligned access48Marcus
13 Feb 25   +- Re: Cost of handling misaligned access1Thomas Koenig
14 Feb 25   +* Re: Cost of handling misaligned access41BGB
14 Feb 25   i`* Re: Cost of handling misaligned access40MitchAlsup1
18 Feb 25   i `* Re: Cost of handling misaligned access39BGB
18 Feb 25   i  +* Re: Cost of handling misaligned access33MitchAlsup1
18 Feb 25   i  i+- Re: Cost of handling misaligned access1BGB
18 Feb 25   i  i`* Re: Cost of handling misaligned access31Michael S
18 Feb 25   i  i +- Re: Cost of handling misaligned access1Thomas Koenig
18 Feb 25   i  i +* Re: Cost of handling misaligned access26MitchAlsup1
18 Feb 25   i  i i`* Re: Cost of handling misaligned access25Terje Mathisen
18 Feb 25   i  i i `* Re: Cost of handling misaligned access24MitchAlsup1
19 Feb 25   i  i i  `* Re: Cost of handling misaligned access23Terje Mathisen
19 Feb 25   i  i i   `* Re: Cost of handling misaligned access22MitchAlsup1
19 Feb 25   i  i i    `* Re: Cost of handling misaligned access21BGB
20 Feb 25   i  i i     +- Re: Cost of handling misaligned access1Robert Finch
20 Feb 25   i  i i     +* Re: Cost of handling misaligned access5MitchAlsup1
20 Feb 25   i  i i     i+* Re: Cost of handling misaligned access2BGB
20 Feb 25   i  i i     ii`- Re: Cost of handling misaligned access1BGB
21 Feb 25   i  i i     i`* Re: Cost of handling misaligned access2Robert Finch
21 Feb 25   i  i i     i `- Re: Cost of handling misaligned access1BGB
21 Feb 25   i  i i     `* Re: Cost of handling misaligned access14BGB
22 Feb 25   i  i i      +- Re: Cost of handling misaligned access1Robert Finch
22 Feb 25   i  i i      `* Re: Cost of handling misaligned access12Robert Finch
23 Feb 25   i  i i       +* Re: Cost of handling misaligned access10BGB
23 Feb 25   i  i i       i`* Re: Cost of handling misaligned access9Michael S
24 Feb 25   i  i i       i +- Re: Cost of handling misaligned access1BGB
24 Feb 25   i  i i       i `* Re: Cost of handling misaligned access7Michael S
24 Feb 25   i  i i       i  +* Re: Cost of handling misaligned access4Robert Finch
24 Feb 25   i  i i       i  i+- Re: Cost of handling misaligned access1BGB
24 Feb 25   i  i i       i  i`* Re: Cost of handling misaligned access2MitchAlsup1
25 Feb 25   i  i i       i  i `- Re: Cost of handling misaligned access1BGB
25 Feb 25   i  i i       i  `* Re: Cost of handling misaligned access2MitchAlsup1
25 Feb 25   i  i i       i   `- Re: Cost of handling misaligned access1BGB
23 Feb 25   i  i i       `- Re: Cost of handling misaligned access1Robert Finch
18 Feb 25   i  i `* Re: Cost of handling misaligned access3BGB
19 Feb 25   i  i  `* Re: Cost of handling misaligned access2MitchAlsup1
18 Feb 25   i  `* Re: Cost of handling misaligned access5Robert Finch
17 Feb 25   `* Re: Cost of handling misaligned access5Terje Mathisen

Haut de la page

Les messages affichés proviennent d'usenet.

NewsPortal