使用位交错的 3D Morton 编码，传统与 BMI2 指令集答案

【问题标题】：3D Morton Encoding using bit interleaving, Conventional vs BMI2 Instruction Set使用位交错的 3D Morton 编码，传统与 BMI2 指令集
【发布时间】：2018-10-17 00:49:01
【问题描述】：

我希望用 C 语言编写两个快速高效的函数，分别用于 Morton Z-Order 的编码和解码，即：

uint64_t morton_encode(uint32_t xindex, uint32_t yindex, uint32_t zindex);
void morton_decode(uint64_t morton_number, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex);

我之前已经关注了问题

how to compute a 3d morton number interleave the bits of 3 ints

我目前基于 SO 和开源代码的解决方案是

/*
 * Spread the low 21 bits of w so that input bit i lands on output bit 3*i,
 * leaving two zero bits between consecutive input bits. Bits 21..63 of w
 * are ignored.
 *
 * Returns a value whose set bits all lie within 0x1249249249249249.
 */
uint64_t spread(uint64_t w)  {
    w &=                0x00000000001fffff;  /* keep input bits 0..20 only   */
    w = (w | w << 32) & 0x001f00000000ffff;  /* input bits 16..20 -> 48..52  */
    w = (w | w << 16) & 0x001f0000ff0000ff;  /* input bits  8..15 -> 24..31  */
    /*
     * After the shift by 8, input bit 20 sits at bit 60, so the top byte of
     * this mask has to be 0x10. A top byte of 0x01 would drop bit 20 and keep
     * a stray copy of input bit 16 at bit 56, which the next step moves to
     * bit 60.
     */
    w = (w | w <<  8) & 0x100f00f00f00f00f;  /* groups of 4 bits, 12 apart   */
    w = (w | w <<  4) & 0x10c30c30c30c30c3;  /* groups of 2 bits, 6 apart    */
    w = (w | w <<  2) & 0x1249249249249249;  /* single bits, 3 apart         */
    return w;
}

/*
 * Build a 3D Morton code from three coordinates by OR-ing together the
 * spread() of each one: x unshifted, y shifted left by 1 and z shifted
 * left by 2.
 */
uint64_t morton_encode(uint32_t x, uint32_t y, uint32_t z)  {
    const uint64_t x_lane = spread((uint64_t)x);
    const uint64_t y_lane = spread((uint64_t)y) << 1;
    const uint64_t z_lane = spread((uint64_t)z) << 2;

    return x_lane | y_lane | z_lane;
}

///////////////// For Decoding //////////////////////

/*
 * Gather every third bit of w (bits 0, 3, 6, ..., 60) into the low 21 bits
 * of the result; all other bits of w are ignored.
 */
uint32_t compact(uint64_t w) {
    /* Each pass folds the surviving bit groups together by `shift` and then
     * keeps only the positions selected by `keep`. */
    static const struct {
        unsigned shift;
        uint64_t keep;
    } passes[] = {
        {  2, 0x30c30c30c30c30c3 },
        {  4, 0xf00f00f00f00f00f },
        {  8, 0x00ff0000ff0000ff },
        { 16, 0x00ff00000000ffff },
        { 32, 0x00000000001fffff },
    };
    uint64_t bits = w & 0x1249249249249249;

    for (unsigned p = 0; p < sizeof passes / sizeof passes[0]; p++) {
        bits = (bits ^ (bits >> passes[p].shift)) & passes[p].keep;
    }
    return (uint32_t)bits;
}

/*
 * Split a 3D Morton code into its three coordinates.
 *
 * morton_number: the interleaved code; x is taken from it unshifted, y after
 *                a right shift by 1 and z after a right shift by 2.
 * xindex, yindex, zindex: output pointers that receive the compact() result
 *                for each axis; all three are written unconditionally.
 */
void morton_decode(uint64_t morton_number, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex){
    *xindex = compact(morton_number);
    *yindex = compact(morton_number >> 1);
    *zindex = compact(morton_number >> 2);
}

我最近遇到了这个 SO 问题（在尝试使用 2D morton 代码时）：2d morton code encode decode 64bits

#include <immintrin.h>
#include <stdint.h>

// on GCC, compile with option -mbmi2, requires Haswell or better.

/*
 * Interleave x and y into a 64-bit 2D Morton code: bit i of x goes to bit
 * 2*i of the result and bit i of y goes to bit 2*i + 1.
 *
 * The 64-bit PDEP form with 64-bit masks is required here: a 32-bit deposit
 * can only fill the low 32 bits of the result, i.e. it would encode just the
 * low 16 bits of each coordinate.
 */
uint64_t xy_to_morton (uint32_t x, uint32_t y)
{
  return _pdep_u64(x, 0x5555555555555555) | _pdep_u64(y, 0xaaaaaaaaaaaaaaaa);
}

/*
 * Split a 64-bit 2D Morton code into its coordinates.
 *
 * m: the interleaved code.
 * x: receives the even-numbered bits of m (mask 0x5555555555555555).
 * y: receives the odd-numbered bits of m (mask 0xaaaaaaaaaaaaaaaa).
 *
 * Returns m unchanged, so that the declared uint64_t return type always
 * yields a defined value.
 */
uint64_t morton_to_xy (uint64_t m, uint32_t *x, uint32_t *y)
{
  /* Each mask selects exactly 32 bits, so the extracted value fits uint32_t. */
  *x = (uint32_t)_pext_u64(m, 0x5555555555555555);
  *y = (uint32_t)_pext_u64(m, 0xaaaaaaaaaaaaaaaa);
  return m;
}

据我了解，这不是一个可移植的解决方案，不过我（将要）运行代码的每个系统都配有 Haswell CPU（HPC 集群上也是如此）。我的问题：

如何为 3D 系统修改此代码或这些 BMI 指令集是否可用于编码解码 3D 莫顿数？
如果我需要在每个时间步解码几百万个莫顿数字，并且有数百万个这样的时间步，那么使用这些指令是否比我现在使用的标准解决方案更高效？

编辑：对于第一个问题（Q1），我已经非常接近解决方案，但仍然无法完全弄清楚

0x55555555 -> 0000 0000 0101 0101 0101 0101 0101 0101 0101 0101 
0xaaaaaaaa -> 0000 0000 1010 1010 1010 1010 1010 1010 1010 1010

很明显，掩码是交替的 x 和 y 位。所以对于 3D，我需要类似的掩码

0000 0000 01 001 001 001 001 001 001 001 001 001 001 (for x)
0000 0000 01 010 010 010 010 010 010 010 010 010 010 (for y)
0000 0000 01 100 100 100 100 100 100 100 100 100 100 (for z)
           ^

对于 64 位 Morton 码，我对 ^ 标记之前的那些位有点困惑：x、y 和 z 虽然是 32 位整数，但应该只有它们各自的低 21 位才有意义。

【问题讨论】：

还有问题吗？看起来你已经解决了。顺便说一句，您可以直接在 Morton 编码的坐标上进行一些算术运算，例如沿轴递增。
@harold 无法找出完整的掩码值。位置 21 之外的位真的有任何考虑吗？关于算术部分，我的应用程序要求是从四叉树单元中解码 morton 代码并从另一个数组中读取一些值。我想知道你的意思是什么！
@harold 有趣的博客！看起来你也来自荷兰:)
只是这个掩码，你已经拥有了：0x1249249249249249（为 y/z 掩码向左移动 1 或 2）
@harold BMI 成功了！更新了答案。

标签： c gcc bit-manipulation z-order-curve

【解决方案1】：

所以在摆弄了一下之后，我得出了一个我认为应该在这里分享的解决方案。

// on GCC, compile with option -mbmi2, requires Haswell or better.
#include <stdio.h>
#include <limits.h>
#include <immintrin.h>
#include <inttypes.h>
#include <sys/time.h>

/* Bit mask with every third bit set (bits 0, 3, 6, ..., 60). Used as-is for
 * the x lane of a 3D Morton code, and shifted left by 1 or 2 for the y and z
 * lanes. */
#define maask 0x1249249249249249

/*
 * Morton Encoding Method 1 (BMI2 PDEP).
 *
 * Deposits the low bits of x, y and z into the bit positions selected by
 * maask, maask << 1 and maask << 2 respectively and ORs the three lanes
 * together.
 *
 * The 64-bit PDEP form is required: a 32-bit deposit would truncate both the
 * 64-bit mask and the result to 32 bits, silently losing the upper
 * coordinate bits.
 */
uint64_t Z_encode1 (uint32_t x, uint32_t y, uint32_t z)
{
  return _pdep_u64(x, maask)        |
         _pdep_u64(y, (maask << 1)) |
         _pdep_u64(z, (maask << 2));
}

/*
 * Morton Decoding Method 1 (BMI2 PEXT).
 *
 * m: the interleaved 3D Morton code.
 * x, y, z: receive the bits of m selected by maask, maask << 1 and
 *          maask << 2 respectively, packed into the low bits.
 *
 * Returns m unchanged, so that the declared uint64_t return type always
 * yields a defined value.
 */
uint64_t Z_decode1 (uint64_t m, uint32_t *x, uint32_t *y, uint32_t *z)
{
  /* Each mask selects 21 bits, so the extracted value fits uint32_t. */
  *x = (uint32_t)_pext_u64(m, maask);
  *y = (uint32_t)_pext_u64(m, (maask << 1));
  *z = (uint32_t)_pext_u64(m, (maask << 2));
  return m;
}

// method 2 functions 
/*
 * Spread the low 21 bits of w so that input bit i lands on output bit 3*i,
 * leaving two zero bits between consecutive input bits. Bits 21..63 of w
 * are ignored.
 *
 * Returns a value whose set bits all lie within 0x1249249249249249.
 */
uint64_t spread(uint64_t w)  {
    w &=                0x00000000001fffff;  /* keep input bits 0..20 only   */
    w = (w | w << 32) & 0x001f00000000ffff;  /* input bits 16..20 -> 48..52  */
    w = (w | w << 16) & 0x001f0000ff0000ff;  /* input bits  8..15 -> 24..31  */
    /*
     * After the shift by 8, input bit 20 sits at bit 60, so the top byte of
     * this mask has to be 0x10. A top byte of 0x01 would drop bit 20 and keep
     * a stray copy of input bit 16 at bit 56, which the next step moves to
     * bit 60.
     */
    w = (w | w <<  8) & 0x100f00f00f00f00f;  /* groups of 4 bits, 12 apart   */
    w = (w | w <<  4) & 0x10c30c30c30c30c3;  /* groups of 2 bits, 6 apart    */
    w = (w | w <<  2) & 0x1249249249249249;  /* single bits, 3 apart         */
    return w;
}

/*
 * Gather every third bit of w (bits 0, 3, 6, ..., 60) into the low 21 bits
 * of the result; all other bits of w are ignored.
 */
uint32_t compact(uint64_t w) {
    /* Shift distance and keep-mask for each folding pass, applied in order. */
    static const unsigned fold_shift[] = { 2, 4, 8, 16, 32 };
    static const uint64_t fold_keep[] = {
        0x30c30c30c30c30c3,
        0xf00f00f00f00f00f,
        0x00ff0000ff0000ff,
        0x00ff00000000ffff,
        0x00000000001fffff,
    };
    uint64_t acc = w & 0x1249249249249249;

    for (unsigned pass = 0; pass < sizeof fold_shift / sizeof fold_shift[0]; pass++) {
        acc ^= acc >> fold_shift[pass];
        acc &= fold_keep[pass];
    }
    return (uint32_t)acc;
}

/*
 * Morton Encoding Method 2 (shift-and-mask).
 * ORs together spread(x), spread(y) << 1 and spread(z) << 2.
 */
uint64_t Z_encode2(uint32_t x, uint32_t y, uint32_t z)  {
    const uint64_t x_lane = spread((uint64_t)x);
    const uint64_t y_lane = spread((uint64_t)y) << 1;
    const uint64_t z_lane = spread((uint64_t)z) << 2;

    return x_lane | y_lane | z_lane;
}



/*
 * Morton Decoding Method 2 (shift-and-mask).
 * Writes compact(Z_code >> axis) through the output pointer of each axis,
 * in the order x (axis 0), y (axis 1), z (axis 2).
 */
void Z_decode2(uint64_t Z_code, uint32_t *xindex, uint32_t *yindex, uint32_t *zindex){
    uint32_t *const out[3] = { xindex, yindex, zindex };

    for (unsigned axis = 0; axis < 3; axis++) {
        *out[axis] = compact(Z_code >> axis);
    }
}
/* Seconds elapsed between two gettimeofday() samples. */
static double elapsed_seconds(const struct timeval *start, const struct timeval *stop)
{
    return (double)(stop->tv_usec - start->tv_usec) / 1000000 +
           (double)(stop->tv_sec - start->tv_sec);
}

/*
 * Benchmark driver: times Z_encode1/Z_encode2 over a size^3 grid of
 * coordinates and Z_decode1/Z_decode2 over the same number of Morton codes,
 * then prints the four wall-clock timings.
 *
 * Every result is folded into a checksum that is printed at the end, so the
 * timed loops have an observable effect and cannot be discarded by the
 * optimizer.
 */
int main(void)
{
    const uint32_t size = 1024;
    /* Number of items processed by every benchmark, so the encode and decode
     * timings are directly comparable. */
    const uint64_t total = (uint64_t)size * size * size;
    struct timeval start, stop;
    double time_encode1 = 0.0, time_encode2 = 0.0;
    double time_decode1 = 0.0, time_decode2 = 0.0;

    uint64_t checksum = 0;
    uint32_t xindex = 0, yindex = 0, zindex = 0;

    /* method 1 ENCODING benchmark */
    gettimeofday(&start, NULL);
    for (uint32_t i = 0; i < size; i++) {
        for (uint32_t j = 0; j < size; j++) {
            for (uint32_t k = 0; k < size; k++) {
                checksum ^= Z_encode1(i, j, k);
            }
        }
    }
    gettimeofday(&stop, NULL);
    time_encode1 = elapsed_seconds(&start, &stop);

    /* method 2 ENCODING benchmark */
    gettimeofday(&start, NULL);
    for (uint32_t i = 0; i < size; i++) {
        for (uint32_t j = 0; j < size; j++) {
            for (uint32_t k = 0; k < size; k++) {
                checksum ^= Z_encode2(i, j, k);
            }
        }
    }
    gettimeofday(&stop, NULL);
    time_encode2 = elapsed_seconds(&start, &stop);

    //////////////////////// DECODING ////////////////
    /* method 1 DECODING benchmark */
    gettimeofday(&start, NULL);
    for (uint64_t code = 0; code < total; code++) {
        Z_decode1(code, &xindex, &yindex, &zindex);
        checksum += xindex ^ yindex ^ zindex;
    }
    gettimeofday(&stop, NULL);
    time_decode1 = elapsed_seconds(&start, &stop);

    /* method 2 DECODING benchmark */
    gettimeofday(&start, NULL);
    for (uint64_t code = 0; code < total; code++) {
        Z_decode2(code, &xindex, &yindex, &zindex);
        checksum += xindex ^ yindex ^ zindex;
    }
    gettimeofday(&stop, NULL);
    time_decode2 = elapsed_seconds(&start, &stop);

    printf("Method1 -> Encoding: %f Decoding: %f\n", time_encode1, time_decode1);
    printf("Method2 -> Encoding: %f Decoding: %f\n", time_encode2, time_decode2);
    printf("Checksum: %" PRIu64 "\n", checksum);
    return 0;
}

这是结果

size = 512 ( 512x512x512 = 134217728 numbers)
======================================================
Method 1 -> Encoding: 0.600302sec Decoding: 0.000003sec
Method 2 -> Encoding: 2.778170sec Decoding: 0.000011sec

size = 1024 ( 1024x1024x1024 = 1073741824 numbers)
======================================================
Method 1 -> Encoding:  4.623594sec Decoding: 0.000006sec
Method 2 -> Encoding: 22.339238sec Decoding: 0.000022sec

size = 2048 ( 2048*2048*2048 = 8589934592 numbers)
======================================================
Method 1 -> Encoding:  36.981743sec Decoding: 0.000011sec
Method 2 -> Encoding: 178.164773sec Decoding: 0.000045sec

结论：编码比解码成本高，使用 BMI 指令集可以优化性能。（注意：上面列出的解码耗时只对应 size 次解码，而编码耗时对应 size³ 次编码，因此两者的耗时不能直接比较。）

PS。 - 不可移植，因为需要 Haswell cpu 或更高。

【讨论】：