大整数的小查找表答案

【问题标题】：Small Lookup Table for Large Integers（大整数的小查找表）
【发布时间】：2020-09-24 00:24:52
【问题描述】：

我想制作一个快速、相对较小但输入范围较大的查找表：输入是最大 32 位的值（一个 32 位颜色值）；输出是最大 8 位的索引（表的索引）。

类似于下面的代码。（如果需要分配索引的不同值超过 256 个，则返回的索引为 0）

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Forward declaration: maps a 32-bit value to an 8-bit index (defined below). */
static uint8_t getIndx(uint32_t value);
/*
 * Direct-mapped table of pointers, subscripted by the input value itself.
 * An entry is NULL until its value has been assigned an index; afterwards it
 * points to a heap-allocated byte holding that index.
 */
static uint8_t **indx;
/*
 * Next index to hand out.
 * NOTE(review): as a uint8_t this can never hold a value above 255, so it
 * wraps to 0 after 256 increments instead of signalling "table full".
 */
static uint8_t count = 0;

/*
 * Demo driver: allocates the pointer table, clears it, then looks up the
 * values 111, 222 and 333 twice each and prints the index returned.
 *
 * NOTE(review): neither the table nor the per-value bytes allocated by
 * getIndx() are freed before the program exits.
 */
int main(void) {
  // set up the indx
  const uint32_t size = 0xFFFF;  // for demonstrative purposes not even nearly as large as wished to be (0xFFFFFFFF) plus my for loop below would get in trouble, I think
  /* Room for exactly `size` pointers, i.e. valid subscripts 0 .. size-1. */
  indx = (uint8_t**)malloc(sizeof(uint8_t*) * size);
  if(indx == NULL) {
    printf("could not allocate memory\n");
    /* NOTE(review): exit status 0 is returned even though allocation failed. */
    return 0;
  }
  /*
   * Mark every entry as "no index assigned yet".
   * NOTE(review): the bound `size + 1` makes the last iteration write
   * indx[size], one element past the `size` elements allocated above.
   * The loop also compares a signed int against an unsigned 32-bit bound.
   */
  for(int i = 0; i < size + 1; i++) {
    indx[i] = NULL;
  }

  /* First round assigns indices; second round should return the same ones. */
  printf("%d\n", getIndx(111));
  printf("%d\n", getIndx(222));
  printf("%d\n", getIndx(333));
  printf("%d\n", getIndx(111));
  printf("%d\n", getIndx(222));
  printf("%d\n", getIndx(333));

  return 0;
}

/*
 * Returns the index associated with `value`, assigning the next free index
 * (the current `count`) the first time a value is seen.
 *
 * NOTE(review): `value` is used directly as a subscript into indx[] with no
 * range check, while main() allocates only 0xFFFF entries.
 */
static uint8_t getIndx(uint32_t value) {
  if(indx[value] == NULL) {
    /*
     * Intended "table full" guard.
     * NOTE(review): count is declared uint8_t, so this comparison can never
     * be true.
     */
    if(count > 255) return 0;
    /* One heap byte per distinct value stores the assigned index. */
    /* NOTE(review): the malloc result is dereferenced without a NULL check. */
    indx[value] = (uint8_t*)malloc(sizeof(uint8_t));
    *indx[value] = count;
    count++;
  }
  return *(indx[value]);
}

输出是（每行一个值）：0、1、2、0、1、2

无论我怎么想，最后总是得到类似的方案。输入范围为 32 位（4294967296 种状态），为了得到仅仅 256 个可能的输出，我需要分配太多内存。在 for 循环里写 256 个 if-else 也不是我想要的。

有没有什么我还没听说过的快速方法——不管是否使用查找表——能够实现相同的功能？

提前非常感谢！

【问题讨论】：

听起来您可能正在寻找“哈希表”。你有没有研究过这是否符合你的需要？哈希表有多种变体，具体取决于哪种内存与性能权衡点适合您的需求。
用谷歌搜索“c 字典”可以找到相关内容（原文此处为链接）。
请注意，if(count > 255) 永远不会为真，因为 count 的类型为 uint8_t。说表中元素的最大数量为 256 是否正确？您的目标是哪种硬件？
你要什么？不使用太多内存的解决方案？你想要一个函数来记住它的前 256 个输入，并在它再次看到这些输入时返回它们分配的数字，而对于其他输入则返回零？
仅供参考，您为 size 元素分配空间，但初始化 size+1 元素。

标签： c performance lookup-tables

【解决方案1】：

您可以使用哈希表将 32 位值映射到查找表索引。哈希表的长度应明显长于查找表，以减少哈希冲突的机会。

以下示例对哈希表进行线性搜索：从由输入值计算出的哈希位置开始，直到找到匹配的条目或遇到一个空的哈希表项为止。如果没有找到输入值，并且查找表中还有空间、哈希表中也还有空间（应该有，因为哈希表比查找表长），则将输入值添加到查找表（look-up table）中，并相应更新哈希表。示例中哈希表的大小是查找表的四倍。

该示例使用相同的伪随机序列进行两次遍历。第一遍应该主要填满查找表并更新哈希表。第二遍不应再对查找表或哈希表进行任何更改，因为它只是重复第一个序列。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>

/* One entry of the hash table used to find a value's slot in lookup[]. */
struct hash_index {
  /* Position in the lookup[] array of the value stored via this entry. */
  uint8_t index;
  /* Non-zero once this entry has been filled in; zero while it is empty. */
  char used;
};

/* Number of bits in a hash table position; the table has 2^HASHBITS entries. */
#define HASHBITS  10
/* Number of entries in the hash table (1024 with HASHBITS == 10). */
#define HASHSIZE  (1U << HASHBITS)
/* Mask that wraps a position into the range 0 .. HASHSIZE-1. */
#define HASHMASK  (HASHSIZE - 1)
/* Maximum number of distinct values that can be assigned an index. */
#define LOOKUPSIZE  256U

/* Forward declaration: maps a 32-bit value to its 8-bit index (defined below). */
static uint8_t getIndx(uint32_t value);
/* lookup[i] holds the 32-bit value that was assigned index i. */
static uint32_t lookup[LOOKUPSIZE];
/* Hash table mapping a hashed value to its index in lookup[]. */
static struct hash_index hashtab[HASHSIZE];
/* Number of lookup[] entries in use; also the next index to hand out. */
static unsigned int count = 0;
/* Diagnostics: collisions accumulated since main() last reset the counter. */
static unsigned int tot_collisions = 0;
/* Diagnostics: longest collision run seen since main() last reset it. */
static unsigned int max_collisions = 0;
/* Diagnostics: number of hash table entries that have been filled in. */
static unsigned int hash_used = 0;

/*
 * Demo driver: feeds the same pseudo-random sequence of 260 values through
 * getIndx() in two passes and reports, for every value, whether the returned
 * index really refers to that value in lookup[].
 */
int main(void) {
  static const char * const outcomes[2] = {"FAIL", "OK"};

  printf("Lookup size: %u, Hash size: %u\n", LOOKUPSIZE, HASHSIZE);

  for (int pass = 1; pass <= 2; pass++) {
    unsigned int successes = 0;
    unsigned int failures = 0;

    /* Collision statistics are per pass; hash_used is deliberately kept,
     * since it is not expected to change after the first pass. */
    tot_collisions = 0;
    max_collisions = 0;
    printf("\nPass %d, currently used hashes: %u\n\n", pass, hash_used);

    /* Re-seeding makes both passes see the identical sequence. */
    srand(1);
    for (unsigned int n = 0; n < 260; n++) {
      const uint32_t value = (uint32_t)rand();
      const uint8_t index = getIndx(value);
      /* The lookup succeeded only if the slot actually holds this value. */
      const int ok = (lookup[index] == value);

      printf("%" PRIu32 " -> %" PRIu8 " (%s)\n", value, index, outcomes[ok]);
      if (ok) {
        successes++;
      } else {
        failures++;
      }
    }

    printf("\nSuccesses: %u, Failures: %u\n", successes, failures);
    printf("Used hashes: %u, Total collisions: %u, Max collisions: %u\n\n",
           hash_used, tot_collisions, max_collisions);
  }

  return 0;
}

static uint8_t getIndx(uint32_t value) {
  unsigned int initial_hash;
  unsigned int hash;
  unsigned int collisions = 0;
  uint8_t index;

  /*
   * Search for value using hash table, starting at position hashed from value.
   *
   * The hash table is longer than the maximum number of used entries,
   * so we should always be able to find an unused entry in the hash table.
   */
  initial_hash = ((value * UINT32_C(0x61c88647)) >> (32 - HASHBITS)) & HASHMASK;
  for (hash = initial_hash;
       collisions < HASHSIZE && hashtab[hash].used;
       hash = (hash + 1) & HASHMASK) {
    /*
     * This hash table entry is used.  Get the corresponding index in the
     * main lookup table to check if the value matches.
     */
    index = hashtab[hash].index;
    if (lookup[index] == value) {
      /* Matching value found.  Return its index in the main table. */
      return index;
    }
    /* Count hash collisions and total hash collisions. */
    collisions++;
    tot_collisions++;
    if (max_collisions < collisions) {
      /* Update max hash collisions for diagnostics. */
      max_collisions = collisions;
    }
  }
  /* Value not found. */
  if (count < LOOKUPSIZE && collisions < HASHSIZE) {
    /*
     * There is room in the main lookup table for the new value
     * and room in the hash table.  The index of the new value in the
     * main lookup table will be the current count, which will be incremented.
     */
    index = count++;
    /*
     * Add value to main lookup table,
     * add index in main lookup table to hash table,
     * and return the index in the main lookup table.
     */
    lookup[index] = value;
    hashtab[hash].index = index;
    hashtab[hash].used = 1;
    hash_used++;  /* Count of used hash table entries for diagnostics. */
    return index;
  }
  /*
   * Value not found and main lookup table is full or hash table is full.
   * Give up.
   */
  return 0;
}

另一种处理冲突的方法是让每个哈希表项指向一个链表，链表中存放所有哈希到该表项的值，但这更复杂。

【讨论】：

这看起来很有希望。你在哪里学习散列？你能推荐一本书、一个网站或其他任何东西吗？因为我对此一无所知...