为什么我的 trie 会泄露数据？答案

【问题标题】：Why is my trie leaking data?为什么我的 trie 会泄露数据？
【发布时间】：2016-10-09 22:19:29
【问题描述】：

我正在尝试使用 trie 进行拼写检查，但我尝试添加到我的 trie 中的单词似乎没有被输入。泄漏在哪里？

我花了几个小时使用调试器并单步执行我的代码...

主要功能：

/**
 * Implements a spell-checker.
 */

#include <ctype.h>
#include <stdio.h>
#include <sys/resource.h>
#include <sys/time.h>

#include "dictionary.h"
#undef calculate
#undef getrusage

// default dictionary
#define DICTIONARY "dictionaries/large"

// prototype
double calculate(const struct rusage *b, const struct rusage *a);

int main(int argc, char *argv[])
{
    // check for correct number of args
    if (argc != 2 && argc != 3)
    {
        printf("Usage: speller [dictionary] text\n");
        return 1;
    }

    // structs for timing data
    struct rusage before, after;

    // benchmarks
    double time_load = 0.0, time_check = 0.0, time_size = 0.0, time_unload = 0.0;

    // determine dictionary to use
    char* dictionary = (argc == 3) ? argv[1] : DICTIONARY;

    // load dictionary
    getrusage(RUSAGE_SELF, &before);
    bool loaded = load(dictionary);
    getrusage(RUSAGE_SELF, &after);

    // abort if dictionary not loaded
    if (!loaded)
    {
        printf("Could not load %s.\n", dictionary);
        return 1;
    }

    // calculate time to load dictionary
    time_load = calculate(&before, &after);

    // try to open text
    char *text = (argc == 3) ? argv[2] : argv[1];
    FILE *fp = fopen(text, "r");
    if (fp == NULL)
    {
        printf("Could not open %s.\n", text);
        unload();
        return 1;
    }

    // prepare to report misspellings
    printf("\nMISSPELLED WORDS\n\n");

    // prepare to spell-check
    int index = 0, misspellings = 0, words = 0;
    char word[LENGTH+1];

    // spell-check each word in text
    for (int c = fgetc(fp); c != EOF; c = fgetc(fp))
    {
        // allow only alphabetical characters and apostrophes
        if (isalpha(c) || (c == '\'' && index > 0))
        {
            // append character to word
            word[index] = c;
            index++;

            // ignore alphabetical strings too long to be words
            if (index > LENGTH)
            {
                // consume remainder of alphabetical string
                while ((c = fgetc(fp)) != EOF && isalpha(c));

                // prepare for new word
                index = 0;
            }
        }

        // ignore words with numbers (like MS Word can)
        else if (isdigit(c))
        {
            // consume remainder of alphanumeric string
            while ((c = fgetc(fp)) != EOF && isalnum(c));

            // prepare for new word
            index = 0;
        }

        // we must have found a whole word
        else if (index > 0)
        {
            // terminate current word
            word[index] = '\0';

            // update counter
            words++;

            // check word's spelling
            getrusage(RUSAGE_SELF, &before);
            bool misspelled = !check(word);
            getrusage(RUSAGE_SELF, &after);

            // update benchmark
            time_check += calculate(&before, &after);

            // print word if misspelled
            if (misspelled)
            {
                printf("%s\n", word);
                misspellings++;
            }

            // prepare for next word
            index = 0;
        }
    }

    // check whether there was an error
    if (ferror(fp))
    {
        fclose(fp);
        printf("Error reading %s.\n", text);
        unload();
        return 1;
    }

    // close text
    fclose(fp);

    // determine dictionary'size
    getrusage(RUSAGE_SELF, &before);
    unsigned int n = size();
    getrusage(RUSAGE_SELF, &after);

    // calculate time to determine dictionary's size
    time_size = calculate(&before, &after);

    // unload dictionary
    getrusage(RUSAGE_SELF, &before);
    bool unloaded = unload();
    getrusage(RUSAGE_SELF, &after);

    // abort if dictionary not unloaded
    if (!unloaded)
    {
        printf("Could not unload %s.\n", dictionary);
        return 1;
    }

    // calculate time to unload dictionary
    time_unload = calculate(&before, &after);

    // report benchmarks
    printf("\nWORDS MISSPELLED:     %d\n", misspellings);
    printf("WORDS IN DICTIONARY:  %d\n", n);
    printf("WORDS IN TEXT:        %d\n", words);
    printf("TIME IN load:         %.2f\n", time_load);
    printf("TIME IN check:        %.2f\n", time_check);
    printf("TIME IN size:         %.2f\n", time_size);
    printf("TIME IN unload:       %.2f\n", time_unload);
    printf("TIME IN TOTAL:        %.2f\n\n", 
     time_load + time_check + time_size + time_unload);

    // that's all folks
    return 0;
}

/**
 * Returns number of seconds between b and a.
 */
double calculate(const struct rusage *b, const struct rusage *a)
{
    if (b == NULL || a == NULL)
    {
        return 0.0;
    }
    else
    {
        return ((((a->ru_utime.tv_sec * 1000000 + a->ru_utime.tv_usec) -
                 (b->ru_utime.tv_sec * 1000000 + b->ru_utime.tv_usec)) +
                ((a->ru_stime.tv_sec * 1000000 + a->ru_stime.tv_usec) -
                 (b->ru_stime.tv_sec * 1000000 + b->ru_stime.tv_usec)))
                / 1000000.0);
    }
}

检查单词是否在 trie 中的函数：

/**
 * Implements a dictionary's functionality.
 */

#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <stdio.h>

#include "dictionary.h"

int dictionary_size;

bool find(char *word, node *next)
{
    word++;
    if (next == NULL)
    {
        return false;
    }
    else if (word[0] == '\0')
    {
        if (next->is_word == true)
        {
            return true;
        }
        else
        {
            return false;
        }
    }
    else
    {
        //go to this nodes chold that is now eual to the first letter of the word
        if (word[0] == '\'')
        {
            return find(word, next->children[26]);
        }
        else
        {
            return find(word, next->children[word[0] - 'a']);
        }
    }
}

/**
 * Returns true if word is in dictionary else false.
 */
bool check(const char *word)
{
    //get word from text and make it malluable
    char w[47];
    memcpy(w, word, strlen(word) + 1);
    //make sure it's all lower case
    for (int i = 0; i < strlen(word); i++)
    {
        w[i] = tolower(w[i]);
    }
    //go to the root child node equal to the first letter of the word
    if (find(w, root))
    {
        return true;
    }
    else
    {
        return false;
    }
}

将单词字典加载到 trie 中的函数。（我猜这是泄漏的地方？）：

struct node *newNode(char *word, node *next)
{
    next = malloc(sizeof(node));
    word++;
    if (word[0] == '\0')
    {
        next->is_word = true;
        return next;
    }
    else
    {
        node *new_node = NULL;
        // go to this nodes choild that is now equal to the first letter of the word
        if (word[0] == '\'')
        {
           return next->children[26] = newNode(word, new_node);
        }
        else
        { 
           return next->children[word[0] - 97] = newNode(word, new_node);
        }
    }
}


/**
 * Loads dictionary into memory. Returns true if successful else false.
 */
bool load(const char *dictionary)
{
    dictionary_size = 0;
    FILE *dic = fopen(dictionary, "r");
    if (dic == NULL)
    {
        return false;
    }
    // Initalize root node
    root = malloc(sizeof(node));

    //get word from dictinary
    int ch;
    while ((ch = fgetc(dic)) != EOF)
    {
        char word[47];
        fgets(word, 47, dic);
        dictionary_size++;
        // make sure it's all lower case
        for (int i = 0; i < 47; i++)
        {
            word[i] = tolower(word[i]);
        }
        // get rid of new line char
        char *pos;
        if ((pos = strchr(word, '\n')) != NULL)
        {
            *pos = '\0';
        }
        printf("%s\n", word);
        //go to root nodes child that is equal to the first letter of the word
        node *child_node = NULL;
        root->children[word[0] - 'a'] = newNode(word, child_node);
    }
return true;
}

/**
 * Returns number of words in dictionary if loaded else 0 if not yet loaded.
 */
unsigned int size(void)
{
    if (dictionary_size != 0)
    {
        return dictionary_size;
    }
    return 0;
}

void free_node(node *next)
{
    // safety including root node
    if(!next) return;   

    // takes you to end of trie
    for (int i = 0; i < 26; i++)
    {
       free_node(next->children[i]);
    }

    // base case
    free(next);
}

/**
 * Unloads dictionary from memory. Returns true if successful else false.
 */
bool unload(void)
{
    free_node(root);
    return true;
}

定义节点结构：

typedef struct node
{
    bool is_word;
    struct node *children[27]; 
}
node;

node *root;

【问题讨论】：

在previous incarnation of this question（需要 10K）中，我指出“我很确定如果您的文件包含 'aardvark' 和 'abelone'，那么您会丢弃（泄漏）添加'abelone'时有关'aardvark'的信息。“我指的是return next->children[word[0] - 97] = newNode(word, new_node);这行，它完全用你刚刚为'abelone'创建的内容替换了之前为'aardvark'创建的内容。你必须以完全不同的方式处理事情。那 97 应该写成'a'。
你可以看看Adding word to trie structure dictionary。我敢肯定还有其他相关的问题。这是 CS50 的吗？

标签： c trie

【解决方案1】：

此代码使用彻底重写的addWord() 函数代替addNode() 工作。

/* SO 3994-9116 */
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct node
{
    bool is_word;
    struct node *children[27];
} node;

static node *root = 0;
static int dictionary_size = 0;

static void addWord(char *word, node *trie)
{
    assert(islower((unsigned char)word[0]) || word[0] == '\0');
    if (word[0] == '\0')
        trie->is_word = true;
    else
    {
        int code = word[0] - 'a';
        if (trie->children[code] == 0)
        {
            trie->children[code] = calloc(sizeof(node), 1);
            if (trie->children[code] == 0)
            {
                fprintf(stderr, "Memory allocation failed\n");
                exit(1);
            }
        }
        addWord(&word[1], trie->children[code]);
    }
}

static
bool load(const char *dictionary)
{
    dictionary_size = 0;
    FILE *dic = fopen(dictionary, "r");
    if (dic == NULL)
        return false;
    root = calloc(1, sizeof(node));
    if (root == 0)
    {
        fprintf(stderr, "Memory allocation failed\n");
        exit(1);
    }

    char word[47];
    while (fgets(word, sizeof(word), dic) != 0)
    {
        char *pos;
        if ((pos = strchr(word, '\n')) != NULL)
            *pos = '\0';
        dictionary_size++;
        for (int i = 0, len = strlen(word); i < len; i++)
        {
            if (!isalpha((unsigned char)word[i]))
                fprintf(stderr, "Non-alphabetic character in '%s'\n", word);
            else
                word[i] = tolower((unsigned char)word[i]);
        }
        addWord(word, root);
    }
    return true;
}

static void print_subtrie(const char *prefix, char nchar, node *child)
{
    if (child != 0)
    {
        int len = strlen(prefix);
        char buffer[len + 2];
        strcpy(buffer, prefix);
        buffer[len] = nchar;
        buffer[len+1] = '\0';
        if (child->is_word)
            printf("%s\n", buffer);
        for (int i = 0; i < 26; i++)
            print_subtrie(buffer, 'a' + i, child->children[i]);
    }
}

static void print_trie(node *top)
{
    for (int i = 0; i < 26; i++)
        print_subtrie("", 'a' + i, top->children[i]);
}

static void free_trie(node *trie)
{
    if (trie != 0)
    {
        for (int i = 0; i < 27; i++)
        {
            if (trie->children[i] != 0)
                free_trie(trie->children[i]);
        }
        free(trie);
    }
}

int main(void)
{
    if (load("words.list"))
    {
        printf("Nominal dictionary size: %d\n", dictionary_size);
        print_trie(root);
        free_trie(root);
    }
    return 0;
}

测试文件 1：

aardvark
abelone
assignation
assignment
assigned
assert
assertion

输出文件 1：

Nominal dictionary size: 7
aardvark
abelone
assert
assertion
assignation
assigned
assignment

测试文件 2：

Mike
Aquarius
delta
assert
zulu
assertion
alpha
aardvark
abelone
culmination
Zulu
kilo
amniocentesis
Oscar
Yankee
cabal
echo
Sierra
duck
Charlie
Quebec
centre
assigned
Papa
foxtrot
uMBRelLA
Romeo
cab
India
Juliet
bravo
amniocentesis
hotel
amniocentesis
Lima
Whisky
cab
culminate
cold
Xray
cab
cabs
assignation
cam
Victor
golf
Tango
November
cinema
cabbie
assignment

输出文件 2：

Nominal dictionary size: 56
a
aardvark
abelone
alpha
amniocentesis
aquarius
as
assert
assertion
assign
assignation
assigned
assignment
bravo
cab
cabal
cabbie
cabs
cam
centre
charlie
cinema
cold
culminate
culmination
delta
duck
echo
foxtrot
golf
hotel
i
india
juliet
kilo
lima
mike
november
o
oscar
papa
quebec
romeo
sierra
tango
umbrella
victor
whisky
xray
yankee
zulu

注意 56 的“标称大小”太大了；输入中有 46 个不同的单词，因此在输出中。

【讨论】：