参考了本文:http://www.cnblogs.com/xulb597/archive/2012/07/05/2578562.html

  • 支持模糊搜索,比如,【bkmh】可以匹配【BuKaManHua】;
  • 支持优先级,首字母、大写字母有更高的优先级。

亟需解决的问题:

  • 目前搜索结果与关键词中字母的顺序无关,即【buk】可以匹配【BKManHua】
  • 若条目中重复出现关键词中的某个字母,即使不包含关键词的其他字母,仍然会被匹配上
  • 内存占用过高:因为接触C++不久,对内存管理一窍不通;索引一个1.65MB的文件(312964个单词)之后,程序(VS2013编译,Release版本)的内存占用达到68,860KB。

trie.hpp

#ifndef TRIE
#define TRIE

#include "LinkedList.hpp"
#include "MatchInfo.hpp"

#include <stdlib.h>
#include <iostream>
#include <string>

// Number of slots per node: 'a'..'z' (0-25), '*' (26) and '?' (27).
#define BRANCH_SIZE 28
// Character that maps to slot 0.
#define START_CHAR 'a'

// Maps a character to its slot: '?' -> 27, '*' -> 26, anything else -> x - START_CHAR.
// The argument is fully parenthesized so that expressions such as INDEX(c | 0x20)
// expand with the intended precedence.
// NOTE: for characters other than 'a'..'z', '*' and '?' the result may be negative
// or >= BRANCH_SIZE, so callers must range-check it before indexing an array.
// NOTE: the argument can be evaluated up to three times; do not pass expressions
// with side effects (e.g. INDEX(*p++)).
#define INDEX(x) ((x) == '?' ? 27 : ((x) == '*' ? 26 : (x) - (START_CHAR)))
        
class Trie
{
public:
    Trie()
    {
        rootNode = new Trie_Node(0);
        memset(nodeMap, NULL, sizeof(nodeMap));
        memset(indexList, NULL, sizeof(indexList));
    }

    ~Trie()
    {
        //delete rootNode;
    }

    void insert(const char *data, const int i)
    {
        bool flag_start = false,flag_capital = false;
        Trie_Node *location = rootNode;
        int pos = 0;
        while (*data)
        {
            char c = *data;
            // check wether it's capital and convert to lowwer case if so.
            if(c > 'A'-1 && c < 'Z'+1)
            {
                flag_capital = true;
                c += 32;
            }else{
                flag_capital = false;
            }
            // map the char value to int which starts from 0
            int index = INDEX(c);
            // skip invalid chars
            if(index < 0)
            {
                data++;
                pos++;
                continue;
            }
            // find next
            if(location->next[index] == NULL)
            {
                location->next[index] = getNode(index);
            }
            location = location->next[index];

            // build MatchInfo and add it to the trie node's indexList
            MatchInfo *info = new MatchInfo();
            info->itemindex = i;
            info->position = pos;    // position of the char in the string
            info->priority = 1;

            // intial or capital char has a higher priority
            if(!flag_start)
            {
                flag_start = true;
                info->priority++;
            }
            if(flag_capital)
                info->priority++;
            if(indexList[index] == NULL)
                indexList[index] = new LinkedList<MatchInfo>();
            indexList[index]->add(info);
            data++;
            pos++;
        }
        // end character has a higher priority
        //location->indexList->getCurrent()->value->priority++;
    }

    /*int match(const char *data)
    {
        Trie_Node *location = rootNode;
        while (*data && location)
        {
            location = location->next[INDEX(*data)];
            data++;
        }
        return (location != NULL);
    }*/

    /*void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength)
    {
        predicateIndexMap(data, indexMap, indexMapLength);
        
        int index;
        Trie_Node *location = rootNode;
        while (*data && location)
        {
            index = INDEX(*data);
            location = location->next[INDEX(*data)];
            if(location != NULL)
            {
                fillIndexArray(indexMap, index);
            }
            data++;
        }
    }*/

    void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength)
    {
        predicateIndexMap(data, indexMap, indexMapLength);
        
        int index;
        Trie_Node *location = nodeMap[INDEX(*data)];
        do
        {
            index = INDEX(*data);
            if(location != NULL)
            {
                fillIndexArray(indexMap, index);
            }
            else
            {
                break;
            }
            data++;
        } while ((*data) && (location = nodeMap[index]));
    }

    /*void print()
    {
        print(rootNode);
    }*/

private:
    //
    // a list to record matche info of each char in indexed words.
    // it's for priority and fuzzy seaching.
    //
    LinkedList<MatchInfo>* indexList[BRANCH_SIZE];

    struct Trie_Node
    {
        //int index;
        Trie_Node *next[BRANCH_SIZE];
        
        Trie_Node(int _index)
        {
            //index = _index;
            memset(next, NULL, sizeof(next));
        };
        ~Trie_Node()
        {
            //delete indexList;
            for (int i = 0; i < BRANCH_SIZE; i++)
            {
                if(next[i])
                    delete next[i];
            }
        }
    };

    Trie_Node *rootNode;

    //
    // a map to hold all created Trie_Node.
    //
    Trie_Node *nodeMap[BRANCH_SIZE];

    //
    // /*get a trie node from map.*/
    // return a new Trie_Node;
    // index: (char - 'a')
    //
    Trie_Node *getNode(int index)
    {
        //return new Trie_Node(index);
        Trie_Node *tempNode = nodeMap[index];
        if(tempNode == NULL)
        {
            tempNode = new Trie_Node(index);
            nodeMap[index] = tempNode;
        }
        return tempNode;
    }

    //
    // fill [indexMap] with priority of char at [index]
    //
    void fillIndexArray(int* indexMap, int index)
    {
        if(indexList[index] == NULL)
            indexList[index] = new LinkedList<MatchInfo>();
        LinkedList<MatchInfo> *list = indexList[index];
        Node<MatchInfo> *node = list->getRoot();
        while (node)
        {
            int itemIndex = node->value->itemindex;
            if(indexMap[itemIndex] != -1)
                indexMap[itemIndex] += node->value->priority;
            node = node->next;
        }
    }

    //
    // keep moving node to next until it's itemindex in value has been changed.
    // node will set to NULL if reaches the end.
    //
    void moveToNextItemIndex(Node<MatchInfo> **node)
    {
        int index = (*node)->value->itemindex;
        if((*node)->next == NULL)
            (*node) = NULL;
        else
        {
            while ((*node)->value->itemindex == index)
            {
                (*node)=(*node)->next;
                if((*node) == NULL)
                    break;
            }
        }
    }

    //
    // predicate whether an index in indexMap is impossiable to be matched.
    // It will be set to -1 if so.
    //
    void predicateIndexMap(const char* keyword, int* indexMap, size_t indexMapLength)
    {
        int *indexesMatched = new int[indexMapLength];
        int keywordLength = strlen(keyword);
        unsigned int keywordRecords[BRANCH_SIZE];
        size_t size = indexMapLength * sizeof(int);
        memset(indexesMatched, 0, size);
        memset(indexMap, -1, size);
        LinkedList<MatchInfo> *list;
        Node<MatchInfo> *match_node;
        int charIndex, index = 0;
        while (*keyword)
        {
            charIndex = INDEX(*keyword);
            if(keywordRecords[charIndex] == 1)
            {
                keyword++;
                continue;
            }
            keywordRecords[charIndex] = 1;
            list = indexList[charIndex];
            if(list != NULL)
            {
                match_node = list->getRoot();
                while (match_node != NULL)
                {
                    indexesMatched[match_node->value->itemindex]++;
                    match_node = match_node->next;
                    //moveToNextItemIndex(&match_node);
                }
            }
            keyword++;
        }
        for (int i = 0; i < indexMapLength; i++)
        {
            if(indexesMatched[i] >= keywordLength)
                indexMap[i] = 0;
        }
        delete indexesMatched;
    }

    /*void print(Trie_Node* node)
    {
        char c;
        for (int i = 0; i < BRANCH_SIZE; i++)
        {
            if(node->next[i] != NULL)\n{
                c = node->index + 'a';
                printf("%c-", c);
                print(node->next[i]);
            }
        }
    }*/
};
#endif // TRIE
View Code

相关文章:

  • 2022-02-24
  • 2022-02-12
  • 2022-12-23
  • 2021-07-29
  • 2021-09-15
  • 2022-12-23
  • 2021-09-20
  • 2021-09-17
猜你喜欢
  • 2021-07-03
  • 2021-12-15
  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
相关资源
相似解决方案