参考了本文:http://www.cnblogs.com/xulb597/archive/2012/07/05/2578562.html
- 支持模糊搜索,比如,【bkmh】可以匹配【BuKaManHua】;
- 支持优先级:首字母、大写字母有更高的优先级。
亟需解决的问题:
- 目前搜索结果与关键词中字母的顺序无关,即【buk】可以匹配【BKManHua】
- 若条目中重复出现关键词里的任意一个字母,即使该条目不包含关键词的其他字母,仍然会被匹配上
- 内存占用:因为接触C++不久,对内存管理一窍不通;一个1.65MB的文件(312964个单词)建立索引之后,程序(VS2013编译,Release版本)的内存占用达到68,860KB。
trie.hpp
#ifndef TRIE #define TRIE #include "LinkedList.hpp" #include "MatchInfo.hpp" #include <stdlib.h> #include <iostream> #include <string> #define BRANCH_SIZE 28 #define START_CHAR 'a' #define INDEX(x) (x == '?'? 27 : (x == '*'? 26 : x - START_CHAR)) class Trie { public: Trie() { rootNode = new Trie_Node(0); memset(nodeMap, NULL, sizeof(nodeMap)); memset(indexList, NULL, sizeof(indexList)); } ~Trie() { //delete rootNode; } void insert(const char *data, const int i) { bool flag_start = false,flag_capital = false; Trie_Node *location = rootNode; int pos = 0; while (*data) { char c = *data; // check wether it's capital and convert to lowwer case if so. if(c > 'A'-1 && c < 'Z'+1) { flag_capital = true; c += 32; }else{ flag_capital = false; } // map the char value to int which starts from 0 int index = INDEX(c); // skip invalid chars if(index < 0) { data++; pos++; continue; } // find next if(location->next[index] == NULL) { location->next[index] = getNode(index); } location = location->next[index]; // build MatchInfo and add it to the trie node's indexList MatchInfo *info = new MatchInfo(); info->itemindex = i; info->position = pos; // position of the char in the string info->priority = 1; // intial or capital char has a higher priority if(!flag_start) { flag_start = true; info->priority++; } if(flag_capital) info->priority++; if(indexList[index] == NULL) indexList[index] = new LinkedList<MatchInfo>(); indexList[index]->add(info); data++; pos++; } // end character has a higher priority //location->indexList->getCurrent()->value->priority++; } /*int match(const char *data) { Trie_Node *location = rootNode; while (*data && location) { location = location->next[INDEX(*data)]; data++; } return (location != NULL); }*/ /*void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength) { predicateIndexMap(data, indexMap, indexMapLength); int index; Trie_Node *location = rootNode; while (*data && location) { index = INDEX(*data); location = location->next[INDEX(*data)]; 
if(location != NULL) { fillIndexArray(indexMap, index); } data++; } }*/ void fuzzy_match(const char *data, int* indexMap, size_t indexMapLength) { predicateIndexMap(data, indexMap, indexMapLength); int index; Trie_Node *location = nodeMap[INDEX(*data)]; do { index = INDEX(*data); if(location != NULL) { fillIndexArray(indexMap, index); } else { break; } data++; } while ((*data) && (location = nodeMap[index])); } /*void print() { print(rootNode); }*/ private: // // a list to record matche info of each char in indexed words. // it's for priority and fuzzy seaching. // LinkedList<MatchInfo>* indexList[BRANCH_SIZE]; struct Trie_Node { //int index; Trie_Node *next[BRANCH_SIZE]; Trie_Node(int _index) { //index = _index; memset(next, NULL, sizeof(next)); }; ~Trie_Node() { //delete indexList; for (int i = 0; i < BRANCH_SIZE; i++) { if(next[i]) delete next[i]; } } }; Trie_Node *rootNode; // // a map to hold all created Trie_Node. // Trie_Node *nodeMap[BRANCH_SIZE]; // // /*get a trie node from map.*/ // return a new Trie_Node; // index: (char - 'a') // Trie_Node *getNode(int index) { //return new Trie_Node(index); Trie_Node *tempNode = nodeMap[index]; if(tempNode == NULL) { tempNode = new Trie_Node(index); nodeMap[index] = tempNode; } return tempNode; } // // fill [indexMap] with priority of char at [index] // void fillIndexArray(int* indexMap, int index) { if(indexList[index] == NULL) indexList[index] = new LinkedList<MatchInfo>(); LinkedList<MatchInfo> *list = indexList[index]; Node<MatchInfo> *node = list->getRoot(); while (node) { int itemIndex = node->value->itemindex; if(indexMap[itemIndex] != -1) indexMap[itemIndex] += node->value->priority; node = node->next; } } // // keep moving node to next until it's itemindex in value has been changed. // node will set to NULL if reaches the end. 
// void moveToNextItemIndex(Node<MatchInfo> **node) { int index = (*node)->value->itemindex; if((*node)->next == NULL) (*node) = NULL; else { while ((*node)->value->itemindex == index) { (*node)=(*node)->next; if((*node) == NULL) break; } } } // // predicate whether an index in indexMap is impossiable to be matched. // It will be set to -1 if so. // void predicateIndexMap(const char* keyword, int* indexMap, size_t indexMapLength) { int *indexesMatched = new int[indexMapLength]; int keywordLength = strlen(keyword); unsigned int keywordRecords[BRANCH_SIZE]; size_t size = indexMapLength * sizeof(int); memset(indexesMatched, 0, size); memset(indexMap, -1, size); LinkedList<MatchInfo> *list; Node<MatchInfo> *match_node; int charIndex, index = 0; while (*keyword) { charIndex = INDEX(*keyword); if(keywordRecords[charIndex] == 1) { keyword++; continue; } keywordRecords[charIndex] = 1; list = indexList[charIndex]; if(list != NULL) { match_node = list->getRoot(); while (match_node != NULL) { indexesMatched[match_node->value->itemindex]++; match_node = match_node->next; //moveToNextItemIndex(&match_node); } } keyword++; } for (int i = 0; i < indexMapLength; i++) { if(indexesMatched[i] >= keywordLength) indexMap[i] = 0; } delete indexesMatched; } /*void print(Trie_Node* node) { char c; for (int i = 0; i < BRANCH_SIZE; i++) { if(node->next[i] != NULL)\n{ c = node->index + 'a'; printf("%c-", c); print(node->next[i]); } } }*/ }; #endif // TRIE