具体思路:

1->敏感词库,可从数据库读取,也可以从文件加载.

2->将敏感词转化为gbk编码,因为gbk严格按照英文字符一个字节,汉字两个字节的格式编码,便于切分文字.

3->将所有敏感词以首个字符[英文一字节,汉字两字节]转换为一个整数,然后按照这个整数给所有敏感词建立索引,索引的value用list(代码中为std::vector),因为同一个整数可能对应多个敏感词.

4->检测一段文字内容时,也事先将其转化为gbk,然后逐个字符[英文一字节,汉字两字节]检测是否有以该字符为首的敏感词.

代码.h

 1 #ifndef SENSITIVE_WORDS_CHECKER_
 2 #define SENSITIVE_WORDS_CHECKER_
 3 #include <stdint.h>
 4 #include <stdio.h>
 5 #include <memory.h>
 6 #include <map>
 7 #include <vector>
 8 
 // Size limits shared by the sensitive-word checker.
 enum {
     // Maximum length of a single sensitive word.
     enmMaxWordLength = 32,
     // Maximum length of the sensitive-word file (128k).
     enmMaxWordsFileLength = 128 * 1024,
     // Maximum length of the content examined in a single check.
     enmMaxContentLength = 1024
 };
14 
15 struct SensitiveWord
16 {
17     char szWord[enmMaxWordLength];
18     SensitiveWord()
19     {
20         memset(szWord, 0, enmMaxWordLength);
21     }
22 };
23 
24 typedef std::vector<SensitiveWord*> WordList;
25 typedef std::map<uint32_t, WordList*> WordMap;
26 
27 class SensitiveWordsChecker
28 {
29 public:
30     SensitiveWordsChecker() :arrSensitiveWord(NULL), nSensitiveWordCnt(0){}
31     ~SensitiveWordsChecker(){ delete[] arrSensitiveWord; }
32 public:
33     void LoadWordsFromUTF8File(const char *file_name);
34     void LoadWordsFromGBKFile(const char *file_name);
35 protected:
36     int32_t WriteToFile(const char buf[], const int32_t buf_size, const char *file_name);
37     void DumpWordMap();
38     void GenTestData();
39     void Test();
40     void StrAppend(char buf[], const uint32_t bufLen, uint32_t &offset, const char *fmt, ...);
41 private:
42     int32_t LoadFile(char buf[], const uint32_t buf_size, const char *file_name);
43     int32_t CodeConvert(char *from_charset, char *to_charset, char *inbuf, size_t inlen, char *outbuf, size_t outlen);
44     int32_t UTF8_To_GBK(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
45     int32_t GBK_To_UTF8(char *inbuf, size_t inlen, char *outbuf, size_t outlen);
46     uint32_t GetWordsCount(char buf[],const uint32_t buf_size,char separator);
47     char *StrcpyExcludeChar(char *dst, const uint32_t dst_len, const char *src, const char *exclude_list);
48     int32_t GetWords(char gbk_buf[], const uint32_t buf_size, char separator);
49     void BuildWordMap();
50     uint32_t GetFirstCharFromGBK(char gbk_buf[]);
51     uint32_t GetFirstCharFromTUF8(char utf8_buf[]);
52     uint32_t GetFirstChar(char buf[]);
53     // 返回 0 表示in_utf8_buf里面没有敏感词
54     // 返回 1 表示in_utf8_buf里面含有关键词,并将关键词替换为*输出到out_utf8_buf
55     int32_t CheckSensitiveWord(char out_utf8_buf[], char in_utf8_buf[]);
56     const SensitiveWord* FindSensitiveWord(uint32_t code,const char *pos);
57 private:
58     SensitiveWord *arrSensitiveWord;
59     uint32_t nSensitiveWordCnt;
60     WordMap mapWords;
61 };
62 
63 #endif
View Code

相关文章:

  • 2021-06-15
  • 2022-12-23
  • 2021-06-08
  • 2021-10-19
  • 2021-12-05
  • 2022-12-23
  • 2021-12-06
猜你喜欢
  • 2022-12-23
  • 2021-07-13
  • 2022-12-23
  • 2022-12-23
  • 2021-12-10
  • 2022-12-23
  • 2021-06-10
相关资源
相似解决方案