weka数据挖掘拾遗（二）---- 特征选择（IG、chi-square）

一、说明

　　IG 是 information gain 的缩写，中文名称是信息增益，是一种很有效的特征选择方法（特别是在使用 SVM 分类时）。这里不做详细介绍，有兴趣的读者可以自行搜索相关资料。

　　chi-square 是一个常用特征筛选方法，在种子词扩展那篇文章中，有详细说明，这里不再赘述。

二、weka中的使用方法

　　1、特征筛选代码

 1 package com.lvxinjian.alg.models.feature;
 2 
 3 import java.nio.charset.Charset;
 4 import java.util.ArrayList;
 5 
 6 import weka.attributeSelection.ASEvaluation;
 7 import weka.attributeSelection.AttributeEvaluator;
 8 import weka.attributeSelection.Ranker;
 9 import weka.core.Instances;
10 
11 import com.iminer.tool.common.util.FileTool;
12 /**
13  * @Description : 使用Weka的特征筛选方法（目前支持IG、Chi-square）
14  * @author Lv Xinjian
15  *
16  */
17 public class FeatureSelectorByWeka {
18     
19     /**
20      * @function 使用weka内置的算法筛选特征
21      * @param eval 特征筛选方法的对象实例
22      * @param data arff格式的数据
23      * @param maxNumberOfAttribute 支持的最大的特征个数
24      * @param outputPath lex输出文件
25      * @throws Exception
26      */
27     public void EvalueAndRank(ASEvaluation eval , Instances data ,int maxNumberOfAttribute , String outputPath) throws Exception
28     {
29         Ranker rank = new Ranker();        
30         eval.buildEvaluator(data);
31         rank.search(eval, data);
32         
33          // 按照特定搜索算法对属性进行筛选 在这里使用的Ranker算法仅仅是属性按照InfoGain/Chi-square的大小进行排序            
34         int[] attrIndex = rank.search(eval, data);
35         
36          // 打印结果信息 在这里我们了属性的排序结果                 
37         ArrayList<String> attributeWords = new ArrayList<String>();
38         for (int i = 0; i < attrIndex.length; i++) {
39             //如果权重等于0，则跳出循环
40             if (((AttributeEvaluator) eval).evaluateAttribute(attrIndex[i]) == 0)
41                 break;
42             if (i >= maxNumberOfAttribute)
43                 break;
44             attributeWords.add(i + "\t"
45                     + data.attribute(attrIndex[i]).name() + "\t" + "1");
46         }
47         FileTool.SaveListToFile(attributeWords, outputPath, false,
48                 Charset.forName("utf8"));
49     }
50 
51 }

View Code