【发布时间】:2015-01-27 11:30:19
【问题描述】:
我试图在我的项目中使用 Weka 来使用朴素贝叶斯分类器对文本文档进行分类。我在this site 上找到了以下两个课程。
第一个类MyFilteredLearner 构建、训练、评估并将分类器保存到磁盘,一切正常。
第二个类MyFilteredClassifier 从文本文件中加载单个文本字符串并成功地将其放入实例中。它还从磁盘恢复分类器。它未能使用classify() 方法对实例进行分类,而是返回异常消息“未定义输出实例格式”。
我花了很长时间寻找答案,尝试安装 Weka 的开发版和稳定版,但仍然遇到同样的问题。
有人知道代码中有什么不正确或需要以不同的方式添加/完成吗?文件详情及代码如下:
用于训练分类器的 ARFF 文件 (spam.ARFF):
@relation sms_test
@attribute spamclass {spam,ham}
@attribute text String
@data
ham,'Go until jurong point, crazy.. Available only in bugis n great world la e buffet...Cine there got amore wat...'
etc……………………………………………………………………
新实例的单行文本文件 (toClassify.txt):
this is spam or not, who knows?
MyFilteredLearner的代码:
public class MyFilteredLearner {
Instances trainData;
StringToWordVector filter;
FilteredClassifier classifier;
public void loadDataset(String fileName) {
try {
BufferedReader reader = new BufferedReader(new FileReader(fileName));
ArffReader arff = new ArffReader(reader);
trainData = arff.getData();
System.out.println("===== Loaded dataset: " + fileName + " =====");
reader.close();
}
catch (IOException e) {
System.out.println("Problem found when reading: " + fileName);
}
}
public void learn() {
try {
trainData.setClassIndex(0);
classifier = new FilteredClassifier();
filter = new StringToWordVector();
filter.setAttributeIndices("last");
classifier.setFilter(filter);
classifier.setClassifier(new NaiveBayes());
classifier.buildClassifier(trainData);
System.out.println("===== Training on filtered (training) dataset done =====");
}
catch (Exception e) {
System.out.println("Problem found when training");
}
}
public void evaluate() {
try {
trainData.setClassIndex(0);
filter = new StringToWordVector();
filter.setAttributeIndices("last");
classifier = new FilteredClassifier();
classifier.setFilter(filter);
classifier.setClassifier(new NaiveBayes());
Evaluation eval = new Evaluation(trainData);
eval.crossValidateModel(classifier, trainData, 4, new Random(1));
System.out.println(eval.toSummaryString());
System.out.println(eval.toClassDetailsString());
System.out.println("===== Evaluating on filtered (training) dataset done =====");
}
catch (Exception e) {
System.out.println("Problem found when evaluating");
}
}
public void saveModel(String fileName) {
try {
ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(fileName));
out.writeObject(classifier);
System.out.println("Saved model: " + out.toString());
out.close();
System.out.println("===== Saved model: " + fileName + "=====");
}
catch (IOException e) {
System.out.println("Problem found when writing: " + fileName);
}
}
}
MyFilteredClassifier的代码:
public class MyFilteredClassifier {
String text;
Instances instances;
FilteredClassifier classifier;
StringToWordVector filter;
public void load(String fileName) {
try {
BufferedReader reader = new BufferedReader(new FileReader(fileName));
String line;
text = "";
while ((line = reader.readLine()) != null) {
text = text + " " + line;
}
System.out.println("===== Loaded text data: " + fileName + " =====");
reader.close();
System.out.println(text);
}
catch (IOException e) {
System.out.println("Problem found when reading: " + fileName);
}
}
public void makeInstance() {
FastVector fvNominalVal = new FastVector(2);
fvNominalVal.addElement("spam");
fvNominalVal.addElement("ham");
Attribute attribute1 = new Attribute("class", fvNominalVal);
Attribute attribute2 = new Attribute("text",(FastVector) null);
FastVector fvWekaAttributes = new FastVector(2);
fvWekaAttributes.addElement(attribute1);
fvWekaAttributes.addElement(attribute2);
instances = new Instances("Test relation", fvWekaAttributes,1);
instances.setClassIndex(0);
DenseInstance instance = new DenseInstance(2);
instance.setValue(attribute2, text);
instances.add(instance);
System.out.println("===== Instance created with reference dataset =====");
System.out.println(instances);
}
public void loadModel(String fileName) {
try {
ObjectInputStream in = new ObjectInputStream(new FileInputStream(fileName));
Object tmp = in.readObject();
classifier = (FilteredClassifier) tmp;
in.close();
System.out.println("===== Loaded model: " + fileName + "=====");
}
catch (Exception e) {
System.out.println("Problem found when reading: " + fileName);
}
}
public void classify() {
try {
double pred = classifier.classifyInstance(instances.instance(0));
System.out.println("===== Classified instance =====");
System.out.println("Class predicted: " + instances.classAttribute().value((int) pred));
}
catch (Exception e) {
System.out.println("Error: " + e.getMessage());
}
}
public static void main(String args[]) {
MyFilteredLearner c = new MyFilteredLearner();
c.loadDataset("spam.ARFF");
c.learn();
c.evaluate();
c.saveModel("spamClassifier.binary");
MyFilteredClassifier c1 = new MyFilteredClassifier();
c1.load("toClassify.txt");
c1.loadModel("spamClassifier.binary");
c1.makeInstance();
c1.classify();
}
}
【问题讨论】:
标签: java classification weka