Hadoop 文本分类终于跑通了

Training
入口
package org.wordCount;

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WordMain {

    // private static List<String> secondDir = new ArrayList<String>();

    public static void main(String[] args) throws Exception {
    
        Configuration conf = new Configuration();
        //下面两行很重要
        conf.set("mapred.jar", "E://eclipse//jar-work//WordMain.jar");
        conf.set("mapred.job.tracker", "192.168.190.128:9001");
        
        //设置单词先验概率的保存路径
        String priorProbality = "hdfs://192.168.190.128:9000/user/hadoop/output/priorP/priorProbability.txt";
        conf.set("priorProbality", priorProbality);
        
        //单词总种类数的保存路径
        String totalWordsPath = "hdfs://192.168.190.128:9000/user/hadoop/output/totalwords.txt";
        conf.set("totalWordsPath", totalWordsPath);
        
        //每个类别中单词总数
        String wordsInClassPath = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence/_wordsInClass/wordsInClass-r-00000";
        conf.set("wordsInClassPath", wordsInClassPath);
        
        //设置输入 和 单词词频的输出路径
        // "/user/hadoop/input/NBCorpus/Country"
        String input = "hdfs://192.168.190.128:9000/user/hadoop/input/NBCorpus/Country";
        String wordsOutput = "hdfs://192.168.190.128:9000/user/hadoop/mid/wordsFrequence";
        conf.set("input", input);
        conf.set("wordsOutput", wordsOutput);
        
        //每个类别单词概率保存路径,
        //单词词频的输入路径也就是单词词频的输出路径
        
        String freqOutput = "hdfs://192.168.190.128:9000/user/hadoop/output/probability/";
        conf.set("freqOutput", freqOutput);

        
        
        
        FileCount.run(conf);
        WordCount.run(conf);
        Probability.run(conf);
/*        
        System.out.print("----------");
        
        
        String[] otherArgs = new String[] { "hdfs://192.168.190.128:9000/user/hadoop/test/",
                "hdfs://192.168.190.128:9000/user/hadoop/wordcount/output2/" };
        conf.set("mapred.jar", "E://eclipse//jar-work//WordMain.jar");
    
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordMain.class);

        job.setInputFormatClass(MyInputFormat.class);

        job.setMapperClass(WordMapper.class);
//        job.setCombinerClass(WordReducer.class);
        job.setReducerClass(WordReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // MyUtils.addInputPath(job, new Path(otherArgs[0]), conf);

        
        
        List<Path> inputPaths = getSecondDir(conf, otherArgs[0]);
        for (Path path : inputPaths) {
            System.out.println("path = " + path.toString());
            MyInputFormat.addInputPath(job, path);
            
        }
        System.out.println("addinputpath     ok" );
//        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        


        
        System.exit(job.waitForCompletion(true) ? 0 : 1);*/
        
        
    }

    // 获取文件夹下面二级文件夹路径的方法
    static List<Path> getSecondDir(Configuration conf, String folder) throws Exception {
        Path path = new Path(folder);
        FileSystem fs = path.getFileSystem(conf);
        FileStatus[] stats = fs.listStatus(path);
        List<Path> folderPath = new ArrayList<Path>();
        for (FileStatus stat : stats) {
            if (stat.isDir()) {
                if (fs.listStatus(stat.getPath()).length > 10) { // 筛选出文件数大于10个的类别作为
                                                                    // 输入路径
                    folderPath.add(stat.getPath());
                }
            }
        }
        return folderPath;
    }

    

    

}
View Code