1) WordCount
Not much needs to be said about this one; examples are everywhere. There are a couple of detailed analyses of WordCount online:
http://www.sxt.cn/u/235/blog/5809
http://www.cnblogs.com/zhanghuijunjava/archive/2013/04/27/3036549.html
Both posts are well written, and the diagrams in particular are very clear.
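For readers who would rather see the code inline than click through, here is a minimal WordCount sketch following the standard Hadoop MapReduce example (the class and variable names are the conventional ones from that example, not taken from the two posts above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.StringTokenizer;

public class WordCount {

    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // Split each input line into tokens and emit (word, 1) per token.
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        // Sum all the 1s emitted for each word.
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}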
2) Deduplication (Distinct)
Much like select distinct(x) from table in a database, deduplication is even simpler than WordCount. Suppose we want to deduplicate the contents of the following file (note: this file also serves as the input for the later examples), one number per line:
2
8
8
3
2
3
5
3
0
2
7
There is basically nothing to do: in the map phase, send each line's value out as the key, then simply collect the keys back in the reduce phase.
Note: the code uses a home-grown helper class, HDFSUtil, which can be found in the post "hadoop: hdfs API示例".
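If that post isn't at hand, here is a minimal sketch of what HDFSUtil.deleteFile presumably does. The signature deleteFile(Configuration, String) is taken from the call site in the code below; the body is an assumption built on Hadoop's standard FileSystem API, not the original implementation:

package yjmyzz.util;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.io.IOException;

public class HDFSUtil {

    // Hypothetical reconstruction: recursively delete an HDFS path
    // if it exists, matching how RemoveDup calls it below.
    public static void deleteFile(Configuration conf, String path) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path p = new Path(path);
        if (fs.exists(p)) {
            fs.delete(p, true); // true = recursive delete
        }
    }
}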
How it works: after the map phase finishes and before reduce starts, the shuffle/sort phase groups all records with the same key together (and the combiner set below already merges them locally on each map node), so each distinct key reaches reduce exactly once and the duplicates disappear by themselves.
package yjmyzz.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import yjmyzz.util.HDFSUtil;

import java.io.IOException;

public class RemoveDup {

    public static class RemoveDupMapper
            extends Mapper<Object, Text, Text, NullWritable> {

        // Emit each input line as the key; the value carries no information.
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    public static class RemoveDupReducer
            extends Reducer<Text, NullWritable, Text, NullWritable> {

        // The shuffle phase has already grouped identical keys,
        // so writing each key once yields the distinct set.
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: RemoveDup <in> [<in>...] <out>");
            System.exit(2);
        }

        // Delete the output directory first (optional; avoids the
        // "output directory already exists" error on repeated runs).
        HDFSUtil.deleteFile(conf, otherArgs[otherArgs.length - 1]);

        Job job = Job.getInstance(conf, "RemoveDup");
        job.setJarByClass(RemoveDup.class);
        job.setMapperClass(RemoveDupMapper.class);
        job.setCombinerClass(RemoveDupReducer.class);
        job.setReducerClass(RemoveDupReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job,
                new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
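Assuming the sample input above (one number per line) and the default single reducer, the output directory should contain the distinct values sorted by Text's byte-wise key ordering, which coincides with numeric order for single digits:

0
2
3
5
7
8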