1) WordCount 

这个就不多说了,满大街都是,网上有几篇对WordCount的详细分析

http://www.sxt.cn/u/235/blog/5809

http://www.cnblogs.com/zhanghuijunjava/archive/2013/04/27/3036549.html

这二篇都写得不错, 特别几张图画得很清晰

 

2) 去重处理(Distinct)

类似于db中的select distinct(x) from table , 去重处理甚至比WordCount还要简单,假如我们要对以下文件的内容做去重处理(注:该文件也是后面几个示例的输入参数)

2
8
8
3
2
3
5
3
0
2
7

基本上啥也不用做,在map阶段,把每一行的值当成key分发下去,然后在reduce阶段回收上来就可以了.

注:里面用到了一个自己写的类HDFSUtil,可以在 hadoop: hdfs API示例 一文中找到.

原理:map阶段完成后,在reduce开始之前,shuffle/排序阶段会把相同的key分到同一组,reduce对每个key只会调用一次、输出一次,所以自然而然就去掉了重复.(本例还把reducer注册为combiner,可以在map端提前做一次局部去重,减少shuffle的数据量,但这只是优化,去重本身由按key分组保证.)

 1 package yjmyzz.mr;
 2 
 3 import org.apache.hadoop.conf.Configuration;
 4 import org.apache.hadoop.fs.Path;
 5 import org.apache.hadoop.io.NullWritable;
 6 import org.apache.hadoop.io.Text;
 7 import org.apache.hadoop.mapreduce.Job;
 8 import org.apache.hadoop.mapreduce.Mapper;
 9 import org.apache.hadoop.mapreduce.Reducer;
10 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
11 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
12 import org.apache.hadoop.util.GenericOptionsParser;
13 
14 import yjmyzz.util.HDFSUtil;
15 
16 import java.io.IOException;
17 
18 
19 public class RemoveDup {
20 
21     public static class RemoveDupMapper
22             extends Mapper<Object, Text, Text, NullWritable> {
23 
24         public void map(Object key, Text value, Context context)
25                 throws IOException, InterruptedException {
26             context.write(value, NullWritable.get());
27             //System.out.println("map: key=" + key + ",value=" + value);
28         }
29 
30     }
31 
32     public static class RemoveDupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
33         public void reduce(Text key, Iterable<NullWritable> values, Context context)
34                 throws IOException, InterruptedException {
35             context.write(key, NullWritable.get());
36             //System.out.println("reduce: key=" + key);
37         }
38     }
39 
40     public static void main(String[] args) throws Exception {
41         Configuration conf = new Configuration();
42         String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
43         if (otherArgs.length < 2) {
44             System.err.println("Usage: RemoveDup <in> [<in>...] <out>");
45             System.exit(2);
46         }
47 
48         //删除输出目录(可选,省得多次运行时,总是报OUTPUT目录已存在)
49         HDFSUtil.deleteFile(conf, otherArgs[otherArgs.length - 1]);
50 
51         Job job = Job.getInstance(conf, "RemoveDup");
52         job.setJarByClass(RemoveDup.class);
53         job.setMapperClass(RemoveDupMapper.class);
54         job.setCombinerClass(RemoveDupReducer.class);
55         job.setReducerClass(RemoveDupReducer.class);
56         job.setOutputKeyClass(Text.class);
57         job.setOutputValueClass(NullWritable.class);
58 
59 
60         for (int i = 0; i < otherArgs.length - 1; ++i) {
61             FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
62         }
63         FileOutputFormat.setOutputPath(job,
64                 new Path(otherArgs[otherArgs.length - 1]));
65         System.exit(job.waitForCompletion(true) ? 0 : 1);
66     }
67 
68 
69 }
View Code

相关文章:

  • 2022-02-05
  • 2021-09-26
  • 2021-10-12
  • 2022-12-23
  • 2022-02-27
  • 2021-10-28
  • 2021-11-04
  • 2022-12-23
猜你喜欢
  • 2022-01-19
  • 2022-12-23
  • 2021-09-03
  • 2021-09-30
  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
相关资源
相似解决方案