CombineTextInputFormat 案例

一、核心代码(依托于自定义的WordCount)

1、位置

在设置输入和输出路径前

2、代码

// Read the input through CombineTextInputFormat instead of the default input format.
job.setInputFormatClass(CombineTextInputFormat.class);
// Upper bound for one combined split, in bytes: 128 MB = 128 * 1024 * 1024 = 134217728.
CombineTextInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024);

注意：size 的单位是字节；若要设置为 n MB，则 size = n * 1024 * 1024（例如 128 MB = 128 * 1024 * 1024 = 134217728）

3、体现

number of splits:1

二、示例

1、前提条件

创建Maven项目，导入依赖，配置日志

2、Mapper

package com.wt;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;


/**
 * Mapper stage of the word-count example: splits each input record on
 * whitespace and emits a {@code (word, 1)} pair for every non-empty word.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Output objects are created once per mapper and reused: map() is invoked
    // once per input record, so allocating them inside map() (or inside the
    // loop) would create a new object for every record/word.
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    /**
     * Tokenizes one input record and writes {@code (word, 1)} for each word.
     *
     * @param key     input key supplied by the framework; not used here
     * @param value   the text of the current input record
     * @param context sink that receives the emitted {@code (word, 1)} pairs
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Get the current record as a String.
        String line = value.toString();
        // 2. Split on runs of whitespace.
        String[] words = line.split("\\s+");
        // 3. Emit one (word, 1) pair per token.
        for (String word : words) {
            // split() yields "" for an empty line and a leading "" when the
            // line starts with whitespace; those are not words, so skip them.
            if (word.isEmpty()) {
                continue;
            }
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}

Mapper类