1 /**
 2  * Licensed to the Apache Software Foundation (ASF) under one
 3  * or more contributor license agreements.  See the NOTICE file
 4  * distributed with this work for additional information
 5  * regarding copyright ownership.  The ASF licenses this file
 6  * to you under the Apache License, Version 2.0 (the
 7  * "License"); you may not use this file except in compliance
 8  * with the License.  You may obtain a copy of the License at
 9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 package org.apache.hadoop.mapreduce.lib.input;
20 
21 import org.apache.hadoop.classification.InterfaceAudience;
22 import org.apache.hadoop.classification.InterfaceStability;
23 import org.apache.hadoop.fs.Path;
24 import org.apache.hadoop.io.LongWritable;
25 import org.apache.hadoop.io.Text;
26 import org.apache.hadoop.io.compress.CompressionCodec;
27 import org.apache.hadoop.io.compress.CompressionCodecFactory;
28 import org.apache.hadoop.io.compress.SplittableCompressionCodec;
29 import org.apache.hadoop.mapreduce.InputFormat;
30 import org.apache.hadoop.mapreduce.InputSplit;
31 import org.apache.hadoop.mapreduce.JobContext;
32 import org.apache.hadoop.mapreduce.RecordReader;
33 import org.apache.hadoop.mapreduce.TaskAttemptContext;
34 
35 import com.google.common.base.Charsets;
36 
37 /** An {@link InputFormat} for plain text files.  Files are broken into lines.
38  * Either linefeed or carriage-return are used to signal end of line.  Keys are
39  * the position in the file, and values are the line of text.. */
40 @InterfaceAudience.Public
41 @InterfaceStability.Stable
42 public class TextInputFormat extends FileInputFormat<LongWritable, Text> {
43 
44   @Override
45   public RecordReader<LongWritable, Text> 
46     createRecordReader(InputSplit split,
47                        TaskAttemptContext context) {
48     String delimiter = context.getConfiguration().get(
49         "textinputformat.record.delimiter");
50     byte[] recordDelimiterBytes = null;
51     if (null != delimiter)
52       recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
53     return new LineRecordReader(recordDelimiterBytes);
54   }
55 
56   @Override
57   protected boolean isSplitable(JobContext context, Path file) {
58     final CompressionCodec codec =
59       new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
60     if (null == codec) {
61       return true;
62     }
63     return codec instanceof SplittableCompressionCodec;
64   }
65 
66 }
TextInputFormat

相关文章:

  • 2021-12-03
  • 2021-06-05
  • 2021-05-29
  • 2021-07-18
  • 2021-05-26
  • 2021-08-14
  • 2021-11-13
  • 2022-12-23
猜你喜欢
  • 2023-03-26
  • 2022-01-03
  • 2022-12-23
  • 2021-09-24
  • 2020-04-01
  • 2021-10-07
相关资源
相似解决方案