Hadoop中决定map个数的因素有几个,由于版本的不同,决定因素也不一样,掌握这些因素对了解hadoop分片的划分有很大帮助,
并且对优化hadoop性能也有很大的益处。
旧API中getSplits方法:
1 public InputSplit[] getSplits(JobConf job, int numSplits) 2 throws IOException { 3 FileStatus[] files = listStatus(job); 4 5 // Save the number of input files in the job-conf 6 job.setLong(NUM_INPUT_FILES, files.length); 7 long totalSize = 0; // compute total size 8 for (FileStatus file: files) { // check we have valid files 9 if (file.isDir()) { 10 throw new IOException("Not a file: "+ file.getPath()); 11 } 12 totalSize += file.getLen(); 13 } 14 15 long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits); 16 long minSize = Math.max(job.getLong("mapred.min.split.size", 1), 17 minSplitSize); 18 19 // generate splits 20 ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits); 21 NetworkTopology clusterMap = new NetworkTopology(); 22 for (FileStatus file: files) { 23 Path path = file.getPath(); 24 FileSystem fs = path.getFileSystem(job); 25 long length = file.getLen(); 26 BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); 27 if ((length != 0) && isSplitable(fs, path)) { 28 long blockSize = file.getBlockSize(); 29 long splitSize = computeSplitSize(goalSize, minSize, blockSize); 30 31 long bytesRemaining = length; 32 while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) { 33 String[] splitHosts = getSplitHosts(blkLocations, 34 length-bytesRemaining, splitSize, clusterMap); 35 splits.add(new FileSplit(path, length-bytesRemaining, splitSize, 36 splitHosts)); 37 bytesRemaining -= splitSize; 38 } 39 40 if (bytesRemaining != 0) { 41 splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining, 42 blkLocations[blkLocations.length-1].getHosts())); 43 } 44 } else if (length != 0) { 45 String[] splitHosts = getSplitHosts(blkLocations,0,length,clusterMap); 46 splits.add(new FileSplit(path, 0, length, splitHosts)); 47 } else { 48 //Create empty hosts array for zero length files 49 splits.add(new FileSplit(path, 0, length, new String[0])); 50 } 51 } 52 LOG.debug("Total # of splits: " + splits.size()); 53 return splits.toArray(new 
FileSplit[splits.size()]); 54 } 55 56 protected long computeSplitSize(long goalSize, long minSize, 57 long blockSize) { 58 return Math.max(minSize, Math.min(goalSize, blockSize)); 59 }