raster::blockSize() 建议每个块使用的行数,以及相应的行号。
栅格代数
如 https://rspatial.org/raster/pkg/4-algebra.html#raster-algebra 中所述,许多允许简单而优雅的栅格代数的通用函数已经为 Raster* 对象实现,包括正常的代数运算符,如 { +、-、*、/},逻辑运算符,如 { >、 >=, <, ==, ! } 和函数,例如 { abs、round、ceiling、floor、trunc、sqrt、log、log10、exp、cos、sin、max、min、range、prod、sum、any、all }。在这些函数中,您可以将光栅对象与数字混合使用,只要第一个参数是光栅对象即可。
为大型栅格编写函数
https://rspatial.org/raster/pkg/appendix1.html#appendix-i-writing-functions-for-large-raster-files 的附录 I 描述了如何为大型光栅文件编写函数。特别是,示例函数 f8() 描述了一个函数,该函数将接受一个大栅格并将我们选择的任何函数一次一个块地应用于该栅格。像这样:
block_fun <- function( big_raster, do_what ){
for( i in 1:num_blocks ){
get_block_data( i ) # from big_raster
process_block_data( i ) # using do_what
save_chunk( i )
}
然后我们可以使用上面描述的栅格代数来组合函数,例如:
f1 <- function( x ) sqrt( x )
f2 <- function( x ) x + 3
f3 <- function( x ) sin( x / 180 )
示范
# Get a raster file for demonstration
require( rgdal ) # for loading a raster from a file
large_raster <- raster::raster( 'path/to/a/large/raster/file.tif' )
# Define the block-by-block function
# -----------------------------------------------------------------------
# Adapted from
# https://rspatial.org/raster/pkg/appendix1.html#a-complete-function
# -----------------------------------------------------------------------
process_large_raster <- function(
input_raster
, f = function( r ) sin( r / 180 )
, chunk_file_name = ''
, overwrite = FALSE )
, ...
){
chunk <- raster::raster( input_raster )
chunk_file_name <- raster::trim( chunk_file_name )
# Find out if the amount of memory needed for the function is available
# n : integer number of copies of the Raster* object cell values
# this function needs to be able to have in memory
RASTER_FITS_IN_MEMORY <- raster::canProcessInMemory(
x = chunk, n = 3, verbose = TRUE
)
# ---------------------------------------------------------
# Can we process this whole raster in memory all at once?
# If not, use a temporary raster file to process chunks
# writeStart [ writeValues ... ] writeStop
# Open [ save parts ... ] Close
# ---------------------------------------------------------
# Use memory to process the raster, unless either of the following is true
# * The raster is too large to process in memory
# * A file name is given
# Create a temporary file if the raster is too large to fit in memory
if( !RASTER_FITS_IN_MEMORY & chunk_file_name == '' ){
chunk_file_name <- raster::rasterTmpFile()
}
if( chunk_file_name != '' ){ # File name is available
# Use a RASTER
chunk <- raster::writeStart( chunk, chunk_file_name, ... )
write_chunks_to_file <- TRUE
} else { # File name is NOT available
# Use a MATRIX
matrix_chunk <- matrix( ncol = nrow( chunk ), nrow = ncol( chunk ))
write_chunks_to_file <- FALSE
}
# Obtain suggested chunk sizes (number of rows and corresponding row numbers)
block <- raster::blockSize( input_raster )
progress_bar <- raster::pbCreate( block$n, ... )
chunk_to_file <- function(){
raster::writeValues( chunk, v, block$row[ i ] )
}
chunk_to_memory <- function(){
cols <- block$row[ i ]:( block$row[ i ] + block$nrows[ i ] - 1 )
matrix_chunk[ , cols ] <- matrix( v, nrow = chunk@ncols )
matrix_chunk
}
save_chunk <- function(){
if( write_chunks_to_file ){ # to file
chunk_to_file()
} else { # to memory
chunk_to_memory()
}
raster::pbStep( progress_bar, i )
}
save_result <- function( blocks_remaining ){
if( write_chunks_to_file ){
chunk <- raster::writeStop( chunk )
} else { if( blocks_remaining ){
chunk <- raster::setValues( chunk, as.vector( matrix_chunk ))
}}
chunk
}
get_block_data <- function(){
raster::getValues( input_raster, row = block$row[ i ], nrows = block$nrows[ i ] )
}
process_block_data <- function( v ){
f( v )
}
close_chunk_file <- function( a_chunked_raster ){
raster::writeStop( a_chunked_raster )
}
mat2ras <- function( mtrx, rstr ){
# Given a matrix and a raster, set the values of the raster
# using values from the matrix
# Return the raster with updated values
raster::setValues( rstr, as.vector( mtrx ))
}
and_finally <- function(){
raster::pbClose( progress_bar )
if( write_chunks_to_file ){
close_chunk_file( a_chunked_raster = chunk )
} else {
mat2ras( mtrx = as.numeric( matrix_chunk ), rstr = chunk )
}
}
# ============================================
for( i in 1:block$n ){
v <- get_block_data()
v <- process_block_data( v )
save_chunk()
if( write_chunks_to_file ){
chunk <- chunk_to_file()
} else {
matrix_chunk <- chunk_to_memory()
}
}
# ============================================
and_finally()
}
调用我们的新函数
process_large_raster( input_raster = large_raster, f = f1)
# GB
# available : 3.54
# 60% : 2.12
# needed : 0.05
# allowed : 4.66 (if available)
# class : RasterLayer
# dimensions : 1230, 1877, 2308710 (nrow, ncol, ncell)
# resolution : 0.0002695, 0.0002695 (x, y)
# extent : -121.9, -121.4, 37.85, 38.19 (xmin, xmax, ymin, ymax)
# crs : +proj=longlat +datum=WGS84 +no_defs
# source : memory
# names : layer
# values : 0.2824, 0.5955 (min, max)
process_large_raster( input_raster = large_raster, f = f2)
# GB
# available : 3.53
# 60% : 2.12
# needed : 0.05
# allowed : 4.66 (if available)
# class : RasterLayer
# dimensions : 1230, 1877, 2308710 (nrow, ncol, ncell)
# resolution : 0.0002695, 0.0002695 (x, y)
# extent : -121.9, -121.4, 37.85, 38.19 (xmin, xmax, ymin, ymax)
# crs : +proj=longlat +datum=WGS84 +no_defs
# source : memory
# names : layer
# values : 3.08, 3.355 (min, max)
process_large_raster( input_raster = large_raster, f = f3 )
# GB
# available : 2.97
# 60% : 1.78
# needed : 0.05
# allowed : 4.66 (if available)
# class : RasterLayer
# dimensions : 1230, 1877, 2308710 (nrow, ncol, ncell)
# resolution : 0.0002695, 0.0002695 (x, y)
# extent : -121.9, -121.4, 37.85, 38.19 (xmin, xmax, ymin, ymax)
# crs : +proj=longlat +datum=WGS84 +no_defs
# source : memory
# names : layer
# values : 0.000443, 0.00197 (min, max)
到目前为止,一切都很好。但事情即将出轨!
4.重新分类栅格
# Define the changes we want to make using reclassify()
reclassification_scheme = tibble::tribble(
~from, ~to, ~becomes
, 0 , 0.1, 5
, 0.1 , 0.2 , 15
, 0.2 , 0.3 , 25
, 0.3 , 0.4 , 35
, 0.4 , 0.5 , 45
, 0.5 , 0.6 , 55
, 0.6 , 0.7 , 65
, 0.7 , 0.8 , 75
, 0.8 , 0.9 , 85
, 0.9 , 1.0 , 95
)
# Attempt to apply this scheme in a block-by-block fashion
process_large_raster(
input_raster = large_raster
, f = function( x ) raster::reclassify( x, cfg$reclassification_scheme)
)
# GB
# available : 3.52
# 60% : 2.11
# needed : 0.05
# allowed : 4.66 (if available)
# Error in (function (classes, fdef, mtable) :
# unable to find an inherited method for function ‘reclassify’
# for signature ‘"numeric"’
如果您查看 raster 包的源代码,特别是 https://rdrr.io/cran/raster/src/R/reclassify.R,您会看到:
# Author: Robert J. Hijmans
# Date : June 2008
# Version 1.0
# Licence GPL v3
setMethod('reclassify', signature(x='Raster', rcl='ANY'),
function(x, rcl, filename='', include.lowest=FALSE, right=TRUE, ...) {
filename <- trim(filename)
...
这部分 signature(x='Raster', rcl='ANY') 说我们可以对栅格对象使用重新分类。然而,我们的逐块方案返回一个向量、矩阵、列表或其他东西,重新分类函数没有相应的签名。当我们将其中一个非光栅对象传递给 reclassify 时,它并不高兴。
在reclassify.R 的更下方,该函数已经被构造为逐块处理,因此 reclassify() 没有开箱即用地做得更好,这有点神秘:
for (i in 1:tr$n) {
vals <- getValues( x, row=tr$row[i], nrows=tr$nrows[i] )
vals <- .reclassify(vals, rcl, include.lowest, right, leftright, onlyNA, valNA)
if (nl > 1) {
vals <- matrix(vals, ncol=nl)
}
out <- writeValues(out, vals, tr$row[i])
pbStep(pb, i)
}
这条语句 .reclassify(vals, rcl, include.lowest, right, leftright, onlyNA, valNA) 正在调用 C++ 函数来完成实际工作。我们能否通过将 reclassify() 的业务部分放入我们的逐块函数中来获得任何加速?只有一种方法可以找出答案!
黑客
.reclassify() 函数是raster 包的内部函数,因此我们不能像调用任何其他函数一样调用它。相反,我们必须使用三冒号 (:::) 运算符。从已发布的包中访问内部函数通常是不是推荐,因为未导出的函数通常没有文档,因此包作者不能保证他们会继续做他们现在所做的事情。当软件包得到改进并且一些内部结构在没有警告的情况下发生变化时,我们的 hack 可能会在未来的某个时候崩溃。尽管如此,看看会发生什么还是很有趣的!
# A function that calls the internal function, .reclassify()
reclassify_hack <- function( x ){
raster:::.reclassify(
x
, as.matrix( reclassification_scheme )
, dolowest = FALSE
, doright = TRUE
, doleftright = FALSE
, NAonly = FALSE
, NAval = NA
)
}
process_large_raster( input_raster = large_raster, f = reclassify_hack )
# GB
# available : 2.92
# 60% : 1.75
# needed : 0.05
# allowed : 4.66 (if available)
# class : RasterLayer
# dimensions : 1230, 1877, 2308710 (nrow, ncol, ncell)
# resolution : 0.0002695, 0.0002695 (x, y)
# extent : -121.9, -121.4, 37.85, 38.19 (xmin, xmax, ymin, ymax)
# crs : +proj=longlat +datum=WGS84 +no_defs
# source : memory
# names : layer
# values : 5, 35 (min, max)
# It worked! (on this test file, anyway)
5. 并行计算
我尝试使用 clusterR(..reclassify..) 来使用更多内核并加快速度,但无论我设置了多少内核,我似乎都没有设法真正加快速度。
如http://homepage.divms.uiowa.edu/~luke/R/cluster/cluster.html所述:
- 通信比计算慢得多。
- 通信在这个简单的设计中被序列化。
- 一些 R 函数会产生较大的结果——在返回之前在计算节点上进行缩减。
也许并行计算与最佳块大小相结合会更好地工作。
一般的想法是以最小化网络和磁盘瓶颈的方式在处理器之间分配块(然后组合结果)。
如需这方面的帮助,请提供一个大型栅格示例。