大于内存操作：空间连接与 R答案

【问题标题】：Larger than memory operations: Spatial joins with R大于内存操作：空间连接与 R
【发布时间】：2018-12-24 12:06:26
【问题描述】：

我有 3 亿个点想要与 6000 万个多边形相交。这两者的组合比我可以轻松放入机器内存的容量更大。我已经推出了一个解决方案，我将每个数据集加载到 PostGIS 中，对每个数据集执行空间索引，然后执行空间连接。

在 PostGIS 中看起来像：

SELECT pts.*, grid.gridID
into test_join
FROM pts, grid
WHERE ST_Contains( grid.geometry, pts.geometry);

pts（3 亿点）上的空间索引大约需要 90 分钟。然后上面的加入大约需要 190 分钟。

我以前从未使用R 处理过大于 RAM 的空间数据。

有没有办法使用R 中的sf 包来处理这种规模的数据？
存在哪些加快此操作的策略？
我应该考虑其他工具或方法吗？

我更喜欢使用开源工具（R、PostGIS、Python 等）。但我并不致力于任何特定的工具链。

其他数据 似乎我没有说明具体的解决方案造成了混乱。我最初没有提供任何语法或示例的原因是我不拘泥于特定平台。我对使用任何开源堆栈的想法持开放态度。正如标题所说，我在文中重申，这里的问题是规模，而不是解决一个琐碎示例的语法。

这是一个非常具体的解决方案，使用 R 中的 sf 包解决。下面的示例适用于 500 平方公里和 1000 个随机点的美国网格。我想将其扩展到 1 公里以下的网格和 300,000,000 个点。我根本不关心绘图，但我在下面绘制了一些东西仅用于说明。

library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tidyverse)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source'))`

# size of squares in projection units (in this case meters)
grid_size <- 500000
num_pts <- 1000   # number of points to join

data(us_states) # loads the us_states shape

all_states <-
  us_states %>%
  # st_sf() %>%
  st_transform(102003) %>% # project to a meters based projection
  st_combine   %>% #flattens the shape file to one big outline (no states)
  st_buffer(10000) # add a 10k buffer

#a nice outter buffer of the usa lower 48
ggplot() +
  geom_sf(data = all_states)

## let's put a grid over the whole US
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin

cell_dim <-
  c(ceiling(xrange / grid_size),
    ceiling(yrange / grid_size)) # dimension of polygons necessary

full_us_grid <-
  st_make_grid(all_states, square = TRUE, n = cell_dim) %>%
  st_intersection(all_states) %>% # only the inside part
  st_sf() %>%
  mutate(grid_id = 1:n())

ggplot() +
  geom_sf(data = full_us_grid)

## now let's create some random points
random_pts <- data.frame(
  point_id = 1:num_pts,
  lat = runif(num_pts, 30, 50),
  lon = runif(num_pts, -117, -78)
) %>%
  # these are in degrees so need crs in same
  st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
  st_transform(102003)  # transform into our metric crs

ggplot() +
  geom_sf(data = full_us_grid) +
  geom_sf(data = random_pts)

## here is the spatial join!!
joined_data <-
  full_us_grid %>%
  st_join(random_pts)

## this is the mapping from grid_id to point_id
joined_data %>%
  st_set_geometry(NULL) %>%
  na.omit()  %>%
  head
#>     grid_id point_id
#> 7         7       26
#> 7.1       7      322
#> 7.2       7      516
#> 7.3       7      561
#> 7.4       7      641
#> 7.5       7      680

^{由reprex package (v0.2.1) 于 2018 年 12 月 24 日创建}

【问题讨论】：

你能提供一些包含几个点和多边形的示例数据吗？

标签： r postgis sf

【解决方案1】：

在这种特殊情况下（找出哪些点位于矩形单元格内）您可以通过以下方式提高速度并减少内存需求使用 createTree 包中的函数 SearchTrees 构建四叉树和然后使用其rectLookup 函数查找单元格中的点。这样你既可以节省内存（无需构建多边形网格），又可以增加自构建 QuadTreee rectLookup 以来的速度非常快，因为它大大减少了要进行的坐标比较的数量。例如：

library(sf)
library(spData)
library(SearchTrees)
library(data.table)
library(ggplot2)

data(us_states) # loads the us_states shape
all_states <-
  us_states %>%
  # st_sf() %>%
  st_transform(102003) %>% # project to a meters based projection
  st_combine()   %>% #flattens the shape file to one big outline (no states)
  st_buffer(10000) # add a 10k buffer

# define the grid - no need to create a polygon grid, which is memory intensinve
# for small grids. Just get the bbox, compute number of cells and assign a unique
# index to each
#
grid_size <- 500000
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
  c(ceiling(xrange / grid_size),
    ceiling(yrange / grid_size))
n_cells <- cell_dim[1] * cell_dim[2]
ind_rows <- ceiling(1:n_cells / cell_dim[1])
ind_cols <- (1:n_cells) - (ind_rows - 1) * cell_dim[1]
cell_indexes <- data.frame(grid_id = 1:n_cells,
                           ind_row = ind_rows,
                           ind_col = ind_cols,
                           stringsAsFactors = FALSE)

## now let's create some random points - Here I build the points directly in 
## 102003 projection for speed reasons because st_transform() does not scale 
## very well with number of points. If your points are in 4326 you may consider 
## transforming them beforehand and store the results in a RData or gpkg or 
## shapefile. I also avoid creating a `sf` object to save memory: a plain x-y-id 
## data.table suffices
set.seed(1234)
t1 <- Sys.time()
num_pts <- 3000
random_pts <- data.table::data.table(
  point_id = 1:num_pts,
  lon = runif(num_pts, state_box$xmin, state_box$xmax),
  lat = runif(num_pts, state_box$ymin, state_box$ymax)
)

# Build a Quadtree over the points.
qtree <- SearchTrees::createTree(random_pts, columns = c(2,3))

# Define a function which uses `SearchTrees::rectLookup` to find points within 
# a given grid cell. Also deal with "corner cases": cells outside all_states and
# cells only partially within all_states.

find_points <- function(cell, qtree, random_pts, state_box, all_states, grid_size, cell_indexes) {

  cur_xmin <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell] - 1)
  cur_xmax <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell])
  cur_ymin <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell] - 1)
  cur_ymax <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell])

  cur_bbox <- sf::st_bbox(c(xmin = cur_xmin, xmax = cur_xmax,
                            ymin = cur_ymin, ymax = cur_ymax),
                          crs = sf::st_crs(all_states)) %>%
    sf::st_as_sfc()

  # look for contained points only if the cell intersects with the all_states poly
  if (lengths(sf::st_intersects(cur_bbox, all_states)) != 0) {

    if (lengths(sf::st_contains(all_states, cur_bbox)) != 0) {
      # If cell completely contained, use `rectLookup` to find contained points
      pts <-  SearchTrees::rectLookup(
        qtree,
        xlims = c(cur_xmin, cur_xmax),
        ylims = c(cur_ymin, cur_ymax))


    } else {
      # If cell intersects, but is not completely contained (i.e., on borders),
      # limit the rectLookup to the bbox of intersection to speed-up, then find
      # points properly contained
      cur_bbox <- sf::st_bbox(sf::st_intersection(all_states, cur_bbox))
      pts <-  SearchTrees::rectLookup(
        qtree,
        xlims = c(cur_bbox[["xmin"]], cur_bbox[["xmax"]]),
        ylims = c(cur_bbox[["ymin"]], cur_bbox[["ymax"]]))
      # now we should have "few" points - we can use sf operators - here st_contains
      # is much faster than an intersect. This should be fast even over large
      # number of points if the cells are small
      contained_pts <- sf::st_contains(
        all_states,
        sf::st_as_sf(random_pts[pts,],
                     coords = c("lon", "lat"),
                     crs = sf::st_crs(all_states)))[[1]]
      pts  <- random_pts[pts[contained_pts],][["point_id"]]
    }
    if (length(pts) == 0 ) {
      pts <- as.numeric(NA)
    } else {
      pts  <- random_pts$point_id[pts]
    }
  } else {
    pts <- as.numeric(NA)
  }
  out <- data.table::data.table(
    grid_id  = cell_indexes$grid_id[cell],
    point_id = pts)
  return(out)
}

让我们看看它是否有效：

# Run the function through a `lapply` over grid cells

out <- lapply(1:n_cells, FUN = function(x)
  find_points(x, qtree, random_pts, state_box, all_states, grid_size,cell_indexes))
out <- data.table::rbindlist(out)
out
#>       grid_id point_id
#>    1:       1       NA
#>    2:       2       NA
#>    3:       3       NA
#>    4:       4      325
#>    5:       4     1715
#>   ---                 
#> 1841:      59     1058
#> 1842:      60      899
#> 1843:      60     2044
#> 1844:      60      556
#> 1845:      60     2420

grd <- sf::st_make_grid(all_states, cellsize = 500000) %>% 
  sf::st_sf() %>% 
  dplyr::mutate(grid_id = 1:60)

id_sub = c(5, 23)
sub_pts <- out[grid_id %in% id_sub]
sub_pts <- dplyr::left_join(sub_pts, random_pts) %>%
  sf::st_as_sf(coords = c("lon", "lat"), crs = st_crs(all_states))
#> Joining, by = "point_id"

ggplot2::ggplot(data = grd) +
  geom_sf(data = grd, fill = "transparent") +
  geom_sf_text(aes(label = grid_id)) +
  geom_sf(data = all_states, fill = "transparent") +
  geom_sf(data = sub_pts)

根据我的（有限的）经验，这应该可以很好地扩展点/单元，并且具有相当低的内存占用。此外，它很容易并行化（假设你有足够的内存）。

如果您仍然无法在完整数据集上运行它（我无法测试它在我的笔记本电脑上），您还可以通过分析中的点来“拆分”执行 “块”（例如，将它们保存到 shp/gpkg，然后只读取部分点使用query 参数，或保存为由 lon 排序的表并阅读前 XX 行 - 这可以让您进一步了解如果您过滤经度/纬度，则加速，因为这样您也可以自动减少要分析的细胞数量，并节省大量时间。

【讨论】：

这是一个巨大的帮助。我从未听说过 SearchTrees。谢谢！

【解决方案2】：

尝试使用以下链接中描述的云解决方案：

https://blog.sicara.com/speedup-r-rstudio-parallel-cloud-performance-aws-96d25c1b13e2

【讨论】：

对不起，但这解决了我所说的问题中的零。但它确实提供了更高的复杂性和更多的障碍。