我查看了数据,问题出在合并/缺失的单元格上。您需要决定如何处理它们。
理想情况下,您应该让数据所有者不使用表格中的合并单元格,并且让表格中的所有行都具有相同的长度(列数)。
另一个选项似乎是删除包含它们的行,如 [link][1]。鉴于那里有有用的信息,我建议您决定如何填补空白并编写自定义函数来处理这个问题。
以下方法虽然不够理想,但可以处理当前格式的表格。它假设所有文件都具有相同的格式 - 考虑到这些数据的数学性质,这很有可能成立。
我根据表号对各个表采用不同的处理方式,以确保每行的长度相等。有时我也会将数据移动到标题中。这其实只是一个起点,您可以在此基础上决定如何清理输入。
将其重构为使用 tidyverse 函数和用户函数调用而不是当前循环处理会更具 R 风格。
library(rlang)
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Count the cells (<th> or <td>) found in the third <tr> of an HTML table node.
# The caller uses this count as the number of columns for the table.
#
# table: a node/nodeset accepted by rvest::html_nodes().
# Returns: a single integer, the number of matching cells.
get_max_cols <- function(table) {
  third_row_cells <- html_nodes(
    table,
    "tr:nth-of-type(3) th,tr:nth-of-type(3) td"
  )
  length(third_row_cells)
}
# Extract the text of every node under `target` that matches `css_selector`,
# with leading and trailing whitespace removed.
#
# target:       a node/nodeset accepted by rvest::html_nodes().
# css_selector: CSS selector string choosing the cells to read.
# Returns: a character vector with one trimmed string per matched node.
get_row <- function(target, css_selector) {
  cells <- html_nodes(target, css_selector)
  trimws(html_text(cells))
}
# Read the saved HTML report and print each <table> in it as a data frame.
# Tables 2 and 5 are rebuilt cell by cell so that every row has the same
# length; table 1 takes its column names from its own first row; all other
# tables are parsed directly with html_table().
path <- "FX17_SFpanel_subsc_PC_RFCnbtopt_nbpredoptim_accass.html"
page <- read_html(path)

# Column names and first data row substituted into the rebuilt tables
# (tables 2 and 5) in place of their original first two rows.
headers <- c("", "Reference class", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "")
first_row <- c("", "(1)", "(2)", "(3)", "(4)", "(5)", "(6)", "(7)", "(8)", "(9)", "(10)", "(11)", "(12)", "(13)", "(14)", "(15)", "(16)", "(17)", "(18)", "(19)", "(20)", "(21)", "Sum")

tables <- page %>% html_nodes("table")

for (i in seq_along(tables)) {
  table <- tables[[i]]
  # Plain local assignment; super-assignment (<<-) is not needed here.
  fix_flag <- i %in% c(2, 5)

  if (fix_flag) {
    max_cols <- get_max_cols(table)
    rows <- table %>% html_nodes("tr")
    num_rows <- length(rows)

    # One output row per source row except the first, whose role is taken
    # over by `headers`.
    temp_table <- data.frame(
      matrix(NA, nrow = num_rows - 1L, ncol = max_cols)
    )
    temp_table <- setNames(temp_table, headers)

    # Source row 2 is replaced by the fixed `first_row`; every later row is
    # read cell by cell from the HTML.
    for (r in seq_along(rows)[-1]) {
      temp_table[r - 1, ] <- if (r == 2) {
        first_row
      } else {
        get_row(rows[[r]], "th, td")
      }
    }
  } else {
    # `TRUE` is the R logical literal; `True` is an undefined object and
    # would abort with "object 'True' not found".
    temp_table <- table %>% html_table(fill = TRUE)

    if (i == 1) {
      # Promote the first row's cell text to column names, then drop that row.
      temp_table <- setNames(
        temp_table,
        get_row(table, "tr:nth-of-type(1) th, tr:nth-of-type(1) td")
      )
      temp_table <- temp_table[-1, ]
    }
  }

  print(temp_table)
}
#> Class ID 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#> 2 Class Names Reference 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#> 3 Class Names Prediction 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
#> Reference class
#> 1 (1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13)
#> 2 (1) 1 54 0 0 0 0 4 0 0 0 0 0 14 0
#> 3 (2) 2 3 19 0 0 0 0 0 0 0 0 0 6 0
#> 4 (3) 3 0 0 30 0 0 0 0 0 0 0 0 0 0
#> 5 (4) 4 0 0 0 19 0 0 0 0 0 0 0 0 0
#> 6 (5) 5 0 0 0 0 62 10 4 0 0 0 0 0 0
#> 7 (6) 6 0 0 0 0 11 73 0 0 0 0 0 0 0
#> 8 (7) 7 0 0 0 0 0 4 65 0 0 0 0 3 0
#> 9 (8) 8 0 0 0 0 0 0 0 12 0 0 0 0 0
#> 10 (9) 9 0 0 0 0 0 2 0 0 19 0 0 0 0
#> 11 (10) 10 3 0 0 0 0 0 0 0 0 89 0 1 0
#> 12 (11) 11 0 0 0 0 0 0 0 0 0 0 128 0 0
#> 13 (12) 12 39 3 0 0 0 0 0 0 0 0 0 311 0
#> 14 (13) 13 0 0 0 2 0 0 0 0 0 0 0 0 1056
#> 15 (14) 14 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 16 (15) 15 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 17 (16) 16 3 8 0 10 0 0 0 0 0 0 0 4 0
#> 18 (17) 17 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 19 (18) 18 2 0 0 0 0 0 0 0 0 0 0 2 0
#> 20 (19) 19 0 1 0 0 0 0 0 0 0 0 0 0 0
#> 21 (20) 20 0 0 0 3 0 0 0 0 0 0 0 0 0
#> 22 (21) 21 0 0 0 0 0 0 0 0 0 0 0 0 0
#> 23 Sum 104 31 30 34 73 93 69 12 19 89 128 341 1056
#>
#> 1 (14) (15) (16) (17) (18) (19) (20) (21) Sum
#> 2 0 0 2 0 1 2 0 0 77
#> 3 0 0 7 0 0 1 0 0 36
#> 4 0 0 0 0 0 0 0 0 30
#> 5 0 0 10 0 0 0 0 0 29
#> 6 0 0 0 0 0 0 0 0 76
#> 7 0 0 0 0 0 0 0 0 84
#> 8 0 0 0 0 0 0 0 0 72
#> 9 0 0 0 0 0 0 0 0 12
#> 10 0 0 0 0 0 0 0 0 21
#> 11 0 0 0 2 0 0 0 0 95
#> 12 0 0 0 0 0 0 0 0 128
#> 13 0 0 0 0 1 0 0 10 364
#> 14 1 0 0 0 0 0 0 0 1059
#> 15 100 0 0 0 0 0 0 0 100
#> 16 0 27 4 6 0 0 0 0 37
#> 17 0 0 109 0 2 7 3 1 147
#> 18 0 5 0 33 0 0 0 0 38
#> 19 0 0 0 0 8 0 0 0 12
#> 20 0 0 2 0 0 2 0 0 5
#> 21 0 1 0 0 0 0 24 0 28
#> 22 0 0 0 0 0 0 0 0 0
#> 23 101 33 134 41 12 12 27 11 2450
#> Measure Estimate [%] 95 % Confidence Interval [%]
#> 1 Overall Accuracy 91.43 90.0
#> 2 Kappa Accuracy 88.99 87.61
#> 3 Mean F1 Accuracy 77.60 -
#> 95 % Confidence Interval [%]
#> 1 92.37
#> 2 90.37
#> 3 -
#> User's Accuracy [%] User's Accuracy [%] User's Accuracy [%]
#> 1 Map class Estimate 95 % Interval 95 % Interval
#> 2 (1) 1 70.13 68.15 72.11
#> 3 (2) 2 52.78 50.85 54.71
#> 4 (3) 3 100.0 100.0 100.0
#> 5 (4) 4 65.52 63.55 67.48
#> 6 (5) 5 81.58 80.16 83.0
#> 7 (6) 6 86.9 85.28 88.53
#> 8 (7) 7 90.28 89.35 91.2
#> 9 (8) 8 100.0 100.0 100.0
#> 10 (9) 9 90.48 90.48 90.48
#> 11 (10) 10 93.68 93.68 93.68
#> 12 (11) 11 100.0 100.0 100.0
#> 13 (12) 12 85.44 84.32 86.56
#> 14 (13) 13 99.72 99.72 99.72
#> 15 (14) 14 100.0 99.61 100.39
#> 16 (15) 15 72.97 71.45 74.5
#> 17 (16) 16 74.15 72.61 75.69
#> 18 (17) 17 86.84 85.27 88.41
#> 19 (18) 18 66.67 64.8 68.53
#> 20 (19) 19 40.0 38.52 41.48
#> 21 (20) 20 85.71 84.47 86.96
#> 22 (21) 21 0.0 0.0 0.0
#> Producer's Accuracy [%] Producer's Accuracy [%] Producer's Accuracy [%]
#> 1 Estimate 95% Interval 95% Interval
#> 2 51.92 42.73 61.12
#> 3 61.29 47.74 74.84
#> 4 100.0 100.0 100.0
#> 5 55.88 40.82 70.95
#> 6 84.93 77.3 92.57
#> 7 78.49 72.18 84.81
#> 8 94.2 87.77 100.63
#> 9 100.0 100.0 100.0
#> 10 100.0 88.19 111.81
#> 11 100.0 95.34 104.66
#> 12 100.0 100.0 100.0
#> 13 91.2 88.69 93.71
#> 14 100.0 99.69 100.31
#> 15 99.01 99.01 99.01
#> 16 81.82 69.79 93.84
#> 17 81.34 75.88 86.8
#> 18 80.49 71.1 89.87
#> 19 66.67 43.27 90.06
#> 20 16.67 -23.88 57.22
#> 21 88.89 77.23 100.55
#> 22 0.0 nan nan
#> F1 Accuracy F1 Accuracy F1 Accuracy
#> 1 Estimate 95% Interval 95% Interval
#> 2 59.67 56.93 62.4
#> 3 56.72 54.02 59.42
#> 4 100.0 100.0 100.0
#> 5 60.32 57.56 63.07
#> 6 83.22 81.25 85.19
#> 7 82.49 80.26 84.71
#> 8 92.2 90.9 93.49
#> 9 100.0 100.0 100.0
#> 10 95.0 95.0 95.0
#> 11 96.74 96.74 96.74
#> 12 100.0 100.0 100.0
#> 13 88.23 86.78 89.67
#> 14 99.86 99.86 99.86
#> 15 99.5 99.5 99.5
#> 16 77.14 75.0 79.29
#> 17 77.58 75.48 79.68
#> 18 83.54 81.36 85.73
#> 19 66.67 64.04 69.3
#> 20 23.53 21.44 25.61
#> 21 87.27 85.52 89.02
#> 22 0.0 0.0 0.0
#> Reference class
#> 1 (1) (2) (3) (4) (5) (6) (7) (8)
#> 2 (1) 1 0.022 0.0 0.0 0.0 0.0 0.0016 0.0 0.0
#> 3 (2) 2 0.0012 0.0078 0.0 0.0 0.0 0.0 0.0 0.0
#> 4 (3) 3 0.0 0.0 0.0122 0.0 0.0 0.0 0.0 0.0
#> 5 (4) 4 0.0 0.0 0.0 0.0078 0.0 0.0 0.0 0.0
#> 6 (5) 5 0.0 0.0 0.0 0.0 0.0253 0.0041 0.0016 0.0
#> 7 (6) 6 0.0 0.0 0.0 0.0 0.0045 0.0298 0.0 0.0
#> 8 (7) 7 0.0 0.0 0.0 0.0 0.0 0.0016 0.0265 0.0
#> 9 (8) 8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0049
#> 10 (9) 9 0.0 0.0 0.0 0.0 0.0 0.0008 0.0 0.0
#> 11 (10) 10 0.0012 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 12 (11) 11 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 13 (12) 12 0.0159 0.0012 0.0 0.0 0.0 0.0 0.0 0.0
#> 14 (13) 13 0.0 0.0 0.0 0.0008 0.0 0.0 0.0 0.0
#> 15 (14) 14 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 16 (15) 15 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 17 (16) 16 0.0012 0.0033 0.0 0.0041 0.0 0.0 0.0 0.0
#> 18 (17) 17 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 19 (18) 18 0.0008 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 20 (19) 19 0.0 0.0004 0.0 0.0 0.0 0.0 0.0 0.0
#> 21 (20) 20 0.0 0.0 0.0 0.0012 0.0 0.0 0.0 0.0
#> 22 (21) 21 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 23 Sum 0.0424 0.0127 0.0122 0.0139 0.0298 0.038 0.0282 0.0049
#>
#> 1 (9) (10) (11) (12) (13) (14) (15) (16) (17) (18) (19)
#> 2 0.0 0.0 0.0 0.0057 0.0 0.0 0.0 0.0008 0.0 0.0004 0.0008
#> 3 0.0 0.0 0.0 0.0024 0.0 0.0 0.0 0.0029 0.0 0.0 0.0004
#> 4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0041 0.0 0.0 0.0
#> 6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 8 0.0 0.0 0.0 0.0012 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 9 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 10 0.0078 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 11 0.0 0.0363 0.0 0.0004 0.0 0.0 0.0 0.0 0.0008 0.0 0.0
#> 12 0.0 0.0 0.0522 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 13 0.0 0.0 0.0 0.1269 0.0 0.0 0.0 0.0 0.0 0.0004 0.0
#> 14 0.0 0.0 0.0 0.0 0.431 0.0004 0.0 0.0 0.0 0.0 0.0
#> 15 0.0 0.0 0.0 0.0 0.0 0.0408 0.0 0.0 0.0 0.0 0.0
#> 16 0.0 0.0 0.0 0.0 0.0 0.0 0.011 0.0016 0.0024 0.0 0.0
#> 17 0.0 0.0 0.0 0.0016 0.0 0.0 0.0 0.0445 0.0 0.0008 0.0029
#> 18 0.0 0.0 0.0 0.0 0.0 0.0 0.002 0.0 0.0135 0.0 0.0
#> 19 0.0 0.0 0.0 0.0008 0.0 0.0 0.0 0.0 0.0 0.0033 0.0
#> 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0008 0.0 0.0 0.0008
#> 21 0.0 0.0 0.0 0.0 0.0 0.0 0.0004 0.0 0.0 0.0 0.0
#> 22 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
#> 23 0.0078 0.0363 0.0522 0.1392 0.431 0.0412 0.0135 0.0547 0.0167 0.0049 0.0049
#>
#> 1 (20) (21) Sum
#> 2 0.0 0.0 0.0314
#> 3 0.0 0.0 0.0147
#> 4 0.0 0.0 0.0122
#> 5 0.0 0.0 0.0118
#> 6 0.0 0.0 0.031
#> 7 0.0 0.0 0.0343
#> 8 0.0 0.0 0.0294
#> 9 0.0 0.0 0.0049
#> 10 0.0 0.0 0.0086
#> 11 0.0 0.0 0.0388
#> 12 0.0 0.0 0.0522
#> 13 0.0 0.0041 0.1486
#> 14 0.0 0.0 0.4322
#> 15 0.0 0.0 0.0408
#> 16 0.0 0.0 0.0151
#> 17 0.0012 0.0004 0.06
#> 18 0.0 0.0 0.0155
#> 19 0.0 0.0 0.0049
#> 20 0.0 0.0 0.002
#> 21 0.0098 0.0 0.0114
#> 22 0.0 0.0 0.0
#> 23 0.011 0.0045 1.0
#> Proportion Proportion Proportion Area [px] Area [px]
#> 1 Map class Estimate 95 % Interval 95 % Interval Estimate 95 % Interval
#> 2 (1) 1 0.0314 0.0247 0.0382 0.0 0.0
#> 3 (2) 2 0.0147 0.0112 0.0182 0.0 0.0
#> 4 (3) 3 0.0122 0.0122 0.0122 0.0 0.0
#> 5 (4) 4 0.0118 0.008 0.0157 0.0 0.0
#> 6 (5) 5 0.031 0.0275 0.0345 0.0 0.0
#> 7 (6) 6 0.0343 0.0296 0.039 0.0 0.0
#> 8 (7) 7 0.0294 0.0272 0.0316 0.0 0.0
#> 9 (8) 8 0.0049 0.0049 0.0049 0.0 0.0
#> 10 (9) 9 0.0086 0.0086 0.0086 0.0 0.0
#> 11 (10) 10 0.0388 0.0388 0.0388 0.0 0.0
#> 12 (11) 11 0.0522 0.0522 0.0522 0.0 0.0
#> 13 (12) 12 0.1486 0.1427 0.1545 0.0 0.0
#> 14 (13) 13 0.4322 0.4322 0.4322 0.0 0.0
#> 15 (14) 14 0.0408 0.0397 0.0419 0.0 0.0
#> 16 (15) 15 0.0151 0.0125 0.0177 0.0 0.0
#> 17 (16) 16 0.06 0.0549 0.0651 0.0 0.0
#> 18 (17) 17 0.0155 0.0126 0.0185 0.0 0.0
#> 19 (18) 18 0.0049 0.0028 0.007 0.0 0.0
#> 20 (19) 19 0.002 -0.0007 0.0048 0.0 -0.0
#> 21 (20) 20 0.0114 0.0095 0.0133 0.0 0.0
#> 22 (21) 21 0.0 -0.0026 0.0026 0.0 -0.0
#> Area [px]
#> 1 95 % Interval
#> 2 0.0
#> 3 0.0
#> 4 0.0
#> 5 0.0
#> 6 0.0
#> 7 0.0
#> 8 0.0
#> 9 0.0
#> 10 0.0
#> 11 0.0
#> 12 0.0
#> 13 0.0
#> 14 0.0
#> 15 0.0
#> 16 0.0
#> 17 0.0
#> 18 0.0
#> 19 0.0
#> 20 0.0
#> 21 0.0
#> 22 0.0
由reprex package (v0.3.0) 于 2021-03-09 创建