【发布时间】:2013-12-02 18:59:49
【问题描述】:
我是这个论坛的新手,也是 R 的初学者,如果我的脚本/问题以令人困惑的方式编写,我深表歉意。 我有 3 年内来自大约 130 个不同站点的天气数据,我想填补这些空白。现在,我只为全球辐射这样做,但我还有四个变量。 我的桌子是这样的:
tbl <- read.table(text =
" Date.and.Time I glbRad I precipitation.mm.day I rel.hum... I wind.speed..m.s I temperature.
1 I 2010-01-01-01 00:00:00 I 0.6 I 0.1 I 99.6 I 1 I 2.3
2 I 2010-01-01-01 01:00:00 I 0.6 I 0 I 99.5 I 1 I 2.2 ",
sep = "I", header = TRUE)
如果间隔只持续一两个小时,我会取之前和之后的测量值的平均值。 如果间隔持续时间超过两个小时,我将使用最近的气象站的值,该气象站在所需时间段内拥有完整数据。我有一个表 distanzen.csv,其中包含第一列中气象站的 name_i 和相邻气象站的 name_j。相邻站点按距离排序。
neighbors <- read.table(header = TRUE, sep = "I",stringsAsFactors = FALSE,text =
"name_j I name_i I distance
1 I Ainersthofen I Edelshausen I 16.303
2 I Ainersthofen I Gablingen I 19.684")
一般来说,该脚本有效。但这太慢了。你知道我怎样才能加快速度吗?我知道我应该以某种方式摆脱循环,但我不太知道该怎么做。 此外,如果相邻站的日期完全丢失(整行丢失),我会收到错误“参数长度为零”。在这种情况下,我想选择第二近的邻居。
#reading data
file_path="F:/SkriptAktion/wetter_csv_spalten_richtig_Ortsnamen/"
setwd(file_path)
names <-list.files()
d =1
for (n in names){
table<-read.csv(paste(file_path,n, sep=""), sep=",", header=TRUE, stringsAsFactors=FALSE)
#change date format
date <- as.POSIXlt(table$Date.and.Time, tz="utc", format="%d.%m.%Y %H:%M")
table$Date.and.Time<-date
#add a column "gaps_radiation" where A) it says „ok“ if the value is not missing B) it says „MW“ if one or two subsequent values are missing C) it says the name of the neighbouring station if data of the neighbouring station has been used
# write „MW“ for all missing values
table$gaps_radiation <- character(nrow(table))
table$gaps_radiation<-lapply(table[,"glbRad"],function(x) ifelse (x!=".", "ok", "MW"))
#change global.radiation from character to numeric
table$glbRad <- as.numeric(table$glbRad)
# If the gap lasts only one or two hours, I take the average of the previous and the subsequent measurements.
#1h gap
for (i in 2:(length(table$glbRad)-1)){
if (table$gaps_radiation[i] == "MW" & table$gaps_radiation[i-1]=="ok" & table$gaps_radiation[i+1]=="ok"){
table$glbRad[i] <- (table$glbRad[i-1]+table$[i+1])/2
}else {
#if ((table$gaps_radiation[i] == "MW"){(table$gaps_radiation[i] == "MW"}
table$glbRad[i] <- table$glbRad[i]
}
}
#2h gap
for (i in 3:(length(table$glbRad)-1)){
if (table$gaps_radiation[i] == "MW"
& table$gaps_radiation[i-1] == "MW"
& table$gaps_radiation[i-2] == "ok"
& table$gaps_radiation[i+1]=="ok"){
table$glbRad[i] <- (table$glbRad[i-2]+table$glbRad[i+1])/2
table$glbRad[i-1] <- (table$glbRad[i-2]+table$glbRad[i+1])/2
}else {table$glbRad[i] <- table$glbRad[i]
}
}
# gaps in the beginning/end of table
# 1h gap
if (table$gaps_radiation[length(table$glbRad)]== "MW" & table$gaps_radiation[length(table$glbRad)-1]=="ok"){
table$glbRad[length(table$glbRad)] <- table$glbRad[length(table$glbRad)-1]
}else {table$glbRad[length(table$glbRad)] <- table$glbRad[length(table$glbRad)]
}
if (table$gaps_radiation[1]== "MW" & table$gaps_radiation[2]=="ok"){
table$glbRad[1] <- table$glbRad[2]
}else {table$glbRad[1] <- table$glbRad[1]
}
# 2h gap
if (table$gaps_radiation[length(table$glbRad)]== "MW" & table$gaps_radiation[length(table$glbRad)-1] == "MW" & table$gaps_radiation[length(table$glbRad)-2]=="ok"){
table$glbRad[length(table$glbRad)] <- table$glbRad[length(table$glbRad)-2]
table$glbRad[length(table$glbRad)-1] <- table$glbRad[length(table$glbRad)-2]
}else {table$glbRad[length(table$glbRad)] <- table$glbRad[length(table$glbRad)]
table$glbRad[length(table$glbRad)-1] <- table$glbRad[length(table$glbRad)-1]
}
if (table$gaps_radiation[1]== "MW" & table$gaps_radiation[2] == "MW"& table$gaps_radiation[3]=="ok"){
table$glbRad[1] <- table$glbRad[3]
table$glbRad[2] <- table$glbRad[3]
}else {table$glbRad[1] <- table$glbRad[1]
table$glbRad[2] <- table$glbRad[2]
}
#gaps > 2h
mis_dates <- table[(is.na(table$glbRad)),"Date.and.Time"]
if (length(mis_dates)>=1){
neighbours <- read.csv(file="F:/SkriptAktion/distanzen.csv", header=TRUE, sep=",", dec=".", fill=TRUE, stringsAsFactors=FALSE)
tab1 <- read.csv(file=paste(file_path, neighbours$name_j[d*130+1], ".csv", sep=""), sep=",", header=TRUE, stringsAsFactors=FALSE)
tab1$Date.and.Time <- as.POSIXlt(tab1$Date.and.Time, tz="utc",format="%d.%m.%Y %H:%M")
tab1$glbRad <- as.numeric(tab1$glbRad)
for (i in 1:length(mis_dates)){
table[table$Date.and.Time == mis_dates[i], "glbRad"] <- tab1[tab1$Date.and.Time == mis_dates[i], "glbRad"]
table[table$Date.and.Time == mis_dates[i],"gaps_radiation"] <- neighbours$name_j[d*130+1]}
if (nrow(table[is.na(table$glbRad),])>0) {
tab1 <- read.csv(file=paste(file_path, neighbours$name_j[d*130+2], ".csv", sep=""), sep=",", header=TRUE, stringsAsFactors=FALSE)
tab1$Date.and.Time <- as.POSIXlt(tab1$Date.and.Time, tz="utc",format="%d.%m.%Y %H:%M:%S")
for (i in 1:length(mis_dates)){
table[table$Date.and.Time == mis_dates[i], "glbRad"] <- as.numeric(tab1[tab1$Date.and.Time == mis_dates[i], "glbRad"])
table[table$Date.and.Time == mis_dates[i],"gaps_radiation"] <- neighbours$name_j[d*130+2]}
}else {table <- table}
if (nrow(table[is.na(table$glbRad),])>0) {
tab1 <- read.csv(file=paste(file_path, neighbours$name_j[d*130+3], ".csv", sep=""), sep=",", header=TRUE, stringsAsFactors=FALSE)
tab1$Date.and.Time <- as.POSIXlt(tab1$Date.and.Time, tz="utc",format="%d.%m.%Y %H:%M:%S")
for (i in 1:length(mis_dates)){
table[table$Date.and.Time == mis_dates[i], "glbRad"] <- tab1[tab1$Date.and.Time == mis_dates[i], "glbRad"]
table[table$Date.and.Time == mis_dates[i],"gaps_radiation"] <- neighbours$name_j[d*130+3]}
}else {write.table(table,paste("F:/SkriptAktion/Lueckenfueller_radiation/", n, sep=""),sep=",", row.names=FALSE, col.names=TRUE, na="")}
if (nrow(table[is.na(table$glbRad),])>0) {
write.table(table,paste("F:/SkriptAktion/Lueckenfueller_radiation/", "lueckig", n, sep=""),sep=",", row.names=FALSE, col.names=TRUE, na="")
}else {table <- table}
}else {write.table(table,paste("F:/SkriptAktion/Lueckenfueller_radiation/", n, sep=""),sep=",", row.names=FALSE, col.names=TRUE, na="")}
d<- d+1
}
【问题讨论】:
-
首先使用
&&而不是&以减少所需的比较次数。请参阅?Logic了解更多信息。正如所写,else什么都不做,所以摆脱它。 -
为什么在脚本的第 7 行有 n=names?此外,这段代码过于冗长,并且在 R 中使用了许多内置函数,例如名称和表格,这使得代码难以阅读。您能否重命名 col 名称并使它们更短,例如替换“global.radiation..W.qm”。与“glbRad”
-
抱歉,n=names 不应该在那里。
标签: r performance loops