【问题标题】:How to find a day with maximum outliers(如何找到异常值最多的一天)
【发布时间】:2021-03-09 23:25:20
【问题描述】:

我有包含几天数据的时间序列。我需要找到异常值最多的一天并仅绘制这一天的数据。

我是怎么做的:

# Generate sample data ----
# One Poisson-distributed reading every 15 seconds, covering 4 days and
# 5 hours from the start timestamp.

library(tibble)

Sys.setlocale("LC_ALL", "English")

start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15 # step of the sequence, in seconds
end <- start + as.difftime(4, units = "days") + as.difftime(5, units = "hours")

DateTimes <- seq(from = start, by = interval, to = end)

# Draw exactly one value per timestamp so the two columns can never get out
# of sync if the start, end or interval above is changed.
Values <- rpois(length(DateTimes), lambda = 75)

cpu_df <- tibble(datetime = DateTimes, Value = Values)


# Find and plot outliers of all days ----

# ggplot2 has to be attached before the first ggplot() call; ggpubr (which
# would also bring it in) is only loaded at the very end of the script.
library(ggplot2)

# Observations above the 97.5th percentile of the whole series are treated
# as outliers.
upper_bound <- quantile(cpu_df$Value, 0.975)
outlier_ind <- which(cpu_df$Value > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]

# All points in green, outliers over-plotted in red, threshold as a dashed
# horizontal line.
alldays_plot <- ggplot(data = cpu_df, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = cpu_df_susp, color = "red", size = 1) +
  geom_hline(yintercept = upper_bound, linetype = "dashed", color = "red") +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(upper_bound, 2)
    )
  )


# Convert to xts and split into days ----
suppressMessages(library(xts))

cpu_df_xts <- xts(x = cpu_df$Value, order.by = cpu_df$datetime)

days <- split(cpu_df_xts, f = "days")
stopifnot(length(days) > 0)

# Find the worst day: the one with the biggest number of outliers ----

# Positions of the observations in one day that lie above that day's own
# 97.5th percentile.
find_day_outliers <- function(day) {
  values <- as.numeric(coredata(day))
  which(values > quantile(values, 0.975))
}

outliers_by_day <- lapply(days, find_day_outliers)
outlier_counts <- lengths(outliers_by_day)

# which.max() returns the first maximum, so ties resolve to the earliest day,
# and a day is still selected when no day has any outlier.
worstday_index <- which.max(outlier_counts)
outliers_number <- outlier_counts[[worstday_index]]
worst_day_outliers_ind <- outliers_by_day[[worstday_index]]

WorstDay <- days[[worstday_index]]

# Leave upper_bound holding the threshold of the selected day, so later code
# that reads it shows the bound the worst day's outliers were judged against.
upper_bound <- quantile(coredata(WorstDay), 0.975)

# Outliers of the worst day ----

worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]

# Convert xts back to tibbles for plotting ----
# coredata() keeps the matrix dimensions of the xts data; as.numeric() drops
# them so that Value is an ordinary numeric column, not a matrix column.

WorstDayTibble <- tibble(
  datetime = index(WorstDay),
  Value = as.numeric(coredata(WorstDay))
)

outliersTibble <- tibble(
  datetime = index(worst_day_outliers),
  Value = as.numeric(coredata(worst_day_outliers))
)

# Plot worst day ----

# The worst day's outliers were selected against that day's own 97.5th
# percentile, so recompute the bound from the day's values here rather than
# reusing whatever value an earlier step left in upper_bound.
worst_day_bound <- quantile(as.numeric(WorstDayTibble$Value), 0.975)

worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
  geom_point(size = 0.9, color = "darkgreen") +
  geom_point(data = outliersTibble, color = "red", size = 1) +
  geom_hline(
    yintercept = worst_day_bound, linetype = "dashed", color = "red"
  ) +
  theme_bw() +
  labs(
    x = "",
    title = paste0(
      "% Processor Time, _Total, Percentile: 0.975, Threshold: ",
      round(worst_day_bound, 2)
    )
  )

# Show the all-days plot and the worst-day plot together.
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)

结果如下:

我不喜欢我的代码:为了把数据按天拆分并逐天搜索,我需要先将其转换为 xts;而要通过 ggplot2 绘制数据,我又必须将数据转换回 tibble。是否可以避免这种双重转换并使代码更简单?

【问题讨论】:

    标签: r ggplot2 xts outliers


    【解决方案1】:

    您无需将数据转换为 xts 再转换回来。将数据保留在 data frame/tibble 中,就可以这样找出最糟糕的一天:

    library(dplyr)

    # Add a calendar-date column. format() renders each timestamp in the
    # time zone it carries (or the session's local zone), whereas as.Date()
    # applied directly to POSIXct defaults to UTC and could assign readings
    # near midnight to the wrong day.
    cpu_df <- cpu_df %>%
      mutate(date = as.Date(format(datetime, "%Y-%m-%d")))

    # For each date, count the values above that date's 0.975 quantile and
    # keep the date with the most outliers (the earliest one on ties).
    # The threshold is kept alongside the count so it can be drawn later.
    WorstDay <- cpu_df %>%
      group_by(date) %>%
      summarise(
        n = sum(Value > quantile(Value, 0.975)),
        threshold = unname(quantile(Value, 0.975))
      ) %>%
      slice(which.max(n)) %>%
      # Bring back every observation of the selected date.
      left_join(cpu_df, by = "date") %>%
      # Flag the outliers so they can be coloured directly in a plot.
      mutate(is_outlier = Value > threshold)
    

    您可以使用此数据进行绘图。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2020-10-27
      • 2013-11-14
      • 2017-06-20
      • 1970-01-01
      • 2021-05-24
      • 1970-01-01
      • 2021-11-23
      • 1970-01-01
      相关资源
      最近更新 更多