【问题标题】:Input 0 where there is no value in R data frame(在 R 数据框中没有值的地方填入 0)
【发布时间】:2020-01-09 00:29:16
【问题描述】:

我想得到一个类似下面这样的 data.frame,但要包含每个主题的所有年份。我现有的这个数据框按年份统计了每个主题的项目数,但如果某个主题在某一年没有项目,就不会生成对应的行,最终图表中该处也是空白。谁能告诉我,如何为缺少数据的主题补上缺失的年份,并令其 Count 为 0?

# Example data (dput output): a data.frame with 96 rows and three columns.
#   Topic: factor with 12 levels, stored in alphabetical order
#          ("Topic 1", "Topic 10", "Topic 11", "Topic 12", "Topic 2", ...).
#   Year:  factor with levels "2011" through "2019".
#   Count: integer count for each Topic/Year pair that has a row.
# Not every Topic has a row for every Year; such combinations are simply absent.
dtd2 <- structure(list(Topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("Topic 1", 
"Topic 10", "Topic 11", "Topic 12", "Topic 2", "Topic 3", "Topic 4", 
"Topic 5", "Topic 6", "Topic 7", "Topic 8", "Topic 9"), class = "factor"), 
    Year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 2L, 
    3L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
    9L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
    8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 6L, 7L, 8L, 
    9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
    6L, 7L, 8L, 9L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("2011", 
    "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"
    ), class = "factor"), Count = c(3L, 3L, 3L, 5L, 5L, 11L, 
    17L, 14L, 4L, 1L, 1L, 4L, 2L, 3L, 9L, 4L, 2L, 1L, 3L, 4L, 
    5L, 18L, 23L, 19L, 15L, 1L, 5L, 6L, 8L, 11L, 17L, 7L, 1L, 
    3L, 6L, 4L, 20L, 21L, 18L, 12L, 3L, 1L, 1L, 2L, 5L, 5L, 11L, 
    5L, 2L, 1L, 1L, 2L, 2L, 5L, 7L, 23L, 9L, 1L, 1L, 2L, 3L, 
    6L, 4L, 9L, 8L, 1L, 1L, 6L, 2L, 3L, 3L, 1L, 3L, 2L, 5L, 7L, 
    11L, 11L, 28L, 11L, 2L, 1L, 2L, 2L, 5L, 6L, 5L, 16L, 3L, 
    4L, 2L, 2L, 7L, 6L, 8L, 6L)), row.names = c(NA, -96L), class = "data.frame")

# Timeline chart: one point-and-line series per Topic, with Year on the x axis
# and Count on the y axis (the y-axis label is suppressed).
ggplot(
  data = dtd2,
  mapping = aes(x = Year, y = Count, colour = Topic, group = Topic)
) +
  geom_point() +
  geom_line() +
  labs(x = "Year", y = NULL, title = "Timeline")

【问题讨论】:

    标签: r count


    【解决方案1】:

    我们可以使用tidyr 中的complete 来添加缺失的年份并用0 填充Count 值。

    # Add a row for every Topic/Year combination; Count is set to 0 on the
    # rows that did not exist in dtd2.
    tidyr::complete(
      data = dtd2,
      Topic,
      Year = unique(Year),
      fill = list(Count = 0)
    )
    
    #A tibble: 108 x 3
    #   Topic    Year  Count
    #   <fct>    <fct> <dbl>
    # 1 Topic 1  2011      3
    # 2 Topic 1  2012      3
    # 3 Topic 1  2013      3
    # 4 Topic 1  2014      5
    # 5 Topic 1  2015      5
    # 6 Topic 1  2016     11
    # 7 Topic 1  2017     17
    # 8 Topic 1  2018     14
    # 9 Topic 1  2019      4
    #10 Topic 10 2011      0
    # … with 98 more rows
    

    然后在 ggplot2 中使用补全后的数据,这样每个主题的折线就能连续地连接起来

    # ggplot2 draws the chart. tidyr is attached as well because it provides
    # complete() and re-exports the %>% pipe, which ggplot2 alone does not
    # make available.
    library(ggplot2)
    library(tidyr)

    # Fill in the missing Topic/Year rows with Count = 0, then plot one
    # point-and-line series per Topic so every line spans all years.
    complete(dtd2, Topic, Year = unique(Year), fill = list(Count = 0)) %>%
      ggplot(aes(x = Year, y = Count, colour = Topic, group = Topic)) +
      geom_point() +
      geom_line() +
      labs(x = "Year", y = NULL, title = "Timeline")
    

    【讨论】:

      【解决方案2】:

      时间序列方法可以是

      library(tidyverse)
      library(lubridate)
      #> 
      #> Attaching package: 'lubridate'
      #> The following object is masked from 'package:base':
      #> 
      #>     date
      library(tsibble)
      #> 
      #> Attaching package: 'tsibble'
      #> The following objects are masked from 'package:lubridate':
      #> 
      #>     interval, new_interval
      #> The following object is masked from 'package:dplyr':
      #> 
      #>     id
      
      
      # Example data (dput output): a data.frame with 96 rows and three columns.
      #   Topic: factor with 12 levels, stored in alphabetical order
      #          ("Topic 1", "Topic 10", "Topic 11", "Topic 12", "Topic 2", ...).
      #   Year:  factor with levels "2011" through "2019".
      #   Count: integer count for each Topic/Year pair that has a row.
      # Not every Topic has a row for every Year; such combinations are simply absent.
      dtd2 <- structure(list(Topic = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
        3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
        5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
        7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 
        10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 
        11L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), .Label = c("Topic 1", 
          "Topic 10", "Topic 11", "Topic 12", "Topic 2", "Topic 3", "Topic 4", 
          "Topic 5", "Topic 6", "Topic 7", "Topic 8", "Topic 9"), class = "factor"), 
        Year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 2L, 
          3L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
          3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
          9L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
          8L, 9L, 1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 6L, 7L, 8L, 
          9L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 1L, 2L, 3L, 4L, 5L, 
          6L, 7L, 8L, 9L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), .Label = c("2011", 
            "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"
          ), class = "factor"), Count = c(3L, 3L, 3L, 5L, 5L, 11L, 
            17L, 14L, 4L, 1L, 1L, 4L, 2L, 3L, 9L, 4L, 2L, 1L, 3L, 4L, 
            5L, 18L, 23L, 19L, 15L, 1L, 5L, 6L, 8L, 11L, 17L, 7L, 1L, 
            3L, 6L, 4L, 20L, 21L, 18L, 12L, 3L, 1L, 1L, 2L, 5L, 5L, 11L, 
            5L, 2L, 1L, 1L, 2L, 2L, 5L, 7L, 23L, 9L, 1L, 1L, 2L, 3L, 
            6L, 4L, 9L, 8L, 1L, 1L, 6L, 2L, 3L, 3L, 1L, 3L, 2L, 5L, 7L, 
            11L, 11L, 28L, 11L, 2L, 1L, 2L, 2L, 5L, 6L, 5L, 16L, 3L, 
            4L, 2L, 2L, 7L, 6L, 8L, 6L)), row.names = c(NA, -96L), class = "data.frame")
      # Build a table with one Count per Topic and calendar year, including
      # years for which a Topic has no row in dtd2.
      # Steps: turn each Year label into a Date by appending "0101" (1 January),
      # declare a tsibble keyed by Topic, make the implicit gaps explicit, then
      # total Count per Topic and year. Summing with na.rm = TRUE means a year
      # that only has gap rows ends up with a Count of 0.
      # NOTE(review): with a Date index tsibble presumably infers a daily
      # interval, so fill_gaps() would add one row per missing day before the
      # yearly roll-up; the result is the same but more rows than needed are
      # created. Confirm, and consider an integer year index instead.
      tsibble2 <- dtd2 %>%
        mutate(Year = as_date(str_c(Year, "01", "01"))) %>%
        as_tsibble(index = Year, key = Topic) %>%
        tsibble::fill_gaps(.full = TRUE) %>%
        group_by_key() %>%
        index_by(year = year(Year)) %>%
        # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
        summarise(Count = sum(Count, na.rm = TRUE)) %>%
        as_tibble() %>%
        # Back to a factor so the plot treats year as a discrete axis.
        mutate(year = as_factor(year))
      
      # Line-and-point chart of Count per year, one coloured series per Topic.
      ggplot(
        data = tsibble2,
        mapping = aes(x = year, y = Count, color = Topic, group = Topic)
      ) +
        geom_line() +
        geom_point()
      

      由 reprex package (v0.3.0) 于 2020-01-08 创建

      【讨论】:

        【解决方案3】:

        我们可以使用expand

        library(dplyr)
        library(tidyr)
        library(ggplot2)
        # Complete the Topic/Year grid with expand() + left_join(), fill the
        # missing counts with 0, and plot one point-and-line series per Topic.
        dtd2 %>%
            # Order the Topic levels naturally ("Topic 2" before "Topic 10") once,
            # up front, so the expanded grid and the joined data carry identical
            # factor levels.
            mutate(Topic = factor(Topic, levels = gtools::mixedsort(levels(Topic)))) %>%
            {
                # `.` is the re-levelled data: build every Topic/Year combination
                # from it, then attach the known counts. The join keys are named
                # explicitly rather than guessed from shared column names.
                left_join(expand(., Topic, Year = unique(Year)), .,
                          by = c("Topic", "Year"))
            } %>%
            # Combinations that had no row in dtd2 come back with NA; show them as 0.
            mutate(Count = replace_na(Count, 0)) %>%
            ggplot(aes(x = Year, y = Count, colour = Topic, group = Topic)) +
                 geom_point() +
                 geom_line() +
                 labs(x = "Year", y = NULL, title = "Timeline")
        

        -输出

        【讨论】:

          猜你喜欢
          • 2018-04-27
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 2021-01-07
          • 2012-10-23
          相关资源
          最近更新 更多