【问题标题】:Create time Series Object from Wide Data, group by column value(从宽数据创建时间序列对象,按列值分组)
【发布时间】:2019-04-30 20:04:18
【问题描述】:

下面是我的数据框 (my_df)。我正在尝试将其作为时间序列对象来预测 2020 年,但我正在努力转换这种数据格式。

我正在尝试使用下面的代码将其转换为时间序列,但位置列得到的全是 NA,并且结果不是我想要的时间序列格式。

我的尝试

ts(my_df[,c(-2,-3)], start=c(2009), end=c(2014), frequency=1)

下面的数据框 my_df

structure(list(`Geogrphical Location` = c("United States", "Northeast", 
"Midwest", "South", "West", ".Alabama", ".Alaska", ".Arizona", 
".Arkansas", ".California", ".Colorado", ".Connecticut", ".Delaware", 
".District of Columbia", ".Florida", ".Georgia", ".Hawaii", ".Idaho", 
".Illinois", ".Indiana", ".Iowa", ".Kansas", ".Kentucky", ".Louisiana", 
".Maine", ".Maryland", ".Massachusetts", ".Michigan", ".Minnesota", 
".Mississippi", ".Missouri", ".Montana", ".Nebraska", ".Nevada", 
".New Hampshire", ".New Jersey", ".New Mexico", ".New York", 
".North Carolina", ".North Dakota", ".Ohio", ".Oklahoma", ".Oregon", 
".Pennsylvania", ".Rhode Island", ".South Carolina", ".South Dakota", 
".Tennessee", ".Texas", ".Utah", ".Vermont", ".Virginia", ".Washington", 
".West Virginia", ".Wisconsin", ".Wyoming", "Puerto Rico"), Census = c(308745538, 
55317240, 66927001, 114555744, 71945553, 4779736, 710231, 6392017, 
2915918, 37253956, 5029196, 3574097, 897934, 601723, 18801310, 
9687653, 1360301, 1567582, 12830632, 6483802, 3046355, 2853118, 
4339367, 4533372, 1328361, 5773552, 6547629, 9883640, 5303925, 
2967297, 5988927, 989415, 1826341, 2700551, 1316470, 8791894, 
2059179, 19378102, 9535483, 672591, 11536504, 3751351, 3831074, 
12702379, 1052567, 4625364, 814180, 6346105, 25145561, 2763885, 
625741, 8001024, 6724540, 1852994, 5686986, 563626, 3725789), 
    `Estimates Base` = c(308758105, 55318353, 66929825, 114563005, 
    71946922, 4780131, 710249, 6392301, 2916025, 37254522, 5029324, 
    3574114, 897936, 601766, 18804592, 9688680, 1360301, 1567650, 
    12831574, 6484136, 3046869, 2853129, 4339344, 4533479, 1328364, 
    5773786, 6547813, 9884129, 5303924, 2968103, 5988928, 989414, 
    1826334, 2700691, 1316461, 8791953, 2059198, 19378110, 9535688, 
    672591, 11536727, 3751615, 3831072, 12702857, 1052940, 4625410, 
    814195, 6346298, 25146100, 2763888, 625741, 8001041, 6724545, 
    1853011, 5687289, 563767, 3726157), `2010` = c(309348193, 
    55388056, 66978602, 114863114, 72118421, 4785492, 714031, 
    6408312, 2921995, 37332685, 5048644, 3579899, 899816, 605183, 
    18849098, 9713521, 1363945, 1571010, 12841578, 6490528, 3050738, 
    2858850, 4348662, 4544996, 1327730, 5788584, 6565524, 9877495, 
    5311147, 2970322, 5996118, 990641, 1830051, 2703284, 1316872, 
    8803729, 2064756, 19402640, 9558915, 674526, 11540983, 3759603, 
    3838048, 12712343, 1053337, 4635943, 816325, 6356671, 25244310, 
    2775326, 625982, 8025773, 6743226, 1854230, 5690263, 564513, 
    3721525), `2011` = c(311663358, 55632766, 67153331, 116061801, 
    72815460, 4799918, 722713, 6467163, 2939493, 37676861, 5118360, 
    3589893, 907924, 620477, 19096952, 9811610, 1377864, 1584143, 
    12860012, 6516480, 3065223, 2869503, 4369354, 4575404, 1328231, 
    5843603, 6611923, 9876213, 5348562, 2978162, 6010717, 997821, 
    1842283, 2718379, 1318473, 8841243, 2077756, 19519529, 9650963, 
    685476, 11544824, 3786274, 3868031, 12744293, 1052451, 4672637, 
    824398, 6397634, 25646389, 2816124, 626730, 8110035, 6822520, 
    1854972, 5709640, 567725, 3678732), `2012` = c(313998379, 
    55829059, 67332320, 117299171, 73537829, 4815960, 731089, 
    6549634, 2950685, 38011074, 5189867, 3593795, 916993, 635327, 
    19344156, 9914668, 1391820, 1595911, 12870798, 6537743, 3076310, 
    2885262, 4384799, 4603429, 1328895, 5889651, 6658008, 9887238, 
    5380285, 2984945, 6025415, 1005196, 1855725, 2752565, 1321182, 
    8873211, 2083784, 19602769, 9746175, 702087, 11550839, 3817054, 
    3899116, 12771854, 1052901, 4720760, 834441, 6454306, 26071655, 
    2855782, 626444, 8192048, 6895226, 1856560, 5726177, 576765, 
    3634488), `2013` = c(316204908, 55988771, 67543948, 118424320, 
    74247869, 4829479, 736879, 6624617, 2958663, 38335203, 5267603, 
    3596003, 925395, 649165, 19582022, 9984938, 1406481, 1612011, 
    12879505, 6569102, 3091930, 2892821, 4400477, 4626402, 1329076, 
    5931129, 6706786, 9898982, 5418521, 2990482, 6042711, 1014314, 
    1868559, 2786464, 1322687, 8899162, 2085193, 19673546, 9841590, 
    724019, 11570022, 3852415, 3925751, 12781338, 1053033, 4767894, 
    844922, 6494821, 26473525, 2902663, 627140, 8262692, 6968006, 
    1853231, 5742854, 582684, 3593077), `2014` = c(318563456, 
    56116791, 67726368, 119696311, 75023986, 4843214, 736705, 
    6719993, 2966912, 38680810, 5349648, 3591873, 934948, 659005, 
    19888741, 10087231, 1416349, 1633532, 12867544, 6595233, 
    3108030, 2899360, 4413057, 4647880, 1330719, 5967295, 6749911, 
    9915767, 5453109, 2992400, 6060930, 1022867, 1881145, 2833013, 
    1328743, 8925001, 2083024, 19718515, 9934399, 739904, 11594408, 
    3877499, 3968371, 12790565, 1054480, 4828430, 852561, 6544663, 
    26944751, 2941836, 626984, 8317372, 7054196, 1848514, 5758377, 
    583642, 3534874), `2015` = c(320896618, 56184737, 67838387, 
    121039206, 75834288, 4853875, 737709, 6817565, 2977853, 38993940, 
    5448819, 3584730, 944076, 670377, 20244914, 10199398, 1425157, 
    1652828, 12839047, 6612768, 3121997, 2906721, 4424611, 4668960, 
    1329453, 5994983, 6784240, 9917715, 5482435, 2989390, 6076204, 
    1032073, 1893765, 2883758, 1330111, 8935421, 2080328, 19747183, 
    10035186, 756835, 11605090, 3907414, 4024634, 12791904, 1055607, 
    4894834, 857919, 6595056, 27429639, 2990632, 626088, 8367587, 
    7160290, 1841053, 5767891, 586555, 3473181), `2016` = c(323127513, 
    56209510, 67941429, 122319574, 76657000, 4863300, 741894, 
    6931071, 2988248, 39250017, 5540545, 3576452, 952065, 681170, 
    20612439, 10310371, 1428557, 1683140, 12801539, 6633053, 
    3134693, 2907289, 4436974, 4681666, 1331479, 6016447, 6811779, 
    9928300, 5519952, 2988726, 6093000, 1042520, 1907116, 2940058, 
    1334795, 8944469, 2081015, 19745289, 10146788, 757952, 11614373, 
    3923561, 4093465, 12784227, 1056426, 4961119, 865454, 6651194, 
    27862596, 3051217, 624594, 8411808, 7288000, 1831102, 5778708, 
    585501, 3411307)), row.names = c(NA, -57L), class = c("tbl_df", 
"tbl", "data.frame"))

请帮助我构建一个时间序列对象,以便我可以使用线性回归来预测第 1 列中任意一个州在 2020 年的值。

【问题讨论】:

    标签: r time-series linear-regression


    【解决方案1】:

    tsibble 包旨在简化此操作。

    library(tidyverse)
    library(tsibble)
    # Reshape the wide table (one column per year) into long format, build a
    # tsibble keyed by location, then convert it to a multivariate `ts` object
    # with one column per location.
    my_ts <- my_df %>%
      rename(Location = "Geogrphical Location") %>%
      select(Location, `2010`:`2016`) %>%
      # pivot_longer() replaces the superseded gather().
      pivot_longer(
        cols = `2010`:`2016`,
        names_to = "Year",
        values_to = "value"
      ) %>%
      mutate(
        # Year column names arrive as character; the tsibble index must be numeric.
        Year = as.numeric(Year),
        # Strip only the leading "." prefix; dots elsewhere in a name are kept.
        Location = sub("^\\.", "", Location)
      ) %>%
      # Pass the key as a bare column name; the `id()` wrapper is deprecated.
      as_tsibble(index = Year, key = Location) %>%
      as.ts()
    

    其中大部分只是将数据放在长格式中的 tidyverse 代码,并清理变量名称和位置值。 as_tsibble() 行完成了将其设置为多变量时间序列的大部分工作,然后 as.ts() 将其变为 ts 对象。

    my_ts
    #> Time Series:
    #> Start = 2010 
    #> End = 2016 
    #> Frequency = 1 
    #>      Alabama Alaska Arizona Arkansas California Colorado Connecticut
    #> 2010 4785492 714031 6408312  2921995   37332685  5048644     3579899
    #> 2011 4799918 722713 6467163  2939493   37676861  5118360     3589893
    #> 2012 4815960 731089 6549634  2950685   38011074  5189867     3593795
    #> 2013 4829479 736879 6624617  2958663   38335203  5267603     3596003
    #> 2014 4843214 736705 6719993  2966912   38680810  5349648     3591873
    #> 2015 4853875 737709 6817565  2977853   38993940  5448819     3584730
    #> 2016 4863300 741894 6931071  2988248   39250017  5540545     3576452
    

    【讨论】:

    • 这很完美,你能告诉我如何添加线性回归吗,因为我似乎无法正确使用公式,我使用 tslm(my_ts) (仅在犹他州子集之后)
    • 适合
    【解决方案2】:

    尝试将位置变量转换为因子。但请注意,转换后时间序列中的位置显示为因子的数字编码(标签会被丢弃)。

    # Convert the location column to a factor so ts() can store it as integer codes.
    df1[["Geogrphical Location"]] <- as.factor(df1[["Geogrphical Location"]])
    
    > ts(df1[, -(2:3)], start=c(2009), end=c(2014), frequency=1)
    Time Series:
    Start = 2009 
    End = 2014 
    Frequency = 1 
         Geogrphical Location      2010      2011      2012      2013      2014      2015      2016
    2009                   56 309348193 311663358 313998379 316204908 318563456 320896618 323127513
    2010                   53  55388056  55632766  55829059  55988771  56116791  56184737  56209510
    2011                   52  66978602  67153331  67332320  67543948  67726368  67838387  67941429
    2012                   55 114863114 116061801 117299171 118424320 119696311 121039206 122319574
    2013                   57  72118421  72815460  73537829  74247869  75023986  75834288  76657000
    2014                    1   4785492   4799918   4815960   4829479   4843214   4853875   4863300
    

    可以用下面的代码生成一个小小的“编码表”(codebook),用来对照因子标签与数字编码:

    # Codebook: pair each location label with the numeric code of its factor level.
    data.frame(
      lbl = df1[["Geogrphical Location"]],
      num = as.numeric(df1[["Geogrphical Location"]])
    )
    

    注意:R中的变量名最好不要使用空格。

    # Give the first column a name without spaces.
    colnames(df1)[1L] <- "Geographical.Location"
    

    您还可以去掉名称开头的点号。

    # Remove only the leading "." prefix; anchoring with "^" keeps any dots that
    # appear elsewhere in a name, which gsub("\\.", ...) would also delete.
    df1$Geographical.Location <- sub("^\\.", "", df1$Geographical.Location)
    

    然后像上面那样做:

    # Convert the renamed location column to a factor.
    df1[["Geographical.Location"]] <- as.factor(df1[["Geographical.Location"]])
    

    【讨论】:

      猜你喜欢
      • 2016-07-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2020-12-24
      • 1970-01-01
      • 1970-01-01
      • 2020-03-17
      相关资源
      最近更新 更多