【发布时间】:2021-06-01 17:39:47
【问题描述】:
我的数据框是这样的,是一个长格式的数据集。每个用户名可能有很多观察。
structure(list(username = structure(c(67L, 95L, 58L, 61L, 65L,
66L, 10L, 2L, 41L, 85L, 29L, 76L, 59L, 77L, 97L, 22L, 96L, 47L,
74L, 13L, 48L, 78L, 15L, 18L, 71L, 9L, 89L, 50L, 98L, 24L, 14L,
90L, 64L, 1L, 93L, 62L, 52L, 83L, 88L, 82L, 73L, 8L, 45L, 6L,
28L, 57L, 7L, 94L, 69L, 20L, 11L, 26L, 42L, 16L, 43L, 4L, 92L,
70L, 5L, 56L), .Label = c("__aubssss", "_CurlyFryGuy_", "_JLovee",
"100ProofWoman", "amoodyknapp", "ankushthebest10", "anna_michaux",
"AnnetteVitelli3", "Bierkast", "Bigfamlife", "bluemoon357", "boogy1228",
"Brenderzzz_", "brendochendo", "cappir", "CarmelScotsNews", "charlesdlcruz",
"ChildOUniverse", "CllrKenPollock", "CoastCyclist", "Crampedsultana",
"Crescen04324237", "CuddiCAPALOT", "d1no_nugget", "damnnndor",
"danieldurrans", "dbrown13", "ducksandchucks", "EclecticCoding",
"Felstedboy", "gengen0309", "Herbie555", "Ho8Go8L1N", "Honeylotus333",
"huegolden", "iamesrvan", "IanSmalley3", "Ipeethree3", "Its_Jack_Brooo",
"ItsBittie", "Jackie_montes", "james_southcott", "JanJoostBouwman",
"Just_Jones33", "Karabo_Mtaung", "KathrynwithaY", "Keioney_kisses",
"kikilovescats", "KLobstar", "LattaZakyra", "lilac_bun", "lizziemonkhouse",
"m_melodias", "MainMandarin", "marge_cord", "Maverick1914", "May_leita",
"mcbadlon", "MiyaDior", "msmrocks", "NaazhimSupreme", "napitupulu_a",
"natashametzler", "Nick_Miles_", "nivanacampos", "nrazaliyah",
"Ohh_Ziggy", "OnlySimphiwe", "orodelancs", "parrothead34", "PeachiesPromo",
"PickledGingerBC", "popkis", "preciousrubie", "reverend_thom",
"RevTinTin", "RudyJb1024", "samanthacraig15", "SandraRPearce1",
"siempreAM0nae", "simplyaracelii", "SimStrength", "smutwiizard",
"spaceboosh", "SplendentSweven", "stankloaf", "TaterSaladJD",
"tayzer6", "terri2kool", "thehulkster", "ThisOffendsMeTV", "undersiegexo",
"Warrior_Maiden", "WayCatPub", "WEAVYwonder_", "wsrphoto", "xkimmygirlx",
"Yildiiiiiiiiz"), class = "factor"), positive = c(0.165, 0, 0.34,
0.06, 0.106, 0, 0.292, 0, 0, 0.059, 0.139, 0, 0, 0, 0, 0.031,
0, 0.119, 0, 0.457, 0.192, 0, 0, 0.128, 0.121, 0.101, 0.317,
0, 0.528, 0.374, 0, 0.06, 0, 0.233, 0.092, 0.079, 0, 0, 0.174,
0.094, 0.059, 0, 0.093, 0.103, 0.099, 0.097, 0.102, 0, 0.112,
0, 0, 0, 0, 0, 0.225, 0.095, 0.213, 0.116, 0.043, 0.078), compound = c(-0.1027,
-0.3612, 0.5574, -0.886, 0.4738, 0, 0.9277, 0, -0.6077, 0.5023,
0.5635, 0, 0, -0.4767, -0.8248, -0.4678, -0.296, 0.0094, 0, 0.9274,
0.6124, -0.6664, 0, 0.6486, 0.6116, 0.5399, 0.8926, 0, 0.6792,
0.9768, 0, 0.2732, -0.7073, 0.892, -0.7783, 0.3818, 0, -0.6739,
0.7314, 0.4588, -0.2411, 0, -0.2212, 0.2023, -0.2244, 0.296,
-0.4417, -0.7003, 0.2946, -0.6808, 0, 0, -0.0387, -0.3816, 0.5106,
0.296, 0.6739, 0.5487, -0.2023, 0.5229), Date = structure(c(38L,
38L, 35L, 35L, 30L, 29L, 27L, 27L, 27L, 27L, 25L, 25L, 25L, 19L,
19L, 16L, 16L, 15L, 12L, 7L, 24L, 4L, 2L, 1L, 39L, 38L, 30L,
29L, 3L, 2L, 29L, 21L, 18L, 17L, 16L, 34L, 32L, 10L, 10L, 8L,
39L, 39L, 36L, 34L, 33L, 31L, 29L, 27L, 24L, 23L, 22L, 18L, 17L,
5L, 1L, 24L, 39L, 31L, 31L, 28L), .Label = c("2020-02-02", "2020-02-07",
"2020-02-08", "2020-02-09", "2020-02-11", "2020-02-13", "2020-02-14",
"2020-02-15", "2020-02-16", "2020-02-17", "2020-02-18", "2020-02-21",
"2020-02-22", "2020-02-24", "2020-02-26", "2020-02-27", "2020-02-29",
"2020-03-01", "2020-03-05", "2020-03-10", "2020-03-11", "2020-03-13",
"2020-03-14", "2020-03-15", "2020-03-16", "2020-03-17", "2020-03-18",
"2020-03-19", "2020-03-20", "2020-03-21", "2020-03-22", "2020-03-23",
"2020-03-24", "2020-03-25", "2020-03-26", "2020-03-27", "2020-03-29",
"2020-03-30", "2020-03-31"), class = "factor"), agegroup = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 1L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L,
1L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 3L), .Label = c("MA",
"OA", "YA"), class = "factor")), row.names = c(NA, 60L), class = "data.frame")
我希望生成一个带有新变量时间的输出表(添加):
如果时间落在 2020 年 9 月代码 1
如果时间落在 2020 年 10 月代码 2
如果时间落在 2020 年 11 月 Code 3
如果时间落在 2020 年 12 月 Code 4
如果时间落在 2021 年 1 月 Code 5
如果时间落在 2021 年 2 月 Code 6
如果时间落在 2021 年 3 月 Code 7
如果时间落在 2021 年 4 月 Code 8
数据框会是这样的:
每个用户名将有 8 个观察值,因此新数据将有 60*7 行。
如果用户名在特定时间没有值,代码 NA。 如果用户名在此时间范围(月)内具有值,则计算这些观察值的平均值(复合)。
username agegroup time mean_compound
a YA 1 NA
a YA 2 0.5
a YA 3 NA
a YA 4 0.1
a YA 5 0.1
a YA 6 0.2
a YA 7 0.2
a YA 8 NA
@arkun 谢谢!我将您的代码更改为
data_meergeed = data_d %>%
group_by(username, agegroup,
Date = ceiling_date(ymd(Date), 'month')) %>%
summarise(mean_compound = mean(compound), .groups = 'drop') %>%
complete(username, agegroup,Date = seq(as.Date('2020-09-01'),
as.Date('2021-04-01'), by = '1 month'))%>%
mutate(Date = format(Date, '%b %Y'))
但我发现一个用户名还有多个保留
我怎样才能让一个用户名有 8 个观察结果,这意味着每个月他们只有一个观察结果(计算平均值)?
【问题讨论】:
-
Feb 2020呢? -
预期输出是否正确(基于输入数据)?
标签: r