将因子变量转换为R中的数字后如何保留因子变量的标签？答案

【问题标题】：How to keep labels of a factor variable after transforming it to numeric in R?（在 R 中将因子变量转换为数字后，如何保留因子变量的标签？）
【发布时间】：2020-12-08 14:54:18
【问题描述】：

我有以下三个主要变量的数据：

i) 教育（因子）：代表三种不同的教育水平（1、2、3）

ii) 份额（数字）：代表该国每个教育水平的人口百分比

iii) 国家（因子）：代表 30 个国家。

目的是绘制每个国家/地区各教育水平的份额，并按最高教育水平 (3) 的份额将国家从低到高排序。问题是，由于我必须先把变量转换为数字才能重新排序，最后会丢失国家标签：绘制图表后，国家被标记为 (25, 6, 26, 17)，而不是正确的标签 (PT, CZ, RO, IT)。我在 Stack Overflow 上阅读了不少帖子，但都没有解决这个问题。有没有办法在重新排序后保留国家/地区的标签，这样我以后就不必手动输入它们了？

library(forcats)
library(ggplot2)
library(dplyr)

 # Swap the country factor for its integer codes; the text labels are not
 # carried over into this column.
 x$country <- as.numeric(x$cntry2)
 # Rebuild educ as a factor of its integer level codes.
 x$educ <- as.factor(as.integer(x$educ))

# Country codes sorted by ascending share among the educ == 3 rows.
country_order <- with(
  filter(x, educ == 3),
  levels(fct_reorder(factor(country), share, .desc = FALSE))
)

# Re-level country in the full data set according to that order.
df2 <- mutate(x, country = fct_relevel(factor(country), country_order))

# Stacked bars per country, filled by education level.
ggplot(df2, aes(x = country, y = share)) +
  geom_col(aes(fill = educ), color = "black") +
  theme_classic() +
  labs(x = "Country", fill = "Education")

这是下面的数据：

structure(list(educ = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", 
"3"), class = "factor"), cntry2 = structure(c(1L, 1L, 1L, 2L, 
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 
7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 
12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 
16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 
21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
25L, 25L, 26L, 26L, 26L, 27L, 27L, 27L), .Label = c("AU", "BE", 
"BG", "CH", "CZ", "DK", "EE", "ES", "FI", "FR", "GR", "HU", "IE", 
"IS", "IT", "LT", "LU", "LV", "NL", "NO", "PO", "PT", "RO", "SE", 
"SK", "SV", "UK"), class = "factor"), share = c(14.9585723390695, 
64.8311026131294, 20.2103250478011, 20.3203525363306, 37.9050825638106, 
41.7745648998589, 20.5482068669118, 58.6719831908696, 20.7798099422186, 
11.0478359908884, 52.7334851936219, 36.2186788154898, 8.1806499751285, 
77.2156358812801, 14.6037141435914, 18.43684842358, 44.6831364124597, 
36.8800151639603, 13.0425889732285, 58.1996272896687, 28.7577837371029, 
42.6625051189251, 24.1934234264148, 33.1440714546602, 16.4821228232769, 
46.3050582898395, 37.2128188868836, 22.0117072122872, 47.7342785027657, 
30.2540142849471, 31.6958715347475, 40.8370856615852, 27.4670428036673, 
15.620426612099, 63.1486925776748, 21.2308808102263, 27.79203576455, 
33.4878715125424, 38.7200927229075, 29.0666986564299, 41.950575815739, 
28.9827255278311, 36.0270124068613, 47.1984225312789, 16.7745650618598, 
8.20398339670027, 60.9892218075273, 30.8067947957724, 37.0050817095017, 
37.4766935985084, 25.5182246919899, 15.7399902739504, 59.1482759419216, 
25.111733784128, 19.2624176167015, 43.4944817814291, 37.2431006018693, 
17.6501727404436, 44.6784798840967, 37.6713473754597, 10.0098831213475, 
69.2849776555517, 20.7051392231007, 64.5019644095216, 21.5391726369309, 
13.9588629535475, 21.8434913468774, 62.6661650363682, 15.4903436167545, 
11.4840104928012, 55.3435190932938, 33.172470413905, 4.23006072183939, 
74.1147574537763, 21.6551818243843, 15.6869892409901, 61.3851490387442, 
22.9278617202657, 14.2357801080394, 49.3703276303246, 36.393892261636
), country = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 6, 6, 7, 
7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 14, 
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 
19, 20, 20, 20, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 
26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30)), row.names = c(NA, 
-81L), class = c("tbl_df", "tbl", "data.frame"))

【问题讨论】：

标签： r ggplot2 graph dplyr tidyverse

【解决方案1】：

试试这种 tidyverse 方法：

library(forcats) # provides fct_reorder() / fct_relevel() used further down
library(ggplot2)
library(dplyr)
# Data
# Keep the country labels as plain text before the factor is turned into codes.
x$lab <- as.character(x$cntry2)
# Integer codes of the country factor; the labels are not attached to this column.
x$country <- as.numeric(x$cntry2)
# Rebuild educ as a factor of its integer level codes.
x$educ <- as.factor(as.integer(x$educ))

现在，我们存储标签：

# Labels
# One row per country code, keeping only the code and its text label.
# The name `labs` is shared with ggplot2::labs(); R still finds the function
# when `labs(...)` is used in call position.
labs <- x[!duplicated(x$country), c("country", "lab")]

然后是更多的数据处理：

# Data
# Country codes sorted by ascending share among the educ == 3 rows.
country_order <- with(
  filter(x, educ == 3),
  levels(fct_reorder(factor(country), share, .desc = FALSE))
)
# Re-level country in the full data set according to that order.
df2 <- mutate(x, country = fct_relevel(factor(country), country_order))

然后绘图：

# Plot
# Named lookup from country code to text label. A named `labels` vector is
# matched to the axis breaks by name, so the labels stay correct regardless of
# the order of the factor levels.
code_to_label <- setNames(labs$lab, labs$country)

ggplot(df2, aes(x = country, y = share)) +
  geom_col(aes(fill = educ), color = "black") +
  labs(fill = "Education") +
  theme_classic() +
  xlab("Country") +
  scale_x_discrete(labels = code_to_label)

输出：

【讨论】：