【问题标题】:How to keep labels of a factor variable after transforming it to numeric in R?将因子变量转换为R中的数字后如何保留因子变量的标签?
【发布时间】:2020-12-08 14:54:18
【问题描述】:

我有以下三个主要变量的数据:

i) 教育(因子):代表三种不同的教育水平(1、2、3)

ii) 份额(数字):代表该国每个教育水平的人口百分比

iii) 国家(因子):代表 30 个国家。

目的是绘制每个国家各教育水平的份额,并按最高教育水平 (3) 的份额对国家排序:从该份额最低的国家排到最高的国家。问题是最后会丢失国家标签,因为我必须先把变量转换为数字才能重新排序。绘制图表后,国家被标记为 (25, 6, 26, 17),而不是正确的标签 (PT, CZ, RO, IT)。我在 Stack Overflow 上读过不少相关帖子,但都没有解决这个问题。有没有办法在重新排序后保留国家标签,这样之后就不必手动输入它们?

library(forcats)
library(ggplot2)
library(dplyr)

# Replace the country factor by its integer level codes (this is the step
# that drops the country labels) and rebuild educ as a factor.
x$country <- as.numeric(x$cntry2)
x$educ <- as.factor(as.integer(x$educ))

# Country codes sorted by their share of education level 3, lowest first.
country_order <- x %>%
  filter(educ == 3) %>%
  mutate(country = fct_reorder(factor(country), share)) %>%
  pull(country) %>%
  levels()

# Re-level the country factor of the full data to follow that order.
df2 <- mutate(x, country = fct_relevel(factor(country), country_order))

# Stacked columns per country, filled by education level.
ggplot(df2, aes(x = country, y = share)) +
  geom_col(aes(fill = educ), color = "black") +
  theme_classic() +
  labs(x = "Country", fill = "Education")

数据如下:

# Example data: a tibble with 81 rows and the columns educ (factor with
# levels "1", "2", "3"), cntry2 (factor of two-letter country codes),
# share (numeric) and country (numeric).
# The expression is not assigned here; the surrounding code reads the data
# from a variable named `x`, so presumably it should be assigned to `x`.
structure(list(educ = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", 
"3"), class = "factor"), cntry2 = structure(c(1L, 1L, 1L, 2L, 
2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 
7L, 8L, 8L, 8L, 9L, 9L, 9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 
12L, 12L, 13L, 13L, 13L, 14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 
16L, 17L, 17L, 17L, 18L, 18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 
21L, 21L, 21L, 22L, 22L, 22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 
25L, 25L, 26L, 26L, 26L, 27L, 27L, 27L), .Label = c("AU", "BE", 
"BG", "CH", "CZ", "DK", "EE", "ES", "FI", "FR", "GR", "HU", "IE", 
"IS", "IT", "LT", "LU", "LV", "NL", "NO", "PO", "PT", "RO", "SE", 
"SK", "SV", "UK"), class = "factor"), share = c(14.9585723390695, 
64.8311026131294, 20.2103250478011, 20.3203525363306, 37.9050825638106, 
41.7745648998589, 20.5482068669118, 58.6719831908696, 20.7798099422186, 
11.0478359908884, 52.7334851936219, 36.2186788154898, 8.1806499751285, 
77.2156358812801, 14.6037141435914, 18.43684842358, 44.6831364124597, 
36.8800151639603, 13.0425889732285, 58.1996272896687, 28.7577837371029, 
42.6625051189251, 24.1934234264148, 33.1440714546602, 16.4821228232769, 
46.3050582898395, 37.2128188868836, 22.0117072122872, 47.7342785027657, 
30.2540142849471, 31.6958715347475, 40.8370856615852, 27.4670428036673, 
15.620426612099, 63.1486925776748, 21.2308808102263, 27.79203576455, 
33.4878715125424, 38.7200927229075, 29.0666986564299, 41.950575815739, 
28.9827255278311, 36.0270124068613, 47.1984225312789, 16.7745650618598, 
8.20398339670027, 60.9892218075273, 30.8067947957724, 37.0050817095017, 
37.4766935985084, 25.5182246919899, 15.7399902739504, 59.1482759419216, 
25.111733784128, 19.2624176167015, 43.4944817814291, 37.2431006018693, 
17.6501727404436, 44.6784798840967, 37.6713473754597, 10.0098831213475, 
69.2849776555517, 20.7051392231007, 64.5019644095216, 21.5391726369309, 
13.9588629535475, 21.8434913468774, 62.6661650363682, 15.4903436167545, 
11.4840104928012, 55.3435190932938, 33.172470413905, 4.23006072183939, 
74.1147574537763, 21.6551818243843, 15.6869892409901, 61.3851490387442, 
22.9278617202657, 14.2357801080394, 49.3703276303246, 36.393892261636
), country = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 6, 6, 6, 7, 
7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 14, 
14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 
19, 20, 20, 20, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 
26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30)), row.names = c(NA, 
-81L), class = c("tbl_df", "tbl", "data.frame"))

【问题讨论】:

    标签: r ggplot2 graph dplyr tidyverse


    【解决方案1】:

    试试下面这种方法:

    library(ggplot2)
    library(dplyr)
    # Data: keep the country code as text before the factor is recoded to
    # its integer level codes, then rebuild educ as a factor.
    x$lab <- as.character(x$cntry2)
    x$country <- as.numeric(x$cntry2)
    x$educ <- as.factor(as.integer(x$educ))
    

    现在,我们存储标签:

    # Labels: one row per numeric country code, paired with its text label.
    labs <- x[!duplicated(x$country), c("country", "lab")]
    

    然后是更多的数据处理:

    # Data
    # The forcats helpers are called with an explicit namespace because this
    # snippet's library() calls attach only ggplot2 and dplyr; without the
    # prefix fct_reorder()/fct_relevel() are not found in a fresh session.
    # Country codes sorted by their share of education level 3, lowest first.
    country_order <- x %>%
      filter(educ == 3) %>%
      mutate(
        country = forcats::fct_reorder(factor(country), share, .desc = FALSE)
      ) %>%
      pull(country) %>%
      levels()
    # Re-level the country factor of the full data to follow that order.
    df2 <- x %>%
      mutate(country = forcats::fct_relevel(factor(country), country_order))
    

    最后绘图:

    # Plot: stacked columns per country, filled by education level. The axis
    # shows the stored text labels, looked up in the order of country_order.
    ggplot(df2, aes(x = country, y = share)) +
      geom_col(aes(fill = educ), color = "black") +
      theme_classic() +
      labs(x = "Country", fill = "Education") +
      scale_x_discrete(
        labels = labs$lab[match(country_order, labs$country)]
      )
    

    输出:

    【讨论】:

      猜你喜欢
      • 2019-09-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2018-07-15
      • 1970-01-01
      相关资源
      最近更新 更多