ggplot2画散点图

引言

aes中的x、y分别表示映射到x轴、y轴的变量；geom_point表示增加散点图图层，其中size控制点的大小，shape控制点的形状，形状一共26种，编号为0-25。

library(gcookbook)
library(ggplot2)
# Preview the first rows of the heightweight data set.
head(heightweight)
# sex ageYear ageMonth heightIn weightLb
#1 f 11.92 143 56.3 85.0
#2 f 12.92 155 62.3 105.0
#3 f 12.75 153 63.3 108.0
#4 f 13.42 161 59.0 92.0
#5 f 15.92 191 62.5 112.5
#6 f 14.25 171 62.5 112.0

# Height against age, drawn with size-3 points of shape 21.
ggplot(data = heightweight, mapping = aes(x = ageYear, y = heightIn)) +
  geom_point(shape = 21, size = 3)

library(ggplot2)
# Simulate 100 points scattered around the line y = -1.5 + 2x.
set.seed(1234)
x <- rnorm(100, mean = 2, sd = 3)
y <- 2 * x - 1.5 + rnorm(100)
df <- data.frame(x, y)

# Plain scatter plot of the simulated data.
ggplot(df, aes(x, y)) +
  geom_point()

分组散点图

可将分组变量(因子或字符变量)赋值给颜色或形状属性，实现分组散点图的绘制

将离散变量或因子映射给颜色属性或形状属性
# Simulate 100 points plus a random 0/1 group label.
set.seed(1234)
x <- rnorm(100, mean = 2, sd = 3)
y <- 2 * x - 1.5 + rnorm(100)
z <- sample(c(0, 1), size = 100, replace = TRUE)

# Store the numeric group label as a factor so it is treated as discrete.
df <- data.frame(x, y, z = factor(z))

# Grouping variable mapped to the colour aesthetic.
ggplot(df, aes(x, y, colour = z)) +
  geom_point(size = 3)

# Grouping variable mapped to the shape aesthetic.
ggplot(df, aes(x, y, shape = z)) +
  geom_point(size = 3)

# Point colours can be customised with scale_colour_brewer() or
# scale_colour_manual(), and point shapes with scale_shape_manual().
# For illustration the grouping variable is mapped to both colour and shape.
ggplot(data = df, mapping = aes(x = x, y = y, colour = z, shape = z)) +
  geom_point(size = 3) +
  scale_color_brewer(palette = "Accent") +
  scale_shape_manual(values = c(2, 16))

这里需要提醒的是，21-25之间的点形状，既可以赋值边框颜色，又可以赋值填充色，当数据点颜色较浅时，带边框线的点就显得尤为重要，这样可以将数据点与背景色区分开来，而0-20之间的点形状，只能赋值边框颜色。

连续变量映射给颜色属性或大小属性。

# Ten observations with a continuous third variable z.
x <- c(10, 13, 11, 15, 18, 20, 21, 22, 24, 26)
y <- c(76, 60, 70, 58, 55, 48, 44, 40, 26, 18)
z <- c(100, 120, 300, 180, 80, 210, 30, 95, 145, 420)
df <- data.frame(x, y, z)

# Map the continuous variable to the colour aesthetic.
ggplot(df, aes(x, y, colour = z)) +
  geom_point(size = 3)

但这里发现一个问题：颜色越深，对应的值反而越小。如何让值的大小与颜色的深浅保持一致呢？很简单，只需人为地设置色阶，从低到高指定不同的颜色即可。
# Explicit light-to-dark gradient so that larger z values get darker colours.
ggplot(data = df, mapping = aes(x = x, y = y, colour = z)) +
  geom_point(size = 3) +
  scale_colour_gradient(low = "lightblue", high = "darkblue")

# Map the continuous variable to the size aesthetic.
ggplot(df, aes(x, y, size = z)) +
  geom_point()

上面将连续变量赋值给了颜色属性或大小属性，我们还可以人为地设置色阶间隔或大小间隔。
# Custom legend breaks for the fill gradient (shape 21 has a fillable interior).
ggplot(data = df, mapping = aes(x = x, y = y, fill = z)) +
  geom_point(shape = 21, size = 3) +
  scale_fill_continuous(
    low = "lightblue",
    high = "darkblue",
    breaks = c(100, 150, 200, 300, 350, 400)
  )

# Custom legend breaks for the point sizes.
ggplot(data = df, mapping = aes(x = x, y = y, size = z)) +
  geom_point() +
  scale_size_continuous(
    breaks = c(100, 150, 200, 300, 350, 400),
    guide = guide_legend()
  )

# Make the point area proportional to the value of the continuous variable.
ggplot(data = df, mapping = aes(x = x, y = y, size = z)) +
  geom_point() +
  scale_size_area(max_size = 10)

重叠点的处理

当数据点非常多时，可能会导致数据点重叠非常严重，该如何处理这样的问题呢？一般有以下几种方法：

1）使用半透明的点
2）数据分箱，并用矩形表示
3）数据分箱，并用六边形表示
4）使用二维密度估计，并将等高线添加到散点图中
5）向散点图中添加边际地毯

# Simulate 10,000 points: x with sd 1 and y with sd 2, both centred on 0.
set.seed(1234)
x <- rnorm(10000)
y <- rnorm(10000, mean = 0, sd = 2)
df <- data.frame(x, y)

# Raw scatter plot, no treatment of overplotting.
ggplot(df, aes(x, y)) +
  geom_point()

# Semi-transparent points reveal where the data pile up.
ggplot(df, aes(x, y)) +
  geom_point(alpha = 0.1)

# Bin the data into rectangles; the fill of each rectangle encodes how many
# points fall inside it.
ggplot(df, aes(x, y)) +
  stat_bin2d()

# By default stat_bin_2d() splits both the x and the y range into 30 segments,
# i.e. it produces 900 bins. The number of segments, or the bin width in the
# horizontal and vertical direction, can be set by the user.

# Use 50 bins per axis and a custom fill gradient with fixed limits and breaks.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  stat_bin2d(bins = 50) +
  scale_fill_gradient(
    low = "steelblue",
    high = "darkred",
    limits = c(0, 100),
    breaks = c(0, 25, 50, 100)
  )

将图形划分为小的正方形箱可能会产生分散注意力的视觉假象，一般建议使用六边形代之。
# Bin the data into hexagons of width 0.2 (x) by 0.3 (y).
ggplot(data = df, mapping = aes(x = x, y = y)) +
  stat_binhex(binwidth = c(0.2, 0.3)) +
  scale_fill_gradient(
    low = "lightgreen",
    high = "darkred",
    limits = c(0, 100),
    breaks = c(0, 25, 50, 100)
  )

# Estimate the 2-D density with stat_density2d and overlay its contour lines
# on the scatter plot.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point() +
  stat_density2d()

# Draw points whose size is proportional to the estimated density.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  stat_density2d(geom = "point", aes(size = ..density..), contour = FALSE) +
  scale_size_area()

# Show the estimated density as a heat map.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  stat_density2d(geom = "tile", aes(fill = ..density..), contour = FALSE)

# Add marginal rugs to a scatter plot.
ggplot(data = faithful, mapping = aes(x = eruptions, y = waiting)) +
  geom_point() +
  geom_rug()

# Marginal rugs give a quick view of how densely the data sit along each axis.
# Jittering the rug positions and lowering size (thinner lines) reduces the
# overlap between rug lines.
ggplot(data = faithful, mapping = aes(x = eruptions, y = waiting)) +
  geom_point() +
  geom_rug(position = "jitter", size = 0.1)

如果一个变量为离散变量，另一个变量为连续变量时，如何绘制散点图？
# Five groups of 1,000 draws each, taken from different distributions.
set.seed(1234)
x <- rep(1:5, each = 1000)
y <- c(
  rnorm(1000),
  rnorm(1000, 1, 2),
  rnorm(1000, 3, 4),
  rt(1000, 2),
  rt(1000, 4)
)
df <- data.frame(x = x, y = y)
# Treat the group index as a discrete variable.
df$x <- factor(df$x)

# Scatter plot without any treatment.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point()

# Add random jitter to the points.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(position = "jitter")

# By default the jitter added in each direction (horizontal and vertical) is
# 40% of the resolution of the data; width and height override that amount.
# Here: a horizontal jitter of 0.5 and no vertical jitter.
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_point(position = position_jitter(width = 0.5, height = 0))

# Box plots (suitable when one or both variables are discrete).
ggplot(data = df, mapping = aes(x = x, y = y)) +
  geom_boxplot(mapping = aes(group = x), fill = "steelblue")

这里需要提醒的是，横坐标为数值型变量时，必须要将其转换为因子，并在geom_boxplot()函数的属性中将因子映射给group，否则产生的效果图将是错误的。

增加拟合的回归线

# No fitted line.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point()

# Add a linear fit per species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point() +
  stat_smooth(method = "lm")

# The confidence band defaults to a 95% level; `level` changes it (99% here).
ggplot(heightweight, aes(x = ageYear, y = heightIn)) +
  geom_point() +
  stat_smooth(method = lm, level = 0.99)

# Hide the confidence band.
ggplot(heightweight, aes(x = ageYear, y = heightIn)) +
  geom_point(colour = "grey60") +
  stat_smooth(method = lm, se = FALSE)

# Add a locally weighted polynomial (loess) curve per species.
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point() +
  stat_smooth(method = "loess")

# Add a logistic curve.
library(MASS)
b <- biopsy
# A logistic fit needs the response coded as 0/1: benign -> 0, otherwise 1.
b <- transform(b, class_trans = ifelse(class == "benign", 0, 1))
# Jittered, semi-transparent points keep the two response levels readable;
# the curve is a binomial glm fitted by stat_smooth.
ggplot(data = b, mapping = aes(x = V1, y = class_trans)) +
  geom_point(
    position = position_jitter(width = 0.3, height = 0.06),
    alpha = 0.4,
    shape = 21,
    size = 2
  ) +
  stat_smooth(method = glm, method.args = list(family = "binomial"))

# Label the points: map the `label` aesthetic to the column holding the names.
# Only 2009 records with healthexp above 2000 are plotted.
ggplot(
  data = subset(countries, Year == 2009 & healthexp > 2000),
  mapping = aes(x = healthexp, y = infmortality)
) +
  geom_text(mapping = aes(label = Name), size = 4)

气泡图
# Keep the 2009 records of ten selected countries.
cdat <- subset(
  countries,
  Year == 2009 &
    Name %in% c(
      "Canada", "Ireland", "United Kingdom", "United States", "New Zealand",
      "Iceland", "Japan", "Luxembourg", "Netherlands", "Switzerland"
    )
)

# Bubble size is driven by the size aesthetic: map the third variable to it.
ggplot(data = cdat, mapping = aes(x = healthexp, y = infmortality, size = GDP)) +
  geom_point(fill = "cornsilk", colour = "black", shape = 21)

# Build a 3 x 3 grid of value levels; nums[i] is the count for the pair
# (value1[i], value2[i]).
value1 <- rep(c("高价值", "中价值", "低价值"), each = 3)
value2 <- rep(c("高价值", "中价值", "低价值"), times = 3)
nums <- c(500, 287, 123, 156, 720, 390, 80, 468, 1200)
df <- data.frame(value1 = value1, value2 = value2, nums = nums)

# Ordered factors fix the axis order. `ordered` is spelled out in full rather
# than relying on partial matching of `order`.
df$value1 <- factor(
  df$value1,
  levels = c("高价值", "中价值", "低价值"),
  ordered = TRUE
)
df$value2 <- factor(
  df$value2,
  levels = c("低价值", "中价值", "高价值"),
  ordered = TRUE
)

# Bubble area is proportional to nums and each bubble is labelled with its
# count. guide = "none" hides the size legend (guide = FALSE is deprecated).
ggplot(data = df, mapping = aes(x = value1, y = value2, size = nums)) +
  geom_point(colour = "steelblue") +
  scale_size_area(max_size = 30, guide = "none") +
  geom_text(aes(label = nums), vjust = 0, colour = "black", size = 5)

从图中可知，高价值用户中有80个流向了低价值，而低价值用户中又有123个流向高价值。

散点矩阵图
# Keep the 2009 records and the five columns of interest.
c2009 <- subset(
  countries,
  subset = Year == 2009,
  select = c(Name, GDP, laborrate, healthexp, infmortality)
)

# Scatter-plot matrix of columns 2 to 5 (the Name column is left out).
pairs(c2009[, 2:5])
这里我们不用ggplot2包进行，这是因为该包不太适合对这类图形进行展示。

# Scatter-plot matrix with base pairs().
data(tips, package = "reshape")
pairs(tips[, 1:3])

# Scatter-plot matrix with the car package. The old scatterplot.matrix()
# alias is no longer available in current car releases; scatterplotMatrix()
# is the supported name.
library(car)
scatterplotMatrix(tips[, 1:3])

# Scatter-plot matrix with ggpairs() from the GGally package.
library(GGally)
ggpairs(tips[, 1:3])
通过GGally包中的ggpairs()函数绘制的散点图矩阵还是非常引人注目的，它将连续变量和离散变量非常完美地结合在一起。

Cleveland点图

可以减少图形造成的视觉混乱，同时图形更具可读性。
# One random score between 55 and 90 for each of the 26 lowercase letters.
set.seed(1234)
names <- letters
Score <- runif(26, min = 55, max = 90)
df <- data.frame(names = names, Score = Score)

# Bar chart, bars ordered by score and labelled with the rounded score.
ggplot(data = df, mapping = aes(x = reorder(names, Score), y = Score)) +
  geom_bar(stat = "identity", fill = "steelblue", colour = "black") +
  xlab("Name") +
  geom_text(aes(label = round(Score)), vjust = 1)

# Cleveland dot plot of the same data.
ggplot(data = df, mapping = aes(x = reorder(names, Score), y = Score)) +
  geom_point(size = 5, shape = 21, fill = "steelblue", colour = "black") +
  xlab("Name")

##########################画散点+pdf组合图

library(tidyverse)
# Shared look of the two marginal density panels: theme_bw() with the panel
# background, border, axis titles and grid lines removed, and both axis lines
# drawn. Adding a list to a plot adds its elements in order.
marginal_theme <- list(
  theme_bw(),
  theme(
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.title = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
)

# Top-left panel: density of mpg.
g_top_left <- ggplot(mtcars, aes(x = mpg)) +
  geom_density(fill = "#177cb0") +
  marginal_theme +
  scale_x_continuous(expand = c(0.03, -0.4), limits = c(10, 35))

# Bottom-right panel: density of wt, flipped so that it runs along the
# vertical axis.
g_bottom_right <- ggplot(mtcars, aes(x = wt)) +
  geom_density(fill = "#789262") +
  marginal_theme +
  coord_flip() +
  scale_x_continuous(expand = c(0, 0), limits = c(1.3, 5.8))

# Bottom-left panel: scatter plot of wt against mpg.
g_bottom_left <- ggplot(mtcars, aes(x = mpg, y = wt)) +
  geom_point() +
  theme_bw()

# Empty 10 x 10 "canvas" with every visual element switched off; it only
# provides the coordinate system in which the panels are placed.
df <- data.frame(x = seq_len(10), y = seq_len(10))
base <- ggplot(df, aes(x, y)) +
  geom_blank() +
  theme_bw() +
  theme(
    panel.background = element_blank(),
    panel.border = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.line = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Place the three panels on the canvas as grobs, using canvas coordinates.
base +
  annotation_custom(
    grob = ggplotGrob(g_bottom_left),
    xmin = 1, xmax = 8, ymin = 1, ymax = 8
  ) +
  annotation_custom(
    grob = ggplotGrob(g_bottom_right),
    xmin = 8, xmax = 10, ymin = 1.3, ymax = 8.2
  ) +
  annotation_custom(
    grob = ggplotGrob(g_top_left),
    xmin = 1, xmax = 8, ymin = 8, ymax = 10
  )