【问题标题】:Labels don't appear in hierarchical clustering plot (dendrogram) in R 标签不会出现在 R 的层次聚类图(树状图)中
【发布时间】:2016-07-17 07:51:14
【问题描述】:

我希望使用 R 的排列分布聚类包 pdc (https://cran.r-project.org/web/packages/pdc/pdc.pdf) 进行多变量时间序列聚类。在使用 pdclust 方法(该 PDF 的第 11 页)进行层次聚类后,我使用 plot 方法(同样在第 11 页)绘制了树状图。共有 60 个样本,所以在图(Hierarchical clustering plot)中有 60 个时间序列,但它们没有标签。当我尝试指定标签向量而不是 labels = NULL 时,我总是收到此错误:"Error in graphics:::plotHclust(n1, merge, height, order(x$order), hang, : invalid dendrogram input"。任何帮助将不胜感激。这是我的代码:

# Load the 60 input CSV files into the variables data1 ... data60.
#
# Files are named "file_PID_<pid>_1Apr_<window>.csv". Numbering runs window by
# window and, inside a window, by ascending PID: data1-data8 are window
# "00-03", data9-data16 are window "03-06", and so on. The last three windows
# do not load every PID, so the PIDs are listed explicitly per window.
pids_by_window <- list(
  "00-03" = 1:8,
  "03-06" = 1:8,
  "06-09" = 1:8,
  "09-12" = 1:8,
  "12-15" = 1:8,
  "15-18" = c(2L, 3L, 4L, 6L, 7L, 8L),
  "18-21" = c(1L, 2L, 3L, 4L, 6L, 7L, 8L),
  "21-24" = c(1L, 2L, 3L, 4L, 6L, 7L, 8L)
)

# Expand the table into the flat, ordered vector of 60 file names.
csv_files <- unlist(
  Map(
    function(window, pids) sprintf("file_PID_%d_1Apr_%s.csv", pids, window),
    names(pids_by_window),
    pids_by_window
  ),
  use.names = FALSE
)
stopifnot(length(csv_files) == 60)

for (idx in seq_along(csv_files)) {
  # `header` is spelled out in full; the abbreviated `head` only worked through
  # partial argument matching.
  assign(
    paste0("data", idx),
    read.csv(file = csv_files[idx], header = FALSE, sep = ",")
  )
}





# Preallocate the clustering input: 720 rows x 60 data sets x 4 columns.
# (The identical allocation is repeated right before the fill loop.)
list <- array(data = 0, dim = c(720, 60, 4))

# Return element(s) [i, k] of the j-th input data frame (data1 ... data60).
#
# j: index of the data set, 1 to 60; selects the variable named "data<j>".
# i: row index (or indices) into that data frame.
# k: column index (or indices) into that data frame.
#
# Returns data<j>[i, k]. For a j outside 1..60 the result is an invisible NULL,
# which is what the original if/else chain produced when no branch matched.
myfunc <- function(j, i, k) {
    if (!(j %in% 1:60)) {
        return(invisible(NULL))
    }
    # Resolve the data frame by name rather than through 60 hand-written
    # branches. The hand-written chain returned data17 for j == 3 and data9 for
    # j == 4, so data3 and data4 were never used; the lookup by name maps every
    # j to its own data set.
    series <- get(paste0("data", j))
    series[i, k]
}

# Build the clustering input array: dimension 1 = row of the CSV (time point),
# dimension 2 = data set j, dimension 3 = CSV columns 6 to 9.
list <- array(0, dim = c(720, 60, 4))

time_idx <- seq_len(dim(list)[1])
channel_cols <- 6:9

for (j in seq_len(dim(list)[2])) {
    for (ch in seq_along(channel_cols)) {
        # One vectorised column read per data set and channel instead of 720
        # scalar data-frame lookups; the values written are the same.
        list[, j, ch] <- myfunc(j, time_idx, channel_cols[ch])
    }
}

library("pdc")

# Hierarchical clustering of the series stored in `list`.
clustering <- pdclust(list)

# Draw the dendrogram. TRUE is written out in full because T is an ordinary
# variable that can be reassigned.
plot(
    clustering,
    labels = NULL,
    type = "rectangle",
    timeseries.as.labels = TRUE,
    p.values = TRUE
)

【问题讨论】:

  • 有时在聚类和绘图之前已经为输入数据提供了一些标题会有所帮助,但只是猜测......
  • 我试过了,但没有用

标签: r machine-learning time-series hierarchical-clustering pdc


【解决方案1】:

我为您的代码创建了一个精简版本,它无需数据文件即可运行,以便更轻松地讨论您的问题。在这里,我创建了 60 个具有 4 个维度和 720 个时间点的时间序列(就像你做的那样)。其中一半序列由随机正态分布模拟,另一半由线性趋势叠加随机正态噪声模拟。因此,对于 pdc,它们显然应该可分为两组。代码如下:

# library() stops with an error when pdc is missing; require() would only
# return FALSE and let the script fail later.
library("pdc")

# make this replicable by setting a random seed
set.seed(7823)

# 60 TS each with 4 dimensions and 720 timepoints:
# half of them are random normal noise, the other half are random normal noise
# superimposed on a linear increase
list <- array(0, dim = c(720, 60, 4))
for (i in 1:30) {
  for (j in 1:4) {
    # keep this draw order: it fixes which random numbers each series receives
    list[, i, j] <- rnorm(n = 720)
    list[, i + 30, j] <- rnorm(n = 720) + 1:720
  }
}
cols <- c(rep("red", 30), rep("blue", 30))
labels <- c(rep("normal", 30), rep("normal+trend", 30))

# one label per time series; a mismatch here is what triggers the
# "invalid dendrogram input" error when plotting
stopifnot(length(labels) == dim(list)[2])

# run clustering and color original groups each in red and blue
clustering <- pdclust(list)

pdf("pdcplot.pdf")
plot(clustering, labels = labels, type = "rectangle", cols = cols, cex = 0.5)
dev.off()

我可以轻松绘制标签。我添加了一个“cex=0.5”来减小图中的字体大小。此外,我删除了“timeseries.as.labels = T”,因为当您指定标签时它会被覆盖。这是我的图的样子(带标签):

Clustering of simulated data with labels

当我指定的标签数量与时间序列数量不匹配时,我只能重现您报告的错误。您可能需要再次检查标签向量的大小(例如,length(labels)==60)。

【讨论】:

    猜你喜欢
    • 2016-01-31
    • 2018-03-06
    • 2014-12-23
    • 1970-01-01
    • 2014-12-14
    • 1970-01-01
    • 1970-01-01
    • 2018-11-26
    • 2020-11-03
    相关资源
    最近更新 更多