如何跨多个列获得最常见的组合答案

【问题标题】：How to get the most common combination across multiple columns如何跨多个列获得最常见的组合
【发布时间】：2020-05-09 01:43:57
【问题描述】：

我有以下数据集，其中 1 表示投放了某种食物类型，0 表示没有投放。我知道最常见的情况是同时投放 2 种食物，但我很想知道哪种组合最常见。

这是数据集的示例

# Sample data (dput output): 5 rows, one 0/1 indicator column per Type_* food
# type, plus a `total` column holding the number of 1s in each row.
structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L), Type_SafflowerSeeds = c(0L, 
0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L), Type_EconMix = c(1L, 
1L, 0L, 1L, 1L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L), Type_Grains = c(0L, 
0L, 0L, 0L, 0L), Type_Nuts = c(0L, 0L, 1L, 1L, 0L), Type_Suet = c(1L, 
0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 0L, 0L, 0L, 1L), Type_FruitOrJams = c(0L, 
0L, 0L, 1L, 0L), Type_Mealworms = c(0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 
0L, 0L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 0L, 0L, 0L, 
0L), Type_Scraps = c(1L, 1L, 1L, 1L, 0L), Type_Bread = c(0L, 
0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 0L, 0L), total = c(5, 
3, 3, 4, 2)), row.names = c(NA, 5L), class = "data.frame")

我想知道最常见的成对组合以及最常见的三向组合。

因此，成对的输出看起来像这样，而三路组合的输出看起来像这样：

                 type1           type2 number_of_times
1     Type_SugarWater    Type_EconMix             351
2 Type_SunflowerSeeds Type_SugarWater             335

【问题讨论】：

你能显示预期的输出吗
组合是指成对还是...？显示预期的输出会有所帮助。此外，最好提供一个最小的示例。而不是 10-15 列，也许 5 列会显示您的问题。
好问题，我会做必要的修改

标签： r datatable dplyr manipulate

【解决方案1】：

您可以使用基于关联规则（association rules）的分析来处理这类问题。关于它的更多用途，可以阅读 here

这是你的数据：

# Sample data from the question: 5 rows, one 0/1 indicator column per Type_*
# food type, plus a `total` column holding the number of 1s in each row.
df <- structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L), Type_SafflowerSeeds = c(0L, 
0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L), Type_EconMix = c(1L, 
1L, 0L, 1L, 1L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L), Type_Grains = c(0L, 
0L, 0L, 0L, 0L), Type_Nuts = c(0L, 0L, 1L, 1L, 0L), Type_Suet = c(1L, 
0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 0L, 0L, 0L, 1L), Type_FruitOrJams = c(0L, 
0L, 0L, 1L, 0L), Type_Mealworms = c(0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 
0L, 0L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 0L, 0L, 0L, 
0L), Type_Scraps = c(1L, 1L, 1L, 1L, 0L), Type_Bread = c(0L, 
0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 0L, 0L), total = c(5, 
3, 3, 4, 2)), row.names = c(NA, 5L), class = "data.frame")

我们将其设为矩阵并将其转换为transactions 对象，我省略了最后一列，因为您不需要总计：

library(arules)
# Convert the 0/1 indicator columns into an arules `transactions` object.
# `total` is excluded by name (not by position) because it is a row sum,
# not a food type.
m <- as(as.matrix(df[, names(df) != "total"]), "transactions")
# Prints an overview of the transactions
summary(m)
# Item-by-item co-occurrence matrix
counts <- crossTable(m)

要得到你所说的数据框，你需要使用dplyr和tidyr：

# `%>%`, arrange() and desc() come from dplyr; pivot_longer() from tidyr.
library(dplyr)
library(tidyr)

# Keep every unordered pair exactly once: blank out the upper triangle and
# the diagonal (an item paired with itself) of the co-occurrence matrix.
counts[upper.tri(counts)] <- NA
diag(counts) <- NA

data.frame(counts) %>%
  # move the row names (first item of the pair) into a column
  tibble::rownames_to_column("item1") %>%
  # long format: one row per (item1, item2) pair; the cells set to NA above
  # are dropped here
  pivot_longer(-item1, names_to = "item2", values_drop_na = TRUE) %>%
  # most frequent pairs first
  arrange(desc(value))

    # A tibble: 120 x 3
   item1           item2               value
   <chr>           <chr>               <int>
 1 Type_Scraps     Type_SunflowerSeeds     3
 2 Type_Scraps     Type_EconMix            3
 3 Type_EconMix    Type_SunflowerSeeds     2
 4 Type_SugarWater Type_EconMix            2

上面可以通过在 arules 中使用apriori 来简化：

# Size of the combinations to count (2 = pairs, 3 = three-way, ...)
N <- 2
# Mine rules made of exactly N items, with low (0.01) support and
# confidence thresholds.
rules <- apriori(
  m,
  parameter = list(maxlen = N, minlen = N, confidence = 0.01, support = 0.01)
)
# Several rules can be generated from the same itemset; keep one rule per
# itemset. Logical indexing is used on purpose: `rules[-which(dup)]` would
# select nothing at all if there happened to be no duplicates.
gi <- generatingItemsets(rules)
rules <- sort(rules[!duplicated(gi)])
# One row per combination with its absolute count
data.frame(
  lhs = labels(lhs(rules)),
  rhs = labels(rhs(rules)),
  count = quality(rules)$count
)

                     lhs                   rhs count
1  {Type_SunflowerSeeds}         {Type_Scraps}     3
2         {Type_EconMix}         {Type_Scraps}     3
3      {Type_SugarWater}        {Type_EconMix}     2
4            {Type_Nuts}         {Type_Scraps}     2
5  {Type_SunflowerSeeds}        {Type_EconMix}     2
6     {Type_FruitOrJams}           {Type_Nuts}     1

如果要统计三向组合，只需将上面的 N 改为 3。

【讨论】：

【解决方案2】：

如果我正确理解了您的问题，即您要统计同一行中两个类别同时为 1 的频率（例如，像 @M-- 所问的成对组合），那么下面是我过去的做法。我敢肯定还有更优雅的处理方式 :D

library(dplyr)
library(tidyr)

# Example data (dput output): 10 rows, one 0/1 indicator column per Type_*
# food type, plus a `total` column holding the number of 1s in each row.
test.df <- structure(list(Type_SunflowerSeeds = c(1L, 1L, 1L, 0L, 0L, 1L, 
1L, 0L, 0L, 0L), Type_SafflowerSeeds = c(0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L), Type_Nyjer = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L), Type_EconMix = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 
0L, 0L), Type_PremMix = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
1L), Type_Grains = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 
    Type_Nuts = c(0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Suet = c(1L, 
    0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Type_SugarWater = c(1L, 
    0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L), Type_FruitOrJams = c(0L, 
    0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Mealworms = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Corn = c(0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), Type_BarkOrPeanutButter = c(0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Scraps = c(1L, 
    1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Bread = c(0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Type_Other = c(0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L), total = c(5, 3, 3, 4, 2, 3, 
    3, 1, 1, 2)), row.names = c(NA, 10L), class = "data.frame")

# Count how often each unordered pair of food types is 1 in the same row.

# Long format: one row per (food.id, type1) whose indicator is 1.
# food.id is the row number of test.df.
pair.types.long <- test.df %>%
  mutate(food.id = row_number()) %>%
  pivot_longer(
    cols = -c(food.id, total),
    names_to = "type1",
    values_to = "val"
  ) %>%
  filter(val == 1) %>%
  select(food.id, type1)

pair.types.long %>%
  # Self-join on food.id: every type of a row is matched with every type of
  # the same row. Multiple matches on both sides are intended; dplyr >= 1.1
  # warns about such joins unless `relationship = "many-to-many"` is passed.
  left_join(
    select(pair.types.long, food.id, type2 = type1),
    by = "food.id"
  ) %>%
  # The strict string comparison drops self-pairs (type1 == type2) and keeps
  # only one of the two orderings (A, B) / (B, A) of each pair.
  filter(type1 > type2) %>%
  # number of rows in which the pair occurs
  count(type1, type2, name = "n.times") %>%
  arrange(desc(n.times), type1, type2)

输出：

   type1               type2            n.times
   <chr>               <chr>              <int>
 1 Type_Scraps         Type_EconMix           3
 2 Type_SugarWater     Type_EconMix           3
 3 Type_SunflowerSeeds Type_EconMix           3
 4 Type_SunflowerSeeds Type_Scraps            3
 5 Type_SunflowerSeeds Type_SugarWater        3
 6 Type_Scraps         Type_Nuts              2
 7 Type_SugarWater     Type_Suet              2
 8 Type_SunflowerSeeds Type_Suet              2
 9 Type_FruitOrJams    Type_EconMix           1
10 Type_Nuts           Type_EconMix           1
11 Type_Nuts           Type_FruitOrJams       1
12 Type_Scraps         Type_FruitOrJams       1
13 Type_Suet           Type_EconMix           1
14 Type_Suet           Type_Scraps            1
15 Type_SugarWater     Type_PremMix           1
16 Type_SugarWater     Type_Scraps            1
17 Type_SunflowerSeeds Type_Nuts              1

要扩展它并进行三向组合计数，您可以按照此代码进行操作。我还添加了一些额外的注释来逐步说明正在发生的事情：

# Baseline long table: one row per (food.id, type1) whose indicator is 1.
# food.id is the row number of test.df; `total` is kept out of the reshape
# and dropped by the final select().
food.type.long.df <- test.df %>%
  mutate(food.id = row_number()) %>%
  pivot_longer(
    cols = -c(food.id, total),
    names_to = "type1",
    values_to = "val"
  ) %>%
  filter(val == 1) %>%
  select(food.id, type1) %>%
  arrange(food.id)

# Pair every type of a food.id with every type of the same food.id by
# joining the baseline table to a copy of itself (type1 renamed to type2).
# The result still contains:
#   * self-pairs, e.g. type1 = Type_Corn and type2 = Type_Corn
#   * both orderings of each pair, e.g. (Type_SunflowerSeeds, Type_EconMix)
#     and (Type_EconMix, Type_SunflowerSeeds)
# Neither must be counted twice; they are filtered out in a later step.
food.2types.df <- left_join(
  food.type.long.df,
  food.type.long.df %>% select(food.id, type2 = type1),
  by = "food.id"
) %>%
  arrange(food.id)

# Attach a third type the same way: join the pair table to the baseline
# table again, this time with type1 renamed to type3. Repeated types and
# re-ordered combinations are still present here and are removed later.
food.3types.df <- left_join(
  food.2types.df,
  food.type.long.df %>% select(food.id, type3 = type1),
  by = "food.id"
) %>%
  arrange(food.id)

# Keep each three-way combination once per food.id.
# Requiring type1 > type2 > type3 (strict string comparison) removes rows
# where a type is repeated and keeps only one of the possible orderings of
# the same three types. The comparison works on the type names directly,
# so no numeric helper columns are needed.
food.3types.df.fixed <- food.3types.df %>%
  distinct() %>%
  filter(type1 > type2, type2 > type3)

# Number of rows (food.ids) in which each three-way combination occurs,
# most frequent combinations first.
food.3type.combination.count <- food.3types.df.fixed %>%
  group_by(type1, type2, type3) %>%
  tally(name = "n.times") %>% # one row per combination with its count
  ungroup() %>%
  arrange(desc(n.times), type1, type2, type3)

输出：

   type1               type2            type3            n.times
   <chr>               <chr>            <chr>              <int>
 1 Type_SunflowerSeeds Type_Scraps      Type_EconMix           2
 2 Type_SunflowerSeeds Type_SugarWater  Type_EconMix           2
 3 Type_SunflowerSeeds Type_SugarWater  Type_Suet              2
 4 Type_Nuts           Type_FruitOrJams Type_EconMix           1
 5 Type_Scraps         Type_FruitOrJams Type_EconMix           1
 6 Type_Scraps         Type_Nuts        Type_EconMix           1
 7 Type_Scraps         Type_Nuts        Type_FruitOrJams       1
 8 Type_Suet           Type_Scraps      Type_EconMix           1
 9 Type_SugarWater     Type_Scraps      Type_EconMix           1
10 Type_SugarWater     Type_Suet        Type_EconMix           1
11 Type_SugarWater     Type_Suet        Type_Scraps            1
12 Type_SunflowerSeeds Type_Scraps      Type_Nuts              1
13 Type_SunflowerSeeds Type_Suet        Type_EconMix           1
14 Type_SunflowerSeeds Type_Suet        Type_Scraps            1
15 Type_SunflowerSeeds Type_SugarWater  Type_Scraps            1

【讨论】：

这太完美了！谢谢，这有助于我获得成对组合。我确实有点迷失在代码中，但它似乎运行正常。
好的！计算成对出现是很棘手的。我看到您更新了您的问题以查看三向组合，因此我将更新我的答案并尝试提供更多解释。