【问题标题】:Programming mechanics: Assigning group IDs编程机制:分配组 ID
【发布时间】:2012-09-28 19:45:04
【问题描述】:

假设我有一些航空公司的航班数据。一个字段是始发机场,另一个字段是目的地机场。我想按路线(独特的起点到目的地组合)对观察进行分组。问题是,对于每条独特的路线,还需要包括相应的返回路线。例如,如果我有机场 A 到机场 B 进行一组观察,但在接下来的几次观察中从机场 B 到机场 A,我希望它们都具有相同的路线 ID。

我可以为此使用 SAS、Stata 或 R。即使是 Python,如果它更容易的话。

可复现的示例数据如下:

# Sample flight data used throughout the question and answers.
# A 69-row data.frame (row.names = c(NA, 69L)) with the columns
# airl, ORIGIN, DESTINATION, miles, orig_area, dest_area, month, freq,
# seats and year. Every row has airl == "US"; the ORIGIN/DESTINATION pairs
# present are ABE->CLT, ABE->PHL, CLT->ABE and PHL->ABE, i.e. two routes,
# each observed in both directions.
df1 <- structure(list(airl = c("US", "US", "US", "US", "US", "US", "US", 
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
"US", "US", "US", "US", "US", "US", "US", "US", "US", "US", "US", 
"US", "US", "US", "US", "US", "US", "US"), ORIGIN = c("ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "CLT", "CLT", "CLT", 
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", 
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL", 
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", 
"PHL", "PHL", "PHL", "PHL", "PHL"), DESTINATION = c("CLT", "CLT", 
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "CLT", 
"CLT", "CLT", "CLT", "CLT", "CLT", "CLT", "PHL", "PHL", "PHL", 
"PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", "PHL", 
"PHL", "PHL", "PHL", "PHL", "PHL", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", "ABE", 
"ABE", "ABE", "ABE", "ABE"), miles = c(480, 480, 480, 480, 480, 
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 54, 
54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 
480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 480, 
480, 480, 480, 480, 480, 54, 54, 54, 54, 54, 54, 54, 54, 54, 
54, 54, 54, 54, 54, 54, 54, 54), orig_area = c(23, 23, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 36, 36, 
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 
23), dest_area = c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 
36, 36, 36, 36, 36, 36, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23), month = c(1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
1, 2, 3, 4, 5, 6, 7), freq = c(88, 80, 89, 78, 88, 83, 85, 80, 
76, 79, 76, 81, 86, 65, 62, 60, 82, 137, 138, 142, 144, 149, 
147, 150, 143, 150, 138, 128, 151, 145, 148, 146, 147, 149, 79, 
76, 81, 86, 65, 62, 60, 82, 82, 82, 84, 81, 83, 81, 85, 84, 76, 
85, 143, 137, 138, 142, 143, 151, 147, 150, 143, 150, 137, 128, 
151, 145, 148, 146, 147), seats = c(8146, 7352, 7599, 6920, 6759, 
6060, 6189, 5939, 6137, 6504, 6440, 6804, 6862, 5330, 5242, 5068, 
6204, 6460, 6276, 6047, 6095, 6306, 6102, 6265, 7085, 7344, 6809, 
6348, 6965, 6626, 6893, 6741, 6765, 6865, 6504, 6440, 6804, 6862, 
5330, 5242, 5068, 6204, 6104, 6030, 6278, 6034, 6944, 6816, 6544, 
6494, 5872, 6544, 6747, 6460, 6276, 6034, 6058, 6380, 6102, 6278, 
7085, 7344, 6759, 6348, 6952, 6613, 6919, 6728, 6765), year = c(2009, 
2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 2009, 
2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 
2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2011, 
2009, 2009, 2009, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 
2010, 2010, 2010, 2010, 2011, 2011, 2011, 2010, 2010, 2010, 2010, 
2010, 2010, 2010, 2010, 2010, 2010, 2011, 2011, 2011, 2011, 2011, 
2011, 2011)), .Names = c("airl", "ORIGIN", "DESTINATION", "miles", 
"orig_area", "dest_area", "month", "freq", "seats", "year"), class = "data.frame", row.names = c(NA, 
69L))

【问题讨论】:

  • 您能提供一些示例数据吗?
  • 如果您能提供一个可复现的示例(reproducible example),[r] 标签下的回答者会更乐意帮忙
  • 你能告诉我们想要的结果吗?

标签: r group-by grouping sas stata


【解决方案1】:

采用最简单的标签(两个机场代码,按字母顺序列出,因此出发地和目的地无关紧要):

# Label every row with a direction-independent route ID: the two airport
# codes in alphabetical order, joined by "-", so that A->B and B->A share
# one label (e.g. both ABE->CLT and CLT->ABE become "ABE-CLT").
#
# pmin()/pmax() compare the two columns element-wise in one vectorized pass,
# instead of coercing the data frame to a matrix and calling sort() once per
# row through apply().
df1$group <- local({
  # as.character() so the comparison also works if the columns are factors.
  origin <- as.character(df1$ORIGIN)
  destination <- as.character(df1$DESTINATION)

  first <- pmin(origin, destination)
  second <- pmax(origin, destination)
  route <- paste(first, second, sep = "-")

  # A row with a missing airport has no well-defined route. Mark it NA
  # explicitly rather than emitting a partial or "NA-NA" label.
  route[is.na(origin) | is.na(destination)] <- NA_character_
  route
})

简而言之,只需提取机场代码,并为每一行按字母顺序对代码进行排序,然后用连字符将它们分开粘贴。分配 group 变量。

> df1$group
 [1] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
 [8] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[15] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[22] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[29] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-CLT"
[36] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[43] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-CLT"
[50] "ABE-CLT" "ABE-CLT" "ABE-CLT" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[57] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"
[64] "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL" "ABE-PHL"

【讨论】:

    【解决方案2】:

    在Stata中,你可以利用字符串之间的关系:

    * Strings can be compared directly: "ABE" sorts before "CLT", so this holds.
    assert "ABE" < "CLT"
    

    不会有任何问题;而下面两条中的任何一条

    * Both assertions below are meant to fail.
    * First: the comparison is false, because "ABE" does not sort after "CLT".
    assert "ABE">"CLT"
    * Second: a string is compared with a number (incompatible types).
    assert "ABE"<1
    

    会产生错误(第一个断言不成立,第二个是在比较不兼容的数据类型)。因此,仿照 Brian 在 R 中给出的建议,在 Stata 中可以这样写:

    * Build a direction-independent route label: the alphabetically smaller
    * airport code always comes first, so A->B and B->A get the same string.
    * Rows where either code is missing are left without a route.
    gen str7 route = origin + "-" + destination if !missing(origin, destination) & origin < destination
    replace  route = destination + "-" + origin if !missing(origin, destination) & destination < origin
    * Show every row that did not receive a route, so bad data is visible.
    list origin destination route if missing(route)
    

    当然,这些缺失值检查多少有些过度谨慎。但谁知道你的数据会有多糟糕呢 :)。

    【讨论】:

      【解决方案3】:

      一种使用 R 的 data.table 包的方法(代码更简洁,内存效率也更高)

      library(data.table)

      # Convert the sample data to a data.table.
      DT <- as.data.table(df1)

      # Add `id` by reference, computed once per distinct ORIGIN/DESTINATION
      # pair: the two codes sorted alphabetically and joined with "-", so both
      # directions of a route receive the same id.
      DT[, id := paste(sort(c(ORIGIN, DESTINATION)), collapse = "-"),
         by = c("ORIGIN", "DESTINATION")]
      

      【讨论】:

        【解决方案4】:

        SAS 解决方案,使用 PROC FORMAT。我在这里采取的步骤比您可能需要的要多得多 - 只需拆分每个步骤以明确我在做什么。这将为每一对分配一个值,并将该对的两个方向分配给相同的值。

        你也可以像 R 解决方案那样,让标签本身充当组定义:用格式把 'ABQ-DEN' 和 'DEN-ABQ' 都转换为 'ABQ-DEN'。做法与下面的代码基本相同,只是要在 PROC SORT 之前的那个数据步中设置 label=start,并且需要把 INFORMAT 改为 FORMAT。

        proc format;
            /* What the route informat will look like. This is an example only
               and is NOT used in the final solution. Both directions of a
               route map to the same group number. */
            invalue $AIRRT
                'ABE-CLT', 'CLT-ABE' = 1
                'ABE-PHL', 'PHL-ABE' = 2
                'ABQ-DEN', 'DEN-ABQ' = 3
                'ABQ-ELP', 'ELP-ABQ' = 4
                'MDW-MCI'            = 5
            ;
            /* Numeric code -> airport code. Only used to create sample data. */
            value AIRPORT
                1 = "ABE"
                2 = "CLT"
                3 = "PHL"
                4 = "ABQ"
                5 = "ELP"
                6 = "DEN"
                7 = "MDW"
                8 = "MCI"
            ;
        quit;
        
        /* Create sample data: 100 random origin/destination pairs drawn from
           the eight codes of the AIRPORT format, never with the destination
           equal to the origin. */
        data have;
        do _t = 1 to 100;
            /* AIRPORT is defined with VALUE (a numeric format), so it is
               referenced without a $ prefix, exactly as for the destination. */
            origin=put(ceil(8*ranuni(7)),AIRPORT.);
            /* Redraw until the destination differs from the origin. */
            do until (destination ne origin);
                destination=put(ceil(8*ranuni(7)),AIRPORT.);
            end;
            output;
        end;
        run;
        
        /* Preliminary dataset for the format: sort the two airport codes of
           each observation and join them with '-' into START, the route key. */
        data for_format_pre (keep=start origin destination);
            set have;
            call sortc(origin, destination);
            start = catx('-', origin, destination);
        run;

        /* Reduce to a single row per route key. */
        proc sort data=for_format_pre nodupkey;
            by start;
        run;
        
        /* Final control dataset for PROC FORMAT. Each route is written twice,
           once keyed on "origin-destination" and once on the reverse, and both
           rows carry the same running group counter in LABEL. */
        data for_format;
            retain fmtname "AIRRT" type 'j';
            set for_format_pre;
            label + 1;
            output;
            start = catx('-', destination, origin);
            output;
        run;

        /* Import the control dataset into the format catalog. */
        proc format cntlin=for_format;
        quit;
        
        /* Apply to the data: build the "origin-destination" key for every
           observation and read it through the $AIRRT informat to get the group.
           Variables whose names start with an underscore are dropped. */
        data want (drop=_:);
            set have;
            combined = catx('-', origin, destination);
            group_sort = input(combined, $AIRRT.);
        run;
        

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 2015-03-22
          • 1970-01-01
          • 2014-05-15
          • 1970-01-01
          • 2012-10-10
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多