考虑一个具有 k 个键变量和 n 个数据变量的表结构。
键变量定义 BY 组;目标是从组内的所有行中选出最少的一组行,使这些行的非缺失数据值完全覆盖那些取值相同、但非缺失值更少的其他行。
考虑单行,包含 5 个混合类型的数据变量,为此分析抽象命名:
C1, C2, N3, C4, C5
注意:由于变量类型不同,无法用单个 DATA 步数组(array)引用全部数据变量。
继续:
- 构造对应于非缺失值的位掩码,其中有 m 位被置为 1。
- 识别子掩码
考虑一个示例行:
样本数据值的掩码为 1 1 0 0 1,即 m=3。
共有 $2^3 - 1 = 7$ 个子掩码(* 表示该位被关闭):
1 1 0 0 *
1 * 0 0 1
1 * 0 0 *
* 1 0 0 1
* 1 0 0 *
* * 0 0 1
* * 0 0 *
对于组中具有相同值(或空值)的任何其他行,相应的掩码将是样本行的子掩码,因此覆盖率“低”,因此可以丢弃。
可以使用一个以全部数据变量为键的哈希表,来跟踪主行(带有某个主掩码的行)以及由它计算出的各个子掩码。如果后续某行的某个子掩码恰好对应于先前某行的主掩码,则把先前那一行标记为劣等行,从而可以在最终选择时将其过滤掉。
对从 $0$ 到 $2^n-1$ 的 $2^n$ 个值做循环,是遍历所有候选掩码的简单方法。将候选值与主掩码做 BAND(按位与)运算,即可从候选值得到相应的子掩码。
示例代码
/* Sample input: one BY key (ID) followed by five data variables of mixed type. */
/* FIRST, LAST, CITY and COUNTRY are read as character (modified list input,   */
/* the colon-dollar form); ID and YEAR are numeric. A lone period in the data  */
/* lines stands for a missing value. Several rows are exact duplicates or      */
/* partial copies of other rows, which is what the selection logic must prune. */
data have;input
ID FIRST:$ LAST:$ YEAR CITY:$ COUNTRY:$; datalines;
1 John SMITH 1985 NewYork USA
1 John . 1985 . USA
1 John . 1985 . UK
1 . SMITH . Miami USA
1 John SMITH 1985 NewYork USA
1 Mark SMITH 1990 London UK
1 Mark SMITH 1990 London UK
1 Mark SMITH 1990 London UK
1 Mark SMITH 1990 London UK
1 . SMITH 1990 London UK
1 Mark . 1990 London UK
1 Mark SMITH . London UK
1 Mark SMITH 1990 . UK
1 Mark SMITH 1990 London .
1 Mark SMITH . London UK
1 Mark . . London UK
;
/* Stack HAVE on top of itself and relabel the second copy as ID=2, */
/* so that the data holds two BY groups with identical content.     */
data have;
  set have
      have (in=isCopy);
  if isCopy then id = 2;
run;
%macro loadkeysFor(var);
  /*--------------------------------------------------------------------------
   Generate the DATA step statements that register the current row in hash H
   and flag every less complete variant of that row as inferior.

   Parameter
     var   blank-separated list of the data variable names (any mix of
           character and numeric).

   Requirements on the calling DATA step (nothing is declared here)
     - a hash object named H keyed on the variables listed in VAR, with _SEQ
       in its data portion
     - a numeric variable INDEX holding the sequence number of the current row
     - the invocation must sit inside a DO loop, because the generated
       CONTINUE statement skips the remainder of that loop iteration

   Work variables created in the calling step
     _0 .. _(n-1)        saved copies of the data values of the row
     _bit0 .. _bit(n-1)  1 when the corresponding value is non-missing
     _mask               bit mask of the non-missing values (bit i <-> VAR i)
     _submask, _maskbits_, _seq, rc, and the temporary array _mark

   Side effect: the data variables themselves are overwritten while the
   submask keys are built, so the caller must not rely on their values after
   the generated code has run for a new principal row.
  --------------------------------------------------------------------------*/
  %local i n itop;
  %let n = %sysfunc(countw(&var));
  %let itop = %eval(&n-1);

  /* split the list into var0 .. var(n-1) */
  %do i = 0 %to &itop;
    %local var&i;
    %let var&i = %scan(&var,&i+1);
  %end;

  /* generate code to save the data values of the row */
  %do i = 0 %to &itop;
    _&i = &&var&i;
  %end;

  /* generate code to compute one bit per variable: 1 = non-missing */
  %do i = 0 %to &itop;
    _bit&i = not missing(_&i);
  %end;

  /* assemble the bits into a number, _bit0 being the least significant bit */
  _mask = input(cats(of _bit&itop-_bit0), binary&n..);

  /* debugging aids, kept disabled */
  /* put (_&itop-_0) (=);*/
  /* put (_bit&itop-_bit0) (1.) +1 _mask binary&n..;*/

  /* When the key is already in the hash, the data values of the row are
     either identical to a prior row or inferior to a prior row (a submask
     key stored earlier). Either way the row adds nothing, so skip it. */
  if h.find() = 0 then continue;

  _submask = _mask;
  _seq = index;   /* a non-missing _seq is the mark of a principal row */
  h.add();        /* new principal row: save it */

  /* One flag per possible submask value. Values range over 0 .. 2**n - 1,
     so that is the exact extent needed. The flags are reset for every new
     principal row. */
  array _mark(0:%eval(2**&n-1)) _temporary_;
  call missing (of _mark(*));

  _seq = .;       /* a missing _seq is the mark of an inferior submask entry */

  do _maskbits_ = 0 to %eval(2**&n-1);
    _submask = band (_mask, _maskbits_);   /* binary AND yields a submask of _mask */
    if _submask = _mask then continue;     /* that is the principal row itself */
    if missing(_mark(_submask)) then do;   /* handle each distinct submask once */
      _mark(_submask) = 1;
      /* set the hash host variables according to the submask: keep the saved
         value where the bit is on, set the variable to missing where it is off */
      %do i = 0 %to &itop;
        if band(_submask,blshift(1,&i)) then &&var&i = _&i; else call missing(&&var&i);
      %end;
      /* add the inferior entry, or demote an earlier principal row whose
         values equal this submask, by storing it with a missing _seq */
      rc = h.replace();
    end;
  end;

  format _submask _mask binary&n.. _seq _n_ 4.;
%mend;
options mprint;   /* echo the statements generated by the macro to the log */

/* For each BY group (ID) keep only the principal rows: rows whose non-missing */
/* data values are not fully contained in another row of the same group.       */
/* Exact duplicates collapse to one row. Output order within a group follows   */
/* the ascending key order of the hash, not the input order.                   */
data want;
  /* never executed; at compile time it places the variables of HAVE in the    */
  /* program data vector so the hash can bind to them                          */
  if 0 then set have;

  if _n_ = 1 then do;
    /* key = all data variables; _seq is non-missing only for principal rows   */
    declare hash h (ordered:'a');
    h.defineKey ('first', 'last', 'year', 'city', 'country');
    h.defineData('first', 'last', 'year', 'city', 'country', '_seq');
    h.defineDone();
    declare hiter hi('h');
  end;

  /* start every BY group with an empty hash */
  h.clear();

  /* read one whole BY group per step iteration; the macro-generated code      */
  /* may issue CONTINUE to skip rows that add nothing                          */
  do index = 1 by 1 until (last.id);
    set have;
    by id;
    %loadkeysFor(first last year city country)
  end;

  put index=;   /* diagnostic: number of rows read for this group */

  /* retrieve the maximally covering principal rows of the group;              */
  /* entries with a missing _seq are inferior and are not written              */
  do while (hi.next() = 0);
    if _seq then output;
  end;

  keep id first last year city country;
run;