有几个注意事项:
- 将数据转置为又高又瘦的结构时,处理更容易
- 在某些情况下,该人的年龄范围可能存在“差距”
- 示例:20-29 岁发病,40-49 岁发病
这对我来说是 30-39 岁年龄段疾病的隐含零计数。在计算所有 id 中 30-39 的疾病比例时,这种隐含意义很重要(并且应该考虑在内)。
- 当年龄的标准化值用于某个范围内的年龄时,可以轻松完成“空白”的填充。可以使用自定义格式和自定义信息计算“标准”值。
- 只应在登记日期和最后一次患病日期之间的年龄范围内添加空白记录
方法
- 为每个 id 和注册年龄转置年龄和标志
- 使用
MEANS 对每个id 年龄(标准化)的标志求和,以获取一个年龄范围内的并发疾病计数
- 左连接交叉连接 ID 和年龄,并填入总和
- 再次使用
MEANS计算年龄和age*flag_count
- 使用
DATA step 计算计数比例[age*flag_count]/[age]
示例代码
* custom format and informat for standardizing age;
proc format;
value agegrp
20-29 = '20-29'
30-39 = '30-39'
40-49 = '40-49'
50-59 = '50-59'
60-69 = '60-69'
70-79 = '70-79'
80-89 = '80-89'
90-high = '90 + '
;
invalue agegrp
'20-29' = 20
'30-39' = 30
'40-49' = 40
'50-59' = 50
'60-69' = 60
'70-79' = 70
'80-89' = 80
'90 + ' = 90
;
run;
* generate some sample data;
data have;
call streaminit(123);
do id = 1 to 10000;
enrolled = '01jan2000'd + rand('integer', 1, 3650);
age = 20 + rand('integer', 59);
flag1 = rand('uniform') < 0.25;
date1 = enrolled + rand('integer',2500);
flag2 = rand('uniform') < 0.25;
date2 = date1 + rand('integer',2500);
flag3 = rand('uniform') < 0.25;
date3 = date2 + rand('integer',2500);
output;
end;
format enrolled date: yymmdd10. flag: 1.;
run;
* compute age at each disease flag time point;
data stage1;
set have;
age1 = age + intck('year', enrolled, date1);
age2 = age + intck('year', enrolled, date2);
age3 = age + intck('year', enrolled, date3);
run;
* parallel array based transposition of ages and flags;
data stage2;
set stage1;
* map age in range to first value in range;
* need for later when creating zero counts for intermediate age ranges not in data;
* counts of zero counts needed to compute proportion;
enroll_age = input(put(age,agegrp.),agegrp.);
age = input(put(age1,agegrp.),agegrp.); flag = flag1; output;
age = input(put(age2,agegrp.),agegrp.); flag = flag2; output;
age = input(put(age3,agegrp.),agegrp.); flag = flag3; output;
keep id enroll_age age flag;
format enroll_age age 4. flag 1.;
run;
ods listing;
* compute number of simultaneous diseases in age range;
proc means noprint nway data=stage2;
class id enroll_age age;
format enroll_age age agegrp.;
var flag;
output out=stage3(drop= _type_ _freq_) sum=flag_count;
run;
* data for cross join;
data all_ages;
do age = 20 to 90 by 10;
output;
end;
format age agegrp.;
run;
* combine all_ages with each id so as to get a zero count
* for a range not present between enroll_date and max date;
proc sql;
create table stage4 as
select ids.id, ages.age, coalesce(stage3.flag_count,0) as flag_count
from
( select distinct id from stage3 ) as ids
cross join
all_ages as ages
left join
stage3
on ids.id = stage3.id and ages.age = stage3.age
group by
ids.id
having
ages.age between min(stage3.enroll_age) and max(stage3.age)
order by
ids.id, age
;
quit;
* compute counts;
proc means noprint data=stage4;
class age flag_count;
output out=stage5 N(id)=;
types age age*flag_count;
run;
* compute proportions;
data want;
merge
stage5 (where=(_type_=2) rename=_freq_=age_freq)
stage5 (where=(_type_=3) rename=_freq_=flag_count_freq)
;
by age;
age_flag_count_proportion = flag_count_freq / age_freq;
format age_flag_count_proportion percent5.;
keep age flag_count age_freq flag_count_freq age_flag_count_proportion;
format flag_count 1.;
run;
输出数据示例