【发布时间】:2015-07-11 04:58:15
【问题描述】:
我有两个大型数据集,df1 和 df2。第一个数据集 df1 包含“ID”和“actual.data”列。
df1 <- data.frame(ID=c(1,1,1,2,3,4,4), actual.date=c('10/01/1997','2/01/1998','5/01/2002','7/01/1999','9/01/2005','5/01/2006','2/03/2003'));
dcis <- grep('date$',names(df1));
df1[dcis] <- lapply(df1[dcis],as.Date,'%m/%d/%Y');
df1;
ID actual.date
1 1 1997-10-01
2 1 1998-02-01
3 1 2002-05-01
4 2 1999-07-01
5 3 2005-09-01
6 4 2006-05-01
7 4 2003-02-03
第二个数据集 df2 包含两个日期字段“before,date”和“after.date”,分别表示开始日期和结束日期:
df2 <- data.frame(ID=c(1,1,1,2,3,4,4,4), before.date=c('10/1/1996','1/1/1998','1/1/2000','1/1/2001','1/1/2001','1/1/2001','10/1/2004','10/3/2004'), after.date=c('12/1/1996','9/30/2003','12/31/2004','3/31/2006','9/30/2006','9/30/2005','12/30/2004','11/28/2004') );
dcis <- grep('date$',names(df2));
df2[dcis] <- lapply(df2[dcis],as.Date,'%m/%d/%Y');
df2;
ID before.date after.date
1 1 1996-10-01 1996-12-01
2 1 1998-01-01 2003-09-30
3 1 2000-01-01 2004-12-31
4 2 2001-01-01 2006-03-31
5 3 2001-01-01 2006-09-30
6 4 2001-01-01 2005-09-30
7 4 2004-10-01 2004-12-30
8 4 2004-10-03 2004-11-28
我的目标是在 df1 的末尾创建一个名为“match”的新列,该列指示 df1 中每一行的“actual.date”是否介于任何的“before.date”和“after.date”之间df2 中相同 ID 的观测值。如果介于两者之间,我想给“匹配”列赋值 1,否则为 0(包括没有“ID”匹配的实例)。
这是我希望的输出:
ID before.date match
1 1 1997-10-01 0
2 1 1998-02-01 1
3 1 2002-05-01 1
4 2 1999-07-01 0
5 3 2005-09-01 1
7 4 2006-05-01 0
8 4 2003-02-03 1
我认为这可以通过 for() 循环来完成,但我对 R 不太了解。
样本数据:
DF1
structure(list(cikcode = c("20", "20", "20", "20", "20", "20",
"20", "20", "20", "20", "20", "20", "20", "20", "20", "20", "20",
"20", "20", "1750"), auditorkey = c("4", "4", "5", "5", "5",
"5", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6", "6",
"6", "4"), yearendeddate = structure(c(4L, 4L, 2L, 2L, 3L, 3L,
5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 1L), .Label = c("2000-05-31",
"2000-12-30", "2001-12-29", "2002-12-28", "2004-01-03", "2005-01-01",
"2005-12-31", "2006-12-30", "2007-12-29", "2009-01-03", "2010-01-02"
), class = "factor"), source = structure(c(1L, 3L, 1L, 3L, 2L,
3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L), .Label = c("10-K",
"10-K405", "DEF 14A"), class = "factor"), sourcedate = structure(c(6L,
7L, 2L, 3L, 4L, 5L, 8L, 9L, 10L, 20L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 1L), .Label = c("2000-08-24", "2001-03-26",
"2001-03-28", "2002-03-20", "2002-03-25", "2003-03-27", "2003-03-31",
"2004-04-01", "2004-04-06", "2005-03-31", "2006-03-23", "2006-03-28",
"2007-03-09", "2007-03-27", "2008-03-12", "2008-04-04", "2009-03-13",
"2009-04-06", "2010-03-15", "2005-04-04"), class = "factor"),
financialsdate = structure(c(4L, 4L, 2L, 2L, 3L, 3L, 5L,
5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 1L), .Label = c("2000-05-31",
"2000-12-30", "2001-12-29", "2002-12-28", "2004-01-03", "2005-01-01",
"2005-12-31", "2006-12-30", "2007-12-29", "2009-01-03", "2010-01-02"
), class = "factor"), auditopinionkey = c("3538", "NA", "66900",
"NA", "78252", "NA", "39225", "NA", "84748", "NA", "102979",
"NA", "120889", "NA", "148621", "NA", "171604", "NA", "192814",
"156138"), auditorstatecode = structure(c(2L, NA, 2L, NA,
2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L,
1L), .Label = c("IL", "PA"), class = "factor"), auditorstatename = structure(c(2L,
NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA, 2L, NA,
2L, NA, 2L, 1L), .Label = c("ILLINOIS", "PENNSYLVANIA"), class = "factor"),
goingconcern = structure(c(1L, NA, 1L, NA, 1L, NA, 1L, NA,
1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, 1L), .Label = "No", class = "factor"),
goingconcernissuekeylist = structure(c(1L, NA, 1L, NA, 1L,
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, 1L
), .Label = "", class = "factor"), goingconcernissuephraselist = structure(c(1L,
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA,
1L, NA, 1L, 1L), .Label = "", class = "factor"), isadditionalopinion = structure(c(1L,
NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA, 1L, NA,
1L, NA, 1L, 1L), .Label = "No", class = "factor"), restatement = c("NA",
"0", "NA", "0", "NA", "0", "NA", "0", "NA", "0", "NA", "0",
"NA", "0", "NA", "0", "NA", "0", "0", "NA"), yearended = c("NA",
"2002", "NA", "2000", "NA", "2001", "NA", "2003", "NA", "2004",
"NA", "2005", "NA", "2006", "NA", "2007", "NA", "2008", "2009",
"NA"), assets = c("50,459,000", "50,459,000", "54,421,000",
"54,421,000", "47,644,000", "47,644,000", "83,081,000", "83,081,000",
"93,016,000", "93,016,000", "89,110,000", "89,110,000", "140,996,000",
"140,996,000", "184,118,000", "184,118,000", "199,444,000",
"199,444,000", "204,236,000", "740,998,000"), auditfees = c("123,700",
"123,700", "200,000", "200,000", "185,000", "185,000", "137,100",
"137,100", "225,000", "225,000", "244,000", "244,000", "574,000",
"574,000", "669,000", "669,000", "680,000", "680,000", "643,000",
"NA"), auditor = c("KPMG LLP", "KPMG LLP", "Arthur Andersen LLP",
"Arthur Andersen LLP", "Arthur Andersen LLP", "Arthur Andersen LLP",
"Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
"Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
"Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
"Grant Thornton LLP", "Grant Thornton LLP", "Grant Thornton LLP",
"Grant Thornton LLP", "KPMG LLP"), earnings = c("3,284,000",
"3,284,000", "5,838,000", "5,838,000", "1,048,000", "1,048,000",
"NA", "", "NA", "", "NA", "", "NA", "", "21,321,000", "21,321,000",
"25,773,000", "25,773,000", "21,555,000", "35,163,000"),
naicscode = c("334513", "334513", "334513", "334513", "334513",
"334513", "334513", "334513", "334513", "334513", "334513",
"334513", "334513", "334513", "334513", "334513", "334513",
"334513", "334513", "334613"), revenue = c("68,231,000",
"68,231,000", "84,912,000", "84,912,000", "71,819,000", "71,819,000",
"94,676,000", "94,676,000", "112,494,000", "112,494,000",
"118,940,000", "118,940,000", "148,127,008", "148,127,008",
"201,677,000", "201,677,000", "243,018,000", "243,018,000",
"190,774,000", "1,024,333,000"), siccode = c("3823", "3823",
"3823", "3823", "3823", "3823", "3823", "3823", "3823", "3823",
"3823", "3823", "3823", "3823", "3823", "3823", "3823", "3823",
"3823", "3720"), statecode = c("NJ", "NJ", "NJ", "NJ", "NJ",
"NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ", "NJ",
"NJ", "NJ", "NJ", "NJ", "IL"), statename = c("NEW JERSEY",
"NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY",
"NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY",
"NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "NEW JERSEY",
"NEW JERSEY", "NEW JERSEY", "NEW JERSEY", "ILLINOIS"), ticker = c("NA",
"", "NA", "", "NA", "", "NA", "", "NA", "", "NA", "", "NA",
"", "NA", "", "NA", "", "", "AIR")), .Names = c("cikcode",
"auditorkey", "yearendeddate", "source", "sourcedate", "financialsdate",
"auditopinionkey", "auditorstatecode", "auditorstatename", "goingconcern",
"goingconcernissuekeylist", "goingconcernissuephraselist", "isadditionalopinion",
"restatement", "yearended", "assets", "auditfees", "auditor",
"earnings", "naicscode", "revenue", "siccode", "statecode", "statename",
"ticker"), row.names = c(NA, 20L), class = "data.frame")
DF2
structure(list(cikcode = c(320193L, 72971L, 72971L, 200406L,
40545L, 40545L, 1114448L, 19617L, 19617L, 1067983L, 70858L, 313807L,
1578845L, 1113172L, 64803L, 1135644L, 731766L, 14272L, 14272L,
66740L), auditoratdisclosuredate = c("KPMG LLP", "KPMG LLP",
"KPMG LLP", "PricewaterhouseCoopers LLP", "KPMG LLP", "KPMG LLP",
"PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP",
"Deloitte & Touche LLP", "PricewaterhouseCoopers LLP", "Ernst & Young LLP",
"PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP", "KPMG LLP",
"KPMG LLP", "Deloitte & Touche LLP", "PricewaterhouseCoopers LLP",
"PricewaterhouseCoopers LLP", "PricewaterhouseCoopers LLP"),
auditoratdisclosuredatekey = c("4", "4", "4", "1", "4", "4",
"1", "1", "1", "3", "1", "2", "1", "1", "4", "4", "3", "1",
"1", "1"), auditorduringrestatedperiod = c("|KPMG LLP|",
"|KPMG LLP|", "|KPMG LLP|", "|PricewaterhouseCoopers LLP|",
"|KPMG LLP|", "|KPMG LLP|", "|PricewaterhouseCoopers LLP|PricewaterhouseCoopers LLP (PricewaterhouseCoopers AG, SWITZERLAND)|",
"|PricewaterhouseCoopers LLP|", "|PricewaterhouseCoopers LLP|",
"|Deloitte & Touche LLP|", "|PricewaterhouseCoopers LLP|PricewaterhouseCoopers LLP (Price Waterhouse & Co SRL, ARGENTINA)|",
"|Ernst & Young LLP|Ernst & Young LLP (Ernst & Young LLP, UNITED KINGDOM)|",
"|PricewaterhouseCoopers LLP|", "|Deloitte & Touche LLP (Deloitte Touche Tohmatsu Auditores Independentes, BRAZIL)|Deloitte & Touche LLP (KPMG Auditores Independentes, BRAZIL)|KPMG LLP (Deloitte Touche Tohmatsu Auditores Independentes, BRAZIL)|KPMG LLP (KPMG Auditores Independentes, BRAZIL)|",
"|KPMG LLP|", "|KPMG LLP (KPMG Deutsche Treuhand-Gesellschaft AG WPG, GERMANY)|",
"|Arthur Andersen LLP|Deloitte & Touche LLP|", "|PricewaterhouseCoopers LLP|",
"|PricewaterhouseCoopers LLP|", "|PricewaterhouseCoopers LLP|"
), auditorduringrestatedperiodkeys = c("|4|", "|4|", "|4|",
"|1|", "|4|", "|4|", "|1|", "|1|", "|1|", "|3|", "|1|", "|2|",
"|1|", "|3|4|", "|4|", "|4|", "|5|3|", "|1|", "|1|", "|1|"
), auditoropinionperiodendduringrestatedperiod = c("|KPMG LLP|",
"|KPMG LLP|", "|KPMG LLP|", "", "|KPMG LLP|", "|KPMG LLP|",
"|PricewaterhouseCoopers LLP[PricewaterhouseCoopers AG]|",
"|PricewaterhouseCoopers LLP|", "", "", "|PricewaterhouseCoopers LLP|",
"|Ernst & Young LLP[Ernst & Young LLP]|", "|PricewaterhouseCoopers LLP|",
"|KPMG LLP[KPMG Auditores Independentes]|", "", "|KPMG LLP[KPMG Deutsche Treuhand-Gesellschaft AG WPG]|",
"|Arthur Andersen LLP|Deloitte & Touche LLP|", "|PricewaterhouseCoopers LLP|",
"|PricewaterhouseCoopers LLP|", "|PricewaterhouseCoopers LLP|"
), auditoropinionperiodendduringrestatedperiodkeys = c("|4|",
"|4|", "|4|", "", "|4|", "|4|", "|1|", "|1|", "", "", "|1|",
"|2|", "|1|", "|4|", "", "|4|", "|5|3|", "|1|", "|1|", "|1|"
), restatementkey = c("10196", "12617", "42632", "44280",
"857", "22310", "7995", "10187", "38972", "5379", "6357",
"9495", "49566", "32005", "45575", "15890", "13410", "695",
"3752", "36923"), restatedperiodbegin = c("1996-10-01", "1998-01-01",
"2005-01-01", "2012-01-01", "2001-01-01", "2001-01-01", "2003-01-01",
"2003-01-01", "2012-01-01", "2001-07-01", "2001-01-01", "2000-01-01",
"2014-01-01", "2007-01-01", "1997-01-01", "2000-10-01", "1994-01-01",
"2001-01-01", "1999-01-01", "2010-01-01"), restatedperiodended = c("2006-04-01",
"2003-09-30", "2007-09-30", "2012-07-01", "2006-03-31", "2006-09-30",
"2004-12-31", "2006-03-31", "2012-03-31", "2001-09-30", "2005-09-30",
"2004-12-31", "2014-12-31", "2008-12-31", "1998-12-31", "2006-09-30",
"2006-09-30", "2003-09-30", "2002-06-30", "2011-03-31"),
disclosure = c("Press Release", "10-Q/A", "10-K", "10-Q",
"8-K", "8-K", "20-F", "8-K", "8-K", "10-Q/A", "8-K", "20-F/A",
"10-Q", "20-F", "8-K", "20-F", "8-K", "8-K", "Press Release",
"10-Q"), disclosuredate = c("2006-06-29", "2004-01-16", "2008-02-29",
"2012-11-09", "2005-05-06", "2007-01-19", "2006-01-30", "2006-08-03",
"2012-07-13", "2002-03-15", "2006-02-22", "2006-06-13", "2015-05-11",
"2010-04-21", "1999-11-15", "2006-12-11", "2006-04-07", "2004-01-29",
"2002-10-22", "2011-08-04"), cumulativechangeinnetincome = c(NaN,
NaN, 0, 0, 3.04344437838208e-315, NaN, NaN, 0, NaN, 0, 2.66795448754273e-315,
0, 0, 0, 0, NaN, NaN, NaN, NaN, 0), accountingrulegaapfasbapplicationfailures = c("|Deferred, stock-based and/or executive comp issues|Deferred, stock-based options backdating only (subcategory)|",
"|Lease, SFAS 5, legal, contingency and commitment issues|Lease, leasehold and FAS 13 (98) only (subcategory)|",
"|Cash flow statement (SFAS 95) classification errors|Lease, SFAS 5, legal, contingency and commitment issues|",
"|Cash flow statement (SFAS 95) classification errors|",
"|Financial derivatives/hedging (FAS 133) acct issues|",
"|Financial derivatives/hedging (FAS 133) acct issues|Foreign, related party, affiliated, or subsidiary issues|",
"|PPE intangible or fixed asset (value/diminution) issues|Acquisitions, mergers, disposals, re-org acct issues|Consolidation issues incl Fin 46 variable interest & off-B/S|Deferred, stock-based and/or executive comp issues|Tax expense/benefit/deferral/other (FAS 109) issues|GAAP - Changes in Acct Principles FASB/EITF or Foreign GAAP|Acquisitions, mergers, only (subcategory) acct issues|PPE issues - Intangible assets, goodwill only (subcategory)|Pension and other post-retirement benefit issues|",
"|Accounts/loans receivable, investments & cash issues|Cash flow statement (SFAS 95) classification errors|",
"|Revenue recognition issues|", "|Consolidation issues incl Fin 46 variable interest & off-B/S|Intercompany, investment in subs./affiliate issues|",
"|Financial derivatives/hedging (FAS 133) acct issues|",
"|Revenue recognition issues|Financial derivatives/hedging (FAS 133) acct issues|Inventory, vendor and/or cost of sales issues|GAAP - Changes in Acct Principles FASB/EITF or Foreign GAAP|",
"|Debt, quasi-debt, warrants & equity ( BCF) security issues|Foreign, related party, affiliated, or subsidiary issues|Cash flow statement (SFAS 95) classification errors|Foreign, subsidiary only issues (subcategory)|",
"", "|Acquisitions, mergers, disposals, re-org acct issues|Acquisitions, mergers, only (subcategory) acct issues|",
"|Tax expense/benefit/deferral/other (FAS 109) issues|",
"|Deferred, stock-based and/or executive comp issues|Deferred, stock-based options backdating only (subcategory)|",
"|Acquisitions, mergers, disposals, re-org acct issues|Consolidation issues incl Fin 46 variable interest & off-B/S|Accounts/loans receivable, investments & cash issues|Tax expense/benefit/deferral/other (FAS 109) issues|Fin Statement, footnote & segment disclosure issues|Pension and other post-retirement benefit issues|",
"|Revenue recognition issues|Accounts/loans receivable, investments & cash issues|Inventory, vendor and/or cost of sales issues|",
"|Cash flow statement (SFAS 95) classification errors|"),
financialfraudirregularitiesandmisrepresentations = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "",
"", "", "", ""), errorsaccountingandclericalapplications = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "|EPS, ratio and classification of income statement issues|",
"", "", "", "", "", ""), othersignificantissues = c("", "",
"", "", "", "", "", "", "|Z - Material Weakness - Section 404 or 302 issues identified|",
"", "", "", "", "", "", "", "", "", "|Y - Loan covenant violations/issues|",
""), secinvestigation = c("", "", "Y", "", "Y", "Y", "",
"", "", "", "", "", "", "", "", "", "", "", "", "")), .Names = c("cikcode",
"auditoratdisclosuredate", "auditoratdisclosuredatekey", "auditorduringrestatedperiod",
"auditorduringrestatedperiodkeys", "auditoropinionperiodendduringrestatedperiod",
"auditoropinionperiodendduringrestatedperiodkeys", "restatementkey",
"restatedperiodbegin", "restatedperiodended", "disclosure", "disclosuredate",
"cumulativechangeinnetincome", "accountingrulegaapfasbapplicationfailures",
"financialfraudirregularitiesandmisrepresentations", "errorsaccountingandclericalapplications",
"othersignificantissues", "secinvestigation"), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"), class = "data.frame")
【问题讨论】:
-
你可以使用
foverlapsfromdata.table -
我通过复制/粘贴 dput 得到了一些错误,即
Error in structure(list(cikcode = c("20", "20", "20", "20", "20", "20", : object 'auditorkey' not found我想知道为什么最后没有引用一些错误?例如auditorkey,auditopinionkey, ... -
我尝试添加引号,然后再次运行它。现在错误是
Error in structure(c(1L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, : object 'K405' not found能否请您再次运行 dput 以查看一切是否正确 -
顺便说一句,您要比较的两个数据集中的日期列是什么?在
DF1中,有yearendeddate、sourcedate、financialdate。在DF2、restatedperiodbegin、restatedperiodended和disclosuredate。另外,这两个数据集之间的常见 ID 列/列是什么? -
在下面引用我的评论:“开始日期 - 重述期间开始结束日期 - 重述期间结束 ID - cikcode 实际日期 - 年结束日期”