2017-07-20 10 views
1

2つのデータセットをマージします。私はこれをIDと日付にマージしたい。ただし、日付の一部は、他の日付にマージする行の1〜2日後に除外されます。どのように日付をマージすることができますが、日付の間に最大2日間の違いができますか?日付範囲に基づいてデータセットをマージする方法

マイデータ:

#Random letters to fill out a pathology report 
pathRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse="")) 
pathDate<-as.Date(c("1993-12-22","1994-05-16","1992-07-20","1996-06-02","1992-04-20","1996-08-30","1992-01-26","1991-03-23","1995-12-28","1995-07-15","1993-04-04","1994-01-11","1999-08-21","1993-11-10","1994-02-26","1992-08-06","1993-06-29","1997-03-08","1998-03-03","1998-04-17")) 
#Random Numbers 
pathHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634") 
#Create the dataframe 
pathdf<-data.frame(pathRep,pathDate,pathHospitalNum) 

#Random letters to fill out a pathology report 
EndoRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse="")) 
EndoDate<-as.Date(c("1993-12-22","1994-05-14","1992-07-19","1996-06-01","1992-04-20","1996-08-30","1992-01-24","1991-03-21","1995-12-28","1995-07-15","1993-04-02","1994-01-10","1999-08-21","1993-11-10","1994-02-26","1992-08-05","1993-06-29","1997-03-07","1998-03-03","1998-04-17")) 
#Random Numbers 
EndoHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634") 
#Create the dataframe: 
Endodf<-data.frame(EndoRep,EndoDate,EndoHospitalNum) 

これだけの正確な日付をマージします。

merge(Endodf,pathdf,by=c("Date","HospNum")) 

私は多分私はdifftime列を作成することができると思ったが、私は、私はすべての日にすべての日付を比較してしまうだろうと思います時間がかかることがありますか?

+0

あなたは一から一かかわりを持っている場合は、あなたがそれらを並べ替えることができ、リンクhttps://stackoverflow.com/questions/9269179/r-speeding-up-approximate-date-match-idata-frame – Wen

+0

をチェック関心のある列を 'cbind'するだけです。 –

答えて

1

data.tableroll="nearest"オプションを使用すると、このセットでは機能しますが、2日以上異なる日付に参加しようとすると失敗します。

library(data.table) 

#Random letters to fill out a pathology report 
pathRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse="")) 
pathDate<-as.Date(c("1993-12-22","1994-05-16","1992-07-20","1996-06-02","1992-04-20","1996-08-30","1992-01-26","1991-03-23","1995-12-28","1995-07-15","1993-04-04","1994-01-11","1999-08-21","1993-11-10","1994-02-26","1992-08-06","1993-06-29","1997-03-08","1998-03-03","1998-04-17")) 
#Random Numbers 
pathHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634") 
#Create the data table and set key fields 
pathdt<-data.table(pathRep,pathDate,pathHospitalNum) 
setkey(pathdt, pathHospitalNum, pathDate) 


#Random letters to fill out a pathology report 
EndoRep<-replicate(20,paste(sample(LETTERS,50,replace=T),collapse="")) 
EndoDate<-as.Date(c("1993-12-22","1994-05-14","1992-07-19","1996-06-01","1992-04-20","1996-08-30","1992-01-24","1991-03-21","1995-12-28","1995-07-15","1993-04-02","1994-01-10","1999-08-21","1993-11-10","1994-02-26","1992-08-05","1993-06-29","1997-03-07","1998-03-03","1998-04-17")) 
#Random Numbers 
EndoHospitalNum<-c("H432243","T662272","G424284","W787634","H432243","Y980037","H432243","W787634","Y980037","E432243","U874287","Y980037","U874287","W787634","Y980037","H432243","Y980037","E432243","W787634","W787634") 
#Create the data table and set keys 
Endodt<-data.table(EndoRep,EndoDate,EndoHospitalNum) 
setkey(Endodt, EndoHospitalNum, EndoDate) 

#run the join 
Endodt[pathdt,roll="nearest"] 
+0

最後の行に何か問題があります –

+0

私のために働きます。ありがとう。 dplyrの回答が好きだったかもしれませんが、私は学習したdata.tableをまだhaventしていないかもしれません。 –

2

data.tableローリングと@ alaybournの答えの参加は、私は質問の一部(「アップ許可の日付の間の2日間の差に」が、ほとんどだけにに対処するための別のオプションを追加するつもりだ、とてもいいですがfuzzyjoinパッケージにいくつかの愛を共有してください)。

library(dplyr) 
library(fuzzyjoin) 

Path <- 
    data.frame(
    PathDate = as.Date(c("1993-12-22", "1994-05-16", "1992-07-20", "1996-06-02", "1992-04-20", "1996-08-30", "1992-01-26", "1991-03-23", "1995-12-28", "1995-07-15", "1993-04-04", "1994-01-11", "1999-08-21", "1993-11-10", "1994-02-26", "1992-08-06", "1993-06-29", "1997-03-08", "1998-03-03", "1998-04-17")), 
    PathHospNum = c("H432243", "T662272", "G424284", "W787634", "H432243", "Y980037", "H432243", "W787634", "Y980037", "E432243", "U874287", "Y980037", "U874287", "W787634", "Y980037", "H432243", "Y980037", "E432243", "W787634", "W787634"), 
    PathRep = replicate(20, paste(sample(LETTERS, 10, replace = T), collapse = "")) 
) 

Endo <- 
    data.frame(
    EndoDate = as.Date(c("1993-12-22", "1994-05-14", "1992-07-19", "1996-06-01", "1992-04-20", "1996-08-30", "1992-01-24", "1991-03-21", "1995-12-28", "1995-07-15", "1993-04-02", "1994-01-10", "1999-08-21", "1993-11-10", "1994-02-26", "1992-08-05", "1993-06-29", "1997-03-07", "1998-03-03", "1998-04-17")), 
    EndoHospNum = c("H432243", "T662272", "G424284", "W787634", "H432243", "Y980037", "H432243", "W787634", "Y980037", "E432243", "U874287", "Y980037", "U874287", "W787634", "Y980037", "H432243", "Y980037", "E432243", "W787634", "W787634"), 
    EndoRep = replicate(20, paste(sample(LETTERS, 10, replace = T), collapse = "")) 
) 

Path$date <- as.numeric(Path$PathDate) 
Endo$date <- as.numeric(Endo$EndoDate) 

Result <- 
    fuzzyjoin::difference_full_join(Endo, Path, by = 'date', max_dist = 2, distance_col = 'Days') %>% 
    filter(EndoHospNum == PathHospNum) %>% 
    select(HospNum = EndoHospNum, EndoDate, PathDate, Days, EndoRep, PathRep) 

Result 

    HospNum EndoDate PathDate Days EndoRep PathRep 
1 H432243 1993-12-22 1993-12-22 0 YBGDMGMZOJ HISSGSCRFR 
2 T662272 1994-05-14 1994-05-16 2 ANAPSCKUEB HIDIFHBDBL 
3 G424284 1992-07-19 1992-07-20 1 HKOCQZAXDU PLXGUPHQBM 
4 W787634 1996-06-01 1996-06-02 1 OTPLUZBLAF KGVILKEHLI 
5 H432243 1992-04-20 1992-04-20 0 GRWJUQPNET UGCKMNKDLW 
6 Y980037 1996-08-30 1996-08-30 0 ORUVMMGGAV EYOWEYAZFK 
7 H432243 1992-01-24 1992-01-26 2 JVSPGIVXEM LCNXQNVGGR 
8 W787634 1991-03-21 1991-03-23 2 WXZNHJIBZW OTXKNTYNKV 
9 Y980037 1995-12-28 1995-12-28 0 PQQQLKTYPG UAMMKJZRFG 
10 E432243 1995-07-15 1995-07-15 0 VYLDWUNAFP EXNTQSYVJM 
11 U874287 1993-04-02 1993-04-04 2 MTBBBVULOD CTKXUKEOQG 
12 Y980037 1994-01-10 1994-01-11 1 TRZWBYAUZR XIHXMOEFVP 
13 U874287 1999-08-21 1999-08-21 0 DYBUWJIAZB KFFGYNQUYM 
14 W787634 1993-11-10 1993-11-10 0 CBWRBZAPAF KYUOZSLIGF 
15 Y980037 1994-02-26 1994-02-26 0 GDUFEYZQFU BUSFQIJDHK 
16 H432243 1992-08-05 1992-08-06 1 JMGSCWTHOI ZRCJFDFNCX 
17 Y980037 1993-06-29 1993-06-29 0 HDTGHCMORL EQYWNJHOET 
18 E432243 1997-03-07 1997-03-08 1 WIMMVJHDSE LYLDELIBYK 
19 W787634 1998-03-03 1998-03-03 0 GRHBDHEWJF AWDYEQZZWY 
20 W787634 1998-04-17 1998-04-17 0 AOFIXWLZDT BBUEROUIWO 
+0

私はすでにポイントを獲得しましたが、私はあなたの答えがもっと好きです。確かに、ファジー論理パッケージをチェックアウトするつもりです。とても面白いですね。ありがとうございました –

関連する問題