我首先将城市名称转换为字符向量,因为(如果我理解正确的话)您想包括df2中包含的城市名称。
df1$city<-as.character(df1$city)
df2$city<-as.character(df2$city)
然后按国家合并它们:
df = merge(df1,df2,by = ("ctry"))
> df
ctry date.x city.x number col date.y city.y other_number other_col
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue
库stringr
将允许您在此处查看city.x是否在city.y内(请参见最后一列):
library(stringr)
df$city_keep<-str_detect(df$city.y,df$city.x) # this returns logical vector if city.x is contained in city.y (works one way)
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige TRUE
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red TRUE
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue TRUE
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red TRUE
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple FALSE
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black TRUE
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue TRUE
然后您可以获得日期之间的天数差异:
df$dayDiff<-abs(as.POSIXlt(df$date.x)$yday - as.POSIXlt(df$date.y)$yday)
和数字上的差异:
df$numDiff<-abs(df$number - df$other_number)
这是结果数据框的样子:
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep dayDiff numDiff
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige TRUE 29 1
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE 1 1
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE 0 3
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE 0 3
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red TRUE 0 5
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue TRUE 2 710
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red TRUE 1 5
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple FALSE 0 1
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black TRUE 212 0
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue TRUE 0 3070
但是我们要删除在city.y中找不到city.x的地方,其中日期差大于5或数字差大于3:
df<-df[df$dayDiff<=5 & df$numDiff<=3 & df$city_keep==TRUE,]
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep dayDiff numDiff
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE 1 1
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE 0 3
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE 0 3
剩下的是您上面的三行(第一列中包含点)。
现在,我们可以删除创建的三列以及df2中的日期和城市:
> df<-subset(df,select=-c(city.y,date.y,city_keep,dayDiff,numDiff))
> df
ctry date.x city.x number col other_number other_col
2 Denmark 1999-06-30 Copenhagen 60 cucumber 61 orange
3 France 1999-06-12 Paris 20 banana 17 green
4 Germany 2003-08-29 Berlin 10 apple 13 yellow
,
第1步:根据“城市”和“哭泣”合并数据:
df = merge(df1,by = c("city","ctry"))
第2步:如果日期条目之间的差异为> threshold.date(以天为单位),则删除行:
date_diff = abs(as.numeric(difftime(strptime(df$date.x,format = "%Y-%m-%d"),strptime(df$date.y,units="days")))
index_remove = date_diff > threshold.date
df = df[-index_remove,]
第3步:如果数字之间的差为threshhold.number,则删除行:
number_diff = abs(df$number - df$other_number)
index_remove = number_diff > threshold.numbers
df = df[-index_remove,]
在应用条件之前,应合并数据,以防行不匹配。
,
您可以使用city
测试grepl
的匹配情况,并使用ctry
来测试==
的匹配情况。对于直到此处匹配的用户,您可以通过使用date
转换为as.Date
并将其与difftime
进行比较来计算日期差。 number
的差异以相同的方式完成。
i1 <- seq_len(nrow(df1)) #Store all rows
i2 <- seq_len(nrow(df2))
res <- do.call(rbind,sapply(seq_len(nrow(df1)),function(i) { #Loop over all rows in df1
t1 <- which(df1$ctry[i] == df2$ctry) #Match ctry
t2 <- grepl(df1$city[i],df2$city[t1]) | sapply(df2$city[t1],grepl,df1$city[i]) #Match city
t1 <- t1[t2 & abs(as.Date(df1$date[i]) - as.Date(df2$date[t1[t2]])) <=
as.difftime(threshold.date,units = "days") & #Test for date difference
abs(df1$number[i] - df2$other_number[t1[t2]]) <= threshold.numbers] #Test for number difference
if(length(t1) > 0) { #Match found
i1 <<- i1[i1!=i] #Remove row as it was found
i2 <<- i2[i2!=t1]
cbind(df1[i,],df2[t1,c("other_number","other_col")],match=".")
}
}))
rbind(res,cbind(df1[i1,other_number=NA,other_col=NA,match="1"),cbind(df2[i2,1:3],number=NA,col=NA,other_number=df2[i2,4],other_col=df2[i2,5],match="2"))
# date city ctry number col other_number other_col match
#1 2003-08-29 Berlin Germany 10 apple 13 yellow .
#2 1999-06-12 Paris France 20 banana 17 green .
#6 1999-06-30 Copenhagen Denmark 60 cucumber 61 orange .
#3 2000-08-29 London UK 30 pear NA <NA> 1
#4 1999-02-24 Rome Italy 40 banana NA <NA> 1
#5 2001-04-17 Bern Switzerland 50 lemon NA <NA> 1
#7 1999-03-16 Warsaw Poland 70 apple NA <NA> 1
#8 1999-07-16 Moscow Russia 80 peach NA <NA> 1
#9 2001-08-29 Tunis Tunisia 90 cherry NA <NA> 1
#10 2002-07-30 Vienna Austria 100 cherry NA <NA> 1
#31 2000-08-29 near London UK NA <NA> 3100 blue 2
#41 1999-02-24 Rome Italy NA <NA> 45 red 2
#51 2001-04-17 Zurich Switzerland NA <NA> 51 purple 2
#71 1999-03-14 Warsaw Poland NA <NA> 780 blue 2
#81 1999-07-17 Moscow Russia NA <NA> 85 red 2
#91 2000-01-29 Tunis Tunisia NA <NA> 90 black 2
#101 2002-07-01 Vienna Austria NA <NA> 101 beige 2
,
这是使用我的软件包 safejoin 的解决方案,在这种情况下,包装为 fuzzyjoin 软件包。
我们可以使用by
参数来指定复杂的条件,使用函数X()
从df1
获取值,并使用Y()
从{ {1}}。
如果您的真实表很大,那么这可能会很慢,甚至不可能像笛卡尔积一样进行操作,但是在这里效果很好。
我们想要的是完全连接(保留所有行,并连接可以连接的内容),并且我们希望在连接时保留第一个值,并明智地使用下一个,这意味着我们要处理通过合并而具有相同名称的列的冲突,因此我们使用参数df2
conflict = dplyr::coalesce
输出:
# remotes::install_github("moodymudskipper/safejoin")
# with provides inputs date is a factor,this will cause issues,so we need to
# convert either to date or character,character will do for now.
df1$date <- as.character(df1$date)
df2$date <- as.character(df2$date)
# we want our joining columns named the same to make them conflicted and use our
# conflict agument on conflicted paires
names(df2)[1:4] <- names(df1)[1:4]
library(safejoin)
safe_full_join(
df1,by = ~ {
# must convert every type because fuzzy join uses a matrix so coerces all inputs to character
# see explanation at the bottom
city1 <- X("city")
city2 <- Y("city")
date1 <- as.Date(X("date"),origin = "1970-01-01")
date2 <- as.Date(Y("date"),origin = "1970-01-01")
number1 <- as.numeric(X("number"))
number2 <- as.numeric(Y("number"))
# join if one city name contains the other
(mapply(grepl,city1,city2) | mapply(grepl,city2,city1)) &
# and dates are close enough (need to work in seconds because difftime is dangerous)
abs(difftime(date1,date2,"sec")) <= threshold.date*3600*24 &
# and numbers are close enough
abs(number1 - number2) <= threshold.numbers
},conflict = dplyr::coalesce)
由reprex package(v0.3.0)于2019-11-13创建
不幸的是, fuzzyjoin 在进行多联接时会强制转换矩阵中的所有列,而 safejoin 包装了 fuzzyjoin ,因此我们必须将变量转换为by参数中的适当类型,这说明了#> date city ctry number col other_col
#> 1 2003-08-29 Berlin Germany 10 apple yellow
#> 2 1999-06-12 Paris France 20 banana green
#> 3 1999-06-30 Copenhagen Denmark 60 cucumber orange
#> 4 2000-08-29 London UK 30 pear <NA>
#> 5 1999-02-24 Rome Italy 40 banana <NA>
#> 6 2001-04-17 Bern Switzerland 50 lemon <NA>
#> 7 1999-03-16 Warsaw Poland 70 apple <NA>
#> 8 1999-07-16 Moscow Russia 80 peach <NA>
#> 9 2001-08-29 Tunis Tunisia 90 cherry <NA>
#> 10 2002-07-30 Vienna Austria 100 cherry <NA>
#> 11 2000-08-29 near London UK 3100 <NA> blue
#> 12 1999-02-24 Rome Italy 45 <NA> red
#> 13 2001-04-17 Zurich Switzerland 51 <NA> purple
#> 14 1999-03-14 Warsaw Poland 780 <NA> blue
#> 15 1999-07-17 Moscow Russia 85 <NA> red
#> 16 2000-01-29 Tunis Tunisia 90 <NA> black
#> 17 2002-07-01 Vienna Austria 101 <NA> beige
参数中的第一行。
有关 safejoin 的更多信息:https://github.com/moodymudskipper/safejoin
,
使用data.table
(内嵌解释)的选项:
library(data.table)
setDT(df1)
setDT(df2)
#dupe columns and create ranges for non-equi joins
df1[,c("n","ln","un","d","ld","ud") := .(
number,number - threshold.numbers,number + threshold.numbers,date,date - threshold.date,date + threshold.date)]
df2[,"ud") := .(
other_number,other_number - threshold.numbers,other_number + threshold.numbers,date + threshold.date)]
#perform non-equi join using ctry,num,dates in both ways
res <- rbindlist(list(
df1[df2,on=.(ctry,n>=ln,n<=un,d>=ld,d<=ud),.(date1=x.date,date2=i.date,city1=x.city,city2=i.city,ctry1=x.ctry,ctry2=i.ctry,number,col,other_number,other_col)],df2[df1,.(date1=i.date,date2=x.date,city1=i.city,city2=x.city,ctry1=i.ctry,ctry2=x.ctry,other_col)]),use.names=TRUE,fill=TRUE)
#determine if cities are substrings of one and another
res[,city_match := {
i <- mapply(grepl,city1)
replace(i,is.na(i),TRUE)
}]
#just like SQL coalesce (there is a version in dev in rdatatable github)
coalesce <- function(...) Reduce(function(x,y) fifelse(!is.na(y),y,x),list(...))
#for rows that are matching or no matches to be found
ans1 <- unique(res[(city_match),.(date=coalesce(date1,date2),city=coalesce(city1,city2),ctry=coalesce(ctry1,ctry2),other_col)])
#for rows that are close in terms of dates and numbers but are diff cities
ans2 <- res[(!city_match),.(date=c(.BY$date1,.BY$date2),city=c(.BY$city1,.BY$city2),ctry=c(.BY$ctry1,.BY$ctry2),number=c(.BY$number,NA),col=c(.BY$col,other_number=c(NA,.BY$other_number),other_col=c(NA,.BY$other_col)),names(res)][,seq_along(names(res)) := NULL]
#final desired output
setorder(rbindlist(list(ans1,ans2)),city,na.last=TRUE)[]
输出:
date city ctry number col other_number other_col
1: 1999-02-24 Rome Italy 40 banana NA <NA>
2: 1999-02-24 Rome Italy NA <NA> 45 red
3: 1999-03-14 Warsaw Poland NA <NA> 780 blue
4: 1999-03-16 Warsaw Poland 70 apple NA <NA>
5: 1999-06-12 East-Paris France 20 banana 17 green
6: 1999-06-29 Copenhagen Denmark 60 cucumber 61 orange
7: 1999-07-16 Moscow Russia 80 peach NA <NA>
8: 1999-07-17 Moscow Russia NA <NA> 85 red
9: 2000-01-29 Tunis Tunisia NA <NA> 90 black
10: 2000-08-29 London UK 30 pear NA <NA>
11: 2000-08-29 near London UK NA <NA> 3100 blue
12: 2001-04-17 Bern Switzerland 50 lemon NA <NA>
13: 2001-04-17 Zurich Switzerland NA <NA> 51 purple
14: 2001-08-29 Tunis Tunisia 90 cherry NA <NA>
15: 2002-07-01 Vienna Austria NA <NA> 101 beige
16: 2002-07-30 Vienna Austria 100 cherry NA <NA>
17: 2003-08-29 Berlin Germany 10 apple 13 yellow
,
这是一种灵活的方法,可让您指定选择的任何合并条件集合。
预备工作
我确保df1
和df2
中的所有字符串都是字符串,而不是因素(如其他几个答案所述)。我还把日期包装在as.Date
中,使它们成为真实日期。
指定合并条件
创建列表列表。主列表中的每个元素都是一个条件。条件的成员是
-
final.col.name
:最终表中我们想要的列的名称
-
col.name.1
:df1
中的列名
-
col.name.2
:df2
中的列名
-
exact
:布尔值;我们应该在此列上进行完全匹配吗?
-
threshold
:阈值(如果我们不进行精确匹配)
-
match.function
:一个返回行是否匹配的函数(对于特殊情况,例如使用grepl
进行字符串匹配;请注意,此函数必须被向量化)
merge.criteria = list(
list(final.col.name = "date",col.name.1 = "date",col.name.2 = "date",exact = F,threshold = 5),list(final.col.name = "city",col.name.1 = "city",col.name.2 = "city",match.function = function(x,y) {
return(mapply(grepl,x,y) |
mapply(grepl,x))
}),list(final.col.name = "ctry",col.name.1 = "ctry",col.name.2 = "ctry",exact = T),list(final.col.name = "number",col.name.1 = "number",col.name.2 = "other_number",threshold = 3)
)
合并功能
此函数采用三个参数:我们要合并的两个数据框,以及匹配条件列表。其过程如下:
- 遍历匹配条件并确定哪些行对符合或不符合所有条件。 (受@GKi的答案启发,它使用行索引而不是执行完整的外部联接,这对于大型数据集而言可能不那么占用大量内存。)
- 仅用所需的行创建骨架数据框(在匹配的情况下合并行,在不匹配的记录中合并行)。
- 遍历原始数据帧的列,并使用它们填充新数据帧中的所需列。 (首先对匹配条件中出现的列进行此操作,然后对剩下的任何其他列进行此操作。)
library(dplyr)
merge.data.frames = function(df1,merge.criteria) {
# Create a data frame with all possible pairs of rows from df1 and rows from
# df2.
row.decisions = expand.grid(df1.row = 1:nrow(df1),df2.row = 1:nrow(df2))
# Iterate over the criteria in merge.criteria. For each criterion,flag row
# pairs that don't meet the criterion.
row.decisions$merge = T
for(criterion in merge.criteria) {
# If we're looking for an exact match,test for equality.
if(criterion$exact) {
row.decisions$merge = row.decisions$merge &
df1[row.decisions$df1.row,criterion$col.name.1] == df2[row.decisions$df2.row,criterion$col.name.2]
}
# If we're doing a threshhold test,test for difference.
else if(!is.null(criterion$threshold)) {
row.decisions$merge = row.decisions$merge &
abs(df1[row.decisions$df1.row,criterion$col.name.1] - df2[row.decisions$df2.row,criterion$col.name.2]) <= criterion$threshold
}
# If the user provided a function,use that.
else if(!is.null(criterion$match.function)) {
row.decisions$merge = row.decisions$merge &
criterion$match.function(df1[row.decisions$df1.row,criterion$col.name.1],df2[row.decisions$df2.row,criterion$col.name.2])
}
}
# Create the new dataframe. Just row numbers of the source dfs to start.
new.df = bind_rows(
# Merged rows.
row.decisions %>% filter(merge) %>% select(-merge),# Rows from df1 only.
row.decisions %>% group_by(df1.row) %>% summarize(matches = sum(merge)) %>% filter(matches == 0) %>% select(df1.row),# Rows from df2 only.
row.decisions %>% group_by(df2.row) %>% summarize(matches = sum(merge)) %>% filter(matches == 0) %>% select(df2.row)
)
# Iterate over the merge criteria and add columns that were used for matching
# (from df1 if available; otherwise from df2).
for(criterion in merge.criteria) {
new.df[criterion$final.col.name] = coalesce(df1[new.df$df1.row,df2[new.df$df2.row,criterion$col.name.2])
}
# Now add all the columns from either data frame that weren't used for
# matching.
for(other.col in setdiff(colnames(df1),sapply(merge.criteria,function(x) x$col.name.1))) {
new.df[other.col] = df1[new.df$df1.row,other.col]
}
for(other.col in setdiff(colnames(df2),function(x) x$col.name.2))) {
new.df[other.col] = df2[new.df$df2.row,other.col]
}
# Return the result.
return(new.df)
}
应用功能,我们就完成了
df = merge.data.frames(df1,merge.criteria)