我有一个结构如下的表格,其中包含有关俱乐部会员的信息
id start end cancelled
1 2020-01-01 2020-12-31 2021-01-10
1 2021-02-01 2022-01-31 NA
2 2020-01-01 2020-12-31 NA
3 2020-01-01 2020-06-30 2020-07-01
3 2020-07-10 2021-01-09 2021-01-31
3 2021-02-02 2021-08-01 NA
我对后来重新加入的成员感兴趣。对于上述数据,在整个期间,ID1
这样做了一次,ID2
没有,而ID3
这样做了两次。
每次发生这种情况,从他们的会员资格结束到他们重新加入的时间是多少?对于上述数据,这将是:
id rejoin_date time_to_rejoin
1 2021-01-31 22 days
3 2020-07-10 9 days
3 2021-02-02 2 days
structure(list(id = c(1, 1, 2, 3, 3, 3), start = c("2020-01-01",
"2021-02-01", "2020-01-01", "2020-01-01", "2020-07-10", "2021-02-02"
), end = c("2020-12-31", "2022-01-31", "2020-12-31", "2020-06-30",
"2021-01-09", "2021-08-01"), cancelled = c("2021-01-10", NA,
NA, "2020-07-01", "2021-01-31", NA)), class = "data.frame", row.names = c(NA,
-6L))
我很乐意得到使用R(理想情况下使用dplyr)或SQL(MySQL)的答案
首先,更改为date
类,任何人都不应该尝试计算character
上的日期差异。
dat[,-1] <- lapply(dat[,-1], as.Date)
do.call(rbind, by(dat, dat$id, function(z) {
if (nrow(z) == 1) return()
data.frame(id = z$id[1], rejoin_date = z$start[-1],
time_to_rejoin = as.numeric(z$start[-1] - z$cancelled[-nrow(z)], units="days"))
}))
# id rejoin_date time_to_rejoin
# 1 1 2021-02-01 22
# 3.1 3 2020-07-10 9
# 3.2 3 2021-02-02 2
library(dplyr)
dat %>%
group_by(id) %>%
summarize(
rejoin_date = start[-1],
time_to_rejoin = as.numeric(start[-1] - cancelled[-n()], units="days")
) %>%
ungroup()
# # A tibble: 3 x 3
# id rejoin_date time_to_rejoin
# <dbl> <date> <dbl>
# 1 1 2021-02-01 22
# 2 3 2020-07-10 9
# 3 3 2021-02-02 2
或
dat %>%
group_by(id) %>%
summarize(
rejoin_date = start[-1],
time_to_rejoin = as.numeric(start - lag(cancelled), units="days")[-1]
) %>%
ungroup()
library(data.table)
datDT <- as.data.table(dat)
datDT[, .(rejoin_date = start[-1],
time_to_rejoin = as.numeric(start[-1] - cancelled[-.N], units="days")), by = .(id) ]
# id rejoin_date time_to_rejoin
# <num> <Date> <num>
# 1: 1 2021-02-01 22
# 2: 3 2020-07-10 9
# 3: 3 2021-02-02 2
您可以使用lag()
然后筛选:
select t.*,
datediff(cancelled, prev_cancelled) as num_days_since_cancel
from (select t.*,
lag(cancelled) over (partition by id order by start) as prev_cancelled
from t
) t
where prev_cancelled is not null;