提问者:小点点

提取重新连接的时间


我有一个结构如下的表格,其中包含有关俱乐部会员的信息

id  start       end          cancelled
1   2020-01-01  2020-12-31   2021-01-10
1   2021-02-01  2022-01-31   NA
2   2020-01-01  2020-12-31   NA
3   2020-01-01  2020-06-30   2020-07-01
3   2020-07-10  2021-01-09   2021-01-31
3   2021-02-02  2021-08-01   NA

我对后来重新加入的成员感兴趣。对于上述数据,在整个期间,ID1这样做了一次,ID2没有,而ID3这样做了两次。

每次发生这种情况,从他们的会员资格结束到他们重新加入的时间是多少?对于上述数据,这将是:

id  rejoin_date time_to_rejoin
1   2021-01-31  22 days 
3   2020-07-10  9 days
3   2021-02-02  2 days
structure(list(id = c(1, 1, 2, 3, 3, 3), start = c("2020-01-01", 
"2021-02-01", "2020-01-01", "2020-01-01", "2020-07-10", "2021-02-02"
), end = c("2020-12-31", "2022-01-31", "2020-12-31", "2020-06-30", 
"2021-01-09", "2021-08-01"), cancelled = c("2021-01-10", NA, 
NA, "2020-07-01", "2021-01-31", NA)), class = "data.frame", row.names = c(NA, 
-6L))

我很乐意得到使用R(理想情况下使用dplyr)或SQL(MySQL)的答案


共2个答案

匿名用户

首先,更改为date类,任何人都不应该尝试计算character上的日期差异。

dat[,-1] <- lapply(dat[,-1], as.Date)
do.call(rbind, by(dat, dat$id, function(z) {
  if (nrow(z) == 1) return()
  data.frame(id = z$id[1], rejoin_date = z$start[-1],
             time_to_rejoin = as.numeric(z$start[-1] - z$cancelled[-nrow(z)], units="days"))
}))
#     id rejoin_date time_to_rejoin
# 1    1  2021-02-01             22
# 3.1  3  2020-07-10              9
# 3.2  3  2021-02-02              2
library(dplyr)
dat %>%
  group_by(id) %>%
  summarize(
    rejoin_date = start[-1],
    time_to_rejoin = as.numeric(start[-1] - cancelled[-n()], units="days")
  ) %>%
  ungroup()
# # A tibble: 3 x 3
#      id rejoin_date time_to_rejoin
#   <dbl> <date>               <dbl>
# 1     1 2021-02-01              22
# 2     3 2020-07-10               9
# 3     3 2021-02-02               2

dat %>%
  group_by(id) %>%
  summarize(
    rejoin_date = start[-1],
    time_to_rejoin = as.numeric(start - lag(cancelled), units="days")[-1]
  ) %>%
  ungroup()
library(data.table)
datDT <- as.data.table(dat)
datDT[, .(rejoin_date = start[-1],
          time_to_rejoin = as.numeric(start[-1] - cancelled[-.N], units="days")), by = .(id) ]
#       id rejoin_date time_to_rejoin
#    <num>      <Date>          <num>
# 1:     1  2021-02-01             22
# 2:     3  2020-07-10              9
# 3:     3  2021-02-02              2

匿名用户

您可以使用lag()然后筛选:

select t.*,
       datediff(cancelled, prev_cancelled) as num_days_since_cancel
from (select t.*,
             lag(cancelled) over (partition by id order by start) as prev_cancelled
      from t
     ) t
where prev_cancelled is not null;