• 给大厨写的R数据分析代码


    ### ---- Monthly new vs. returning customer counts ----
    # Input: one row per order (buyer nickname, order date).
    # For every calendar month present in the data this section counts
    #   new customers:       exactly one order in the month AND no order in
    #                        any earlier month;
    #   returning customers: two or more orders in the month, OR one order
    #                        in the month plus at least one earlier order.
    dachu <- read.csv("D:\\Dasktop\\bigdata_game\\天池\\大厨\\qijiandiankehu.csv", header = TRUE, encoding = "utf-8", colClasses = c("character", "Date"))
    str(dachu)
    head(dachu, 20)
    
    # Distribution of "orders per buyer", expressed as a share of all buyers.
    temp <- table(dachu$买家昵称)
    plot(table(sort(temp)) / length(temp))
    
    min(dachu$下单日期)
    max(dachu$下单日期)
    
    # "YYYY-MM" key of every order; `ss` is the sorted list of months that
    # actually occur in the data.
    dachu$ym <- substr(dachu$下单日期, 1, 7)
    head(dachu)
    ss <- sort(unique(dachu$ym))
    
    # Pre-allocated results (one slot per month) instead of growing with c().
    newcusts <- integer(length(ss))
    oldcusts <- integer(length(ss))
    # Every buyer seen in the months processed so far.
    uniq <- character(0)
    
    # Selecting by the month key is equivalent to the date-range comparison
    # (every order month is in `ss`) and also works when the data cover a
    # single month, where "start of the next month" does not exist.
    for (i in seq_along(ss)) {
      now <- dachu$买家昵称[which(dachu$ym == ss[i])]
      temp <- table(now)
      # Buyers with exactly one order this month.
      new_now <- names(temp)[temp == 1]
      # ... of which the ones never seen before are the new customers.
      n_new <- sum(!(new_now %in% uniq))
      newcusts[i] <- n_new
      # Every other buyer active this month is a returning customer.
      oldcusts[i] <- length(temp) - n_new
      uniq <- union(uniq, names(temp))
    }
    newcusts
    oldcusts
    (newcusts1 <- cbind(date = ss, newcusts))
    (oldcusts1 <- cbind(date = ss, oldcusts))
    write.csv(newcusts1, "C:\\Users\\hasee\\Desktop\\newcusts.csv", quote = FALSE)
    write.csv(oldcusts1, "C:\\Users\\hasee\\Desktop\\oldcusts.csv", quote = FALSE)
    
    # Trend chart: total, new and returning customers per month.
    win.graph()
    opar <- par(no.readonly = TRUE)
    par(lty = 1, pch = 1)
    # The x axis is derived from the months found in the data, so it stays
    # aligned even if the data do not start in 2014-03 or a month is missing.
    time <- as.Date(paste0(ss, "-01"))
    # type = "o" overplots points and lines (see ?plot.default for the other
    # type codes; ?par for pch, cex, lty and lwd).
    plot(time, newcusts + oldcusts, xlab = "月份", ylab = "客户数", main = "薏凡特月度新老客户购买数量变化趋势",
         type = "o", col = 1)
    lines(time, newcusts, type = "o", col = 2)
    lines(time, oldcusts, type = "o", col = 3)
    # Legend positions: "bottomright", "bottom", "bottomleft", "left",
    # "topleft", "top", "topright", "right", "center".
    legend("topright", c("总体客户", "新客户", "老客户"), col = 1:3, lty = 1, pch = 1)
    par(opar)
    
    
    ### ---- Same-month repurchase rate of one-time customers ----
    # For every data month from the second one on:
    #   counts     = buyers with exactly one order in the look-back window
    #                (up to the 12 preceding data months);
    #   repurchase = how many of those buyers ordered again during the month;
    #   rate       = repurchase / counts.
    # The first month is skipped because it has no history to look back on.
    # NOTE(review): this section reads the order-date column as 下单时间 while
    # the first section of the script reads the same file with 下单日期 —
    # confirm the real column header.
    dachu <- read.csv("D:\\Dasktop\\bigdata_game\\天池\\大厨\\qijiandiankehu.csv", header = TRUE, encoding = "utf-8", colClasses = c("character", "Date"))
    str(dachu)
    
    min(dachu$下单时间)
    max(dachu$下单时间)
    # "YYYY-MM" key per order and the sorted months that occur in the data.
    ym <- substr(dachu$下单时间, 1, 7)
    ss <- sort(unique(ym))
    
    # Months to evaluate; empty when there are fewer than two months.
    target <- seq_along(ss)[-1]
    counts <- integer(length(target))
    repurchase <- integer(length(target))
    for (j in seq_along(target)) {
      i <- target[j]
      # Orders placed in the (at most) 12 data months before month i.
      lookback <- ss[max(i - 12, 1):(i - 1)]
      temp <- table(dachu$买家昵称[ym %in% lookback])
      once <- names(temp)[temp == 1]
      # Buyers active during month i.
      in_month <- unique(dachu$买家昵称[ym %in% ss[i]])
      counts[j] <- length(once)
      repurchase[j] <- sum(once %in% in_month)
    }
    
    # Assemble the result in one step instead of rbind()-ing inside the loop.
    new_customer <- data.frame(date = ss[target], counts = counts,
                               repurchase = repurchase, stringsAsFactors = FALSE)
    # Months without any one-time customer get NA instead of 0/0 = NaN.
    new_customer$rate <- ifelse(new_customer$counts > 0,
                                new_customer$repurchase / new_customer$counts,
                                NA_real_)
    
    win.graph()
    opar <- par(mfrow = c(2, 2))
    # plot() cannot use the "YYYY-MM" strings as an x axis, so convert them
    # to the first day of each month.
    month_start <- as.Date(paste0(new_customer$date, "-01"))
    plot(month_start, new_customer$counts)
    plot(month_start, new_customer$repurchase)
    plot(month_start, new_customer$rate)
    par(opar)
    
    write.csv(new_customer, "C:\\Users\\hasee\\Desktop\\new_customer.csv")
    
    
    
    ### ---- Quarterly conversion of one-time customers ----
    # For every data month i from the second one up to the third-from-last:
    #   counts     = buyers with exactly one order in the whole history
    #                before month i;
    #   repurchase = how many of those buyers ordered again in the three
    #                data months i, i+1, i+2;
    #   rate       = repurchase / counts.
    dachu <- read.csv("C:\\Users\\hasee\\Desktop\\qijiandiankehu.csv", header = TRUE, encoding = "utf-8", colClasses = c("character", "Date"))
    str(dachu)
    
    min(dachu$下单时间)
    max(dachu$下单时间)
    # "YYYY-MM" key per order and the sorted months that occur in the data.
    ym <- substr(dachu$下单时间, 1, 7)
    ss <- sort(unique(ym))
    
    # Window start months 2 .. length(ss) - 2; empty (instead of counting
    # backwards) when the data cover fewer than four months.
    target <- seq_len(max(length(ss) - 2, 0))[-1]
    counts <- integer(length(target))
    repurchase <- integer(length(target))
    for (j in seq_along(target)) {
      i <- target[j]
      # All orders before the window starts.
      temp <- table(dachu$买家昵称[ym %in% ss[seq_len(i - 1)]])
      once <- names(temp)[temp == 1]
      # Buyers active inside the three-month window. For the last window this
      # is the same as "everything from month i on", so no special case.
      in_window <- unique(dachu$买家昵称[ym %in% ss[i:(i + 2)]])
      counts[j] <- length(once)
      repurchase[j] <- sum(once %in% in_window)
    }
    
    # Assemble the result in one step instead of rbind()-ing inside the loop.
    new_customer <- data.frame(date = ss[target], counts = counts,
                               repurchase = repurchase, stringsAsFactors = FALSE)
    # Windows without any one-time customer get NA instead of 0/0 = NaN.
    new_customer$rate <- ifelse(new_customer$counts > 0,
                                new_customer$repurchase / new_customer$counts,
                                NA_real_)
    
    win.graph()
    opar <- par(mfrow = c(2, 2))
    # plot() cannot use the "YYYY-MM" strings as an x axis, so convert them
    # to the first day of each month.
    month_start <- as.Date(paste0(new_customer$date, "-01"))
    plot(month_start, new_customer$counts)
    plot(month_start, new_customer$repurchase)
    plot(month_start, new_customer$rate)
    par(opar)
    
    # NOTE(review): this is the same path the monthly repurchase section
    # writes to, so running both sections overwrites the monthly result —
    # confirm whether a separate file name is wanted.
    write.csv(new_customer, "C:\\Users\\hasee\\Desktop\\new_customer.csv")
    
    
    
    
    ###************************************ Customer attachment rate (author's note: this section seems to have problems) ***********************************###
    # Only intended for customers with a single purchase.
    # Monthly attachment rate = customers with an attached purchase this month / all customers who bought this month
    # Product attachment rate = customers who bought the product with attachment / all customers who bought the product
    # All buying customers    = one-order-many-items customers + one-order-one-item customers
    # Load data
    library(readxl)
    # dachu <- read.csv("C:\\Users\\hasee\\Desktop\\liandailv.xlsx", header = T, encoding = "utf-8", colClasses = c("character", "Date", "character"))
    # read_excel(path, sheet = 1, col_names = TRUE, col_types = NULL, na = "", skip = 0)
    # All three columns are read as text.
    dachu <- read_excel("C:\\Users\\hasee\\Desktop\\liandailv.xlsx", sheet = 1, col_names = TRUE, col_types = c("text", "text", "text"), na = "", skip = 0)
    # NOTE(review): as.Date() on text needs date-like strings; confirm the sheet does not deliver Excel serial numbers here.
    dachu$下单日期 <- as.Date(dachu$下单日期)
    str(dachu)
    unique(dachu$商品ID)
    
    # Date range and the sorted "YYYY-MM" months present in the data
    min(dachu$下单日期)
    max(dachu$下单日期)
    month_set=sort(unique(substr(dachu$下单日期,1,7)))
    
    # Monthly attachment rate
    # For each month: count = number of distinct buyers with rows in the month,
    # count2 = buyers with more than one row in the month, rate = count2 / count.
    month_associate_rate = data.frame()
    # date1 is the lower bound of the current month window; it advances to date2 after each month.
    date1 = min(dachu$下单日期)
    for(i in seq(length(month_set))){
      if(i < length(month_set)){
        date2 = as.Date(paste(month_set[i+1], "-01", sep = ""))
        temp <- table(dachu$买家昵称[(dachu$下单日期 >= date1)&(dachu$下单日期 < date2)])
        date1 = date2
      }else{
        # Last month: everything from date1 on.
        temp = table(dachu$买家昵称[dachu$下单日期 >= date1])
      }
      month_associate_rate = rbind(month_associate_rate, data.frame(month=month_set[i], count = length(temp), count2= sum(temp>1), rate=(sum(temp>1)/length(temp))))
    }
    month_associate_rate
    
    # Product attachment rate
    # flag = 1 marks rows whose buyer has more than one row in the whole data set.
    dachu$flag <- 0
    head(dachu)
    temp = table(dachu$买家昵称)
    # library(dplyr)
    # temp2 = left_join(dachu, data.frame(x = names(temp)[temp>1], flag.y = 1), by= c("买家昵称" = "x"),suffix = c("", ".y"))
    # Left join: buyers without a match get flag.x = NA and therefore keep flag = 0.
    temp2 = merge(dachu, data.frame(x = names(temp)[temp>1], flag.x = 1), by.x = "买家昵称", by.y = "x", all.x = TRUE)
    temp2$flag[temp2$flag.x==1] = 1
    temp2$flag.x = NULL
    temp2
    
    # Containers for the product attachment rate
    prod_set=unique(dachu$商品ID)
    product_associate_rate = data.frame()
    
    # Product attachment rate: per product, count = rows of that product,
    # count2 = rows with flag == 1, rate = count2 / count.
    # NOTE(review): the loop variable `pi` masks the built-in constant pi for the rest of the session.
    for(pi in prod_set){
      temp <- temp2$flag[temp2$商品ID == pi]
      product_associate_rate = rbind(product_associate_rate, data.frame(product=pi, count = length(temp), count2= sum(temp==1), rate=(sum(temp==1)/length(temp))))
        
    }
      
    # Sort products by row count, largest first.
    product_associate_rate = product_associate_rate[order(product_associate_rate$count, decreasing = TRUE),]
    product_associate_rate$product = as.character(product_associate_rate$product)
    head(product_associate_rate)  
    
    # Sanity check: all rows of the buyers who bought one specific product
    dachu[dachu$买家昵称 %in% dachu[dachu$商品ID=="42303520877",]$买家昵称,]
    
    # Monthly trend of the product attachment rate for the five products with the most rows
    # temp2 is the flagged table computed in the product attachment step above
    prod_set = product_associate_rate$product[1:5]
    product_associate_rate_top5 = data.frame()
    date1 = min(temp2$下单日期)
    for(i in seq(length(month_set))){
      if(i < length(month_set)){
        date2 = as.Date(paste(month_set[i+1], "-01", sep = ""))
        temp <- temp2[(temp2$下单日期 >= date1)&(temp2$下单日期 < date2),]
        date1 = date2
      }else{
        temp = temp2[temp2$下单日期 >= date1,]
      }
      
      # One output row per month: month, then (count, count2, rate) for each of the five products.
      temp3 = data.frame(month=month_set[i])
      for(pi in prod_set){
        temp4 = temp$flag[temp$商品ID==pi]
        # rate is forced to 0 when the product has no rows in the month (avoids 0/0).
        temp3 = cbind(temp3, length(temp4), sum(temp4==1), ifelse(length(temp4)==0,0,sum(temp4==1)/length(temp4)))
      }
      
      product_associate_rate_top5 = rbind(product_associate_rate_top5, temp3)
    }
    # The appended columns get their final names here: top<k>count, top<k>count2, top<k>rate.
    colnames(product_associate_rate_top5)[-1] <- paste('top',rep(1:5,each=3),c('count','count2','rate'),sep = '')
    product_associate_rate_top5
    
    
    
    # Charts
    win.graph()
    opar<-par(mfrow=c(1,2))
    # NOTE(review): `month` holds the "YYYY-MM" labels (character or factor depending on the R version's data.frame defaults); confirm plot() accepts it as x on the R version in use.
    plot(month_associate_rate$month, month_associate_rate$rate, type="l", col = "blue", main = "月度连带率", xlab = "月份", ylab="连带率")
    plot(product_associate_rate$rate, main = "产品连带率", xlab = "产品", ylab="连带率")
    par(opar)
    
    write.csv(month_associate_rate,"C:\\Users\\hasee\\Desktop\\month_associate_rate.csv")
    write.csv(product_associate_rate,"C:\\Users\\hasee\\Desktop\\product_associate_rate.csv") #, quote = TRUE
    write.csv(product_associate_rate_top5,"C:\\Users\\hasee\\Desktop\\product_associate_rate_top5.csv") #, quote = TRUE
    
    
    
    # dplyr ships the relational join verbs: inner_join, left_join,
    # full_join, right_join, ...
    library(dplyr)
    library("nycflights13")
    # Keep only a few columns so the join result is easy to read.
    flights2 <- select(flights, year:day, tailnum, carrier)
    # Attach the airline name to every flight via its carrier code.
    left_join(flights2, airlines, by = "carrier")
    
    # Base-R equivalent of a left join:
    #merge(data.frame(x=1:3,y=0,z=2),data.frame(x=2:3,y=1:2),by=c("x"),all.x = T)
    
    
    
    ### ---- Repurchase rate vs. amount paid on the first order ----
    dachu <- read.csv("D:\\Dasktop\\bigdata_game\\天池\\大厨\\suoyoukehushuju.csv", header = TRUE, encoding = "utf-8", colClasses = c("character", "Date", "numeric"))
    str(dachu)
    head(dachu, 20)
    
    library(dplyr)
    # Preview: per buyer, most recent orders first.
    head(arrange(dachu, 买家昵称, desc(下单时间)), 100)
    
    # One row per buyer: the earliest order plus the buyer's total number of
    # orders (`count`). ungroup() so later steps do not inherit the per-buyer
    # grouping.
    temp <- dachu %>%
      arrange(买家昵称, 下单时间) %>%
      group_by(买家昵称) %>%
      mutate(count = n()) %>%
      slice(1) %>%
      ungroup()
    
    win.graph()
    opar <- par(mfrow = c(1, 2))
    # Amount paid on the first order vs. number of orders
    plot(temp$实付金额, temp$count)
    # Frequency of each first-order amount
    plot(table(temp$实付金额))
    par(opar)
    
    # Amount bands: below 1000 in steps of 100 (groups 0-9), 1000 up to 2000
    # is group 10, 2000 and above is group 11. Computed in one vectorised
    # expression, so a missing amount yields group NA instead of an error.
    # NOTE(review): the original comment said the bands below 1000 are 200
    # wide, but the code has always used %/% 100 — confirm the intended width.
    temp$group <- ifelse(temp$实付金额 < 1000, temp$实付金额 %/% 100,
                         ifelse(temp$实付金额 < 2000, 10, 11))
    head(temp, 20)
    # Per band: n1 = buyers with more than one order, n2 = all buyers.
    temp2 <- temp %>%
      group_by(group) %>%
      summarise(n1 = sum(count > 1), n2 = n(), rate = n1 / n2)
    
    win.graph()
    # Repurchase rate per amount band
    plot(temp2$group, temp2$rate)
    
    # i <- c("gamma","a")
    # switch(i,
    #        beta = "You typed beta",
    #        alpha = "You typed alpha",
    #        gamma = "You typed gamma",
    #        delta = "You typed delta" 
    # )
    
    
    
    
    ### ---- Effect of multi-item orders on repurchase ----
    t0 <- Sys.time()
    dachu <- read.csv("D:\\Dasktop\\bigdata_game\\天池\\大厨\\AnalysisOrderDownLoad-订单信息-子订单(全量)-10027396-8025-107.csv",
                      header = TRUE, encoding = "utf-8", colClasses = c(rep("character",4), rep("Date",3), rep("character",5), "integer","numeric","character",rep("numeric",2)))
    str(dachu)
    # Only columns 4 and 5 are used below (referenced as 买家昵称 and 下单时间).
    dachu <- dachu[, 4:5]
    head(dachu)
    # Drop the first two characters and the last character of every nickname
    # (presumably an export wrapper around the value — verify on the raw file).
    dachu$买家昵称 <- substr(dachu$买家昵称, 3, nchar(dachu$买家昵称) - 1)
    head(dachu, 20)
    
    library(dplyr)
    # One row per (buyer, order date); `count` = number of sub-order rows,
    # i.e. items bought together. Computed once and reused by all three
    # analyses below.
    orders <- dachu %>%
      group_by(买家昵称, 下单时间) %>%
      summarise(count = n()) %>%
      ungroup()
    
    # First order of every buyer; `count2` = number of orders the buyer has
    # in total. The sort by date is required for slice(1) to pick the first.
    first_orders <- orders %>%
      arrange(买家昵称, 下单时间) %>%
      group_by(买家昵称) %>%
      mutate(count2 = n()) %>%
      slice(1) %>%
      ungroup()
    
    # Repurchase rate by number of items in the first order:
    # n1 = buyers, n2 = buyers who ordered again, rate = n2 / n1.
    temp <- first_orders %>%
      group_by(count) %>%
      summarise(n1 = n(), n2 = sum(count2 > 1), rate = n2 / n1)
    
    temp
    
    win.graph()
    plot(temp$count, temp$rate, main="首单购买件数与回购率", xlab = "首单购买件数",
         ylab = "回购客户占比", col="red")
    
    # Per month of first purchase: share of new customers whose first order
    # contained more than one item (n1 = new customers, n2 = multi-item ones).
    temp <- first_orders %>%
      mutate(year = as.integer(substr(下单时间, 1, 4)),
             month = as.integer(substr(下单时间, 6, 7))) %>%
      group_by(year, month) %>%
      summarise(下单时间 = first(下单时间), n1 = n(), n2 = sum(count > 1), rate = n2 / n1) %>%
      ungroup() %>%
      select(下单时间, year, month, n1, n2, rate)
    
    temp
    win.graph()
    # Build the axis from each row's own year/month so points stay aligned
    # even when a month has no new customers.
    time <- as.Date(sprintf("%04d-%02d-01", temp$year, temp$month), format = "%Y-%m-%d")
    plot(time, temp$rate, main = "各月新客中连带客户占比", xlab = "月份",
         ylab = "首单购买多件客户占比", type = "l")
    
    
    # Order-level attachment rate: multi-item orders / all orders
    temp <- orders
    
    sum(temp$count > 1) / nrow(temp)
    
    Sys.time() - t0
    
    
    
    
    
    ###############################################################################################################
    ### ---- Number of purchases vs. probability of buying once more ----
    # dplyr must be attached before the first arrange() call so this section
    # also runs on its own.
    library(dplyr)
    #setwd("H:/数据分析/内部数据/薏凡特旗舰店数据/旗舰店客户数据分析/0803")
    setwd("D:\\Dasktop\\bigdata_game\\天池\\大厨")
    dat <- read.csv("kehushuju.csv", header = TRUE, encoding = "utf-8", colClasses = c("character", "Date", "integer", "numeric", "integer"), stringsAsFactors = FALSE)
    dat <- arrange(dat, 买家昵称, 下单日期)
    head(dat)
    # De-duplicating with unique(dat) is skipped on purpose: it is expensive
    # on large data and duplicates are not expected.
    
    # Orders per buyer.
    temp <- dat %>%
      group_by(买家昵称) %>%
      summarise(count = n())
    head(temp)
    
    # For i = 1 .. max_count - 1:
    #   rr1[i]  = buyers with exactly i + 1 orders,
    #   rr2[i]  = buyers with at least i orders,
    #   rate[i] = rr1[i] / rr2[i].
    # rr2 is never 0 inside this range because at least one buyer has
    # max_count orders; seq_len() keeps the range empty when nobody has
    # bought more than once.
    max_count <- if (nrow(temp) > 0) max(temp$count) else 0L
    steps <- seq_len(max(max_count - 1, 0))
    rr1 <- vapply(steps, function(i) sum(temp$count == i + 1), integer(1))
    rr2 <- vapply(steps, function(i) sum(temp$count >= i), integer(1))
    rate <- rr1 / rr2
    
    temp2 <- filter(temp, count >= 2)
    head(temp2)
    rrr <- cbind(rr1, rr2, rate)
    
    rrr
    # write.csv(rrr,"H:/数据分析/内部数据/薏凡特旗舰店数据/旗舰店客户数据分析/0803/rrr.csv")
    
    
    # ---- Repurchase interval: preparation ----
    
    # Attach each buyer's total number of orders to every order row.
    # Step 1: keep buyer, order date and 下单时点 (presumably the hour of the
    # order — verify against the data dictionary).
    new_dat2 <- dat %>%
      select(买家昵称, 下单日期, 下单时点)
    # Legacy notes kept for reference:
    # - the result is already a data frame; an explicit conversion would be
    #   new_dat2 <- as.data.frame(new_dat2)
    # - new_dat2 <- unique(new_dat2)
    # - head(new_dat2)
    #
    # Earlier loop-based way of attaching the count (replaced by the merge):
    # temp2 <- new_dat2 %>%
    #   group_by(买家昵称) %>%
    #   summarise(count=n())
    # head(temp2)
    # count2<-unique(temp2$count)
    #
    # new_dat2$counts=0
    # for(i in count2){
    #   rg<-temp[temp2$count==i,]$买家昵称;
    #   new_dat2[new_dat2$买家昵称 %in% rg,]$counts=i
    #
    # }
    
    # Step 2: join the per-buyer order count (`temp`) on the nickname.
    new_dat2 <- merge(x = new_dat2, y = temp, by = "买家昵称")
    
    
    
    head(new_dat2)
    # old_dat<-filter(new_dat2,counts>=2)
    # old_dat<-arrange(old_dat,下单日期)
    # old_dat <- new_dat2 %>%    ##此处太慢,后面给出改进方法
    #   filter(count>=2) %>%
    #   arrange(下单日期)
    # # old_dat<-unique(old_dat)
    # head(old_dat)
    # #max_count2<-max(old_dat$counts)
    # #num<-c(1:max_count2)
    # rebuy<-c()
    # redays<-c()
    # # t=1
    # for(i in unique(old_dat$买家昵称) ){
    #   rg<-filter(old_dat,old_dat$买家昵称==i)
    #   
    #   for(j in 1:(rg$count[1]-1))
    #   {
    #     #t_diff <- rg$下单日期[j+1] - rg$下单日期[j]
    #     t_diff <- as.integer(rg$下单日期[j+1] - rg$下单日期[j])
    #     # rebuy[t]=j+1
    #     # redays[t]=t_diff
    #     # t=t+1
    #     rebuy = c(rebuy,j+1)
    #     redays = c(redays,t_diff)
    #   }
    # }
    # 
    # head(rebuy)
    # head(redays)
    # mydata<-data.frame(rebuy,redays)
    # #write.csv(mydata,"H:/数据分析/内部数据/薏凡特旗舰店数据/旗舰店客户数据分析/0803/mydata.csv")
    # head(mydata)
    
    
    ### Share of orders placed by repeat buyers, per value of 下单时点
    # Order sequence is ignored here: an order counts as "repeat" when its
    # buyer has more than one order overall (count > 1).
    hours <- sort(unique(dat$下单时点))
    # One ratio per 下单时点 value, computed without growing a data frame
    # inside a loop.
    share <- vapply(hours, function(h) {
      cnt <- new_dat2$count[new_dat2$下单时点 == h]
      sum(cnt > 1) / length(cnt)
    }, numeric(1))
    rate <- data.frame(hours, share)
    colnames(rate) <- c("下单时点", "rate")
    rate
    
    # Taking order sequence into account
    ### The steps below are only needed for the sequence-aware analysis.
    # Sort by buyer and then by date; the later steps rely on this order.
    new_dat3 <- new_dat2 %>%
      arrange(买家昵称, 下单日期)
    head(new_dat3, n = 50)
    # for(i in temp$买家昵称){  #由于循环较大故运行时间较长
    #   new_dat3[new_dat3$买家昵称 == i,]$count <- 1:(temp[temp$买家昵称 == i,]$count)
    # }
    # head(new_dat3, 50)
    
    #改进后,此方法必须对数据先排序!!
    # t0 <- Sys.time()
    # i <- 1; nmax <- nrow(new_dat3)
    # repeat{
    #   #m = i
    #   n = new_dat3[i,4]
    #   #ss = new_dat3[i,1]
    #   # repeat{
    #   #   i <- i + 1
    #   #   if((new_dat3[i,1] != ss) | (i > nmax)){
    #   #     new_dat3[m:(i-1),4] <- 1:new_dat3[m,4]
    #   #     break
    #   #   }
    #   # }
    #   new_dat3[i:(i + n - 1),4] <- 1:n
    #   i = i+n
    #   if(i > nmax)  break
    # }
    # Sys.time()-t0
    # 
    # t0 <- Sys.time()
    # i <- 1; nmax <- nrow(new_dat3)
    # while(i <= nmax){
    #   #m = i
    #   n = new_dat3[i,4]
    #   #ss = new_dat3[i,1]
    #   # repeat{
    #   #   i <- i + 1
    #   #   if((new_dat3[i,1] != ss) | (i > nmax)){
    #   #     new_dat3[m:(i-1),4] <- 1:new_dat3[m,4]
    #   #     break
    #   #   }
    #   # }
    #   new_dat3[i:(i + n - 1),4] <- 1:n
    #   i = i+n
    # }
    # Sys.time()-t0
    
    
    # Replace `count` (total orders of the buyer) by the ordinal of the order
    # within the buyer: 1 for the first order, 2 for the second, ...
    # ave() numbers the rows of each buyer in their current order, so the
    # frame must already be sorted by buyer and date. This avoids the
    # hard-coded column position and the one-pass-per-distinct-count loop.
    t0 <- Sys.time()
    new_dat3$count <- ave(seq_len(nrow(new_dat3)), new_dat3$买家昵称, FUN = seq_along)
    Sys.time() - t0
    head(new_dat3, 50)
    tail(new_dat3, 50)
    
    # Share of orders that are a repurchase (ordinal > 1), per 下单时点 value.
    hours <- sort(unique(dat$下单时点))
    share <- vapply(hours, function(h) {
      ordinal <- new_dat3$count[new_dat3$下单时点 == h]
      sum(ordinal > 1) / length(ordinal)
    }, numeric(1))
    rate2 <- data.frame(hours, share)
    colnames(rate2) <- c("下单时点", "rate")
    rate2
    
    
    # Improved (vectorised) repurchase-interval calculation
    # Days between every row and the row above it; the first row gets 0.
    order_day <- as.numeric(new_dat3$下单日期)
    new_dat3$t_diff <- as.integer(c(0, diff(order_day)))
    head(new_dat3)
    # Rows where `count` is 1 get an interval of 0 instead of the distance to
    # the row above.
    new_dat3$t_diff[new_dat3$count == 1] <- 0
    # rebuy = value of `count` on the row, redays = days since the previous row.
    mydata <- new_dat3 %>%
      filter(count > 1) %>%
      transmute(rebuy = count, redays = t_diff)
    head(mydata)
    
    plot(mydata)
    
    
    # Per row: 1 when `count` is not 1 and the interval is below 5 days,
    # 0 otherwise.
    new_dat3$m5 <- as.numeric(new_dat3$count != 1 & new_dat3$t_diff < 5)
    
    
    
    # ---- Flag orders whose buyer has at least two orders by order date + m days ----
    setwd("H:/数据分析/内部数据/薏凡特旗舰店数据/旗舰店客户数据分析/0803/自我研究")
    dat <- read.csv("kehushuju.csv", header = TRUE, encoding = "utf-8", colClasses = c("character", "Date", "integer", "numeric", "integer"))
    head(dat)
    library(dplyr)
    dat1 <- arrange(dat, 下单日期)
    head(dat1)
    m <- 5  # repurchase window in days: m = 5 means "within 5 days"
    t0 <- Sys.time()
    # For every order, count the buyer's orders dated on or before
    # (order date + m); findInterval() on the buyer's sorted dates gives that
    # count directly, replacing the per-row filter over the whole table.
    # NOTE(review): as in the original loop, orders placed BEFORE the current
    # one are counted too, so any earlier purchase sets the flag — confirm
    # this matches the intended "repurchase within m days" definition.
    order_day <- as.numeric(dat1$下单日期)
    n_by_deadline <- ave(order_day, dat1$买家昵称,
                         FUN = function(d) findInterval(d + m, sort(d)))
    counts <- as.numeric(n_by_deadline >= 2)
    # Orders without a usable date are never flagged.
    counts[is.na(counts)] <- 0
    tt <- Sys.time() - t0
    head(counts)
    end_dat5 <- cbind(dat1, counts)
    write.csv(end_dat5, "H:/数据分析/内部数据/薏凡特旗舰店数据/旗舰店客户数据分析/0803/自我研究/end_dat5.csv")
    

      

  • 相关阅读:
    DataSet调用Dispose必须吗
    Python基础
    windows下pip升级到8.1.1 及安装Selenium
    python文件编码说明 coding
    Response.End() VS Context.ApplicationInstance.CompleteRequest()
    Python练习正则
    Python2.7爬虫练习爬百度百科python词条
    未知错误:1000正在终止线程
    debug pin用
    读取文件
  • 原文地址:https://www.cnblogs.com/iupoint/p/9769362.html
Copyright © 2020-2023  润新知