# install install.packages("readxl") install.packages("writexl") install.packages("dplyr") install.packages("ggplot2") # load packages library("readxl") library("writexl") library("dplyr") library("ggplot2") # ======================================== S C R U B B I N G ======================================== # set file names input <- "/home/carl/Desktop/RtnData.xlsx" output <- "/home/carl/Desktop/RtnDataClean.xlsx" # read in the data data <- read_excel(input, sheet = "Data") sic <- read_excel(input, sheet = "SIC") # left outer-join on industry code rtnData <- merge(x = data, y = sic, by.x = "INDUSTRY_CODE", by.y = "SIC_CODE", all.x = TRUE) # drop VAL_DATE column rtnData <- rtnData %>% select(-c("VAL_DATE")) # remove NBOC and Amendment rows rtnData <- rtnData %>% filter(EVENT_TYPE != "NBOC") rtnData <- rtnData %>% filter(EVENT_TYPE != "Amendment") # use only positive premiums rtnData <- rtnData %>% filter(NEEDED_PREMIUM > 0 & FINAL_QUOTED_PREMIUM > 0) # use plans with positive enrolled rtnData <- rtnData %>% filter(ENROLLED_LIVES > 0) # standardize renewals rtnData <- rtnData %>% mutate(RENEWAL_INSTANCE = case_when(EVENT_TYPE == "New Business" ~ "0", RENEWAL_INSTANCE == "1st Renewal" ~ "1", RENEWAL_INSTANCE == "2nd Renewal" ~ "2", RENEWAL_INSTANCE == "2nd+" ~ "3", RENEWAL_INSTANCE == "3rd Renewal +" ~ "3", TRUE ~ as.character(RENEWAL_INSTANCE))) rtnData <- rtnData %>% filter(!is.na(RENEWAL_INSTANCE)) # convert renewals to numeric rtnData$RENEWAL_INSTANCE <- as.numeric(rtnData$RENEWAL_INSTANCE) # format date timestamps rtnData$RAE_EFF_DATE <- as.Date(rtnData$RAE_EFF_DATE, "%y-%m-%d") rtnData$COV_EFF_DATE <- as.Date(rtnData$COV_EFF_DATE, "%y-%m-%d") # set date range >= 2017 rtnData <- rtnData %>% filter(RAE_EFF_DATE >= as.Date("2017-01-01")) rtnData <- rtnData %>% filter(COV_EFF_DATE >= as.Date("2017-01-01")) # clean up sales office names rtnData <- rtnData %>% mutate(SALES_OFFICE = tolower(SALES_OFFICE)) rtnData <- rtnData %>% mutate(SALES_OFFICE = case_when(SALES_OFFICE == "central philadelphia" ~ "philadelphia", SALES_OFFICE == "chicago/indy/milwaukee" ~ "chicago", SALES_OFFICE == "home" ~ "ft wayne", SALES_OFFICE == "miami/orlando/tampa" ~ "miami", SALES_OFFICE == "washington dc" ~ "washington d.c.", TRUE ~ as.character(SALES_OFFICE))) # map sales office to sales region rtnData <- rtnData %>% mutate(UW_REGION = tolower(UW_REGION)) regions <- list("home" = c("ft wayne"), "central" = c("chicago","cincinnati","cleveland","detroit","ft lauderdale","indianapolis","miami","milwaukee","minneapolis","omaha","orlando","pittsburgh","st. louis","tampa"), "east" = c("atlanta","boston","charlotte","long island","nashville","new jersey","new york","parsnippany","philadelphia","portland, me","rochester","washington d.c.","white plains"), "west" = c("dallas","denver","houston","kansas city","los angeles","orange county","phoenix","portland, or","sacramento","san diego","san francisco","seattle")) rtnData <- rtnData %>% mutate(UW_REGION = case_when(SALES_OFFICE %in% regions$home ~ "home", SALES_OFFICE %in% regions$central ~ "central", SALES_OFFICE %in% regions$east ~ "east", SALES_OFFICE %in% regions$west ~ "west", TRUE ~ as.character(UW_REGION))) # define coverage types coverageType <- list("Life" = c("Basic Life","Dependent Life","Life","Life EE Paid","Life ER Paid","Optional Child Life","Optional Life","Optional Spouse Life","Voluntary Child Life","Voluntary Life","Voluntary Life - Unismoke","Voluntary Life - Unismoke ALT Plan","Voluntary Spouse Life"), "STD" = c("NY DBL","State Disability - DBL","State Disability - TDB","STD","STD ASO","STD ATP","STD Core/Buy Up","STD EE Paid","STD ER Paid","STD Spec Worksite","STD True Zero","SW STD Alternate","Voluntary STD"), "LTD" = c("LTD","LTD Core/Buy Up","LTD EE Paid","LTD ER Paid","LTD Spec Worksite","LTD True Zero","Optional LTD","Voluntary LTD"), "AD&D" = c("AD&D","AD&D EE Paid","AD&D ER Paid","Basic AD&D","Optional AD&D","Voluntary AD&D","Voluntary Spouse AD&D"), "Dental" = c("Dental","Dental HMO","Self-Funded Dental","Voluntary Dental","Voluntary Dental HMO")) # aggregate coverage names into coverage types rtnData <- rtnData %>% mutate(COVERAGE_TYPE = case_when(COVERAGE_NAME %in% coverageType$Life ~ "Life", COVERAGE_NAME %in% coverageType$STD ~ "STD", COVERAGE_NAME %in% coverageType$LTD ~ "LTD", COVERAGE_NAME %in% coverageType$`AD&D` ~ "AD&D", COVERAGE_NAME %in% coverageType$Dental ~ "Dental", TRUE ~ "Misc")) # remove misc coverage types (i.e. Implementation Credit, Vision, Voluntary Vision) rtnData <- rtnData %>% filter(COVERAGE_TYPE != "Misc") # classify groups into size by eligible lives rtnData <- rtnData %>% mutate(GROUP_SIZE = case_when(ELIGIBLE_LIVES < 100 ~ "S", ELIGIBLE_LIVES >= 100 & ELIGIBLE_LIVES < 1000 ~ "M", ELIGIBLE_LIVES >= 1000 & ELIGIBLE_LIVES < 10000 ~ "L", ELIGIBLE_LIVES >= 10000 ~ "XL", TRUE ~ "NA")) # add rtn column rtnData <- rtnData %>% mutate(RTN = FINAL_QUOTED_PREMIUM / NEEDED_PREMIUM) # write rtn data to Excel #write_xlsx(rtnData, path = output, col_names = TRUE) # ======================================== M O D E L I N G ======================================== # shift renewal instance for logarithmic model because log(0) = -inf rtnData <- rtnData %>% mutate(RENEWAL_INSTANCE = RENEWAL_INSTANCE + 1.0) # create modeling factors based on coverage type, region & industry model.factors <- as.data.table(rtnData %>% group_by(COVERAGE_TYPE, UW_REGION, INDUSTRY) %>% summarise()) # initialize model list models <- list() # build out models for all factors (3 regions) x (5 coverage types) x (10 industries) = 150 combinations for (i in 1:nrow(model.factors)) { # pull in relevant data for each factor model.data <- rtnData %>% filter(COVERAGE_TYPE == model.factors[i, "COVERAGE_TYPE"][[1]], UW_REGION == model.factors[i, "UW_REGION"][[1]], INDUSTRY == model.factors[i, "INDUSTRY"][[1]]) %>% select(RENEWAL_INSTANCE, RTN, NEEDED_PREMIUM) # save regression to model list models[[i]] <- lm(RTN ~ log(RENEWAL_INSTANCE), data = model.data, weights = NEEDED_PREMIUM) } # example using short-term disability in the service industry from the east region std.east.service <- rtnData %>% filter(COVERAGE_TYPE == "STD", UW_REGION == "east", INDUSTRY == "Services") %>% select(RENEWAL_INSTANCE, RTN, NEEDED_PREMIUM) # create a logarithmic model weighted by needed premium std.east.service.fit <- lm(RTN ~ log(RENEWAL_INSTANCE), data = std.east.service, weights = NEEDED_PREMIUM) # generate sampling points for model x <- data.frame(RENEWAL_INSTANCE = seq(from = range(std.east.service$RENEWAL_INSTANCE)[1], to = range(std.east.service$RENEWAL_INSTANCE)[2], length.out = 100)) # get prediction errors errors <- predict(std.east.service.fit, newdata = x, se.fit = TRUE) # create alpha confidence interval alpha <- 0.99 x$lci <- errors$fit - qnorm(alpha) * errors$se.fit x$fit <- errors$fit x$uci <- errors$fit + qnorm(alpha) * errors$se.fit # plot points, model and confidence interval ggplot(x, aes(x = RENEWAL_INSTANCE, y = fit)) + xlab("Renewal") + ylab("RTN") + theme_bw() + geom_line() + geom_smooth(aes(ymin = lci, ymax = uci), stat = "identity") + geom_point(data = std.east.service, aes(x = RENEWAL_INSTANCE, y = RTN, size = NEEDED_PREMIUM))