Skip to content
Snippets Groups Projects
Select Git revision
  • bce83d69a1ffe6050e360ddcd05f506ad37445e8
  • master default protected
2 results

transform_data.r

Blame
  • Carl Corder's avatar
    Carl Corder authored
    bce83d69
    History
    transform_data.r 5.40 KiB
    library("readxl")
    library("writexl")
    library("dplyr")
    
    # Phase 2 Excel
    input <- "C:/Users/its-student/Desktop/Phase2In.xlsx"
    output <- "C:/Users/its-student/Desktop/Phase2Out.xlsx"
    
    # read in sheets
    data        <- read_excel(input, sheet = "Data")
    commission  <- read_excel(input, sheet = "Commission")  # join by group id & policy duration
    demographic <- read_excel(input, sheet = "Demographic") # join by group id
    expense     <- read_excel(input, sheet = "Expense")     # join by annualized net premium
    rtn         <- read_excel(input, sheet = "RTN")         # join by max lives, voluntary & policy duration
    sic         <- read_excel(input, sheet = "SIC")         # join by sic code
    tax         <- read_excel(input, sheet = "Tax")         # join by state
    
    # drop reserves not related to STD (IBNR)
    data <- data %>% select(-c(ICOS, WAIVER_IBNR, GAAP_RESV, WAIVER_RESERVE))
    
    # keep positive max lives, est premium, gross premium, paid claims and reserves
    data <- data %>% filter(MAX_LIVES > 0,
                            EST_ANNUALIZED_NET_PREM > 0,
                            PREM > 0,
                            PAID_CLAIMS > 0,
                            IBNR > 0)
    
    # left outer-join on industry code
    data <- merge(x = data, y = sic, by.x = "SIC", by.y = "SIC_CODE", all.x = TRUE)
    
    # remove rows where the sic has no industry (e.g. SIC = 1790)
    data <- data %>% filter(!is.na(INDUSTRY))
    
    # left outer-join on demographics (age, gender & salary)
    data <- merge(x = data, y = demographic, by.x = "GROUP_ID", by.y = "GROUP_ID", all.x = TRUE)
    
    # append percent commission
    data <- merge(x = data, 
                  y = commission[, c("GROUP_ID", "POLICY_DURATION", "PERCENT_COMMISSION")], 
                  by = c("GROUP_ID" = "GROUP_ID", "POLICY_DURATION" = "POLICY_DURATION"), 
                  all.x = TRUE)
    
    # remove rows where percent comission is NA or negative due to chargebacks
    data <- data %>% filter(0 <= PERCENT_COMMISSION & PERCENT_COMMISSION <= 1)
    
    # append state premium tax
    data <- merge(x = data, y = tax, by.x = "STATE", by.y = "STATE", all.x = TRUE)
    
    # remove rows with unmapped state tax (e.g. 91, FO)
    data <- data %>% filter(!is.na(PREMIUM_TAX))
    
    # get internal expense from annual net premium
    get_internal_expense <- function(premium) {
      sapply(premium, function(x) {
        n <- nrow(expense)
        if (x < expense$EST_ANN_NET_PREM_MIN[1]) {
          return(expense$INTERNAL_EXPENSE[1])
        }
        if (x > expense$EST_ANN_NET_PREM_MAX[n]) {
          return(expense$INTERNAL_EXPENSE[n])
        }
        for (i in 1:n) {
          if (x >= expense$EST_ANN_NET_PREM_MIN[i] &
              x < expense$EST_ANN_NET_PREM_MAX[i]) {
            return(expense$INTERNAL_EXPENSE[i])
          }
        }
      })
    }
    
    # create new internal expenses column
    data <- data %>% mutate(INTERNAL_EXPENSES = get_internal_expense(EST_ANNUALIZED_NET_PREM))
    
    # assume pepm rate = 0.5 for all est annualized premiums
    data <- data %>% mutate(PERCENT_PEPM = MAX_LIVES * 0.5 / PREM)
    
    # PEPM in [0,1]
    data <- data %>% filter(0 <= PERCENT_PEPM & PERCENT_PEPM <= 1)
    
    # create tolerable loss ratio
    data <- data %>% mutate(TLR = 1 - (PERCENT_COMMISSION + PREMIUM_TAX + PERCENT_PEPM + INTERNAL_EXPENSES))
    
    # TLR in [0,1]
    data <- data %>% filter(0 <= TLR & TLR <= 1)
    
    # rtn lookup from policy lives, duration & voluntary indicator
    data <- data %>% 
      mutate(RTN = case_when(MAX_LIVES  < 100  & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  2 ~ 0.8833,
                             MAX_LIVES  < 100  & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  4 ~ 0.9700,
                             MAX_LIVES  < 100  & TRUE_GROUP_VOL == "T" & POLICY_DURATION >= 4 ~ 1.0200,
                             MAX_LIVES  < 100  & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  2 ~ 0.9733,
                             MAX_LIVES  < 100  & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  4 ~ 1.0185,
                             MAX_LIVES  < 100  & TRUE_GROUP_VOL == "V" & POLICY_DURATION >= 4 ~ 1.0670,
                             MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  2 ~ 0.8741,
                             MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  4 ~ 0.9514,
                             MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION >= 4 ~ 1.0208,
                             MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  2 ~ 1.0104,
                             MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  4 ~ 1.0347,
                             MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION >= 4 ~ 1.0670,
                             MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  2 ~ 0.9800,
                             MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  4 ~ 1.0000,
                             MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION >= 4 ~ 1.0200,
                             MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  2 ~ 0.9800,
                             MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  4 ~ 1.0000,
                             MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION >= 4 ~ 1.0200))
    
    # calculate needed premium
    data <- data %>% mutate(NEEDED_PREMIUM = PREM / RTN)
    
    # calculate expected claims
    data <- data %>% mutate(EXPECTED_CLAIMS = NEEDED_PREMIUM * TLR)
    
    # calcualte actual claims
    data <- data %>% mutate(ACTUAL_CLAIMS = PAID_CLAIMS + IBNR)
    
    # calculate actual to expected ratio
    data <- data %>% mutate(ACTUAL_TO_EXPECTED = ACTUAL_CLAIMS / EXPECTED_CLAIMS)
    
    # write data to Excel
    write_xlsx(data, path = output, col_names = TRUE)