From 3d601a85ae659763b06c29fab1e3600d04e00e9d Mon Sep 17 00:00:00 2001
From: Carl Corder <carl.corder@huskers.unl.edu>
Date: Thu, 10 Oct 2019 18:09:49 +0000
Subject: [PATCH] Update transform_data.r

---
 scripts/phase2/transform_data.r | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/phase2/transform_data.r b/scripts/phase2/transform_data.r
index 5f3da66..5c7e02c 100644
--- a/scripts/phase2/transform_data.r
+++ b/scripts/phase2/transform_data.r
@@ -3,12 +3,13 @@ library("writexl")
 library("dplyr")
 
 # Phase 2 Excel
-input <- "C:/Users/its-student/Desktop/Phase2In.xlsx" # https://unl.box.com/s/4dfo4iv2n8awiqt20himu24kyhtmgavk
-output <- "C:/Users/its-student/Desktop/Phase2Out.xlsx" # https://unl.box.com/s/vyfmeb62bc1umiuuuuj0tkvo2zvp28uq
+input <- "C:/Users/its-student/Desktop/Phase2In.xlsx"
+output <- "C:/Users/its-student/Desktop/Phase2Out.xlsx"
 
 # read in sheets
 data <- read_excel(input, sheet = "Data")
 commission <- read_excel(input, sheet = "Commission") # by group and policy duration
+demographic <- read_excel(input, sheet = "Demographic")
 expense <- read_excel(input, sheet = "Expense") # bin by annualized net premium
 rtn <- read_excel(input, sheet = "RTN")
 sic <- read_excel(input, sheet = "SIC")
@@ -23,6 +24,9 @@ data <- merge(x = data, y = sic, by.x = "SIC", by.y = "SIC_CODE", all.x = TRUE)
 # remove rows where the sic has no industry (e.g. SIC = 1790)
 data <- data %>% filter(!is.na(INDUSTRY))
 
+# left outer-join on demographics (age, gender, salary)
+data <- merge(x = data, y = demographic, by.x = "GROUP_ID", by.y = "GROUP_ID", all.x = TRUE)
+
 # append percent commission
 data <- merge(x = data,
               y = commission[, c("GROUP_ID", "POLICY_DURATION", "PERCENT_COMMISSION")],
-- 
GitLab