diff --git a/scripts/phase2/transform_data.r b/scripts/phase2/transform_data.r
index b3d9e72f07fb4f6efbfcd6acbcb4287a9f70478f..96fd367203d28c1db27e6b474d347109ba16ded3 100644
--- a/scripts/phase2/transform_data.r
+++ b/scripts/phase2/transform_data.r
@@ -8,12 +8,12 @@ output <- "C:/Users/its-student/Desktop/Phase2Out.xlsx"
 
 # read in sheets
 data <- read_excel(input, sheet = "Data")
-commission <- read_excel(input, sheet = "Commission") # by group and policy duration
-demographic <- read_excel(input, sheet = "Demographic")
-expense <- read_excel(input, sheet = "Expense") # bin by annualized net premium
-rtn <- read_excel(input, sheet = "RTN")
-sic <- read_excel(input, sheet = "SIC")
-tax <- read_excel(input, sheet = "Tax")
+commission <- read_excel(input, sheet = "Commission") # join by group id & policy duration
+demographic <- read_excel(input, sheet = "Demographic") # join by group id
+expense <- read_excel(input, sheet = "Expense") # join by annualized net premium
+rtn <- read_excel(input, sheet = "RTN") # join by max lives, voluntary & policy duration
+sic <- read_excel(input, sheet = "SIC") # join by sic code
+tax <- read_excel(input, sheet = "Tax") # join by state
 
 # drop reserves not related to STD (IBNR)
 data <- data %>% select(-c(ICOS, WAIVER_IBNR, GAAP_RESV, WAIVER_RESERVE))
@@ -31,7 +31,7 @@ data <- merge(x = data, y = sic, by.x = "SIC", by.y = "SIC_CODE", all.x = TRUE)
 # remove rows where the sic has no industry (e.g. SIC = 1790)
 data <- data %>% filter(!is.na(INDUSTRY))
 
-# left outer-join on demographics (age, gender, salary)
+# left outer-join on demographics (age, gender & salary)
 data <- merge(x = data, y = demographic, by.x = "GROUP_ID", by.y = "GROUP_ID", all.x = TRUE)
 
 # append percent commission