From 3d601a85ae659763b06c29fab1e3600d04e00e9d Mon Sep 17 00:00:00 2001
From: Carl Corder <carl.corder@huskers.unl.edu>
Date: Thu, 10 Oct 2019 18:09:49 +0000
Subject: [PATCH] Update transform_data.r

---
 scripts/phase2/transform_data.r | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/phase2/transform_data.r b/scripts/phase2/transform_data.r
index 5f3da66..5c7e02c 100644
--- a/scripts/phase2/transform_data.r
+++ b/scripts/phase2/transform_data.r
@@ -3,12 +3,13 @@ library("writexl")
 library("dplyr")
 
 # Phase 2 Excel
-input <- "C:/Users/its-student/Desktop/Phase2In.xlsx"   # https://unl.box.com/s/4dfo4iv2n8awiqt20himu24kyhtmgavk
-output <- "C:/Users/its-student/Desktop/Phase2Out.xlsx" # https://unl.box.com/s/vyfmeb62bc1umiuuuuj0tkvo2zvp28uq
+input <- "C:/Users/its-student/Desktop/Phase2In.xlsx"
+output <- "C:/Users/its-student/Desktop/Phase2Out.xlsx"
 
 # read in sheets
 data        <- read_excel(input, sheet = "Data")
 commission  <- read_excel(input, sheet = "Commission") # by group and policy duration
+demographic <- read_excel(input, sheet = "Demographic")
 expense     <- read_excel(input, sheet = "Expense") # bin by annualized net premium
 rtn         <- read_excel(input, sheet = "RTN")
 sic         <- read_excel(input, sheet = "SIC")
@@ -23,6 +24,9 @@ data <- merge(x = data, y = sic, by.x = "SIC", by.y = "SIC_CODE", all.x = TRUE)
 # remove rows where the sic has no industry (e.g. SIC = 1790)
 data <- data %>% filter(!is.na(INDUSTRY))
 
+# left outer join: attach demographics (age, gender, salary) by GROUP_ID, keeping every row of data
+data <- merge(x = data, y = demographic, by.x = "GROUP_ID", by.y = "GROUP_ID", all.x = TRUE)
+
 # append percent commission
 data <- merge(x = data, 
               y = commission[, c("GROUP_ID", "POLICY_DURATION", "PERCENT_COMMISSION")], 
-- 
GitLab