diff --git a/scripts/phase2/transform_data_pca.r b/scripts/phase2/transform_data_pca.r
new file mode 100644
index 0000000000000000000000000000000000000000..54cd5bb62979bf12c0a62b05460d203dc8301bdd
--- /dev/null
+++ b/scripts/phase2/transform_data_pca.r
@@ -0,0 +1,177 @@
+library("readxl")
+library("writexl")
+library("dplyr")
+library("FactoMineR")
+
+# Phase 2 Excel
+input <- "C:/Users/its-student/Desktop/Phase2In.xlsx"
+output <- "C:/Users/its-student/Desktop/Phase2OutPCA.xlsx"
+
+# read in sheets
+data        <- read_excel(input, sheet = "Data")
+commission  <- read_excel(input, sheet = "Commission")  # join by group id & policy duration
+demographic <- read_excel(input, sheet = "Demographic") # join by group id
+expense     <- read_excel(input, sheet = "Expense")     # join by annualized net premium
+rtn         <- read_excel(input, sheet = "RTN")         # join by max lives, voluntary & policy duration
+sic         <- read_excel(input, sheet = "SIC")         # join by sic code
+tax         <- read_excel(input, sheet = "Tax")         # join by state
+
+# drop reserves not related to STD
+data <- data %>% select(-c(ICOS, WAIVER_IBNR, GAAP_RESV, WAIVER_RESERVE))
+
+# format incurred year as numeric
+data <- data %>% mutate(INC_YEAR = as.numeric(INC_YEAR))
+
+# 2018 is the only full year of data
+data <- data %>% filter(INC_YEAR == 2018)
+
+# positive lives and policy duration
+data <- data %>% filter(MAX_LIVES > 0, POLICY_DURATION >= 0)
+
+# group claims on annual basis
+data <- data %>% group_by(GROUP_ID, DIST_ID, REP_ID,
+                          COVG_CODE, TRUE_GROUP_VOL,
+                          POLICY_EFFECTIVE_DATE, 
+                          REG_OFFICE, SIC, STATE, 
+                          MAX_LIVES, ACTIVE_TERMED, LTD_INDICATOR, INC_YEAR, POLICY_DURATION) %>% 
+  summarise(PREM = sum(PREM), EST_ANNUALIZED_NET_PREM = mean(EST_ANNUALIZED_NET_PREM),
+            PAID_COMMISSION = sum(PAID_COMMISSION), PAID_CLAIMS = sum(PAID_CLAIMS), IBNR = sum(IBNR))
+
+# positive premiums, paid claims and reserves
+data <- data %>% filter(EST_ANNUALIZED_NET_PREM > 0, PREM > 0, PAID_CLAIMS >= 0, IBNR > 0)
+
+# left outer-join on industry code
+data <- merge(x = data, y = sic, by.x = "SIC", by.y = "SIC_CODE", all.x = TRUE)
+
+# normalize industry title by removing spaces and commas
+data <- data %>% mutate(INDUSTRY = stringr::str_replace_all(INDUSTRY, c("," = "", " " = "")))
+
+# remove rows where the sic has no industry (e.g. SIC = 1790)
+data <- data %>% filter(!is.na(INDUSTRY))
+
+# left outer-join on demographics (age, gender & salary)
+data <- merge(x = data, y = demographic, by.x = "GROUP_ID", by.y = "GROUP_ID", all.x = TRUE)
+
+# remove groups with missing demographics
+data <- data %>% filter(!is.na(AVG_AGE), !is.na(AVG_SALARY), !is.na(PCT_FEMALE))
+
+# append percent commission
+data <- merge(x = data, 
+              y = commission[, c("GROUP_ID", "POLICY_DURATION", "PERCENT_COMMISSION")], 
+              by = c("GROUP_ID" = "GROUP_ID", "POLICY_DURATION" = "POLICY_DURATION"), 
+              all.x = TRUE)
+
+# remove rows where percent comission is NA or negative due to chargebacks
+data <- data %>% filter(0 <= PERCENT_COMMISSION & PERCENT_COMMISSION <= 1)
+
+# append state premium tax
+data <- merge(x = data, y = tax, by.x = "STATE", by.y = "STATE", all.x = TRUE)
+
+# remove rows with unmapped state tax (e.g. 91, FO)
+data <- data %>% filter(!is.na(PREMIUM_TAX))
+
+# situs state to region map
+regions <- list("east"     = c("AL","CT","DC","DE","GA","MA","MD","ME","MS","NC","NH","NJ","NY","PA","RI","SC","TN","VA","VT"),
+                "central"  = c("FL","IA","IL","IN","KY","MI","MN","MO","ND","NE","OH","SD","WI","WV"),
+                "west"     = c("AK","AR","AZ","CA","CO","HI","ID","KS","LA","MT","NM","NV","OK","OR","TX","UT","WA","WY"))
+
+# create region column
+data <- data %>% mutate(REGION = case_when(STATE %in% regions$east ~ "east",
+                                           STATE %in% regions$central ~ "central",
+                                           STATE %in% regions$west ~ "west"))
+
+# get internal expense from estimated annual net premium
+get_internal_expense <- function(premium) {
+  sapply(premium, function(x) {
+    for (i in 1:nrow(expense)) {
+      if (expense$EST_ANN_NET_PREM_MIN[i] <= x & x < expense$EST_ANN_NET_PREM_MAX[i]) {
+        return(expense$INTERNAL_EXPENSE[i])
+      }
+    }
+  })
+}
+
+# create new internal expenses column
+data <- data %>% mutate(INTERNAL_EXPENSES = get_internal_expense(EST_ANNUALIZED_NET_PREM))
+
+# assume pepm rate = 0.5 for all est annualized premiums
+data <- data %>% mutate(PERCENT_PEPM = MAX_LIVES * 0.5 / PREM)
+
+# PEPM in [0,1]
+data <- data %>% filter(0 <= PERCENT_PEPM & PERCENT_PEPM <= 1)
+
+# create tolerable loss ratio
+data <- data %>% mutate(TLR = 1 - (PERCENT_COMMISSION + PREMIUM_TAX + PERCENT_PEPM + INTERNAL_EXPENSES))
+
+# TLR in [0,1]
+data <- data %>% filter(0 <= TLR & TLR <= 1)
+
+# rtn lookup from policy lives, duration & voluntary indicator
+data <- data %>% 
+  mutate(RTN = case_when(MAX_LIVES  < 100  & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  2 ~ 0.8833,
+                         MAX_LIVES  < 100  & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  4 ~ 0.9700,
+                         MAX_LIVES  < 100  & TRUE_GROUP_VOL == "T" & POLICY_DURATION >= 4 ~ 1.0200,
+                         MAX_LIVES  < 100  & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  2 ~ 0.9733,
+                         MAX_LIVES  < 100  & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  4 ~ 1.0185,
+                         MAX_LIVES  < 100  & TRUE_GROUP_VOL == "V" & POLICY_DURATION >= 4 ~ 1.0670,
+                         MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  2 ~ 0.8741,
+                         MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  4 ~ 0.9514,
+                         MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION >= 4 ~ 1.0208,
+                         MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  2 ~ 1.0104,
+                         MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  4 ~ 1.0347,
+                         MAX_LIVES  < 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION >= 4 ~ 1.0670,
+                         MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  2 ~ 0.9800,
+                         MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION <  4 ~ 1.0000,
+                         MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "T" & POLICY_DURATION >= 4 ~ 1.0200,
+                         MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  2 ~ 0.9800,
+                         MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION <  4 ~ 1.0000,
+                         MAX_LIVES >= 1000 & TRUE_GROUP_VOL == "V" & POLICY_DURATION >= 4 ~ 1.0200))
+
+# calculate needed premium
+data <- data %>% mutate(NEEDED_PREMIUM = PREM / RTN)
+
+# calculate expected claims
+data <- data %>% mutate(EXPECTED_CLAIMS = NEEDED_PREMIUM * TLR)
+
+# calcualte actual claims
+data <- data %>% mutate(ACTUAL_CLAIMS = PAID_CLAIMS + IBNR)
+
+# calculate actual to expected ratio
+data <- data %>% mutate(ACTUAL_TO_EXPECTED = ACTUAL_CLAIMS / EXPECTED_CLAIMS)
+
+# one-hot binary indicators
+data <- data %>% mutate(TRUE_GROUP_VOL = case_when(TRUE_GROUP_VOL == "T" ~ 1,
+                                                   TRUE_GROUP_VOL == "V" ~ 0))
+
+data <- data %>% mutate(LTD_INDICATOR = case_when(LTD_INDICATOR == "With LTD"    ~ 1,
+                                                  LTD_INDICATOR == "Without LTD" ~ 0))
+
+data <- data %>% mutate(ACTIVE_TERMED = case_when(ACTIVE_TERMED == "Active"     ~ 1,
+                                                  ACTIVE_TERMED == "Terminated" ~ 0))
+
+# keep numerics for PCA
+data <- data %>% select(#"REGION", "INDUSTRY",
+                        #"COVG_CODE",
+                        "AVG_SALARY", "AVG_AGE", "PCT_FEMALE",
+                        "TRUE_GROUP_VOL", "LTD_INDICATOR", "ACTIVE_TERMED",
+                        "MAX_LIVES", "POLICY_DURATION", "PREM", "EST_ANNUALIZED_NET_PREM",
+                        "RTN",
+                        "PAID_COMMISSION", "PAID_CLAIMS", "IBNR",
+                        "PERCENT_COMMISSION", "PREMIUM_TAX", "INTERNAL_EXPENSES", "PERCENT_PEPM")
+
+# use FactoMineR for PCA
+data.pca <- PCA(data, scale.unit = TRUE, ncp = 5, graph = TRUE)
+
+# create scree plot
+fviz_eig(data.pca)
+
+# variable plot colored by contributions to the PC
+fviz_pca_var(data.pca, col.var = "contrib",
+             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
+             repel = TRUE)
+
+# native PCA method
+data_pca <- prcomp(data, scale = TRUE)
+
+# display three most significant PCs
+data_pca$rotation[,1:3]