Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
acts-875
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Carl Corder
acts-875
Commits
948ad024
Commit
948ad024
authored
5 years ago
by
Carl Corder
Browse files
Options
Downloads
Patches
Plain Diff
Add new file
parent
7f7a8c9c
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
scripts/phase2/transform_data_pca.r
+177
-0
177 additions, 0 deletions
scripts/phase2/transform_data_pca.r
with
177 additions
and
0 deletions
scripts/phase2/transform_data_pca.r
0 → 100644
+
177
−
0
View file @
948ad024
library
(
"readxl"
)
library
(
"writexl"
)
library
(
"dplyr"
)
library
(
"FactoMineR"
)
# Phase 2 Excel
input
<-
"C:/Users/its-student/Desktop/Phase2In.xlsx"
output
<-
"C:/Users/its-student/Desktop/Phase2OutPCA.xlsx"
# read in sheets
data
<-
read_excel
(
input
,
sheet
=
"Data"
)
commission
<-
read_excel
(
input
,
sheet
=
"Commission"
)
# join by group id & policy duration
demographic
<-
read_excel
(
input
,
sheet
=
"Demographic"
)
# join by group id
expense
<-
read_excel
(
input
,
sheet
=
"Expense"
)
# join by annualized net premium
rtn
<-
read_excel
(
input
,
sheet
=
"RTN"
)
# join by max lives, voluntary & policy duration
sic
<-
read_excel
(
input
,
sheet
=
"SIC"
)
# join by sic code
tax
<-
read_excel
(
input
,
sheet
=
"Tax"
)
# join by state
# drop reserves not related to STD
data
<-
data
%>%
select
(
-
c
(
ICOS
,
WAIVER_IBNR
,
GAAP_RESV
,
WAIVER_RESERVE
))
# format incurred year as numeric
data
<-
data
%>%
mutate
(
INC_YEAR
=
as.numeric
(
INC_YEAR
))
# 2018 is the only full year of data
data
<-
data
%>%
filter
(
INC_YEAR
==
2018
)
# positive lives and policy duration
data
<-
data
%>%
filter
(
MAX_LIVES
>
0
,
POLICY_DURATION
>=
0
)
# group claims on annual basis
data
<-
data
%>%
group_by
(
GROUP_ID
,
DIST_ID
,
REP_ID
,
COVG_CODE
,
TRUE_GROUP_VOL
,
POLICY_EFFECTIVE_DATE
,
REG_OFFICE
,
SIC
,
STATE
,
MAX_LIVES
,
ACTIVE_TERMED
,
LTD_INDICATOR
,
INC_YEAR
,
POLICY_DURATION
)
%>%
summarise
(
PREM
=
sum
(
PREM
),
EST_ANNUALIZED_NET_PREM
=
mean
(
EST_ANNUALIZED_NET_PREM
),
PAID_COMMISSION
=
sum
(
PAID_COMMISSION
),
PAID_CLAIMS
=
sum
(
PAID_CLAIMS
),
IBNR
=
sum
(
IBNR
))
# positive premiums, paid claims and reserves
data
<-
data
%>%
filter
(
EST_ANNUALIZED_NET_PREM
>
0
,
PREM
>
0
,
PAID_CLAIMS
>=
0
,
IBNR
>
0
)
# left outer-join on industry code
data
<-
merge
(
x
=
data
,
y
=
sic
,
by.x
=
"SIC"
,
by.y
=
"SIC_CODE"
,
all.x
=
TRUE
)
# normalize industry title by removing spaces and commas
data
<-
data
%>%
mutate
(
INDUSTRY
=
stringr
::
str_replace_all
(
INDUSTRY
,
c
(
","
=
""
,
" "
=
""
)))
# remove rows where the sic has no industry (e.g. SIC = 1790)
data
<-
data
%>%
filter
(
!
is.na
(
INDUSTRY
))
# left outer-join on demographics (age, gender & salary)
data
<-
merge
(
x
=
data
,
y
=
demographic
,
by.x
=
"GROUP_ID"
,
by.y
=
"GROUP_ID"
,
all.x
=
TRUE
)
# remove groups with missing demographics
data
<-
data
%>%
filter
(
!
is.na
(
AVG_AGE
),
!
is.na
(
AVG_SALARY
),
!
is.na
(
PCT_FEMALE
))
# append percent commission
data
<-
merge
(
x
=
data
,
y
=
commission
[,
c
(
"GROUP_ID"
,
"POLICY_DURATION"
,
"PERCENT_COMMISSION"
)],
by
=
c
(
"GROUP_ID"
=
"GROUP_ID"
,
"POLICY_DURATION"
=
"POLICY_DURATION"
),
all.x
=
TRUE
)
# remove rows where percent comission is NA or negative due to chargebacks
data
<-
data
%>%
filter
(
0
<=
PERCENT_COMMISSION
&
PERCENT_COMMISSION
<=
1
)
# append state premium tax
data
<-
merge
(
x
=
data
,
y
=
tax
,
by.x
=
"STATE"
,
by.y
=
"STATE"
,
all.x
=
TRUE
)
# remove rows with unmapped state tax (e.g. 91, FO)
data
<-
data
%>%
filter
(
!
is.na
(
PREMIUM_TAX
))
# situs state to region map
regions
<-
list
(
"east"
=
c
(
"AL"
,
"CT"
,
"DC"
,
"DE"
,
"GA"
,
"MA"
,
"MD"
,
"ME"
,
"MS"
,
"NC"
,
"NH"
,
"NJ"
,
"NY"
,
"PA"
,
"RI"
,
"SC"
,
"TN"
,
"VA"
,
"VT"
),
"central"
=
c
(
"FL"
,
"IA"
,
"IL"
,
"IN"
,
"KY"
,
"MI"
,
"MN"
,
"MO"
,
"ND"
,
"NE"
,
"OH"
,
"SD"
,
"WI"
,
"WV"
),
"west"
=
c
(
"AK"
,
"AR"
,
"AZ"
,
"CA"
,
"CO"
,
"HI"
,
"ID"
,
"KS"
,
"LA"
,
"MT"
,
"NM"
,
"NV"
,
"OK"
,
"OR"
,
"TX"
,
"UT"
,
"WA"
,
"WY"
))
# create region column
data
<-
data
%>%
mutate
(
REGION
=
case_when
(
STATE
%in%
regions
$
east
~
"east"
,
STATE
%in%
regions
$
central
~
"central"
,
STATE
%in%
regions
$
west
~
"west"
))
# get internal expense from estimated annual net premium
get_internal_expense
<-
function
(
premium
)
{
sapply
(
premium
,
function
(
x
)
{
for
(
i
in
1
:
nrow
(
expense
))
{
if
(
expense
$
EST_ANN_NET_PREM_MIN
[
i
]
<=
x
&
x
<
expense
$
EST_ANN_NET_PREM_MAX
[
i
])
{
return
(
expense
$
INTERNAL_EXPENSE
[
i
])
}
}
})
}
# create new internal expenses column
data
<-
data
%>%
mutate
(
INTERNAL_EXPENSES
=
get_internal_expense
(
EST_ANNUALIZED_NET_PREM
))
# assume pepm rate = 0.5 for all est annualized premiums
data
<-
data
%>%
mutate
(
PERCENT_PEPM
=
MAX_LIVES
*
0.5
/
PREM
)
# PEPM in [0,1]
data
<-
data
%>%
filter
(
0
<=
PERCENT_PEPM
&
PERCENT_PEPM
<=
1
)
# create tolerable loss ratio
data
<-
data
%>%
mutate
(
TLR
=
1
-
(
PERCENT_COMMISSION
+
PREMIUM_TAX
+
PERCENT_PEPM
+
INTERNAL_EXPENSES
))
# TLR in [0,1]
data
<-
data
%>%
filter
(
0
<=
TLR
&
TLR
<=
1
)
# rtn lookup from policy lives, duration & voluntary indicator
data
<-
data
%>%
mutate
(
RTN
=
case_when
(
MAX_LIVES
<
100
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
<
2
~
0.8833
,
MAX_LIVES
<
100
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
<
4
~
0.9700
,
MAX_LIVES
<
100
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
>=
4
~
1.0200
,
MAX_LIVES
<
100
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
<
2
~
0.9733
,
MAX_LIVES
<
100
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
<
4
~
1.0185
,
MAX_LIVES
<
100
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
>=
4
~
1.0670
,
MAX_LIVES
<
1000
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
<
2
~
0.8741
,
MAX_LIVES
<
1000
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
<
4
~
0.9514
,
MAX_LIVES
<
1000
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
>=
4
~
1.0208
,
MAX_LIVES
<
1000
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
<
2
~
1.0104
,
MAX_LIVES
<
1000
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
<
4
~
1.0347
,
MAX_LIVES
<
1000
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
>=
4
~
1.0670
,
MAX_LIVES
>=
1000
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
<
2
~
0.9800
,
MAX_LIVES
>=
1000
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
<
4
~
1.0000
,
MAX_LIVES
>=
1000
&
TRUE_GROUP_VOL
==
"T"
&
POLICY_DURATION
>=
4
~
1.0200
,
MAX_LIVES
>=
1000
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
<
2
~
0.9800
,
MAX_LIVES
>=
1000
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
<
4
~
1.0000
,
MAX_LIVES
>=
1000
&
TRUE_GROUP_VOL
==
"V"
&
POLICY_DURATION
>=
4
~
1.0200
))
# calculate needed premium
data
<-
data
%>%
mutate
(
NEEDED_PREMIUM
=
PREM
/
RTN
)
# calculate expected claims
data
<-
data
%>%
mutate
(
EXPECTED_CLAIMS
=
NEEDED_PREMIUM
*
TLR
)
# calcualte actual claims
data
<-
data
%>%
mutate
(
ACTUAL_CLAIMS
=
PAID_CLAIMS
+
IBNR
)
# calculate actual to expected ratio
data
<-
data
%>%
mutate
(
ACTUAL_TO_EXPECTED
=
ACTUAL_CLAIMS
/
EXPECTED_CLAIMS
)
# one-hot binary indicators
data
<-
data
%>%
mutate
(
TRUE_GROUP_VOL
=
case_when
(
TRUE_GROUP_VOL
==
"T"
~
1
,
TRUE_GROUP_VOL
==
"V"
~
0
))
data
<-
data
%>%
mutate
(
LTD_INDICATOR
=
case_when
(
LTD_INDICATOR
==
"With LTD"
~
1
,
LTD_INDICATOR
==
"Without LTD"
~
0
))
data
<-
data
%>%
mutate
(
ACTIVE_TERMED
=
case_when
(
ACTIVE_TERMED
==
"Active"
~
1
,
ACTIVE_TERMED
==
"Terminated"
~
0
))
# keep numerics for PCA
data
<-
data
%>%
select
(
#"REGION", "INDUSTRY",
#"COVG_CODE",
"AVG_SALARY"
,
"AVG_AGE"
,
"PCT_FEMALE"
,
"TRUE_GROUP_VOL"
,
"LTD_INDICATOR"
,
"ACTIVE_TERMED"
,
"MAX_LIVES"
,
"POLICY_DURATION"
,
"PREM"
,
"EST_ANNUALIZED_NET_PREM"
,
"RTN"
,
"PAID_COMMISSION"
,
"PAID_CLAIMS"
,
"IBNR"
,
"PERCENT_COMMISSION"
,
"PREMIUM_TAX"
,
"INTERNAL_EXPENSES"
,
"PERCENT_PEPM"
)
# use FactoMineR for PCA
data.pca
<-
PCA
(
data
,
scale.unit
=
TRUE
,
ncp
=
5
,
graph
=
TRUE
)
# create scree plot
fviz_eig
(
data.pca
)
# variable plot colored by contributions to the PC
fviz_pca_var
(
data.pca
,
col.var
=
"contrib"
,
gradient.cols
=
c
(
"#00AFBB"
,
"#E7B800"
,
"#FC4E07"
),
repel
=
TRUE
)
# native PCA method
data_pca
<-
prcomp
(
data
,
scale
=
TRUE
)
# display three most significant PCs
data_pca
$
rotation
[,
1
:
3
]
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment