In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf
In [2]:
# Pandas settings
pd.options.display.max_columns = None

# Seaborn styles
sns.set(
    style='white',
    font_scale=1.5,
    rc={
        'figure.figsize': (14, 8)
    }
)

Pre-define cell styles

In [3]:
new_column_style = lambda _: 'background-color: greenyellow'
highlight_style = lambda _: 'background-color: gold'
loss_ratio_style = lambda _: '{ color: indianred }'

Notes on this Jupyter notebook

  • Newly merged columns are marked with light green backgrounds.
  • Highlighted cells are marked with gold backgrounds.
  • Net Premium refers to Total Premium - Refunded Premium.
    For example, if a driver has paid \$1,000 to purchase a policy and was refunded \\$400 after cancelling the policy, the net premium is \$600.
  • seaborn and plotly packages are used for visualizations. To run the notebook, these packages should be installed.
    Installing seaborn should be straightforward (pip install seaborn or conda install seaborn).
    For plotly, please refer to https://plotly.com/python/getting-started/

Import/clean data

Read CSV files

Each sheet from the Excel file has been separated into separate .csv files. While reading csv files, I'll parse datetime type columns when reading csv.

In [4]:
df_agency_dimension = pd.read_csv('data/agency_dimension.csv')
df_claims_fact = pd.read_csv('data/claims_fact.csv')
df_claim_branches = pd.read_csv('data/claim_branches.csv')
df_claim_expense_dimension = pd.read_csv('data/claim_expense_dimension.csv')
df_communication_data = pd.read_csv('data/communication_data.csv')
df_company_dimension = pd.read_csv('data/company_dimension.csv')
df_driver_dimension = pd.read_csv('data/driver_dimension.csv')
df_location_dimension = pd.read_csv('data/location_dimension.csv')
df_policy_dimension = pd.read_csv('data/policy_dimension.csv', 
                                  parse_dates=['Policy Effective Date', 'Policy Expiration Date', 'Cancel Date'])
df_population_distribution = pd.read_csv('data/population_distribution.csv')
df_risk_fact = pd.read_csv('data/risk_fact.csv')
df_vehicles_dimension = pd.read_csv('data/vehicles_dimension.csv')

Exploring data

Is there anyone who purchased multiple policies?

I check whether a driver has purchased more than one policy.

In [5]:
# Number of unique drivers in risk_fact
df_risk_fact['Driver ID'].nunique()
Out[5]:
35712
In [6]:
df_risk_fact[df_risk_fact['Total Premium'] > 0].shape[0]
Out[6]:
35712
Conclusion No, each driver has purchased only 1 policy.

Pre-processing

Risk Fact

Reconciling inconsistent Vehicle IDs (for same drivers) between Risk Fact and Claims Fact dataframes

Case assumption #9 mentions that some drivers have different Vehicle IDs in risk fact and claim fact tables. Use claims fact table as the source of truth.

In [7]:
df_vehicle_by_driver_risk_fact = df_risk_fact[['Driver ID', 'Vehicle ID']].drop_duplicates()
df_vehicle_by_driver_risk_fact.head(3)
Out[7]:
Driver ID Vehicle ID
0 4758 50
1 5901 68
2 12972 43
In [8]:
df_vehicle_by_driver_claims_fact = df_claims_fact[['Claimant Id', 'Vehicle ID']].copy() \
    .drop_duplicates() \
    .rename(columns={
        'Claimant Id': 'Driver ID'
    })

df_vehicle_by_driver_claims_fact.head(3)
Out[8]:
Driver ID Vehicle ID
0 28498 78
1 5163 27
2 12472 17
In [9]:
df_vehicle_comparison = df_vehicle_by_driver_risk_fact.merge(right=df_vehicle_by_driver_claims_fact, on='Driver ID', how='left') \
    .rename(columns={
        'Vehicle ID_x': 'Vehicle ID Risk Fact', 
        'Vehicle ID_y': 'Vehicle ID Claims Fact'
    })

df_vehicle_comparison['Vehicle ID Claims Fact'] = df_vehicle_comparison['Vehicle ID Claims Fact'].astype('Int64')
df_vehicle_comparison = df_vehicle_comparison[df_vehicle_comparison['Vehicle ID Claims Fact'].notna()]

df_vehicle_comparison.head(3)
Out[9]:
Driver ID Vehicle ID Risk Fact Vehicle ID Claims Fact
7 25890 10 10
36 10613 42 31
131 6047 81 33
In [10]:
df_different_vehicles = df_vehicle_comparison[df_vehicle_comparison['Vehicle ID Risk Fact'] != df_vehicle_comparison['Vehicle ID Claims Fact']]
display(
    df_different_vehicles.head() \
        .style \
        .applymap(highlight_style, subset=['Vehicle ID Risk Fact', 'Vehicle ID Claims Fact'])
)

print(f'{df_different_vehicles.shape[0]} drivers have different vehicle IDs in risk fact and claims fact tables.')
Driver ID Vehicle ID Risk Fact Vehicle ID Claims Fact
36 10613 42 31
131 6047 81 33
166 9516 75 60
191 11894 49 43
245 17467 72 78
568 drivers have different vehicle IDs in risk fact and claims fact tables.

Create a new risk fact dataframe with updated vehicle IDs

In [11]:
df_risk_fact_vehicle_id_fixed = df_risk_fact.merge(right=df_different_vehicles[['Driver ID', 'Vehicle ID Claims Fact']], on='Driver ID', how='left')
df_risk_fact_vehicle_id_fixed['Vehicle ID'] = np.where(df_risk_fact_vehicle_id_fixed['Vehicle ID Claims Fact'].isna(), df_risk_fact_vehicle_id_fixed['Vehicle ID'], df_risk_fact_vehicle_id_fixed['Vehicle ID Claims Fact'])

display(
    df_risk_fact_vehicle_id_fixed[df_risk_fact_vehicle_id_fixed['Vehicle ID Claims Fact'].notna()] \
        .head() \
        .style \
        .applymap(highlight_style, subset=['Vehicle ID', 'Vehicle ID Claims Fact'])
)

df_risk_fact_vehicle_id_fixed.drop(columns=['Vehicle ID Claims Fact'], inplace=True)
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Vehicle ID Claims Fact
36 37 1007613 10613 31 2006 335 3090.57 31
131 132 1003047 6047 33 2007 162 2970.57 33
166 167 1006516 9516 60 2007 590 2934.57 60
191 192 1008894 11894 43 2006 529 3066.57 43
245 246 1014467 17467 78 2005 485 4566.57 78

Driver Dimension

Parse `Date of Birth" as a datetime format

Most date of birth values are in YYYY-MM-DD hh:mm:ss format. There are 2162 rows that have an invalid format (whitespace strings).

In [12]:
# Using errors='coerce' will force invalid formats into NaT
df_driver_dimension['Date of Birth'] = pd.to_datetime(df_driver_dimension['Date of Birth'], errors='coerce')
df_driver_dimension.head(3).style.applymap(new_column_style, subset='Date of Birth')
Out[12]:
Driver ID First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth
0 1 Duane Crance 3 0 M M 9 Work 1952-04-16 00:00:00
1 2 Sheldon Sherbert 1 0 S M 8 Leisure 1971-11-15 00:00:00
2 3 Kristian Brix 0 0 M M 40 Work 1989-01-13 00:00:00
In [13]:
print(f'Column data type of "Date of Birth": {df_driver_dimension["Date of Birth"].dtype}')
print(f'Number of missing (NaT) "Date of Birth": {df_driver_dimension["Date of Birth"].isna().sum()}')
Column data type of "Date of Birth": datetime64[ns]
Number of missing (NaT) "Date of Birth": 2162
In [14]:
df_driver_dimension[df_driver_dimension['Date of Birth'].isna()] \
    .head(3) \
    .style \
    .applymap(highlight_style, subset='Date of Birth')
Out[14]:
Driver ID First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth
13 14 Tammara Swimm 1 1 M M nan Leisure NaT
14 15 Gina Creek 6 3 S F nan Work NaT
135 136 Noel Vanderwood 3 1 S M nan Leisure NaT

We will have to take care of the missing birthdays later.

Policy Dimension

Add annual/semiannual classification to the policy dimension

What is the duration of policies? Get a count of unique values.

In [15]:
df_policy_duration = df_policy_dimension['Policy Expiration Date'] - df_policy_dimension['Policy Effective Date']
df_policy_duration.value_counts()
Out[15]:
365 days    9999
184 days    5033
366 days    4967
182 days    3365
183 days    3360
181 days    3276
dtype: int64

All policies are either semiannual or annual. Extract this information to a new column.

In [16]:
df_policy_dimension['Duration'] = np.where(df_policy_duration > timedelta(days=200), 'Annual', 'Semiannual')

df_policy_dimension.head(3).style.applymap(new_column_style, subset=['Duration'])
Out[16]:
Policy Number Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration
0 1000001 2014-03-09 00:00:00 2015-03-09 00:00:00 11 4 NaT Annual
1 1000002 2011-04-02 00:00:00 2012-04-02 00:00:00 15 2 NaT Annual
2 1000003 2012-01-29 00:00:00 2012-07-31 00:00:00 3 7 NaT Semiannual
In [17]:
df_policy_dimension['Duration'].value_counts()
Out[17]:
Semiannual    15034
Annual        14966
Name: Duration, dtype: int64

There seems to be an even split between semiannual and annual policies.

Creating merged master dataframes for easier analysis

Risk Fact (one row per driver)

For the risk fact table, there are 2 types of rows - purchase and refund. A positive premium indicates a purhcase of policy, while a negative premium indicates a refund (cancellation).

Let's merge purchase/refund rows into single rows, creating "Refund Amount" and "Net Premium" columns. The end goal is to have 1 row per driver.

In [18]:
# Create a dataframe of driver vs refund amount information
# Note that there is at maximum 1 refund row per driver
# If any driver has more than 1 refund, we will need to groupby "Driver ID" to sum the refund amounts
df_refund_by_driver = df_risk_fact_vehicle_id_fixed[df_risk_fact_vehicle_id_fixed['Total Premium'] < 0][['Driver ID', 'Total Premium']]
df_refund_by_driver.rename(columns={ 'Total Premium': 'Refund Amount' }, inplace=True)
df_refund_by_driver.head()
Out[18]:
Driver ID Refund Amount
35712 4758 -1695.029100
35713 7827 -414.218141
35714 11385 -790.590744
35715 21096 -425.856497
35716 21839 -144.661977
In [19]:
# Add refund amount to purchase rows (where Total Premium > 0)
df_risk_master = df_risk_fact_vehicle_id_fixed[df_risk_fact_vehicle_id_fixed['Total Premium'] > 0].merge(right=df_refund_by_driver, on='Driver ID', how='left')

# Add "Net Premium" column (Total Premium + Refund Amount)
# Treat np.nan as a 0 when adding two numbers
df_risk_master['Net Premium'] = df_risk_master['Total Premium'].add(df_risk_master['Refund Amount'], fill_value=0)

# Display added column
df_risk_master.head(3).style.applymap(new_column_style, subset=['Net Premium'])
Out[19]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium
0 1 1001758 4758 50 2006 65 3450.57 -1695.03 1755.54
1 2 1002901 5901 68 2013 536 1938.57 nan 1938.57
2 3 1009972 12972 43 2005 3174.57 nan 3174.57

Merge policy dimension to risk fact

In [20]:
df_risk_master = df_risk_master.merge(right=df_policy_dimension, how='inner', on='Policy Number').sort_values(by=['Policy Number', 'Driver ID'])

df_risk_master.head(3).style.applymap(new_column_style, 
                                            subset=df_policy_dimension.columns.to_series().drop('Policy Number'))
Out[20]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration
26 11 1000001 3001 34 2011 229 4434.57 nan 4434.57 2014-03-09 00:00:00 2015-03-09 00:00:00 11 4 NaT Annual
28353 22644 1000002 3002 61 2008 401 4795.82 nan 4795.82 2011-04-02 00:00:00 2012-04-02 00:00:00 15 2 NaT Annual
25958 20253 1000003 3003 61 2011 576 1325.81 nan 1325.81 2012-01-29 00:00:00 2012-07-31 00:00:00 3 7 NaT Semiannual

Merge driver dimension to risk fact

In [21]:
df_risk_master = df_risk_master.merge(right=df_driver_dimension, on='Driver ID')
df_risk_master.head(3).style.applymap(new_column_style,
                                           subset=df_driver_dimension.columns.to_series().drop('Driver ID'))
Out[21]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth
0 11 1000001 3001 34 2011 229 4434.57 nan 4434.57 2014-03-09 00:00:00 2015-03-09 00:00:00 11 4 NaT Annual Josefa Turnbill 6 2 S M 28 Leisure 1994-02-28 00:00:00
1 22644 1000002 3002 61 2008 401 4795.82 nan 4795.82 2011-04-02 00:00:00 2012-04-02 00:00:00 15 2 NaT Annual Deidre Whilden 6 2 S M 34 Work 1965-07-04 00:00:00
2 20253 1000003 3003 61 2011 576 1325.81 nan 1325.81 2012-01-29 00:00:00 2012-07-31 00:00:00 3 7 NaT Semiannual Fernande Lashbrook 2 0 S F 19 Leisure 1973-09-19 00:00:00

Add age of driver based on date of birth and policy effective date

In [22]:
df_risk_master['Age'] = (df_risk_master['Policy Effective Date'].dt.year - df_risk_master['Date of Birth'].dt.year) - 1

# Increase age if effective date has passed the date of birth for that year
df_risk_master['Age'] = df_risk_master['Age'] + (df_risk_master['Policy Effective Date'].dt.strftime('%m%d') > df_risk_master['Date of Birth'].dt.strftime('%m%d'))

# Convert to integer type
df_risk_master['Age'] = df_risk_master['Age'].astype('Int64')

# Check results
df_risk_master[['Policy Effective Date', 'Date of Birth', 'Age']].head(3).style.applymap(new_column_style, subset=['Age'])
Out[22]:
Policy Effective Date Date of Birth Age
0 2014-03-09 00:00:00 1994-02-28 00:00:00 20
1 2011-04-02 00:00:00 1965-07-04 00:00:00 45
2 2012-01-29 00:00:00 1973-09-19 00:00:00 38

Create age bins

In [23]:
df_risk_master['Age Bin'] = pd.cut(df_risk_master['Age'],
                     bins=[9.5, 19.5, 29.5, 39.5, 49.5, 59.5, 69.5, np.Infinity],
                     labels=['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+'],
                     include_lowest=True,
                     right=False,
                    )

df_risk_master[['Driver ID', 'Policy Effective Date', 'Date of Birth', 'Age', 'Age Bin']].head(3) \
    .style \
    .applymap(new_column_style, subset='Age Bin')
Out[23]:
Driver ID Policy Effective Date Date of Birth Age Age Bin
0 3001 2014-03-09 00:00:00 1994-02-28 00:00:00 20 20-29
1 3002 2011-04-02 00:00:00 1965-07-04 00:00:00 45 40-49
2 3003 2012-01-29 00:00:00 1973-09-19 00:00:00 38 30-39

Calculate monthly premium based on semiannual/annual classification we've done in policy dimension

In [24]:
df_risk_master['Monthly Premium'] = np.where(df_risk_master['Duration'] == 'Annual',
                                             df_risk_master['Total Premium'] / 12,
                                             df_risk_master['Total Premium'] / 6)

df_risk_master[['Policy Number', 'Driver ID', 'Total Premium', 'Duration', 'Monthly Premium']] \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['Monthly Premium'])
Out[24]:
Policy Number Driver ID Total Premium Duration Monthly Premium
0 1000001 3001 4434.57 Annual 369.548
1 1000002 3002 4795.82 Annual 399.652
2 1000003 3003 1325.81 Semiannual 220.969

Merge Claim Fact dataframe with Claim Expense Dimension.

In [25]:
df_claims_fact_merged = df_claims_fact.merge(right=df_claim_expense_dimension, on='Claim ID', how='left')
df_claims_fact_merged.head(3).style.applymap(new_column_style, subset=df_claim_expense_dimension.columns.to_series().drop('Claim ID'))
Out[25]:
Claim ID Policy ID Claimant Id Vehicle ID Claim Amount Claim Description Expense Amount Claim Handling Branch
0 1 1025498 28498 78 4130 Damage from impacts with animals (deer, etc.); note: a crash from swerving to avoid animals will likely fall under collision 165.2 3
1 2 1002163 5163 27 13340 Personal Injury 1200.6 2
2 3 1009472 12472 17 2100 Bodily Injury Claims 147 2

For each driver, calculate (1) sum of all claim amounts, (2) sum of all claim expenses, (3) number of claims. This step is to aggregate different claims by a single driver into one row.

In [26]:
df_claims_by_driver = df_claims_fact_merged.groupby(by='Claimant Id', as_index=False) \
    .agg({
        'Claim Description': 'count',
        'Claim Amount': 'sum', 
        'Expense Amount': 'sum'
    }) \
    .rename(columns={
        'Claim Description': 'Claim Count',
        'Claimant Id': 'Driver ID',
        'Expense Amount': 'Claim Expense'
    })

df_claims_by_driver.head(3)
Out[26]:
Driver ID Claim Count Claim Amount Claim Expense
0 1146 1 7470 1419.3
1 1328 1 6400 1088.0
2 3112 1 11180 670.8
In [27]:
# Merge both policy_dimension and df_claims_by_driver into risk_fact
df_risk_master = df_risk_master.merge(right=df_claims_by_driver, on='Driver ID', how='left')

# Convert claim count to an integer type
df_risk_master['Claim Count'] = df_risk_master['Claim Count'].astype('Int64')

df_risk_master.head(3).style.applymap(new_column_style,
                                      subset=df_claims_by_driver.columns.to_series().drop('Driver ID'))
Out[27]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth Age Age Bin Monthly Premium Claim Count Claim Amount Claim Expense
0 11 1000001 3001 34 2011 229 4434.57 nan 4434.57 2014-03-09 00:00:00 2015-03-09 00:00:00 11 4 NaT Annual Josefa Turnbill 6 2 S M 28 Leisure 1994-02-28 00:00:00 20 20-29 369.548 nan nan nan
1 22644 1000002 3002 61 2008 401 4795.82 nan 4795.82 2011-04-02 00:00:00 2012-04-02 00:00:00 15 2 NaT Annual Deidre Whilden 6 2 S M 34 Work 1965-07-04 00:00:00 45 40-49 399.652 nan nan nan
2 20253 1000003 3003 61 2011 576 1325.81 nan 1325.81 2012-01-29 00:00:00 2012-07-31 00:00:00 3 7 NaT Semiannual Fernande Lashbrook 2 0 S F 19 Leisure 1973-09-19 00:00:00 38 30-39 220.969 nan nan nan

Merge vehicles dimension into risk fact

In [28]:
df_vehicles_dimension.head(3)
Out[28]:
Vehicle ID Car Make Model Average Price
0 1 Nissan Versa 11000
1 2 Chevrolet Spark 13000
2 3 Chevrolet Sonic 15000
In [29]:
df_risk_master = df_risk_master.merge(right=df_vehicles_dimension, on='Vehicle ID', how='left').rename(columns={
    'Car Make': 'Make',
    'Average Price': 'Vehicle Average Price'
})

df_risk_master[['Policy Number', 'Driver ID', 'Make', 'Model', 'Vehicle Average Price']] \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['Make', 'Model', 'Vehicle Average Price'])
Out[29]:
Policy Number Driver ID Make Model Vehicle Average Price
0 1000001 3001 Dodge Challenger 26600
1 1000002 3002 Audi A3 SportBack 38900
2 1000003 3003 Audi A3 SportBack 38900

Add a column to indicate whether a policy was cancelled

In [30]:
df_risk_master['Cancelled'] = df_risk_master['Cancel Date'].notna()
df_risk_master[['Policy Number', 'Driver ID', 'Cancel Date', 'Cancelled']] \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['Cancelled'])
Out[30]:
Policy Number Driver ID Cancel Date Cancelled
0 1000001 3001 NaT False
1 1000002 3002 NaT False
2 1000003 3003 NaT False

Merge location dimension into risk fact

In [31]:
print('Before converting column data types')
print(f"df_risk_master['Location ID'].dtype: {df_risk_master['Location ID'].dtype}")
print(f"df_location_dimension['Location ID'].dtype: {df_location_dimension['Location ID'].dtype}")
Before converting column data types
df_risk_master['Location ID'].dtype: object
df_location_dimension['Location ID'].dtype: int64

However, the dtypes of df_risk_fact_cleaned and df_location_dimension are different as shown above. This will result in an error when merging the two dataframes by Location ID. Cast both to nullable integer types (Int64) first.

In [32]:
df_risk_master['Location ID'] = pd.to_numeric(df_risk_fact['Location ID'], errors='coerce').astype('Int64')
df_location_dimension['Location ID'] = df_location_dimension['Location ID'].astype('Int64')
print('After converting column data types')
print(f"df_risk_master['Location ID'].dtype: {df_risk_master['Location ID'].dtype}")
print(f"df_location_dimension['Location ID'].dtype: {df_location_dimension['Location ID'].dtype}")
After converting column data types
df_risk_master['Location ID'].dtype: Int64
df_location_dimension['Location ID'].dtype: Int64

Now, we should be able to merge.

In [33]:
df_risk_master = df_risk_master.merge(df_location_dimension, on='Location ID', how='left')
df_risk_master \
    .head(3)\
    .style \
    .applymap(new_column_style, subset=['ZIP', 'Primary_City', 'State', 'County', 'Estimated_population_2013'])
Out[33]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth Age Age Bin Monthly Premium Claim Count Claim Amount Claim Expense Make Model Vehicle Average Price Cancelled ZIP Primary_City State County Estimated_population_2013
0 11 1000001 3001 34 2011 65 4434.57 nan 4434.57 2014-03-09 00:00:00 2015-03-09 00:00:00 11 4 NaT Annual Josefa Turnbill 6 2 S M 28 Leisure 1994-02-28 00:00:00 20 20-29 369.548 nan nan nan Dodge Challenger 26600 False 92630 Lake Forest CA Orange County 56565
1 22644 1000002 3002 61 2008 536 4795.82 nan 4795.82 2011-04-02 00:00:00 2012-04-02 00:00:00 15 2 NaT Annual Deidre Whilden 6 2 S M 34 Work 1965-07-04 00:00:00 45 40-49 399.652 nan nan nan Audi A3 SportBack 38900 False 90222 Compton CA Los Angeles County 26204
2 20253 1000003 3003 61 2011 nan 1325.81 nan 1325.81 2012-01-29 00:00:00 2012-07-31 00:00:00 3 7 NaT Semiannual Fernande Lashbrook 2 0 S F 19 Leisure 1973-09-19 00:00:00 38 30-39 220.969 nan nan nan Audi A3 SportBack 38900 False nan nan nan nan nan

Keep in mind that there is one row where the Location ID is missing.

In [34]:
df_risk_master[df_risk_master['Location ID'].isna()]
Out[34]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth Age Age Bin Monthly Premium Claim Count Claim Amount Claim Expense Make Model Vehicle Average Price Cancelled ZIP Primary_City State County Estimated_population_2013
2 20253 1000003 3003 61 2011 NaN 1325.814736 NaN 1325.814736 2012-01-29 2012-07-31 3 7 NaT Semiannual Fernande Lashbrook 2 0 S F 19.0 Leisure 1973-09-19 38 30-39 220.969123 NaN NaN NaN Audi A3 SportBack 38900 False NaN NaN NaN NaN NaN
In [35]:
df_risk_master.rename(columns={
    'Primary_City': 'City', 
    'Estimated_population_2013': 'Estimated Population'
}, inplace=True) \

df_risk_master.head(3)
Out[35]:
Record ID Policy Number Driver ID Vehicle ID Vehicle Model Year Location ID Total Premium Refund Amount Net Premium Policy Effective Date Policy Expiration Date Agency ID Underwriting Company ID Cancel Date Duration First Name Last Name Number of Violations Number of Accidents Marital Status Gender Miles to work Primary Vehicle Use Date of Birth Age Age Bin Monthly Premium Claim Count Claim Amount Claim Expense Make Model Vehicle Average Price Cancelled ZIP City State County Estimated Population
0 11 1000001 3001 34 2011 65 4434.570900 NaN 4434.570900 2014-03-09 2015-03-09 11 4 NaT Annual Josefa Turnbill 6 2 S M 28.0 Leisure 1994-02-28 20 20-29 369.547575 NaN NaN NaN Dodge Challenger 26600 False 92630.0 Lake Forest CA Orange County 56565.0
1 22644 1000002 3002 61 2008 536 4795.820215 NaN 4795.820215 2011-04-02 2012-04-02 15 2 NaT Annual Deidre Whilden 6 2 S M 34.0 Work 1965-07-04 45 40-49 399.651685 NaN NaN NaN Audi A3 SportBack 38900 False 90222.0 Compton CA Los Angeles County 26204.0
2 20253 1000003 3003 61 2011 NaN 1325.814736 NaN 1325.814736 2012-01-29 2012-07-31 3 7 NaT Semiannual Fernande Lashbrook 2 0 S F 19.0 Leisure 1973-09-19 38 30-39 220.969123 NaN NaN NaN Audi A3 SportBack 38900 False NaN NaN NaN NaN NaN

Claims Fact (one row per claim)

Create a master claims fact dataframe.

In [36]:
df_claims_master = df_claims_fact_merged \
    .merge(
        right=df_claim_branches, 
        left_on='Claim Handling Branch',
        right_on='Branch Id',
        how='left'
    ) \
    .drop(columns=['Claim Handling Branch']) \
    .rename(columns={ 
        'Expense Amount': 'Claim Expense',
        'Branch Id': 'Branch ID',
        'Claimant Id': 'Driver ID'
    })

df_claims_master.head(3)
Out[36]:
Claim ID Policy ID Driver ID Vehicle ID Claim Amount Claim Description Claim Expense Branch ID Branch Name
0 1 1025498 28498 78 4130 Damage from impacts with animals (deer, etc.);... 165.2 3 Norwak CA Branch
1 2 1002163 5163 27 13340 Personal Injury 1200.6 2 Watsonville CA Branch
2 3 1009472 12472 17 2100 Bodily Injury Claims 147.0 2 Watsonville CA Branch

Potential erroneous parts of data

Remember, the data was artificially created for this project. This section explores any odd parts that seem unrealistic.

Can a driver have a refund amount larger than the paid amount?

Check for drivers where the sum of premium grouped by that driver ID is negative.

In [37]:
premiums = df_risk_master.groupby('Driver ID')['Net Premium'].sum().to_frame().reset_index()
negative_premiums = premiums[premiums['Net Premium'] < 0]

display(negative_premiums.head().style.applymap(highlight_style, subset=['Net Premium']))
print(f'{negative_premiums.shape[0]} drivers have negative sum of premiums.')
Driver ID Net Premium
252 3248 -94.8582
679 3675 -9.21436
740 3736 -10.6938
831 3827 -95.2198
1299 4295 -31.1103
90 drivers have negative sum of premiums.

There are policies that have refund amounts larger than the payment amount.

In [38]:
# Split the dataframe by whether net premium is positive negative
df_risk_fact_positive_premiums = df_risk_master[df_risk_master['Net Premium'] > 0]
df_risk_fact_negative_premiums = df_risk_master[df_risk_master['Net Premium'] < 0]

df_risk_fact_negative_premiums[['Policy Number', 'Driver ID', 'Total Premium', 'Refund Amount', 'Net Premium', 'Duration']] \
    .head() \
    .style \
    .applymap(highlight_style, subset=['Duration'])
Out[38]:
Policy Number Driver ID Total Premium Refund Amount Net Premium Duration
293 1000248 3248 2142.57 -2237.43 -94.8582 Semiannual
787 1000675 3675 1579.89 -1589.1 -9.21436 Semiannual
863 1000736 3736 1912.6 -1923.29 -10.6938 Semiannual
967 1000827 3827 999.842 -1095.06 -95.2198 Semiannual
1517 1001295 4295 1688.77 -1719.88 -31.1103 Semiannual

These negative net premium rows seem to have semianuual duration. Let's look at the exact proportions.

In [39]:
fig = make_subplots(1, 2, specs=[[{'type': 'domain'}, {'type': 'domain'}]], subplot_titles=['Positive Net Premiums', 'Negative Net Premiums'])
pie_labels = ['Annual', 'Semiannual']

fig.add_trace(
    go.Pie(
        labels=pie_labels,
        values=df_risk_fact_positive_premiums['Duration'].value_counts().sort_index(),
        text=pie_labels,
        textposition='inside',
        insidetextorientation='horizontal',
        pull=[0.0, 0, 0.1, 0],
        sort=False,
        hole=0.3
    ), 1, 1
)

fig.add_trace(
    go.Pie(
        labels=pie_labels,
        values=df_risk_fact_negative_premiums['Duration'].value_counts().sort_index(),
        text=pie_labels,
        textposition='inside',
        insidetextorientation='horizontal',
        pull=[0.0, 0, 0.1, 0],
        sort=False,
        hole=0.3,
    ), 1, 2
)

fig.update_layout(title_text='Durations by Positive/Negative Net Premiums')
fig.show()