Analysis of Hawk/dove multiple risk attitudes with adjustment¶

Analysis of parameter correlation with population risk attitudes.

In [40]:
import pandas as pd
import glob

df = pd.read_csv("../../data/hawkdovemulti/job_task_2024-02-07T064022_648635_model.csv")
In [41]:
df.head()
Out[41]:
RunId iteration Step grid_size risk_adjustment play_neighborhood observed_neighborhood adjust_neighborhood hawk_odds adjust_every ... total_r0 total_r1 total_r2 total_r3 total_r4 total_r5 total_r6 total_r7 total_r8 total_r9
0 6 0 31 10 adopt 8 8 8 0.5 2 ... 0 7 6 10 17 10 14 10 18 8
1 8 0 53 10 adopt 8 8 8 0.5 2 ... 35 9 6 2 0 0 0 6 15 27
2 7 0 65 10 adopt 8 8 8 0.5 2 ... 0 8 31 4 19 7 31 0 0 0
3 4 0 60 10 adopt 8 8 8 0.5 2 ... 10 15 24 15 12 9 9 2 1 3
4 12 0 61 10 adopt 8 8 8 0.5 10 ... 2 7 7 37 11 16 13 5 2 0

5 rows × 28 columns

In [42]:
df.columns
Out[42]:
Index(['RunId', 'iteration', 'Step', 'grid_size', 'risk_adjustment',
       'play_neighborhood', 'observed_neighborhood', 'adjust_neighborhood',
       'hawk_odds', 'adjust_every', 'risk_distribution', 'adjust_payoff',
       'max_agent_points', 'percent_hawk', 'rolling_percent_hawk', 'status',
       'total_agents', 'population_risk_category', 'total_r0', 'total_r1',
       'total_r2', 'total_r3', 'total_r4', 'total_r5', 'total_r6', 'total_r7',
       'total_r8', 'total_r9'],
      dtype='object')
In [43]:
# calculate percentages for each risk attitude

for i in range(0, 10):
    df[f'pct_r{i}'] = df.apply(lambda row: row[f'total_r{i}'] / row.total_agents, axis=1)

df.head()
Out[43]:
RunId iteration Step grid_size risk_adjustment play_neighborhood observed_neighborhood adjust_neighborhood hawk_odds adjust_every ... pct_r0 pct_r1 pct_r2 pct_r3 pct_r4 pct_r5 pct_r6 pct_r7 pct_r8 pct_r9
0 6 0 31 10 adopt 8 8 8 0.5 2 ... 0.00 0.07 0.06 0.10 0.17 0.10 0.14 0.10 0.18 0.08
1 8 0 53 10 adopt 8 8 8 0.5 2 ... 0.35 0.09 0.06 0.02 0.00 0.00 0.00 0.06 0.15 0.27
2 7 0 65 10 adopt 8 8 8 0.5 2 ... 0.00 0.08 0.31 0.04 0.19 0.07 0.31 0.00 0.00 0.00
3 4 0 60 10 adopt 8 8 8 0.5 2 ... 0.10 0.15 0.24 0.15 0.12 0.09 0.09 0.02 0.01 0.03
4 12 0 61 10 adopt 8 8 8 0.5 10 ... 0.02 0.07 0.07 0.37 0.11 0.16 0.13 0.05 0.02 0.00

5 rows × 38 columns

In [44]:
# calculate percentages for each risk grouping

#  Risk-inclined (RI) : r = 0, 1, 2
#  Risk-moderate (RM): r = 3, 4, 5, 6
#  Risk-avoidant (RA): r = 7, 8, 9

df['pct_risk_inclined'] = df.apply(lambda row: (row.total_r0 + row.total_r1 + row.total_r2) / row.total_agents, axis=1)
df['pct_risk_moderate'] = df.apply(lambda row: (row.total_r3 + row.total_r4 + row.total_r5 + row.total_r6) / row.total_agents, axis=1)
df['pct_risk_avoidant'] = df.apply(lambda row: (row.total_r7 + row.total_r8 + row.total_r9) / row.total_agents, axis=1)


df[['pct_r0', 'pct_r1', 'pct_risk_inclined', 'pct_risk_moderate', 'pct_risk_avoidant']].head(10)
Out[44]:
pct_r0 pct_r1 pct_risk_inclined pct_risk_moderate pct_risk_avoidant
0 0.00 0.07 0.13 0.51 0.36
1 0.35 0.09 0.50 0.02 0.48
2 0.00 0.08 0.39 0.61 0.00
3 0.10 0.15 0.49 0.45 0.06
4 0.02 0.07 0.16 0.77 0.07
5 0.00 0.06 0.09 0.37 0.54
6 0.06 0.13 0.20 0.77 0.03
7 0.00 0.21 0.28 0.71 0.01
8 0.14 0.66 0.88 0.12 0.00
9 0.00 0.11 0.17 0.83 0.00
In [45]:
from sklearn import linear_model

# independent variables
grid_size = df[['grid_size']]
# dependent variable
pct_risk_inclined = df['pct_risk_inclined']


regr = linear_model.LinearRegression()
regr.fit(grid_size, pct_risk_inclined)

print("Coefficients: \n", regr.coef_)
Coefficients: 
 [-0.00155227]
In [46]:
_pred = regr.predict(df[['grid_size']])
In [47]:
import matplotlib.pyplot as plt


plt.scatter(grid_size, pct_risk_inclined, color="black")
plt.plot(grid_size, _pred, color="blue", linewidth=3)
plt.show()
No description has been provided for this image
In [48]:
df.pct_risk_inclined.hist()
Out[48]:
<Axes: >
No description has been provided for this image
In [49]:
df.pct_risk_moderate.hist()
Out[49]:
<Axes: >
No description has been provided for this image
In [50]:
df.pct_risk_avoidant.hist()
Out[50]:
<Axes: >
No description has been provided for this image
In [51]:
df.columns
Out[51]:
Index(['RunId', 'iteration', 'Step', 'grid_size', 'risk_adjustment',
       'play_neighborhood', 'observed_neighborhood', 'adjust_neighborhood',
       'hawk_odds', 'adjust_every', 'risk_distribution', 'adjust_payoff',
       'max_agent_points', 'percent_hawk', 'rolling_percent_hawk', 'status',
       'total_agents', 'population_risk_category', 'total_r0', 'total_r1',
       'total_r2', 'total_r3', 'total_r4', 'total_r5', 'total_r6', 'total_r7',
       'total_r8', 'total_r9', 'pct_r0', 'pct_r1', 'pct_r2', 'pct_r3',
       'pct_r4', 'pct_r5', 'pct_r6', 'pct_r7', 'pct_r8', 'pct_r9',
       'pct_risk_inclined', 'pct_risk_moderate', 'pct_risk_avoidant'],
      dtype='object')
In [52]:
import seaborn as sns

# do a pair plot on a subset of numeric columns that might correlate to population results

df_compare = df[['grid_size', 'risk_adjustment', 'hawk_odds', 'play_neighborhood', 'observed_neighborhood', 'adjust_neighborhood', 'pct_risk_inclined']]

_ = sns.pairplot(df_compare, kind="reg", diag_kind="kde")
No description has been provided for this image
In [53]:
# from the pair plot, one of the few that looks like it might be interesting is hawk odds

hawk_odds = df[['hawk_odds']]

hawkodds_regr = linear_model.LinearRegression()
hawkodds_regr.fit(hawk_odds, pct_risk_inclined)

print("Coefficients: \n", regr.coef_)
Coefficients: 
 [-0.00155227]
In [54]:
hawkodds_pred = hawkodds_regr.predict(hawk_odds)
hawkodds_pred
Out[54]:
array([0.34652343, 0.34652343, 0.34652343, ..., 0.38121947, 0.3118274 ,
       0.34652343], shape=(8905,))
In [55]:
plt.scatter(hawk_odds, pct_risk_inclined, color="gray")
plt.plot(hawk_odds, hawkodds_pred, color="blue", linewidth=3)
plt.show()
No description has been provided for this image
In [56]:
# what about adjustment neighborhood?

adjust_nhood = df[['adjust_neighborhood']]

adjust_nhood_regr = linear_model.LinearRegression()
adjust_nhood_regr.fit(adjust_nhood, pct_risk_inclined)

print("Coefficients: \n", adjust_nhood_regr.coef_)
Coefficients: 
 [0.00126239]
In [57]:
adjust_nhood_pred = adjust_nhood_regr.predict(adjust_nhood)
adjust_nhood_pred
Out[57]:
array([0.34146791, 0.34146791, 0.34146791, ..., 0.33641833, 0.34146791,
       0.34146791], shape=(8905,))
In [58]:
plt.scatter(adjust_nhood, pct_risk_inclined, color="gray")
plt.plot(adjust_nhood, adjust_nhood_pred, color="blue", linewidth=3)
plt.show()
No description has been provided for this image
In [59]:
# following along from here:
# https://scikit-learn.org/stable/auto_examples/inspection/plot_linear_model_coefficient_interpretation.html#the-machine-learning-pipeline

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ["risk_adjustment", "adjust_payoff", "risk_distribution"]
numerical_columns = ['grid_size', 'hawk_odds', 'play_neighborhood', 'observed_neighborhood', 'adjust_neighborhood']

preprocessor = make_column_transformer(
    (OneHotEncoder(drop="if_binary"), categorical_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,  # avoid to prepend the preprocessor names
)
In [60]:
import numpy as np
import scipy as sp

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(
        # the scikit learn tutorial uses a log function for predicting wages; we're checking for correlation for a % of population, so disable the log scale
        regressor=Ridge(alpha=1e-10), #func=np.log10, inverse_func=sp.special.exp10
    ),
)
In [61]:
# make an input dataset based on the categorical and numerical columns we want to use

df_input = df[categorical_columns + numerical_columns]
# what if we try with only the numerics first?
#df_input = df[numerical_columns]

df_input.head()
Out[61]:
risk_adjustment adjust_payoff risk_distribution grid_size hawk_odds play_neighborhood observed_neighborhood adjust_neighborhood
0 adopt recent skewed right 10 0.5 8 8 8
1 adopt recent bimodal 10 0.5 8 8 8
2 adopt total skewed right 10 0.5 8 8 8
3 adopt recent skewed left 10 0.5 8 8 8
4 adopt recent normal 10 0.5 8 8 8
In [62]:
df_input.columns
Out[62]:
Index(['risk_adjustment', 'adjust_payoff', 'risk_distribution', 'grid_size',
       'hawk_odds', 'play_neighborhood', 'observed_neighborhood',
       'adjust_neighborhood'],
      dtype='object')
In [63]:
# target for prediction - use percent of population that end up risk inclined;
# using this as a preliminary a stand-in for various population categories for now

target = df['pct_risk_inclined']
In [64]:
from sklearn.model_selection import train_test_split

input_train, input_test, target_train, target_test = train_test_split(df_input, target, random_state=42)
In [65]:
input_train
Out[65]:
risk_adjustment adjust_payoff risk_distribution grid_size hawk_odds play_neighborhood observed_neighborhood adjust_neighborhood
1951 adopt recent uniform 10 0.25 4 24 24
3746 average recent bimodal 10 0.50 24 4 8
351 adopt total uniform 10 0.75 8 24 8
1511 adopt total bimodal 10 0.75 24 4 24
5741 adopt total uniform 25 0.50 24 8 24
... ... ... ... ... ... ... ... ...
5734 adopt total skewed right 25 0.50 24 8 24
5191 adopt total skewed left 25 0.50 8 24 24
5390 adopt recent skewed left 25 0.25 8 4 8
860 adopt total uniform 10 0.50 24 8 24
7270 average recent normal 25 0.50 8 8 8

6678 rows × 8 columns

In [66]:
train_dataset = input_train.copy()
train_dataset.insert(0, "pct_risk_inclined", target_train)
# subset to numeric and target only
train_dataset_subset = train_dataset[['pct_risk_inclined', 'grid_size', 'hawk_odds', 'play_neighborhood', 'observed_neighborhood', 'adjust_neighborhood']]
_ = sns.pairplot(train_dataset_subset, kind="reg", diag_kind="kde")
No description has been provided for this image
In [67]:
# how many zeroes do we have?
target_train[target_train == 0]
Out[67]:
1511    0.0
2918    0.0
1583    0.0
1215    0.0
1129    0.0
       ... 
5249    0.0
5232    0.0
2731    0.0
6863    0.0
161     0.0
Name: pct_risk_inclined, Length: 266, dtype: float64
In [68]:
# fit the model to the training data
model.fit(input_train, target_train)
Out[68]:
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(drop='if_binary'),
                                                  ['risk_adjustment',
                                                   'adjust_payoff',
                                                   'risk_distribution'])],
                                   verbose_feature_names_out=False)),
                ('transformedtargetregressor',
                 TransformedTargetRegressor(regressor=Ridge(alpha=1e-10)))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('columntransformer', ...), ('transformedtargetregressor', ...)]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('onehotencoder', ...)]
remainder  'passthrough'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  False
force_int_remainder_cols  'deprecated'
['risk_adjustment', 'adjust_payoff', 'risk_distribution']
Parameters
categories  'auto'
drop  'if_binary'
sparse_output  True
dtype  <class 'numpy.float64'>
handle_unknown  'error'
min_frequency  None
max_categories  None
feature_name_combiner  'concat'
['grid_size', 'hawk_odds', 'play_neighborhood', 'observed_neighborhood', 'adjust_neighborhood']
passthrough
Parameters
regressor  Ridge(alpha=1e-10)
transformer  None
func  None
inverse_func  None
check_inverse  True
Ridge(alpha=1e-10)
Parameters
alpha  1e-10
fit_intercept  True
copy_X  True
max_iter  None
tol  0.0001
solver  'auto'
positive  False
random_state  None
In [69]:
from sklearn.metrics import PredictionErrorDisplay, median_absolute_error

mae_train = median_absolute_error(target_train, model.predict(input_train))
target_pred = model.predict(input_test)
mae_test = median_absolute_error(target_test, target_pred)
scores = {
    "MedAE on training set": f"{mae_train:.2f}",
    "MedAE on testing set": f"{mae_test:.2f}",
}
print(scores)
{'MedAE on training set': '0.10', 'MedAE on testing set': '0.10'}
In [70]:
_, ax = plt.subplots(figsize=(5, 5))
display = PredictionErrorDisplay.from_predictions(
    target_test, target_pred, kind="actual_vs_predicted", ax=ax, scatter_kwargs={"alpha": 0.5}
)
ax.set_title("Ridge model, small regularization")
for name, score in scores.items():
    ax.plot([], [], " ", label=f"{name}: {score}")
ax.legend(loc="upper left")
plt.tight_layout()
No description has been provided for this image
In [71]:
feature_names = model[:-1].get_feature_names_out()

coefs = pd.DataFrame(
    model[-1].regressor_.coef_,
    columns=["Coefficients"],
    index=feature_names,
)

coefs
Out[71]:
Coefficients
risk_adjustment_average 0.020990
adjust_payoff_total 0.005827
risk_distribution_bimodal 0.082914
risk_distribution_normal -0.155838
risk_distribution_skewed left 0.140480
risk_distribution_skewed right -0.122206
risk_distribution_uniform 0.021653
grid_size -0.001389
hawk_odds 0.138495
play_neighborhood -0.001303
observed_neighborhood -0.000993
adjust_neighborhood 0.001200
In [72]:
coefs.plot.barh(figsize=(9, 7))
plt.title("Ridge model, small regularization")
plt.axvline(x=0, color=".5")
plt.xlabel("Raw coefficient values")
plt.subplots_adjust(left=0.3)
No description has been provided for this image
In [73]:
input_train_preprocessed = pd.DataFrame(
    model[:-1].transform(input_train), columns=feature_names
)

input_train_preprocessed.std(axis=0).plot.barh(figsize=(9, 7))
plt.title("Feature ranges")
plt.xlabel("Std. dev. of feature values")
plt.subplots_adjust(left=0.3)
No description has been provided for this image
In [74]:
coefs = pd.DataFrame(
    model[-1].regressor_.coef_ * input_train_preprocessed.std(axis=0),
    columns=["Coefficient importance"],
    index=feature_names,
)
coefs.plot(kind="barh", figsize=(9, 7))
plt.xlabel("Coefficient values corrected by the feature's std. dev.")
plt.title("Ridge model, small regularization")
plt.axvline(x=0, color=".5")
plt.subplots_adjust(left=0.3)
No description has been provided for this image
In [75]:
from sklearn.model_selection import RepeatedKFold, cross_validate

cv = RepeatedKFold(n_splits=5, n_repeats=5, random_state=0)
cv_model = cross_validate(
    model,
    df_input,
    target,
    cv=cv,
    return_estimator=True,
    n_jobs=2,
)

coefs = pd.DataFrame(
    [
        est[-1].regressor_.coef_ * est[:-1].transform(df_input.iloc[train_idx]).std(axis=0)
        for est, (train_idx, _) in zip(cv_model["estimator"], cv.split(df_input, target))
    ],
    columns=feature_names,
)
/Users/rkoeser/workarea/env/simrisk/lib/python3.12/site-packages/scipy/_lib/_util.py:1226: LinAlgWarning: Ill-conditioned matrix (rcond=2.46716e-17): result may not be accurate.
  return f(*arrays, *other_args, **kwargs)
/Users/rkoeser/workarea/env/simrisk/lib/python3.12/site-packages/scipy/_lib/_util.py:1226: LinAlgWarning: Ill-conditioned matrix (rcond=8.45374e-17): result may not be accurate.
  return f(*arrays, *other_args, **kwargs)
/Users/rkoeser/workarea/env/simrisk/lib/python3.12/site-packages/scipy/_lib/_util.py:1226: LinAlgWarning: Ill-conditioned matrix (rcond=9.26618e-17): result may not be accurate.
  return f(*arrays, *other_args, **kwargs)
/Users/rkoeser/workarea/env/simrisk/lib/python3.12/site-packages/scipy/_lib/_util.py:1226: LinAlgWarning: Ill-conditioned matrix (rcond=9.81204e-17): result may not be accurate.
  return f(*arrays, *other_args, **kwargs)
In [76]:
plt.figure(figsize=(9, 7))
sns.stripplot(data=coefs, orient="h", palette="dark:k", alpha=0.5)
sns.boxplot(data=coefs, orient="h", color="cyan", saturation=0.5)
plt.axvline(x=0, color=".5")
plt.title("Coefficient importance and its variability")
plt.xlabel("Coefficient importance")
plt.suptitle("Ridge model, small regularization, AGE dropped")
plt.subplots_adjust(left=0.3)
No description has been provided for this image