The Benefits of Kernel PCovR for the WHO Dataset¶

[1]:

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

from skmatter.preprocessing import StandardFlexibleScaler
from skmatter.decomposition import PCovR, KernelPCovR
from skmatter.datasets import load_who_dataset

from scipy.stats import pearsonr

Load the Dataset¶

[2]:

df = load_who_dataset()['data']
df

[2]:

	Country	Year	SP.POP.TOTL	SH.TBS.INCD	SH.IMM.MEAS	SE.XPD.TOTL.GD.ZS	SH.DYN.AIDS.ZS	SP.DYN.LE00.IN	SH.IMM.IDPT	SH.XPD.CHEX.GD.ZS	SN.ITK.DEFC.ZS	NY.GDP.PCAP.CD
0	Afghanistan	2005	24411191.0	189.0	50.0	2.57000	0.1	58.361	58.0	9.948290	36.1	255.055120
1	Afghanistan	2006	25442944.0	189.0	53.0	2.90000	0.1	58.684	58.0	10.622766	33.3	274.000486
2	Afghanistan	2007	25903301.0	189.0	55.0	2.85000	0.1	59.111	63.0	9.904675	29.8	375.078128
3	Afghanistan	2008	26427199.0	189.0	59.0	3.51000	0.1	59.852	64.0	10.256495	26.5	387.849174
4	Afghanistan	2009	27385307.0	189.0	60.0	3.73000	0.1	60.364	63.0	9.818487	23.3	443.845151
...	...	...	...	...	...	...	...	...	...	...	...	...
2015	South Africa	2015	55876504.0	988.0	86.0	5.48285	18.2	63.950	85.0	8.790190	5.2	6204.929901
2016	South Africa	2016	56422274.0	805.0	84.0	5.44424	18.4	64.747	85.0	8.821429	5.4	5735.066787
2017	South Africa	2017	56641209.0	738.0	81.0	5.59867	18.5	65.402	84.0	8.722624	5.5	6734.475153
2018	South Africa	2018	57339635.0	677.0	81.0	5.64401	18.6	65.674	82.0	8.858297	5.7	7048.522211
2019	South Africa	2019	58087055.0	615.0	83.0	5.91771	18.6	66.175	85.0	9.109355	6.3	6688.787271

2020 rows × 12 columns

[3]:

columns = [
    "SP.POP.TOTL",
    "SH.TBS.INCD",
    "SH.IMM.MEAS",
    "SE.XPD.TOTL.GD.ZS",
    "SH.DYN.AIDS.ZS",
    "SH.IMM.IDPT",
    "SH.XPD.CHEX.GD.ZS",
    "SN.ITK.DEFC.ZS",
    "NY.GDP.PCAP.CD",
]

X_raw = np.array(df[columns])

# We are taking the logarithm of the population  and GDP to avoid extreme distributions
log_scaled = ["SP.POP.TOTL", "NY.GDP.PCAP.CD"]
for ls in log_scaled:
    print(X_raw[:, columns.index(ls)].min(), X_raw[:, columns.index(ls)].max())
    if ls in columns:
        X_raw[:, columns.index(ls)] = np.log10(X_raw[:, columns.index(ls)])
y_raw = np.array(df["SP.DYN.LE00.IN"])  # [np.where(df['Year']==2000)[0]])
y_raw = y_raw.reshape(-1, 1)
X_raw.shape

149841.0 7742681934.0
110.460874721483 123678.70214327476

[3]:

(2020, 9)

Scale and Center the Features and Targets¶

[4]:

x_scaler = StandardFlexibleScaler(column_wise=True)
X = x_scaler.fit_transform(X_raw)

y_scaler = StandardFlexibleScaler(column_wise=True)
y = y_scaler.fit_transform(y_raw)

n_components = 2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)

Train the Different Linear DR Techniques¶

[5]:

# Best Error for Linear Regression
RidgeCV(cv=5, alphas=np.logspace(-8,2, 20), fit_intercept=False).fit(X_train, y_train).score(X_test, y_test)

[5]:

0.8548848257886273

PCovR¶

[6]:

pcovr = PCovR(n_components=n_components, regressor=Ridge(alpha=1e-4, fit_intercept=False), mixing=0.5, random_state=0).fit(X_train, y_train)
T_train_pcovr = pcovr.transform(X_train)
T_test_pcovr = pcovr.transform(X_test)
T_pcovr = pcovr.transform(X)

r_pcovr = Ridge(alpha=1e-4, fit_intercept=False, random_state=0).fit(T_train_pcovr, y_train)
yp_pcovr = r_pcovr.predict(T_test_pcovr)

plt.scatter(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(yp_pcovr))
r_pcovr.score(T_test_pcovr, y_test)

/home/docs/checkouts/readthedocs.org/user_builds/scikit-matter/envs/v0.1.4/lib/python3.8/site-packages/skmatter/decomposition/_pcovr.py:274: UserWarning: This class does not automatically center data, and your data mean is greater than the supplied tolerance.
  warnings.warn(

[6]:

0.8267220275787428

../_images/read-only-examples_PCovR-WHODataset_10_2.png

PCA¶

[7]:

pca = PCA(
    n_components=n_components, random_state=0,
).fit(X_train, y_train)
T_train_pca = pca.transform(X_train)
T_test_pca = pca.transform(X_test)
T_pca = pca.transform(X)

r_pca = Ridge(alpha=1e-4, fit_intercept=False, random_state=0).fit(T_train_pca, y_train)
yp_pca = r_pca.predict(T_test_pca)

plt.scatter(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(yp_pca))
r_pca.score(T_test_pca, y_test)

[7]:

0.8041174131375703

../_images/read-only-examples_PCovR-WHODataset_12_1.png

[8]:

for c, x in zip(columns, X.T):
    print(c, pearsonr(x, T_pca[:,0])[0], pearsonr(x, T_pca[:,1])[0])

SP.POP.TOTL 0.22694404485361105 -0.3777743593940674
SH.TBS.INCD 0.6249287177098696 0.631621515170247
SH.IMM.MEAS -0.8425862283813432 0.1360690482747245
SE.XPD.TOTL.GD.ZS -0.41457342404840203 0.6100854823971242
SH.DYN.AIDS.ZS 0.3260933054303088 0.8499296260662156
SH.IMM.IDPT -0.8422637385674647 0.16339769662914994
SH.XPD.CHEX.GD.ZS -0.4590012089554527 0.3068630393788181
SN.ITK.DEFC.ZS 0.8212324937958551 0.05510883584395285
NY.GDP.PCAP.CD -0.8042167907410391 0.06566227478694735

Train the Different Kernel DR Techniques¶

Select Kernel Hyperparameters¶

[9]:

param_grid = {"gamma": np.logspace(-8, 3, 20), "alpha": np.logspace(-8, 3, 20)}
clf = KernelRidge(kernel='rbf')

gs = GridSearchCV(estimator=clf, param_grid=param_grid)
gs.fit(X_train, y_train)
gs.best_estimator_

[9]:

KernelRidge(alpha=0.0016237767391887243, gamma=0.08858667904100832,
            kernel='rbf')

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

[10]:

# Best Error for Kernel Regression
gs.best_score_

[10]:

0.971744613370008

[11]:

kernel_params = {"kernel": "rbf", "gamma": gs.best_estimator_.gamma}

KPCovR¶

[12]:

kpcovr = KernelPCovR(
    n_components=n_components,
    regressor=KernelRidge(alpha=gs.best_estimator_.alpha, **kernel_params),
    mixing=0.5,
    **kernel_params,
).fit(X_train, y_train)

T_train_kpcovr = kpcovr.transform(X_train)
T_test_kpcovr = kpcovr.transform(X_test)
T_kpcovr = kpcovr.transform(X)

r_kpcovr = KernelRidge(**kernel_params).fit(T_train_kpcovr, y_train)
yp_kpcovr = r_kpcovr.predict(T_test_kpcovr)

plt.scatter(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(yp_kpcovr))
r_kpcovr.score(T_test_kpcovr, y_test)

[12]:

0.9701003539459829

../_images/read-only-examples_PCovR-WHODataset_20_1.png

KPCA¶

[13]:

kpca = KernelPCA(
    n_components=n_components,
    **kernel_params,
    random_state=0
).fit(X_train, y_train)

T_train_kpca = kpca.transform(X_train)
T_test_kpca = kpca.transform(X_test)
T_kpca = kpca.transform(X)

r_kpca = KernelRidge(**kernel_params).fit(T_train_kpca, y_train)
yp_kpca = r_kpca.predict(T_test_kpca)

plt.scatter(y_scaler.inverse_transform(y_test), y_scaler.inverse_transform(yp_kpca))
r_kpca.score(T_test_kpca, y_test)

[13]:

0.6661226058827701

../_images/read-only-examples_PCovR-WHODataset_22_1.png

Correlation of the different variables with the KPCovR axes¶

[14]:

for c, x in zip(columns, X.T):
    print(c, pearsonr(x, T_kpcovr[:,0])[0], pearsonr(x, T_kpcovr[:,1])[0])

SP.POP.TOTL 0.07320109486752663 0.03969226130206769
SH.TBS.INCD 0.6836177728806969 -0.053847467712589955
SH.IMM.MEAS -0.6604939713031328 0.047519698516421745
SE.XPD.TOTL.GD.ZS -0.2300978893002612 -0.36227488660047863
SH.DYN.AIDS.ZS 0.5157981075022106 -0.11701327000167581
SH.IMM.IDPT -0.6449500965013223 0.052622267816944665
SH.XPD.CHEX.GD.ZS -0.38019935560128637 -0.5736426627630692
SN.ITK.DEFC.ZS 0.7301250686596606 0.04793454286931878
NY.GDP.PCAP.CD -0.822866009733033 -0.4938636569729072

Plot Our Results¶

[15]:

fig, axes = plt.subplot_mosaic(
    """
                                AFF.B
                                A.GGB
                                .....
                                CHH.D
                                C.IID
                                .....
                                EEEEE
                                """,
    figsize=(7.5, 7.5),
    gridspec_kw=dict(
        height_ratios=(0.5, 0.5, 0.1, 0.5, 0.5, 0.1, 0.1),
        width_ratios=(1, 0.1, 0.2, 0.1, 1)
    ),
)
axPCA, axPCovR, axKPCA, axKPCovR = axes["A"], axes["B"], axes["C"], axes["D"]
axPCAy, axPCovRy, axKPCAy, axKPCovRy = axes["F"], axes["G"], axes["H"], axes["I"]

def add_subplot(ax, axy, T, yp, let=''):
    p = ax.scatter(-T[:, 0], T[:, 1], c=y_raw, s=4)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.annotate(
        xy=(0.025, 0.95),
        xycoords="axes fraction",
        text=f"({let})",
        va='top',
        ha='left'
    )
    axy.scatter(
        y_scaler.inverse_transform(y_test),
        y_scaler.inverse_transform(yp),
        c="k",
        s=1,
    )
    axy.plot([y_raw.min(), y_raw.max()], [y_raw.min(), y_raw.max()], "r--")
    axy.annotate(
        xy=(0.05, 0.95),
        xycoords="axes fraction",
        text=r"R$^2$=%0.2f" % round(r2_score(y_test, yp), 3),
        va='top',
        ha='left',
        fontsize=8,
    )
    axy.set_xticks([])
    axy.set_yticks([])
    return p


p = add_subplot(axPCA, axPCAy, T_pca, yp_pca, 'a')
axPCA.set_xlabel("PC$_1$")
axPCA.set_ylabel("PC$_2$")

add_subplot(axPCovR, axPCovRy, T_pcovr @ np.diag([-1,1]), yp_pcovr, 'b')
axPCovR.yaxis.set_label_position("right")
axPCovR.set_xlabel("PCov$_1$")
axPCovR.set_ylabel("PCov$_2$", rotation=-90, va="bottom")

add_subplot(axKPCA, axKPCAy, T_kpca @ np.diag([-1,1]), yp_kpca, 'c')
axKPCA.set_xlabel("Kernel PC$_1$", fontsize=10)
axKPCA.set_ylabel("Kernel PC$_2$", fontsize=10)

add_subplot(axKPCovR, axKPCovRy, T_kpcovr, yp_kpcovr, 'd')
axKPCovR.yaxis.set_label_position("right")
axKPCovR.set_xlabel("Kernel PCov$_1$", fontsize=10)
axKPCovR.set_ylabel("Kernel PCov$_2$", rotation=-90, va="bottom", fontsize=10)

plt.colorbar(
    p, cax=axes["E"], label="Life Expectancy [years]", orientation="horizontal"
)
fig.subplots_adjust(wspace=0, hspace=0.4)
fig.suptitle("Linear and Kernel PCovR for Predicting Life Expectancy", y=0.925, fontsize=10)
plt.show()

../_images/read-only-examples_PCovR-WHODataset_26_0.png