Feature Selection on the WHO Dataset

[1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from tqdm.auto import tqdm
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from skmatter.preprocessing import StandardFlexibleScaler
from skmatter.feature_selection import PCovFPS, PCovCUR, FPS, CUR
from skmatter.datasets import load_who_dataset
/home/docs/checkouts/readthedocs.org/user_builds/scikit-matter/envs/v0.1.4/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Load the Dataset

[2]:
df = load_who_dataset()['data']
df
[2]:
Country Year SP.POP.TOTL SH.TBS.INCD SH.IMM.MEAS SE.XPD.TOTL.GD.ZS SH.DYN.AIDS.ZS SP.DYN.LE00.IN SH.IMM.IDPT SH.XPD.CHEX.GD.ZS SN.ITK.DEFC.ZS NY.GDP.PCAP.CD
0 Afghanistan 2005 24411191.0 189.0 50.0 2.57000 0.1 58.361 58.0 9.948290 36.1 255.055120
1 Afghanistan 2006 25442944.0 189.0 53.0 2.90000 0.1 58.684 58.0 10.622766 33.3 274.000486
2 Afghanistan 2007 25903301.0 189.0 55.0 2.85000 0.1 59.111 63.0 9.904675 29.8 375.078128
3 Afghanistan 2008 26427199.0 189.0 59.0 3.51000 0.1 59.852 64.0 10.256495 26.5 387.849174
4 Afghanistan 2009 27385307.0 189.0 60.0 3.73000 0.1 60.364 63.0 9.818487 23.3 443.845151
... ... ... ... ... ... ... ... ... ... ... ... ...
2015 South Africa 2015 55876504.0 988.0 86.0 5.48285 18.2 63.950 85.0 8.790190 5.2 6204.929901
2016 South Africa 2016 56422274.0 805.0 84.0 5.44424 18.4 64.747 85.0 8.821429 5.4 5735.066787
2017 South Africa 2017 56641209.0 738.0 81.0 5.59867 18.5 65.402 84.0 8.722624 5.5 6734.475153
2018 South Africa 2018 57339635.0 677.0 81.0 5.64401 18.6 65.674 82.0 8.858297 5.7 7048.522211
2019 South Africa 2019 58087055.0 615.0 83.0 5.91771 18.6 66.175 85.0 9.109355 6.3 6688.787271

2020 rows × 12 columns

[3]:
columns = np.array([
    "SP.POP.TOTL",
    "SH.TBS.INCD",
    "SH.IMM.MEAS",
    "SE.XPD.TOTL.GD.ZS",
    "SH.DYN.AIDS.ZS",
    "SH.IMM.IDPT",
    "SH.XPD.CHEX.GD.ZS",
    "SN.ITK.DEFC.ZS",
    "NY.GDP.PCAP.CD",
])

column_names = np.array([
    "Population",
    "Tuberculosis",
    "Immunization, measles",
    "Educ. Expenditure",
    "HIV",
    "Immunization, DPT",
    "Health Expenditure",
    "Undernourishment",
    "GDP per capita",
])

columns = columns[[8, 4, 5, 6, 1, 0, 7, 3, 2]].tolist()
column_names = column_names[[8, 4, 5, 6, 1, 0, 7, 3, 2]].tolist()
[4]:
X_raw = np.array(df[columns])

# We are taking the logarithm of the population and GDP to avoid extreme distributions
log_scaled = ['SP.POP.TOTL', 'NY.GDP.PCAP.CD']
for ls in log_scaled:
    print(X_raw[:, columns.index(ls)].min(), X_raw[:, columns.index(ls)].max())
    if ls in columns:
        X_raw[:, columns.index(ls)] = np.log10(
            X_raw[:, columns.index(ls)]
        )
y_raw = np.array(df["SP.DYN.LE00.IN"])  # [np.where(df['Year']==2000)[0]])
y_raw = y_raw.reshape(-1, 1)
X_raw.shape
149841.0 7742681934.0
110.460874721483 123678.70214327476
[4]:
(2020, 9)

Scale and Center the Features and Targets

[5]:
x_scaler = StandardFlexibleScaler(column_wise=True)
X = x_scaler.fit_transform(X_raw)

y_scaler = StandardFlexibleScaler(column_wise=True)
y = y_scaler.fit_transform(y_raw)

n_components = 2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=0)

Provide an estimated target for the feature selector

[6]:
kernel_params = {"kernel": "rbf", "gamma": 0.08858667904100832}
krr = KernelRidge(alpha=0.006158482110660267, **kernel_params)

yp_train = krr.fit(X_train, y_train).predict(X_train)

Compute the Selections for Each Selector Type

[7]:
n_select = X.shape[1]

PCov-CUR

[8]:
pcur = PCovCUR(n_to_select=n_select, progress_bar=True, mixing=0.0)
pcur.fit(X_train, yp_train)
100%|██████████| 9/9 [00:00<00:00, 126.26it/s]
[8]:
PCovCUR(mixing=0.0, n_to_select=9, progress_bar=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

PCov-FPS

[9]:
pfps = PCovFPS(n_to_select=n_select, progress_bar=True, mixing=0.0, initialize=pcur.selected_idx_[0])
pfps.fit(X_train, yp_train)
100%|██████████| 8/8 [00:00<00:00, 6192.00it/s]
[9]:
PCovFPS(mixing=0.0, n_to_select=9, progress_bar=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

CUR

[10]:
cur = CUR(n_to_select=n_select, progress_bar=True)
cur.fit(X_train, y_train)
100%|██████████| 9/9 [00:00<00:00, 335.71it/s]
[10]:
CUR(n_to_select=9, progress_bar=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

FPS

[11]:
fps = FPS(n_to_select=n_select, progress_bar=True, initialize=cur.selected_idx_[0])
fps.fit(X_train, y_train)
100%|██████████| 8/8 [00:00<00:00, 4743.35it/s]
[11]:
FPS(initialize=8, n_to_select=9, progress_bar=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

(For Comparison) Recurisive Feature Addition

[12]:
class RecursiveFeatureAddition:
    def __init__(self, n_to_select):
        self.n_to_select = n_to_select
        self.selected_idx_ = np.zeros(n_to_select, dtype=int)
    def fit(self, X, y):
        remaining = np.arange(X.shape[1])
        for n in range(self.n_to_select):
            errors = np.zeros(len(remaining))
            for i, pp in enumerate(remaining):
                krr.fit(
                    X[:, [*self.selected_idx_[:n], pp]], y
                )
                errors[i] = krr.score(X[:, [*self.selected_idx_[:n], pp]], y)
            self.selected_idx_[n] = remaining[np.argmax(errors)]
            remaining = np.array(np.delete(remaining, np.argmax(errors)), dtype=int)
        return self
rfa = RecursiveFeatureAddition(n_select).fit(X_train, y_train)

Plot our Results

[13]:
fig, axes = plt.subplots(2, 1,figsize=(3.75, 5), gridspec_kw=dict(height_ratios=(1,1.5)), sharex=True, dpi=150)
ns = np.arange(1, n_select, dtype=int)

all_errors = {}
for selector, color, linestyle, label in zip(
    [cur, fps, pcur, pfps, rfa],
    ["red", "lightcoral",  "blue", "dodgerblue", "black"],
    ["solid", "solid", "solid", "solid", "dashed"],
    [
        "CUR",
        "FPS",
        "PCov-CUR\n"+r"($\alpha=0.0$)",
        "PCov-FPS\n"+r"($\alpha=0.0$)",
        "Recursive\nFeature\nSelection",
    ],
):
    if label not in all_errors:
        errors = np.zeros(len(ns))
        for i, n in enumerate(ns):
            krr.fit(X_train[:, selector.selected_idx_[:n]], y_train)
            errors[i] = krr.score(X_test[:, selector.selected_idx_[:n]], y_test)
        all_errors[label] = errors
    axes[0].plot(ns, all_errors[label], c=color, label=label, linestyle=linestyle)
    axes[1].plot(ns, selector.selected_idx_[:max(ns)], c=color, marker='.', linestyle=linestyle)

axes[1].set_xlabel(r"$n_{select}$")
axes[1].set_xticks(range(1, n_select))
axes[0].set_ylabel(r"R$^2$")
axes[1].set_yticks(np.arange(X.shape[1]))
axes[1].set_yticklabels(column_names, rotation=30, fontsize=10)
axes[0].legend(ncol=2, fontsize=8, bbox_to_anchor=(0.5, 1.0), loc='lower center')
axes[1].invert_yaxis()
axes[1].grid(axis='y', alpha=0.5)
plt.tight_layout()
plt.show()
../_images/read-only-examples_FeatureSelection-WHODataset_23_0.png