Source code for ms_mint.pca

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import plotly.figure_factory as ff
from plotly import express as px

from sklearn.decomposition import PCA
from .tools import scale_dataframe


class PrincipalComponentsAnalyser:
    """
    Class for applying PCA to Mint instance.

    The outcome of the last call to :meth:`run` is kept in ``self.results``
    and plotting helpers are available through ``self.plot``.
    """

    def __init__(self, mint=None):
        """
        Class for applying PCA to Mint instance.

        :param mint: Mint instance, defaults to None
        :type mint: ms_mint.Mint.Mint, optional
        """
        self.mint = mint
        # Filled by run(); stays None until a PCA has been computed.
        self.results = None
        self.plot = PCA_Plotter(self)

    def run(
        self,
        n_components=3,
        on=None,
        var_name="peak_max",
        fillna="median",
        apply=None,
        groupby=None,
        scaler="standard",
    ):
        """
        Run Principal Component Analysis on current results. Results are stored in
        self.results as a dictionary with the keys ``df_projected``,
        ``cum_expl_var``, ``n_components``, ``type``, ``feature_contributions``
        and ``class`` (the fitted PCA estimator).

        :param n_components: Number of PCA components to return, defaults to 3.
            The value is capped at the smaller dimension of the data matrix.
        :type n_components: int, optional
        :param on: Deprecated alias of ``var_name``. If given, it overrides
            ``var_name`` and a DeprecationWarning is issued.
        :type on: str, optional
        :param var_name: Column name to use for pca, defaults to "peak_max"
        :type var_name: str, optional
        :param fillna: Method to fill missing values ("median", "mean" or "zero"),
            defaults to "median". Any other value is handed to
            ``DataFrame.fillna`` unchanged.
        :type fillna: str, optional
        :param apply: Passed through to ``mint.crosstab``.
        :param groupby: Passed through to ``mint.crosstab``.
        :param scaler: Method to scale the columns, defaults to "standard"
        :type scaler: str, optional
        """
        if on is not None:
            # stacklevel=2 attributes the warning to the caller of run().
            warnings.warn(
                "on is depricated use var_name instead",
                DeprecationWarning,
                stacklevel=2,
            )
            var_name = on

        df = self.mint.crosstab(
            var_name=var_name, apply=apply, scaler=scaler, groupby=groupby
        )

        # Only compare against the keywords when a string was passed; comparing
        # an array-like fill value with "==" would not yield a plain boolean.
        if isinstance(fillna, str):
            if fillna == "median":
                fillna = df.median()
            elif fillna == "mean":
                fillna = df.mean()
            elif fillna == "zero":
                fillna = 0

        df = df.fillna(fillna)

        # PCA cannot return more components than the smaller matrix dimension.
        n_components = min(n_components, min(df.shape))
        pca = PCA(n_components)
        X_projected = pca.fit_transform(df)

        # Convert to dataframe
        df_projected = pd.DataFrame(X_projected, index=df.index.get_level_values(0))
        # Set columns to PC-1, PC-2, ...
        df_projected.columns = [f"PC-{int(i) + 1}" for i in df_projected.columns]

        # Calculate cumulative explained variance in percent
        explained_variance = pca.explained_variance_ratio_ * 100
        cum_expl_var = np.cumsum(explained_variance)

        # Create feature contributions (loadings).
        # The rows of ``components_`` are the principal axes in feature space.
        # Back-projecting an identity matrix with ``inverse_transform`` would
        # additionally add the per-feature mean to every row, which distorts the
        # coefficients whenever the input is not perfectly centered.
        dfc = pd.DataFrame(pca.components_, columns=df.columns)
        dfc.index = [f"PC-{i + 1}" for i in range(n_components)]
        dfc.index.name = "PC"

        # convert to long format
        dfc = dfc.stack().reset_index().rename(columns={0: "Coefficient"})

        self.results = {
            "df_projected": df_projected,
            "cum_expl_var": cum_expl_var,
            "n_components": n_components,
            "type": "PCA",
            "feature_contributions": dfc,
            "class": pca,
        }
class PCA_Plotter:
    """
    Class for plotting Mint PCA results.

    All plotting methods read ``self.pca.results``, so the analyser's ``run``
    method has to be called first.
    """

    # Column that receives user-supplied (non-string) hue labels.
    _LABEL_COLUMN = "Label"

    def __init__(self, pca):
        """
        Class for plotting Mint PCA results.

        :param pca: PrincipalComponentsAnalyser instance
        :type pca: ms_mint.pca.PrincipalComponentsAnalyser
        """
        self.pca = pca

    @staticmethod
    def _has_custom_labels(hue):
        """Return True if ``hue`` carries explicit labels rather than a column name."""
        if hue is None or isinstance(hue, str):
            return False
        try:
            # len() instead of truthiness: numpy arrays and pandas Series raise
            # on bool() when they hold more than one element.
            return len(hue) > 0
        except TypeError:
            # Unsized values (e.g. scalars) keep plain truthiness semantics.
            return bool(hue)

    @classmethod
    def _hue_column(cls, hue):
        """Return the name of the dataframe column to colour by, or None."""
        if isinstance(hue, str):
            return hue
        if cls._has_custom_labels(hue):
            return cls._LABEL_COLUMN
        return None

    def cumulative_variance(self, interactive=False, **kwargs):
        """
        Plot the cumulative explained variance of the principal components.

        :param interactive: If True use plotly, otherwise matplotlib.
        :type interactive: bool, optional
        :return: Plotly figure if ``interactive`` else matplotlib figure.
        """
        if interactive:
            return self.cumulative_variance_px(**kwargs)
        return self.cumulative_variance_sns(**kwargs)

    def cumulative_variance_px(self, **kwargs):
        """
        After running mint.pca() this function can be used to plot the cumulative
        variance of the principal components.

        Keyword arguments are forwarded to ``plotly.express.bar``.

        :return: Returns a plotly express figure.
        :rtype: plotly.graph_objs._figure.Figure
        """
        n_components = self.pca.results["n_components"]
        cum_expl_var = self.pca.results["cum_expl_var"]

        df = pd.DataFrame(
            {
                "Principal Component": np.arange(n_components) + 1,
                "Explained variance [%]": cum_expl_var,
            }
        )

        # Default title only; a caller-supplied ``title`` (or ``labels``) must
        # not collide with a hard-coded keyword argument.
        kwargs.setdefault("title", "Cumulative explained variance")

        fig = px.bar(
            df,
            x="Principal Component",
            y="Explained variance [%]",
            **kwargs,
        )
        fig.update_layout(autosize=True, showlegend=False)
        return fig

    def cumulative_variance_sns(self, **kwargs):
        """
        After running mint.pca() this function can be used to plot the cumulative
        variance of the principal components.

        Only the keyword arguments ``aspect`` (default 1) and ``height``
        (default 5) are used; the figure width is ``height * aspect``.

        :return: Returns a matplotlib figure.
        :rtype: matplotlib.figure.Figure
        """
        # Set default values for aspect and height
        aspect = kwargs.get("aspect", 1)
        height = kwargs.get("height", 5)

        n_components = self.pca.results["n_components"]
        cum_expl_var = self.pca.results["cum_expl_var"]

        # Calculate width based on aspect ratio
        width = height * aspect

        fig, ax = plt.subplots(figsize=(width, height))
        ax.bar(
            np.arange(n_components) + 1,
            cum_expl_var,
            facecolor="grey",
            edgecolor="none",
        )
        ax.set_xlabel("Principal Component")
        ax.set_ylabel("Explained variance [%]")
        ax.set_title("Cumulative explained variance")
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_xticks(range(1, len(cum_expl_var) + 1))
        return fig

    def _prepare_data(self, n_components=3, hue=None):
        """
        Build the plotting dataframe: the first ``n_components`` projected
        columns joined (on the index) with the non-empty columns of
        ``mint.meta``. Non-string ``hue`` values are stored as strings in the
        ``Label`` column.
        """
        df = self.pca.results["df_projected"].copy()
        cols = df.columns.to_list()[:n_components]
        df = df[cols]

        df = pd.merge(
            df,
            self.pca.mint.meta.dropna(axis=1, how="all"),
            left_index=True,
            right_index=True,
        )

        if self._has_custom_labels(hue):
            df[self._LABEL_COLUMN] = hue
            df[self._LABEL_COLUMN] = df[self._LABEL_COLUMN].astype(str)

        return df

    def pairplot(
        self, n_components=3, hue=None, fig_kws=None, interactive=False, **kwargs
    ):
        """
        After running mint.pca() this function can be used to plot a scatter matrix of the
        principal components.

        :param n_components: Number of principal components to plot, defaults to 3.
        :type n_components: int, optional
        :param hue: Labels used for hue. If string, the data will be taken from the
            mint.meta dataframe.
        :type hue: List[str] or str, optional
        :param fig_kws: Keyword arguments for ``plt.figure`` (non-interactive only).
        :type fig_kws: dict, optional
        :param interactive: If True use plotly, otherwise seaborn.
        :type interactive: bool, optional
        :return: Returns a seaborn PairGrid, or a plotly figure if ``interactive``.
        :rtype: seaborn.axisgrid.PairGrid or plotly.graph_objs._figure.Figure
        """
        df = self._prepare_data(n_components=n_components, hue=hue)

        # Explicit labels live in the "Label" column created by _prepare_data;
        # the name has to match exactly, otherwise the lookup fails.
        hue = self._hue_column(hue)

        if interactive:
            return self.pairplot_plotly(df, color_col=hue, **kwargs)
        return self.pairplot_sns(df, fig_kws=fig_kws, hue=hue, **kwargs)

    def pairplot_sns(self, df, fig_kws=None, **kwargs):
        """
        Draw a seaborn pairplot of ``df``.

        :param df: Dataframe to plot.
        :type df: pandas.DataFrame
        :param fig_kws: Keyword arguments for ``plt.figure``, defaults to None.
        :type fig_kws: dict, optional
        :return: Result of ``seaborn.pairplot``.
        """
        if fig_kws is None:
            fig_kws = {}
        # NOTE(review): this opens a separate figure before seaborn draws;
        # sns.pairplot presumably creates its own figure, so fig_kws may have
        # no visible effect -- confirm before relying on it.
        plt.figure(**fig_kws)
        g = sns.pairplot(df, **kwargs)
        return g

    def pairplot_plotly(self, df, color_col=None, **kwargs):
        """
        Draw a plotly scatterplot matrix of the principal components.

        :param df: Dataframe with "PC-*" columns and optionally ``color_col``.
        :type df: pandas.DataFrame
        :param color_col: Column used to colour/group the points.
        :type color_col: str, optional
        :return: Plotly figure.
        """
        # Select by name instead of a regular expression: the column name is
        # not a pattern (it may contain characters such as "(" or "+"), and an
        # unanchored "PC" match would also pick up unrelated metadata columns.
        columns = [
            col
            for col in df.columns
            if str(col).startswith("PC-")
            or (color_col is not None and col == color_col)
        ]
        fig = ff.create_scatterplotmatrix(
            df[columns], index=color_col, hovertext=df.index, **kwargs
        )
        # set the legendgroup equal to the marker color
        for t in fig.data:
            t.legendgroup = t.marker.color
        return fig

    def loadings(self, interactive=False, **kwargs):
        """
        Plot the feature contributions of each principal component.

        :param interactive: If True use plotly, otherwise seaborn.
        :type interactive: bool, optional
        :return: Plotly figure if ``interactive`` else seaborn grid.
        """
        if interactive:
            return self.loadings_plotly(**kwargs)
        return self.loadings_sns(**kwargs)

    def loadings_sns(self, **kwargs):
        """
        Bar plot of the feature contributions using ``seaborn.catplot``;
        one row per principal component unless ``row`` is given.

        Expects a ``peak_label`` column in the feature contributions.
        """
        kwargs.setdefault("row", "PC")
        g = sns.catplot(
            data=self.pca.results["feature_contributions"],
            x="peak_label",
            y="Coefficient",
            kind="bar",
            **kwargs,
        )
        plt.tight_layout()
        return g

    def loadings_plotly(self, **kwargs):
        """
        Bar plot of the feature contributions using ``plotly.express.bar``;
        one facet row per principal component unless ``facet_row`` is given.

        Expects a ``peak_label`` column in the feature contributions.
        """
        kwargs.setdefault("facet_row", "PC")
        fig = px.bar(
            self.pca.results["feature_contributions"],
            x="peak_label",
            y="Coefficient",
            barmode="group",
            **kwargs,
        )
        return fig