Source code for ms_mint.matplotlib_tools

import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import pdist, squareform


def _split_metric(metric):
    """Return the ``(metric_x, metric_y)`` pair encoded by *metric*."""
    # ``None`` must be tested before ``len()`` is attempted, otherwise it can
    # never be reached (``len(None)`` raises).
    if metric is None or isinstance(metric, str):
        return metric, metric
    try:
        n_items = len(metric)
    except TypeError:
        raise TypeError(
            "metric must be a string, a pair of strings or None, "
            f"got {type(metric).__name__}"
        ) from None
    if n_items == 2 and all(isinstance(item, str) for item in metric):
        metric_x, metric_y = metric
        return metric_x, metric_y
    raise ValueError(
        f"metric must be a string or a pair of strings, got {metric!r}"
    )


def hierarchical_clustering(
    df,
    vmin=None,
    vmax=None,
    figsize=(8, 8),
    top_height=2,
    left_width=2,
    xmaxticks=None,
    ymaxticks=None,
    metric="cosine",
    cmap=None,
):
    """
    Performs and plots hierarchical clustering on a dataframe in dense format.

    Missing values are replaced by 0 for the distance calculation and for the
    heat map; the returned dataframe keeps the original values.

    :param df: Input data.
    :type df: pandas.DataFrame
    :param vmin: Minimum value to anchor the colormap, otherwise it is
        inferred from the data and other keyword arguments.
    :type vmin: int, optional
    :param vmax: Maximum value to anchor the colormap, otherwise it is
        inferred from the data and other keyword arguments.
    :type vmax: int, optional
    :param figsize: Size of the main figure in inches, defaults to (8, 8)
    :type figsize: tuple, optional
    :param top_height: Height of the top dendrogram, defaults to 2
    :type top_height: int, optional
    :param left_width: Width of the left dendrogram, defaults to 2
    :type left_width: int, optional
    :param xmaxticks: Maximum number of x-ticks to display. If None it is
        derived from the width of the heat map (5 ticks per inch).
    :type xmaxticks: int, optional
    :param ymaxticks: Maximum number of y-ticks to display. If None it is
        derived from the height of the heat map (5 ticks per inch).
    :type ymaxticks: int, optional
    :param metric: Metric to be used for distance calculation. Either one
        name used for both axes or a pair ``(metric_x, metric_y)`` where
        ``metric_x`` is applied to the columns and ``metric_y`` to the rows,
        defaults to "cosine". ``None`` is forwarded unchanged to
        ``scipy.spatial.distance.pdist``.
        NOTE(review): pdist is expected to reject ``None`` -- confirm what a
        ``None`` metric is supposed to mean.
    :type metric: str or tuple of str, optional
    :param cmap: Matplotlib color map name, "coolwarm" is used if None.
    :type cmap: str, optional
    :raises TypeError: If ``metric`` is neither a string, a sequence nor None.
    :raises ValueError: If ``metric`` is a sequence that is not a pair of
        strings.
    :return: Tuple ``(clustered, image, ndx_leaves, col_leaves)``:
        ``clustered`` is ``df`` reordered as displayed, ``image`` is the
        object returned by ``Axes.matshow`` for the heat map (not the
        Figure), ``ndx_leaves`` and ``col_leaves`` are the positional row
        and column orders used for ``clustered``.
    :rtype: tuple
    """
    # Validate first so that a bad ``metric`` fails before any figure exists.
    metric_x, metric_y = _split_metric(metric)

    # Subplot sizes, expressed as fractions of the figure
    total_width, total_height = figsize

    main_h = 1 - (top_height / total_height)
    main_w = 1 - (left_width / total_width)

    gap_x = 0.1 / total_width
    gap_y = 0.1 / total_height

    left_h = main_h
    left_w = 1 - main_w

    top_h = 1 - main_h
    top_w = main_w

    if xmaxticks is None:
        xmaxticks = int(5 * main_w * total_width)
    if ymaxticks is None:
        ymaxticks = int(5 * main_h * total_height)

    dm = df.fillna(0).values

    # ``linkage`` expects the condensed distance vector produced by ``pdist``.
    # A square distance matrix would be interpreted as raw observation
    # vectors, i.e. the requested metric would not drive the clustering.
    row_distances = pdist(dm, metric=metric_y)
    col_distances = pdist(dm.T, metric=metric_x)

    fig = plt.figure(figsize=figsize)
    fig.set_tight_layout(False)

    # add left dendrogram
    ax_left = fig.add_axes([0, 0, left_w - gap_x, left_h], frameon=False)
    row_tree = dendrogram(
        linkage(row_distances, method="complete"),
        orientation="left",
        color_threshold=0,
        above_threshold_color="k",
        ax=ax_left,
    )
    ax_left.set_xticks([])
    ax_left.set_yticks([])

    # add top dendrogram
    ax_top = fig.add_axes(
        [left_w, main_h + gap_y, top_w, top_h - gap_y], frameon=False
    )
    col_tree = dendrogram(
        linkage(col_distances, method="complete"),
        color_threshold=0,
        above_threshold_color="k",
        ax=ax_top,
    )
    ax_top.set_xticks([])
    ax_top.set_yticks([])

    # add matrix plot
    axmatrix = fig.add_axes([left_w, 0, main_w, main_h])
    row_leaves = row_tree["leaves"]
    col_leaves = col_tree["leaves"]
    ordered = dm[row_leaves, :][:, col_leaves]

    if cmap is None:
        cmap = "coolwarm"

    # Rows are flipped so that the heat map lines up with the left dendrogram.
    image = axmatrix.matshow(
        ordered[::-1], aspect="auto", cmap=cmap, vmin=vmin, vmax=vmax
    )

    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    axmatrix.yaxis.tick_right()
    axmatrix.xaxis.tick_bottom()

    ndx_leaves = row_leaves[::-1]
    clustered = df.iloc[ndx_leaves, col_leaves]

    # Evenly spaced tick positions; duplicates appear when more ticks are
    # requested than there are rows/columns and are dropped.
    ndx_y = sorted(
        {int(i) for i in np.linspace(0, len(clustered.index) - 1, ymaxticks)}
    )
    ndx_x = sorted(
        {int(i) for i in np.linspace(0, len(clustered.columns) - 1, xmaxticks)}
    )

    # ``axmatrix`` was added last and is therefore the current axes.
    plt.yticks(ndx_y, clustered.iloc[ndx_y].index)
    plt.xticks(ndx_x, clustered.columns[ndx_x], rotation=90)

    return clustered, image, ndx_leaves, col_leaves
def plot_peak_shapes(
    mint_results,
    mint_metadata=None,
    fns=None,
    peak_labels=None,
    height=3,
    aspect=1.5,
    legend=False,
    col_wrap=4,
    hue="ms_file_label",
    title=None,
    dpi=None,
    sharex=False,
    sharey=False,
    kind="line",
    **kwargs,
):
    """
    Plot peak shapes of mint results.

    Only rows with ``peak_area > 0`` and ``peak_n_datapoints > 1`` are drawn.
    One facet is created per peak label.

    :param mint_results: DataFrame in Mint results format.
    :type mint_results: pandas.DataFrame
    :param mint_metadata: DataFrame in Mint metadata format. If given it is
        left-joined on its index against the ``ms_file_label`` column.
    :type mint_metadata: pandas.DataFrame
    :param fns: Filenames (values of the ``ms_file`` column) to include,
        defaults to None
    :type fns: list, optional
    :param peak_labels: Peak-label or list of peak-labels to include,
        defaults to None (all labels with a positive peak area)
    :type peak_labels: str or list, optional
    :param height: Height of the figure facets, defaults to 3
    :type height: int, optional
    :param aspect: Aspect ratio of the figure facets, defaults to 1.5
    :type aspect: float, optional
    :param legend: Whether or not to add a legend, defaults to False
    :type legend: bool, optional
    :param col_wrap: Number of columns for sub-plots, defaults to 4
    :type col_wrap: int, optional
    :param hue: Column name for color groups, defaults to "ms_file_label"
    :type hue: str, optional
    :param title: Title to add, defaults to None
    :type title: str, optional
    :param dpi: Currently unused; accepted for backward compatibility.
    :type dpi: int, optional
    :param sharex: Whether or not to share x-axis range between subplots,
        defaults to False
    :type sharex: bool, optional
    :param sharey: Whether or not to share y-axis range between subplots,
        defaults to False
    :type sharey: bool, optional
    :param kind: Kind of seaborn relplot, defaults to "line"
    :type kind: str, optional
    :param kwargs: Forwarded to ``seaborn.relplot``. A ``facet_kws`` entry is
        merged over the ``sharex``/``sharey`` settings.
    :raises ValueError: If no peak shape is left to plot after filtering.
    :return: The grid object returned by ``seaborn.relplot``.
    """
    R = mint_results[mint_results.peak_area > 0]

    if peak_labels is None:
        peak_labels = R.peak_label.drop_duplicates().values
    else:
        if isinstance(peak_labels, str):
            peak_labels = [peak_labels]
        R = R[R.peak_label.isin(peak_labels)]

    if fns is not None:
        R = R[R.ms_file.isin(fns)]

    # A peak needs more than one data point to have a shape.
    R = R[R.peak_n_datapoints > 1]

    # Iterate label by label so that the row order of the plotted data
    # follows ``peak_labels``.
    frames = []
    for peak_label in peak_labels:
        for _, row in R[R.peak_label == peak_label].iterrows():
            # Shapes are stored as comma separated numbers.
            peak_rt = [float(i) for i in row.peak_shape_rt.split(",")]
            peak_int = [float(i) for i in row.peak_shape_int.split(",")]
            frames.append(
                pd.DataFrame(
                    {
                        "Scan time [s]": peak_rt,
                        "Intensity": peak_int,
                        "ms_file_label": row.ms_file_label,
                        "peak_label": peak_label,
                        "Expected Scan time [s]": row.rt,
                    }
                )
            )

    if not frames:
        # pd.concat would fail with an unhelpful "No objects to concatenate".
        raise ValueError(
            "No peak shapes to plot: no rows with peak_area > 0 and "
            "peak_n_datapoints > 1 match the requested files and peak labels."
        )

    df = pd.concat(frames, ignore_index=True)

    # Add metadata
    if mint_metadata is not None:
        df = pd.merge(
            df, mint_metadata, left_on="ms_file_label", right_index=True, how="left"
        )

    _facet_kws = dict(sharex=sharex, sharey=sharey)
    _facet_kws.update(kwargs.pop("facet_kws", None) or {})

    g = sns.relplot(
        data=df,
        x="Scan time [s]",
        y="Intensity",
        hue=hue,
        col="peak_label",
        col_order=peak_labels,
        kind=kind,
        col_wrap=col_wrap,
        height=height,
        aspect=aspect,
        facet_kws=_facet_kws,
        legend=legend,
        **kwargs,
    )

    g.set_titles(row_template="{row_name}", col_template="{col_name}")

    for ax in g.axes.flatten():
        ax.ticklabel_format(style="sci", scilimits=(0, 0), axis="y")

    if title is not None:
        g.fig.suptitle(title, y=1.01)

    return g
def plot_peaks(
    series,
    peaks,
    highlight=None,
    expected_rt=None,
    weights=None,
    legend=True,
    label=None,
    **kwargs,
):
    """
    Plot an intensity trace with its detected peaks on the current axes.

    :param series: Intensities indexed by scan time.
    :type series: pandas.Series
    :param peaks: Detected peaks, or None to draw the trace only. Must
        provide an ``ndxs`` attribute with positional indices into ``series``.
        Each row is unpacked positionally into seven values, of which the
        4th is used as base height and the 6th and 7th as start and end of
        the peak.
        NOTE(review): this relies on the column order of ``peaks`` -- confirm
        against the code that produces it.
    :type peaks: pandas.DataFrame, optional
    :param highlight: Index values of ``peaks`` to mark as selected,
        defaults to None (nothing highlighted)
    :type highlight: list, optional
    :param expected_rt: Expected retention time; a span from ``expected_rt``
        to ``expected_rt + 1`` is marked, defaults to None
    :type expected_rt: float, optional
    :param weights: Values drawn as an additional dashed line, defaults to
        None
    :type weights: array-like, optional
    :param legend: If False, a legend that is already attached to the axes is
        removed. No legend is created explicitly by this function; the
        artists only carry labels.
    :type legend: bool, optional
    :param label: Label of the intensity trace, defaults to "Intensity"
    :type label: str, optional
    :param kwargs: Forwarded to ``Axes.plot`` for the intensity trace.
    :return: The current matplotlib figure.
    :rtype: matplotlib.figure.Figure
    """
    if highlight is None:
        highlight = []

    ax = plt.gca()

    ax.plot(
        series.index,
        series.values,
        label=label if label is not None else "Intensity",
        **kwargs,
    )

    if peaks is not None:
        series.iloc[peaks.ndxs].plot(
            label="Peaks", marker="x", y="intensity", lw=0, ax=ax
        )

        # Label only the first artist of each kind so that a legend does not
        # list the same entry once per peak.
        selected_labelled = False
        for i, (
            ndx,
            (_, _, _, peak_base_height, _, rt_min, rt_max),
        ) in enumerate(peaks.iterrows()):
            if ndx in highlight:
                plt.axvspan(
                    rt_min,
                    rt_max,
                    color="green",
                    alpha=0.25,
                    label=None if selected_labelled else "Selected",
                )
                selected_labelled = True

            plt.hlines(
                peak_base_height,
                rt_min,
                rt_max,
                color="orange",
                label="Peak width" if i == 0 else None,
            )

    if expected_rt is not None:
        plt.axvspan(
            expected_rt, expected_rt + 1, color="blue", alpha=1, label="Expected Rt"
        )

    if weights is not None:
        plt.plot(weights, linestyle="--", label="Gaussian weight")

    plt.ylabel("Intensity")
    plt.xlabel("Scan time [s]")

    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))

    if not legend:
        # ``get_legend`` returns None when the axes has no legend.
        existing_legend = ax.get_legend()
        if existing_legend is not None:
            existing_legend.remove()

    return plt.gcf()
def plot_metabolomics_hist2d(
    df,
    figsize=(4, 2.5),
    dpi=300,
    set_dim=True,
    cmap="jet",
    rt_range=None,
    mz_range=None,
    mz_bins=100,
    **kwargs,
):
    """
    Plot a 2D histogram of scan time versus m/z, weighted by log-intensity.

    :param df: Data with the columns ``scan_time``, ``mz`` and ``intensity``.
    :type df: pandas.DataFrame
    :param figsize: Size of the new figure, only used if ``set_dim`` is True,
        defaults to (4, 2.5)
    :type figsize: tuple, optional
    :param dpi: Resolution of the new figure, only used if ``set_dim`` is
        True, defaults to 300
    :type dpi: int, optional
    :param set_dim: Whether to create a new figure with ``figsize`` and
        ``dpi``. If False the current figure is drawn into.
    :type set_dim: bool, optional
    :param cmap: Matplotlib color map name, defaults to "jet"
    :type cmap: str, optional
    :param rt_range: ``(min, max)`` of the scan time axis, defaults to the
        range of ``df.scan_time``
    :type rt_range: tuple, optional
    :param mz_range: ``(min, max)`` of the m/z axis, defaults to the range of
        ``df.mz``
    :type mz_range: tuple, optional
    :param mz_bins: Number of bins along the m/z axis, defaults to 100
    :type mz_bins: int, optional
    :param kwargs: Forwarded to ``pyplot.hist2d``; they override the defaults
        ``vmin=1``, ``vmax=1e3``, ``cmap`` and ``range``.
    :return: The value returned by ``pyplot.hist2d``.
    """
    if set_dim:
        plt.figure(figsize=figsize, dpi=dpi)

    if mz_range is None:
        mz_range = (df.mz.min(), df.mz.max())

    if rt_range is None:
        rt_range = (df.scan_time.min(), df.scan_time.max())

    # One bin per two units of scan time. Clamp to at least one bin: a range
    # shorter than two units would otherwise give zero bins, which hist2d
    # rejects.
    rt_bins = max(1, int((rt_range[1] - rt_range[0]) / 2))

    params = dict(vmin=1, vmax=1e3, cmap=cmap, range=(rt_range, mz_range))
    params.update(kwargs)

    hist = plt.hist2d(
        df["scan_time"],
        df["mz"],
        weights=df["intensity"].apply(np.log1p),
        bins=[rt_bins, mz_bins],
        **params,
    )

    plt.xlabel("Scan time [s]")
    plt.ylabel("m/z")
    plt.gca().ticklabel_format(useOffset=False, style="plain")
    return hist