Source code for ms_mint.Mint

"""Main module of the ms-mint library."""

import os
import numpy as np
import pandas as pd
import time
import logging

from pathlib import Path as P
from multiprocessing import Pool, Manager, cpu_count
from glob import glob

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from .processing import process_ms1_files_in_parallel, extract_chromatogram_from_ms1
from .io import export_to_excel, ms_file_to_df
from .TargetOptimizer import TargetOptimizer
from .targets import read_targets, check_targets, standardize_targets
from .tools import (
from .pca import PrincipalComponentsAnalyser
from .MintPlotter import MintPlotter
from .Chromatogram import Chromatogram

import ms_mint

from tqdm import tqdm

from typing import Callable
from functools import lru_cache

# Default file name of the metadata table. `Mint.load_metadata` and
# `Mint.save_metadata` join it to the instance's working directory (`wdir`)
# when they are called without a file name.
# NOTE: the identifier is misspelled ("DEFAUT"); it is kept as-is because it
# is a module-level name that other code may import.
METADATA_DEFAUT_FN = 'metadata.parquet'

class Mint(object):
    """
    Main class of the ms_mint package, which processes metabolomics files.

    :param verbose: Sets verbosity of the instance.
    :type verbose: bool
    :param progress_callback: A callback for a progress bar. It is called with
        the new progress value every time ``progress`` is set.
    :type progress_callback: Callable
    :param time_unit: Accepted for backward compatibility; not used by this class.
    :type time_unit: str
    :param wdir: Working directory, defaults to the current working directory.
    :type wdir: str
    """

    def __init__(
        self,
        verbose: bool = False,
        progress_callback: Callable = None,
        time_unit: str = "s",
        wdir: str = None,
    ):
        self.verbose = verbose
        self._version = ms_mint.__version__
        if verbose:
            print("Mint version:", self.version, "\n")
        self.progress_callback = progress_callback
        self.reset()
        self.plot = MintPlotter(mint=self)
        self.opt = TargetOptimizer(mint=self)
        self.pca = PrincipalComponentsAnalyser(self)
        self.tqdm = tqdm

        # Setup working directory as pathlib.Path
        self.wdir = P(os.getcwd() if wdir is None else wdir)

    @property
    def version(self):
        """
        ms-mint version number.

        :return: Version string.
        :rtype: str
        """
        return self._version

    def reset(self):
        """
        Reset Mint instance. Removes targets, MS-files and results.

        :return: self
        :rtype: ms_mint.Mint.Mint
        """
        self._files = []
        self._targets_files = []
        self._targets = pd.DataFrame(columns=TARGETS_COLUMNS)
        self._results = pd.DataFrame({i: [] for i in MINT_RESULTS_COLUMNS})
        self._all_df = None
        self._progress = 0
        self.runtime = None
        self._status = "waiting"
        self._messages = []
        self.meta = init_metadata()
        # Single-slot cache used by `_get_chromatograms`.
        self._chromatogram_cache = None
        return self

    def clear_targets(self):
        """
        Reset target list.
        """
        self.targets = pd.DataFrame(columns=TARGETS_COLUMNS)

    def clear_results(self):
        """
        Reset results.
        """
        self.results = pd.DataFrame(columns=MINT_RESULTS_COLUMNS)

    def clear_ms_files(self):
        """
        Reset ms files.
        """
        self.ms_files = []

    def run(self, nthreads=None, rt_margin=0.5, mode="standard", fn=None, **kwargs):
        """
        Main routine to run MINT and process MS-files with current target list.

        :param nthreads: Number of cores to use, defaults to None
        :type nthreads: int

            * None - Run with min(n_cpus, c_files) CPUs
            * 1: Run without multiprocessing on one CPU
            * >1: Run with multiprocessing enabled using nthreads threads.

        :param rt_margin: Targets that have ``rt`` but no ``rt_min``/``rt_max``
            get ``rt - rt_margin`` / ``rt + rt_margin`` for this run.
        :type rt_margin: float
        :param mode: Compute mode ('standard' or 'express'), defaults to 'standard'
        :type mode: str

            * 'standard': calculates peak shaped projected to RT dimension
            * 'express': omits calculation of other features, only peak_areas

        :param fn: Output filename to not keep results in memory. Only used
            when running with more than one process.
        :type fn: str
        :param kwargs: Extra arguments forwarded to the parallel runner
            (e.g. ``maxtasksperchild``).
        :return: self, or None when there are no files or no targets.
        """
        if (self.n_files == 0) or (len(self.targets) == 0):
            # Nothing to do: leave the status untouched instead of leaving it
            # stuck on "running".
            return None

        self._status = "running"

        targets = self.targets.reset_index()
        self._set_rt_min_max(targets, rt_margin)

        nthreads = self._determine_nthreads(nthreads)

        if self.verbose:
            print(f"Run MINT with {nthreads} processes:")

        start = time.time()
        try:
            if nthreads > 1:
                # Hand over the prepared targets so that `rt_margin` is
                # honoured in the parallel path as well.
                self._run_parallel(
                    nthreads=nthreads, mode=mode, fn=fn, targets=targets, **kwargs
                )
            else:
                self._run_sequential(mode=mode, fn=fn, targets=targets)
        except Exception:
            # A failed run must not report "running" forever.
            self._status = "waiting"
            raise

        self.progress = 100
        self._report_runtime(start)
        self._status = "done"
        assert self.progress == 100
        return self

    def _set_rt_min_max(self, targets, rt_margin):
        """Fill missing rt_min/rt_max in *targets* (in place) from rt -/+ rt_margin."""
        if "rt" not in targets.columns:
            return
        has_rt = ~targets.rt.isna()
        update_rt_min = targets.rt_min.isna() & has_rt
        targets.loc[update_rt_min, "rt_min"] = (
            targets.loc[update_rt_min, "rt"] - rt_margin
        )
        update_rt_max = targets.rt_max.isna() & has_rt
        targets.loc[update_rt_max, "rt_max"] = (
            targets.loc[update_rt_max, "rt"] + rt_margin
        )

    def _determine_nthreads(self, nthreads):
        """Return *nthreads*, or min(cpu count, number of files) when it is None."""
        if nthreads is None:
            nthreads = min(cpu_count(), self.n_files)
        return nthreads

    def _run_sequential(self, mode, fn, targets):
        """Process all files one after another in the current process.

        NOTE: *fn* is accepted for symmetry with `_run_parallel` but is not
        used here; results are always kept in memory.
        """
        results = []
        # Count from 1 so the reported progress reflects the files finished.
        for n_done, filename in enumerate(self.ms_files, start=1):
            args = {
                "filename": filename,
                "targets": targets,
                "q": None,
                "mode": mode,
                "output_fn": None,
            }
            results.append(process_ms1_files_in_parallel(args))
            self.progress = int(100 * (n_done / self.n_files))
        self.results = pd.concat(results).reset_index(drop=True)

    def _report_runtime(self, start):
        """Store runtime statistics of the last run and print them if verbose."""
        end = time.time()
        self.runtime = end - start
        self.runtime_per_file = self.runtime / self.n_files
        self.runtime_per_peak = self.runtime / self.n_files / len(self.targets)

        if self.verbose:
            print(f"Total runtime: {self.runtime:.2f}s")
            print(f"Runtime per file: {self.runtime_per_file:.2f}s")
            print(
                f"Runtime per peak ({len(self.targets)}): {self.runtime_per_peak:.2f}s\n"
            )
            print("Results:", self.results)

    def _run_parallel(
        self, nthreads=1, mode="standard", maxtasksperchild=None, fn=None, targets=None
    ):
        """Process all files in a pool of *nthreads* worker processes.

        :param targets: Target table handed to the workers. Defaults to
            ``self.targets.reset_index()``.
        :param fn: If given, a CSV file with only the result headers is created
            at this path, the path is passed to the workers as ``output_fn``,
            and ``self.results`` is not updated.
        """
        if targets is None:
            targets = self.targets.reset_index()

        if fn is not None:
            # Prepare output file (only headers)
            pd.DataFrame(columns=MINT_RESULTS_COLUMNS).to_csv(fn, index=False)

        # The manager is shut down on exit; the pool is always closed and
        # joined before that, also when monitoring raises.
        with Manager() as manager:
            queue = manager.Queue()
            args = [
                {
                    "filename": filename,
                    "targets": targets,
                    "queue": queue,
                    "mode": mode,
                    "output_fn": fn,
                }
                for filename in self.ms_files
            ]
            pool = Pool(processes=nthreads, maxtasksperchild=maxtasksperchild)
            try:
                async_result = pool.map_async(process_ms1_files_in_parallel, args)
                self._monitor_progress(async_result, queue)
            finally:
                pool.close()
                pool.join()

            if fn is None:
                self.results = pd.concat(async_result.get()).reset_index(drop=True)

    def _monitor_progress(self, results, q):
        """Update ``progress`` from the queue size until *results* is ready."""
        while not results.ready():
            self.progress = min(100, 100 * q.qsize() / self.n_files)
            # Returns as soon as the work is done, at the latest after 1 s.
            results.wait(1)
        self.progress = 100

    @property
    def status(self):
        """
        Returns current status of Mint instance.

        :return: ['waiting', 'running', 'done']
        :rtype: str
        """
        return self._status

    @property
    def ms_files(self):
        """
        Get/set ms-files to process.

        :getter:
        :return: List of filenames.
        :rtype: list[str]

        :setter:
        :param list_of_files: Filename or list of file names of MS-files.
        :type list_of_files: str or list[str]
        """
        return self._files

    @ms_files.setter
    def ms_files(self, list_of_files):
        if isinstance(list_of_files, str):
            list_of_files = [list_of_files]
        list_of_files = [str(P(i)) for i in list_of_files if is_ms_file(i)]
        for f in list_of_files:
            if not os.path.isfile(f):
                logging.warning(f"File not found ({f})")
        self._files = list_of_files
        if self.verbose:
            print("Set files to:\n" + "\n".join(self.ms_files) + "\n")
        # Keep the metadata table aligned with the current file labels.
        self.meta = self.meta.reindex([fn_to_label(fn) for fn in list_of_files])

    @property
    def n_files(self):
        """
        Number of currently stored ms filenames.

        :return: Number of files stored in self.ms_files
        :rtype: int
        """
        return len(self.ms_files)

    def load_files(self, obj):
        """
        Load ms_files as a function that returns the Mint instance for chaining.

        :param obj: Glob pattern (str or pathlib.Path) or a list/tuple/set of
            file names.
        :type obj: str or pathlib.Path or list[str]
        :return: self
        :rtype: ms_mint.Mint.Mint
        """
        if isinstance(obj, (str, P)):
            self.ms_files = glob(str(obj), recursive=True)
        elif isinstance(obj, (list, tuple, set)):
            self.ms_files = list(obj)
        else:
            # Previously ignored without any hint to the caller.
            logging.warning(f"Cannot load files from {type(obj).__name__}; ignored")
        return self

    def load_targets(self, list_of_files):
        """
        Load targets from a file (csv, xslx)

        :param list_of_files: Filename or list of file names.
        :type list_of_files: str or list[str]
        :raises ValueError: If the input is neither a file name nor a list.
        :return: self
        :rtype: ms_mint.Mint.Mint
        """
        if isinstance(list_of_files, (str, P)):
            list_of_files = [list_of_files]
        if not isinstance(list_of_files, list):
            raise ValueError("Input should be a list of files.")
        for f in list_of_files:
            assert os.path.isfile(f), f"File not found ({f})"
        self._targets_files = list_of_files
        if self.verbose:
            # str() so that pathlib.Path entries can be joined as well.
            listing = "\n".join(str(f) for f in self._targets_files)
            print("Set targets files to:\n" + listing + "\n")
        self.targets = read_targets(list_of_files)
        return self

    @property
    def targets(self):
        """
        Set/get target list.

        :getter:
        :return: Target list
        :rtype: pandas.DataFrame

        :setter:
        :param targets: Sets the target list of the instance.
        :type targets: pandas.DataFrame
        """
        return self._targets

    @targets.setter
    def targets(self, targets):
        targets = standardize_targets(targets)
        assert check_targets(targets), check_targets(targets)
        self._targets = targets.set_index("peak_label")
        if self.verbose:
            print("Set targets to:\n", self.targets.to_string(), "\n")

    def get_target_params(self, peak_label):
        """
        Look up the extraction window of one target.

        :param peak_label: Label of the target (index of ``self.targets``).
        :return: (mz_mean, mz_width, rt_min, rt_max)
        :rtype: tuple
        """
        target_data = self.targets.loc[peak_label]
        mz_mean, mz_width, rt_min, rt_max = target_data[
            ["mz_mean", "mz_width", "rt_min", "rt_max"]
        ]
        return mz_mean, mz_width, rt_min, rt_max

    @property
    def peak_labels(self):
        """
        Labels of all targets (index of ``self.targets``).

        :rtype: list
        """
        return self.targets.index.to_list()

    @property
    def results(self):
        """
        Get/Set the Mint results.

        :getter:
        :return: Results
        :rtype: pandas.DataFrame

        :setter:
        :param df: DataFrame with MINT results.
        :type df: pandas.DataFrame
        """
        return self._results

    @results.setter
    def results(self, df):
        self._results = df

    def crosstab(
        self,
        var_name: str = None,
        index: str = None,
        column: str = None,
        aggfunc: str = 'mean',
        apply: Callable = None,
        scaler: Callable = None,
        groupby: str = None,
    ):
        """
        Create condensed representation of the results.
        More specifically, a cross-table with filenames as index and target labels.
        The values in the cells are determined by *var_name*.

        :param var_name: Name of the column from *mint.results* table that is used for the cell values.
            If None, defaults to 'peak_area_top3'.
        :type var_name: str, optional
        :param index: Name of the column (or list of names) to be used as index in the resulting
            cross-tabulation. ``None`` entries of a list are dropped. If None, defaults to 'ms_file_label'.
        :type index: str or list[str], optional
        :param column: Name of the column to be used as columns in the resulting cross-tabulation.
            If None, defaults to 'peak_label'.
        :type column: str, optional
        :param aggfunc: Aggregation function to be used for aggregating values. Defaults to 'mean'.
        :type aggfunc: str, optional
        :param apply: Function applied element-wise to the value column before scaling, or one of
            the names 'log2p1' and 'logp1'. If None, no function is applied.
        :type apply: Callable or str, optional
        :param scaler: Object with a ``fit_transform`` method, or one of the names 'standard',
            'robust' and 'minmax'. If None, no scaling is performed.
        :type scaler: object or str, optional
        :param groupby: Name (or list of names) of the column(s) to group data before scaling.
            If None, scaling is applied per *column* value only.
        :type groupby: str or list[str], optional
        :raises ValueError: If *scaler* is an unknown name.
        :return: DataFrame representing the cross-tabulation.
        :rtype: pandas.DataFrame
        """
        df_meta = pd.merge(
            self.meta, self.results, left_index=True, right_on='ms_file_label'
        )

        # Drop None entries without modifying the caller's list.
        if isinstance(index, list):
            index = [i for i in index if i is not None]

        if isinstance(groupby, str):
            groupby = [groupby]

        if index is None:
            index = 'ms_file_label'
        if column is None:
            column = 'peak_label'
        if var_name is None:
            var_name = 'peak_area_top3'

        if apply:
            if apply == 'log2p1':
                apply = log2p1
            if apply == 'logp1':
                apply = np.log1p
            df_meta[var_name] = df_meta[var_name].apply(apply)

        if isinstance(scaler, str):
            scaler_dict = {
                'standard': StandardScaler(),
                'robust': RobustScaler(),
                'minmax': MinMaxScaler(),
            }
            if scaler not in scaler_dict:
                raise ValueError(f"Unsupported scaler: {scaler}")
            scaler = scaler_dict[scaler]

        if scaler:
            # Scale every target separately, optionally within each group.
            group_keys = list(groupby) + [column] if groupby else column
            df_meta[var_name] = df_meta.groupby(group_keys)[var_name].transform(
                lambda x: self._scale_group(x, scaler)
            )

        df = pd.pivot_table(
            df_meta,
            index=index,
            columns=column,
            values=var_name,
            aggfunc=aggfunc,
        ).astype(np.float64)
        return df

    @property
    def progress(self):
        """
        Shows the current progress.

        :getter: Returns the current progress value.
        :setter: Set the progress to a value between 0 and 100 and calls the progress callback function.
        """
        return self._progress

    @progress.setter
    def progress(self, value):
        assert value >= 0, value
        assert value <= 100, value
        self._progress = value
        if self.progress_callback is not None:
            self.progress_callback(value)

    def export(self, fn=None):
        """
        Export current results to file.

        :param fn: Filename (.xlsx, .csv or .parquet), defaults to None
        :type fn: str or pathlib.Path, optional
        :return: The return value of ``export_to_excel`` (documented upstream
            as a file buffer) if *fn* is None, otherwise None.
        :rtype: io.BytesIO
        """
        if fn is None:
            return export_to_excel(self, fn=fn)

        # Allow pathlib.Path as well as str for the extension check.
        name = str(fn)
        if name.endswith(".xlsx"):
            export_to_excel(self, fn=fn)
        elif name.endswith(".csv"):
            self.results.to_csv(fn, index=False)
        elif name.endswith(".parquet"):
            self.results.to_parquet(fn, index=False)
        else:
            # Previously nothing was written and nothing was reported.
            logging.warning(f"Unsupported export format, nothing written ({fn})")

    def load(self, fn):
        """
        Load results into Mint instance.

        :param fn: Filename (xlsx, csv, parquet) or anything ``pandas.read_csv``
            accepts. Unknown extensions are read as CSV.
        :type fn: str or pathlib.Path
        :return: self
        :rtype: ms_mint.Mint.Mint
        """
        if self.verbose:
            print(f"Loading MINT results from {fn}")

        name = str(fn) if isinstance(fn, (str, P)) else ""
        if name.endswith("xlsx"):
            results = pd.read_excel(fn, sheet_name="Results")
        elif name.endswith(".parquet"):
            results = pd.read_parquet(fn)
        else:
            # .csv, unknown extensions and non-path inputs (e.g. buffers).
            results = pd.read_csv(fn)
            if name.endswith(".csv"):
                for col in ("peak_shape_rt", "peak_shape_int"):
                    # The columns may be missing from the file.
                    if col in results.columns:
                        results[col] = results[col].fillna("")

        # Add file labels if not present already
        if 'ms_file_label' not in results.columns:
            results['ms_file_label'] = [
                fn_to_label(ms_file) for ms_file in results.ms_file
            ]

        self.results = results.rename(columns=DEPRECATED_LABELS)
        self.digest_results()
        return self

    def digest_results(self):
        """
        Derive ``ms_files`` and ``targets`` from the current results table.
        """
        self.ms_files = get_ms_files_from_results(self.results)
        self.targets = get_targets_from_results(self.results)

    def get_chromatograms(self, fns=None, peak_labels=None, filters=None, **kwargs):
        """
        Extract chromatograms for the given files and targets.

        :param fns: File name(s) or file label(s), defaults to all ``ms_files``.
        :type fns: str or list[str], optional
        :param peak_labels: Target label(s), defaults to all ``peak_labels``.
        :type peak_labels: str or list[str], optional
        :param filters: Filters handed to ``Chromatogram``; applied when not None.
        :param kwargs: Extra arguments handed to ``Chromatogram``.
        :return: Chromatogram data of all file/target combinations.
        :rtype: pandas.DataFrame
        """
        if fns is None:
            fns = self.ms_files
        if peak_labels is None:
            peak_labels = self.peak_labels
        # A bare string would otherwise be split into single characters.
        if isinstance(fns, (str, P)):
            fns = [fns]
        if isinstance(peak_labels, str):
            peak_labels = [peak_labels]
        return self._get_chromatograms(
            fns=tuple(fns),
            peak_labels=tuple(peak_labels),
            filters=tuple(filters) if filters is not None else None,
            **kwargs,
        )

    def _get_chromatograms(self, fns=None, peak_labels=None, filters=None, **kwargs):
        """
        Cached worker behind `get_chromatograms`.

        The last result is kept per instance. The cache key contains the
        resolved file names, the labels, the filters, the extra arguments and
        the current target parameters, so changing targets or files yields
        fresh data.
        """
        if isinstance(fns, tuple):
            fns = list(fns)
        if not isinstance(fns, list):
            fns = [fns]

        # Need to get the actual file names
        # in case only ms_file_labels are provided
        wanted = {fn_to_label(fn) for fn in fns}
        fns = [fn for fn in self.ms_files if fn_to_label(fn) in wanted]

        key = (
            tuple(fns),
            tuple(peak_labels),
            filters,
            tuple(tuple(self.get_target_params(label)) for label in peak_labels),
            tuple(sorted(kwargs.items())),
        )
        try:
            hash(key)
        except TypeError:
            key = None  # unhashable arguments: compute without caching

        cached = getattr(self, "_chromatogram_cache", None)
        if key is not None and cached is not None and cached[0] == key:
            return cached[1]

        data = self._load_chromatograms(fns, peak_labels, filters, **kwargs)
        if key is not None:
            self._chromatogram_cache = (key, data)
        return data

    def _load_chromatograms(self, fns, peak_labels, filters, **kwargs):
        """Read *fns* and extract one chromatogram per file and target."""
        data = []
        for fn in self.tqdm(fns, desc="Loading chromatograms"):
            df = ms_file_to_df(fn)
            for label in peak_labels:
                mz_mean, mz_width, rt_min, rt_max = self.get_target_params(label)
                chrom_raw = extract_chromatogram_from_ms1(
                    df, mz_mean=mz_mean, mz_width=mz_width
                ).to_frame()
                if len(chrom_raw) == 0:
                    continue
                chrom = Chromatogram(
                    chrom_raw.index, chrom_raw.values, filters=filters, **kwargs
                )
                if filters is not None:
                    chrom.apply_filters()
                # NOTE(review): this statement was garbled in the source
                # ("chrom_data = chrom_data[...] = fn"); assumes Chromatogram
                # exposes its table as `.data` - confirm.
                chrom_data = chrom.data
                chrom_data["ms_file"] = fn
                chrom_data["ms_file_label"] = fn_to_label(fn)
                chrom_data["peak_label"] = label
                chrom_data["rt_min"] = rt_min
                chrom_data["rt_max"] = rt_max
                data.append(chrom_data)
        # Raises ValueError when no chromatogram could be extracted at all.
        data = pd.concat(data).reset_index()
        data["ms_file"] = data["ms_file"].apply(lambda x: P(x).with_suffix("").name)
        return data

    def load_metadata(self, fn=None):
        """
        Load the metadata table from a csv or parquet file.

        :param fn: Filename, defaults to the default metadata file in ``wdir``.
        :type fn: str or pathlib.Path, optional
        :return: self
        :rtype: ms_mint.Mint.Mint
        """
        if fn is None:
            fn = self.wdir / METADATA_DEFAUT_FN
        name = str(fn)
        if name.endswith('.csv'):
            self.meta = pd.read_csv(fn, index_col=0)
        elif name.endswith('.parquet'):
            self.meta = pd.read_parquet(fn)
        if 'ms_file_label' in self.meta.columns:
            self.meta = self.meta.set_index('ms_file_label')
        return self

    def save_metadata(self, fn=None):
        """
        Save the metadata table to a csv or parquet file.

        :param fn: Filename, defaults to the default metadata file in ``wdir``.
        :type fn: str or pathlib.Path, optional
        :return: self
        :rtype: ms_mint.Mint.Mint
        """
        if fn is None:
            fn = self.wdir / METADATA_DEFAUT_FN
        name = str(fn)
        if name.endswith('.csv'):
            # `na_filter` is a read_csv option; to_csv rejects it.
            self.meta.to_csv(fn)
        elif name.endswith('.parquet'):
            self.meta.to_parquet(fn)
        return self

    def _scale_group(self, group, scaler):
        """
        Helper function to scale groups individually.

        :param group: Values of one group.
        :type group: pandas.Series
        :param scaler: Object with a ``fit_transform`` method.
        :return: Scaled values as a flat array.
        """
        column_vector = group.to_numpy().reshape(-1, 1)
        return scaler.fit_transform(column_vector).flatten()