Helper tools

df_diff(df1, df2, which='both')

Find differences between two dataframes.

Parameters:

    df1 (DataFrame, required): Reference DataFrame.
    df2 (DataFrame, required): DataFrame to compare.
    which (str, default 'both'): Direction in which to compare. Options are "both", "left_only", "right_only".

Returns:

    DataFrame: DataFrame containing only the rows that differ according to the specified direction.

Source code in src/ms_mint/tools.py
def df_diff(df1: pd.DataFrame, df2: pd.DataFrame, which: str = "both") -> pd.DataFrame:
    """Find differences between two dataframes.

    Args:
        df1: Reference DataFrame.
        df2: DataFrame to compare.
        which: Direction in which to compare. Options are "both", "left_only", "right_only".

    Returns:
        DataFrame containing only the rows that differ according to the specified direction.
    """
    _df = df1.merge(df2, indicator=True, how="outer")
    diff_df = _df[_df["_merge"] != which]
    return diff_df.reset_index(drop=True)
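
A minimal usage sketch (the example frames are made up; df_diff is imported from ms_mint.tools as shown in the source path above):

import pandas as pd
from ms_mint.tools import df_diff

df1 = pd.DataFrame({"a": [1, 2, 3]})
df2 = pd.DataFrame({"a": [2, 3, 4]})

# With which="both", rows found in both frames are dropped,
# leaving only the rows unique to either side.
df_diff(df1, df2, which="both")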

find_peaks_in_timeseries(series, prominence=None, plot=False, rel_height=0.9, **kwargs)

Find peaks in a time series using scipy's peak finding algorithm.

Parameters:

    series (Series, required): Time series data to find peaks in.
    prominence (Optional[float], default None): Minimum prominence of peaks. If None, all peaks are detected.
    plot (bool, default False): Whether to generate a plot of the detected peaks.
    rel_height (float, default 0.9): Relative height from the peak at which to determine peak width.
    **kwargs: Additional arguments passed to scipy.signal.find_peaks.

Returns:

    DataFrame: DataFrame containing peak properties including retention times and heights.

Source code in src/ms_mint/tools.py
def find_peaks_in_timeseries(
    series: pd.Series,
    prominence: Optional[float] = None,
    plot: bool = False,
    rel_height: float = 0.9,
    **kwargs,
) -> pd.DataFrame:
    """Find peaks in a time series using scipy's peak finding algorithm.

    Args:
        series: Time series data to find peaks in.
        prominence: Minimum prominence of peaks. If None, all peaks are detected.
        plot: Whether to generate a plot of the detected peaks.
        rel_height: Relative height from the peak at which to determine peak width.
        **kwargs: Additional arguments passed to scipy.signal.find_peaks.

    Returns:
        DataFrame containing peak properties including retention times and heights.
    """
    t = series.index
    x = series.values
    peak_ndxs, _ = find_peaks(x, prominence=prominence, rel_height=rel_height, **kwargs)
    widths, heights, left_ips, right_ips = peak_widths(x, peak_ndxs, rel_height=rel_height)
    times = series.iloc[peak_ndxs].index

    t_start = _map_ndxs_to_time(left_ips, min(t), max(t), 0, len(t))
    t_end = _map_ndxs_to_time(right_ips, min(t), max(t), 0, len(t))

    data = dict(
        ndxs=peak_ndxs,
        rt=times,
        rt_span=widths,
        peak_base_height=heights,
        peak_height=series.iloc[peak_ndxs].values,
        rt_min=t_start,
        rt_max=t_end,
    )

    peaks = pd.DataFrame(data)

    if plot:
        plot_peaks(series, peaks)

    return peaks
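
A minimal sketch on a synthetic chromatogram (the data are made up; the result columns follow the source above):

import numpy as np
import pandas as pd
from ms_mint.tools import find_peaks_in_timeseries

rt = np.linspace(0, 10, 200)                               # retention time axis
x = np.exp(-((rt - 3) ** 2)) + 0.5 * np.exp(-((rt - 7) ** 2))
series = pd.Series(x, index=rt)

peaks = find_peaks_in_timeseries(series, prominence=0.1)
# Columns include 'rt', 'peak_height', 'rt_min', 'rt_max'.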

fn_to_label(fn)

Convert a filename to a label by removing the file extension.

Parameters:

    fn (Union[str, Path], required): Filename or path.

Returns:

    str: Filename without extension.

Source code in src/ms_mint/tools.py
def fn_to_label(fn: Union[str, P]) -> str:
    """Convert a filename to a label by removing the file extension.

    Args:
        fn: Filename or path.

    Returns:
        Filename without extension.
    """
    return P(fn).with_suffix("").name
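
A quick illustration (hypothetical paths):

from ms_mint.tools import fn_to_label

fn_to_label("/data/batch1/sample_01.mzML")   # 'sample_01'
fn_to_label("sample_01.parquet")             # 'sample_01'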

formula_to_mass(formulas, ms_mode=None)

Calculate m/z values from molecular formulas for specific ionization mode.

Parameters:

    formulas (Union[str, List[str]], required): List of molecular formulas (e.g., ['H2O']) or a single formula.
    ms_mode (Optional[Literal['negative', 'positive', 'neutral']], default None): Ionization mode. One of "negative", "positive", "neutral", or None.

Returns:

    List[Optional[float]]: List of calculated masses. None values are included for invalid formulas.

Raises:

    AssertionError: If ms_mode is not one of the allowed values.

Source code in src/ms_mint/tools.py
def formula_to_mass(
    formulas: Union[str, List[str]],
    ms_mode: Optional[Literal["negative", "positive", "neutral"]] = None,
) -> List[Optional[float]]:
    """Calculate m/z values from molecular formulas for specific ionization mode.

    Args:
        formulas: List of molecular formulas (e.g., ['H2O']) or a single formula.
        ms_mode: Ionization mode. One of "negative", "positive", "neutral", or None.

    Returns:
        List of calculated masses. None values are included for invalid formulas.

    Raises:
        AssertionError: If ms_mode is not one of the allowed values.
    """
    masses = []
    assert ms_mode in [None, "negative", "positive", "neutral"], ms_mode
    if isinstance(formulas, str):
        formulas = [formulas]
    for formula in formulas:
        try:
            mass = Formula(formula).isotope.mass
            if ms_mode == "positive":
                mass += M_PROTON
            elif ms_mode == "negative":
                mass -= M_PROTON
            mass = np.round(mass, 4)
            masses.append(mass)
        except FormulaError as e:
            masses.append(None)
            logging.warning(e)
    return masses
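
A short usage sketch (the neutral value is the monoisotopic mass rounded to four decimals, as in the code above):

from ms_mint.tools import formula_to_mass

formula_to_mass("H2O")                                    # [18.0106]
formula_to_mass(["H2O", "C6H12O6"], ms_mode="positive")   # [M+H]+ m/z values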

gaussian(x, mu, sig)

Generate values for a Gaussian function.

Parameters:

    x (Union[List[float], ndarray], required): x-values to generate function values.
    mu (float, required): Mean of the Gaussian.
    sig (float, required): Standard deviation of the Gaussian.

Returns:

    ndarray: Array of Gaussian function values at the input x-values.

Source code in src/ms_mint/tools.py
def gaussian(x: Union[List[float], np.ndarray], mu: float, sig: float) -> np.ndarray:
    """Generate values for a Gaussian function.

    Args:
        x: x-values to generate function values.
        mu: Mean of the Gaussian.
        sig: Standard deviation of the Gaussian.

    Returns:
        Array of Gaussian function values at the input x-values.
    """
    x = np.array(x)
    return np.exp(-np.power(x - mu, 2.0) / (2 * np.power(sig, 2.0)))
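
The function is an unnormalized Gaussian (peak value 1.0 at x == mu); for illustration:

import numpy as np
from ms_mint.tools import gaussian

x = np.linspace(-3, 3, 61)
y = gaussian(x, mu=0.0, sig=1.0)   # y.max() == 1.0, reached at x == 0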

get_ms_files_from_results(results)

Extract MS filenames from Mint results.

Parameters:

    results (DataFrame, required): DataFrame in Mint results format.

Returns:

    List[Union[str, Path]]: List of MS filenames.

Source code in src/ms_mint/tools.py
def get_ms_files_from_results(results: pd.DataFrame) -> List[Union[str, P]]:
    """Extract MS filenames from Mint results.

    Args:
        results: DataFrame in Mint results format.

    Returns:
        List of MS filenames.
    """
    # Old schema
    if "ms_path" in results.columns:
        ms_files = results[["ms_path", "ms_file"]].drop_duplicates()
        ms_files = [P(ms_path) / ms_file for ms_path, ms_file in ms_files.values]
    else:
        ms_files = results.ms_file.unique()
    return ms_files
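
A minimal sketch for the current schema (the 'ms_file' values are made up):

import pandas as pd
from ms_mint.tools import get_ms_files_from_results

results = pd.DataFrame({"ms_file": ["run_A.mzML", "run_A.mzML", "run_B.mzML"]})
get_ms_files_from_results(results)   # array(['run_A.mzML', 'run_B.mzML'], dtype=object)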

get_targets_from_results(results)

Extract targets DataFrame from MS-MINT results table.

Parameters:

    results (DataFrame, required): Mint results table.

Returns:

    DataFrame: DataFrame containing target information extracted from results.

Source code in src/ms_mint/tools.py
def get_targets_from_results(results: pd.DataFrame) -> pd.DataFrame:
    """Extract targets DataFrame from MS-MINT results table.

    Args:
        results: Mint results table.

    Returns:
        DataFrame containing target information extracted from results.
    """
    return results[[col for col in TARGETS_COLUMNS if col in results.columns]].drop_duplicates()
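
A hedged sketch (the frame is made up; 'peak_label' and 'mz_mean' are assumed to be members of TARGETS_COLUMNS, while 'ms_file' is not and is therefore dropped):

import pandas as pd
from ms_mint.tools import get_targets_from_results

results = pd.DataFrame({
    "ms_file": ["run_A.mzML", "run_B.mzML"],
    "peak_label": ["compound_1", "compound_1"],
    "mz_mean": [151.0, 151.0],
})
targets = get_targets_from_results(results)   # one de-duplicated target row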

init_metadata()

Initialize an empty metadata DataFrame with the standard columns.

Returns:

    DataFrame: Empty DataFrame with standard metadata columns and 'ms_file_label' as index.

Source code in src/ms_mint/tools.py
def init_metadata() -> pd.DataFrame:
    """Initialize an empty metadata DataFrame with the standard columns.

    Returns:
        Empty DataFrame with standard metadata columns and 'ms_file_label' as index.
    """
    cols = MINT_METADATA_COLUMNS
    return pd.DataFrame(columns=cols).set_index("ms_file_label")
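
A minimal usage note:

from ms_mint.tools import init_metadata

meta = init_metadata()
meta.index.name   # 'ms_file_label'
len(meta)         # 0 rows until metadata is added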

is_ms_file(fn)

Check if a file is a recognized MS file format based on its extension.

Parameters:

    fn (Union[str, Path], required): Filename or path to check.

Returns:

    bool: True if the file has a recognized MS file extension, False otherwise.

Source code in src/ms_mint/tools.py
def is_ms_file(fn: Union[str, P]) -> bool:
    """Check if a file is a recognized MS file format based on its extension.

    Args:
        fn: Filename or path to check.

    Returns:
        True if the file has a recognized MS file extension, False otherwise.
    """
    fn = str(fn)
    if (
        (fn.lower().endswith(".mzxml"))
        or (fn.lower().endswith(".mzml"))
        or (fn.lower().endswith(".mzmlb"))
        or (fn.lower().endswith(".mzhdf"))
        or (fn.lower().endswith(".raw"))
        or (fn.lower().endswith(".parquet"))
        or (fn.lower().endswith(".feather"))
    ):
        return True
    else:
        return False
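
A quick illustration (the check is case-insensitive):

from ms_mint.tools import is_ms_file

is_ms_file("sample.mzML")      # True
is_ms_file("SAMPLE.MZXML")     # True
is_ms_file("notes.txt")        # False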

lock(fn)

Create a file lock to ensure safe writing to file.

Parameters:

    fn (Union[str, Path], required): Filename to lock.

Returns:

    FileLock: File lock object.

Source code in src/ms_mint/tools.py
def lock(fn: Union[str, P]) -> FileLock:
    """Create a file lock to ensure safe writing to file.

    Args:
        fn: Filename to lock.

    Returns:
        File lock object.
    """
    return FileLock(f"{fn}.lock", timeout=1)
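
FileLock objects are context managers, so the usual pattern is (the target filename is hypothetical):

from ms_mint.tools import lock

with lock("results.parquet"):   # creates results.parquet.lock
    ...                         # write to results.parquet here
# Acquisition waits up to 1 second (timeout=1) before raising if another process holds the lock.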

log2p1(x)

Apply log2(x+1) transformation to numeric data.

Parameters:

    x (Union[float, ndarray, Series], required): Numeric value or array to transform.

Returns:

    Union[float, ndarray, Series]: Transformed value(s).

Source code in src/ms_mint/tools.py
def log2p1(x: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Apply log2(x+1) transformation to numeric data.

    Args:
        x: Numeric value or array to transform.

    Returns:
        Transformed value(s).
    """
    return np.log2(x + 1)
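
For example:

import numpy as np
from ms_mint.tools import log2p1

log2p1(np.array([0, 1, 3, 7]))   # array([0., 1., 2., 3.])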

mz_mean_width_to_min_max(mz_mean, mz_width)

Convert m/z mean and width (in ppm) to min and max m/z values.

Parameters:

    mz_mean (float, required): Mean m/z value.
    mz_width (float, required): Width in parts-per-million (ppm).

Returns:

    Tuple[float, float]: Tuple of (mz_min, mz_max) defining the m/z range.

Source code in src/ms_mint/tools.py
def mz_mean_width_to_min_max(mz_mean: float, mz_width: float) -> Tuple[float, float]:
    """Convert m/z mean and width (in ppm) to min and max m/z values.

    Args:
        mz_mean: Mean m/z value.
        mz_width: Width in parts-per-million (ppm).

    Returns:
        Tuple of (mz_min, mz_max) defining the m/z range.
    """
    delta_mass = mz_width * mz_mean * 1e-6
    mz_min = mz_mean - delta_mass
    mz_max = mz_mean + delta_mass
    return mz_min, mz_max
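
For example, a 10 ppm window around m/z 200:

from ms_mint.tools import mz_mean_width_to_min_max

mz_mean_width_to_min_max(200.0, 10)   # ≈ (199.998, 200.002)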

scale_dataframe(df, scaler='standard', **kwargs)

Scale all columns in a dense dataframe.

Parameters:

    df (DataFrame, required): DataFrame to scale.
    scaler (Union[str, Any], default 'standard'): Scaler to use. Either a string ('robust', 'standard', 'minmax') or a scikit-learn scaler instance.
    **kwargs: Additional arguments passed to the scaler constructor.

Returns:

    DataFrame: Scaled DataFrame with the same shape as the input.

Source code in src/ms_mint/tools.py
def scale_dataframe(
    df: pd.DataFrame, scaler: Union[str, Any] = "standard", **kwargs
) -> pd.DataFrame:
    """Scale all columns in a dense dataframe.

    Args:
        df: DataFrame to scale.
        scaler: Scaler to use. Either a string ('robust', 'standard', 'minmax')
            or a scikit-learn scaler instance.
        **kwargs: Additional arguments passed to the scaler constructor.

    Returns:
        Scaled DataFrame with the same shape as the input.
    """
    df = df.copy()
    if isinstance(scaler, str):
        if scaler == "standard":
            scaler = StandardScaler(**kwargs)
        elif scaler == "robust":
            scaler = RobustScaler(**kwargs)
        elif scaler == "minmax":
            scaler = MinMaxScaler(**kwargs)
    df.loc[:, :] = scaler.fit_transform(df)
    return df
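
A minimal sketch (the frame is made up; a scikit-learn scaler instance can be passed instead of a string):

import pandas as pd
from ms_mint.tools import scale_dataframe

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
scale_dataframe(df, scaler="minmax")     # each column rescaled to the 0-1 range
scale_dataframe(df, scaler="standard")   # zero mean, unit variance per column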
