Helper tools

df_diff(df1, df2, which='both')

Find differences between two dataframes.

Parameters:

    df1 (DataFrame, required): Reference DataFrame.
    df2 (DataFrame, required): DataFrame to compare.
    which (str, default 'both'): Direction in which to compare. Options are "both", "left_only", "right_only".

Returns:

    DataFrame: DataFrame containing only the rows that differ according to the specified direction.

Source code in src/ms_mint/tools.py
def df_diff(df1: pd.DataFrame, df2: pd.DataFrame, which: str = "both") -> pd.DataFrame:
    """Find differences between two dataframes.

    Args:
        df1: Reference DataFrame.
        df2: DataFrame to compare.
        which: Direction in which to compare. Options are "both", "left_only", "right_only".

    Returns:
        DataFrame containing only the rows that differ according to the specified direction.
    """
    _df = df1.merge(df2, indicator=True, how="outer")
    diff_df = _df[_df["_merge"] != which]
    return diff_df.reset_index(drop=True)
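
A minimal usage sketch (the example frames are made up; df_diff is imported from ms_mint.tools as shown in the source path above):

import pandas as pd
from ms_mint.tools import df_diff

df1 = pd.DataFrame({"a": [1, 2, 3]})
df2 = pd.DataFrame({"a": [2, 3, 4]})

# With which="both", rows found in both frames are dropped,
# leaving only the rows unique to either side.
df_diff(df1, df2, which="both")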

find_peaks_in_timeseries(series, prominence=None, plot=False, rel_height=0.9, **kwargs)

Find peaks in a time series using scipy's peak finding algorithm.

Parameters:

    series (Series, required): Time series data to find peaks in.
    prominence (Optional[float], default None): Minimum prominence of peaks. If None, all peaks are detected.
    plot (bool, default False): Whether to generate a plot of the detected peaks.
    rel_height (float, default 0.9): Relative height from the peak at which to determine peak width.
    **kwargs: Additional arguments passed to scipy.signal.find_peaks.

Returns:

    DataFrame: DataFrame containing peak properties including retention times and heights.

Source code in src/ms_mint/tools.py
def find_peaks_in_timeseries(
    series: pd.Series,
    prominence: Optional[float] = None,
    plot: bool = False,
    rel_height: float = 0.9,
    **kwargs,
) -> pd.DataFrame:
    """Find peaks in a time series using scipy's peak finding algorithm.

    Args:
        series: Time series data to find peaks in.
        prominence: Minimum prominence of peaks. If None, all peaks are detected.
        plot: Whether to generate a plot of the detected peaks.
        rel_height: Relative height from the peak at which to determine peak width.
        **kwargs: Additional arguments passed to scipy.signal.find_peaks.

    Returns:
        DataFrame containing peak properties including retention times and heights.
    """
    t = series.index
    x = series.values
    peak_ndxs, _ = find_peaks(x, prominence=prominence, rel_height=rel_height, **kwargs)
    widths, heights, left_ips, right_ips = peak_widths(x, peak_ndxs, rel_height=rel_height)
    times = series.iloc[peak_ndxs].index

    t_start = _map_ndxs_to_time(left_ips, min(t), max(t), 0, len(t))
    t_end = _map_ndxs_to_time(right_ips, min(t), max(t), 0, len(t))

    data = dict(
        ndxs=peak_ndxs,
        rt=times,
        rt_span=widths,
        peak_base_height=heights,
        peak_height=series.iloc[peak_ndxs].values,
        rt_min=t_start,
        rt_max=t_end,
    )

    peaks = pd.DataFrame(data)

    if plot:
        plot_peaks(series, peaks)

    return peaks
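
A minimal sketch on a synthetic chromatogram (the data are made up; the result columns follow the source above):

import numpy as np
import pandas as pd
from ms_mint.tools import find_peaks_in_timeseries

rt = np.linspace(0, 10, 200)                               # retention time axis
x = np.exp(-((rt - 3) ** 2)) + 0.5 * np.exp(-((rt - 7) ** 2))
series = pd.Series(x, index=rt)

peaks = find_peaks_in_timeseries(series, prominence=0.1)
# Columns include 'rt', 'peak_height', 'rt_min', 'rt_max'.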

fn_to_label(fn)

Convert a filename to a label by removing the file extension.

Parameters:

    fn (Union[str, Path], required): Filename or path.

Returns:

    str: Filename without extension.

Source code in src/ms_mint/tools.py
def fn_to_label(fn: Union[str, P]) -> str:
    """Convert a filename to a label by removing the file extension.

    Args:
        fn: Filename or path.

    Returns:
        Filename without extension.
    """
    return P(fn).with_suffix("").name
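
A quick illustration (hypothetical paths):

from ms_mint.tools import fn_to_label

fn_to_label("/data/batch1/sample_01.mzML")   # 'sample_01'
fn_to_label("sample_01.parquet")             # 'sample_01'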

formula_to_mass(formulas, ms_mode=None)

Calculate m/z values from molecular formulas for specific ionization mode.

Parameters:

    formulas (Union[str, List[str]], required): List of molecular formulas (e.g., ['H2O']) or a single formula.
    ms_mode (Optional[Literal['negative', 'positive', 'neutral']], default None): Ionization mode. One of "negative", "positive", "neutral", or None.

Returns:

    List[Optional[float]]: List of calculated masses. None values are included for invalid formulas.

Raises:

    AssertionError: If ms_mode is not one of the allowed values.

Source code in src/ms_mint/tools.py
def formula_to_mass(
    formulas: Union[str, List[str]],
    ms_mode: Optional[Literal["negative", "positive", "neutral"]] = None,
) -> List[Optional[float]]:
    """Calculate m/z values from molecular formulas for specific ionization mode.

    Args:
        formulas: List of molecular formulas (e.g., ['H2O']) or a single formula.
        ms_mode: Ionization mode. One of "negative", "positive", "neutral", or None.

    Returns:
        List of calculated masses. None values are included for invalid formulas.

    Raises:
        AssertionError: If ms_mode is not one of the allowed values.
    """
    masses = []
    assert ms_mode in [None, "negative", "positive", "neutral"], ms_mode
    if isinstance(formulas, str):
        formulas = [formulas]
    for formula in formulas:
        try:
            mass = Formula(formula).isotope.mass
            if ms_mode == "positive":
                mass += M_PROTON
            elif ms_mode == "negative":
                mass -= M_PROTON
            mass = np.round(mass, 4)
            masses.append(mass)
        except FormulaError as e:
            masses.append(None)
            logging.warning(e)
    return masses
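
A short usage sketch (the neutral value is the monoisotopic mass rounded to four decimals, as in the code above):

from ms_mint.tools import formula_to_mass

formula_to_mass("H2O")                                    # [18.0106]
formula_to_mass(["H2O", "C6H12O6"], ms_mode="positive")   # [M+H]+ m/z values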

gaussian(x, mu, sig)

Generate values for a Gaussian function.

Parameters:

    x (Union[List[float], ndarray], required): x-values to generate function values.
    mu (float, required): Mean of the Gaussian.
    sig (float, required): Standard deviation of the Gaussian.

Returns:

    ndarray: Array of Gaussian function values at the input x-values.

Source code in src/ms_mint/tools.py
def gaussian(x: Union[List[float], np.ndarray], mu: float, sig: float) -> np.ndarray:
    """Generate values for a Gaussian function.

    Args:
        x: x-values to generate function values.
        mu: Mean of the Gaussian.
        sig: Standard deviation of the Gaussian.

    Returns:
        Array of Gaussian function values at the input x-values.
    """
    x = np.array(x)
    return np.exp(-np.power(x - mu, 2.0) / (2 * np.power(sig, 2.0)))
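
The function is an unnormalized Gaussian (peak value 1.0 at x == mu); for illustration:

import numpy as np
from ms_mint.tools import gaussian

x = np.linspace(-3, 3, 61)
y = gaussian(x, mu=0.0, sig=1.0)   # y.max() == 1.0, reached at x == 0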

get_ms_files_from_results(results)

Extract MS filenames from Mint results.

Parameters:

    results (DataFrame, required): DataFrame in Mint results format.

Returns:

    List[Union[str, Path]]: List of MS filenames.

Source code in src/ms_mint/tools.py
def get_ms_files_from_results(results: pd.DataFrame) -> List[Union[str, P]]:
    """Extract MS filenames from Mint results.

    Args:
        results: DataFrame in Mint results format.

    Returns:
        List of MS filenames.
    """
    # Old schema
    if "ms_path" in results.columns:
        ms_files = results[["ms_path", "ms_file"]].drop_duplicates()
        ms_files = [P(ms_path) / ms_file for ms_path, ms_file in ms_files.values]
    else:
        ms_files = results.ms_file.unique()
    return ms_files
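
A minimal sketch for the current schema (the 'ms_file' values are made up):

import pandas as pd
from ms_mint.tools import get_ms_files_from_results

results = pd.DataFrame({"ms_file": ["run_A.mzML", "run_A.mzML", "run_B.mzML"]})
get_ms_files_from_results(results)   # array(['run_A.mzML', 'run_B.mzML'], dtype=object)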

get_targets_from_results(results)

Extract targets DataFrame from MS-MINT results table.

Parameters:

    results (DataFrame, required): Mint results table.

Returns:

    DataFrame: DataFrame containing target information extracted from results.

Source code in src/ms_mint/tools.py
def get_targets_from_results(results: pd.DataFrame) -> pd.DataFrame:
    """Extract targets DataFrame from MS-MINT results table.

    Args:
        results: Mint results table.

    Returns:
        DataFrame containing target information extracted from results.
    """
    return results[[col for col in TARGETS_COLUMNS if col in results.columns]].drop_duplicates()
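
A hedged sketch (the frame is made up; 'peak_label' and 'mz_mean' are assumed to be members of TARGETS_COLUMNS, while 'ms_file' is not and is therefore dropped):

import pandas as pd
from ms_mint.tools import get_targets_from_results

results = pd.DataFrame({
    "ms_file": ["run_A.mzML", "run_B.mzML"],
    "peak_label": ["compound_1", "compound_1"],
    "mz_mean": [151.0, 151.0],
})
targets = get_targets_from_results(results)   # one de-duplicated target row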

init_metadata()

Initialize an empty metadata DataFrame with the standard columns.

Returns:

    DataFrame: Empty DataFrame with standard metadata columns and 'ms_file_label' as index.

Source code in src/ms_mint/tools.py
def init_metadata() -> pd.DataFrame:
    """Initialize an empty metadata DataFrame with the standard columns.

    Returns:
        Empty DataFrame with standard metadata columns and 'ms_file_label' as index.
    """
    cols = MINT_METADATA_COLUMNS
    return pd.DataFrame(columns=cols).set_index("ms_file_label")
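
A minimal usage note:

from ms_mint.tools import init_metadata

meta = init_metadata()
meta.index.name   # 'ms_file_label'
len(meta)         # 0 rows until metadata is added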

is_ms_file(fn)

Check if a file is a recognized MS file format based on its extension.

Parameters:

    fn (Union[str, Path], required): Filename or path to check.

Returns:

    bool: True if the file has a recognized MS file extension, False otherwise.

Source code in src/ms_mint/tools.py
def is_ms_file(fn: Union[str, P]) -> bool:
    """Check if a file is a recognized MS file format based on its extension.

    Args:
        fn: Filename or path to check.

    Returns:
        True if the file has a recognized MS file extension, False otherwise.
    """
    fn = str(fn)
    if (
        (fn.lower().endswith(".mzxml"))
        or (fn.lower().endswith(".mzml"))
        or (fn.lower().endswith(".mzmlb"))
        or (fn.lower().endswith(".mzhdf"))
        or (fn.lower().endswith(".raw"))
        or (fn.lower().endswith(".parquet"))
        or (fn.lower().endswith(".feather"))
    ):
        return True
    else:
        return False
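
A quick illustration (the check is case-insensitive):

from ms_mint.tools import is_ms_file

is_ms_file("sample.mzML")      # True
is_ms_file("SAMPLE.MZXML")     # True
is_ms_file("notes.txt")        # False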

lock(fn)

Create a file lock to ensure safe writing to file.

Parameters:

    fn (Union[str, Path], required): Filename to lock.

Returns:

    FileLock: File lock object.

Source code in src/ms_mint/tools.py
def lock(fn: Union[str, P]) -> FileLock:
    """Create a file lock to ensure safe writing to file.

    Args:
        fn: Filename to lock.

    Returns:
        File lock object.
    """
    return FileLock(f"{fn}.lock", timeout=1)
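
FileLock objects are context managers, so the usual pattern is (the target filename is hypothetical):

from ms_mint.tools import lock

with lock("results.parquet"):   # creates results.parquet.lock
    ...                         # write to results.parquet here
# Acquisition waits up to 1 second (timeout=1) before raising if another process holds the lock.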

log2p1(x)

Apply log2(x+1) transformation to numeric data.

Parameters:

    x (Union[float, ndarray, Series], required): Numeric value or array to transform.

Returns:

    Union[float, ndarray, Series]: Transformed value(s).

Source code in src/ms_mint/tools.py
def log2p1(x: Union[float, np.ndarray, pd.Series]) -> Union[float, np.ndarray, pd.Series]:
    """Apply log2(x+1) transformation to numeric data.

    Args:
        x: Numeric value or array to transform.

    Returns:
        Transformed value(s).
    """
    return np.log2(x + 1)
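
For example:

import numpy as np
from ms_mint.tools import log2p1

log2p1(np.array([0, 1, 3, 7]))   # array([0., 1., 2., 3.])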

mz_mean_width_to_min_max(mz_mean, mz_width)

Convert m/z mean and width (in ppm) to min and max m/z values.

Parameters:

    mz_mean (float, required): Mean m/z value.
    mz_width (float, required): Width in parts-per-million (ppm).

Returns:

    Tuple[float, float]: Tuple of (mz_min, mz_max) defining the m/z range.

Source code in src/ms_mint/tools.py
def mz_mean_width_to_min_max(mz_mean: float, mz_width: float) -> Tuple[float, float]:
    """Convert m/z mean and width (in ppm) to min and max m/z values.

    Args:
        mz_mean: Mean m/z value.
        mz_width: Width in parts-per-million (ppm).

    Returns:
        Tuple of (mz_min, mz_max) defining the m/z range.
    """
    delta_mass = mz_width * mz_mean * 1e-6
    mz_min = mz_mean - delta_mass
    mz_max = mz_mean + delta_mass
    return mz_min, mz_max
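
For example, a 10 ppm window around m/z 200:

from ms_mint.tools import mz_mean_width_to_min_max

mz_mean_width_to_min_max(200.0, 10)   # ≈ (199.998, 200.002)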

scale_dataframe(df, scaler='standard', **kwargs)

Scale all columns in a dense dataframe.

Parameters:

    df (DataFrame, required): DataFrame to scale.
    scaler (Union[str, Any], default 'standard'): Scaler to use. Either a string ('robust', 'standard', 'minmax') or a scikit-learn scaler instance.
    **kwargs: Additional arguments passed to the scaler constructor.

Returns:

    DataFrame: Scaled DataFrame with the same shape as the input.

Source code in src/ms_mint/tools.py
def scale_dataframe(
    df: pd.DataFrame, scaler: Union[str, Any] = "standard", **kwargs
) -> pd.DataFrame:
    """Scale all columns in a dense dataframe.

    Args:
        df: DataFrame to scale.
        scaler: Scaler to use. Either a string ('robust', 'standard', 'minmax')
            or a scikit-learn scaler instance.
        **kwargs: Additional arguments passed to the scaler constructor.

    Returns:
        Scaled DataFrame with the same shape as the input.
    """
    df = df.copy()
    if isinstance(scaler, str):
        if scaler == "standard":
            scaler = StandardScaler(**kwargs)
        elif scaler == "robust":
            scaler = RobustScaler(**kwargs)
        elif scaler == "minmax":
            scaler = MinMaxScaler(**kwargs)
    df.loc[:, :] = scaler.fit_transform(df)
    return df
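
A minimal sketch (the frame is made up; a scikit-learn scaler instance can be passed instead of a string):

import pandas as pd
from ms_mint.tools import scale_dataframe

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
scale_dataframe(df, scaler="minmax")     # each column rescaled to the 0-1 range
scale_dataframe(df, scaler="standard")   # zero mean, unit variance per column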
