From 9b4ad728409627ad137fa55a26da6958a4ce989b Mon Sep 17 00:00:00 2001 From: mjeppesen3 <mjeppesen3@huskers.unl.edu> Date: Fri, 1 Mar 2024 13:20:15 -0600 Subject: [PATCH] inital upload --- .../vimms/Chemicals.py | 628 +++++++ .../vimms/ChineseRestaurantProcess.py | 29 + .../vimms/Chromatograms.py | 132 ++ .../vimms/Common.py | 196 +++ .../vimms/Controller.py | 726 ++++++++ Synthetic data creation scripts/vimms/DIA.py | 349 ++++ .../vimms/DataGenerator.py | 656 +++++++ Synthetic data creation scripts/vimms/DsDA.py | 112 ++ .../vimms/Evaluation.py | 58 + .../vimms/MassSpec.py | 769 +++++++++ .../vimms/MatrixFactorisation.py | 242 +++ .../vimms/MzmlWriter.py | 160 ++ .../vimms/PlotsForPaper.py | 718 ++++++++ Synthetic data creation scripts/vimms/Roi.py | 336 ++++ .../vimms/SpectralUtils.py | 136 ++ .../vimms/TopNExperiment.py | 147 ++ .../vimms/__init__.py | 1 + .../01. Download Data.ipynb | 1174 +++++++++++++ .../02. MS1 Simulations.ipynb | 321 ++++ .../03. Multiple Samples Example.ipynb | 1503 +++++++++++++++++ .../04. Top-N Simulations.ipynb | 791 +++++++++ .../05. 
Varying N in Top-N Simulations.ipynb | 897 ++++++++++ .../vimms_data_generation/ee.py | 4 + .../vimms_data_generation/intermediate | 106 ++ .../vimms_data_generation/make.py | 762 +++++++++ .../vimms_data_generation/mk.py | 542 ++++++ .../multiple_samples_example.ipynb | 865 ++++++++++ .../vimms_data_generation/prepare-eics | 122 ++ .../vimms_data_generation/validate | 21 + 29 files changed, 12503 insertions(+) create mode 100644 Synthetic data creation scripts/vimms/Chemicals.py create mode 100644 Synthetic data creation scripts/vimms/ChineseRestaurantProcess.py create mode 100644 Synthetic data creation scripts/vimms/Chromatograms.py create mode 100644 Synthetic data creation scripts/vimms/Common.py create mode 100644 Synthetic data creation scripts/vimms/Controller.py create mode 100644 Synthetic data creation scripts/vimms/DIA.py create mode 100644 Synthetic data creation scripts/vimms/DataGenerator.py create mode 100644 Synthetic data creation scripts/vimms/DsDA.py create mode 100644 Synthetic data creation scripts/vimms/Evaluation.py create mode 100644 Synthetic data creation scripts/vimms/MassSpec.py create mode 100644 Synthetic data creation scripts/vimms/MatrixFactorisation.py create mode 100644 Synthetic data creation scripts/vimms/MzmlWriter.py create mode 100644 Synthetic data creation scripts/vimms/PlotsForPaper.py create mode 100644 Synthetic data creation scripts/vimms/Roi.py create mode 100644 Synthetic data creation scripts/vimms/SpectralUtils.py create mode 100644 Synthetic data creation scripts/vimms/TopNExperiment.py create mode 100644 Synthetic data creation scripts/vimms/__init__.py create mode 100644 Synthetic data creation scripts/vimms_data_generation/01. Download Data.ipynb create mode 100644 Synthetic data creation scripts/vimms_data_generation/02. MS1 Simulations.ipynb create mode 100644 Synthetic data creation scripts/vimms_data_generation/03. 
class Formula(object):
    """Parse a chemical formula string into atom counts and an exact mass.

    Only the element symbols listed in ``atom_names`` are recognised. Any
    other symbol in the formula string is ignored and contributes nothing
    to ``mass``.

    Attributes set by the constructor:
        formula_string: the string passed in, unchanged.
        atom_names: the supported element symbols.
        atoms: dict mapping each supported symbol to its total count.
        mass: sum of ``count * mass`` over the supported symbols.
    """

    # Mass used for each supported element symbol. Kept on the class so the
    # table is built once instead of on every ``compute_exact_mass`` call.
    ATOM_MASSES = {
        'C': 12.00000000000, 'H': 1.00782503214, 'O': 15.99491462210, 'N': 14.00307400524,
        'P': 30.97376151200, 'S': 31.97207069000, 'Cl': 34.96885271000, 'I': 126.904468,
        'Br': 78.9183376, 'Si': 27.9769265327, 'F': 18.99840320500, 'D': 2.01410177800,
    }

    def __init__(self, formula_string):
        """Count the supported atoms in ``formula_string`` and compute the mass."""
        self.formula_string = formula_string
        self.atom_names = ['C', 'H', 'N', 'O', 'P', 'S', 'Cl', 'I', 'Br', 'Si', 'F', 'D']
        self.atoms = {atom: self._get_n_element(atom) for atom in self.atom_names}
        self.mass = self._get_mz()

    def _get_mz(self):
        """Return the exact mass of the formula (see ``compute_exact_mass``)."""
        return self.compute_exact_mass()

    def _get_n_element(self, atom_name):
        """Return how many ``atom_name`` atoms the formula string contains.

        Every occurrence of the symbol is counted, so a symbol that appears
        more than once (e.g. ``C2H4C3``) is summed. A symbol with no number
        after it counts as one atom. The negative lookahead ``(?![a-z])``
        stops a one-letter symbol from matching the first letter of a
        two-letter one (``C`` inside ``Cl``, ``S`` inside ``Si``).
        """
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence. findall on no match returns [], whose sum is 0.
        pattern = re.escape(atom_name) + r'(?![a-z])(\d*)'
        counts = re.findall(pattern, self.formula_string)
        return sum(int(count) if count else 1 for count in counts)

    def compute_exact_mass(self):
        """Return the sum of ``count * ATOM_MASSES[symbol]`` over ``self.atoms``."""
        exact_mass = 0.0
        for atom, count in self.atoms.items():
            exact_mass += self.ATOM_MASSES[atom] * count
        return exact_mass

    def __repr__(self):
        return self.formula_string

    def __str__(self):
        return self.formula_string
class Adducts(object):
    """Randomly assign relative intensities to the adducts of one formula.

    The candidate adduct names are the keys of ``POS_TRANSFORMATIONS``.
    """

    def __init__(self, formula, adduct_proportion_cutoff=0.05):
        """Store the formula and the cutoff below which an adduct is dropped."""
        self.adduct_names = list(POS_TRANSFORMATIONS.keys())
        self.formula = formula
        self.adduct_proportion_cutoff = adduct_proportion_cutoff

    def get_adducts(self):
        """Return ``[(adduct_name, proportion), ...]`` for the non-zero adducts.

        Each call draws a fresh random set of proportions.
        """
        names = self._get_adduct_names()
        proportions = self._get_adduct_proportions()
        return [(name, proportion) for name, proportion in zip(names, proportions) if proportion != 0]

    def _get_adduct_proportions(self):
        """Draw one proportion per adduct name and return them as an ndarray.

        Proportions are drawn from a Dirichlet distribution whose prior
        favours the first adduct name, re-drawn until the largest is at
        least 0.2. Values below ``adduct_proportion_cutoff`` are set to 0
        and the rest are scaled so that the largest equals 1.
        """
        # TODO: replace this with something proper
        prior = np.ones(len(self.adduct_names)) * 0.1
        prior[0] = 1.0  # give more weight to the first one, i.e. M+H
        proportions = np.random.dirichlet(prior)
        while max(proportions) < 0.2:
            proportions = np.random.dirichlet(prior)
        proportions[proportions < self.adduct_proportion_cutoff] = 0
        # The original called proportions.tolist() here and discarded the
        # result; the method has always returned an ndarray, so that dead
        # statement is removed rather than changing the return type.
        return proportions / max(proportions)

    def _get_adduct_names(self):
        """Return the list of candidate adduct names."""
        return self.adduct_names
self.formula.formula_string, self.rt, self.max_intensity) + + def __eq__(self, other): + if not isinstance(other, KnownChemical): + return False + return self.formula.formula_string == other.formula.formula_string + + def __hash__(self): + return hash(self.formula.formula_string) + + + +class MSN(Chemical): + """ + ms2+ fragments + """ + + def __init__(self, mz, ms_level, prop_ms2_mass, parent_mass_prop, children=None, parent=None): + self.isotopes = [(mz, None, "MSN")] + self.ms_level = ms_level + self.prop_ms2_mass = prop_ms2_mass + self.parent_mass_prop = parent_mass_prop + self.children = children + self.parent = parent + + def __repr__(self): + return 'MSN Fragment mz=%.4f ms_level=%d' % (self.isotopes[0][0], self.ms_level) + + +class ChemicalCreator(LoggerMixin): + def __init__(self, peak_sampler, ROI_sources=None, database=None): + self.peak_sampler = peak_sampler + self.ROI_sources = ROI_sources + self.database = database + + # sort database compounds by their mass + if self.database is not None: + self.logger.debug('Sorting database compounds by masses') + compound_mass_list = [Formula(compound.chemical_formula).mass for compound in self.database] + sort_index = np.argsort(compound_mass_list) + self.compound_mass_list = np.array(compound_mass_list)[sort_index].tolist() + self.compound_list = np.array(self.database)[sort_index].tolist() + + def sample(self, mz_range, rt_range, min_ms1_intensity, n_ms1_peaks, ms_levels, alpha=math.inf, + fixed_mz=False, adduct_proportion_cutoff=0.05, roi_rt_range=None, include_adducts_isotopes=True, + get_children_method=GET_MS2_BY_PEAKS): + self.mz_range = mz_range + self.rt_range = rt_range + self.min_ms1_intensity = min_ms1_intensity + self.n_ms1_peaks = n_ms1_peaks + self.ms_levels = ms_levels + self.alpha = alpha + self.fixed_mz = fixed_mz + self.adduct_proportion_cutoff = adduct_proportion_cutoff + self.include_adducts_isotopes = include_adducts_isotopes + self.get_children_method = get_children_method + + # set up 
some counters + self.crp_samples = [[] for i in range(self.ms_levels)] + self.crp_index = [[] for i in range(self.ms_levels)] + self.counts = [[] for i in range(self.ms_levels)] + + # Report error if tries to use spectra to generate MS2+ spectra + if get_children_method == GET_MS2_BY_SPECTRA and self.ms_levels > 2: + NotImplementedError("Using spectra to generate MS2+ spectra is not yet implemented") + + # sample from kernel densities + if self.ms_levels > 2: + print("Warning ms_level > 3 not implemented properly yet. Uses scaled ms_level = 2 information for now") + n_ms1 = self._get_n(1) + self.logger.debug("{} chemicals to be created.".format(n_ms1)) + sampled_peaks = self.peak_sampler.get_peak(1, n_ms1, self.mz_range[0][0], self.mz_range[0][1], + self.rt_range[0][0], + self.rt_range[0][1], self.min_ms1_intensity) + # Get formulae from database and check there are enough of them + self.formula_list = self._sample_formulae(sampled_peaks) + + # Get file split information + split = self._get_n_ROI_files() + + # create chemicals + chemicals = [] + # load first ROI file + current_ROI = 0 + ROIs = self._load_ROI_file(current_ROI, roi_rt_range) + ROI_intensities = np.array([r.max_intensity for r in ROIs]) + for i in range(n_ms1): + if i == sum(split[0:(current_ROI + 1)]): + current_ROI += 1 + ROIs = self._load_ROI_file(current_ROI, roi_rt_range) + ROI_intensities = np.array([r.max_intensity for r in ROIs]) + formula = self.formula_list[i] + ROI = ROIs[self._get_ROI_idx(ROI_intensities, sampled_peaks[i].intensity)] + chem = self._get_known_ms1(formula, ROI, sampled_peaks[i], self.include_adducts_isotopes) + if self.fixed_mz: + chem.chromatogram.mzs = [0 for i in range( + len(chem.chromatogram.raw_mzs))] + chem.mzs = [0 for i in range( + len(chem.chromatogram.raw_mzs))] + if ms_levels > 1: + chem.children = self._get_children(self.get_children_method, chem) + chem.type = CHEM_DATA + chemicals.append(chem) + # if i % 100 == 0: + # self.logger.debug("i = {}".format(i)) + 
def _sample_formulae(self, sampled_peaks):
    """Pick one distinct database formula for every sampled peak.

    For each peak, database compounds are ranked by the absolute difference
    between their mass (``self.compound_mass_list``) and the peak's ``mz``,
    and the closest formula not already chosen for an earlier peak is taken.

    :param sampled_peaks: sequence of objects with an ``mz`` attribute
    :return: list of formula strings; entry ``i`` is the formula chosen for
        ``sampled_peaks[i]``
    :raises AssertionError: if there are not more database compounds than peaks
    :raises IndexError: if every database formula has already been used
    """
    assert len(sampled_peaks) < len(self.database), 'The number of sampled peaks must be less than ' \
                                                    'the number of database compounds'
    # compound_mass_list is stored as a plain list; convert once so the
    # subtraction below also works when a peak's mz is a plain Python float.
    masses = np.asarray(self.compound_mass_list, dtype=float)
    n_peaks = len(sampled_peaks)
    used = set()
    # Build the result as a list in peak order. Returning list(set) loses
    # the pairing between sampled_peaks[i] and its matched formula.
    chosen = []
    for formula_index, peak in enumerate(sampled_peaks):
        if formula_index % 500 == 0:
            self.logger.debug('Sampling formula %d/%d' % (formula_index, n_peaks))

        for pos in np.argsort(np.abs(masses - peak.mz)):
            candidate = str(self.compound_list[pos].chemical_formula)
            if candidate not in used:
                used.add(candidate)
                chosen.append(candidate)
                break
        else:
            raise IndexError('No unused database formula left for sampled peak %d' % formula_index)
    return chosen
get_children_method == GET_MS2_BY_SPECTRA: + kids = self._get_children_spectra(parent) + return kids + elif get_children_method == GET_MS2_BY_PEAKS: + kids = self._get_children_sample(parent, n_peaks) + return kids + # TODO: add ability to get children through prediction from parent formula + # will need to add a default if MS2+ is requested + else: + raise ValueError("'get_children_method' must be either 'spectra' or 'sample'") + + def _get_children_spectra(self, parent): + # spectra is a list containing one MassSpec.Scan object + + spectra = self.peak_sampler.get_ms2_spectra()[0] + kids = [] + return kids + intensity_props = self._get_msn_proportions(None, None, spectra.intensities) + parent_mass_prop = self.peak_sampler.get_parent_intensity_proportion() + for i in range(len(spectra.mzs)): + kid = MSN(spectra.mzs[i], spectra.ms_level, intensity_props[i], parent_mass_prop, None, parent) + kids.append(kid) + return kids + + def _get_children_sample(self, parent, n_peaks=None): + children_ms_level = parent.ms_level + 1 + if n_peaks is None: + n_peaks = self._get_n(children_ms_level) + kids = [] + parent_mass_prop = self.peak_sampler.get_parent_intensity_proportion() + kids_intensity_proportions = self._get_msn_proportions(children_ms_level, n_peaks) + if self.alpha < math.inf: + # draws from here if using Chinese Restaurant Process (SLOW!!!) 
def _get_msn_proportions(self, children_ms_level=None, n_peaks=None, children_intensities=None):
    """Return the share of the total intensity carried by each child fragment.

    :param children_ms_level: accepted for interface compatibility; sampling
        always uses level 2, as both branches of the original code did
    :param n_peaks: number of peaks to sample when no intensities are given
    :param children_intensities: optional sequence of intensities; when
        supplied, nothing is sampled
    :return: a list of proportions when peaks are sampled, an ndarray when
        ``children_intensities`` is supplied
    """
    if children_intensities is None:
        # The original branched on children_ms_level == 2, but both branches
        # ended up calling get_peak(2, n_peaks).
        sampled = self.peak_sampler.get_peak(2, n_peaks)
        total = sum(peak.intensity for peak in sampled)
        return [peak.intensity / total for peak in sampled]

    # asarray lets a plain list be divided as well; an ndarray passes through.
    intensities = np.asarray(children_intensities)
    return intensities / sum(intensities)
int(self.n_ms1_peaks) + elif ms_level == 2: + return int(self.peak_sampler.n_peaks(2, 1)) + else: + return int(math.floor(self.peak_sampler.n_peaks(2, 1) / (5 ** (ms_level - 2)))) + + def _get_known_ms1(self, formula, ROI, sampled_peak, include_adducts_isotopes): # fix this + ## from sampled_peak.rt (XCMS output), we get the point where maximum intensity occurs + ## so when convering ROI to chemicals, we want to adjust the RT to align it with the point where max intensity occurs + rt = sampled_peak.rt + min2mid_rt_ROI = list(ROI.chromatogram.rts[np.where(ROI.chromatogram.intensities == 1)])[0] + adjusted_rt = rt - min2mid_rt_ROI + intensity = sampled_peak.intensity + formula = Formula(formula) + isotopes = Isotopes(formula) + adducts = Adducts(formula, self.adduct_proportion_cutoff) + return KnownChemical(formula, isotopes, adducts, adjusted_rt, intensity, ROI.chromatogram, None, include_adducts_isotopes) + + def _get_unknown_msn(self, ms_level, parent=None): # fix this + if ms_level == 2: + mz = self.peak_sampler.get_peak(ms_level, 1)[0].mz + else: + mz = self.peak_sampler.get_peak(2, 1)[0].mz + return MSN(mz, ms_level, None, None, None, parent) + + def _valid_ms1_chem(self, chem): + if chem.max_intensity < self.min_ms1_intensity: + return False + elif chem.rt < self.rt_range[0][0]: + return False + elif chem.rt > self.rt_range[0][1]: + return False + return True + + +class MultiSampleCreator(LoggerMixin): + + def __init__(self, original_dataset, n_samples, classes, intensity_noise_sd, + change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities=None, + dropout_numbers=None, experimental_classes=None, experimental_probabilitities=None, + experimental_sds=None, save_location=None): + self.original_dataset = original_dataset + self.n_samples = n_samples + self.classes = classes + self.intensity_noise_sd = intensity_noise_sd + self.change_probabilities = change_probabilities + self.change_differences_means = change_differences_means 
+ self.change_differences_sds = change_differences_sds + self.dropout_probabilities = dropout_probabilities + self.dropout_numbers = dropout_numbers + self.experimental_classes = experimental_classes + self.experimental_probabilitities = experimental_probabilitities + self.experimental_sds = experimental_sds + self.save_location = save_location + + self.sample_classes = [] + for index_classes in range(len(self.classes)): + self.sample_classes.extend([self.classes[index_classes] for i in range(n_samples[index_classes])]) + self.chemical_statuses = self._get_chemical_statuses() + self.chemical_differences_from_class1 = self._get_chemical_differences_from_class1() + if self.experimental_classes is not None: + self.sample_experimental_statuses = self._get_experimental_statuses() + self.experimental_effects = self._get_experimental_effects() + self.logger.debug("Classes, Statuses and Differences defined.") + + self.samples = [] + for index_sample in range(sum(self.n_samples)): + self.logger.debug("Dataset {} of {} created.".format(index_sample + 1, sum(self.n_samples))) + new_sample = copy.deepcopy(self.original_dataset) + which_class = np.where(np.array(self.classes) == self.sample_classes[index_sample]) + for index_chemical in range(len(new_sample)): + if not np.array(self.chemical_statuses)[which_class][0][index_chemical] == "missing": + original_intensity = new_sample[index_chemical].max_intensity + intensity = self._get_intensity(original_intensity, which_class, index_chemical) + adjusted_intensity = self._get_experimental_factor_effect(intensity, index_sample, index_chemical) + noisy_adjusted_intensity = self._get_noisy_intensity(adjusted_intensity) + new_sample[index_chemical].max_intensity = noisy_adjusted_intensity.tolist()[0] + chemicals_to_keep = np.where((np.array(self.chemical_statuses)[which_class][0]) != "missing") + new_sample = np.array(new_sample)[chemicals_to_keep].tolist() + if self.save_location is not None: + save_obj(new_sample, 
def _get_missing_chemicals(self, chemical_statuses):
    """Choose, for each class, the indices of chemicals missing from it.

    One list of indices into ``self.original_dataset`` is produced per entry
    of ``chemical_statuses``:

    * with ``dropout_probabilities`` set, each chemical of class ``i`` is
      dropped independently with probability ``dropout_probabilities[i]``;
    * otherwise, with ``dropout_numbers`` set, exactly that many chemicals
      are dropped from every class, chosen at random;
    * with neither set, nothing is dropped.

    Each class gets an independent draw. The original de-duplicated the
    draws through a set, which never terminated when two classes drew the
    same indices (e.g. probability 0) and lost the class order.

    :param chemical_statuses: one status array per class; only its length is used
    :return: list of sorted index lists, one per class
    """
    n_classes = len(chemical_statuses)
    n_chemicals = len(self.original_dataset)
    use_probabilities = self.dropout_probabilities is not None

    if not use_probabilities and self.dropout_numbers is None:
        # Both settings default to None in the constructor; the original
        # hit an unbound local variable in that case.
        return [[] for _ in range(n_classes)]
    if use_probabilities and self.dropout_numbers is not None:
        print("using dropout_probabilties rather than dropout_number.")

    missing = []
    for index_class in range(n_classes):
        if use_probabilities:
            dropped = np.random.binomial(1, self.dropout_probabilities[index_class], n_chemicals)
            new_missing = np.flatnonzero(dropped).tolist()
        else:
            new_missing = sorted(random.sample(range(n_chemicals), self.dropout_numbers))
        missing.append(new_missing)
    return missing
def _get_experimental_factor_effect(self, intensity, index_sample, index_chemical):
    """Add the effect of every experimental factor to ``intensity``.

    For each factor, the sample's status is looked up in
    ``self.sample_experimental_statuses``, matched against the factor's
    classes, and the matching coefficient for this chemical from
    ``self.experimental_effects`` is added.

    :param intensity: intensity before experimental effects
    :param index_sample: index of the sample being generated
    :param index_chemical: index of the chemical within the dataset
    :return: ``intensity`` unchanged when no experimental classes are
        configured; otherwise ``intensity`` plus the summed effects (the
        effects are selected with ``np.where`` indices, so this is an array)
    """
    # Identity test: '== None' is evaluated element-wise when the classes
    # are an ndarray, and the resulting array cannot be used as a condition.
    if self.experimental_classes is None:
        return intensity

    experimental_factor_effect = 0.0
    for index_factor, factor_classes in enumerate(self.experimental_classes):
        status = self.sample_experimental_statuses[index_factor][index_sample]
        which_experimental_class = np.where(np.array(factor_classes) == status)
        experimental_factor_effect += self.experimental_effects[index_factor][index_chemical][
            which_experimental_class]
    return intensity + experimental_factor_effect
def discrete_draw(p):
    """Sample an index with probability proportional to the weights in ``p``.

    :param p: sequence of non-negative weights; they need not sum to one
    :return: the sampled index as a Python int
    """
    total = sum(p)
    probs = [float(z) / total for z in p]
    # One draw is enough: the original drew twice and threw the first away.
    draw = np.random.multinomial(1, probs)
    # The draw is one-hot, so argmax is the index of the single 1. This
    # also avoids calling int() on a one-element array.
    return int(np.argmax(draw))


def Restricted_Crp(alpha, previous_counts, previous_ms2, len_current_ms2):
    """Draw a table from a Chinese Restaurant Process, skipping current-sample tables.

    Tables that appear in the last ``len_current_ms2`` entries of
    ``previous_ms2`` (the draws already made for the current sample) get
    probability zero, so a sample never receives the same table twice.

    :param alpha: weight given to opening a new table
    :param previous_counts: list with the number of draws per existing
        table; updated in place with the new draw
    :param previous_ms2: list of all table indices drawn so far
    :param len_current_ms2: how many trailing entries of ``previous_ms2``
        belong to the current sample
    :return: ``(table_index, counts)``; ``counts`` is ``previous_counts``
        itself, except for the very first draw, which returns ``(0, [1])``
    """
    n = len(previous_ms2)
    if n == 0:
        return 0, [1]

    # previous_ms2[-0:] is the WHOLE list, not an empty one, so the
    # zero-length case has to be handled explicitly. Without this guard the
    # first draw of every later sample had all existing tables excluded.
    if len_current_ms2 > 0:
        excluded = set(previous_ms2[-len_current_ms2:])
    else:
        excluded = set()

    # discrete_draw renormalises the weights, so this denominator only
    # scales them; it is kept as in the original.
    denominator = n - 1 + alpha
    assign_probs = [0 if table in excluded else count / denominator
                    for table, count in enumerate(previous_counts)]
    assign_probs.append(alpha / denominator)

    next_crp = discrete_draw(assign_probs)
    if next_crp == len(previous_counts):
        previous_counts.append(1)
    else:
        previous_counts[next_crp] += 1
    return next_crp, previous_counts
self._get_distance(query_rt) + + def get_relative_mz(self, query_rt): + if not self._rt_match(query_rt): + return None + else: + neighbours_which = self._get_rt_neighbours_which(query_rt) + mz_below = self.mzs[neighbours_which[0]] + mz_above = self.mzs[neighbours_which[1]] + return mz_below + (mz_above - mz_below) * self._get_distance(query_rt) + + def _get_rt_neighbours(self, query_rt): + which_rt_below, which_rt_above = self._get_rt_neighbours_which(query_rt) + rt_below = self.rts[which_rt_below] + rt_above = self.rts[which_rt_above] + return [rt_below, rt_above] + + def _get_rt_neighbours_which(self, query_rt): + # find the max index of self.rts smaller than query_rt + pos = np.where(self.rts <= query_rt)[0] + which_rt_below = pos[-1] + + # take the min index of self.rts larger than query_rt + pos = np.where(self.rts > query_rt)[0] + which_rt_above = pos[0] + return [which_rt_below, which_rt_above] + + def _get_distance(self, query_rt): + rt_below, rt_above = self._get_rt_neighbours(query_rt) + return (query_rt - rt_below) / (rt_above - rt_below) + + def _rt_match(self, query_rt): + return self.min_rt < query_rt < self.max_rt + + def __eq__(self, other): + if not isinstance(other, EmpiricalChromatogram): + # don't attempt to compare against unrelated types + return NotImplemented + + return np.array_equal(sorted(self.raw_mzs), sorted(other.raw_mzs)) and \ + np.array_equal(sorted(self.raw_rts), sorted(other.raw_rts)) and \ + np.array_equal(sorted(self.raw_intensities), sorted(other.raw_intensities)) + + +# Make this more generalisable. Make scipy.stats... 
# TODO: make this more generalisable by accepting a scipy.stats distribution as
# input; however, that makes it difficult to apply the cutoff.
class FunctionalChromatogram(Chromatogram):
    """
    Functional Chromatograms to be used within Chemicals

    The peak shape is the pdf of a frozen scipy.stats distribution. A fraction
    ``cutoff / 2`` of probability mass is trimmed from each tail; relative
    retention time 0 corresponds to the lower trimmed quantile.
    """

    def __init__(self, distribution, parameters, cutoff=0.01):
        """
        :param distribution: one of "normal", "gamma" or "uniform"
        :param parameters: positional arguments forwarded to the scipy.stats
            distribution (two values for normal/uniform, three for gamma)
        :param cutoff: total probability mass trimmed from the two tails
        :raises NotImplementedError: for any other distribution name
        """
        self.cutoff = cutoff
        self.mz = 0
        if distribution == "normal":
            self.distrib = scipy.stats.norm(parameters[0], parameters[1])
        elif distribution == "gamma":
            self.distrib = scipy.stats.gamma(parameters[0], parameters[1], parameters[2])
        elif distribution == "uniform":
            self.distrib = scipy.stats.uniform(parameters[0], parameters[1])
        else:
            raise NotImplementedError("distribution not implemented")
        self.min_rt = 0
        # width of the retained (untrimmed) part of the distribution
        self.max_rt = self.distrib.ppf(1 - (self.cutoff / 2)) - self.distrib.ppf(self.cutoff / 2)

    def get_relative_intensity(self, query_rt):
        """Return the rescaled pdf at ``query_rt``, or None outside the RT range."""
        if not self._rt_match(query_rt):
            return None
        # shift the query back onto the distribution's own axis, then rescale
        # by the mass that survived the trimming
        return (self.distrib.pdf(query_rt + self.distrib.ppf(self.cutoff / 2)) * (1 / (1 - self.cutoff)))

    def get_relative_mz(self, query_rt):
        """Return the constant relative m/z, or None outside the RT range."""
        if not self._rt_match(query_rt):
            return None
        return self.mz

    def _rt_match(self, query_rt):
        # bounds are inclusive; reuse the range computed once in __init__
        # instead of evaluating two quantiles on every query
        return not (query_rt < self.min_rt or query_rt > self.max_rt)
def chromatogramDensityNormalisation(rts, intensities):
    """
    Definition to standardise the area under a chromatogram to 1.

    The area is computed with the trapezoid rule: for each pair of consecutive
    points, the mean of the two intensities times the RT step between them.

    :param rts: retention times, in the order the points should be joined
    :param intensities: intensities, parallel to ``rts``
    :return: a list of the intensities divided by the area
    """
    area = 0.0
    for rt_lo, rt_hi, it_lo, it_hi in zip(rts, rts[1:], intensities, intensities[1:]):
        # trapezoid = mean height * width (the width must multiply, not divide)
        area += ((it_lo + it_hi) / 2) * (rt_hi - rt_lo)
    new_intensities = [x * (1 / area) for x in intensities]
    return new_intensities
def find_nearest_index_in_array(array, value):
    '''
    Finds index in array where the value is the nearest
    :param array: the values to search (any array-like, e.g. list or ndarray)
    :param value: the value to look for
    :return: the index of the element with the smallest absolute difference
    to value; the first such index when several are equally close
    '''
    # np.asarray lets plain lists/tuples through; `list - scalar` would raise
    idx = (np.abs(np.asarray(array) - value)).argmin()
    return idx
def roughly_equal(a, b, tol=0.00075):
    """Return True when a and b differ by at most tol in absolute value."""
    return abs(a - b) <= tol


class FloatSet:
    """
    A collection of floats in which values closer than 0.05 count as duplicates.

    Values are only appended by ``add``. Sorting and de-duplication happen
    each time iteration starts: a value is dropped when it is within 0.05 of
    the previously kept (smaller) value.
    """

    def __init__(self, initial=None):
        """
        :param initial: optional iterable of values to add
        """
        self.values = []
        self.length = 0
        self.idx = 0
        if initial is not None:
            for vv in initial:
                self.add(vv)

    def add(self, nval):
        """Append a value; duplicates are removed when iteration starts."""
        self.values.append(nval)

    def __iter__(self):
        self.values.sort()
        kept = []
        for vv in self.values:
            # `not kept` covers the first value and makes an empty set safe
            if not kept or not roughly_equal(vv, kept[-1], 0.05):
                kept.append(vv)
        self.values = kept
        self.length = len(kept)
        # rewind the cursor so the set can be iterated more than once
        self.idx = 0
        return self

    def __next__(self):
        if self.idx < self.length:
            result = self.values[self.idx]
            self.idx += 1
            return result
        else:
            raise StopIteration
self._process_scan(scan) + self._update_parameters(scan) + + def handle_acquisition_open(self): + raise NotImplementedError() + + def handle_acquisition_closing(self): + raise NotImplementedError() + + def write_mzML(self, analysis_name, outfile): + writer = MzmlWriter(analysis_name, self.scans, precursor_information=self.mass_spec.precursor_information) + writer.write_mzML(outfile) + + def _process_scan(self, scan): + raise NotImplementedError() + + def _update_parameters(self, scan): + raise NotImplementedError() + + def run(self, min_time, max_time, progress_bar=True): + raise NotImplementedError() + + def _plot_scan(self, scan): + if self.make_plot: + plt.figure() + for i in range(scan.num_peaks): + x1 = scan.mzs[i] + x2 = scan.mzs[i] + y1 = 0 + y2 = scan.intensities[i] + a = [[x1, y1], [x2, y2]] + plt.plot(*zip(*a), marker='', color='r', ls='-', lw=1) + plt.title('Scan {0} {1}s -- {2} peaks'.format(scan.scan_id, scan.rt, scan.num_peaks)) + plt.show() + + +class SimpleMs1Controller(Controller): + # CJ.. 
this is the one we are actually using + def __init__(self, mass_spec, upper_mz=None): + super().__init__(mass_spec) + default_scan = ScanParameters() + default_scan.set(ScanParameters.MS_LEVEL, 1) + if upper_mz: + default_scan.set(ScanParameters.ISOLATION_WINDOWS, [[(0, upper_mz) ]]) + else: + default_scan.set(ScanParameters.ISOLATION_WINDOWS, [[DEFAULT_MS1_SCAN_WINDOW]]) + + self.eics = list() + + mass_spec.reset() + mass_spec.current_N = 0 + mass_spec.current_DEW = 0 + + mass_spec.set_repeating_scan(default_scan) + mass_spec.register(IndependentMassSpectrometer.MS_SCAN_ARRIVED, self.handle_scan) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_OPENING, self.handle_acquisition_open) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_CLOSING, self.handle_acquisition_closing) + + def peak_recorder( self ): + return self.mass_spec.peak_recorder + + def run(self, min_time, max_time, progress_bar=True): + if progress_bar: + with tqdm(total=max_time - min_time, initial=0) as pbar: + self.mass_spec.run(min_time, max_time, pbar=pbar) + else: + self.mass_spec.run(min_time, max_time ) + + def handle_acquisition_open(self): + self.logger.info('Acquisition open') + + def handle_acquisition_closing(self): + self.logger.info('Acquisition closing') + + def _process_scan(self, scan): + if scan.num_peaks > 0: + self.logger.info('Time %f Received %s' % (self.mass_spec.time, scan)) + self._plot_scan(scan) + + def _update_parameters(self, scan): + pass # do nothing + + def create_eics( self, mapper ): + # TODO should probably check for the ms level + eics = [] + for chem, values in self.peak_recorder().items(): + # so we loop through all of the peaks and + # determine the unique mzs + unique_mzs = FloatSet( [ pr[1] for pr in values ] ) + for currmz in unique_mzs: + sliced = list(filter(lambda pr: roughly_equal( pr[1], currmz, 0.05), values )) + curr_rts = np.array( mapper.rts() ) + curr_mz = np.repeat( currmz, mapper.length() ) + curr_it = 
class Precursor(object):
    """
    Plain record describing the precursor ion chosen for a fragmentation scan:
    its m/z, its intensity, its charge and the id of the scan it came from.
    """

    def __init__(self, precursor_mz, precursor_intensity, precursor_charge, precursor_scan_id):
        (self.precursor_mz,
         self.precursor_intensity,
         self.precursor_charge,
         self.precursor_scan_id) = (precursor_mz, precursor_intensity, precursor_charge, precursor_scan_id)

    def __str__(self):
        # %f for the two measurements, %d for charge and scan id
        template = 'Precursor mz %f intensity %f charge %d scan_id %d'
        fields = (self.precursor_mz, self.precursor_intensity, self.precursor_charge, self.precursor_scan_id)
        return template % fields
again + self.rt_tol = rt_tol # the rt window to prevent the same precursor ion to be fragmented again + self.min_ms1_intensity = min_ms1_intensity # minimum ms1 intensity to fragment + + mass_spec.reset() + mass_spec.current_N = N + mass_spec.current_DEW = rt_tol + + default_scan = ScanParameters() + default_scan.set(ScanParameters.MS_LEVEL, 1) + default_scan.set(ScanParameters.ISOLATION_WINDOWS, [[DEFAULT_MS1_SCAN_WINDOW]]) + mass_spec.set_repeating_scan(default_scan) + + # register new event handlers under this controller + mass_spec.register(IndependentMassSpectrometer.MS_SCAN_ARRIVED, self.handle_scan) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_OPENING, self.handle_acquisition_open) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_CLOSING, self.handle_acquisition_closing) + + def run(self, min_time=None, max_time=None, progress_bar=True): + if min_time is None and max_time is None: + min_time = self.mass_spec.schedule["targetTime"].values[0] + max_time = self.mass_spec.schedule["targetTime"].values[-1] + if progress_bar: + with tqdm(total=max_time - min_time, initial=0) as pbar: + self.mass_spec.run(min_time, max_time, pbar=pbar) + else: + self.mass_spec.run(min_time, max_time) + + def handle_acquisition_open(self): + self.logger.info('Time %f Acquisition open' % self.mass_spec.time) + + def handle_acquisition_closing(self): + self.logger.info('Time %f Acquisition closing' % self.mass_spec.time) + + def _process_scan(self, scan): + self.logger.info('Time %f Received from mass spec %s' % (self.mass_spec.time, scan)) + if scan.ms_level == 1: # we get an ms1 scan, if it has a peak, then store it for fragmentation next time + if scan.num_peaks > 0: + self.last_ms1_scan = scan + else: + self.last_ms1_scan = None + + elif scan.ms_level == 2: # if we get ms2 scan, then do something with it + # scan.filter_intensity(self.min_ms2_intensity) + if scan.num_peaks > 0: + self._plot_scan(scan) + + def _update_parameters(self, 
scan): + + # if there's a previous ms1 scan to process + if self.last_ms1_scan is not None: + + mzs = self.last_ms1_scan.mzs + intensities = self.last_ms1_scan.intensities + rt = self.last_ms1_scan.rt + + # loop over points in decreasing intensity + fragmented_count = 0 + idx = np.argsort(intensities)[::-1] + for i in idx: + mz = mzs[i] + intensity = intensities[i] + + # stopping criteria is after we've fragmented N ions or we found ion < min_intensity + if fragmented_count >= self.N: + self.logger.debug('Time %f Top-%d ions have been selected' % (self.mass_spec.time, self.N)) + break + + if intensity < self.min_ms1_intensity: + self.logger.debug( + 'Time %f Minimum intensity threshold %f reached at %f, %d' % ( + self.mass_spec.time, self.min_ms1_intensity, intensity, fragmented_count)) + break + + # skip ion in the dynamic exclusion list of the mass spec + if self.mass_spec.is_excluded(mz, rt): + continue + + # send a new ms2 scan parameter to the mass spec + dda_scan_params = ScanParameters() + dda_scan_params.set(ScanParameters.MS_LEVEL, 2) + + # create precursor object, assume it's all singly charged + precursor_charge = +1 if (self.mass_spec.ionisation_mode == POSITIVE) else -1 + precursor = Precursor(precursor_mz=mz, precursor_intensity=intensity, + precursor_charge=precursor_charge, precursor_scan_id=self.last_ms1_scan.scan_id) + mz_lower = mz - self.isolation_window # Da + mz_upper = mz + self.isolation_window # Da + isolation_windows = [[(mz_lower, mz_upper)]] + dda_scan_params.set(ScanParameters.ISOLATION_WINDOWS, isolation_windows) + dda_scan_params.set(ScanParameters.PRECURSOR, precursor) + + # save dynamic exclusion parameters too + dda_scan_params.set(ScanParameters.DYNAMIC_EXCLUSION_MZ_TOL, self.mz_tol) + dda_scan_params.set(ScanParameters.DYNAMIC_EXCLUSION_RT_TOL, self.rt_tol) + + # push this dda scan parameter to the mass spec queue + self.mass_spec.add_to_processing_queue(dda_scan_params) + fragmented_count += 1 + + for param in 
self.mass_spec.get_processing_queue(): + precursor = param.get(ScanParameters.PRECURSOR) + if precursor is not None: + self.logger.debug('- %s' % str(precursor)) + + # set this ms1 scan as has been processed + self.last_ms1_scan = None + + +class TreeController(Controller): + def __init__(self, mass_spec, dia_design, window_type, kaufmann_design, extra_bins, num_windows=None): + super().__init__(mass_spec) + self.last_ms1_scan = None + self.dia_design = dia_design + self.window_type = window_type + self.kaufmann_design = kaufmann_design + self.extra_bins = extra_bins + self.num_windows = num_windows + + mass_spec.reset() + default_scan = ScanParameters() + default_scan.set(ScanParameters.MS_LEVEL, 1) + default_scan.set(ScanParameters.ISOLATION_WINDOWS, [[DEFAULT_MS1_SCAN_WINDOW]]) + mass_spec.set_repeating_scan(default_scan) + + mass_spec.register(IndependentMassSpectrometer.MS_SCAN_ARRIVED, self.handle_scan) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_OPENING, self.handle_acquisition_open) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_CLOSING, self.handle_acquisition_closing) + + def run(self, min_time, max_time, progress_bar=True): + if progress_bar: + with tqdm(total=max_time - min_time, initial=0) as pbar: + self.mass_spec.run(min_time, max_time, pbar=pbar) + else: + self.mass_spec.run(min_time, max_time) + + def handle_acquisition_open(self): + self.logger.info('Acquisition open') + + def handle_acquisition_closing(self): + self.logger.info('Acquisition closing') + + def _process_scan(self, scan): + self.logger.info('Received scan {}'.format(scan)) + if scan.ms_level == 1: # if we get a non-empty ms1 scan + if scan.num_peaks > 0: + self.last_ms1_scan = scan + else: + self.last_ms1_scan = None + + elif scan.ms_level == 2: # if we get ms2 scan, then do something with it + if scan.num_peaks > 0: + self._plot_scan(scan) + + def _update_parameters(self, scan): + + # if there's a previous ms1 scan to process + if 
self.last_ms1_scan is not None: + + rt = self.last_ms1_scan.rt + + # then get the last ms1 scan, select bin walls and create scan locations + mzs = self.last_ms1_scan.mzs + default_range = [DEFAULT_MS1_SCAN_WINDOW] # TODO: this should maybe come from somewhere else? + locations = DiaWindows(mzs, default_range, self.dia_design, self.window_type, self.kaufmann_design, + self.extra_bins, self.num_windows).locations + self.logger.debug('Window locations {}'.format(locations)) + for i in range(len(locations)): # define isolation window around the selected precursor ions + isolation_windows = locations[i] + dda_scan_params = ScanParameters() + dda_scan_params.set(ScanParameters.MS_LEVEL, 2) + dda_scan_params.set(ScanParameters.ISOLATION_WINDOWS, isolation_windows) + self.mass_spec.add_to_processing_queue(dda_scan_params) # push this dda scan to the mass spec queue + + # set this ms1 scan as has been processed + self.last_ms1_scan = None + + +class KaufmannWindows(object): + """ + Method for creating window designs based on Kaufmann paper - https://www.ncbi.nlm.nih.gov/pubmed/27188447 + """ + + def __init__(self, bin_walls, bin_walls_extra, kaufmann_design, extra_bins=0): + self.locations = [] + if kaufmann_design == "nested": + n_locations_internal = 4 + for i in range(0, 8): + self.locations.append([[(bin_walls[(0 + i * 8)], bin_walls[(8 + i * 8)])]]) + elif kaufmann_design == "tree": + n_locations_internal = 3 + self.locations.append([[(bin_walls[0], bin_walls[32])]]) + self.locations.append([[(bin_walls[32], bin_walls[64])]]) + self.locations.append([[(bin_walls[16], bin_walls[48])]]) + self.locations.append([[(bin_walls[8], bin_walls[24]), (bin_walls[40], bin_walls[56])]]) + else: + raise ValueError("not a valid design") + locations_internal = [[[]] for i in range(n_locations_internal + extra_bins)] + for i in range(0, 4): + locations_internal[0][0].append((bin_walls[(4 + i * 16)], bin_walls[(12 + i * 16)])) + locations_internal[1][0].append((bin_walls[(2 + i * 16)], 
class DiaWindows(object):
    """
    Create DIA window design

    The MS1 range is cut into ``num_windows`` bins, either evenly or at
    percentiles of the observed MS1 m/z values. The outermost walls are pushed
    outwards by ``range_slack`` times the width of the MS1 range. The result is
    stored in ``self.locations``.
    """

    def __init__(self, ms1_mzs, ms1_range, dia_design, window_type, kaufmann_design, extra_bins, num_windows=None,
                 range_slack=0.01):
        """
        :param ms1_mzs: m/z values of the MS1 scan (only read for "percentile")
        :param ms1_range: sequence whose first item is the (lower, upper) MS1 range
        :param dia_design: "basic" or "kaufmann"
        :param window_type: "even" or "percentile"
        :param kaufmann_design: forwarded to KaufmannWindows for the "kaufmann" design
        :param extra_bins: number of extra bin levels; must be 0 for "basic"
        :param num_windows: number of windows for "basic"; forced to 64 for "kaufmann"
        :param range_slack: fraction of the MS1 range added beyond each outer wall
        :raises ValueError: on an unknown design or window type, on extra bins
            with the "basic" design, or when "basic" is given no num_windows
        """
        # dont allow extra bins for basic method
        if dia_design == "basic" and extra_bins > 0:
            # raise instead of sys.exit(): library code must not kill the process
            raise ValueError("Cannot have extra bins with 'basic' dia design.")
        if window_type not in ("even", "percentile"):
            raise ValueError("Incorrect window_type selected. Must be 'even' or 'percentile'.")
        if dia_design not in ("basic", "kaufmann"):
            raise ValueError("Incorrect dia_design selected. Must be 'basic' or 'kaufmann'.")
        # set the number of windows for kaufmann method
        if dia_design == "kaufmann":
            num_windows = 64
        elif num_windows is None:
            raise ValueError("num_windows must be given for the 'basic' dia design.")

        lower = ms1_range[0][0]
        ms1_range_difference = ms1_range[0][1] - lower
        slack = range_slack * ms1_range_difference

        # find bin walls and extra bin walls
        internal_bin_walls = self._bin_walls(window_type, ms1_mzs, lower, ms1_range_difference, num_windows, slack)
        internal_bin_walls_extra = None
        if extra_bins > 0:
            internal_bin_walls_extra = self._bin_walls(window_type, ms1_mzs, lower, ms1_range_difference,
                                                       num_windows * (2 ** extra_bins), slack)

        # convert bin walls and extra bin walls into locations to scan
        if dia_design == "basic":
            self.locations = [[[(internal_bin_walls[window_index], internal_bin_walls[window_index + 1])]]
                              for window_index in range(num_windows)]
        else:
            self.locations = KaufmannWindows(internal_bin_walls, internal_bin_walls_extra, kaufmann_design,
                                             extra_bins).locations

    @staticmethod
    def _bin_walls(window_type, ms1_mzs, lower, span, n_bins, slack):
        """Return the n_bins + 1 bin walls, with the two outer walls widened by slack."""
        if window_type == "even":
            walls = [lower] + [lower + ((window_index + 1) / n_bins) * span for window_index in range(n_bins)]
        else:
            # linspace ends on exactly 100; a float-step arange can overshoot,
            # and np.percentile rejects percentiles above 100
            walls = np.percentile(ms1_mzs, np.linspace(0, 100, n_bins + 1)).tolist()
        walls[0] = walls[0] - slack
        walls[-1] = walls[-1] + slack
        return walls
self.mass_spec.add_to_processing_queue(dda_scan_params) # push this scan to the mass spec queue + + if progress_bar: + with tqdm(total=target_time, + initial=0) as pbar: + self.mass_spec.run(self.schedule, pbar=pbar) + else: + self.mass_spec.run(self.schedule) + + def handle_acquisition_open(self): + self.logger.info('Acquisition open') + + def handle_acquisition_closing(self): + self.logger.info('Acquisition closing') + + def _process_scan(self, scan): + self.logger.info('Received {}'.format(scan)) + if scan.ms_level == 1: # we get an ms1 scan, store it for fragmentation next time + self.last_ms1_scan = scan + elif scan.ms_level == 2: # if we get ms2 scan, then do something with it + # scan.filter_intensity(self.min_ms2_intensity) + if scan.num_peaks > 0: + self._plot_scan(scan) + + def _update_parameters(self, scan): + pass + + +class HybridController(Controller): + def __init__(self, mass_spec, N, scan_param_changepoints, isolation_window, mz_tol, rt_tol, min_ms1_intensity, + n_purity_scans=None, purity_shift=None, purity_threshold=0): + super().__init__(mass_spec) + self.last_ms1_scan = None + self.N = np.array(N) + if scan_param_changepoints is not None: + self.scan_param_changepoints = np.array([0] + scan_param_changepoints) + else: + self.scan_param_changepoints = np.array([0]) + self.isolation_window = np.array(isolation_window) # the isolation window (in Dalton) to select a precursor ion + self.mz_tol = np.array(mz_tol) # the m/z window (ppm) to prevent the same precursor ion to be fragmented again + self.rt_tol = np.array(rt_tol) # the rt window to prevent the same precursor ion to be fragmented again + self.min_ms1_intensity = min_ms1_intensity # minimum ms1 intensity to fragment + + self.n_purity_scans = n_purity_scans + self.purity_shift = purity_shift + self.purity_threshold = purity_threshold + + # make sure the input are all correct + assert len(self.N) == len(self.scan_param_changepoints) == len(self.isolation_window) == len(self.mz_tol) == 
len(self.rt_tol) + if self.purity_threshold != 0: + assert all(self.n_purity_scans < np.array(self.N)) + + mass_spec.reset() + current_N, current_rt_tol, idx = self._get_current_N_DEW() + mass_spec.current_N = current_N + mass_spec.current_DEW = current_rt_tol + + default_scan = ScanParameters() + default_scan.set(ScanParameters.MS_LEVEL, 1) + default_scan.set(ScanParameters.ISOLATION_WINDOWS, [[DEFAULT_MS1_SCAN_WINDOW]]) + mass_spec.set_repeating_scan(default_scan) + + # register new event handlers under this controller + mass_spec.register(IndependentMassSpectrometer.MS_SCAN_ARRIVED, self.handle_scan) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_OPENING, self.handle_acquisition_open) + mass_spec.register(IndependentMassSpectrometer.ACQUISITION_STREAM_CLOSING, self.handle_acquisition_closing) + + def run(self, min_time=None, max_time=None, progress_bar=True): + if min_time is None and max_time is None: + min_time = self.mass_spec.schedule["targetTime"].values[0] + max_time = self.mass_spec.schedule["targetTime"].values[-1] + if progress_bar: + with tqdm(total=max_time - min_time, initial=0) as pbar: + self.mass_spec.run(min_time, max_time, pbar=pbar) + else: + self.mass_spec.run(min_time, max_time) + + def handle_acquisition_open(self): + self.logger.info('Time %f Acquisition open' % self.mass_spec.time) + + def handle_acquisition_closing(self): + self.logger.info('Time %f Acquisition closing' % self.mass_spec.time) + + def _process_scan(self, scan): + self.logger.info('Time %f Received from mass spec %s' % (self.mass_spec.time, scan)) + if scan.ms_level == 1: # we get an ms1 scan, if it has a peak, then store it for fragmentation next time + if scan.num_peaks > 0: + self.last_ms1_scan = scan + else: + self.last_ms1_scan = None + + elif scan.ms_level == 2: # if we get ms2 scan, then do something with it + # scan.filter_intensity(self.min_ms2_intensity) + if scan.num_peaks > 0: + self._plot_scan(scan) + + def _update_parameters(self, scan): + 
+ # if there's a previous ms1 scan to process + if self.last_ms1_scan is not None: + + mzs = self.last_ms1_scan.mzs + intensities = self.last_ms1_scan.intensities + rt = self.last_ms1_scan.rt + + # set up current scan parameters + current_N, current_rt_tol, idx = self._get_current_N_DEW() + current_isolation_window = self.isolation_window[idx] + current_mz_tol = self.mz_tol[idx] + + # calculate purities + purities = [] + for mz_idx in range(len(self.last_ms1_scan.mzs)): + nearby_mzs_idx = np.where(abs(self.last_ms1_scan.mzs - self.last_ms1_scan.mzs[mz_idx]) < current_isolation_window) + if len(nearby_mzs_idx[0]) == 1: + purities.append(1) + else: + total_intensity = sum(self.last_ms1_scan.intensities[nearby_mzs_idx]) + purities.append(self.last_ms1_scan.intensities[mz_idx] / total_intensity) + + # loop over points in decreasing intensity + fragmented_count = 0 + idx = np.argsort(intensities)[::-1] + for i in idx: + mz = mzs[i] + intensity = intensities[i] + purity = purities[i] + + # stopping criteria is after we've fragmented N ions or we found ion < min_intensity + if fragmented_count >= current_N: + self.logger.debug('Time %f Top-%d ions have been selected' % (self.mass_spec.time, current_N)) + break + + if intensity < self.min_ms1_intensity: + self.logger.debug( + 'Time %f Minimum intensity threshold %f reached at %f, %d' % ( + self.mass_spec.time, self.min_ms1_intensity, intensity, fragmented_count)) + break + + # skip ion in the dynamic exclusion list of the mass spec + if self.mass_spec.is_excluded(mz, rt): + continue + + if purity < self.purity_threshold: + purity_shift_amounts = [self.purity_shift * (i - (self.n_purity_scans - 1) / 2) for i in range(self.n_purity_scans)] + for purity_idx in range(self.n_purity_scans): + # send a new ms2 scan parameter to the mass spec + dda_scan_params = ScanParameters() + dda_scan_params.set(ScanParameters.MS_LEVEL, 2) + dda_scan_params.set(ScanParameters.N, current_N) + + # create precursor object, assume it's all singly 
charged + precursor_charge = +1 if (self.mass_spec.ionisation_mode == POSITIVE) else -1 + precursor = Precursor(precursor_mz=mz, precursor_intensity=intensity, + precursor_charge=precursor_charge, + precursor_scan_id=self.last_ms1_scan.scan_id) + mz_lower = mz + purity_shift_amounts[purity_idx] - current_isolation_window # Da + mz_upper = mz + purity_shift_amounts[purity_idx] + current_isolation_window # Da + isolation_windows = [[(mz_lower, mz_upper)]] + dda_scan_params.set(ScanParameters.ISOLATION_WINDOWS, isolation_windows) + dda_scan_params.set(ScanParameters.PRECURSOR, precursor) + + # save dynamic exclusion parameters too + dda_scan_params.set(ScanParameters.DYNAMIC_EXCLUSION_MZ_TOL, current_mz_tol) + dda_scan_params.set(ScanParameters.DYNAMIC_EXCLUSION_RT_TOL, current_rt_tol) + + # push this dda scan parameter to the mass spec queue + self.mass_spec.add_to_processing_queue(dda_scan_params) + fragmented_count += 1 + # need to work out what we want to do here + else: + # send a new ms2 scan parameter to the mass spec + dda_scan_params = ScanParameters() + dda_scan_params.set(ScanParameters.MS_LEVEL, 2) + dda_scan_params.set(ScanParameters.N, current_N) + + # create precursor object, assume it's all singly charged + precursor_charge = +1 if (self.mass_spec.ionisation_mode == POSITIVE) else -1 + precursor = Precursor(precursor_mz=mz, precursor_intensity=intensity, + precursor_charge=precursor_charge, precursor_scan_id=self.last_ms1_scan.scan_id) + mz_lower = mz - current_isolation_window # Da + mz_upper = mz + current_isolation_window # Da + isolation_windows = [[(mz_lower, mz_upper)]] + dda_scan_params.set(ScanParameters.ISOLATION_WINDOWS, isolation_windows) + dda_scan_params.set(ScanParameters.PRECURSOR, precursor) + + # save dynamic exclusion parameters too + dda_scan_params.set(ScanParameters.DYNAMIC_EXCLUSION_MZ_TOL, current_mz_tol) + dda_scan_params.set(ScanParameters.DYNAMIC_EXCLUSION_RT_TOL, current_rt_tol) + + # push this dda scan parameter to the mass 
spec queue + self.mass_spec.add_to_processing_queue(dda_scan_params) + fragmented_count += 1 + + for param in self.mass_spec.get_processing_queue(): + precursor = param.get(ScanParameters.PRECURSOR) + if precursor is not None: + self.logger.debug('- %s' % str(precursor)) + + # set this ms1 scan as has been processed + self.last_ms1_scan = None + + def _get_current_N_DEW(self): + idx = np.nonzero(self.scan_param_changepoints <= self.mass_spec.time)[0][-1] + current_N = self.N[idx] + current_rt_tol = self.rt_tol[idx] + return current_N, current_rt_tol, idx diff --git a/Synthetic data creation scripts/vimms/DIA.py b/Synthetic data creation scripts/vimms/DIA.py new file mode 100644 index 00000000..a7698478 --- /dev/null +++ b/Synthetic data creation scripts/vimms/DIA.py @@ -0,0 +1,349 @@ +import numpy as np +from tqdm import tqdm +import math +import sys +import copy + +from vimms.Controller import * +from vimms.MassSpec import * +from vimms.Common import POSITIVE, DEFAULT_MS1_SCAN_WINDOW, LoggerMixin + + +def DiaRestrictedScanner(dataset, ps, dia_design, window_type, kaufmann_design, extra_bins, num_windows=None, pbar=False): + mass_spec = IndependentMassSpectrometer(POSITIVE, dataset, ps) + controller = TreeController(mass_spec, dia_design, window_type, kaufmann_design, extra_bins, num_windows) + controller.run(10, 20, pbar) + controller.scans[2] = controller.scans[2][0:(controller.scans[1][1].scan_id-1)] + controller.scans[1] = controller.scans[1][0:2] + return controller + + +class DiaAnalyser(object): + def __init__(self, controller, min_intensity=0): + self.controller = controller + self.scans = controller.scans + self.dataset = controller.mass_spec.chemicals + self.chemicals_identified = 0 + self.ms2_matched = 0 + self.entropy = 0 + self.ms1_range = np.array([0,1000]) #TODO: fix this (ie make it so it can be controller in controller and then here + self.min_intensity = min_intensity + + self.ms1_scan_times = np.array([scan.rt for scan in self.scans[1]]) + 
self.ms2_scan_times = np.array([scan.rt for scan in self.scans[2]]) + self.ms1_mzs = [self.controller.mass_spec._get_all_mz_peaks(self.dataset[i], self.dataset[i].rt + 0.01, 1, [[(0, 1000)]])[0][0] for i in range(len(self.dataset))] + self.ms1_start_rt = np.array([data.rt for data in self.dataset]) + self.ms1_end_rt = np.array([data.rt + data.chromatogram.max_rt for data in self.dataset]) + self.first_scans, self.last_scans = self._get_scan_times() + + self.chemical_locations = [] + + with tqdm(total=len(self.dataset)) as pbar: + for chem_num in range(len(self.dataset)): + chemical_location = self._get_chemical_location(chem_num) + chemical_time = [(self.first_scans[chem_num], self.last_scans[chem_num])] + self.chemical_locations.append(chemical_location) + num_ms1_options = 0 + for i in range(len(chemical_location)): + mz_location = np.logical_and(np.array(self.ms1_mzs) > chemical_location[i][0], + np.array(self.ms1_mzs) <= chemical_location[i][1]) + time_location = np.logical_and(self.ms1_start_rt <= chemical_time[0][1], + self.ms1_end_rt >= chemical_time[0][0]) + num_ms1_options += sum(mz_location * time_location) + if num_ms1_options == 0: + self.entropy += -len(self.dataset[chem_num].children) * len(self.dataset) * math.log(1 / len(self.dataset)) + else: + self.entropy += -len(self.dataset[chem_num].children) * num_ms1_options * math.log(1 / num_ms1_options) + if num_ms1_options == 1: + self.chemicals_identified += 1 + self.ms2_matched += len(self.dataset[chem_num].children) + pbar.update(1) + pbar.close() + + def _get_scan_times(self): + max_time = self.scans[1][-1].rt + if self.scans[2] != []: + max_time = max(self.scans[1][-1].rt, self.scans[2][-1].rt) + 1 + first_scans = [max_time for i in self.dataset] + last_scans = [max_time for i in self.dataset] + for chem_num in range(len(self.dataset)): + relevant_times = self.ms1_scan_times[(self.ms1_start_rt[chem_num] < self.ms1_scan_times) & (self.ms1_scan_times < self.ms1_end_rt[chem_num])] + for time in 
relevant_times: + intensity = self.controller.mass_spec._get_all_mz_peaks(self.dataset[chem_num], time, 1, [[(0,1000)]])[0][1] #TODO: Make MS1 range more general + if intensity > self.min_intensity: + first_scans[chem_num] = min(first_scans[chem_num], time) + last_scans[chem_num] = time + return first_scans, last_scans + # + # def _get_ms1_mzs(self): + # # get list of ms1s + # ms1_mzs = [] + # if isinstance(self.scans[1], list): + # for j in range(len(self.scans[1])): + # ms1_mzs.extend(self.scans[1][j].mzs) + # ms1_mzs = np.unique(np.array(ms1_mzs)) + # else: + # ms1_mzs = self.scans[1].mzs + # return ms1_mzs + + def _get_chemical_location(self, chem_num): + # find location where ms2s of chemical can be narrowed down to + which_scans = np.where(np.logical_and(np.array(self.ms2_scan_times) > self.first_scans[chem_num], + np.array(self.ms2_scan_times) < self.last_scans[chem_num])) + chemical_scans = np.array(self.scans[2])[which_scans] + if chemical_scans.size == 0: + possible_locations = [(0,1000)] # TODO: Make this more general + else: + locations = [scan.isolation_windows for scan in chemical_scans] + scan_times = [scan.rt for scan in chemical_scans] + split_points = np.unique(np.array(list(sum(sum(sum(locations, []), []), ())))) + split_points = np.unique(np.concatenate((split_points, self.ms1_range))) + mid_points = [(split_points[i] + split_points[i + 1]) / 2 for i in range(len(split_points) - 1)] + possible_mid_points = self._get_mid_points_in_location(chem_num, mid_points, locations, scan_times) + possible_locations = self._get_possible_locations(possible_mid_points, split_points) + return possible_locations + + def _get_mid_points_in_location(self, chem_num, mid_points, locations, scan_times): + # find mid points which satisfying scans locations + current_mid_points = mid_points + for i in range(len(locations)): + chem_scanned = isinstance( + self.controller.mass_spec._get_all_mz_peaks(self.dataset[chem_num], scan_times[i], 2, locations[i]), + list) + 
new_mid_points = [] + for j in range(len(current_mid_points)): + if chem_scanned == self._in_window(current_mid_points[j], locations[i]): + new_mid_points.append(current_mid_points[j]) + current_mid_points = new_mid_points + return current_mid_points + + def _get_possible_locations(self, possible_mid_points, split_points): + # find locations where possible mid points can be in, then simplify locations + possible_locations = [] + for i in range(len(possible_mid_points)): + min_location = max( + np.array(split_points)[np.where(np.array(split_points) < possible_mid_points[i])].tolist()) + max_location = min( + np.array(split_points)[np.where(np.array(split_points) >= possible_mid_points[i])].tolist()) + possible_locations.extend([(min_location, max_location)]) + # TODO: need to simplify still + return possible_locations + + def _in_window(self, mid_point, locations): + for window in locations[0]: + if (mid_point > window[0] and mid_point <= window[1]): + return True + return False + + +class RestrictedDiaAnalyser(object): + def __init__(self, controller): + self.entropy = [] + self.chemicals_identified = [] + self.ms2_matched = [] + self.scan_num = [] + temp_controller = copy.deepcopy(controller) + start = len(temp_controller.scans[2]) + for num_ms2_scans in range(start, -1, -1): + temp_controller.scans[2] = temp_controller.scans[2][0:num_ms2_scans] + analyser = DiaAnalyser(temp_controller) + self.entropy.append(analyser.entropy) + self.chemicals_identified.append(analyser.chemicals_identified) + self.ms2_matched.append(analyser.ms2_matched) + self.scan_num.append(num_ms2_scans + 1) + self.entropy.reverse() + self.chemicals_identified.reverse() + self.ms2_matched.reverse() + self.scan_num.reverse() + + + +############################# ok up to here ##################################### + + +class Scan_Results_Calculator(object): + """ + Method for taking raw results, grouping ms2 fragments and determining in which scans they were found + """ + + def __init__(self, 
dia_results, ms2_mz_slack=0.00001, ms2_intensity_slack=0.1): + self.intensities_in_scans = dia_results.intensities_in_scans + self.mz_in_scans = dia_results.mz_in_scans + self.locations = dia_results.locations + self.bin_walls = dia_results.bin_walls + self.ms1_values = dia_results.ms1_values + self.results = [[] for i in range(len(dia_results.locations))] + unlisted_mz_in_scans = np.concatenate(dia_results.mz_in_scans) + unlisted_intensities_in_scans = np.concatenate(dia_results.intensities_in_scans) + # find unique mz + unique_mz = [[unlisted_mz_in_scans[0]]] + unique_intensities = [[unlisted_intensities_in_scans[0]]] + for unlisted_mz_index in range(1, len(unlisted_mz_in_scans)): + unique_mz_min = math.inf + for unique_mz_index in range(len(unique_mz)): + unique_dist = abs( + sum(unique_mz[unique_mz_index]) / len(unique_mz[unique_mz_index]) - unlisted_mz_in_scans[ + unlisted_mz_index]) + if (unique_dist < unique_mz_min): + unique_mz_min = unique_dist + unique_mz_which = unique_mz_index + if unique_mz_min < ms2_mz_slack: + unique_mz[unique_mz_which].append(unlisted_mz_in_scans[unlisted_mz_index]) + unique_intensities[unique_mz_which].append(unlisted_intensities_in_scans[unlisted_mz_index]) + else: + unique_mz.append([unlisted_mz_in_scans[unlisted_mz_index]]) + unique_intensities.append([unlisted_intensities_in_scans[unlisted_mz_index]]) + self.ms2_intensities = unique_intensities + self.ms2_mz = unique_mz + # find where intensities are unique and assign them a scan result + for unique_mz_index in range(len(unique_mz)): + if max(abs(unique_intensities[0] - sum(unique_intensities[0]) / len( + unique_intensities[0]))) > ms2_intensity_slack: + print("not ready yet") + else: + for location_index in range(len(dia_results.locations)): + TF_in_location = [] + for unique_index in range(len(unique_mz[unique_mz_index])): + TF_in_location.append( + unique_mz[unique_mz_index][unique_index] in dia_results.mz_in_scans[location_index] and + 
class Entropy(object):
    """
    Method for calculating entropy based on locations of ms2 components
    """

    def __init__(self, dia_location_finder):
        """
        Compute the entropy and the number of uniquely placed ms2 components.

        For every bin (pair of consecutive `bin_walls`) the number of ms1 values
        falling in (lower, upper] and the number of ms2 components located in
        exactly that bin are counted. Bins without any ms1 value are ignored.
        :param dia_location_finder: object providing `bin_walls`, `ms1_values`
        and `location_all` (one `[(lower, upper)]` entry per ms2 component)
        """
        bin_walls = dia_location_finder.bin_walls
        ms1_values = np.array(dia_location_finder.ms1_values)
        location_all = dia_location_finder.location_all

        ms1_vec = []
        ms2_vec = []
        for i in range(0, len(bin_walls) - 1):
            lower, upper = bin_walls[i], bin_walls[i + 1]
            in_bin = np.logical_and(ms1_values > lower, ms1_values <= upper)
            ms1_vec.append(len(np.where(in_bin)[0]))
            ms2_vec.append(sum(1 for location in location_all if [(lower, upper)] == location))

        # Keep the ms1 and ms2 counts paired per bin. The ms2 counts used to be
        # taken from ms1_vec by mistake, so the ms2 data never reached the result.
        occupied = [(n_ms1, n_ms2) for n_ms1, n_ms2 in zip(ms1_vec, ms2_vec) if n_ms1 != 0]

        self.entropy = sum(-n_ms2 * n_ms1 * math.log(1 / n_ms1) for n_ms1, n_ms2 in occupied)
        # components in a bin holding a single ms1 value are unambiguously assigned
        self.components_determined = sum(n_ms2 for n_ms1, n_ms2 in occupied if n_ms1 == 1)
        self.components = sum(n_ms2 for _, n_ms2 in occupied)
class Dia_Methods(object):
    """
    Method for doing DIA on a dataset of ms1 and ms2 peaks. Creates windows and then return attributes of scan results
    """

    def __init__(self, dataset, ms_level, rt, dia_design, window_type, kaufmann_design=None, extra_bins=0,
                 range_slack=0.01, ms1_range=[(None, None)], num_windows=None):
        """
        Build the DIA windows for `dataset` and scan the dataset once per window location.

        After construction `mz_in_scans` and `intensities_in_scans` hold one numpy
        array per entry of `locations`, in the same order.
        """
        windows = Dia_Windows(dataset, dia_design, window_type, kaufmann_design, extra_bins, range_slack,
                              ms1_range, num_windows)
        self.bin_walls = windows.bin_walls
        self.locations = windows.locations
        self.ms1_values = windows.ms1_values

        self.mz_in_scans = []
        self.intensities_in_scans = []
        for location in self.locations:
            scan_result = Dataset_Scan(dataset, ms_level, rt, location)
            self.mz_in_scans.append(np.array(scan_result.mz_in_scan))
            self.intensities_in_scans.append(np.array(scan_result.scan_intensities))
def extract_hmdb_metabolite(in_file, delete=True):
    """
    Extract DatabaseCompound objects from an HMDB metabolite XML dump.
    :param in_file: path to the HMDB XML file, or to a zip file whose first member is that XML file
    :param delete: if True, `in_file` is removed from disk after it has been read
    :return: a list of DatabaseCompound, one for each metabolite that has all six required fields
    """
    print('Extracting HMDB metabolites from %s' % in_file)

    # if in_file is zipped then extract the xml file inside, otherwise parse it directly
    try:
        zf = zipfile.ZipFile(in_file, 'r')
    except zipfile.BadZipFile:  # oops not a zip file
        # Parse straight from the path. The path string used to be treated as an
        # open file and closed afterwards, which raised AttributeError.
        db = xml.etree.ElementTree.parse(in_file).getroot()
    else:
        with zf:
            metabolite_xml_file = zf.namelist()[0]  # assume there's only a single file inside the zip file
            with zf.open(metabolite_xml_file) as f:
                db = xml.etree.ElementTree.parse(f).getroot()

    # position of each wanted element in the DatabaseCompound argument list
    prefix = '{http://www.hmdb.ca}'
    # NOTE(review): 'monisotopic_molecular_weight' is kept exactly as in the original code;
    # presumably it matches the tag spelling used in the HMDB export -- confirm.
    columns = {
        prefix + 'name': 0,
        prefix + 'chemical_formula': 1,
        prefix + 'monisotopic_molecular_weight': 2,
        prefix + 'smiles': 3,
        prefix + 'inchi': 4,
        prefix + 'inchikey': 5,
    }

    # loops through file and extract the necessary element text to create a DatabaseCompound
    compounds = []
    for metabolite_element in db:
        row = [None] * len(columns)
        for element in metabolite_element:
            column = columns.get(element.tag)
            if column is not None:
                row[column] = element.text

        # if all fields are present, then add them as a DatabaseCompound
        if None not in row:
            compounds.append(DatabaseCompound(row[0], row[1], row[2], row[3], row[4], row[5]))
    print('Loaded %d DatabaseCompounds from %s' % (len(compounds), in_file))

    if delete:
        print('Deleting %s' % in_file)
        os.remove(in_file)

    return compounds
def filter_df(df, min_ms1_intensity, rt_range, mz_range):
    """
    Filter a peak dataframe by retention time, m/z and intensity.
    :param df: dataframe with 'rt' and 'maxo' columns, plus 'mz' when `mz_range` is given
    :param min_ms1_intensity: keep rows whose 'maxo' is strictly above this value, or None to skip
    :param rt_range: a sequence whose first item is (min_rt, max_rt), or None to skip.
    Either bound may be None, meaning that side is left open
    :param mz_range: a sequence whose first item is (min_mz, max_mz), or None to skip.
    Either bound may be None, meaning that side is left open
    :return: the filtered dataframe (bounds are exclusive)
    """
    # The mz filter must look at the 'mz' column; it used to test 'rt' a second time.
    for column, value_range in (('rt', rt_range), ('mz', mz_range)):
        if value_range is None:
            continue
        lower, upper = value_range[0][0], value_range[0][1]
        # open-ended bounds are skipped instead of being compared against None
        if lower is not None:
            df = df[df[column] > lower]
        if upper is not None:
            df = df[df[column] < upper]

    # filter by min intensity
    intensity_col = 'maxo'
    if min_ms1_intensity is not None:
        df = df[(df[intensity_col] > min_ms1_intensity)]
    return df
+ :param min_ms1_intensity: minimum ms1 intensity for filtering + :param min_ms2_intensity: maximum ms2 intensity for filtering + :param min_rt: minimum RT for filtering + :param max_rt: maximum RT for filtering + """ + + def __init__(self): + # A dictionary that stores the actual pymzml spectra for each filename + self.file_spectra = {} # key: filename, value: a dict where key is scan_number and value is spectrum + + # A dictionary to store the distribution on scan durations for each ms_level in each file + self.file_scan_durations = {} # key: filename, value: a dict with key ms level and value scan durations + + # A dictionary to store extracted MS2 scans + self.precursor_info = {} # key: filename, value: a dataframe of precursor info + + # pymzml parameters + self.ms1_precision = 5e-6 + self.obo_version = '4.0.1' + + # xcms peak picking results, if any + self.df = None + + def load_data(self, mzml_path, file_name=None): + """ + Loads data and generate peaks from mzML files. The resulting peak objects will not have chromatographic peak + shapes, because no peak picking has been performed yet. 
def extract_scan_durations(self, filename):
    """
    Collect the time elapsed between consecutive scans, grouped by ms-level transition.
    :param filename: path of the mzML file to read
    :return: a dict with keys (previous ms level, current ms level) for levels 1 and 2,
    each mapped to the list of RT differences between a scan and the one before it
    """
    transitions = {(previous, current): [] for previous in (1, 2) for current in (1, 2)}
    run = pymzml.run.Reader(filename, obo_version=self.obo_version,
                            MS1_Precision=self.ms1_precision,
                            extraAccessions=[('MS:1000016', ['value', 'unitName'])])

    last_seen = None  # (ms level, rt) of the scan before the current one
    for scan in run:
        if last_seen is None:
            # the first scan has no predecessor, so it only seeds the state
            last_seen = (scan['ms level'], self._get_rt(scan))
            continue
        rt = self._get_rt(scan)
        current_level = scan['ms level']
        previous_level, old_rt = last_seen
        transitions[(previous_level, current_level)].append(rt - old_rt)
        last_seen = (current_level, rt)
    return transitions
def plot_histogram(self, X, data_type, n_bins=100):
    """
    Makes a histogram plot on the distribution of the item of interest
    :param X: a numpy array, or for SCAN_DURATION a dict mapping each key to a list of durations
    :param data_type: the kind of data in X, used to pick the plot and its title
    :param n_bins: number of histogram bins
    :return: nothing. A plot is shown.
    """
    if data_type != SCAN_DURATION:
        plt.figure()
        _ = plt.hist(X, bins=n_bins)
        # tick marks just below the axis, one for each data point
        plt.plot(X[:, 0], np.full(X.shape[0], -0.01), '|k')
        plt.title('Histogram for %s -- shape %s' % (data_type, str(X.shape)))
        plt.show()
        return

    # one figure for each entry of the scan duration dictionary
    for key, rt_list in X.items():
        try:
            bins = np.linspace(min(rt_list), max(rt_list), n_bins)
            plt.figure()
            plt.hist(rt_list, bins=bins)
            plt.title(key)
            plt.show()
        except ValueError:
            # e.g. min() of an empty list: nothing to draw for this key
            continue
def get_data(self, data_type, filename, ms_level, min_intensity=None,
             min_rt=None, max_rt=None, log=False, max_data=100000):
    """
    Retrieves values as numpy array, either from the XCMS peaklist or from the loaded scans.

    The XCMS peaklist is used when `ms_level` is 1 and `self.df` has been set;
    otherwise the peaks are read from the spectra stored in `self.file_spectra`.
    :param data_type: one of MZ, RT, INTENSITY or MZ_INTENSITY_RT
    :param filename: the mzml filename or None for all files (only used when reading from scans)
    :param ms_level: the ms level of the values to retrieve
    :param min_intensity: minimum intensity for thresholding
    :param min_rt: minimum RT value for thresholding
    :param max_rt: max RT value for thresholding
    :param log: if true, the returned values will be logged (only the intensity column
    for MZ_INTENSITY_RT)
    :param max_data: maximum number of rows to return; a random subset without replacement
    is drawn when more are available
    :return: the sampled array as is for MZ_INTENSITY_RT, otherwise an Nx1 numpy array
    of the values requested
    """
    # if xcms peak picking results are provided, use that instead
    if ms_level == 1 and self.df is not None:
        self.logger.info('Using values from XCMS peaklist')

        # remove rows in the peak picked dataframe that are outside threshold values
        # NOTE(review): min_rt / max_rt default to None and are passed on unchanged -- confirm
        # that filter_df copes with None bounds.
        df = filter_df(self.df, min_intensity, [[min_rt, max_rt]], None)

        # extract the values we need
        # NOTE(review): X stays unassigned for any other data_type in this branch.
        if data_type == MZ:
            X = df['mz'].values
        elif data_type == RT:
            # we use rt for the starting value for the chemical to elute
            X = df['rt'].values
        elif data_type == INTENSITY:
            X = df['maxo'].values
        elif data_type == MZ_INTENSITY_RT:
            X = df[['mz', 'maxo', 'rt']].values

    else:  # else we get the values by reading from the scans in mzML files directly
        self.logger.info('Using values from scans')

        # get spectra from either one file or all files
        if filename is None:  # use all spectra
            all_spectra = []
            for f in self.file_spectra:
                spectra_for_f = list(self.file_spectra[f].values())
                all_spectra.extend(spectra_for_f)
        else:  # use spectra for that file only
            all_spectra = self.file_spectra[filename].values()

        # loop through spectrum and get all peaks above threshold
        values = []
        for spectrum in all_spectra:
            # if wrong ms level, skip this spectrum
            if spectrum.ms_level != ms_level:
                continue

            # collect all valid Peak objects in a spectrum
            spectrum_peaks = []
            for mz, intensity in spectrum.peaks('raw'):
                rt = self._get_rt(spectrum)
                p = Peak(mz, rt, intensity, spectrum.ms_level)
                if self._valid_peak(p, min_intensity, min_rt, max_rt):
                    spectrum_peaks.append(p)

            if data_type == MZ_INTENSITY_RT:  # used when fitting m/z, rt and intensity together for the manuscript
                mzs = list(getattr(x, MZ) for x in spectrum_peaks)
                intensities = list(getattr(x, INTENSITY) for x in spectrum_peaks)
                rts = list(getattr(x, RT) for x in spectrum_peaks)
                values.extend(list(zip(mzs, intensities, rts)))

            else:  # MZ, INTENSITY or RT separately
                # data_type doubles as the Peak attribute name here
                attrs = list(getattr(x, data_type) for x in spectrum_peaks)
                values.extend(attrs)

        X = np.array(values)

    # log-transform if necessary
    if log:
        if data_type == MZ_INTENSITY_RT:  # just log the intensity part
            X[:, 1] = np.log(X[:, 1])
        else:
            X = np.log(X)

    # pick random samples
    try:
        idx = np.arange(len(X))
        rnd_idx = np.random.choice(idx, size=int(max_data), replace=False)
        sampled_X = X[rnd_idx]
    except ValueError:
        # fewer than max_data rows are available, so keep them all
        sampled_X = X

    # return values
    if data_type == MZ_INTENSITY_RT:
        return sampled_X  # it's already a Nx2 or Nx3 array
    else:
        # convert into Nx1 array
        return sampled_X[:, np.newaxis]
def get_scan_durations(self, fname):
    """
    Return the scan duration dictionary of one file, or of all loaded files merged together.
    :param fname: the file to look up, or None to combine every loaded file
    :return: for a given file, the stored dictionary itself; for None, a new dictionary
    built from a copy of the first file and extended with the others (None if nothing is loaded)
    """
    if fname is not None:
        return self.file_scan_durations[fname]

    # if no filename, then combine all the dictionary keys
    combined = None
    for durations in self.file_scan_durations.values():
        if combined is None:
            # start from a copy so the stored durations are never modified
            combined = copy.deepcopy(durations)
            continue
        for key in combined:
            combined[key].extend(durations[key])
    return combined
def __init__(self, data_source, min_rt, max_rt, min_ms1_intensity, min_ms2_intensity,
             filename=None, plot=False,
             bandwidth_mz_intensity_rt=1.0, bandwidth_n_peaks=1.0, filename_to_N_DEW=None):
    """
    Creates a peak sampler: extracts the MS2 scans and scan durations from a data source,
    then trains the density estimators for MS level 1 and (when possible) MS level 2.
    :param data_source: the data source to extract scans, scan durations and KDE training data from
    :param min_rt: minimum retention time of the peaks used for training
    :param max_rt: maximum retention time of the peaks used for training
    :param min_ms1_intensity: minimum intensity of the MS1 peaks used for training
    :param min_ms2_intensity: minimum intensity of the MS2 peaks used for training
    :param filename: the file to train the KDEs on, or None to use all files in the data source
    :param plot: a flag to indicate whether to plot the fitted densities
    :param bandwidth_mz_intensity_rt: KDE bandwidth for the joint (m/z, intensity, RT) values
    :param bandwidth_n_peaks: KDE bandwidth for the number of peaks per scan
    :param filename_to_N_DEW: an optional dictionary that maps from filename to (N, DEW).
           If None, scan durations for `filename` are stored under the default key (0, 0)
    """
    self.min_rt = min_rt
    self.max_rt = max_rt
    self.min_ms1_intensity = min_ms1_intensity
    self.min_ms2_intensity = min_ms2_intensity
    self.filename = filename
    self.plot = plot
    self.filename_to_N_DEW = filename_to_N_DEW  # a dictionary that maps from filename to (N, DEW)

    # get all the scan dataframes across all files and combine them all
    self.all_ms2_scans = self._extract_ms2_scans(data_source)
    self.logger.debug('Extracted %d MS2 scans', len(self.all_ms2_scans))

    # compute sum(ms2 peak intensities) / ms1.intensity
    self.intensity_props = self._compute_intensity_props()

    # extract scan durations
    self.file_scan_durations = {}  # key: (N, DEW), value: a list of scan durations for (N, DEW)
    if filename_to_N_DEW is None:
        # no mapping between filename to N is specified, so we just assign it a default key of (0, 0)
        self.logger.debug('Extracting scan durations')
        self.file_scan_durations[(0, 0)] = data_source.get_scan_durations(filename)
    else:
        # store the scan durations for the different Ns.
        # The loop variable must NOT be called `filename`: rebinding the parameter here would make
        # the KDE training below silently use the last file of the mapping instead of `filename`.
        for mapped_filename, n_dew in filename_to_N_DEW.items():
            N, DEW = n_dew
            self.logger.debug('Extracting scan durations for N=%d DEW=%d from %s', N, DEW, mapped_filename)
            self.file_scan_durations[n_dew] = data_source.get_scan_durations(mapped_filename)

    # train KDEs for each ms-level
    max_data = 100000  # upper bound on the number of data points passed to the KDE training
    self.kdes = {}
    self.kernel = 'gaussian'
    self._kde(data_source, filename, 1, bandwidth_mz_intensity_rt, bandwidth_n_peaks, max_data)
    try:
        self._kde(data_source, filename, 2, bandwidth_mz_intensity_rt, bandwidth_n_peaks, max_data)
    except (ValueError, IndexError) as e:
        # expected if data_source only contains fullscan data: there is nothing to train on for ms level 2,
        # so the sampler is left without MS2 KDEs rather than failing
        self.logger.debug('Skipping KDE training for ms_level=2: %s', e)
#################################################################################################################### + # Public methods + #################################################################################################################### + + def scan_durations(self, previous_level, current_level, n_sample, N, DEW): + # the scan durations is stored for each N and DEW combination + file_scan_durations = self.file_scan_durations[(N, DEW)] + key = (previous_level, current_level,) + values = file_scan_durations[key] + try: + return np.random.choice(values, replace=False, size=n_sample) + except ValueError: + return np.array([]) + + def get_peak(self, ms_level, N=None, min_mz=None, max_mz=None, min_rt=None, max_rt=None, min_intensity=None): + if N is None: + N = max(self.n_peaks(ms_level, 1).astype(int)[0][0], 0) + + peaks = [] + while len(peaks) < N: + vals = self.sample(ms_level, 1) + intensity = np.exp(vals[0, 1]) + mz = vals[0, 0] + rt = vals[0, 2] + p = Peak(mz, rt, intensity, ms_level) + if self._is_valid(p, min_mz, max_mz, min_rt, max_rt, min_intensity): # othwerise we just keep rejecting + peaks.append(p) + return peaks + + def sample(self, ms_level, n_sample): + vals = self.kdes[(MZ_INTENSITY_RT, ms_level)].sample(n_sample) + return vals + + def n_peaks(self, ms_level, n_sample): + return self.kdes[(N_PEAKS, ms_level)].sample(n_sample) + + def get_ms2_spectra(self, N=1): + spectra = [] + total = len(self.all_ms2_scans) + + if total > 0: + # select N random spectra + idx = np.random.choice(total, replace=False, size=N) + samples = self.all_ms2_scans.iloc[idx, :] + + # convert to Scan objects + for idx, row in samples.iterrows(): + # create precursor (MS1) peak + parent_ms_level = 1 + parent_mz = row['ms1_mz'] + parent_rt = row['ms1_scan_rt'] + parent_intensity = row['ms1_intensity'] + parent_peak = Peak(parent_mz, parent_rt, parent_intensity, parent_ms_level) + + # create MS2 scan + ms_level = 2 + ms2_peaks = row['ms2_peaklist'] + ms2_scan_id = 
idx + ms2_mzs = ms2_peaks[:, 0] + ms2_rt = ms2_peaks[0, 1] # all the values are the same, so we can take the first one + ms2_intensities = ms2_peaks[:, 2] # TODO: filter by min_ms2_intensity here + ms2_scan = Scan(ms2_scan_id, ms2_mzs, ms2_intensities, ms_level, ms2_rt, parent=parent_peak) + spectra.append(ms2_scan) + return spectra + + def get_noise_sample(self): + # TODO: finish this + # need to choose number of noise fragments from get_num_noisy_samples() below + # then draw n noise fragments + # returns list of ms2 noise fragments. type = MSN + # noise fragment here is defined as ms2 peaks below some intensity threshold + return [] + + def get_num_noisy_samples(self): + # TODO: finish this + # returns a distribution of the number of noise fragments + return 0.0 + + def get_msn_noisy_intensity(self, intensity, ms_level, max_intensity, pct): + # CJ: for now, we will just be doing it where we randomly take a percentage of the + # max intensity... so we will need to have two extra arguments here + # TODO: until we characterise the noise properly, just return the original value for now + # takes intensity + # adds noise, but ensures its positive value + # returns list with one numeric value + # ignores ms_level for now + assert 0 <= pct and pct <= 100, f"pct value must be on the range of [0, 100], {pct} is invalid." + + mult = (np.random.rand()*float(pct)) / float(100.) + noise = max_intensity * mult + + return intensity + noise + + def get_msn_noisy_mz(self, mz, ms_level): + # TODO: finish this + # same as above, but for m/z + # Simon: We can characterise mz noise from the chromatographic peaks we extract. + # I suggest a constant variance for now, but we might want to fit models where we account for + # variability in noise variance as a function of mz itself, and intensity. 
+ return mz + + def get_parent_intensity_proportion(self, N=1): + # this is the proportion of all fragment intensities in a spectra over the parent intensity + # returns number between 0 and 1 + if len(self.all_ms2_scans) > 0: + return np.random.choice(self.intensity_props, replace=False, size=N) + return None + + #################################################################################################################### + # Private methods used in the constructor + #################################################################################################################### + + def _extract_ms2_scans(self, data_source): + combined_dfs = pd.concat(data_source.precursor_info.values()) + # select only the column we need + # 'ms2_peaklist' is a 2d-array, where each row is an ms2 peak, and columns are mz, rt, intensity + col_names = ['ms1_mz', 'ms1_scan_rt', 'ms1_intensity', 'ms2_peaklist'] + return combined_dfs[col_names] + + def _compute_intensity_props(self): + self.logger.debug('Computing parent intensity proportions') + intensity_props = [] + for idx, row in self.all_ms2_scans.iterrows(): + parent_intensity = row['ms1_intensity'] + ms2_peaks = row['ms2_peaklist'] + ms2_intensities = ms2_peaks[:, 2] + prop = np.sum(ms2_intensities) / parent_intensity + if prop <= 1: + intensity_props.append(prop) + return np.array(intensity_props) + + def _kde(self, data_source, filename, ms_level, bandwidth_mz_intensity_rt, bandwidth_n_peaks, max_data): + self.logger.debug('Training KDEs for ms_level=%d' % ms_level) + params = [ + {'data_type': MZ_INTENSITY_RT, 'bandwidth': bandwidth_mz_intensity_rt}, + {'data_type': N_PEAKS, 'bandwidth': bandwidth_n_peaks} + ] + + for param in params: + data_type = param['data_type'] + min_intensity = self.min_ms1_intensity if ms_level == 1 else self.min_ms2_intensity + + # get data + self.logger.debug('Retrieving %s values from %s' % (data_type, data_source)) + if data_type == N_PEAKS: + X = data_source.get_n_peaks(filename, 
ms_level, min_intensity=min_intensity, + min_rt=self.min_rt, max_rt=self.max_rt) + else: + log = True if data_type == MZ_INTENSITY_RT else False + X = data_source.get_data(data_type, filename, ms_level, min_intensity=min_intensity, + min_rt=self.min_rt, max_rt=self.max_rt, log=log, max_data=max_data) + + # fit kde + bandwidth = param['bandwidth'] + kde = KernelDensity(kernel=self.kernel, bandwidth=bandwidth).fit(X) + self.kdes[(data_type, ms_level)] = kde + + # plot if necessary + self._plot(kde, X, data_type, filename, bandwidth) + + def _is_valid(self, peak, min_mz, max_mz, min_rt, max_rt, min_intensity): + if peak.intensity < 0: + return False + if min_mz is not None and min_mz > peak.mz: + return False + if max_mz is not None and max_mz < peak.mz: + return False + if min_rt is not None and min_rt > peak.rt: + return False + if max_rt is not None and max_rt < peak.rt: + return False + if min_intensity is not None and min_intensity > peak.intensity: + return False + return True + + def _plot(self, kde, X, data_type, filename, bandwidth): + if self.plot: + if data_type == MZ_INTENSITY_RT: + self.logger.debug('3D plotting for %s not implemented' % MZ_INTENSITY_RT) + else: + fname = 'All' if filename is None else filename + title = '%s density estimation for %s - bandwidth %.3f' % (data_type, fname, bandwidth) + X_plot = np.linspace(np.min(X), np.max(X), 1000)[:, np.newaxis] + log_dens = kde.score_samples(X_plot) + plt.figure() + plt.fill_between(X_plot[:, 0], np.exp(log_dens), alpha=0.5) + plt.plot(X[:, 0], np.full(X.shape[0], -0.01), '|k') + plt.title(title) + plt.show() diff --git a/Synthetic data creation scripts/vimms/DsDA.py b/Synthetic data creation scripts/vimms/DsDA.py new file mode 100644 index 00000000..18c89560 --- /dev/null +++ b/Synthetic data creation scripts/vimms/DsDA.py @@ -0,0 +1,112 @@ +import glob +import os + +import numpy as np +import pandas as pd + +from vimms.Common import load_obj + + +def get_schedule(n, schedule_dir): + while True: + 
files = sorted(glob.glob(os.path.join(schedule_dir, '*.csv'))) + if len(files) == n: + last_file = files[-1] + try: + schedule = pd.read_csv(last_file) + if schedule.shape[0] == 11951: + print("Schedule Found") + return last_file + except: + pass + + +def fragmentation_performance_chemicals(controller_directory, min_acceptable_intensity, controller_file_spec="*.p"): + global total_matched_chemicals + os.chdir(controller_directory) + file_names = glob.glob(controller_file_spec) + n_samples = len(file_names) + controllers = [] + all_chemicals = [] + for controller_index in range(n_samples): + controller = load_obj(file_names[controller_index]) + controllers.append(controller) + all_chemicals.extend(controller.mass_spec.chemicals) + all_rts = [chem.rt for chem in all_chemicals] + chemicals_found_total = np.unique(all_rts) + sample_chemical_start_rts = [[] for i in range(n_samples)] + sample_chemical_start_rts_total = [] + for i in range(n_samples): + for event in controllers[i].mass_spec.fragmentation_events: + if event.ms_level == 2: + if controllers[i].mass_spec._get_intensity(event.chem, event.query_rt, 0, + 0) > min_acceptable_intensity: + sample_chemical_start_rts[i].append(event.chem.rt) + sample_chemical_start_rts[i] = np.unique(np.array(sample_chemical_start_rts[i])).tolist() + # at this point we have collected the RTs of the all the chemicals that + # have been fragmented above the min_intensity threshold + flatten_rts = [] + for l in sample_chemical_start_rts[0:(i + 1)]: + flatten_rts.extend(l) + sample_chemical_start_rts_total.append(len(np.unique(np.array(flatten_rts)))) + total_matched_chemicals = sample_chemical_start_rts_total + print("Completed Controller", i + 1) + return chemicals_found_total, total_matched_chemicals + + +def create_frag_dicts(controller_directory, aligned_chemicals_location, min_acceptable_intensity, + controller_file_spec="*.p"): + os.chdir(controller_directory) + file_names = glob.glob(controller_file_spec) + params = [] + for 
controller_index in range(len(file_names)): + params.append({ + 'controller_directory': controller_directory + file_names[controller_index], + 'min_acceptable_intensity': min_acceptable_intensity, + 'aligned_chemicals_location': aligned_chemicals_location + }) + return params + + +def fragmentation_performance_aligned(param_dict): + controller = load_obj(param_dict["controller_directory"]) + min_acceptable_intensity = param_dict["min_acceptable_intensity"] + aligned_chemicals = pd.read_csv(param_dict["aligned_chemicals_location"]) + n_chemicals_aligned = len(aligned_chemicals["mzmed"]) + chemicals_found = 0 + + events = np.array([event for event in controller.mass_spec.fragmentation_events if event.ms_level == 2]) + event_query_rts = np.array([event.query_rt for event in events]) + event_query_mzs = np.array([controller.mass_spec._get_mz(event.chem, event.query_rt, 0, 0) for event in events]) + + chemicals_found = [0 for i in range(n_chemicals_aligned)] + + for aligned_index in range(n_chemicals_aligned): + + rtmin = aligned_chemicals['peak_rtmin'][aligned_index] + rtmax = aligned_chemicals['peak_rtmax'][aligned_index] + mzmin = aligned_chemicals['peak_mzmin'][aligned_index] + mzmax = aligned_chemicals['peak_mzmax'][aligned_index] + rtmin_check = event_query_rts > rtmin + rtmax_check = event_query_rts < rtmax + mzmin_check = event_query_mzs > mzmin + mzmax_check = event_query_mzs < mzmax + idx = np.nonzero(rtmin_check & rtmax_check & mzmin_check & mzmax_check)[0] + + for i in idx: + event = events[i] + inten = controller.mass_spec._get_intensity(event.chem, event.query_rt, 0, 0) + if inten > min_acceptable_intensity: + chemicals_found[aligned_index] = 1 + break + return chemicals_found + + +def multi_sample_fragmentation_performance_aligned(params): + chemicals_found_multi = np.array(list(map(fragmentation_performance_aligned, params))) + total_chemicals_found = [] + + for i in range(len(chemicals_found_multi)): + 
total_chemicals_found.append((chemicals_found_multi[0:(1 + i)].sum(axis=0) > 0).sum()) + + return total_chemicals_found diff --git a/Synthetic data creation scripts/vimms/Evaluation.py b/Synthetic data creation scripts/vimms/Evaluation.py new file mode 100644 index 00000000..bae247e7 --- /dev/null +++ b/Synthetic data creation scripts/vimms/Evaluation.py @@ -0,0 +1,58 @@ +from vimms.PlotsForPaper import compute_performance_scenario_2 + + +class Evaluation(object): + """ + A class to compute performance evaluation + """ + + def __init__(self, controller, evaluation_strategy): + """ + Initialises an evaluation object + :param controller: a controller + :param evaluation_strategy: a strategy to evaluate performance + """ + self.controller = controller + self.strategy = evaluation_strategy + + def compute_performance(self): + """ + Evaluates a controller performance based on the provided evaluation strategy + :return: performance values + """ + return self.strategy.evaluate(self.controller) + + +class TopNEvaluationStrategy(object): + """ + A class to compute Top-N performance acccording to Section 3.3 of the paper + """ + + def __init__(self, **params): + """ + Evaluation parameters + :param params: multiple keyword arguments of parameters that we need + + For Top-N these parameters are: + - dataset = the list of chemicals to evaluate performance on + - min_ms1_intensity = minimum MS1 intensity to fragment + - fullscan_peaks_df = a dataframe of XCMS peak-picking result on the full-scan file + - fragmentation_peaks_df = a dataframe of XCMS peak-picking result on the fragmentation file + - fullscan_filename = an optional filename to filter fullscan_peaks_df, otherwise None + - fragfile_filename = an optional filename to filter fragmentation_peaks_df, otherwise None + - matching_mz_tol = matching tolerance (in ppm) to match peaks + - matching_rt_tol = matching tolerance (in seconds) to match peaks + """ + self.params = params + + def evaluate(self, controller): + return 
compute_performance_scenario_2(controller, + self.params['dataset'], + self.params['min_ms1_intensity'], + self.params['fullscan_filename'], + self.params['fragfile_filename'], + self.params['fullscan_peaks_df'], + self.params['fragmentation_peaks_df'], + self.params['matching_mz_tol'], + self.params['matching_rt_tol'], + chem_to_frag_events=None) diff --git a/Synthetic data creation scripts/vimms/MassSpec.py b/Synthetic data creation scripts/vimms/MassSpec.py new file mode 100644 index 00000000..f77829ee --- /dev/null +++ b/Synthetic data creation scripts/vimms/MassSpec.py @@ -0,0 +1,769 @@ +import math +from collections import defaultdict +from collections import namedtuple + +import numpy as np +import pandas as pd +from events import Events + +from vimms.Common import LoggerMixin, adduct_transformation +import matplotlib.pyplot as plt + +class Peak(object): + """ + A class to represent an empirical or sampled scan-level peak object + """ + + def __init__(self, mz, rt, intensity, ms_level): + """ + Creates a peak object + :param mz: mass-to-charge value + :param rt: retention time value + :param intensity: intensity value + :param ms_level: MS level + """ + self.mz = mz + self.rt = rt + self.intensity = intensity + self.ms_level = ms_level + + def __repr__(self): + return 'Peak mz=%.4f rt=%.2f intensity=%.2f ms_level=%d' % (self.mz, self.rt, self.intensity, self.ms_level) + + def __eq__(self, other): + if not isinstance(other, Peak): + # don't attempt to compare against unrelated types + return NotImplemented + + return math.isclose(self.mz, other.mz) and \ + math.isclose(self.rt, other.rt) and \ + math.isclose(self.intensity, other.intensity) and \ + self.ms_level == other.ms_level + + +class Scan(object): + """ + A class to store scan information + """ + + def __init__(self, scan_id, mzs, intensities, ms_level, rt, scan_duration=None, isolation_windows=None, + parent=None): + """ + Creates a scan + :param scan_id: current scan id + :param mzs: an array of mz 
values + :param intensities: an array of intensity values + :param ms_level: the ms level of this scan + :param rt: the retention time of this scan + :param scan_duration: how long this scan takes, if known. + :param isolation_windows: the window to isolate precursor peak, if known + :param parent: parent precursor peak, if known + """ + assert len(mzs) == len(intensities) + self.scan_id = scan_id + + # ensure that mzs and intensites are sorted by their mz values + p = mzs.argsort() + self.mzs = mzs[p] + self.intensities = intensities[p] + + self.ms_level = ms_level + self.rt = rt + self.num_peaks = len(mzs) + + self.scan_duration = scan_duration + self.isolation_windows = isolation_windows + self.parent = parent + + def __repr__(self): + return 'Scan %d num_peaks=%d rt=%.2f ms_level=%d' % (self.scan_id, self.num_peaks, self.rt, self.ms_level) + + # TODO maybe add the noise here? + +class ScanParameters(object): + """ + A class to store parameters used to instruct the mass spec how to generate a scan. + This object is usually created by the controllers. + """ + + # possible scan parameter names + MS_LEVEL = 'ms_level' + ISOLATION_WINDOWS = 'isolation_windows' + PRECURSOR = 'precursor' + DYNAMIC_EXCLUSION_MZ_TOL = 'mz_tol' + DYNAMIC_EXCLUSION_RT_TOL = 'rt_tol' + TIME = 'time' + N = 'N' + + def __init__(self): + """ + Creates a scan parameter object + """ + self.params = {} + + def set(self, key, value): + """ + Sets scan parameter value + :param key: a scan parameter name + :param value: a scan parameter value + :return: + """ + self.params[key] = value + + def get(self, key): + """ + Gets scan parameter value + :param key: + :return: + """ + if key in self.params: + return self.params[key] + else: + return None + + def __repr__(self): + return 'ScanParameters %s' % (self.params) + + +class FragmentationEvent(object): + """ + A class to store fragmentation events. Mostly used for benchmarking purpose. 
+ """ + + def __init__(self, chem, query_rt, ms_level, peaks, scan_id): + """ + Creates a fragmentation event + :param chem: the chemical that were fragmented + :param query_rt: the time when fragmentation occurs + :param ms_level: MS level of fragmentation + :param peaks: the set of peaks produced during the fragmentation event + :param scan_id: the scan id linked to this fragmentation event + """ + self.chem = chem + self.query_rt = query_rt + self.ms_level = ms_level + self.peaks = peaks + self.scan_id = scan_id + + def __repr__(self): + return 'MS%d FragmentationEvent for %s at %f' % (self.ms_level, self.chem, self.query_rt) + + +class ExclusionItem(object): + """ + A class to store the item to exclude when computing dynamic exclusion window + """ + + def __init__(self, from_mz, to_mz, from_rt, to_rt): + """ + Creates a dynamic exclusion item + :param from_mz: m/z lower bounding box + :param to_mz: m/z upper bounding box + :param from_rt: RT lower bounding box + :param to_rt: RT upper bounding box + """ + self.from_mz = from_mz + self.to_mz = to_mz + self.from_rt = from_rt + self.to_rt = to_rt + + +class IndependentMassSpectrometer(LoggerMixin): + """ + A class that represents (synchronous) mass spectrometry process. + Independent here refers to how the intensity of each peak in a scan is independent of each other + i.e. there's no ion supression effect. + """ + MS_SCAN_ARRIVED = 'MsScanArrived' + ACQUISITION_STREAM_OPENING = 'AcquisitionStreamOpening' + ACQUISITION_STREAM_CLOSING = 'AcquisitionStreamClosing' + + def __init__(self, ionisation_mode, chemicals, peak_sampler, sig, + schedule_file=None, add_noise=False, dynamic_exclusion=True): + """ + Creates a mass spec object. 
+ :param ionisation_mode: POSITIVE or NEGATIVE + :param chemicals: a list of Chemical objects in the dataset + :param peak_sampler: an instance of DataGenerator.PeakSampler object + :param schedule_file: path to schedule (CSV) file in DsDA format + :param add_noise: a flag to indicate whether to add noise + :param dynamic_exclusion: a flag to indicate whether to perform dynamic exclusion + """ + self.peak_recorder = defaultdict(list) # Added by CJ... helps keep track of peaks + self.grp = 0 + # current scan index, internal time and schedule file if provided + self.idx = 0 + self.time = 0 + self.schedule_file = schedule_file + if self.schedule_file is not None: + self.schedule = pd.read_csv(schedule_file) + self.sig = sig + # current task queue + self.processing_queue = [] + self.repeating_scan_parameters = None + + # the events here follows IAPI events + self.events = Events((self.MS_SCAN_ARRIVED, self.ACQUISITION_STREAM_OPENING, self.ACQUISITION_STREAM_CLOSING,)) + self.event_dict = { + self.MS_SCAN_ARRIVED: self.events.MsScanArrived, + self.ACQUISITION_STREAM_OPENING: self.events.AcquisitionStreamOpening, + self.ACQUISITION_STREAM_CLOSING: self.events.AcquisitionStreamClosing + } + + # the list of all chemicals in the dataset + self.chemicals = chemicals + self.ionisation_mode = ionisation_mode # currently unused + + # stores the chromatograms start and end rt for quick retrieval + chem_rts = np.array([chem.rt for chem in self.chemicals]) + self.chrom_min_rts = np.array([chem.chromatogram.min_rt for chem in self.chemicals]) + chem_rts + self.chrom_max_rts = np.array([chem.chromatogram.max_rt for chem in self.chemicals]) + chem_rts + + # here's where we store all the stuff to sample from + self.peak_sampler = peak_sampler + + # required to sample for different scan durations based on (N, DEW) in the hybrid controller + self.current_N = 0 + self.current_DEW = 0 + + # stores the mapping between precursor peak to ms2 scans + self.precursor_information = 
defaultdict(list) # key: Precursor object, value: ms2 scans + self.add_noise = add_noise # whether to add noise to the generated fragment peaks + self.fragmentation_events = [] # which chemicals produce which peaks + + # for dynamic exclusion window + self.dynamic_exclusion = dynamic_exclusion + self.exclusion_list = [] # a list of ExclusionItem + self.noise_level_ = 0 + + #################################################################################################################### + # Public methods + #################################################################################################################### + def noise_level( self, nl ): + self.noise_level_ = nl + + def set_group( self, grp ): + self.grp = grp + + def run(self, min_time, max_time, pbar=None): + """ + Simulates running the mass spec from min_time to max_time + :param min_time: start time + :param max_time: end time + :param pbar: progress bar + :return: None + """ + max_time = self._init_time(max_time, min_time) + self.fire_event(IndependentMassSpectrometer.ACQUISITION_STREAM_OPENING) + + try: + while self.time < max_time: + + # get scan param from the processing queue and do one scan + param = self._get_param() + scan = self._get_scan(self.time, param) + + # notify the controller that a new scan has been generated + # at this point, the MS_SCAN_ARRIVED event handler in the controller is called + # and the processing queue will be updated with new sets of scan parameters to do + self.fire_event(self.MS_SCAN_ARRIVED, scan) + + # sample scan duration and increase internal time + current_level = scan.ms_level + current_N = self.current_N + current_DEW = self.current_DEW + try: + next_scan_param = self.get_processing_queue()[0] + except IndexError: + next_scan_param = None + current_scan_duration = self._increase_time(current_level, current_N, current_DEW, + next_scan_param) + scan.scan_duration = current_scan_duration + + # add precursor and DEW information based on the current scan 
produced + # the DEW list update must be done after time has been increased + self._add_precursor_info(param, scan) + if self.dynamic_exclusion: + self._manage_dynamic_exclusion_list(param, scan) + + # stores the updated value of N and DEW + self._store_next_N_DEW(next_scan_param) + + # update progress bar + self._update_progress_bar(current_scan_duration, pbar, scan) + finally: + self.fire_event(IndependentMassSpectrometer.ACQUISITION_STREAM_CLOSING) + if pbar is not None: + pbar.close() + + def get_processing_queue(self): + """ + Returns the current processing queue + :return: + """ + return self.processing_queue + + def add_to_processing_queue(self, param): + """ + Adds a new scan parameters to the processing queue of scan parameters. Usually done by the controllers. + :param param: the scan parameters to add + :return: None + """ + self.processing_queue.append(param) + + def disable_repeating_scan(self): + """ + Disable repeating scan + :return: None + """ + self.set_repeating_scan(None) + + def set_repeating_scan(self, params): + """ + Sets the parameters for the default repeating scans that will be done when the processing queue is empty. + :param params: + :return: + """ + self.repeating_scan_parameters = params + + def reset(self): + """ + Resets the mass spec state so we can reuse it again + :return: None + """ + for key in self.event_dict: # clear event handlers + self.clear(key) + self.time = 0 + self.idx = 0 + self.processing_queue = [] + self.repeating_scan_parameters = None + self.current_N = 0 + self.current_DEW = 0 + self.precursor_information = defaultdict(list) + self.fragmentation_events = [] + self.exclusion_list = [] + + def is_excluded(self, mz, rt): + """ + Checks if a pair of (mz, rt) value is currently excluded by dynamic exclusion window + :param mz: m/z value + :param rt: RT value + :return: True if excluded, False otherwise + """ + # TODO: make this faster? 
+ for x in self.exclusion_list: + exclude_mz = x.from_mz <= mz <= x.to_mz + exclude_rt = x.from_rt <= rt <= x.to_rt + if exclude_mz and exclude_rt: + self.logger.debug( + 'Time {:.6f} Excluded precursor ion mz {:.4f} rt {:.2f} because of {}'.format(self.time, mz, rt, x)) + return True + return False + + def fire_event(self, event_name, arg=None): + """ + Simulates sending an event + :param event_name: the event name + :param arg: the event parameter + :return: None + """ + if event_name not in self.event_dict: + raise ValueError('Unknown event name') + + # pretend to fire the event + # actually here we just runs the event handler method directly + e = self.event_dict[event_name] + if arg is not None: + e(arg) + else: + e() + + def register(self, event_name, handler): + """ + Register event handler + :param event_name: the event name + :param handler: the event handler + :return: None + """ + if event_name not in self.event_dict: + raise ValueError('Unknown event name') + e = self.event_dict[event_name] + e += handler # register a new event handler for e + + def clear(self, event_name): + """ + Clears event handler for a given event name + :param event_name: the event name + :return: None + """ + if event_name not in self.event_dict: + raise ValueError('Unknown event name') + e = self.event_dict[event_name] + e.targets = [] + + #################################################################################################################### + # Private methods + #################################################################################################################### + + def _init_time(self, max_time, min_time): + """ + Sets initial mass spec time + :param max_time: end time + :param min_time: start time + :return: a new end time, if it's read from DsDA CSV file, otherwise it's the same + """ + if self.schedule_file is None: + self.time = min_time + else: + self.time = self.schedule["targetTime"].values[0] + max_time = 
self.schedule["targetTime"].values[-1] + return max_time + + def _get_param(self): + """ + Retrieves a new set of scan parameters from the processing queue + :return: A new set of scan parameters from the queue if available, otherwise it returns the default scan params. + """ + # if the processing queue is empty, then just do the repeating scan + if len(self.processing_queue) == 0: + param = self.repeating_scan_parameters + else: + # otherwise pop the parameter for the next scan from the queue + param = self.processing_queue.pop(0) + return param + + def _increase_time(self, current_level, current_N, current_DEW, next_scan_param): + # look into the queue, find out what the next scan ms_level is, and compute the scan duration + # only applicable for simulated mass spec, since the real mass spec can generate its own scan duration. + self.idx += 1 + if self.schedule_file is None: + if next_scan_param is None: # if queue is empty, the next one is an MS1 scan by default + next_level = 1 + else: + next_level = next_scan_param.get(ScanParameters.MS_LEVEL) + + # sample current scan duration based on current_DEW, current_N, current_level and next_level + current_scan_duration = self._sample_scan_duration(current_DEW, current_N, + current_level, next_level) + else: + new_time = self.schedule["targetTime"][self.idx] + current_scan_duration = new_time - self.time + + self.time += current_scan_duration + self.logger.info('Time %f Len(queue)=%d' % (self.time, len(self.processing_queue))) + return current_scan_duration + + def _sample_scan_duration(self, current_DEW, current_N, current_level, next_level): + # get scan duration based on current and next level + if current_level == 1 and next_level == 1: + # special case: for the transition (1, 1), we can try to get the times for the + # fullscan data (N=0, DEW=0) if it's stored + try: + current_scan_duration = self.peak_sampler.scan_durations(current_level, next_level, 1, N=0, DEW=0) + except KeyError: ## ooops not found + 
current_scan_duration = self.peak_sampler.scan_durations(current_level, next_level, 1, + N=current_N, DEW=current_DEW) + else: # for (1, 2), (2, 1) and (2, 2) + current_scan_duration = self.peak_sampler.scan_durations(current_level, next_level, 1, + N=current_N, DEW=current_DEW) + current_scan_duration = current_scan_duration.flatten()[0] + return current_scan_duration + + def _add_precursor_info(self, param, scan): + """ + Adds precursor ion information. + If MS2 and above, and controller tells us which precursor ion the scan is coming from, store it + :param param: a scan parameter object + :param scan: the newly generated scan + :return: None + """ + precursor = param.get(ScanParameters.PRECURSOR) + if scan.ms_level >= 2 and precursor is not None: + isolation_windows = param.get(ScanParameters.ISOLATION_WINDOWS) + iso_min = isolation_windows[0][0][0] + iso_max = isolation_windows[0][0][1] + self.logger.debug('Time {:.6f} Isolated precursor ion {:.4f} at ({:.4f}, {:.4f})'.format(self.time, + precursor.precursor_mz, + iso_min, + iso_max)) + self.precursor_information[precursor].append(scan) + + def _manage_dynamic_exclusion_list(self, param, scan): + """ + Manages dynamic exclusion list + :param param: a scan parameter object + :param scan: the newly generated scan + :return: None + """ + precursor = param.get(ScanParameters.PRECURSOR) + if scan.ms_level >= 2 and precursor is not None: + # add dynamic exclusion item to the exclusion list to prevent the same precursor ion being fragmented + # multiple times in the same mz and rt window + # Note: at this point, fragmentation has occurred and time has been incremented! so the time when + # items are checked for dynamic exclusion is the time when MS2 fragmentation occurs + # TODO: we need to add a repeat count too, i.e. 
how many times we've seen a fragment peak before + # it gets excluded (now it's basically 1) + mz = precursor.precursor_mz + mz_tol = param.get(ScanParameters.DYNAMIC_EXCLUSION_MZ_TOL) + rt_tol = param.get(ScanParameters.DYNAMIC_EXCLUSION_RT_TOL) + mz_lower = mz * (1 - mz_tol / 1e6) + mz_upper = mz * (1 + mz_tol / 1e6) + rt_lower = self.time + rt_upper = self.time + rt_tol + x = ExclusionItem(from_mz=mz_lower, to_mz=mz_upper, from_rt=rt_lower, to_rt=rt_upper) + self.logger.debug('Time {:.6f} Created dynamic exclusion window mz ({}-{}) rt ({}-{})'.format( + self.time, + x.from_mz, x.to_mz, x.from_rt, x.to_rt + )) + self.exclusion_list.append(x) + + # remove expired items from dynamic exclusion list + self.exclusion_list = list(filter(lambda x: x.to_rt > self.time, self.exclusion_list)) + + def _store_next_N_DEW(self, next_scan_param): + """ + Stores the N and DEW parameter values for the next scan params + :param next_scan_param: A new set of scan parameters + :return: None + """ + if next_scan_param is not None: + # Only the hybrid controller sends these N and DEW parameters. 
For other controllers they will be None + next_N = next_scan_param.get(ScanParameters.N) + next_DEW = next_scan_param.get(ScanParameters.DYNAMIC_EXCLUSION_RT_TOL) + else: + next_N = None + next_DEW = None + + # keep track of the N and DEW values for the next scan if they have been changed by the Hybrid Controller + if next_N is not None: + self.current_N = next_N + if next_DEW is not None: + self.current_DEW = next_DEW + + def _update_progress_bar(self, elapsed, pbar, scan): + """ + Updates progress bar based on elapsed time + :param elapsed: Elapsed time to increment the progress bar + :param pbar: progress bar object + :param scan: the newly generated scan + :return: None + """ + if pbar is not None: + if self.current_N > 0 and self.current_DEW > 0: + msg = '(%.3fs) ms_level=%d N=%d DEW=%d' % (self.time, scan.ms_level, + self.current_N, self.current_DEW) + else: + msg = '(%.3fs) ms_level=%d' % (self.time, scan.ms_level) + pbar.update(elapsed) + pbar.set_description(msg) + + #################################################################################################################### + # Scan generation methods + #################################################################################################################### + + def _get_scan(self, scan_time, param, max_intensity=0): + """ + Constructs a scan at a particular timepoint + :param time: the timepoint + :return: a mass spectrometry scan at that time + """ + scan_mzs = [] # all the mzs values in this scan + scan_intensities = [] # all the intensity values in this scan + ms_level = param.get(ScanParameters.MS_LEVEL) + isolation_windows = param.get(ScanParameters.ISOLATION_WINDOWS) + scan_id = self.idx + + # for all chemicals that come out from the column coupled to the mass spec + idx = self._get_chem_indices(scan_time) + last = -1 + for i in idx: + chemical = self.chemicals[i] + # mzs is a list of (mz, intensity) for the different adduct/isotopes combinations of a chemical + mzs = 
self._get_all_mz_peaks(chemical, scan_time, ms_level, isolation_windows) + + peaks = [] + if mzs is not None: + new_data = [] + for (m,it) in mzs: + new_data.append((scan_time, m, it )) + self.peak_recorder[ str(chemical.formula) ].extend( new_data ) + chem_mzs = [] + chem_intensities = [] + for peak_mz, peak_intensity in mzs: + if peak_intensity > 0: + chem_mzs.append(peak_mz) + chem_intensities.append(peak_intensity) + p = Peak(peak_mz, scan_time, peak_intensity, ms_level) + peaks.append(p) + + scan_mzs.extend(chem_mzs) + scan_intensities.extend(chem_intensities) + + curr = len( scan_mzs ) + if curr != last: + last = curr + # for benchmarking purpose + if len(peaks) > 0: + frag = FragmentationEvent(chemical, scan_time, ms_level, peaks, scan_id) + self.fragmentation_events.append(frag) + + scan_mzs = np.array(scan_mzs) + scan_intensities = np.array(scan_intensities) + if self.grp == 2: + scan_intensities *= 2 + # Note: at this point, the scan duration is not set yet because we don't know what the next scan is going to be + # We will set it later in the get_next_scan() method after we've notified the controller that this scan is produced. 
+ return Scan(scan_id, scan_mzs, scan_intensities, ms_level, scan_time, + scan_duration=None, isolation_windows=isolation_windows) + + def _get_chem_indices(self, query_rt): + rtmin_check = self.chrom_min_rts <= query_rt + rtmax_check = query_rt <= self.chrom_max_rts + idx = np.nonzero(rtmin_check & rtmax_check)[0] + + return idx + + + def _get_all_mz_peaks(self, chemical, query_rt, ms_level, isolation_windows): + if not self._rt_match(chemical, query_rt): + return None + mz_peaks = [] + for which_isotope in range(len(chemical.isotopes)): + for which_adduct, adduct in enumerate(self._get_adducts(chemical)): + if adduct[0] != 'M+H': + continue + mz_peaks.extend( self._get_mz_peaks(chemical, query_rt, ms_level, isolation_windows, which_isotope, which_adduct) ) + if mz_peaks == []: + return None + else: + return mz_peaks + + def _get_mz_peaks(self, chemical, query_rt, ms_level, isolation_windows, which_isotope, which_adduct): + # EXAMPLE OF USE OF DEFINITION: if we wants to do an ms2 scan on a chemical. we would first have ms_level=2 and the chemicals + # ms_level =1. So we would go to the "else". We then check the ms1 window matched. It then would loop through + # the children who have ms_level = 2. 
So we then go to second elif and return the mz and intensity of each ms2 fragment + mz_peaks = [] + if ms_level == 1 and chemical.ms_level == 1: # fragment ms1 peaks + # returns ms1 peaks if chemical is has ms_level = 1 and scan is an ms1 scan + if not (which_isotope > 0 and which_adduct > 0): + # rechecks isolations window if not monoisotopic and "M + H" adduct + if self._isolation_match(chemical, query_rt, isolation_windows[0], which_isotope, which_adduct): + intensity = self._get_intensity(chemical, query_rt, which_isotope, which_adduct) + mz = self._get_mz(chemical, query_rt, which_isotope, which_adduct) + mz_peaks.extend([(mz, intensity)]) + assert len( mz_peaks ) == 1 + else: + pass + elif ms_level == chemical.ms_level: + # returns ms2 fragments if chemical and scan are both ms2, + # returns ms3 fragments if chemical and scan are both ms3, etc, etc + intensity = self._get_intensity(chemical, query_rt, which_isotope, which_adduct) + mz = self._get_mz(chemical, query_rt, which_isotope, which_adduct) + return [(mz, intensity)] + # TODO: Potential improve how the isotope spectra are generated + else: + # check isolation window for ms2+ scans, queries children if isolation windows ok + if self._isolation_match(chemical, query_rt, isolation_windows[chemical.ms_level - 1], which_isotope, + which_adduct) and chemical.children is not None: + for i in range(len(chemical.children)): + mz_peaks.extend(self._get_mz_peaks(chemical.children[i], query_rt, ms_level, isolation_windows, + which_isotope, which_adduct)) + else: + return [] + return mz_peaks + + def _get_adducts(self, chemical): + if chemical.ms_level == 1: + return chemical.adducts + else: + return self._get_adducts(chemical.parent) + + def _rt_match(self, chemical, query_rt): + return chemical.ms_level != 1 or chemical.chromatogram._rt_match(query_rt - chemical.rt) + + def _get_intensity(self, chemical, query_rt, which_isotope, which_adduct): + # TODO this is the part where I will set the intensity + if 
class DsDAMassSpec(IndependentMassSpectrometer):
    """
    A mass spec class with fixed schedule time, used during DsDA experiment in the paper.
    TODO: Could probably be removed.
    """

    def run(self, schedule, pbar=None):
        """
        Generates one scan for every entry of the processing queue, at the time stored in its scan parameters.
        The controller is notified of each scan through the MS_SCAN_ARRIVED event, and the acquisition stream
        opening/closing events are fired around the whole run.

        :param schedule: the acquisition schedule; ``schedule["targetTime"][0]`` is used as the starting time
        :param pbar: optional progress bar object providing ``update`` and ``close``
        :return: None
        """
        self.schedule = schedule
        self.time = schedule["targetTime"][0]
        self.fire_event(IndependentMassSpectrometer.ACQUISITION_STREAM_OPENING)

        try:
            last_ms1_id = 0
            while len(self.processing_queue) != 0:
                scan_params = self.processing_queue.pop(0)

                # make a scan. Only the scan time and its parameters are passed on: the extra `max_intensity`
                # and `noise` arguments used here before are not defined anywhere in this method
                target_time = scan_params.get(ScanParameters.TIME)
                scan = self._get_scan(target_time, scan_params)

                # set scan duration from the time of the next queued scan
                try:
                    next_time = self.processing_queue[0].get(ScanParameters.TIME)
                except IndexError:
                    # NOTE(review): for the last scan the "next" time is hard-coded to 1, so its duration is
                    # 1 - target_time -- confirm this is the intended duration of the final scan
                    next_time = 1
                scan.scan_duration = next_time - target_time

                # update precursor scan id
                if scan.ms_level == 1:
                    last_ms1_id = scan.scan_id
                else:
                    precursor = scan_params.get(ScanParameters.PRECURSOR)
                    if precursor is not None:
                        # a fragmentation scan refers back to the most recent MS1 scan
                        precursor.precursor_scan_id = last_ms1_id
                        self.precursor_information[precursor].append(scan)

                # notify controller about this scan
                self.fire_event(self.MS_SCAN_ARRIVED, scan)

                # increase mass spec time
                self.idx += 1
                self.time += scan.scan_duration

                # print a progress bar if provided
                if pbar is not None:
                    elapsed = self.time
                    pbar.update(elapsed)
                    # TODO: fix error bar
                    # NOTE(review): `elapsed` is the absolute time, not the increment since the last update

        finally:
            self.fire_event(IndependentMassSpectrometer.ACQUISITION_STREAM_CLOSING)
            if pbar is not None:
                pbar.close()
def gibbs_sampler(X, observed, R, prior_u, prec_u, prior_v, prec_v, alpha, n_its=1000, burn_in=100, true_V=None,
                  sample_known=True):
    """
    Gibbs sampler for the factorisation X ~ U V^T, using only the entries of X flagged in `observed`.

    Rows of U and V are drawn in turn from multivariate normal conditionals. Draws from iterations with
    ``it > burn_in`` (that is ``n_its - burn_in - 1`` draws) are kept.

    :param X: (N, M) data matrix; entries that are not observed are never read
    :param observed: (N, M) array, non-zero where X is observed
    :param R: number of latent dimensions
    :param prior_u: prior mean of a row of U (length R)
    :param prec_u: prior precision matrix of a row of U (R x R)
    :param prior_v: prior mean of a row of V (length R)
    :param prec_v: prior precision matrix of a row of V (R x R)
    :param alpha: precision of the observations
    :param n_its: number of Gibbs iterations
    :param burn_in: iterations with index <= burn_in are discarded
    :param true_V: optional (M, R) array; when given (and non-empty) V is held fixed at this value
    :param sample_known: when False, only rows of U that have at least one unobserved entry are resampled
    :return: if `true_V` is not given, the tuple (mean of kept U draws, mean of kept V draws).
        If `true_V` is given and `sample_known` is True, the list of kept U draws. Otherwise the tuple
        (indices of the resampled rows, list of kept U draws restricted to those rows).
    """
    # initialise
    N, M = X.shape
    fixed_V = true_V is not None and len(true_V) > 0
    U = np.random.normal(size=(N, R))
    V = true_V if fixed_V else np.random.normal(size=(M, R))
    tot_U = np.zeros((N, R))
    tot_V = np.zeros((M, R))
    samples_U = []
    n_kept = 0
    # boolean view of `observed`, used to select the entries that contribute to each update
    mask = np.asarray(observed).astype(bool)

    range_U = range(N)
    if sample_known is False:
        range_U = np.where(np.sum(observed, axis=1) != len(observed[0, :]))[0].tolist()

    for it in range(n_its):
        # loop over u, updating them
        # first compute the covariance - shared if all data observed
        cov_mat = np.linalg.inv(prec_u + alpha * np.dot(V.T, V))
        for n in range_U:
            if observed[n, :].sum() < M:
                # not all data observed, compute specific precision
                this_prec_mat = prec_u + alpha * np.dot(np.dot(V.T, np.diag(observed[n, :])), V)
                this_cov_mat = np.linalg.inv(this_prec_mat)
            else:
                this_cov_mat = cov_mat
            cols = mask[n, :]
            s = alpha * np.dot(X[n, cols], V[cols, :]) + np.dot(prec_u, prior_u)
            cond_mu = np.dot(this_cov_mat, s)
            U[n, :] = np.random.multivariate_normal(cond_mu, this_cov_mat)

        # loop over v updating them (skipped when V is fixed)
        if not fixed_V:
            cov_mat = np.linalg.inv(prec_v + alpha * np.dot(U.T, U))
            for m in range(M):
                if observed[:, m].sum() < N:
                    this_prec_mat = prec_v + alpha * np.dot(np.dot(U.T, np.diag(observed[:, m])), U)
                    this_cov_mat = np.linalg.inv(this_prec_mat)
                else:
                    this_cov_mat = cov_mat
                rows = mask[:, m]
                s = alpha * np.dot(X[rows, m], U[rows, :]) + np.dot(prec_v, prior_v)
                cond_mu = np.dot(this_cov_mat, s)
                V[m, :] = np.random.multivariate_normal(cond_mu, this_cov_mat)

        if it > burn_in:
            tot_U += U
            tot_V += V
            n_kept += 1
            samples_U.append(U.copy())

    if not fixed_V:
        # average over the number of draws that were actually accumulated; dividing by n_its - burn_in
        # (one more than that) would shrink the means towards zero. max() avoids 0/0 when nothing was kept
        denominator = max(n_kept, 1)
        return tot_U / denominator, tot_V / denominator

    if sample_known is True:
        return samples_U

    updated_samples_U = [sample[range_U, :] for sample in samples_U]
    return range_U, updated_samples_U
in range(self.N): + for m in range(self.M): + outer_expect += Z[n,m] * (np.trace(np.matmul(self.e_wwt[m], self.sigx[n])) + np.matmul( + np.matmul(self.e_X[n, :], self.e_wwt[m]), self.e_X[n, :][np.newaxis].T)) + RSS += (ZY[n, m] ** 2) - 2 * np.matmul(self.e_w[m].T, self.e_X[n]) * (ZY[n, m]) + f = b + 0.5 * RSS + 0.5 * outer_expect + e_tau = e / f + self.e_tau.append(e_tau[0]) + e_log_tau = np.mean(np.log(np.random.gamma(shape=e, scale=1 / f, size=1000))) + self.e_log_tau.append(e_log_tau) + + # Compute the bound + if compute_LB is True: + LB = a * np.log(b) + (a - 1) * e_log_tau - b * e_tau - scipy.special.loggamma(a) + LB -= (e * np.log(f) + (e - 1) * e_log_tau - f * e_tau - scipy.special.loggamma(e)) + + for n in range(self.N): + LB += (-(self.D / 2) * np.log(2 * math.pi) - 0.5 * sum(np.diag(self.sigx[n])) + sum(self.e_X[n, :] ** 2)) + LB -= (-(self.D / 2) * np.log(2 * math.pi) - 0.5 * np.log(np.linalg.det(self.sigx[n])) - 0.5 * self.D) + + for m in range(self.M): + LB += (-(self.D / 2) * np.log(2 * math.pi) - 0.5 * sum(np.diag(self.sigw[m])) + sum(self.e_w[m, :] ** 2)) + print((-(self.D / 2) * np.log(2 * math.pi) - 0.5 * np.log(np.linalg.det(self.sigw[m])) - 0.5 * self.D)) + LB -= (-(self.D / 2) * np.log(2 * math.pi) - 0.5 * np.log(np.linalg.det(self.sigw[m])) - 0.5 * self.D) + + # likelihood bit + LB += (-(self.N * self.M / 2) * np.log(2 * math.pi) + (self.N * self.M / 2) * e_log_tau - 0.5 * e_tau * sum( + sum((ZY ** 2))) - 2 * sum(sum(Z * (np.multiply(np.matmul(self.e_w, self.e_X.T).T, Y)))) + outer_expect) + self.B.append(LB) + + # break if change in bound is less than the tolerance + if it > 2: + if abs(self.B[-1] - self.B[-2]) < tol: + break + # reconstruct Y + self.Y_reconstructed = np.matmul(self.e_X, self.e_w.T) + + def update(self, new_Z, new_Y): + # assume last Y is only new observations + self.N = new_Y.shape[0] + self.M = new_Y.shape[1] + if self.Z.shape[0] < new_Z.shape[0]: + self.e_X = np.concatenate((self.e_X, np.array([[0 for i in 
class MzmlWriter(object):
    """A class to write peak data to mzML file"""

    def __init__(self, analysis_name, scans, precursor_information=None):
        """
        Initialises the mzML writer class.
        :param analysis_name: Name of the analysis.
        :param scans: A dictionary where key is scan level, value is a list of Scans object for that level.
        :param precursor_information: A dictionary where key is Precursor object, value is a list of ms2 scans only
        """
        self.analysis_name = analysis_name
        self.scans = scans
        self.precursor_information = precursor_information

    def write_mzML(self, out_file):
        """
        Writes all scans, followed by the total ion current chromatogram, to an mzML file.
        :param out_file: path of the mzML file to create
        :return: None
        """
        # if directory doesn't exist, create it (a bare file name has no directory part to create)
        out_dir = os.path.dirname(out_file)
        if out_dir:
            create_if_not_exist(out_dir)

        # start writing mzML here
        with PsimsMzMLWriter(open(out_file, 'wb')) as writer:
            # add default controlled vocabularies
            writer.controlled_vocabularies()

            # write other fields like sample list, software list, etc.
            self._write_info(writer)

            # open the run
            with writer.run(id=self.analysis_name):
                self._write_spectra(writer, self.scans, self.precursor_information)

                # open chromatogram list sections
                with writer.chromatogram_list(count=1):
                    tic_rts, tic_intensities = self._get_tic_chromatogram(self.scans)
                    writer.write_chromatogram(tic_rts, tic_intensities, id='tic',
                                              chromatogram_type='total ion current chromatogram',
                                              time_unit='second')

        writer.close()

    def _write_info(self, out):
        """
        Writes the file description and the (mostly empty) metadata lists that precede the run.
        :param out: the mzML writer
        :return: None
        """
        # check file contains what kind of spectra
        has_ms1_spectrum = 1 in self.scans
        # any level above 1 is a fragmentation level, whether or not MS1 scans are present as well
        has_msn_spectrum = any(ms_level > 1 for ms_level in self.scans)
        file_contents = [
            'profile spectrum'
        ]
        if has_ms1_spectrum:
            file_contents.append('MS1 spectrum')
        if has_msn_spectrum:
            file_contents.append('MSn spectrum')
        out.file_description(
            file_contents=file_contents,
            source_files=[]
        )
        out.sample_list(samples=[])
        out.software_list(software_list={
            'id': 'VMS',
            'version': '1.0.0'
        })
        out.scan_settings_list(scan_settings=[])
        out.instrument_configuration_list(instrument_configurations={
            'id': 'VMS',
            'component_list': []
        })
        out.data_processing_list({'id': 'VMS'})

    def sort_filter(self, all_scans):
        """
        Sorts scans by scan id and gives every empty scan a single placeholder peak.
        Note that empty scans are modified in place.
        :param all_scans: the scans to sort
        :return: a new list of the scans, ordered by scan id
        """
        all_scans = sorted(all_scans, key=lambda x: x.scan_id)

        # add a single peak to empty scans
        empty = [x for x in all_scans if x.num_peaks == 0]
        for scan in empty:
            scan.mzs = np.array([100.0])
            scan.intensities = np.array([1.0])
            scan.num_peaks = 1
        return all_scans

    def _write_spectra(self, writer, scans, precursor_information):
        """
        Writes the spectrum list, with scans of all ms levels interleaved in scan id order.
        :param writer: the mzML writer
        :param scans: A dictionary where key is scan level, value is a list of Scans object for that level.
        :param precursor_information: A dictionary where key is Precursor object, value is a list of the scans
            fragmenting it. May be None, in which case no precursor is written for any scan.
        :return: None
        """
        assert len(scans) <= 3  # NOTE: we only support writing up to ms2 scans for now

        # get all scans across different ms_levels and sort them by scan_id
        all_scans = []
        for ms_level in scans:
            all_scans.extend(scans[ms_level])
        all_scans = self.sort_filter(all_scans)
        spectrum_count = len(all_scans)

        # get precursor information for each scan, if available. Every scan listed for a precursor is
        # linked to it, so a precursor that was fragmented more than once is handled too
        scan_precursor = {}
        for precursor, fragment_scans in (precursor_information or {}).items():
            for fragment_scan in fragment_scans:
                scan_precursor[fragment_scan.scan_id] = precursor

        # write scans
        with writer.spectrum_list(count=spectrum_count):
            for scan in all_scans:
                self._write_scan(writer, scan, scan_precursor.get(scan.scan_id))

    def _get_scan_id(self, scan_id):
        """
        Converts a scan id to the id written to the file (currently the identity).
        :param scan_id: the scan id
        :return: the id to write
        """
        return scan_id

    def _write_scan(self, out, scan, precursor):
        """
        Writes a single spectrum.
        :param out: the mzML writer
        :param scan: the scan to write, which must contain at least one peak
        :param precursor: the Precursor object the scan was fragmented from, or None
        :return: None
        """
        assert scan.num_peaks > 0
        label = 'MS1 Spectrum' if scan.ms_level == 1 else 'MSn Spectrum'
        precursor_information = None
        if precursor is not None:
            precursor_information = {
                "mz": precursor.precursor_mz,
                "intensity": precursor.precursor_intensity,
                "charge": precursor.precursor_charge,
                "spectrum_reference": self._get_scan_id(precursor.precursor_scan_id),
                "activation": ["HCD", {"collision energy": 25.0}]
            }
        lowest_observed_mz = min(scan.mzs)
        highest_observed_mz = max(scan.mzs)

        out.write_spectrum(
            scan.mzs, scan.intensities,
            id=self._get_scan_id(scan.scan_id),
            centroided=True,
            scan_start_time=scan.rt / 60.0,  # scan.rt is divided by 60 to convert it for the writer
            scan_window_list=[DEFAULT_MS1_SCAN_WINDOW],
            params=[
                {label: ''},
                {'ms level': scan.ms_level},
                {'total ion current': np.sum(scan.intensities)},
                {'lowest observed m/z': lowest_observed_mz},
                {'highest observed m/z': highest_observed_mz},
            ],
            precursor_information=precursor_information
        )

    def _get_tic_chromatogram(self, scans):
        """
        Computes the total ion current chromatogram from the MS1 scans.
        :param scans: A dictionary where key is scan level, value is a list of Scans object for that level.
        :return: a tuple (array of retention times, array of summed intensities); both are empty when there
            are no MS1 scans
        """
        time_array = []
        intensity_array = []
        for ms1_scan in scans.get(1, []):
            time_array.append(ms1_scan.rt)
            intensity_array.append(np.sum(ms1_scan.intensities))
        time_array = np.array(time_array)
        intensity_array = np.array(intensity_array)
        return time_array, intensity_array
def filter_df(df, min_ms1_intensity, rt_range, mz_range):
    """
    Filters a peak table by retention time, m/z and intensity, then adds derived columns.

    The columns 'log_intensity' and 'group' are always added. 'N' is parsed from the file name with get_N();
    it is left out when the file name has too few '_'-separated parts, and set to NaN when the parsed value
    is not an integer.

    :param df: peak table with at least the columns 'rt', 'mz', 'maxo' and 'filename'
    :param min_ms1_intensity: keep rows whose 'maxo' is above this value, or None to keep all
    :param rt_range: [(min_rt, max_rt)] exclusive bounds on 'rt', or None to keep all
    :param mz_range: [(min_mz, max_mz)] exclusive bounds on 'mz', or None to keep all
    :return: the filtered table with the extra columns. When no filter is given this is `df` itself,
        modified in place; otherwise it is a new dataframe.
    """
    unfiltered = df

    # filter by rt range
    if rt_range is not None:
        df = df[(df['rt'] > rt_range[0][0]) & (df['rt'] < rt_range[0][1])]

    # filter by mz range (on the 'mz' column -- the bounds are m/z values, not retention times)
    if mz_range is not None:
        df = df[(df['mz'] > mz_range[0][0]) & (df['mz'] < mz_range[0][1])]

    # filter by min intensity
    intensity_col = 'maxo'
    if min_ms1_intensity is not None:
        df = df[(df[intensity_col] > min_ms1_intensity)]

    # detach a filtered selection from its parent before adding columns to it
    if df is not unfiltered:
        df = df.copy()

    # add log intensity column
    df['log_intensity'] = np.log(df[intensity_col])

    # add N column
    try:
        df['N'] = df.apply(lambda row: get_N(row), axis=1)
        df[['N']] = df[['N']].astype('int')
    except IndexError:
        # file names that do not follow the expected pattern carry no N
        pass
    except ValueError:
        df['N'] = np.nan

    # add group column
    df['group'] = df.apply(lambda row: experiment_group(row), axis=1)
    return df
def match(chemical_list_1, chemical_list_2, mz_tol, rt_tol, verbose=False):
    """
    Matches every chemical of the first list to a chemical of the second list.

    A candidate matches when the query's first isotope m/z lies within `mz_tol` ppm of the candidate's and
    its rt within `rt_tol` of the candidate's; find_chem() chooses among several candidates.

    :param chemical_list_1: the chemicals to look up
    :param chemical_list_2: the chemicals to search in
    :param mz_tol: m/z tolerance in ppm
    :param rt_tol: retention time tolerance
    :param verbose: print progress every 1000 queries
    :return: a dictionary from each matched chemical of the first list to its match in the second list
    """
    candidates = np.array(chemical_list_2)
    lower_factor = 1 - mz_tol / 1e6
    upper_factor = 1 + mz_tol / 1e6

    # tolerance window around every candidate
    min_rts = np.array([chem.rt - rt_tol for chem in candidates])
    max_rts = np.array([chem.rt + rt_tol for chem in candidates])
    min_mzs = np.array([chem.isotopes[0][0] * lower_factor for chem in candidates])
    max_mzs = np.array([chem.isotopes[0][0] * upper_factor for chem in candidates])

    total = len(chemical_list_1)
    matches = {}
    for position, to_find in enumerate(chemical_list_1):
        if position % 1000 == 0 and verbose:
            print('%d/%d found %d' % (position, total, len(matches)))
        found = find_chem(to_find, min_rts, max_rts, min_mzs, max_mzs, candidates)
        if found:
            matches[to_find] = found
    return matches
def plot_matched_precursors(matches, min_mz, max_mz, min_rt, max_rt, out_file=None):
    """
    Scatter-plots precursors in the (rt, m/z) plane, blue when matched and red when unmatched.

    :param matches: dictionary from (mz, rt, intensity) to the matched peak, or None if there is no match
    :param min_mz: only precursors with m/z strictly above this value are drawn
    :param max_mz: only precursors with m/z strictly below this value are drawn
    :param min_rt: only precursors with rt strictly above this value are drawn
    :param max_rt: only precursors with rt strictly below this value are drawn
    :param out_file: if given, the figure is saved to this path
    :return: None
    """
    plt.figure(figsize=(12, 6))
    plt.rcParams.update({'font.size': 24})

    for precursor, hit in matches.items():
        mz, rt, intensity = precursor
        inside_window = min_mz < mz < max_mz and min_rt < rt < max_rt
        if not inside_window:
            continue
        colour = 'blue' if hit is not None else 'red'
        plt.plot([rt], [mz], marker='.', markersize=5, color=colour, alpha=0.1)

    # the legend is built by hand because every point is drawn by its own plot call
    legend_entries = [
        mpatches.Patch(color='blue', label='Matched'),
        mpatches.Patch(color='red', label='Unmatched'),
    ]
    plt.legend(handles=legend_entries)
    plt.title('Matched fragmentation events', fontsize=30)
    plt.xlabel('Retention Time (s)')
    plt.ylabel('m/z')
    plt.tight_layout()
    if out_file is not None:
        plt.savefig(out_file, dpi=300)
def find_match(query, min_rts, max_rts, min_mzs, max_mzs, mz_list, rt_list, intensity_list):
    '''
    Find the reference peak whose m/z and retention time windows both contain the query peak.
    :param query: a tuple of (mz, rt, intensity) for the peak to look up; the intensity is not used
    :param min_rts: lower retention time bound of every reference peak
    :param max_rts: upper retention time bound of every reference peak
    :param min_mzs: lower m/z bound of every reference peak
    :param max_mzs: upper m/z bound of every reference peak
    :param mz_list: m/z values of the reference peaks
    :param rt_list: retention times of the reference peaks
    :param intensity_list: intensities of the reference peaks
    :return: the (mz, rt, intensity) tuple of the matching reference peak, or None if no window
             contains the query. When several windows contain it, the reference peak closest in
             retention time is returned (the first one on ties).
    '''
    query_mz, query_rt, _query_intensity = query

    # accept any sequence, not only numpy arrays: the comparisons and the
    # index-array selection below both need array semantics
    min_rts, max_rts = np.asarray(min_rts), np.asarray(max_rts)
    min_mzs, max_mzs = np.asarray(min_mzs), np.asarray(max_mzs)

    # check ranges (bounds are inclusive)
    inside_window = ((min_rts <= query_rt) & (query_rt <= max_rts)
                     & (min_mzs <= query_mz) & (query_mz <= max_mzs))
    candidates = np.nonzero(inside_window)[0]
    if len(candidates) == 0:  # no match
        return None

    # single match -> argmin is trivially 0; multiple matches -> take the closest in rt
    rt_list = np.asarray(rt_list)
    best = candidates[np.argmin(np.abs(rt_list[candidates] - query_rt))]
    return (np.asarray(mz_list)[best], rt_list[best], np.asarray(intensity_list)[best],)
def load_controllers(results_dir, Ns, rt_tols, experiment_name=None):
    '''
    Load the pickled controllers for every combination of N and rt_tol.
    :param results_dir: directory containing the pickled controllers
    :param Ns: iterable of N values
    :param rt_tols: iterable of rt_tol values
    :param experiment_name: experiment name forwarded to load_controller(). It is required as soon
                            as there is at least one (N, rt_tol) combination to load; it defaults to
                            None only so that the original three-argument call signature still exists.
    :return: a list of the controllers that could be loaded, in (N, rt_tol) iteration order;
             combinations for which load_controller() returns None are skipped
    :raises TypeError: if a controller has to be loaded but experiment_name was not given
    '''
    controllers = []
    for N in Ns:
        for rt_tol in rt_tols:
            if experiment_name is None:
                # load_controller() takes (results_dir, experiment_name, N, rt_tol); the previous
                # three-argument call always failed with a TypeError, so fail the same way
                # but with an actionable message.
                raise TypeError('load_controllers() needs experiment_name to locate the pickled controllers')
            controller = load_controller(results_dir, experiment_name, N, rt_tol)
            if controller is not None:
                controllers.append(controller)
    return controllers
update_matched_status(dataset, matches_fullscan, None) + + # positive instances are ground truth MS1 peaks found by XCMS + # negative instances are chemicals that cannot be matched to XCMS output + positives = list(filter(lambda x: x.found_in_fullscan, dataset)) + negatives = list(filter(lambda x: not x.found_in_fullscan, dataset)) + + # for both positive and negative instances, count how many frag events they have + # and whether it's above (good) or below (bad) the minimum ms1 intensity at the time of fragmentation. + positives_count = get_chem_frag_counts(positives, chem_to_frag_events, min_ms1_intensity) + negatives_count = get_chem_frag_counts(negatives, chem_to_frag_events, min_ms1_intensity) + + # TP = positive instances that are good only + tp = [chem for chem in positives if positives_count[chem]['good'] > 0 and positives_count[chem]['bad'] == 0] + + # FP = negative instances that are fragmented (both good + bad) + fp = [chem for chem in negatives if negatives_count[chem]['good'] > 0 or negatives_count[chem]['bad'] > 0] + + # FN = positive instances that are not fragmented at all + positive instances that are bad only + fn = [chem for chem in positives if \ + (positives_count[chem]['good'] == 0 and positives_count[chem]['bad'] == 0) or \ + (positives_count[chem]['good'] == 0 and positives_count[chem]['bad'] > 0)] + + tp = len(tp) + fp = len(fp) + fn = len(fn) + prec, rec, f1 = compute_pref_rec_f1(tp, fp, fn) + return tp, fp, fn, prec, rec, f1 + + +# def compute_performance_scenario_1(controller, chemicals, min_ms1_intensity, +# fullscan_filename, P_peaks_df, +# matching_mz_tol, matching_rt_tol, +# chem_to_frag_events=None): + +# if chem_to_frag_events is None: # read MS2 fragmentation events from pickled controller +# chem_to_frag_events = get_frag_events(controller, 2) + +# # match xcms picked ms1 peaks to fragmentation peaks +# detected_ms1 = df_to_chemicals(P_peaks_df, fullscan_filename) +# matches_fullscan = match(detected_ms1, chemicals, 
matching_mz_tol, matching_rt_tol, verbose=False) +# matched_frags = set(matches_fullscan.values()) +# print('%d/%d %d/%d' % (len(matches_fullscan), len(detected_ms1), len(matched_frags), len(chemicals))) + +# # ms1 peaks that are also fragmented +# positives = [] +# for ms1_peak in matches_fullscan: +# frag_peak = matches_fullscan[ms1_peak] +# frag_events = chem_to_frag_events[frag_peak] +# if len(frag_events) > 0: +# positives.append(frag_peak) + +# # fragmentation peaks that are not in ms1 peaks +# negatives = [] +# for frag_peak in chemicals: +# if frag_peak not in matched_frags: +# frag_events = chem_to_frag_events[frag_peak] +# if len(frag_events) > 0: +# negatives.append(frag_peak) + +# positives_count = get_chem_frag_counts(positives, chem_to_frag_events, min_ms1_intensity) +# negatives_count = get_chem_frag_counts(negatives, chem_to_frag_events, min_ms1_intensity) + +# # peaks from ground truth (found in full-scan files) that are fragmented above the minimum intensity threshold +# tp = [chem for chem in positives if positives_count[chem]['good'] > 0 and positives_count[chem]['bad'] == 0] +# tp = len(tp) + +# # peaks from ground truth that are not fragmented + peaks from ground truth that are fragmented below the minimum intensity threshold. +# fp = len(detected_ms1) - tp + +# # peaks not from ground truth that are fragmented above the minimum intensity threshold. 
+# fn = [chem for chem in negatives if negatives_count[chem]['good'] > 0 and negatives_count[chem]['bad'] == 0] +# fn = len(fn) + +# prec, rec, f1 = compute_pref_rec_f1(tp, fp, fn) +# return tp, fp, fn, prec, rec, f1 + + +def compute_performance_scenario_2(controller, dataset, min_ms1_intensity, + fullscan_filename, fragfile_filename, + fullscan_peaks_df, fragmentation_peaks_df, + matching_mz_tol, matching_rt_tol, + chem_to_frag_events=None): + if chem_to_frag_events is None: # read MS2 fragmentation events from pickled controller + chem_to_frag_events = get_frag_events(controller, 2) + + # load the list of xcms-picked peaks + detected_from_fullscan = df_to_chemicals(fullscan_peaks_df, fullscan_filename) + detected_from_fragfile = df_to_chemicals(fragmentation_peaks_df, fragfile_filename) + + # match with xcms peak-picked ms1 data from fullscan file + matches_fullscan = match(dataset, detected_from_fullscan, matching_mz_tol, matching_rt_tol, verbose=False) + + # match with xcms peak-picked ms1 data from fragmentation file + matches_fragfile = match(dataset, detected_from_fragfile, matching_mz_tol, matching_rt_tol, verbose=False) + + # check if matched and set a flag to indicate that + update_matched_status(dataset, matches_fullscan, matches_fragfile) + + # True positive: a peak that is fragmented above the minimum MS1 intensity and is picked by XCMS from + # the MS1 information in the DDA file and is picked in the fullscan file. + found_in_both = list(filter(lambda x: x.found_in_fullscan and x.found_in_fragfile, dataset)) + frag_count = get_chem_frag_counts(found_in_both, chem_to_frag_events, min_ms1_intensity) + tp = [chem for chem in found_in_both if frag_count[chem]['good'] > 0 and frag_count[chem]['bad'] == 0] + tp = len(tp) + + # False positive: any peak that is above minimum intensity and is picked by XCMS + # from the DDA file but is not picked from the fullscan. 
def get_frag_events(controller, ms_level):
    '''
    Group the controller's fragmentation events of one MS level by the chemical they belong to.
    :param controller: an object exposing mass_spec.fragmentation_events
    :param ms_level: only events whose ms_level equals this value are kept
    :return: a plain dictionary mapping each chemical (frag_event.chem) to the list of its
             fragmentation events, in the order they appear in the controller
    '''
    events_by_chem = {}
    for event in controller.mass_spec.fragmentation_events:
        if event.ms_level == ms_level:
            events_by_chem.setdefault(event.chem, []).append(event)
    return events_by_chem
def get_chem_frag_counts(chem_list, chem_to_frag_events, min_ms1_intensity):
    '''
    Count the good and bad fragmentation events of every chemical in chem_list.
    :param chem_list: the chemicals to count
    :param chem_to_frag_events: a dictionary of chemicals to fragmentation events
    :param min_ms1_intensity: intensity threshold forwarded to count_frag_events()
    :return: a dictionary mapping each chemical to {'good': <count>, 'bad': <count>}; both counts
             are 0 when count_frag_events() raises KeyError for the chemical
    '''
    counts = {}
    for chem in chem_list:
        try:
            good, bad = count_frag_events(chem, chem_to_frag_events, min_ms1_intensity)
        except KeyError:
            # treated as "no fragmentation events recorded for this chemical"
            good, bad = 0, 0
        counts[chem] = {'good': good, 'bad': bad}
    return counts
def compute_pref_rec_f1(tp, fp, fn):
    '''
    Compute precision, recall and F1 score from raw counts.
    :param tp: number of true positives
    :param fp: number of false positives
    :param fn: number of false negatives
    :return: a tuple of (precision, recall, f1). A score whose denominator is zero (nothing
             predicted, nothing to find, or precision + recall == 0) is reported as 0.0
             instead of raising ZeroDivisionError.
    '''
    predicted = tp + fp
    actual = tp + fn
    prec = tp / predicted if predicted > 0 else 0.0
    rec = tp / actual if actual > 0 else 0.0
    total = prec + rec
    f1 = (2 * prec * rec) / total if total > 0 else 0.0
    return prec, rec, f1
P_peaks_df, + matching_mz_tol, matching_rt_tol, + chem_to_frag_events=chem_to_frag_events) + elif scenario == 2: + fragfile_filename = os.path.basename(fragfile) + tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(controller, chemicals, + fragmentation_min_ms1_intensity, + fullscan_filename, fragfile_filename, + P_peaks_df, Q_peaks_df, matching_mz_tol, + matching_rt_tol, + chem_to_frag_events=chem_to_frag_events) + + return N, rt_tol, scenario, tp, fp, fn, prec, rec, f1 + + +def get_chem_to_frag_events(chemicals, ms1_df): + # used for searching later + min_rts = np.array([min(chem.chromatogram.raw_rts) for chem in chemicals]) + max_rts = np.array([max(chem.chromatogram.raw_rts) for chem in chemicals]) + min_mzs = np.array([min(chem.chromatogram.raw_mzs) for chem in chemicals]) + max_mzs = np.array([max(chem.chromatogram.raw_mzs) for chem in chemicals]) + + # loop over each fragmentation event in ms1_df, attempt to match it to chemicals + chem_to_frag_events = defaultdict(list) + for idx, row in ms1_df.iterrows(): + query_rt = row['ms1_scan_rt'] + query_mz = row['ms1_mz'] + query_intensity = row['ms1_intensity'] + scan_id = row['ms2_scan_id'] + + chem = None + idx = _get_chem_indices(query_mz, query_rt, min_mzs, max_mzs, min_rts, max_rts) + if len(idx) == 1: # single match + chem = chemicals[idx][0] + + elif len( + idx) > 1: # multiple matches, find the closest in intensity to query_intensity at the time of fragmentation + matches = chemicals[idx] + possible_intensities = np.array([get_absolute_intensity(chem, query_rt) for chem in matches]) + closest = find_nearest_index_in_array(possible_intensities, query_intensity) + chem = matches[closest] + + # create frag event for the given chem + if chem is not None: + ms_level = 2 + peaks = [] # we don't know which ms2 peaks are linked to this chem object + # key = get_key(chem) + frag_event = FragmentationEvent(chem, query_rt, ms_level, peaks, scan_id) + chem_to_frag_events[chem].append(frag_event) + return 
def evaluate_serial(all_params):
    '''
    Run calculate_performance() on each parameter set, one after another, and tabulate the results.
    :param all_params: an iterable of parameter dictionaries, each passed to calculate_performance()
    :return: a pandas dataframe with one row per parameter set and the columns
             N, rt_tol, scenario, TP, FP, FN, Prec, Rec, F1
    '''
    column_names = ['N', 'rt_tol', 'scenario', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1']
    summary_template = 'N=%d rt_tol=%d scenario=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f\n'

    rows = []
    for params in all_params:
        outcome = calculate_performance(params)
        rows.append(outcome)
        # progress report for this parameter set
        print(summary_template % outcome)

    return pd.DataFrame(rows, columns=column_names)
file mode 100644 index 00000000..054d0704 --- /dev/null +++ b/Synthetic data creation scripts/vimms/Roi.py @@ -0,0 +1,336 @@ +import bisect +import math +from collections import OrderedDict + +import numpy as np +import pylab as plt +import pymzml +from scipy.stats import pearsonr +import os + +from vimms.Chemicals import ChemicalCreator, UnknownChemical, GET_MS2_BY_PEAKS +from vimms.Chromatograms import EmpiricalChromatogram +from vimms.Common import PROTON_MASS, CHEM_NOISE, save_obj + +POS_TRANSFORMATIONS = OrderedDict() +POS_TRANSFORMATIONS['M+H'] = lambda mz: (mz + PROTON_MASS) +POS_TRANSFORMATIONS['[M+ACN]+H'] = lambda mz: (mz + 42.033823) +POS_TRANSFORMATIONS['[M+CH3OH]+H'] = lambda mz: (mz + 33.033489) +POS_TRANSFORMATIONS['[M+NH3]+H'] = lambda mz: (mz + 18.033823) +POS_TRANSFORMATIONS['M+Na'] = lambda mz: (mz + 22.989218) +POS_TRANSFORMATIONS['M+K'] = lambda mz: (mz + 38.963158) +POS_TRANSFORMATIONS['M+2Na-H'] = lambda mz: (mz + 44.971160) +POS_TRANSFORMATIONS['M+ACN+Na'] = lambda mz: (mz + 64.015765) +POS_TRANSFORMATIONS['M+2Na-H'] = lambda mz: (mz + 44.971160) +POS_TRANSFORMATIONS['M+2K+H'] = lambda mz: (mz + 76.919040) +POS_TRANSFORMATIONS['[M+DMSO]+H'] = lambda mz: (mz + 79.02122) +POS_TRANSFORMATIONS['[M+2ACN]+H'] = lambda mz: (mz + 83.060370) +POS_TRANSFORMATIONS['2M+H'] = lambda mz: (mz * 2) + 1.007276 +POS_TRANSFORMATIONS['M+ACN+Na'] = lambda mz: (mz + 64.015765) +POS_TRANSFORMATIONS['2M+NH4'] = lambda mz: (mz * 2) + 18.033823 + + +# Object to store a RoI +# Maintains 3 lists -- mz, rt and intensity +# When a new point (mz,rt,intensity) is added, it updates the +# list and the mean mz which is required. 
class Roi(object):
    """
    A region of interest: parallel lists of mz, rt and intensity values, plus the running
    count and mz sum needed to obtain the mean mz without re-scanning the lists.
    """

    def __init__(self, mz, rt, intensity):
        # a ROI always starts with exactly one data point
        self.mz_list, self.rt_list, self.intensity_list = [mz], [rt], [intensity]
        self.n = 1
        self.mz_sum = mz

    def get_mean_mz(self):
        """Return the mean of all mz values added so far."""
        return self.mz_sum / self.n

    def get_max_intensity(self):
        """Return the largest intensity added so far."""
        return max(self.intensity_list)

    def add(self, mz, rt, intensity):
        """Append one (mz, rt, intensity) point and update the running mz sum and count."""
        columns = (self.mz_list, self.rt_list, self.intensity_list)
        for column, value in zip(columns, (mz, rt, intensity)):
            column.append(value)
        self.mz_sum += mz
        self.n += 1

    def __lt__(self, other):
        # Ordering is by mean mz. Note the comparison is "less than or equal", so two ROIs with
        # the same mean mz each compare as smaller than the other; sorting and bisecting on
        # lists of Roi objects see exactly this behaviour.
        own_mean = self.get_mean_mz()
        other_mean = other.get_mean_mz()
        return own_mean <= other_mean

    def to_chromatogram(self):
        """Return an EmpiricalChromatogram built from the rt, mz and intensity lists (None if n is 0)."""
        if self.n == 0:
            return None
        rts, mzs, intensities = (np.array(values) for values in
                                 (self.rt_list, self.mz_list, self.intensity_list))
        return EmpiricalChromatogram(rts, mzs, intensities)

    def __repr__(self):
        # the ranges shown are first-to-last data point, not min-to-max
        fields = (self.n,
                  self.mz_list[0], self.mz_list[-1],
                  self.rt_list[0], self.rt_list[-1])
        return 'ROI with data points=%d mz (%.4f-%.4f) rt (%.4f-%.4f)' % fields
# Find the RoI that a particular mz falls into
# If it falls into nothing, return None
# mz_tol is the window above and below the
# mean_mz of the RoI. E.g. if mz_tol = 1 Da, then it looks
# plus and minus 1Da
def match(mz, roi_list, mz_tol, mz_units='Da'):
    """
    Return the RoI in roi_list whose mean mz is nearest to the query and strictly within mz_tol.

    :param mz: the query, an object exposing get_mean_mz() and orderable against the entries
               of roi_list (callers in this module pass a throw-away Roi)
    :param roi_list: RoIs sorted by mean mz
    :param mz_tol: tolerance, in the unit given by mz_units; a neighbour matches if its
                   distance is strictly smaller than mz_tol
    :param mz_units: 'Da' for absolute distances; any other value is treated as ppm
                     (relative to the query mz)
    :return: the matching RoI (the lower neighbour wins when both neighbours are equally
             close), or None if neither neighbour is within tolerance
    """
    if len(roi_list) == 0:
        return None

    pos = bisect.bisect_right(roi_list, mz)
    query_mz = mz.get_mean_mz()

    def distance(lower_mz, upper_mz):
        # distance between two mz values in the requested unit
        gap = upper_mz - lower_mz
        if mz_units == 'Da':
            return gap
        return 1e6 * gap / query_mz  # ppm

    # The query sits between roi_list[pos - 1] and roi_list[pos]. Either neighbour may be
    # absent when the query sorts before the first or after the last RoI; the remaining
    # neighbour is still a valid candidate and must not be skipped.
    candidates = []
    if pos > 0:
        left = roi_list[pos - 1]
        candidates.append((distance(left.get_mean_mz(), query_mz), left))
    if pos < len(roi_list):
        right = roi_list[pos]
        candidates.append((distance(query_mz, right.get_mean_mz()), right))

    # Judge each neighbour on its own distance, so that one neighbour lying exactly on the
    # tolerance boundary cannot hide the other neighbour that is inside the tolerance.
    within_tol = [(dist, roi) for dist, roi in candidates if dist < mz_tol]
    if not within_tol:
        return None

    best_dist, best_roi = within_tol[0]
    for dist, roi in within_tol[1:]:
        if dist < best_dist:  # strict: ties keep the lower (left) neighbour
            best_dist, best_roi = dist, roi
    return best_roi
# Make the RoI from an input file
# mz_units = Da for Daltons
# mz_units = ppm for ppm
def make_roi(input_file, mz_tol=0.001, mz_units='Da', min_length=10, min_intensity=50000, start_rt=0, stop_rt=10000000):
    """
    Build regions of interest (RoI) from the MS1 scans of an mzML file.

    Scans are processed in file order. Each peak at or above min_intensity either extends
    the live RoI that match() returns for it, or starts a new live RoI. A live RoI that
    received no peak in a scan is closed at the end of that scan.

    :param input_file: path to the mzML file, handed to pymzml.run.Reader
    :param mz_tol: tolerance passed to match(), in the unit given by mz_units
    :param mz_units: 'Da' or 'ppm'; any other value prints a message and returns (None, None)
    :param min_length: minimum number of data points for a RoI to be reported as good
    :param min_intensity: peaks with a lower intensity are ignored
    :param start_rt: MS1 scans earlier than this retention time are skipped
    :param stop_rt: processing stops at the first MS1 scan later than this retention time
    :return: a tuple (good_roi, junk_roi); good_roi holds the RoIs with at least min_length
             points, junk_roi holds the shorter ones
    """
    # input_file = 'Beer_multibeers_1_fullscan1.mzML'

    if not mz_units == 'Da' and not mz_units == 'ppm':
        print("Unknown mz units, use Da or ppm")
        return None, None

    run = pymzml.run.Reader(input_file, MS1_Precision=5e-6,
                            extraAccessions=[('MS:1000016', ['value', 'unitName'])],
                            obo_version='4.0.1')

    live_roi = []  # RoIs still being extended; kept sorted (see sort() and insort_right() below)
    dead_roi = []  # closed RoIs with at least min_length points
    junk_roi = []  # closed RoIs with fewer than min_length points

    for spectrum in run:
        # print spectrum['centroid_peaks']
        if spectrum['ms level'] == 1:
            # mean mz values move as points are added, so restore the sort order
            # before match() bisects the list
            live_roi.sort()
            # current_ms1_scan_rt, units = spectrum['scan start time'] # this no longer works
            current_ms1_scan_rt, units = spectrum.scan_time
            if units == 'minute':
                current_ms1_scan_rt *= 60.0  # minutes to seconds

            if current_ms1_scan_rt < start_rt:
                continue
            if current_ms1_scan_rt > stop_rt:
                break

            # print current_ms1_scan_rt
            # print spectrum.peaks
            # start by assuming no live RoI grows in this scan; remove each one that does
            not_grew = set(live_roi)
            for mz, intensity in spectrum.peaks('raw'):
                if intensity >= min_intensity:
                    # the query RoI is a throw-away wrapper; only its mz is used for matching
                    match_roi = match(Roi(mz, 0, 0), live_roi, mz_tol, mz_units=mz_units)
                    if match_roi:
                        match_roi.add(mz, current_ms1_scan_rt, intensity)
                        if match_roi in not_grew:
                            not_grew.remove(match_roi)
                    else:
                        # no live RoI within tolerance: start a new one, keeping the list sorted
                        bisect.insort_right(live_roi, Roi(mz, current_ms1_scan_rt, intensity))

            # close every RoI that did not receive a point in this scan
            for roi in not_grew:
                if roi.n >= min_length:
                    dead_roi.append(roi)
                else:
                    junk_roi.append(roi)
                pos = live_roi.index(roi)
                del live_roi[pos]

            # print("Scan @ {}, {} live ROIs".format(current_ms1_scan_rt, len(live_roi)))

    # process all the live ones - keeping only those that
    # are longer than the minimum length
    good_roi = dead_roi  # same list object as dead_roi, not a copy
    for roi in live_roi:
        if roi.n >= min_length:
            good_roi.append(roi)
        else:
            junk_roi.append(roi)
    return good_roi, junk_roi
class RoiToChemicalCreator(ChemicalCreator):
    """
    Turns ROI to Chemical objects

    On construction every ROI in all_roi is converted to a chromatogram and wrapped in an
    UnknownChemical. The results are stored in two parallel lists, self.chromatograms and
    self.chemicals. ROIs whose chromatogram cannot be built are skipped.
    """

    def __init__(self, peak_sampler, all_roi):
        """
        :param peak_sampler: passed to the ChemicalCreator constructor; when it is None no
                             children (MS2 peaks) are sampled for the chemicals
        :param all_roi: the ROI objects to convert; each must provide to_chromatogram()
        """
        super().__init__(peak_sampler)
        self.rois_data = all_roi
        self.ms_levels = 2
        self.crp_samples = [[] for i in range(self.ms_levels)]
        self.crp_index = [[] for i in range(self.ms_levels)]
        self.alpha = math.inf
        self.counts = [[] for i in range(self.ms_levels)]
        if self.ms_levels > 2:
            self.logger.warning(
                "Warning ms_level > 3 not implemented properly yet. Uses scaled ms_level = 2 information for now")

        self.chromatograms = []
        self.chemicals = []
        total = len(self.rois_data)
        for i, roi in enumerate(self.rois_data):
            if i % 50000 == 0:
                self.logger.debug('%6d/%6d' % (i, total))

            # raise numpy warning as exception, see https://stackoverflow.com/questions/15933741/how-do-i-catch-a-numpy-warning-like-its-an-exception-not-just-for-testing
            chrom = None
            with np.errstate(divide='raise'):
                try:
                    chrom = roi.to_chromatogram()
                except (FloatingPointError, ZeroDivisionError):
                    # a division by zero while building the chromatogram: skip this ROI
                    self.logger.debug('Invalid chromatogram {}'.format(i))

            if chrom is not None:
                chem = self._to_unknown_chemical(chrom)
                if self.peak_sampler is not None:
                    try:
                        # TODO: initialise chemical with only 1 child for the purpose of experiment, we might need to improve this
                        chem.children = self._get_children(GET_MS2_BY_PEAKS, chem, n_peaks=1)
                    except KeyError:
                        # best effort: the chemical is kept without children
                        pass
                self.chromatograms.append(chrom)
                self.chemicals.append(chem)
        assert len(self.chromatograms) == len(self.chemicals)
        self.logger.info('Found %d ROIs above thresholds' % len(self.chromatograms))

    def sample(self, chromatogram_creator, mz_range, rt_range, min_ms1_intensity, n_ms1_peaks, ms_levels=2,
               chemical_type=None,
               formula_list=None, compound_list=None, alpha=math.inf, fixed_mz=False, adduct_proportion_cutoff=0.05):
        """Not supported by this creator: chemicals come from the ROIs given to the constructor."""
        # raise (rather than return) the exception, so that a caller cannot mistake the
        # exception instance for a sampling result
        raise NotImplementedError()

    def sample_from_chromatograms(self, chromatogram_creator, min_rt, max_rt, min_ms1_intensity, ms_levels=2):
        """Not supported by this creator: chemicals come from the ROIs given to the constructor."""
        raise NotImplementedError()

    def _to_unknown_chemical(self, chrom):
        """
        Wrap a chromatogram in an UnknownChemical of type CHEM_NOISE.

        The chemical's mz is the raw mz at the intensity apex minus PROTON_MASS, its maximum
        intensity is the apex intensity, and its rt is the smallest raw rt of the chromatogram.
        """
        idx = np.argmax(chrom.raw_intensities)  # find intensity apex
        mz = chrom.raw_mzs[idx]

        # In the MassSpec, we assume that chemical starts eluting from chem.rt + chem.chromatogram.rts (normalised to start from 0)
        # So here, we have to set the chemical rt to start from the minimum of chromatogram raw rts, so it elutes correctly.
        rt = min(chrom.raw_rts)

        max_intensity = chrom.raw_intensities[idx]
        mz = mz - PROTON_MASS
        chem = UnknownChemical(mz, rt, max_intensity, chrom, None)
        chem.type = CHEM_NOISE
        return chem

    def plot_chems(self, n_plots, reverse=False):
        """
        Plot raw intensity against raw rt for the first n_plots chemicals, ordered by the number
        of scans of their ROI (ascending, or descending if reverse is True).
        """
        # NOTE(review): relies on chromatogram.roi.num_scans(), which is not defined in this
        # file - confirm the chromatogram class provides it
        sorted_chems = sorted(self.chemicals, key=lambda chem: chem.chromatogram.roi.num_scans())
        if reverse:
            sorted_chems.reverse()
        for c in sorted_chems[0:n_plots]:
            chrom = c.chromatogram
            plt.plot(chrom.raw_rts, chrom.raw_intensities)
            plt.show()
def get_chemicals(mzML_file, mz_tol, min_ms1_intensity, start_rt, stop_rt, min_length=1):
    '''
    Extract ROI from an mzML file and turn them into chemical objects
    :param mzML_file: input mzML file
    :param mz_tol: mz tolerance (in ppm) for ROI extraction
    :param min_ms1_intensity: a ROI is only kept if at least one of its points is above this threshold
    :param start_rt: start RT to extract ROI
    :param stop_rt: end RT to extract ROI
    :param min_length: minimum number of points of a ROI, passed to make_roi()
    :return: a numpy array holding the chemicals created by RoiToChemicalCreator from the kept ROI
    '''
    # extract with no intensity cut-off at all; the threshold is applied per ROI below
    candidate_rois, _junk = make_roi(mzML_file, mz_tol=mz_tol, mz_units='ppm', min_length=min_length,
                                     min_intensity=0, start_rt=start_rt, stop_rt=stop_rt)

    # keep ROI that have at least one point above the minimum to fragment threshold
    strong_rois = [
        roi for roi in candidate_rois
        if (np.array(roi.intensity_list) > min_ms1_intensity).any()
    ]

    # no peak sampler is needed here, so None is passed in its place
    creator = RoiToChemicalCreator(None, strong_rois)
    return np.array(creator.chemicals)
precursor peak for each ms2 scan + precursor = precursors[0] + + try: + scan_rt = get_rt(scan) + precursor_mz = precursor['mz'] + precursor_intensity = precursor['i'] + res = _find_precursor_peaks(precursor, last_ms1_peaklist, last_ms1_scan_no, + isolation_window=isolation_window) + ms2_peaklist = _get_peaks(scan) + row = [scan_no, scan_rt, precursor_mz, precursor_intensity, ms2_peaklist] + row.extend(res) + data.append(row) + except ValueError as e: + print(e) + except KeyError as e: + continue # sometimes we can't find the intensity value precursor['i'] in precursors + + columns = ['ms2_scan_id', 'ms2_scan_rt', 'ms2_precursor_mz', 'ms2_precursor_intensity', 'ms2_peaklist', + 'ms1_scan_id', 'ms1_scan_rt', 'ms1_mz', 'ms1_intensity'] + df = pd.DataFrame(data, columns=columns) + + # select only rows where we are sure of the matching, i.e. the intensity values aren't too different + df['intensity_diff'] = np.abs(df['ms2_precursor_intensity'] - df['ms1_intensity']) + idx = (df['intensity_diff'] < 0.1) + ms1_df = df[idx] + return ms1_df + + +######################################################################################################################## +# Private methods +######################################################################################################################## + +def _get_peaks(spectrum): + mzs = spectrum.mz + rts = [get_rt(spectrum)] * len(mzs) + intensities = spectrum.i + peaklist = np.stack([mzs, rts, intensities], axis=1) + return peaklist + + +def _find_precursor_peaks(precursor, last_ms1_peaklist, last_ms1_scan_no, isolation_window=0.5): + selected_ms1, selected_ms1_idx = _find_precursor_ms1(precursor, last_ms1_peaklist, + last_ms1_scan_no, isolation_window) + selected_ms1_mz = selected_ms1[0] + selected_ms1_rt = selected_ms1[1] + selected_ms1_intensity = selected_ms1[2] + res = [last_ms1_scan_no, selected_ms1_rt, selected_ms1_mz, selected_ms1_intensity] + return res + + +def _find_precursor_ms1(precursor, last_ms1_peaklist, 
last_ms1_scan_no, isolation_window): + precursor_mz = precursor['mz'] + precursor_intensity = precursor['i'] + + # find mz in the last ms1 scan that fall within isolation window + mzs = last_ms1_peaklist[:, 0] + diffs = abs(mzs - precursor_mz) < isolation_window + idx = np.nonzero(diffs)[0] + + if len(idx) == 0: # should never happen!? + raise ValueError('Cannot find precursor peak (%f, %f) in the last ms1 scan %d' % + (precursor_mz, precursor_intensity, last_ms1_scan_no)) + + elif len(idx) == 1: # only one is found + selected_ms1_idx = idx[0] + + else: # found multiple possible ms1 peaks, select the largest intensity + possible_ms1 = last_ms1_peaklist[idx, :] + possible_intensities = possible_ms1[:, 2] + closest = np.argmax(possible_intensities) + selected_ms1_idx = idx[closest] + + selected_ms1 = last_ms1_peaklist[selected_ms1_idx, :] + return selected_ms1, selected_ms1_idx \ No newline at end of file diff --git a/Synthetic data creation scripts/vimms/TopNExperiment.py b/Synthetic data creation scripts/vimms/TopNExperiment.py new file mode 100644 index 00000000..620a3447 --- /dev/null +++ b/Synthetic data creation scripts/vimms/TopNExperiment.py @@ -0,0 +1,147 @@ +import os + +from vimms.Common import save_obj, create_if_not_exist +from vimms.Controller import TopNController +from vimms.DataGenerator import DataSource, PeakSampler +from vimms.MassSpec import IndependentMassSpectrometer + + +######################################################################################################################## +# Codes to set up experiments +######################################################################################################################## + + +def run_experiment(param): + ''' + Runs a Top-N experiment + :param param: the experimental parameters + :return: the analysis name that has been successfully run + ''' + analysis_name = param['analysis_name'] + mzml_out = param['mzml_out'] + pickle_out = param['pickle_out'] + N = param['N'] + rt_tol =
param['rt_tol'] + + if os.path.isfile(mzml_out) and os.path.isfile(pickle_out): + print('Skipping %s' % (analysis_name)) + else: + print('Processing %s' % (analysis_name)) + peak_sampler = param['peak_sampler'] + if peak_sampler is None: # extract density from the fragmentation file + mzml_path = param['mzml_path'] + fragfiles = param['fragfiles'] + fragfile = fragfiles[(N, rt_tol,)] + min_rt = param['min_rt'] + max_rt = param['max_rt'] + peak_sampler = get_peak_sampler(mzml_path, fragfile, min_rt, max_rt) + + mass_spec = IndependentMassSpectrometer(param['ionisation_mode'], param['data'], peak_sampler) + controller = TopNController(mass_spec, param['N'], param['isolation_window'], + param['mz_tol'], param['rt_tol'], param['min_ms1_intensity']) + controller.run(param['min_rt'], param['max_rt'], progress_bar=param['pbar']) + controller.write_mzML(analysis_name, mzml_out) + save_obj(controller, pickle_out) + return analysis_name + + +def get_peak_sampler(mzml_path, fragfile, min_rt, max_rt): + ds = DataSource() + ds.load_data(mzml_path, file_name=fragfile) + kde_min_ms1_intensity = 0 # min intensity to be selected for kdes + kde_min_ms2_intensity = 0 + peak_sampler = PeakSampler(ds, kde_min_ms1_intensity, kde_min_ms2_intensity, min_rt, max_rt) + return peak_sampler + + +def run_parallel_experiment(params): + ''' + Runs experiments in parallel using the ipyparallel library + :param params: the list of experimental parameters + :return: None + ''' + import ipyparallel as ipp + rc = ipp.Client() + dview = rc[:] # use all engines + with dview.sync_imports(): + pass + + analysis_names = dview.map_sync(run_experiment, params) + for analysis_name in analysis_names: + print(analysis_name) + + +def run_serial_experiment(params): + ''' + Runs experiments serially + :param params: the list of experimental parameters + :return: None + ''' + total = len(params) + for i in range(len(params)): + param = params[i] + print('Processing \t%d/%d\t%s' % (i + 1, total, param['analysis_name'])) +
run_experiment(param) + + +def get_params(experiment_name, Ns, rt_tols, mz_tol, isolation_window, ionisation_mode, data, peak_sampler, + min_ms1_intensity, min_rt, max_rt, + out_dir, pbar, mzml_path=None, fragfiles=None): + ''' + Creates a list of experimental parameters + :param experiment_name: current experiment name + :param Ns: possible values of N in top-N to test + :param rt_tols: possible values of DEW to test + :param mz_tol: Top-N controller parameter: the m/z window (ppm) to prevent the same precursor ion from being fragmented again + :param isolation_window: Top-N controller parameter: the m/z window (ppm) to prevent the same precursor ion from being fragmented again + :param ionisation_mode: Top-N controller parameter: either positive or negative + :param data: chemicals to fragment + :param peak_sampler: trained densities to sample values during simulation + :param min_ms1_intensity: Top-N controller parameter: minimum ms1 intensity to fragment + :param min_rt: start RT to simulate + :param max_rt: end RT to simulate + :param out_dir: output directory + :param pbar: progress bar to update + :return: a list of parameters + ''' + create_if_not_exist(out_dir) + print('N =', Ns) + print('rt_tol =', rt_tols) + params = [] + for N in Ns: + for rt_tol in rt_tols: + analysis_name = 'experiment_%s_N_%d_rttol_%d' % (experiment_name, N, rt_tol) + mzml_out = os.path.join(out_dir, '%s.mzML' % analysis_name) + pickle_out = os.path.join(out_dir, '%s.p' % analysis_name) + param_dict = { + 'N': N, + 'mz_tol': mz_tol, + 'rt_tol': rt_tol, + 'min_ms1_intensity': min_ms1_intensity, + 'isolation_window': isolation_window, + 'ionisation_mode': ionisation_mode, + 'data': data, + 'peak_sampler': peak_sampler, + 'min_rt': min_rt, + 'max_rt': max_rt, + 'analysis_name': analysis_name, + 'mzml_out': mzml_out, + 'pickle_out': pickle_out, + 'pbar': pbar + } + if mzml_path is not None: + param_dict['mzml_path'] = mzml_path + if fragfiles is not None: + param_dict['fragfiles'] = fragfiles +
params.append(param_dict) + print('len(params) =', len(params)) + return params + + +def get_N_rt_tol_from_qcb_filename(fragfile): + base = os.path.basename(fragfile) + base = os.path.splitext(base)[0] + tokens = base.split('_') + N = int(tokens[1][1:]) + rt_tol = int(tokens[2][3:]) + return N, rt_tol diff --git a/Synthetic data creation scripts/vimms/__init__.py b/Synthetic data creation scripts/vimms/__init__.py new file mode 100644 index 00000000..57dbee80 --- /dev/null +++ b/Synthetic data creation scripts/vimms/__init__.py @@ -0,0 +1 @@ +name = 'vimms' diff --git a/Synthetic data creation scripts/vimms_data_generation/01. Download Data.ipynb b/Synthetic data creation scripts/vimms_data_generation/01. Download Data.ipynb new file mode 100644 index 00000000..d7c1c3b6 --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/01. Download Data.ipynb @@ -0,0 +1,1174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Download Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook downloads the necessary example data that will be used in other notebooks. 
In particular, the notebook does the following:\n", + "\n", + "- Download beer and urine .mzML files used as examples in the paper.\n", + "- Download the HMDB database and extract metabolites.\n", + "- Train kernel density estimators on the mzML files.\n", + "- Extract regions of interest from the mzML files.\n", + "\n", + "**Please run this notebook first to make sure the data files are available for subsequent notebooks.**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import glob" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from vimms.DataGenerator import extract_hmdb_metabolite, get_data_source, get_spectral_feature_database\n", + "from vimms.MassSpec import IndependentMassSpectrometer\n", + "from vimms.Controller import SimpleMs1Controller\n", + "from vimms.Common import *\n", + "from vimms.Roi import make_roi, RoiToChemicalCreator, extract_roi" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# set_log_level_info()\n", + "set_log_level_debug()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## a. Download beer and urine files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we download the beer and urine .mzML files used as examples in the paper if they don't exist."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "url = 'http://researchdata.gla.ac.uk/870/2/example_data.zip'\n", + "base_dir = os.path.join(os.getcwd(), 'example_data')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found /home/cjurich/projects/vimms/examples/example_data\n" + ] + } + ], + "source": [ + "if not os.path.isdir(base_dir): # if not exist then download the example data and extract it\n", + " print('Creating %s' % base_dir) \n", + " out_file = 'example_data.zip'\n", + " download_file(url, out_file)\n", + " extract_zip_file(out_file, delete=True)\n", + "else:\n", + " print('Found %s' % base_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## b. Download metabolites from HMDB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we load a pre-processed pickled file of database metabolites in the `data_dir` folder. If it is not found, then create the file by downloading and extracting the metabolites from HMDB." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 114202 DatabaseCompounds from /home/cjurich/projects/vimms/examples/example_data/hmdb_compounds.p\n" + ] + } + ], + "source": [ + "compound_file = Path(base_dir, 'hmdb_compounds.p')\n", + "hmdb_compounds = load_obj(compound_file)\n", + "if hmdb_compounds is None: # if file does not exist\n", + "\n", + " # download the entire HMDB metabolite database\n", + " url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'\n", + "\n", + " out_file = download_file(url)\n", + " compounds = extract_hmdb_metabolite(out_file, delete=True)\n", + " save_obj(compounds, compound_file)\n", + "\n", + "else:\n", + " print('Loaded %d DatabaseCompounds from %s' % (len(hmdb_compounds), compound_file))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## c. Generate Spectral Feature Database" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this section we demonstrate how ViMMS constructs the spectral feature database containing information, such as the densities of m/z, RT and intensities, scan durations, MS2 peaks, from the example Beer mzML files. The spectral feature database will be used to sample for various features during the simulation later.\n", + "\n", + "The following two methods `get_data_source` and `get_spectral_feature_database` from ViMMS will be used. \n", + "- `get_data_source` loads a `DataSource` object that stores information on a set of .mzML files\n", + "- `get_spectral_feature_database` extracts relevant features from .mzML files that have been loaded into the DataSource. \n", + "\n", + "The parameter below should work for most cases, however for different data, it might be necessary to adjust the `min_rt` and `max_rt` values." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "filename = None # if None, use all mzML files found\n", + "min_ms1_intensity = 0 # min MS1 intensity threshold to include a data point for density estimation\n", + "min_ms2_intensity = 0 # min MS2 intensity threshold to include a data point for density estimation\n", + "min_rt = 0 # min RT to include a data point for density estimation\n", + "max_rt = 1440 # max RT to include a data point for density estimation\n", + "bandwidth_mz_intensity_rt = 1.0 # kernel bandwidth parameter to sample (mz, RT, intensity) values during simulation\n", + "bandwidth_n_peaks = 1.0 # kernel bandwidth parameter to sample number of peaks per scan during simulation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load fullscan data and train spectral feature database" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "mzml_path = Path(base_dir, 'beers', 'fullscan', 'mzML')\n", + "xcms_output = Path(mzml_path, 'extracted_peaks_ms1.csv')\n", + "out_file = Path(base_dir, 'peak_sampler_mz_rt_int_19_beers_fullscan.p')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO : DataSource : Loading Beer_multibeers_7_fullscan1.mzML\n", + "INFO : numexpr.utils : NumExpr defaulting to 8 threads.\n", + "INFO : DataSource : Loading Beer_multibeers_6_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_19_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_4_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_10_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_17_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_3_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_8_fullscan1.mzML\n", + "INFO : 
DataSource : Loading Beer_multibeers_16_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_12_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_13_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_5_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_2_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_9_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_15_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_14_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_1_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_11_fullscan1.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_18_fullscan1.mzML\n" + ] + } + ], + "source": [ + "ds_fullscan = get_data_source(mzml_path, filename, xcms_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : PeakSampler : Extracted 0 MS2 scans\n", + "DEBUG : PeakSampler : Computing parent intensity proportions\n", + "DEBUG : PeakSampler : Extracting scan durations\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=1\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x7f2e3ec14640>\n", + "INFO : DataSource : Using values from XCMS peaklist\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x7f2e3ec14640>\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=2\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x7f2e3ec14640>\n", + "INFO : DataSource : Using values from scans\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'vimms.DataGenerator.PeakSampler'> to /home/cjurich/projects/vimms/examples/example_data/peak_sampler_mz_rt_int_19_beers_fullscan.p\n" + ] + } + 
], + "source": [ + "ps = get_spectral_feature_database(ds_fullscan, filename, min_ms1_intensity, min_ms2_intensity, min_rt, max_rt,\n", + " bandwidth_mz_intensity_rt, bandwidth_n_peaks, out_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Peak mz=428.7252 rt=1081.71 intensity=2123572.19 ms_level=1,\n", + " Peak mz=411.0711 rt=602.91 intensity=12226691.90 ms_level=1,\n", + " Peak mz=126.0966 rt=267.76 intensity=93690.81 ms_level=1,\n", + " Peak mz=206.4157 rt=480.98 intensity=62336.55 ms_level=1,\n", + " Peak mz=495.0441 rt=546.91 intensity=907325.70 ms_level=1,\n", + " Peak mz=210.3872 rt=289.68 intensity=582443.59 ms_level=1,\n", + " Peak mz=249.6236 rt=399.33 intensity=3002228.13 ms_level=1,\n", + " Peak mz=122.2433 rt=76.53 intensity=16820.22 ms_level=1,\n", + " Peak mz=150.6783 rt=225.97 intensity=332209.90 ms_level=1,\n", + " Peak mz=150.3422 rt=521.56 intensity=1986196.81 ms_level=1]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ps.get_peak(1, 10) # try to sample 10 MS1 peaks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load fragmentation data and train spectral feature database" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "mzml_path = Path(base_dir, 'beers', 'fragmentation', 'mzML')\n", + "xcms_output = Path(mzml_path, 'extracted_peaks_ms1.csv')\n", + "out_file = Path(base_dir, 'peak_sampler_mz_rt_int_19_beers_fragmentation.p')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO : DataSource : Loading Beer_multibeers_8_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_12_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_9_T10_POS.mzML\n", + "INFO : DataSource : 
Loading Beer_multibeers_1_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_10_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_19_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_6_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_3_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_15_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_2_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_17_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_4_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_16_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_13_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_11_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_18_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_7_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_14_T10_POS.mzML\n", + "INFO : DataSource : Loading Beer_multibeers_5_T10_POS.mzML\n" + ] + } + ], + "source": [ + "ds_fragmentation = get_data_source(mzml_path, filename, xcms_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : PeakSampler : Extracted 138969 MS2 scans\n", + "DEBUG : PeakSampler : Computing parent intensity proportions\n", + "DEBUG : PeakSampler : Extracting scan durations\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=1\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x7f2e1bef3d60>\n", + "INFO : DataSource : Using values from XCMS peaklist\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x7f2e1bef3d60>\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=2\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object 
at 0x7f2e1bef3d60>\n", + "INFO : DataSource : Using values from scans\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x7f2e1bef3d60>\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'vimms.DataGenerator.PeakSampler'> to /home/cjurich/projects/vimms/examples/example_data/peak_sampler_mz_rt_int_19_beers_fragmentation.p\n" + ] + } + ], + "source": [ + "ps = get_spectral_feature_database(ds_fragmentation, filename, min_ms1_intensity, min_ms2_intensity, min_rt, max_rt,\n", + " bandwidth_mz_intensity_rt, bandwidth_n_peaks, out_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Peak mz=221.7807 rt=214.06 intensity=320377.74 ms_level=1,\n", + " Peak mz=242.0755 rt=342.85 intensity=457720.15 ms_level=1,\n", + " Peak mz=601.3200 rt=420.90 intensity=3570.79 ms_level=1,\n", + " Peak mz=436.5792 rt=801.95 intensity=87740.36 ms_level=1,\n", + " Peak mz=301.6364 rt=249.27 intensity=61586.19 ms_level=1,\n", + " Peak mz=246.0578 rt=262.69 intensity=278773.64 ms_level=1,\n", + " Peak mz=473.5991 rt=633.66 intensity=342773.23 ms_level=1,\n", + " Peak mz=178.9170 rt=1261.88 intensity=25533.11 ms_level=1,\n", + " Peak mz=254.8886 rt=375.12 intensity=718706.28 ms_level=1,\n", + " Peak mz=679.5092 rt=218.24 intensity=361896.01 ms_level=1]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ps.get_peak(1, 10)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Peak mz=70.5123 rt=769.84 intensity=12155.36 ms_level=2,\n", + " Peak mz=306.8993 rt=948.28 intensity=3623.41 ms_level=2,\n", + " Peak mz=97.8524 rt=1016.22 intensity=3337.43 ms_level=2,\n", + " Peak mz=103.5200 rt=342.68 intensity=1281.02 ms_level=2,\n", + " Peak mz=111.4647 rt=543.28 intensity=21201.48 
ms_level=2,\n", + " Peak mz=118.6177 rt=437.24 intensity=86636.24 ms_level=2,\n", + " Peak mz=85.0430 rt=1161.27 intensity=557.34 ms_level=2,\n", + " Peak mz=272.8699 rt=253.92 intensity=36855.75 ms_level=2,\n", + " Peak mz=94.4220 rt=368.48 intensity=1134.53 ms_level=2,\n", + " Peak mz=52.8812 rt=923.67 intensity=1059.23 ms_level=2]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ps.get_peak(2, 10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## d. Extract the ROIs for DsDA Experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "roi_mz_tol = 10\n", + "roi_min_length = 2\n", + "roi_min_intensity = 1.75E5\n", + "roi_start_rt = min_rt\n", + "roi_stop_rt = max_rt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Extract beer ROIs" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 13179\n", + "INFO : RoiToChemicalCreator : Found 13179 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_8.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 11249\n", + "INFO : RoiToChemicalCreator : Found 11249 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_12.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 14842\n", + "INFO : RoiToChemicalCreator : Found 14842 ROIs above thresholds\n" + ] + 
}, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_9.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 12611\n", + "INFO : RoiToChemicalCreator : Found 12611 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_1.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 11925\n", + "INFO : RoiToChemicalCreator : Found 11925 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_10.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 12945\n", + "INFO : RoiToChemicalCreator : Found 12945 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_19.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 11636\n", + "INFO : RoiToChemicalCreator : Found 11636 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_6.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 9716\n", + "INFO : RoiToChemicalCreator : Found 9716 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_3.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 13068\n", + "INFO : RoiToChemicalCreator : Found 13068 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_15.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 14839\n", + "INFO : RoiToChemicalCreator : Found 14839 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_2.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 14778\n", + "INFO : RoiToChemicalCreator : Found 14778 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_17.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 12029\n", + "INFO : RoiToChemicalCreator : Found 12029 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_4.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 15556\n", + "INFO : RoiToChemicalCreator : Found 15556 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ 
+ "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_16.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 10489\n", + "INFO : RoiToChemicalCreator : Found 10489 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_13.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 9971\n", + "INFO : RoiToChemicalCreator : Found 9971 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_11.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 13742\n", + "INFO : RoiToChemicalCreator : Found 13742 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_18.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 12181\n", + "INFO : RoiToChemicalCreator : Found 12181 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_7.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 12840\n", + "INFO : RoiToChemicalCreator : Found 12840 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to 
/home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_14.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 10502\n", + "INFO : RoiToChemicalCreator : Found 10502 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Beer/beer_t10_simulator_files/beer_5.p\n" + ] + } + ], + "source": [ + "file_names = Path(base_dir, 'beers', 'fragmentation', 'mzML').glob('*.mzML')\n", + "out_dir = Path(base_dir,'DsDA', 'DsDA_Beer', 'beer_t10_simulator_files')\n", + "mzml_path = Path(base_dir, 'beers', 'fragmentation', 'mzML')\n", + "\n", + "extract_roi(list(file_names), out_dir, 'beer_%d.p', mzml_path, ps)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Extract urine ROIs" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 16320\n", + "INFO : RoiToChemicalCreator : Found 16320 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files\n", + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_97.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 16294\n", + "INFO : RoiToChemicalCreator : Found 16294 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_85.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG 
: RoiToChemicalCreator : 0/ 16321\n", + "INFO : RoiToChemicalCreator : Found 16321 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_2.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 16100\n", + "INFO : RoiToChemicalCreator : Found 16100 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_8.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 15895\n", + "INFO : RoiToChemicalCreator : Found 15895 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_53.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 16885\n", + "INFO : RoiToChemicalCreator : Found 16885 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_72.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 18395\n", + "INFO : RoiToChemicalCreator : Found 18395 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_3.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator 
: 0/ 13836\n", + "INFO : RoiToChemicalCreator : Found 13836 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_58.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 10211\n", + "INFO : RoiToChemicalCreator : Found 10211 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_32.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 17938\n", + "INFO : RoiToChemicalCreator : Found 17938 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_49.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 17424\n", + "INFO : RoiToChemicalCreator : Found 17424 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_80.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 15601\n", + "INFO : RoiToChemicalCreator : Found 15601 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_54.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 14048\n", + 
"INFO : RoiToChemicalCreator : Found 14048 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_93.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 11073\n", + "INFO : RoiToChemicalCreator : Found 11073 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_9.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 18560\n", + "INFO : RoiToChemicalCreator : Found 18560 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_105.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 16681\n", + "INFO : RoiToChemicalCreator : Found 16681 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_38.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 20280\n", + "INFO : RoiToChemicalCreator : Found 20280 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_57.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 15677\n", + "INFO : 
RoiToChemicalCreator : Found 15677 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_51.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 16354\n", + "INFO : RoiToChemicalCreator : Found 16354 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_28.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 13089\n", + "INFO : RoiToChemicalCreator : Found 13089 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_17.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 17858\n", + "INFO : RoiToChemicalCreator : Found 17858 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_52.p\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 13999\n", + "INFO : RoiToChemicalCreator : Found 13999 ROIs above thresholds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/DsDA/DsDA_Urine/urine_t10_simulator_files/urine_18.p\n" + ] + } + ], + "source": [ + "file_names = Path(base_dir, 'urines', 'fragmentation', 'mzML').glob('*.mzML')\n", + "out_dir = Path(base_dir,'DsDA', 
'DsDA_Urine', 'urine_t10_simulator_files')\n", + "mzml_path = Path(base_dir, 'urines', 'fragmentation', 'mzML')\n", + "\n", + "extract_roi(list(file_names), out_dir, 'urine_%d.p', mzml_path, ps)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Synthetic data creation scripts/vimms_data_generation/02. MS1 Simulations.ipynb b/Synthetic data creation scripts/vimms_data_generation/02. MS1 Simulations.ipynb new file mode 100644 index 00000000..2af7ae0a --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/02. MS1 Simulations.ipynb @@ -0,0 +1,321 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Generating a Sample using MS1 Controller" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we demonstrate how ViMMS can be used to generate a full-scan mzML file from a single sample. This corresponds to Section 3.1 of the paper." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from vimms.Chemicals import ChemicalCreator\n", + "from vimms.MassSpec import IndependentMassSpectrometer\n", + "from vimms.Controller import SimpleMs1Controller\n", + "from vimms.Common import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load previously trained spectral feature database and the list of extracted metabolites, created in **01. Download Data.ipynb**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "base_dir = os.path.abspath('example_data')\n", + "ps = load_obj(Path(base_dir, 'peak_sampler_mz_rt_int_19_beers_fullscan.p'))\n", + "hmdb = load_obj(Path(base_dir, 'hmdb_compounds.p'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set ViMMS logging level" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_debug()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Chemicals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define an output folder containing our results" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "out_dir = Path(base_dir, 'results', 'MS1_single')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we generate the chemical objects that will be used in the sample. The chemical objects are generated by sampling from metabolites in the HMDB database." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# the list of ROI sources created in the previous notebook '01. 
Download Data.ipynb'\n", + "ROI_Sources = [str(Path(base_dir,'DsDA', 'DsDA_Beer', 'beer_t10_simulator_files'))]\n", + "\n", + "# minimum MS1 intensity of chemicals\n", + "min_ms1_intensity = 1.75E5\n", + "\n", + "# m/z and RT range of chemicals\n", + "rt_range = [(0, 1440)]\n", + "mz_range = [(0, 1050)]\n", + "\n", + "# the number of chemicals in the sample\n", + "n_chems = 6500\n", + "\n", + "# maximum MS level (we do not generate fragmentation peaks when this value is 1)\n", + "ms_level = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : ChemicalCreator : Sorting database compounds by masses\n", + "DEBUG : ChemicalCreator : 6500 chemicals to be created.\n", + "DEBUG : ChemicalCreator : Sampling formula 0/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 500/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 1000/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 1500/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 2000/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 2500/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 3000/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 3500/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 4000/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 4500/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 5000/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 5500/6500\n", + "DEBUG : ChemicalCreator : Sampling formula 6000/6500\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created /home/cjurich/projects/vimms/examples/example_data/results/MS1_single\n", + "Saving <class 'list'> to /home/cjurich/projects/vimms/examples/example_data/results/MS1_single/dataset.p\n" + ] + } + ], + "source": [ + "chems = ChemicalCreator(ps, ROI_Sources, hmdb)\n", + "dataset = chems.sample(mz_range, rt_range, 
min_ms1_intensity, n_chems, ms_level)\n", + "save_obj(dataset, Path(out_dir, 'dataset.p'))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KnownChemical - 'C13H12O3' rt=787.87 max_intensity=23718771.78\n", + "KnownChemical - 'C9H14N2O3S2' rt=224.35 max_intensity=6824500.18\n", + "KnownChemical - 'C19H12O2' rt=894.10 max_intensity=195695.90\n", + "KnownChemical - 'C16H19NO10' rt=785.77 max_intensity=1533328.27\n", + "KnownChemical - 'C3H5O7P' rt=204.98 max_intensity=737461.08\n", + "KnownChemical - 'C16H10N2O8S2' rt=538.08 max_intensity=21184668.53\n", + "KnownChemical - 'C10H17N3O8' rt=610.93 max_intensity=1315789.26\n", + "KnownChemical - 'C29H40O8' rt=537.58 max_intensity=1119299.33\n", + "KnownChemical - 'C19H16O9' rt=333.55 max_intensity=251170.17\n", + "KnownChemical - 'C23H48NO7P' rt=325.95 max_intensity=260464.78\n" + ] + } + ], + "source": [ + "for chem in dataset[0:10]:\n", + " print(chem)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run MS1 controller on the samples and generate .mzML files" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_warning()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "min_rt = rt_range[0][0]\n", + "max_rt = rt_range[0][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "(1440.580s) ms_level=1: : 1440.5803710000002it [00:59, 24.09it/s] \n" + ] + } + ], + "source": [ + "mass_spec = IndependentMassSpectrometer(POSITIVE, dataset, ps)\n", + "controller = SimpleMs1Controller(mass_spec)\n", + "controller.run(min_rt, max_rt)\n", + "\n", + "mzml_filename = Path(out_dir, 'ms1_controller.mzML')\n", + "controller.write_mzML('my_analysis', 
mzml_filename)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Simulated results have been saved to the following .mzML file and can be viewed in tools like [TOPPView](https://pubs.acs.org/doi/abs/10.1021/pr900171m) or using other mzML file viewers." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/cjurich/projects/vimms/examples/example_data/results/MS1_single/ms1_controller.mzML')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mzml_filename" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Synthetic data creation scripts/vimms_data_generation/03. Multiple Samples Example.ipynb b/Synthetic data creation scripts/vimms_data_generation/03. Multiple Samples Example.ipynb new file mode 100644 index 00000000..af910cbe --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/03. Multiple Samples Example.ipynb @@ -0,0 +1,1503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Generating Multiple Samples using MS1 Controller" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we demonstrate how ViMMS can be used to generate multiple samples (sets of chemicals) that are biological and technical replicates. The MS1 controller is then used to produce mass spectral data in the form of .mzML files for the multiple samples."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from collections import defaultdict\n", + "import os\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from vimms.Chemicals import ChemicalCreator, MultiSampleCreator\n", + "from vimms.MassSpec import IndependentMassSpectrometer\n", + "from vimms.Controller import SimpleMs1Controller\n", + "from vimms.Common import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load previously trained KDEs in `PeakSampler` and the list of extracted metabolites, created in **01. Download Data.ipynb**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "base_dir = os.path.abspath('example_data')\n", + "ps = load_obj(Path(base_dir, 'peak_sampler_mz_rt_int_19_beers_fullscan.p'))\n", + "hmdb = load_obj(Path(base_dir, 'hmdb_compounds.p'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set ViMMS logging level" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_warning()\n", + "# set_log_level_info()\n", + "# set_log_level_debug()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Initial Chemical" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define an output folder containing our results" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "out_dir = Path(base_dir, 'results', 'MS1_multiple')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we generate multiple chemical objects that will be used across samples. The chemical objects are generated by sampling from metabolites in the HMDB database." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# the list of ROI sources created in the previous notebook '01. 
Download Data.ipynb'\n", + "ROI_Sources = [str(Path(base_dir,'DsDA', 'DsDA_Beer', 'beer_t10_simulator_files'))]\n", + "\n", + "# minimum MS1 intensity of chemicals\n", + "min_ms1_intensity = 1.75E5\n", + "\n", + "# m/z and RT range of chemicals\n", + "rt_range = [(400, 800)]\n", + "mz_range = [(100, 400)]\n", + "\n", + "# the number of chemicals in the sample\n", + "n_chems = 1000\n", + "\n", + "# maximum MS level (we do not generate fragmentation peaks when this value is 1)\n", + "ms_level = 1\n", + "\n", + "# for this experiment, we restrict the sampled chromatograms to be within 20 - 40s in length\n", + "# so they are neither too long nor too short\n", + "roi_rt_range = [20, 40]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\BaseDataset\\dataset.p\n" + ] + } + ], + "source": [ + "chems = ChemicalCreator(ps, ROI_Sources, hmdb)\n", + "dataset = chems.sample(mz_range, rt_range, min_ms1_intensity, n_chems, ms_level, roi_rt_range=roi_rt_range)\n", + "save_obj(dataset, Path(out_dir, 'BaseDataset', 'dataset.p'))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KnownChemical - 'C21H28O2' rt=732.88 max_intensity=268684.17\n", + "KnownChemical - 'C9H15NO3S' rt=626.68 max_intensity=2101106.53\n", + "KnownChemical - 'C12H22O4' rt=548.52 max_intensity=743724.75\n", + "KnownChemical - 'C16H38N2' rt=577.29 max_intensity=1230168.35\n", + "KnownChemical - 'C5H9NO2' rt=382.06 max_intensity=8778690.16\n", + "KnownChemical - 'C9H15N5O' rt=429.14 max_intensity=402739.66\n", + "KnownChemical - 'C15H21N3O' rt=526.60 max_intensity=8337695.01\n", + "KnownChemical - 'C10H15N3O2' rt=422.79 max_intensity=311677.09\n", + "KnownChemical - 'C12H14N4O2S' rt=658.45
max_intensity=65078397.89\n", + "KnownChemical - 'C6H14O6S2' rt=428.27 max_intensity=933174.98\n" + ] + } + ], + "source": [ + "for chem in dataset[0:10]:\n", + " print(chem)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Multiple Samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next section allows us to define classes of biological replicates, each having multiple technical replicates. \n", + "\n", + "Below we create two biological classes ('class0', 'class1'), each having 10 technical replicates with some noise on the chemical's intensity." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "n_samples = [10, 10] # number of files per class\n", + "classes = [\"class%d\" % i for i in range(len(n_samples))] # creates default list of classes\n", + "intensity_noise_sd = [1000] # noise on max intensity" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['class0', 'class1']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add intensity changes between different classes" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "change_probabilities = [0 for i in range(len(n_samples))] # probability of intensity changes between different classes\n", + "change_differences_means = [0 for i in range(len(n_samples))] # mean of those intensity changes\n", + "change_differences_sds = [0 for i in range(len(n_samples))] # SD of those intensity changes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add experimental variables (examples in comments)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + 
"outputs": [], + "source": [ + "experimental_classes = None # [[\"male\",\"female\"],[\"Positive\",\"Negative\",\"Unknown\"]]\n", + "experimental_probabilitities = None # [[0.5,0.5],[0.33,0.33,0.34]]\n", + "experimental_sds = None # [[250],[250]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dropout chemicals in different classes" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# drop-out chemicals by their probabilities\n", + "dropout_probability = 0.2\n", + "dropout_probabilities = [dropout_probability for i in range(len(n_samples))]\n", + "dropout_numbers = None # drop-out chemicals by an absolute number\n", + "\n", + "# dropout_probabilities = None\n", + "# dropout_numbers = 2 # number of chemicals dropped out in each class" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate multiple samples" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "save_location = os.path.join(out_dir, 'ChemicalFiles')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_0.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_1.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_2.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_3.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_4.p\n", + "Saving 
<class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_5.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_6.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_7.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_8.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_9.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_10.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_11.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_12.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_13.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_14.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_15.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_16.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_17.p\n", + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_18.p\n", + "Saving <class 'list'> 
to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\ChemicalFiles\\sample_19.p\n" + ] + } + ], + "source": [ + "multiple_samples = MultiSampleCreator(dataset, n_samples, classes, intensity_noise_sd, \n", + " change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities, dropout_numbers,\n", + " experimental_classes, experimental_probabilitities, experimental_sds, save_location=save_location)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "total_samples = np.sum(multiple_samples.n_samples)\n", + "total_samples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also print the chemicals that are missing (removed by drop-out) in each class." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\MissingChemicals\\missing_chemicals.p\n" + ] + }, + { + "data": { + "text/plain": [ + "[[KnownChemical - 'C21H28O2' rt=732.88 max_intensity=268684.17,\n", + " KnownChemical - 'C12H22O4' rt=548.52 max_intensity=743724.75,\n", + " KnownChemical - 'C9H15N5O' rt=429.14 max_intensity=402739.66,\n", + " KnownChemical - 'C10H15N3O2' rt=422.79 max_intensity=311677.09,\n", + " KnownChemical - 'C16H19NOS' rt=664.34 max_intensity=498105.07,\n", + " KnownChemical - 'C10H15N3O5' rt=633.61 max_intensity=531359.64,\n", + " KnownChemical - 'C6H13NO2' rt=371.39 max_intensity=3724193.90,\n", + " KnownChemical - 'C6HCl5O' rt=414.97 max_intensity=410884.25,\n", + " KnownChemical - 'C16H14ClN3O' rt=577.63 max_intensity=263971.98,\n", + " KnownChemical - 'C17H21N' rt=456.76 
max_intensity=588754.25,\n", + " KnownChemical - 'C18H24O3' rt=736.21 max_intensity=209830.14,\n", + " KnownChemical - 'C7H8O5S' rt=412.30 max_intensity=2309042.63,\n", + " KnownChemical - 'C9H11N5O3' rt=668.43 max_intensity=5617453.25,\n", + " KnownChemical - 'C15H15NO3' rt=682.22 max_intensity=394247.65,\n", + " KnownChemical - 'C9H21N3O' rt=574.84 max_intensity=1280653.26,\n", + " KnownChemical - 'C13H16N2O2' rt=691.56 max_intensity=562294.37,\n", + " KnownChemical - 'C6H10N6O' rt=519.99 max_intensity=502432.70,\n", + " KnownChemical - 'C9H10N4O2S2' rt=418.25 max_intensity=15487155.81,\n", + " KnownChemical - 'C10H16O' rt=498.36 max_intensity=8125012.33,\n", + " KnownChemical - 'C11H12N2O3S' rt=630.16 max_intensity=243343.91,\n", + " KnownChemical - 'C6H4Cl2O' rt=571.80 max_intensity=240326.54,\n", + " KnownChemical - 'C11H15N4O7PS' rt=445.30 max_intensity=838657.41,\n", + " KnownChemical - 'C9H11ClO3' rt=431.82 max_intensity=422349.71,\n", + " KnownChemical - 'C18H22O2' rt=677.54 max_intensity=179998.55,\n", + " KnownChemical - 'C7H13NO2' rt=603.42 max_intensity=1053665.42,\n", + " KnownChemical - 'C8H16N4O3' rt=738.43 max_intensity=4786603.69,\n", + " KnownChemical - 'C11H18N2O3' rt=496.44 max_intensity=470069.27,\n", + " KnownChemical - 'C10H7NO4' rt=490.82 max_intensity=1618315.43,\n", + " KnownChemical - 'C10H10N2O4S' rt=467.25 max_intensity=1474129.08,\n", + " KnownChemical - 'C12H13NO3' rt=674.18 max_intensity=308063.43,\n", + " KnownChemical - 'C3H2ClF5O' rt=495.82 max_intensity=333064.35,\n", + " KnownChemical - 'C13H13N3O3' rt=747.02 max_intensity=37506537.67,\n", + " KnownChemical - 'C8H11NO2' rt=511.73 max_intensity=616561.49,\n", + " KnownChemical - 'C8H11N' rt=647.43 max_intensity=7526641.04,\n", + " KnownChemical - 'C19H25NO' rt=667.03 max_intensity=301184.19,\n", + " KnownChemical - 'C7H11NO7P2' rt=425.89 max_intensity=3206938.17,\n", + " KnownChemical - 'C6H7N5' rt=738.73 max_intensity=1066550.16,\n", + " KnownChemical - 'C6H4O5' rt=659.22 
max_intensity=431609.33,\n", + " KnownChemical - 'C5H11Cl2N' rt=486.41 max_intensity=330500.12,\n", + " KnownChemical - 'C6H8S' rt=459.06 max_intensity=2414333.00,\n", + " KnownChemical - 'C19H30O3' rt=424.22 max_intensity=280506.40,\n", + " KnownChemical - 'C12H19NO3' rt=423.93 max_intensity=447323.93,\n", + " KnownChemical - 'C4H10NO6P' rt=576.86 max_intensity=530520.82,\n", + " KnownChemical - 'C8H11NO6S' rt=541.48 max_intensity=5418948.45,\n", + " KnownChemical - 'C9H18O' rt=733.79 max_intensity=1076025.65,\n", + " KnownChemical - 'C12H18' rt=633.73 max_intensity=205268.18,\n", + " KnownChemical - 'C4H6O5' rt=652.99 max_intensity=3882543.31,\n", + " KnownChemical - 'C11H15NO' rt=550.99 max_intensity=591309.97,\n", + " KnownChemical - 'C11H21N' rt=779.35 max_intensity=214960.02,\n", + " KnownChemical - 'C16H25NO4' rt=790.25 max_intensity=1409625.44,\n", + " KnownChemical - 'C10H22N4' rt=479.53 max_intensity=2385621.58,\n", + " KnownChemical - 'C21H28O5' rt=648.31 max_intensity=1022658.42,\n", + " KnownChemical - 'C12H15N3O' rt=703.97 max_intensity=832271.78,\n", + " KnownChemical - 'C5H4N2O4' rt=429.89 max_intensity=270859.15,\n", + " KnownChemical - 'C22H44O2' rt=598.82 max_intensity=374192.97,\n", + " KnownChemical - 'C9H17NO5' rt=562.37 max_intensity=868753.60,\n", + " KnownChemical - 'C7H7ClN2O4S' rt=752.38 max_intensity=9118598.67,\n", + " KnownChemical - 'C3H8NO6P' rt=710.13 max_intensity=351596.35,\n", + " KnownChemical - 'C6H11NO2' rt=686.97 max_intensity=434109.46,\n", + " KnownChemical - 'C16H11ClN4O' rt=672.93 max_intensity=189608.42,\n", + " KnownChemical - 'C7H8N2O2' rt=432.09 max_intensity=818248.42,\n", + " KnownChemical - 'C5H11NO3S' rt=589.52 max_intensity=205256.80,\n", + " KnownChemical - 'C5H8O3' rt=654.83 max_intensity=535188.33,\n", + " KnownChemical - 'C13H19NO' rt=715.87 max_intensity=311911.23,\n", + " KnownChemical - 'C12H14O5' rt=641.98 max_intensity=7157886.23,\n", + " KnownChemical - 'C8H17NO2' rt=398.12 max_intensity=181902.35,\n", 
+ " KnownChemical - 'C17H18N2O6S' rt=684.56 max_intensity=374322.36,\n", + " KnownChemical - 'C14H14ClNOS' rt=764.50 max_intensity=111436622.82,\n", + " KnownChemical - 'C12H12N2O2S' rt=682.35 max_intensity=1024654.94,\n", + " KnownChemical - 'C14H22ClNO2' rt=403.77 max_intensity=944853.27,\n", + " KnownChemical - 'C5H9NO3S' rt=448.67 max_intensity=1158111.31,\n", + " KnownChemical - 'C8H11NO' rt=611.11 max_intensity=250852.45,\n", + " KnownChemical - 'C15H12O4' rt=641.91 max_intensity=1515666.43,\n", + " KnownChemical - 'C16H21N3' rt=750.11 max_intensity=2869315.16,\n", + " KnownChemical - 'C6H5N5O2' rt=416.88 max_intensity=747785.86,\n", + " KnownChemical - 'C9H13NO3' rt=406.94 max_intensity=238145.31,\n", + " KnownChemical - 'C13H17N' rt=490.88 max_intensity=1569691.62,\n", + " KnownChemical - 'C14H19N5O4' rt=714.78 max_intensity=1042507.95,\n", + " KnownChemical - 'C10H11ClFN5O3' rt=610.71 max_intensity=1210228.45,\n", + " KnownChemical - 'C17H23NO3' rt=767.35 max_intensity=1093116.31,\n", + " KnownChemical - 'C8H8N4O3' rt=461.84 max_intensity=1824876.81,\n", + " KnownChemical - 'C8H12N2O3S' rt=713.20 max_intensity=761679.93,\n", + " KnownChemical - 'C2H7O2PS2' rt=433.48 max_intensity=4840608.73,\n", + " KnownChemical - 'C12H15N3O3S' rt=527.02 max_intensity=1565062.17,\n", + " KnownChemical - 'C5H4N4O3' rt=480.92 max_intensity=501374.75,\n", + " KnownChemical - 'C6H8O4' rt=464.52 max_intensity=3153279.28,\n", + " KnownChemical - 'C9H12' rt=570.47 max_intensity=9404500.19,\n", + " KnownChemical - 'C9H8O2' rt=670.91 max_intensity=913535.72,\n", + " KnownChemical - 'C18H34O2' rt=392.44 max_intensity=9110098.17,\n", + " KnownChemical - 'C10H12O3' rt=457.39 max_intensity=289902.71,\n", + " KnownChemical - 'C12H14O9S' rt=798.22 max_intensity=400196.91,\n", + " KnownChemical - 'C5H7N5O' rt=400.59 max_intensity=452837.14,\n", + " KnownChemical - 'C4H9NO3' rt=651.38 max_intensity=3004717.10,\n", + " KnownChemical - 'C19H26O8' rt=696.28 max_intensity=1006653.84,\n", + " 
KnownChemical - 'C6H14N2O2' rt=767.16 max_intensity=5296426.89,\n", + " KnownChemical - 'C13H17NO' rt=401.00 max_intensity=5124308.38,\n", + " KnownChemical - 'C9H6O3' rt=619.80 max_intensity=4507455.53,\n", + " KnownChemical - 'C10H11N3O4S' rt=485.32 max_intensity=285420.40,\n", + " KnownChemical - 'C7H13NO4' rt=690.26 max_intensity=1032994.16,\n", + " KnownChemical - 'C24H38O3' rt=732.51 max_intensity=357802.46,\n", + " KnownChemical - 'C8H14N2O5S' rt=429.96 max_intensity=11007582.80,\n", + " KnownChemical - 'C8H14O2' rt=655.77 max_intensity=19450570.99,\n", + " KnownChemical - 'C16H26N2O3' rt=595.49 max_intensity=401097.22,\n", + " KnownChemical - 'C10H9NO3' rt=598.88 max_intensity=218760.62,\n", + " KnownChemical - 'C10H8O' rt=612.73 max_intensity=39386911.13,\n", + " KnownChemical - 'C11H11N5' rt=438.39 max_intensity=333599.76,\n", + " KnownChemical - 'C13H20N2O2' rt=583.06 max_intensity=972039.85,\n", + " KnownChemical - 'C18H29NO3' rt=605.88 max_intensity=57893824.32,\n", + " KnownChemical - 'C10H12O7S' rt=494.39 max_intensity=209149.88,\n", + " KnownChemical - 'C10H16O4' rt=430.29 max_intensity=24570465.10,\n", + " KnownChemical - 'C6H12O2' rt=757.23 max_intensity=315928.28,\n", + " KnownChemical - 'C19H17NO2S' rt=652.91 max_intensity=3565226.06,\n", + " KnownChemical - 'C8H8N2O3S' rt=545.36 max_intensity=419531.73,\n", + " KnownChemical - 'C22H30N2' rt=421.11 max_intensity=317806.27,\n", + " KnownChemical - 'C22H34O2' rt=603.16 max_intensity=313232.44,\n", + " KnownChemical - 'C16H24N2O3' rt=526.49 max_intensity=227457.40,\n", + " KnownChemical - 'C6H8OS' rt=395.19 max_intensity=403740.30,\n", + " KnownChemical - 'C10H20N2S4' rt=464.66 max_intensity=50590970.03,\n", + " KnownChemical - 'C9H7NO5S' rt=557.53 max_intensity=450036.33,\n", + " KnownChemical - 'C6H12O4' rt=606.23 max_intensity=44589301.94,\n", + " KnownChemical - 'C18H23N3O' rt=441.36 max_intensity=1143101.37,\n", + " KnownChemical - 'C6H9N3O2' rt=766.83 max_intensity=412636306.69,\n", + " 
KnownChemical - 'C6H13O9P' rt=679.35 max_intensity=215595.51,\n", + " KnownChemical - 'C11H12N2O' rt=621.77 max_intensity=424192.10,\n", + " KnownChemical - 'C10H15N5' rt=654.25 max_intensity=14897770.19,\n", + " KnownChemical - 'C8H7Cl2N3' rt=455.45 max_intensity=406492.97,\n", + " KnownChemical - 'C11H16O4' rt=413.23 max_intensity=9675703.23,\n", + " KnownChemical - 'C15H21N3O2' rt=742.95 max_intensity=3485444.69,\n", + " KnownChemical - 'C8H18NO2' rt=715.11 max_intensity=1308542.60,\n", + " KnownChemical - 'C17H21NO' rt=549.09 max_intensity=1013581.18,\n", + " KnownChemical - 'C18H22N2' rt=678.22 max_intensity=5604308.23,\n", + " KnownChemical - 'C13H18ClNO2' rt=679.25 max_intensity=3650145.68,\n", + " KnownChemical - 'C10H16N4O3' rt=432.26 max_intensity=545845.21,\n", + " KnownChemical - 'C6H6N2O2' rt=663.91 max_intensity=1084353.62,\n", + " KnownChemical - 'C14H14ClNS' rt=558.12 max_intensity=1339579.44,\n", + " KnownChemical - 'C16H13ClN2O3' rt=411.33 max_intensity=894436.37,\n", + " KnownChemical - 'C18H19N' rt=537.16 max_intensity=951383.74,\n", + " KnownChemical - 'C18H13ClFN3' rt=405.52 max_intensity=20709928.79,\n", + " KnownChemical - 'C5H5N5' rt=487.52 max_intensity=1418998.22,\n", + " KnownChemical - 'C4H8N2O2' rt=612.59 max_intensity=2523861.70,\n", + " KnownChemical - 'C16H25NO2' rt=721.36 max_intensity=5410419.20,\n", + " KnownChemical - 'C4H7NO3' rt=613.34 max_intensity=673127.88,\n", + " KnownChemical - 'C4H11O3PS' rt=404.34 max_intensity=782686.31,\n", + " KnownChemical - 'C16H23NO2' rt=438.46 max_intensity=17597743.59,\n", + " KnownChemical - 'C5H4N4O' rt=417.01 max_intensity=438570.10,\n", + " KnownChemical - 'C8H15N3O3' rt=545.84 max_intensity=796822.91,\n", + " KnownChemical - 'C12H19NO6' rt=489.60 max_intensity=8214409.86,\n", + " KnownChemical - 'C23H26N2O3' rt=666.41 max_intensity=1386317.20,\n", + " KnownChemical - 'C9H12O4' rt=713.34 max_intensity=3469357.72,\n", + " KnownChemical - 'C18H30O2' rt=726.66 max_intensity=78179100.24,\n", + 
" KnownChemical - 'C10H12N2O7' rt=397.83 max_intensity=1001140.18,\n", + " KnownChemical - 'C10H17N3S' rt=489.65 max_intensity=5285547.62,\n", + " KnownChemical - 'C15H13N3O4S' rt=489.28 max_intensity=1661798.50,\n", + " KnownChemical - 'C7H8O' rt=574.60 max_intensity=1364246.38,\n", + " KnownChemical - 'C11H12O8S' rt=423.42 max_intensity=534892.58,\n", + " KnownChemical - 'C6H4O2' rt=542.41 max_intensity=1037189.99,\n", + " KnownChemical - 'C19H16O4' rt=761.36 max_intensity=752401.42,\n", + " KnownChemical - 'C7H8ClN3O4S2' rt=741.47 max_intensity=431068.59,\n", + " KnownChemical - 'C10H7N3S' rt=631.66 max_intensity=858145.50,\n", + " KnownChemical - 'C8H12N2O4Pt' rt=714.08 max_intensity=691460.41,\n", + " KnownChemical - 'C7H9N' rt=428.60 max_intensity=2992619.91,\n", + " KnownChemical - 'C5H9N3O4' rt=552.81 max_intensity=430667.01,\n", + " KnownChemical - 'C9H17NO4' rt=548.84 max_intensity=1585214.38,\n", + " KnownChemical - 'C21H28O6' rt=725.33 max_intensity=535020.65,\n", + " KnownChemical - 'C5H4N4O2S' rt=666.41 max_intensity=6147430.47,\n", + " KnownChemical - 'C17H18N2O6' rt=760.03 max_intensity=2548246.09,\n", + " KnownChemical - 'C5H10O4' rt=718.44 max_intensity=3299138.29,\n", + " KnownChemical - 'C6H9NO5' rt=488.72 max_intensity=434793.91,\n", + " KnownChemical - 'C8H10O7S' rt=535.68 max_intensity=14707248.45,\n", + " KnownChemical - 'C8H10O8' rt=420.71 max_intensity=307837.78,\n", + " KnownChemical - 'C12H12' rt=598.70 max_intensity=2135157.97,\n", + " KnownChemical - 'C12H17NO3' rt=764.74 max_intensity=973574.98,\n", + " KnownChemical - 'C11H20NO12P' rt=767.39 max_intensity=963879.53,\n", + " KnownChemical - 'C10H16N2O4' rt=536.86 max_intensity=5046407.27,\n", + " KnownChemical - 'C20H24N2O' rt=666.04 max_intensity=334453.84,\n", + " KnownChemical - 'C4H7FN2O3' rt=420.99 max_intensity=5593047.08,\n", + " KnownChemical - 'C8H7NO5' rt=480.48 max_intensity=1214024.74,\n", + " KnownChemical - 'C15H14O3' rt=551.55 max_intensity=30530950.70,\n", + " 
KnownChemical - 'C20H21N' rt=610.04 max_intensity=10173146.26,\n", + " KnownChemical - 'C8H6O4' rt=702.47 max_intensity=379137.14,\n", + " KnownChemical - 'C15H11N3O3' rt=617.55 max_intensity=197807.47,\n", + " KnownChemical - 'C9H13NO' rt=600.54 max_intensity=329869.10,\n", + " KnownChemical - 'C6H6O4' rt=465.54 max_intensity=831876.73,\n", + " KnownChemical - 'C23H27NO3' rt=753.81 max_intensity=660616.06,\n", + " KnownChemical - 'C3H8N2O2' rt=603.59 max_intensity=2322167.36,\n", + " KnownChemical - 'C8H12O7' rt=401.39 max_intensity=1410977.31,\n", + " KnownChemical - 'C9H16N4O7' rt=490.93 max_intensity=182023.26,\n", + " KnownChemical - 'C21H34O3' rt=482.65 max_intensity=1248462.12,\n", + " KnownChemical - 'C3H2F6O' rt=560.99 max_intensity=1337844.10,\n", + " KnownChemical - 'C6H8O5' rt=410.09 max_intensity=3945799.83,\n", + " KnownChemical - 'C12H16N2O4' rt=415.76 max_intensity=214303.00,\n", + " KnownChemical - 'C12H15ClO3' rt=394.44 max_intensity=3366345.98,\n", + " KnownChemical - 'C10H12N2O4' rt=409.85 max_intensity=525239.21,\n", + " KnownChemical - 'C12H16F3N' rt=698.73 max_intensity=328955.91,\n", + " KnownChemical - 'C12H14O7' rt=513.76 max_intensity=658346.70,\n", + " KnownChemical - 'C10H12N2' rt=673.49 max_intensity=1722883.32,\n", + " KnownChemical - 'C5H5N3O' rt=645.22 max_intensity=8046071.09,\n", + " KnownChemical - 'C14H20O' rt=723.73 max_intensity=360069.63,\n", + " KnownChemical - 'C14H15NO3' rt=386.72 max_intensity=1266889.65,\n", + " KnownChemical - 'C3H7O4P' rt=635.82 max_intensity=310941.29,\n", + " KnownChemical - 'C7H11N3O2' rt=425.66 max_intensity=1105918.75,\n", + " KnownChemical - 'C6H14O6' rt=662.02 max_intensity=2022789.04,\n", + " KnownChemical - 'C8H8N4' rt=410.00 max_intensity=42509865.69,\n", + " KnownChemical - 'C10H12O5S' rt=441.89 max_intensity=2216622.88,\n", + " KnownChemical - 'C9H8O' rt=789.56 max_intensity=2206330.94,\n", + " KnownChemical - 'C10H21N3O2' rt=406.35 max_intensity=998566.79,\n", + " KnownChemical - 'C8H12N2' 
rt=701.97 max_intensity=184239.83,\n", + " KnownChemical - 'C15H10O2' rt=405.96 max_intensity=606950.09,\n", + " KnownChemical - 'C8H10N2O' rt=669.49 max_intensity=1571835.79,\n", + " KnownChemical - 'C8H12N4O5' rt=525.09 max_intensity=741152.99,\n", + " KnownChemical - 'C3H6N2O2' rt=734.58 max_intensity=383783.71,\n", + " KnownChemical - 'C24H40O4' rt=583.10 max_intensity=681116.55,\n", + " KnownChemical - 'C4H6N4O3S2' rt=423.48 max_intensity=2047253.42,\n", + " KnownChemical - 'C13H10O' rt=535.31 max_intensity=740828.93,\n", + " KnownChemical - 'C7H10O7' rt=740.36 max_intensity=4263954.45,\n", + " KnownChemical - 'C9H7N7O2S' rt=539.57 max_intensity=6130110.21,\n", + " KnownChemical - 'C10H16N2O3S' rt=390.88 max_intensity=21566188.88,\n", + " KnownChemical - 'C23H34' rt=615.96 max_intensity=351347.44,\n", + " KnownChemical - 'C5H11NO4S' rt=436.60 max_intensity=1320867.63,\n", + " KnownChemical - 'C8H10FN3O3S' rt=395.89 max_intensity=499137.59,\n", + " KnownChemical - 'C11H12N2OS' rt=473.31 max_intensity=219750.47],\n", + " [KnownChemical - 'C9H15NO3S' rt=626.68 max_intensity=2101106.53,\n", + " KnownChemical - 'C10H17N' rt=518.94 max_intensity=3471723.43,\n", + " KnownChemical - 'C16H12O7' rt=524.40 max_intensity=857009.34,\n", + " KnownChemical - 'C4H5N3O2' rt=493.13 max_intensity=427065.01,\n", + " KnownChemical - 'C6HCl5O' rt=414.97 max_intensity=410884.25,\n", + " KnownChemical - 'C17H12Cl2N4O' rt=752.14 max_intensity=2939629.28,\n", + " KnownChemical - 'C9H8O4' rt=489.90 max_intensity=1988935.55,\n", + " KnownChemical - 'C18H27NO2' rt=675.88 max_intensity=186811.18,\n", + " KnownChemical - 'C2H8O7P2' rt=748.60 max_intensity=2000241.65,\n", + " KnownChemical - 'C10H20O2' rt=658.95 max_intensity=1987739.83,\n", + " KnownChemical - 'C7H8O5S' rt=412.30 max_intensity=2309042.63,\n", + " KnownChemical - 'C11H10N4O2' rt=632.70 max_intensity=429439.78,\n", + " KnownChemical - 'C11H19N5O2S2' rt=622.35 max_intensity=360919.28,\n", + " KnownChemical - 'C10H15N5O3' 
rt=438.81 max_intensity=1466022.86,\n", + " KnownChemical - 'C13H16N2O2' rt=691.56 max_intensity=562294.37,\n", + " KnownChemical - 'C13H25NO4' rt=441.24 max_intensity=7863378.39,\n", + " KnownChemical - 'C4H3F7O' rt=668.21 max_intensity=2205370.01,\n", + " KnownChemical - 'C9H12N2O2' rt=421.05 max_intensity=3017089.33,\n", + " KnownChemical - 'C6H10O5' rt=418.72 max_intensity=235048.56,\n", + " KnownChemical - 'C4H3FN2O2' rt=448.29 max_intensity=469152.81,\n", + " KnownChemical - 'C8H16N4O3' rt=738.43 max_intensity=4786603.69,\n", + " KnownChemical - 'C10H7NO4' rt=490.82 max_intensity=1618315.43,\n", + " KnownChemical - 'C3H2ClF5O' rt=495.82 max_intensity=333064.35,\n", + " KnownChemical - 'C3H6O4' rt=748.81 max_intensity=1870882.49,\n", + " KnownChemical - 'C9H13N3O4' rt=502.08 max_intensity=41047899.76,\n", + " KnownChemical - 'C7H17N2O2' rt=743.92 max_intensity=2702220.70,\n", + " KnownChemical - 'C8H6N4O5' rt=657.02 max_intensity=5657036.21,\n", + " KnownChemical - 'C10H12ClN3O3S' rt=597.10 max_intensity=2004007.99,\n", + " KnownChemical - 'C5H10N2O' rt=484.04 max_intensity=888219.45,\n", + " KnownChemical - 'C22H30O3' rt=525.96 max_intensity=2651568.40,\n", + " KnownChemical - 'C12H14N2O5S' rt=560.02 max_intensity=5932071.16,\n", + " KnownChemical - 'C6H8S' rt=459.06 max_intensity=2414333.00,\n", + " KnownChemical - 'C12H19NO3' rt=423.93 max_intensity=447323.93,\n", + " KnownChemical - 'C12H18O2' rt=540.02 max_intensity=730065.84,\n", + " KnownChemical - 'C9H9N3O2S2' rt=757.70 max_intensity=775341.61,\n", + " KnownChemical - 'C15H17NO2' rt=627.96 max_intensity=442922.38,\n", + " KnownChemical - 'C11H14ClNO3S' rt=576.65 max_intensity=7613172.04,\n", + " KnownChemical - 'C8H10O3' rt=614.36 max_intensity=567209.42,\n", + " KnownChemical - 'C8H11NO5S' rt=735.56 max_intensity=199493.09,\n", + " KnownChemical - 'C13H23NO4' rt=687.18 max_intensity=439986.85,\n", + " KnownChemical - 'C11H12N4O3S' rt=579.26 max_intensity=676147.99,\n", + " KnownChemical - 'C17H26ClN' 
rt=600.11 max_intensity=9556158.31,\n", + " KnownChemical - 'C10H21NO7' rt=574.11 max_intensity=604728.00,\n", + " KnownChemical - 'C11H21NO4' rt=693.64 max_intensity=372915.28,\n", + " KnownChemical - 'C5H5N5O' rt=484.64 max_intensity=456377.07,\n", + " KnownChemical - 'C14H14N2O' rt=570.31 max_intensity=239509.85,\n", + " KnownChemical - 'C5H4N2O4' rt=429.89 max_intensity=270859.15,\n", + " KnownChemical - 'C7H9ClO' rt=741.87 max_intensity=189972.95,\n", + " KnownChemical - 'C13H16N2O3S' rt=673.82 max_intensity=61073923.41,\n", + " KnownChemical - 'C22H44O2' rt=598.82 max_intensity=374192.97,\n", + " KnownChemical - 'C12H18N2O3' rt=610.64 max_intensity=1132662.08,\n", + " KnownChemical - 'C10H16NO' rt=643.11 max_intensity=216324.95,\n", + " KnownChemical - 'C9H17NO5' rt=562.37 max_intensity=868753.60,\n", + " KnownChemical - 'C4H11O4P' rt=555.30 max_intensity=715163.75,\n", + " KnownChemical - 'C3H7O7P' rt=430.83 max_intensity=293934.13,\n", + " KnownChemical - 'C4H9NO2' rt=599.40 max_intensity=354130.89,\n", + " KnownChemical - 'C5H11NO3S' rt=589.52 max_intensity=205256.80,\n", + " KnownChemical - 'C13H19NO' rt=715.87 max_intensity=311911.23,\n", + " KnownChemical - 'C21H36O5' rt=427.22 max_intensity=233023.87,\n", + " KnownChemical - 'C17H20N4OS' rt=439.75 max_intensity=2025653.92,\n", + " KnownChemical - 'C14H14ClNOS' rt=764.50 max_intensity=111436622.82,\n", + " KnownChemical - 'C12H12N2O2S' rt=682.35 max_intensity=1024654.94,\n", + " KnownChemical - 'C12H21NO' rt=715.62 max_intensity=387451.65,\n", + " KnownChemical - 'C8H11NO' rt=611.11 max_intensity=250852.45,\n", + " KnownChemical - 'C4H8O5' rt=673.58 max_intensity=508429.41,\n", + " KnownChemical - 'C10H8N6O' rt=413.01 max_intensity=4576973.93,\n", + " KnownChemical - 'C6H14N4O2' rt=769.58 max_intensity=22741497.85,\n", + " KnownChemical - 'C6H10O4' rt=545.84 max_intensity=508262.69,\n", + " KnownChemical - 'C12H23NO2S' rt=718.10 max_intensity=210642.11,\n", + " KnownChemical - 'C5H11NO2Se' rt=699.47 
max_intensity=6651243.08,\n", + " KnownChemical - 'C8H8O3' rt=646.00 max_intensity=677277.86,\n", + " KnownChemical - 'C14H21NO2' rt=745.54 max_intensity=2211652.62,\n", + " KnownChemical - 'C14H12O6S' rt=733.04 max_intensity=284539.30,\n", + " KnownChemical - 'C5H4N4O3' rt=480.92 max_intensity=501374.75,\n", + " KnownChemical - 'I2' rt=453.89 max_intensity=166933849.71,\n", + " KnownChemical - 'C8H7NO2' rt=425.28 max_intensity=1187800.06,\n", + " KnownChemical - 'C18H32O2' rt=533.88 max_intensity=2099293.85,\n", + " KnownChemical - 'C11H16N2O' rt=555.84 max_intensity=516183.71,\n", + " KnownChemical - 'C12H12N2O4' rt=408.72 max_intensity=514493.97,\n", + " KnownChemical - 'C6H14N2O' rt=593.92 max_intensity=453001.90,\n", + " KnownChemical - 'C10H26N4' rt=737.62 max_intensity=12346129.24,\n", + " KnownChemical - 'C15H10O10S' rt=416.58 max_intensity=414072.51,\n", + " KnownChemical - 'C7H11NO2' rt=424.34 max_intensity=471025.60,\n", + " KnownChemical - 'C14H16ClN3O4S2' rt=739.94 max_intensity=193303.86,\n", + " KnownChemical - 'C11H16ClN5' rt=451.98 max_intensity=22034454.67,\n", + " KnownChemical - 'C8H14O2' rt=655.77 max_intensity=19450570.99,\n", + " KnownChemical - 'C16H26N2O3' rt=595.49 max_intensity=401097.22,\n", + " KnownChemical - 'C9H16N2O2' rt=678.27 max_intensity=354111.46,\n", + " KnownChemical - 'C11H11N5' rt=438.39 max_intensity=333599.76,\n", + " KnownChemical - 'C19H27N5O4' rt=444.58 max_intensity=2970179.83,\n", + " KnownChemical - 'C19H25N5O4' rt=414.11 max_intensity=3545121.49,\n", + " KnownChemical - 'C21H23ClFN3O' rt=524.18 max_intensity=427821.38,\n", + " KnownChemical - 'C8H18O' rt=587.52 max_intensity=1031830.05,\n", + " KnownChemical - 'C12H20O4' rt=718.43 max_intensity=2079237.51,\n", + " KnownChemical - 'C11H12N4' rt=511.07 max_intensity=184316.02,\n", + " KnownChemical - 'C9H9Cl2N3' rt=549.73 max_intensity=2807066.89,\n", + " KnownChemical - 'C5H10N2O3' rt=495.50 max_intensity=430950.41,\n", + " KnownChemical - 'C8H16O2' rt=393.42 
max_intensity=322097.12,\n", + " KnownChemical - 'C12H9N3O5S' rt=414.29 max_intensity=1124232.91,\n", + " KnownChemical - 'C10H16O4' rt=430.29 max_intensity=24570465.10,\n", + " KnownChemical - 'C15H21N' rt=551.42 max_intensity=851267.32,\n", + " KnownChemical - 'C12H13N3O2' rt=638.96 max_intensity=4010414.36,\n", + " KnownChemical - 'C4H8S2' rt=463.97 max_intensity=772632.68,\n", + " KnownChemical - 'C22H34O2' rt=603.16 max_intensity=313232.44,\n", + " KnownChemical - 'C10H20N2S4' rt=464.66 max_intensity=50590970.03,\n", + " KnownChemical - 'C17H20FN3O3' rt=685.61 max_intensity=629690.40,\n", + " KnownChemical - 'C17H20N4O6' rt=422.90 max_intensity=1147997.66,\n", + " KnownChemical - 'C8H7Cl2N3' rt=455.45 max_intensity=406492.97,\n", + " KnownChemical - 'C21H25NO' rt=612.11 max_intensity=3265372.01,\n", + " KnownChemical - 'C11H16O4' rt=413.23 max_intensity=9675703.23,\n", + " KnownChemical - 'C11H12Cl2N2O5' rt=426.66 max_intensity=1341168.16,\n", + " KnownChemical - 'C15H21N3O2' rt=742.95 max_intensity=3485444.69,\n", + " KnownChemical - 'C8H18NO2' rt=715.11 max_intensity=1308542.60,\n", + " KnownChemical - 'C12H16O5' rt=672.56 max_intensity=632244.16,\n", + " KnownChemical - 'C6H7NO' rt=585.96 max_intensity=430059.12,\n", + " KnownChemical - 'C15H12N2O' rt=417.17 max_intensity=1733868.52,\n", + " KnownChemical - 'C14H11ClN2O4S' rt=682.54 max_intensity=718246.73,\n", + " KnownChemical - 'C17H17N3O3S' rt=687.57 max_intensity=244878.44,\n", + " KnownChemical - 'C4H8O4' rt=414.94 max_intensity=3985435.24,\n", + " KnownChemical - 'C18H19N' rt=537.16 max_intensity=951383.74,\n", + " KnownChemical - 'C5H5N5' rt=487.52 max_intensity=1418998.22,\n", + " KnownChemical - 'C16H25NO2' rt=721.36 max_intensity=5410419.20,\n", + " KnownChemical - 'C5H4N4O' rt=417.01 max_intensity=438570.10,\n", + " KnownChemical - 'C6H10S' rt=673.34 max_intensity=1181581.81,\n", + " KnownChemical - 'C12H19NO6' rt=489.60 max_intensity=8214409.86,\n", + " KnownChemical - 'C15H13N3O4S' rt=489.28 
max_intensity=1661798.50,\n", + " KnownChemical - 'C5H10O2' rt=464.51 max_intensity=4729035.21,\n", + " KnownChemical - 'C17H20FN3O4' rt=742.88 max_intensity=465376.23,\n", + " KnownChemical - 'C18H21ClN2O' rt=624.99 max_intensity=319533.30,\n", + " KnownChemical - 'C11H12Cl2N2O' rt=426.84 max_intensity=212978.90,\n", + " KnownChemical - 'C13H10N2O4' rt=760.79 max_intensity=3885782688.76,\n", + " KnownChemical - 'C10H17NOS' rt=505.31 max_intensity=467520.05,\n", + " KnownChemical - 'C7H7NO4' rt=422.82 max_intensity=220877.98,\n", + " KnownChemical - 'C14H20N2O2' rt=490.24 max_intensity=181082.59,\n", + " KnownChemical - 'C7H9N' rt=428.60 max_intensity=2992619.91,\n", + " KnownChemical - 'C5H10O4' rt=718.44 max_intensity=3299138.29,\n", + " KnownChemical - 'C14H10O5' rt=554.89 max_intensity=1443864.49,\n", + " KnownChemical - 'C20H27N' rt=421.89 max_intensity=1825335.98,\n", + " KnownChemical - 'C15H17F3N2O4' rt=712.12 max_intensity=6459473.72,\n", + " KnownChemical - 'C12H12' rt=598.70 max_intensity=2135157.97,\n", + " KnownChemical - 'C11H20NO12P' rt=767.39 max_intensity=963879.53,\n", + " KnownChemical - 'C7H14N2O3' rt=469.75 max_intensity=7653059.83,\n", + " KnownChemical - 'C7H6O2' rt=549.70 max_intensity=216294.58,\n", + " KnownChemical - 'C22H19ClO3' rt=616.17 max_intensity=10800178.50,\n", + " KnownChemical - 'C11H11NO3' rt=592.35 max_intensity=729313.56,\n", + " KnownChemical - 'C10H11F3N2O5' rt=718.04 max_intensity=17880991.02,\n", + " KnownChemical - 'C12H14ClNO' rt=541.22 max_intensity=481831.46,\n", + " KnownChemical - 'C16H15N3' rt=675.55 max_intensity=1287028.92,\n", + " KnownChemical - 'C12H14N2O3' rt=418.29 max_intensity=418187.22,\n", + " KnownChemical - 'C11H15NO5' rt=609.66 max_intensity=1586316.60,\n", + " KnownChemical - 'C19H28O2' rt=679.99 max_intensity=1021449.94,\n", + " KnownChemical - 'C2H7O3PS' rt=734.74 max_intensity=256371.94,\n", + " KnownChemical - 'C15H10N2O' rt=783.17 max_intensity=5513242.69,\n", + " KnownChemical - 'C8H10O' 
rt=450.50 max_intensity=2973122.63,\n", + " KnownChemical - 'C10H12N2' rt=673.49 max_intensity=1722883.32,\n", + " KnownChemical - 'C15H16O2' rt=537.18 max_intensity=320802.95,\n", + " KnownChemical - 'C2H8NO4P' rt=583.75 max_intensity=2735449.71,\n", + " KnownChemical - 'C5H5N3O' rt=645.22 max_intensity=8046071.09,\n", + " KnownChemical - 'C8H14O4' rt=614.27 max_intensity=651798.11,\n", + " KnownChemical - 'C10H12N4O4S' rt=793.94 max_intensity=244338.24,\n", + " KnownChemical - 'C13H18O3' rt=415.14 max_intensity=959573.88,\n", + " KnownChemical - 'C7H12O4' rt=626.10 max_intensity=177177.68,\n", + " KnownChemical - 'C9H20N2O2' rt=504.15 max_intensity=826317.38,\n", + " KnownChemical - 'C15H23NO3' rt=417.30 max_intensity=1863502.20,\n", + " KnownChemical - 'C2H8NO3P' rt=768.48 max_intensity=188501.80,\n", + " KnownChemical - 'C12H14O4' rt=533.36 max_intensity=1616412.72,\n", + " KnownChemical - 'C8H14O7' rt=562.71 max_intensity=4202246.40,\n", + " KnownChemical - 'C7H11N3O2' rt=425.66 max_intensity=1105918.75,\n", + " KnownChemical - 'C7H8N4O3' rt=492.67 max_intensity=533517.76,\n", + " KnownChemical - 'C8H8N4' rt=410.00 max_intensity=42509865.69,\n", + " KnownChemical - 'C6H7N3O' rt=475.60 max_intensity=3760136.05,\n", + " KnownChemical - 'C2H6O4S' rt=433.27 max_intensity=8958016.61,\n", + " KnownChemical - 'C21H29NO2' rt=544.39 max_intensity=255465.65,\n", + " KnownChemical - 'C8H12N2' rt=701.97 max_intensity=184239.83,\n", + " KnownChemical - 'C6H12O3' rt=591.53 max_intensity=2758937.95,\n", + " KnownChemical - 'C12H10N2O5' rt=558.98 max_intensity=602319.18,\n", + " KnownChemical - 'C14H30NO7P' rt=471.42 max_intensity=782748.15,\n", + " KnownChemical - 'C11H16N5O7PS' rt=578.04 max_intensity=351374.90,\n", + " KnownChemical - 'CH3O5P' rt=714.29 max_intensity=17422591.51,\n", + " KnownChemical - 'C6H11NO4' rt=516.85 max_intensity=4559452.41,\n", + " KnownChemical - 'C7H6ClN3O4S2' rt=449.88 max_intensity=336501.37,\n", + " KnownChemical - 'C7H10O7' rt=740.36 
max_intensity=4263954.45,\n", + " KnownChemical - 'C10H16N2O3S' rt=390.88 max_intensity=21566188.88,\n", + " KnownChemical - 'C6H6N4S' rt=452.09 max_intensity=412529.32,\n", + " KnownChemical - 'C7H15NO3' rt=488.06 max_intensity=998140.46,\n", + " KnownChemical - 'C19H23NO4' rt=552.61 max_intensity=892392.75,\n", + " KnownChemical - 'C11H15N5O3S' rt=401.34 max_intensity=723652.11]]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "save_obj(multiple_samples.missing_chemicals, Path(out_dir, 'MissingChemicals', 'missing_chemicals.p'))\n", + "multiple_samples.missing_chemicals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run MS1 controller on the samples and generate .mzML files" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now take the multiple samples created above and generate mass spectral data (.mzML files) using the MS1 controller in ViMMS." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_0_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.81993000000045it [00:04, 92.48it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\n", + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_1_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.7331000000005it [00:04, 95.30it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating 
C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_2_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.23352599999976it [00:04, 95.66it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_3_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.06701999999984it [00:04, 93.63it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_4_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.8218000000011it [00:04, 96.63it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_5_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.67342999999937it [00:04, 93.35it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_6_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.9388760000012it [00:04, 91.21it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_7_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.7767300000007it [00:04, 97.70it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating 
C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_8_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "401.0148999999999it [00:04, 96.84it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_9_class_0.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.10175700000036it [00:04, 90.83it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_0_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "401.1556000000006it [00:04, 92.73it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_1_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.1441599999994it [00:04, 86.05it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_2_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.8101270000003it [00:04, 93.60it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_3_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.6406600000005it [00:04, 91.75it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating 
C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_4_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.2084600000003it [00:04, 89.61it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_5_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "401.14851999999894it [00:04, 81.52it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_6_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.3037899999998it [00:04, 92.11it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_7_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.75644999999827it [00:04, 88.37it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_8_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.00880000000143it [00:04, 90.58it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generating C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\MS1_multiple\\mzMLFiles\\number_9_class_1.mzML\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "400.82949000000076it [00:04, 80.46it/s] \n" + ] + } + ], + "source": [ + "min_rt = rt_range[0][0]\n", + "max_rt = rt_range[0][1]\n", + "controllers = defaultdict(list)\n", + 
"controller_to_mzml = {}\n", + "\n", + "mzml_dir = Path(out_dir, 'mzMLFiles')\n", + "num_classes = len(n_samples)\n", + "sample_idx = 0\n", + "for j in range(num_classes): # loop over classes\n", + " num_samples = n_samples[j]\n", + " for i in range(num_samples): # loop over samples for each class\n", + " \n", + " # load the sample\n", + " fname = Path(save_location, 'sample_%d.p' % sample_idx) \n", + " sample = load_obj(fname)\n", + " sample_idx += 1\n", + " \n", + " # define output .mzML filename\n", + " out_file = 'number_%d_class_%d.mzML' % (i, j)\n", + " out_path = Path(mzml_dir, out_file)\n", + " print('Generating %s' % out_path)\n", + "\n", + " # run it through the MS1 controller \n", + " mass_spec = IndependentMassSpectrometer(POSITIVE, sample, ps)\n", + " controller = SimpleMs1Controller(mass_spec)\n", + " controller.run(min_rt, max_rt)\n", + " controller.write_mzML('my_analysis', out_path)\n", + "\n", + " # save the resulting controller\n", + " controllers[j].append(controller)\n", + " controller_to_mzml[controller] = (j, out_file, )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print out the missing peaks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The controller object contains all the information about the state of the mass spectrometry process over time. Below we demonstrate this by generating a report of the peaks corresponding to chemicals that are present in one class but missing from the other class. This can be useful in the benchmark evaluation of peak picking or alignment algorithms."
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chem_to_peaks(controller):\n", + " chem_to_peaks = defaultdict(list)\n", + " frag_events = controller.mass_spec.fragmentation_events\n", + " for frag_event in frag_events:\n", + " chem = frag_event.chem\n", + " peaks = frag_event.peaks\n", + " chem_to_peaks[chem].extend(peaks)\n", + " return chem_to_peaks" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "for controller, (current_class, mzml_filename) in controller_to_mzml.items():\n", + " controller_peaks = get_chem_to_peaks(controller)\n", + " basename = os.path.basename(mzml_filename)\n", + " front, back = os.path.splitext(mzml_filename)\n", + " outfile = front + '.csv'\n", + "\n", + " missing_peaks = [] \n", + " for other_class in range(num_classes):\n", + " if current_class == other_class:\n", + " continue\n", + "\n", + " # get the peaks that are present in current_class but missing in other_class\n", + " missing_chems = multiple_samples.missing_chemicals[other_class]\n", + " for chem in missing_chems:\n", + " peaks = controller_peaks[chem]\n", + " for peak in peaks:\n", + " row = (chem.formula.formula_string, current_class, other_class, peak.mz, peak.rt, peak.intensity)\n", + " missing_peaks.append(row)\n", + " \n", + " # convert to dataframe\n", + " columns = ['formula', 'present_in', 'missing_in', 'mz', 'RT', 'intensity']\n", + " missing_df = pd.DataFrame(missing_peaks, columns=columns)\n", + " missing_df.to_csv(os.path.join(out_dir, 'MissingChemicals', os.path.basename(outfile)))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " 
.dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>formula</th>\n", + " <th>present_in</th>\n", + " <th>missing_in</th>\n", + " <th>mz</th>\n", + " <th>RT</th>\n", + " <th>intensity</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>313.216480</td>\n", + " <td>733.48489</td>\n", + " <td>190887.063824</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>330.243026</td>\n", + " <td>733.48489</td>\n", + " <td>36691.272425</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>389.128243</td>\n", + " <td>733.48489</td>\n", + " <td>89999.078686</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>642.451956</td>\n", + " <td>733.48489</td>\n", + " <td>52228.312111</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>314.219835</td>\n", + " <td>733.48489</td>\n", + " <td>44585.350600</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>315.223189</td>\n", + " <td>733.48489</td>\n", + " <td>4958.936872</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>313.216438</td>\n", + " <td>734.96489</td>\n", + " <td>201067.826579</td>\n", + " </tr>\n", + " <tr>\n", + " <td>7</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>330.242984</td>\n", + " <td>734.96489</td>\n", + " <td>38648.163229</td>\n", + " </tr>\n", + " <tr>\n", + " <td>8</td>\n", + " 
<td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>389.128201</td>\n", + " <td>734.96489</td>\n", + " <td>94799.085820</td>\n", + " </tr>\n", + " <tr>\n", + " <td>9</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>642.451914</td>\n", + " <td>734.96489</td>\n", + " <td>55013.854746</td>\n", + " </tr>\n", + " <tr>\n", + " <td>10</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>314.219792</td>\n", + " <td>734.96489</td>\n", + " <td>46963.263842</td>\n", + " </tr>\n", + " <tr>\n", + " <td>11</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>315.223147</td>\n", + " <td>734.96489</td>\n", + " <td>5223.416605</td>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>313.216161</td>\n", + " <td>736.20189</td>\n", + " <td>213513.273571</td>\n", + " </tr>\n", + " <tr>\n", + " <td>13</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>330.242708</td>\n", + " <td>736.20189</td>\n", + " <td>41040.359310</td>\n", + " </tr>\n", + " <tr>\n", + " <td>14</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>389.127925</td>\n", + " <td>736.20189</td>\n", + " <td>100666.842077</td>\n", + " </tr>\n", + " <tr>\n", + " <td>15</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>642.451638</td>\n", + " <td>736.20189</td>\n", + " <td>58419.034106</td>\n", + " </tr>\n", + " <tr>\n", + " <td>16</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>314.219516</td>\n", + " <td>736.20189</td>\n", + " <td>49870.137710</td>\n", + " </tr>\n", + " <tr>\n", + " <td>17</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>315.222871</td>\n", + " <td>736.20189</td>\n", + " <td>5546.729169</td>\n", + " </tr>\n", + " <tr>\n", + " <td>18</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", 
+ " <td>0</td>\n", + " <td>313.216065</td>\n", + " <td>737.46289</td>\n", + " <td>208088.886377</td>\n", + " </tr>\n", + " <tr>\n", + " <td>19</td>\n", + " <td>C21H28O2</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>330.242612</td>\n", + " <td>737.46289</td>\n", + " <td>39997.713128</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " formula present_in missing_in mz RT intensity\n", + "0 C21H28O2 1 0 313.216480 733.48489 190887.063824\n", + "1 C21H28O2 1 0 330.243026 733.48489 36691.272425\n", + "2 C21H28O2 1 0 389.128243 733.48489 89999.078686\n", + "3 C21H28O2 1 0 642.451956 733.48489 52228.312111\n", + "4 C21H28O2 1 0 314.219835 733.48489 44585.350600\n", + "5 C21H28O2 1 0 315.223189 733.48489 4958.936872\n", + "6 C21H28O2 1 0 313.216438 734.96489 201067.826579\n", + "7 C21H28O2 1 0 330.242984 734.96489 38648.163229\n", + "8 C21H28O2 1 0 389.128201 734.96489 94799.085820\n", + "9 C21H28O2 1 0 642.451914 734.96489 55013.854746\n", + "10 C21H28O2 1 0 314.219792 734.96489 46963.263842\n", + "11 C21H28O2 1 0 315.223147 734.96489 5223.416605\n", + "12 C21H28O2 1 0 313.216161 736.20189 213513.273571\n", + "13 C21H28O2 1 0 330.242708 736.20189 41040.359310\n", + "14 C21H28O2 1 0 389.127925 736.20189 100666.842077\n", + "15 C21H28O2 1 0 642.451638 736.20189 58419.034106\n", + "16 C21H28O2 1 0 314.219516 736.20189 49870.137710\n", + "17 C21H28O2 1 0 315.222871 736.20189 5546.729169\n", + "18 C21H28O2 1 0 313.216065 737.46289 208088.886377\n", + "19 C21H28O2 1 0 330.242612 737.46289 39997.713128" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "missing_df.head(20)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Synthetic data creation scripts/vimms_data_generation/04. Top-N Simulations.ipynb b/Synthetic data creation scripts/vimms_data_generation/04. Top-N Simulations.ipynb new file mode 100644 index 00000000..41810b5c --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/04. Top-N Simulations.ipynb @@ -0,0 +1,791 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Top-N Simulations from Actual Experimental Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook loads the existing Beer1pos data, runs it through the simulator, and compares the simulated results to the initial input data. The results here correspond to Section 3.2 in the paper." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pylab as plt\n", + "import pymzml" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from vimms.Roi import RoiToChemicalCreator, make_roi\n", + "from vimms.DataGenerator import DataSource, PeakSampler, get_spectral_feature_database\n", + "from vimms.MassSpec import IndependentMassSpectrometer\n", + "from vimms.Controller import TopNController\n", + "from vimms.PlotsForPaper import count_stuff, plot_num_scans, match_peaklist, check_found_matches, \\\n", + "plot_matched_intensities, plot_matched_precursors\n", + "from vimms.Common import
*" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_debug()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "base_dir = 'example_data'\n", + "mzml_path = os.path.join(base_dir, 'beers', 'fragmentation', 'mzML')\n", + "file_name = 'Beer_multibeers_1_T10_POS.mzML'\n", + "\n", + "experiment_name = 'mzml_compare'\n", + "experiment_out_dir = os.path.join(base_dir, 'results', experiment_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "min_rt = 0\n", + "max_rt = 1441" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "kde_min_ms1_intensity = 0 # min intensity to be selected for kdes\n", + "kde_min_ms2_intensity = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### a. ROI extraction parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "roi_mz_tol = 10\n", + "roi_min_length = 1\n", + "roi_min_intensity = 0\n", + "roi_start_rt = min_rt\n", + "roi_stop_rt = max_rt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b. 
Top-N parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "isolation_window = 1 # the isolation window in Dalton around a selected precursor ion\n", + "ionisation_mode = POSITIVE\n", + "N = 10\n", + "rt_tol = 15\n", + "mz_tol = 10\n", + "min_ms1_intensity = 1.75E5 # minimum ms1 intensity to fragment" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "mzml_out = os.path.join(experiment_out_dir, 'simulated.mzML')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Train densities" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO : DataSource : Loading Beer_multibeers_1_T10_POS.mzML\n" + ] + } + ], + "source": [ + "ds = DataSource()\n", + "ds.load_data(mzml_path, file_name=file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : PeakSampler : Extracted 7647 MS2 scans\n", + "DEBUG : PeakSampler : Computing parent intensity proportions\n", + "DEBUG : PeakSampler : Extracting scan durations\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=1\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x00000172559A4630>\n", + "INFO : DataSource : Using values from scans\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x00000172559A4630>\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=2\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x00000172559A4630>\n", + "INFO : DataSource : Using values from scans\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from 
<vimms.DataGenerator.DataSource object at 0x00000172559A4630>\n" + ] + } + ], + "source": [ + "bandwidth_mz_intensity_rt=1.0\n", + "bandwidth_n_peaks=1.0\n", + "ps = get_spectral_feature_database(ds, file_name, kde_min_ms1_intensity, kde_min_ms2_intensity, min_rt, max_rt,\n", + " bandwidth_mz_intensity_rt, bandwidth_n_peaks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Extract all ROIs" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "mzml_file = os.path.join(mzml_path, file_name)\n", + "good_roi, junk = make_roi(mzml_file, mz_tol=roi_mz_tol, mz_units='ppm', min_length=roi_min_length,\n", + " min_intensity=roi_min_intensity, start_rt=roi_start_rt, stop_rt=roi_stop_rt)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "512540" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_roi = good_roi + junk\n", + "len(all_roi)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many singleton and non-singleton ROIs?" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "352967" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([roi for roi in all_roi if roi.n == 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "159573" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([roi for roi in all_roi if roi.n > 1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Keep only the ROIs that can be fragmented above **min_ms1_intensity threshold**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "175000.0" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "min_ms1_intensity" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10190" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "keep = []\n", + "for roi in all_roi:\n", + " if np.count_nonzero(np.array(roi.intensity_list) > min_ms1_intensity) > 0:\n", + " keep.append(roi)\n", + "\n", + "all_roi = keep\n", + "len(keep)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Turn ROIs into chromatograms/chemicals" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 10190\n", + "INFO : RoiToChemicalCreator : Found 10190 ROIs above thresholds\n" + ] + } + ], + "source": [ + "set_log_level_debug()\n", + "rtcc = RoiToChemicalCreator(ps, all_roi)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to example_data\\results\\mzml_compare\\dataset.p\n" + ] + } + ], + "source": [ + "data = rtcc.chemicals\n", + "save_obj(data, os.path.join(experiment_out_dir, 'dataset.p'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Run Top-N Controller" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_warning()\n", + "pbar = True" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1441.1996749999914it [01:41, 14.17it/s] \n" + ] + } + ], + "source": [ + "mass_spec = IndependentMassSpectrometer(ionisation_mode, data, ps)\n", + "controller = TopNController(mass_spec, N, isolation_window, mz_tol,\n", + " rt_tol, min_ms1_intensity)\n", + "controller.run(min_rt, max_rt, pbar)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "controller.write_mzML('my_analysis', mzml_out)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'example_data\\\\results\\\\mzml_compare\\\\simulated.mzML'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mzml_out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Compare Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load simulated and real data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of ms1 scans = 1258\n", + "Number of ms2 scans = 8619\n", + "Total scans = 9877\n", + "Number of selected precursors = 8619\n" + ] + } + ], + "source": [ + "simulated_input_file = mzml_out\n", + "simulated_mzs, simulated_rts, simulated_intensities, simulated_cumsum_ms1, simulated_cumsum_ms2 = count_stuff(\n", + " simulated_input_file, min_rt, max_rt)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of ms1 scans = 1751\n", + "Number of ms2 scans = 7655\n", + "Total scans = 9406\n", + "Number of selected precursors = 7672\n" + ] + } + ], + "source": [ + "real_input_file = mzml_file\n", + "real_mzs, real_rts, real_intensities, real_cumsum_ms1, real_cumsum_ms2 = count_stuff(\n", + " real_input_file, min_rt, max_rt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot number of scans" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.rcParams.update({'font.size': 14})\n", + "out_file = os.path.join(base_dir, 'results', 'topN_num_scans.png')\n", + "plot_num_scans(real_cumsum_ms1, real_cumsum_ms2, simulated_cumsum_ms1, simulated_cumsum_ms2, out_file)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check the number of precursors that could be matched at different m/z and RT tolerances" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 4875/7655 (0.636839)\n", + 
"Real\t\t\t\t\t\tSimulated\n", + "mz 144.98 rt 0.6005 intensity 1548081.2500\tmz 144.98 rt 0.5394 intensity 1735726.1927\n", + "mz 207.16 rt 0.7352 intensity 1443976.8750\tmz 207.16 rt 0.6714 intensity 1517306.5950\n", + "mz 126.05 rt 0.8693 intensity 1087971.0000\tmz 126.05 rt 0.8114 intensity 1324472.7979\n", + "mz 146.98 rt 1.0029 intensity 727259.1875\tmz 146.98 rt 0.8914 intensity 808759.8595\n", + "mz 224.19 rt 1.1375 intensity 395470.6562\tmz 224.19 rt 1.3004 intensity 406930.1763\n", + "mz 338.34 rt 1.2750 intensity 385685.9688\tmz 338.34 rt 1.1644 intensity 415747.1931\n", + "mz 116.07 rt 1.4077 intensity 366917.8750\tmz 116.07 rt 1.0274 intensity 488422.7070\n", + "mz 131.13 rt 1.5412 intensity 322462.5000\tmz 131.13 rt 1.4334 intensity 381448.3994\n", + "mz 104.11 rt 1.6740 intensity 267410.7188\tmz 104.11 rt 1.5724 intensity 329901.3222\n", + "mz 228.20 rt 2.0738 intensity 605814.3125\tmz 228.20 rt 2.3074 intensity 600740.7569\n", + "mz 162.08 rt 2.2073 intensity 955391.5000\tmz 162.08 rt 1.7074 intensity 324396.9502\n", + "mz 128.95 rt 2.4319 intensity 846180.3125\tmz 128.95 rt 2.0474 intensity 836551.3300\n", + "mz 144.93 rt 2.5655 intensity 3498059.0000\tmz 144.93 rt 2.4424 intensity 509680.6810\n", + "mz 125.07 rt 2.6990 intensity 506922.0938\tmz 125.07 rt 2.5504 intensity 501693.7715\n", + "mz 144.07 rt 2.8327 intensity 501226.1562\tmz 144.07 rt 2.6274 intensity 495539.7049\n", + "mz 83.06 rt 2.9652 intensity 430810.1875\tmz 83.06 rt 2.7604 intensity 426244.7156\n", + "mz 149.01 rt 3.0988 intensity 392989.6250\tmz 149.01 rt 2.8944 intensity 389704.9924\n", + "mz 88.08 rt 3.2314 intensity 388301.0312\tmz 88.08 rt 3.0454 intensity 383107.1695\n", + "mz 102.13 rt 3.6555 intensity 343353.2812\tmz 102.13 rt 3.1804 intensity 358989.9052\n" + ] + } + ], + "source": [ + "mz_tol = None # in ppm. 
if None, then 2 decimal places is used for matching the m/z\n", + "rt_tol = 5 # seconds\n", + "matches = match_peaklist(real_mzs, real_rts, real_intensities, simulated_mzs, simulated_rts, simulated_intensities, mz_tol, rt_tol)\n", + "check_found_matches(matches, 'Real', 'Simulated')" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 6546/7655 (0.855127)\n", + "Real\t\t\t\t\t\tSimulated\n", + "mz 144.98 rt 0.6005 intensity 1548081.2500\tmz 144.98 rt 0.5394 intensity 1735726.1927\n", + "mz 207.16 rt 0.7352 intensity 1443976.8750\tmz 207.16 rt 0.6714 intensity 1517306.5950\n", + "mz 126.05 rt 0.8693 intensity 1087971.0000\tmz 126.05 rt 0.8114 intensity 1324472.7979\n", + "mz 146.98 rt 1.0029 intensity 727259.1875\tmz 146.98 rt 0.8914 intensity 808759.8595\n", + "mz 224.19 rt 1.1375 intensity 395470.6562\tmz 224.19 rt 1.3004 intensity 406930.1763\n", + "mz 338.34 rt 1.2750 intensity 385685.9688\tmz 338.34 rt 1.1644 intensity 415747.1931\n", + "mz 116.07 rt 1.4077 intensity 366917.8750\tmz 116.07 rt 1.0274 intensity 488422.7070\n", + "mz 131.13 rt 1.5412 intensity 322462.5000\tmz 131.13 rt 1.4334 intensity 381448.3994\n", + "mz 104.11 rt 1.6740 intensity 267410.7188\tmz 104.11 rt 1.5724 intensity 329901.3222\n", + "mz 228.20 rt 2.0738 intensity 605814.3125\tmz 228.20 rt 2.3074 intensity 600740.7569\n", + "mz 162.08 rt 2.2073 intensity 955391.5000\tmz 162.08 rt 1.7074 intensity 324396.9502\n", + "mz 128.95 rt 2.4319 intensity 846180.3125\tmz 128.95 rt 2.0474 intensity 836551.3300\n", + "mz 144.93 rt 2.5655 intensity 3498059.0000\tmz 144.93 rt 2.4424 intensity 509680.6810\n", + "mz 125.07 rt 2.6990 intensity 506922.0938\tmz 125.07 rt 2.5504 intensity 501693.7715\n", + "mz 144.07 rt 2.8327 intensity 501226.1562\tmz 144.07 rt 2.6274 intensity 495539.7049\n", + "mz 83.06 rt 2.9652 intensity 430810.1875\tmz 83.06 rt 2.7604 intensity 426244.7156\n", + "mz 149.01 
rt 3.0988 intensity 392989.6250\tmz 149.01 rt 2.8944 intensity 389704.9924\n", + "mz 88.08 rt 3.2314 intensity 388301.0312\tmz 88.08 rt 3.0454 intensity 383107.1695\n", + "mz 102.13 rt 3.6555 intensity 343353.2812\tmz 102.13 rt 3.1804 intensity 358989.9052\n" + ] + } + ], + "source": [ + "mz_tol = None\n", + "rt_tol = 10\n", + "matches = match_peaklist(real_mzs, real_rts, real_intensities, simulated_mzs, simulated_rts, simulated_intensities, mz_tol, rt_tol)\n", + "check_found_matches(matches, 'Real', 'Simulated')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 6759/7655 (0.882952)\n", + "Real\t\t\t\t\t\tSimulated\n", + "mz 144.98 rt 0.6005 intensity 1548081.2500\tmz 144.98 rt 0.5394 intensity 1735726.1927\n", + "mz 207.16 rt 0.7352 intensity 1443976.8750\tmz 207.16 rt 0.6714 intensity 1517306.5950\n", + "mz 126.05 rt 0.8693 intensity 1087971.0000\tmz 126.05 rt 0.8114 intensity 1324472.7979\n", + "mz 146.98 rt 1.0029 intensity 727259.1875\tmz 146.98 rt 0.8914 intensity 808759.8595\n", + "mz 224.19 rt 1.1375 intensity 395470.6562\tmz 224.19 rt 1.3004 intensity 406930.1763\n", + "mz 338.34 rt 1.2750 intensity 385685.9688\tmz 338.34 rt 1.1644 intensity 415747.1931\n", + "mz 116.07 rt 1.4077 intensity 366917.8750\tmz 116.07 rt 1.0274 intensity 488422.7070\n", + "mz 131.13 rt 1.5412 intensity 322462.5000\tmz 131.13 rt 1.4334 intensity 381448.3994\n", + "mz 104.11 rt 1.6740 intensity 267410.7188\tmz 104.11 rt 1.5724 intensity 329901.3222\n", + "mz 228.20 rt 2.0738 intensity 605814.3125\tmz 228.20 rt 2.3074 intensity 600740.7569\n", + "mz 162.08 rt 2.2073 intensity 955391.5000\tmz 162.08 rt 1.7074 intensity 324396.9502\n", + "mz 128.95 rt 2.4319 intensity 846180.3125\tmz 128.95 rt 2.0474 intensity 836551.3300\n", + "mz 144.93 rt 2.5655 intensity 3498059.0000\tmz 144.93 rt 2.4424 intensity 509680.6810\n", + "mz 125.07 rt 2.6990 intensity 506922.0938\tmz 
125.07 rt 2.5504 intensity 501693.7715\n", + "mz 144.07 rt 2.8327 intensity 501226.1562\tmz 144.07 rt 2.6274 intensity 495539.7049\n", + "mz 83.06 rt 2.9652 intensity 430810.1875\tmz 83.06 rt 2.7604 intensity 426244.7156\n", + "mz 149.01 rt 3.0988 intensity 392989.6250\tmz 149.01 rt 2.8944 intensity 389704.9924\n", + "mz 88.08 rt 3.2314 intensity 388301.0312\tmz 88.08 rt 3.0454 intensity 383107.1695\n", + "mz 102.13 rt 3.6555 intensity 343353.2812\tmz 102.13 rt 3.1804 intensity 358989.9052\n" + ] + } + ], + "source": [ + "mz_tol = None\n", + "rt_tol = 15\n", + "matches = match_peaklist(real_mzs, real_rts, real_intensities, simulated_mzs, simulated_rts, simulated_intensities, mz_tol, rt_tol)\n", + "check_found_matches(matches, 'Real', 'Simulated')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot the matches" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "unmatched_intensities = []\n", + "matched_intensities = []\n", + "for key, value in list(matches.items()):\n", + " intensity = key[2]\n", + " if value is None:\n", + " unmatched_intensities.append(intensity)\n", + " else:\n", + " matched_intensities.append(intensity)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "plt.rcParams.update({'font.size': 18}) " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "out_file = os.path.join(base_dir, 'results', 'topN_matched_intensities.png')\n", + "plot_matched_intensities(matched_intensities, unmatched_intensities, out_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + 
"text/plain": [ + "<Figure size 864x432 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "out_file = os.path.join(base_dir, 'results', 'topN_matched_precursors.png')\n", + "plot_matched_precursors(matches, 50, 1000, 180, 1260, out_file)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Synthetic data creation scripts/vimms_data_generation/05. Varying N in Top-N Simulations.ipynb b/Synthetic data creation scripts/vimms_data_generation/05. Varying N in Top-N Simulations.ipynb new file mode 100644 index 00000000..ff2d57f9 --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/05. Varying N in Top-N Simulations.ipynb @@ -0,0 +1,897 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Varying N in Top-N Simulations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook loads the existing Beer1pos data and runs it through the simulator with varying N (the number of precursor peaks selected for fragmentation) for Top-N DDA fragmentation. The results here correspond to Section 3.3 in the paper for the Beer1pos data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import pylab as plt\n", + "import pymzml\n", + "import math\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from vimms.Roi import RoiToChemicalCreator, make_roi\n", + "from vimms.DataGenerator import DataSource, PeakSampler, get_spectral_feature_database\n", + "from vimms.MassSpec import IndependentMassSpectrometer\n", + "from vimms.Controller import TopNController\n", + "from vimms.TopNExperiment import get_params, run_serial_experiment, run_parallel_experiment\n", + "from vimms.PlotsForPaper import get_df, load_controller, compute_performance_scenario_2\n", + "from vimms.Common import *" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_debug()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "base_dir = 'example_data'\n", + "mzml_path = os.path.join(base_dir, 'beers', 'fragmentation', 'mzML')\n", + "file_name = 'Beer_multibeers_1_T10_POS.mzML'\n", + "\n", + "experiment_name = 'beer1pos'\n", + "experiment_out_dir = os.path.abspath(os.path.join(base_dir, 'results', experiment_name, 'mzML'))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\joewa\\\\Work\\\\git\\\\vimms\\\\examples\\\\example_data\\\\results\\\\beer1pos\\\\mzML'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_out_dir" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "min_rt = 3*60 # start time when compounds begin to elute in the mzML file\n", + "max_rt = 21*60" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "kde_min_ms1_intensity = 0 # min intensity to be selected for kdes\n", + "kde_min_ms2_intensity = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### a. ROI extraction parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "roi_mz_tol = 10\n", + "roi_min_length = 1\n", + "roi_min_intensity = 0\n", + "roi_start_rt = min_rt\n", + "roi_stop_rt = max_rt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b. 
Top-N parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "isolation_window = 1 # the isolation window in Dalton around a selected precursor ion\n", + "ionisation_mode = POSITIVE\n", + "N = 10\n", + "rt_tol = 15\n", + "mz_tol = 10\n", + "min_ms1_intensity = 1.75E5 # minimum ms1 intensity to fragment" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "mzml_out = os.path.join(experiment_out_dir, 'simulated.mzML')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Train densities" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO : DataSource : Loading Beer_multibeers_1_T10_POS.mzML\n" + ] + } + ], + "source": [ + "ds = DataSource()\n", + "ds.load_data(mzml_path, file_name=file_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : PeakSampler : Extracted 7647 MS2 scans\n", + "DEBUG : PeakSampler : Computing parent intensity proportions\n", + "DEBUG : PeakSampler : Extracting scan durations\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=1\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x00000233E649CA20>\n", + "INFO : DataSource : Using values from scans\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x00000233E649CA20>\n", + "DEBUG : PeakSampler : Training KDEs for ms_level=2\n", + "DEBUG : PeakSampler : Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x00000233E649CA20>\n", + "INFO : DataSource : Using values from scans\n", + "DEBUG : PeakSampler : Retrieving n_peaks values from 
<vimms.DataGenerator.DataSource object at 0x00000233E649CA20>\n" + ] + } + ], + "source": [ + "bandwidth_mz_intensity_rt=1.0\n", + "bandwidth_n_peaks=1.0\n", + "ps = get_spectral_feature_database(ds, file_name, kde_min_ms1_intensity, kde_min_ms2_intensity, min_rt, max_rt,\n", + " bandwidth_mz_intensity_rt, bandwidth_n_peaks)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Extract all ROIs" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "mzml_file = os.path.join(mzml_path, file_name)\n", + "good_roi, junk = make_roi(mzml_file, mz_tol=roi_mz_tol, mz_units='ppm', min_length=roi_min_length,\n", + " min_intensity=roi_min_intensity, start_rt=roi_start_rt, stop_rt=roi_stop_rt)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "266107" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_roi = good_roi + junk\n", + "len(all_roi)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How many singleton and non-singleton ROIs?" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "185119" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([roi for roi in all_roi if roi.n == 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "80988" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len([roi for roi in all_roi if roi.n > 1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Keep only the ROIs that can be fragmented above **min_ms1_intensity threshold**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "175000.0" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "min_ms1_intensity" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10079" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "keep = []\n", + "for roi in all_roi:\n", + " if np.count_nonzero(np.array(roi.intensity_list) > min_ms1_intensity) > 0:\n", + " keep.append(roi)\n", + "\n", + "all_roi = keep\n", + "len(keep)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Turn ROIs into chromatograms/chemicals" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG : RoiToChemicalCreator : 0/ 10079\n", + "INFO : RoiToChemicalCreator : Found 10079 ROIs above thresholds\n" + ] + } + ], + "source": [ + "set_log_level_debug()\n", + "rtcc = RoiToChemicalCreator(ps, all_roi)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to C:\\Users\\joewa\\Work\\git\\vimms\\examples\\example_data\\results\\beer1pos\\mzML\\dataset.p\n" + ] + } + ], + "source": [ + "data = rtcc.chemicals\n", + "save_obj(data, os.path.join(experiment_out_dir, 'dataset.p'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Run Top-N Controller" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "set_log_level_warning()\n", + "pbar = False # turn off progress bar" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "Ns = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]\n", + "rt_tols = [15]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "N = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]\n", + "rt_tol = [15]\n", + "len(params) = 28\n" + ] + } + ], + "source": [ + "params = get_params(experiment_name, Ns, rt_tols, mz_tol, isolation_window, ionisation_mode, data, ps, \n", + " min_ms1_intensity, min_rt, max_rt, experiment_out_dir, pbar)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'C:\\\\Users\\\\joewa\\\\Work\\\\git\\\\vimms\\\\examples\\\\example_data\\\\results\\\\beer1pos\\\\mzML'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "experiment_out_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the experiments." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# %time run_serial_experiment(params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, since each simulated run is completely independent of the others, we can save time by running the different values of N in parallel. Here we use the [ipyparallel](https://ipyparallel.readthedocs.io/en/latest/) package. 
To do this, start a local parallel cluster with the following command:\n", + "\n", + "$ ipcluster start -n 5\n", + "\n", + "where 5 is the number of cores to use (for example)." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "experiment_beer1pos_N_1_rttol_15\n", + "experiment_beer1pos_N_2_rttol_15\n", + "experiment_beer1pos_N_3_rttol_15\n", + "experiment_beer1pos_N_4_rttol_15\n", + "experiment_beer1pos_N_5_rttol_15\n", + "experiment_beer1pos_N_6_rttol_15\n", + "experiment_beer1pos_N_7_rttol_15\n", + "experiment_beer1pos_N_8_rttol_15\n", + "experiment_beer1pos_N_9_rttol_15\n", + "experiment_beer1pos_N_10_rttol_15\n", + "experiment_beer1pos_N_15_rttol_15\n", + "experiment_beer1pos_N_20_rttol_15\n", + "experiment_beer1pos_N_25_rttol_15\n", + "experiment_beer1pos_N_30_rttol_15\n", + "experiment_beer1pos_N_35_rttol_15\n", + "experiment_beer1pos_N_40_rttol_15\n", + "experiment_beer1pos_N_45_rttol_15\n", + "experiment_beer1pos_N_50_rttol_15\n", + "experiment_beer1pos_N_55_rttol_15\n", + "experiment_beer1pos_N_60_rttol_15\n", + "experiment_beer1pos_N_65_rttol_15\n", + "experiment_beer1pos_N_70_rttol_15\n", + "experiment_beer1pos_N_75_rttol_15\n", + "experiment_beer1pos_N_80_rttol_15\n", + "experiment_beer1pos_N_85_rttol_15\n", + "experiment_beer1pos_N_90_rttol_15\n", + "experiment_beer1pos_N_95_rttol_15\n", + "experiment_beer1pos_N_100_rttol_15\n", + "Wall time: 17min 50s\n" + ] + } + ], + "source": [ + "%time run_parallel_experiment(params)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Analyse Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we need to load the ground truth peaks found by xcms from each mzML file.\n", + "- P = peaks picked by XCMS from the full-scan file\n", + "- Q = peaks picked by XCMS from the fragmentation file\n", + "\n", + "Peak picking was done using the script `extract_peaks.R` in the `example_data/results/ground_truth` folder. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Manual step: to generate the lists of ground truth peaks for evaluation, please run the R script on both the full-scan and simulated fragmentation files.**\n", + "\n", + "Requirements:\n", + "- Ensure that XCMS3 has been installed: https://bioconductor.org/packages/release/bioc/html/xcms.html.\n", + "\n", + "Steps for peak picking on simulated fragmentation files:\n", + "1. Ensure that the fragmentation .mzML files are located in `examples\example_data\results\beer1pos`.\n", + "2. Open a new R window and run the R script `examples\example_data\results\beer1pos\extract_peaks.R`. The script will process any files found in an `mzML` folder relative to its current location.\n", + "3. The file `extracted_peaks_ms1.csv` will be created in the folder of step 2.\n", + "\n", + "We have provided the peak-picking result for the full-scan file, but to do it manually, follow the same steps as above. \n", + "1. Place your full-scan .mzML file in `examples\example_data\results\ground_truth\mzML`.\n", + "2. Open a new R window and run the R script `examples\example_data\results\ground_truth\extract_peaks.R`. The script will process any files found in an `mzML` folder relative to its current location.\n", + "3. The file `extracted_peaks_ms1.csv` will be created in the folder of step 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "min_ms1_intensity = 0\n", + "rt_range = [(min_rt, max_rt)]\n", + "mz_range = [(0, math.inf)]\n", + "results_dir = os.path.join(base_dir, 'results', 'ground_truth', 'mzML') \n", + "csv_file = os.path.join(results_dir, 'extracted_peaks_ms1.csv')\n", + "P_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "csv_file = os.path.join(experiment_out_dir, 'extracted_peaks_ms1.csv')\n", + "Q_peaks_df = get_df(csv_file, min_ms1_intensity, rt_range, mz_range)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "fullscan_filename = 'Beer_multibeers_1_fullscan1.mzML' \n", + "matching_mz_tol = 10 # ppm\n", + "matching_rt_tol = 30 # seconds" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading experiment_beer1pos_N_1_rttol_15\n", + "Matched 3678/10079 in fullscan data, 6573/10079 in fragmentation data\n", + "beer1pos N=1 rt_tol=15 tp=1183 fp=916 fn=2495 prec=0.564 rec=0.322 f1=0.410\n", + "Loading experiment_beer1pos_N_2_rttol_15\n", + "Matched 3678/10079 in fullscan data, 6552/10079 in fragmentation data\n", + "beer1pos N=2 rt_tol=15 tp=1532 fp=1155 fn=2146 prec=0.570 rec=0.417 f1=0.481\n", + "Loading experiment_beer1pos_N_3_rttol_15\n", + "Matched 3678/10079 in fullscan data, 6276/10079 in fragmentation data\n", + "beer1pos N=3 rt_tol=15 tp=1674 fp=1193 fn=2004 prec=0.584 rec=0.455 f1=0.512\n", + "Loading experiment_beer1pos_N_4_rttol_15\n", + "Matched 3678/10079 in fullscan data, 6025/10079 in fragmentation data\n", + "beer1pos N=4 rt_tol=15 tp=1741 fp=1195 fn=1937 prec=0.593 rec=0.473 f1=0.526\n", + "Loading experiment_beer1pos_N_5_rttol_15\n", + "Matched 3678/10079 
in fullscan data, 5762/10079 in fragmentation data\n", + "beer1pos N=5 rt_tol=15 tp=1817 fp=1166 fn=1861 prec=0.609 rec=0.494 f1=0.546\n", + "Loading experiment_beer1pos_N_6_rttol_15\n", + "Matched 3678/10079 in fullscan data, 5400/10079 in fragmentation data\n", + "beer1pos N=6 rt_tol=15 tp=1833 fp=1152 fn=1845 prec=0.614 rec=0.498 f1=0.550\n", + "Loading experiment_beer1pos_N_7_rttol_15\n", + "Matched 3678/10079 in fullscan data, 5253/10079 in fragmentation data\n", + "beer1pos N=7 rt_tol=15 tp=1823 fp=1117 fn=1855 prec=0.620 rec=0.496 f1=0.551\n", + "Loading experiment_beer1pos_N_8_rttol_15\n", + "Matched 3678/10079 in fullscan data, 5072/10079 in fragmentation data\n", + "beer1pos N=8 rt_tol=15 tp=1849 fp=1093 fn=1829 prec=0.628 rec=0.503 f1=0.559\n", + "Loading experiment_beer1pos_N_9_rttol_15\n", + "Matched 3678/10079 in fullscan data, 4874/10079 in fragmentation data\n", + "beer1pos N=9 rt_tol=15 tp=1850 fp=1043 fn=1828 prec=0.639 rec=0.503 f1=0.563\n", + "Loading experiment_beer1pos_N_10_rttol_15\n", + "Matched 3678/10079 in fullscan data, 4704/10079 in fragmentation data\n", + "beer1pos N=10 rt_tol=15 tp=1867 fp=1001 fn=1811 prec=0.651 rec=0.508 f1=0.570\n", + "Loading experiment_beer1pos_N_15_rttol_15\n", + "Matched 3678/10079 in fullscan data, 2934/10079 in fragmentation data\n", + "beer1pos N=15 rt_tol=15 tp=1519 fp=501 fn=2159 prec=0.752 rec=0.413 f1=0.533\n", + "Loading experiment_beer1pos_N_20_rttol_15\n", + "Matched 3678/10079 in fullscan data, 2343/10079 in fragmentation data\n", + "beer1pos N=20 rt_tol=15 tp=1264 fp=435 fn=2414 prec=0.744 rec=0.344 f1=0.470\n", + "Loading experiment_beer1pos_N_25_rttol_15\n", + "Matched 3678/10079 in fullscan data, 1924/10079 in fragmentation data\n", + "beer1pos N=25 rt_tol=15 tp=1060 fp=387 fn=2618 prec=0.733 rec=0.288 f1=0.414\n", + "Loading experiment_beer1pos_N_30_rttol_15\n", + "Matched 3678/10079 in fullscan data, 1488/10079 in fragmentation data\n", + "beer1pos N=30 rt_tol=15 tp=809 fp=329 fn=2869 
prec=0.711 rec=0.220 f1=0.336\n", + "Loading experiment_beer1pos_N_35_rttol_15\n", + "Matched 3678/10079 in fullscan data, 1160/10079 in fragmentation data\n", + "beer1pos N=35 rt_tol=15 tp=654 fp=249 fn=3024 prec=0.724 rec=0.178 f1=0.286\n", + "Loading experiment_beer1pos_N_40_rttol_15\n", + "Matched 3678/10079 in fullscan data, 992/10079 in fragmentation data\n", + "beer1pos N=40 rt_tol=15 tp=543 fp=252 fn=3135 prec=0.683 rec=0.148 f1=0.243\n", + "Loading experiment_beer1pos_N_45_rttol_15\n", + "Matched 3678/10079 in fullscan data, 838/10079 in fragmentation data\n", + "beer1pos N=45 rt_tol=15 tp=445 fp=223 fn=3233 prec=0.666 rec=0.121 f1=0.205\n", + "Loading experiment_beer1pos_N_50_rttol_15\n", + "Matched 3678/10079 in fullscan data, 660/10079 in fragmentation data\n", + "beer1pos N=50 rt_tol=15 tp=343 fp=184 fn=3335 prec=0.651 rec=0.093 f1=0.163\n", + "Loading experiment_beer1pos_N_55_rttol_15\n", + "Matched 3678/10079 in fullscan data, 628/10079 in fragmentation data\n", + "beer1pos N=55 rt_tol=15 tp=301 fp=192 fn=3377 prec=0.611 rec=0.082 f1=0.144\n", + "Loading experiment_beer1pos_N_60_rttol_15\n", + "Matched 3678/10079 in fullscan data, 199/10079 in fragmentation data\n", + "beer1pos N=60 rt_tol=15 tp=114 fp=54 fn=3564 prec=0.679 rec=0.031 f1=0.059\n", + "Loading experiment_beer1pos_N_65_rttol_15\n", + "Matched 3678/10079 in fullscan data, 162/10079 in fragmentation data\n", + "beer1pos N=65 rt_tol=15 tp=72 fp=58 fn=3606 prec=0.554 rec=0.020 f1=0.038\n", + "Loading experiment_beer1pos_N_70_rttol_15\n", + "Matched 3678/10079 in fullscan data, 96/10079 in fragmentation data\n", + "beer1pos N=70 rt_tol=15 tp=48 fp=38 fn=3630 prec=0.558 rec=0.013 f1=0.026\n", + "Loading experiment_beer1pos_N_75_rttol_15\n", + "Matched 3678/10079 in fullscan data, 104/10079 in fragmentation data\n", + "beer1pos N=75 rt_tol=15 tp=44 fp=44 fn=3634 prec=0.500 rec=0.012 f1=0.023\n", + "Loading experiment_beer1pos_N_80_rttol_15\n", + "Matched 3678/10079 in fullscan data, 80/10079 in 
fragmentation data\n", + "beer1pos N=80 rt_tol=15 tp=33 fp=39 fn=3645 prec=0.458 rec=0.009 f1=0.018\n", + "Loading experiment_beer1pos_N_85_rttol_15\n", + "Matched 3678/10079 in fullscan data, 92/10079 in fragmentation data\n", + "beer1pos N=85 rt_tol=15 tp=32 fp=47 fn=3646 prec=0.405 rec=0.009 f1=0.017\n", + "Loading experiment_beer1pos_N_90_rttol_15\n", + "Matched 3678/10079 in fullscan data, 92/10079 in fragmentation data\n", + "beer1pos N=90 rt_tol=15 tp=36 fp=45 fn=3642 prec=0.444 rec=0.010 f1=0.019\n", + "Loading experiment_beer1pos_N_95_rttol_15\n", + "Matched 3678/10079 in fullscan data, 44/10079 in fragmentation data\n", + "beer1pos N=95 rt_tol=15 tp=20 fp=17 fn=3658 prec=0.541 rec=0.005 f1=0.011\n", + "Loading experiment_beer1pos_N_100_rttol_15\n", + "Matched 3678/10079 in fullscan data, 76/10079 in fragmentation data\n", + "beer1pos N=100 rt_tol=15 tp=28 fp=37 fn=3650 prec=0.431 rec=0.008 f1=0.015\n" + ] + } + ], + "source": [ + "results = []\n", + "for N in Ns:\n", + " for rt_tol in rt_tols:\n", + "\n", + " # load chemicals and check for matching\n", + " chemicals = load_obj(os.path.join(experiment_out_dir, 'dataset.p')) \n", + " fragfile_filename = 'experiment_%s_N_%d_rttol_%d.mzML' % (experiment_name, N, rt_tol) \n", + "\n", + " # load controller and compute performance\n", + " controller = load_controller(experiment_out_dir, experiment_name, N, rt_tol)\n", + " if controller is not None:\n", + " tp, fp, fn, prec, rec, f1 = compute_performance_scenario_2(controller, chemicals, min_ms1_intensity,\n", + " fullscan_filename, fragfile_filename,\n", + " P_peaks_df, Q_peaks_df, matching_mz_tol, matching_rt_tol)\n", + " print('%s N=%d rt_tol=%d tp=%d fp=%d fn=%d prec=%.3f rec=%.3f f1=%.3f' % (experiment_name, \n", + " N, rt_tol, tp, fp, fn, prec, rec, f1))\n", + " res = (experiment_name, N, rt_tol, tp, fp, fn, prec, rec, f1) \n", + " results.append(res) " + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + 
"source": [ + "result_df = pd.DataFrame(results, columns=['experiment', 'N', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot precision, recall, f1" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 864x432 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 6))\n", + "ax = sns.lineplot(x='N', y='Prec', hue='experiment', legend='brief', data=result_df)\n", + "plt.title('Top-N Precision')\n", + "for l in ax.lines:\n", + " plt.setp(l, linewidth=5)\n", + "plt.ylabel('Precision')\n", + "plt.xlabel(r'Top-$N$')\n", + "plt.legend(prop={'size': 20})\n", + "plt.tight_layout()\n", + "\n", + "fig_out = os.path.join(experiment_out_dir, 'topN_precision.png')\n", + "plt.savefig(fig_out, dpi=300)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 864x432 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 6))\n", + "ax = sns.lineplot(x='N', y='Rec', hue='experiment', legend='brief', data=result_df)\n", + "plt.title('Top-N Recall')\n", + "for l in ax.lines:\n", + " plt.setp(l, linewidth=5)\n", + "plt.ylabel('Recall')\n", + "plt.xlabel(r'Top-$N$')\n", + "plt.legend(prop={'size': 20})\n", + "plt.tight_layout()\n", + "\n", + "fig_out = os.path.join(experiment_out_dir, 'topN_recall.png')\n", + "plt.savefig(fig_out, dpi=300)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 864x432 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + 
"output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 6))\n", + "ax = sns.lineplot(x='N', y='F1', hue='experiment', legend='brief', data=result_df)\n", + "plt.title('Top-N F1')\n", + "for l in ax.lines:\n", + " plt.setp(l, linewidth=5)\n", + "plt.ylabel(r'$F_{1}\\;score$')\n", + "plt.xlabel(r'Top-$N$')\n", + "plt.legend(prop={'size': 20})\n", + "plt.tight_layout()\n", + "\n", + "fig_out = os.path.join(experiment_out_dir, 'topN_f1.png')\n", + "plt.savefig(fig_out, dpi=300)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Synthetic data creation scripts/vimms_data_generation/ee.py b/Synthetic data creation scripts/vimms_data_generation/ee.py new file mode 100644 index 00000000..6c4b2eb0 --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/ee.py @@ -0,0 +1,4 @@ +from collections import defaultdict +lines = open('./keep.txt', 'r') + + diff --git a/Synthetic data creation scripts/vimms_data_generation/intermediate b/Synthetic data creation scripts/vimms_data_generation/intermediate new file mode 100644 index 00000000..df0ceb80 --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/intermediate @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +import pickle +import numpy as np +import matplotlib.pyplot as plt +# what do we need to do? +# 1. 
# get all of the rt's first

PRECISION = 10**4


def roughly_equal(a, b, tol=0.001):
    """Return True when *a* and *b* differ by at most *tol* (absolute)."""
    return abs(a - b) <= tol


class FloatSet:
    """Collection of floats that collapses near-duplicates when iterated.

    Values are appended freely with :meth:`add`.  Starting an iteration
    sorts the stored values and drops every value that is
    ``roughly_equal`` to the previously kept one, so iteration yields one
    representative (the smallest) per cluster, in ascending order.
    """

    def __init__(self):
        self.values = []
        self.length = 0
        self.idx = 0

    def add(self, nval):
        """Store *nval*; de-duplication is deferred until iteration."""
        self.values.append(nval)

    def __iter__(self):
        self.values.sort()
        deduped = []
        for value in self.values:
            # Compare against the last kept representative; the
            # ``not deduped`` guard also makes an empty set iterable
            # instead of raising IndexError.
            if not deduped or not roughly_equal(value, deduped[-1]):
                deduped.append(value)
        self.values = deduped
        self.length = len(deduped)
        # Rewind the cursor so the set can be iterated more than once.
        self.idx = 0
        return self

    def __next__(self):
        if self.idx >= self.length:
            raise StopIteration
        result = self.values[self.idx]
        self.idx += 1
        return result


class RtMapper:
    """Bidirectional mapping between retention times and column indices.

    Built from a :class:`FloatSet` of retention times; index ``i``
    corresponds to the ``i``-th de-duplicated retention time in
    ascending order.
    """

    def __init__(self, fset):
        self.rt_to_idx_ = list()
        self.idx_to_rt_ = dict()

        self.rts_ = list(fset)

        for idx, rt in enumerate(self.rts_):
            self.rt_to_idx_.append((rt, idx))
            self.idx_to_rt_[idx] = rt

        self.length_ = len(self.rts_)

    def rt_to_idx(self, nrt):
        """Return the index of the stored retention time matching *nrt*.

        Raises:
            TypeError: if no stored retention time is ``roughly_equal``
                to *nrt* (exception type kept for existing callers).
        """
        assert isinstance(nrt, float)

        for (rt, idx) in self.rt_to_idx_:
            if roughly_equal(rt, nrt):
                return idx
        # Reached only when the scan found no match.
        raise TypeError('no match')

    def rts(self):
        """Return the list of de-duplicated retention times."""
        return self.rts_

    def idx_to_rt(self, idx):
        """Return the retention time stored at *idx*."""
        return self.idx_to_rt_[idx]

    def length(self):
        """Return the number of distinct retention times."""
        return self.length_


def zero_pad(data, rtmapper):
    """Expand each chemical's peaks into dense, zero-filled per-m/z traces.

    Args:
        data: mapping ``chem -> iterable of (rt, mz, intensity)`` rows.
        rtmapper: :class:`RtMapper` covering every retention time in *data*.

    Returns:
        dict keyed by ``(chem, mz_representative)`` whose values are
        ``(rts, mzs, intensities)`` arrays of length ``rtmapper.length()``.
        Positions with no observation for that m/z stay zero.
    """
    new_res = dict()
    n_rts = rtmapper.length()
    for chem, vals in data.items():
        # first, get the unique mzs
        mzset = FloatSet()
        for (rt, mz, it) in vals:
            mzset.add(mz)

        for mzval in mzset:
            # Only the rows belonging to this m/z cluster go into its trace.
            values = [row for row in vals if roughly_equal(row[1], mzval)]
            new_rts = np.array(rtmapper.rts())
            new_mzs = np.zeros(n_rts)
            new_its = np.zeros(n_rts)
            # The original looped over the unfiltered ``vals`` here, so every
            # (chem, mz) entry received the same mixed trace.
            for (rt, mz, it) in values:
                idx = rtmapper.rt_to_idx(rt)
                new_mzs[idx] = mz
                new_its[idx] = it
            new_res[(chem, mzval)] = (new_rts, new_mzs, new_its)
    return new_res


def rt_analysis(data):
    """Collect every retention time in *data* and return an RtMapper for them."""
    fset = FloatSet()
    for values in data.values():
        for (rt, mz, it) in values:
            fset.add(rt)

    return RtMapper(fset)
+def main( ): + data = pickle.load(open('peak_recorder.pickle', 'rb')) + rtmapper = rt_analysis( data ) + padded = zero_pad( data, rtmapper ) + print( padded.keys() ) + +if __name__ == '__main__': + main( ) diff --git a/Synthetic data creation scripts/vimms_data_generation/make.py b/Synthetic data creation scripts/vimms_data_generation/make.py new file mode 100644 index 00000000..61df6370 --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/make.py @@ -0,0 +1,762 @@ +#!/usr/bin/env python3 +# coding: utf-8 import json import sys, os import shutil import pandas as pd +import json +import pickle +import numpy as np +import pandas as pd +from glob import glob +from pathlib import Path +from itertools import product +from vimms.Chemicals import ChemicalCreator +from collections import namedtuple, defaultdict +from vimms.MassSpec import IndependentMassSpectrometer, Scan +from vimms.Controller import SimpleMs1Controller +from vimms.Common import * +from vimms.MzmlWriter import MzmlWriter +import vimms +from scipy.interpolate import interp1d +import argparse + + +# TODO need to document what each of the replicates does and doesn't have +# TODO need to set it up so that we're using the same multipliers for the +# significant compounds +from copy import deepcopy +import matplotlib.pyplot as plt +DATA = json.load(open("multipliers.json", "r")) +GOOD_MULTS = DATA["good"]*10 +BAD_MULTS = DATA["bad"]*10 + +def cv(vals): + return np.std(vals) / np.mean(vals) + +def random_bool( ): + return bool( np.random.randint(0, 2) ) + +def get_max_mz( chem ): + return max( [pr[0] for pr in chem.isotopes] ) + +class Params: + rt_lower = 0 + rt_upper = 2000 + mz_lower = 0 + mz_upper = 2005 + pct_missing = [0, 10, 20] + noise_level = [0, 0.01, 0.1] + + +def safe_mkdir(dirname): + if not os.path.exists(dirname): + os.mkdir(dirname) + + +def remove_x_pct(df: pd.DataFrame, x: int): + """Function that randomly removes x pct of entries from the supplied list of vals. 
Makes a copy of the inputted vals""" + assert x >= 0 and x <= 100, f"x must be a percentage on the range 0 < x < 100" + # print(float(chems.size)*(1.-float(x)/100.)) + num_cols, num_rows = len(df.columns), len(df[df.columns[0]]) + n_chems = num_cols * num_rows + n_remove = int(n_chems * x / 100.0 + 0.5) + rm_per_col = np.zeros(num_cols, np.int32) + for idx in range(n_remove): + rm_per_col[idx % num_cols] += 1 + + np.random.shuffle(rm_per_col) + row_idxs = np.arange(0, num_rows, 1) + + result = pd.DataFrame() + for c_idx, col in enumerate(df.columns): + + new_col = list(map(lambda x: deepcopy(x), df[col])) + row_idxs = np.sort(row_idxs) + np.random.shuffle(row_idxs) + + col_remove = rm_per_col[c_idx] + + for cr_idx in range(col_remove): + new_col[row_idxs[cr_idx]] = None + + result[col] = new_col + + return result + + + + +def setup_dirs(params: Params, basedir="mvapack-data"): + # layout of the directories are: + # basedir / pct_missing / noise / group + # maybe should return the directories too? + for edir in glob(f"{basedir}/*"): + shutil.rmtree(edir) + + for pct in params.pct_missing: + for noise in params.noise_level: + dirname = "/".join([basedir, f"{pct}_missing", f"{noise:.2f}_noise"]) + if not os.path.isdir(dirname): + Path(dirname).mkdir(parents=True, exist_ok=True) + + +def load_dbs(): + base_dir = os.path.abspath("example_data") + ps = load_obj(Path(base_dir, "peak_sampler_mz_rt_int_19_beers_fullscan.p")) + hmdb = load_obj(Path(base_dir, "hmdb_compounds.p")) + out_dir = Path(base_dir, "results", "MS1_single") + # the list of ROI sources created in the previous notebook '01. 
Download Data.ipynb' + ROI_Sources = [str(Path(base_dir, "DsDA", "DsDA_Beer", "beer_t10_simulator_files"))] + + return (ROI_Sources, ps, hmdb) + + +def make_dataset(ROI_Sources, ps, hmdb, n_chems): + params = Params() + mz_range = [(params.mz_lower, params.mz_upper)] + rt_range = [(params.rt_lower, params.rt_upper)] + min_ms1_intensity = 1.75e5 # TODO make this into a global + + # m/z and RT range of chemicals + + # the number of chemicals in the sample + # n_chems = 2000 + + # maximum MS level (we do not generate fragmentation peaks when this value is 1) + ms_level = 1 + + chems = ChemicalCreator(ps, ROI_Sources, hmdb) + reps = chems.sample(mz_range, rt_range, min_ms1_intensity, int(n_chems), ms_level) + # out_dir = 'mvapack-data' + # out_dir = 'demo' + # save_obj(dataset, Path(out_dir, 'dataset.p')) + + + bad = set(list(open('bad_chems', 'r').read().splitlines())) + reps = list(filter(lambda s: str(s.formula) not in bad and get_max_mz( s ) < 1800, reps )) + for r in reps: + if r.rt <= params.rt_lower: + r.rt = params.rt_lower + 20 + elif r.rt >= params.rt_upper: + r.rt = params.rt_upper - 20 + return reps[0:n_chems] + + +def merge_chems(cdict, chems): + + for cd_key, chem in zip(cdict.keys(), chems): + cdict[cd_key].append(chem) + + +def get_fc(is_sig): + + if is_sig: + return np.random.random() * 2 + 2 + else: + return np.random.random() * 1 + + +def get_mults(is_sig): + if is_sig: + return GOOD_MULTS.pop(), GOOD_MULTS.pop() + else: + return BAD_MULTS.pop(), BAD_MULTS.pop() + + +def get_t_offset(is_sig): + if is_sig: + return (np.random.random() * 10) - 5 + else: + return (np.random.random() * 60) - 30 + + +def get_t_base(is_sig): + if is_sig: + return (np.random.random() * 30) - 15 + else: + return (np.random.random() * 60) - 30 + + +def get_chemical_lists(chems, NUM_REPS=10): + + SIGNIFICANT = int(len(chems) * 0.40) + keys = list(map(lambda idx: f"rep_{idx}", range(NUM_REPS))) + values = list(map(lambda idx: deepcopy([]), range(NUM_REPS))) + clean_dict = 
lambda: deepcopy(dict(zip(keys, values))) + group1, group2 = clean_dict(), clean_dict() + np.random.shuffle(chems) + g1_sig, g1_isig, g2_sig, g2_isig = None, None, None, None + + for idx, chem in enumerate(chems): + # first figure out the fold change.... between 2 and 6? + is_sig = idx < SIGNIFICANT + if idx == (SIGNIFICANT - 1): + g1_sig = pd.DataFrame(group1) + g2_sig = pd.DataFrame(group2) + group1, group2 = clean_dict(), clean_dict() + + fc = get_fc(is_sig) + greater, lesser = ( + list(map(lambda idx: deepcopy(chem), range(NUM_REPS))), + list(map(lambda idx: deepcopy(chem), range(NUM_REPS))), + ) + mults1, mults2 = get_mults(is_sig) + + greater_base, lesser_base = get_t_base(is_sig), get_t_base(is_sig) + + for ii, r_key in enumerate(keys): + greater[ii].max_intensity *= mults1[ii] + greater[ii].max_intensity *= fc + greater[ii].rt += greater_base + get_t_offset(is_sig) + lesser[ii].max_intensity *= mults2[ii] + lesser[ii].rt += lesser_base + get_t_offset(is_sig) + # for some reason we can actually get negative peaks... 
that's a problem + greater[ii].rt = max(greater[ii].rt, 10) + lesser[ii].rt = max(lesser[ii].rt, 10) + + g1_greater = bool(np.random.randint(0, 2)) + + merge_chems(group1 if g1_greater else group2, greater) + merge_chems(group2 if g1_greater else group1, lesser) + + return g1_sig, pd.DataFrame(group1), g2_sig, pd.DataFrame(group2), keys + + +def write_summary(g1, g2, dirname, peak_values): + + unique_formulas = set() + summary_info = dict() + for idx, c in enumerate(g1.columns): + summary_info[f"G0-R{idx}"] = deepcopy(g1[c]) + + for idx, c in enumerate(g2.columns): + summary_info[f"G1-R{idx}"] = deepcopy(g2[c]) + + for k, v in summary_info.items(): + pvs = peak_values[k] + for chem in v: + print(str(chem.formula) in pvs) + break + print(peak_values.keys()) + + for clist in summary_info.values(): + _ = list( + map( + lambda c: unique_formulas.add(str(c.formula)), + list(filter(lambda chem: chem is not None, clist)), + ) + ) + summary = dict(chemical=deepcopy(list(unique_formulas))) + + for rname, rchems in summary_info.items(): + rchem_list = [] + rep_mapper = dict() + for rc in rchems: + if rc is None: + continue + rep_mapper[str(rc.formula)] = deepcopy(rc) + + for uf in unique_formulas: + rchem_list.append(rep_mapper.get(uf, None)) + + summary[rname] = rchem_list + + summary = pd.DataFrame(summary) + summary.set_index("chemical", inplace=True) + summary = summary.transpose() + mapper = dict() + for col in summary.columns: + sum_col = summary[col] + for c in sum_col: + print(c) + # print( sum_col[0] ) + # print( sum_col[0].__dict__ ) + exit(0) + times = np.array( + list(map(lambda chem: chem.rt if chem is not None else None, sum_col)) + ) + mzs = np.array( + list( + map( + lambda chem: chem.isotopes[0][0] if chem is not None else None, + sum_col, + ) + ) + ) + times = times[times != None] + mzs = mzs[mzs != None] + avg_mz, avg_rt = np.mean(mzs), np.mean(times) + mapper[col] = f"{avg_rt:.2f}_{avg_mz:.4f}" + + summary.rename(mapper=mapper, inplace=True, axis=1) + for 
col in summary.columns: + summary[col] = summary.apply( + lambda row: row[col].max_intensity if row[col] else 0, axis=1 + ) + + summary.to_csv(f"{dirname}/summary.csv") + summary.to_pickle(f"{dirname}/summary.pickle") + print(f"{dirname}/summary.csv") + + +def increase_resolution(controller): + new_scans = [] + old_len = len(controller.scans[1]) + + for idx, scan in enumerate(controller.scans[1]): + # loop through each scan + new_mz, new_it = [], [] + interp_mz = [] + num_points = len(scan.mzs) + + # need to check that there are points + if num_points: + new_mz.append(scan.mzs[0] - 0.1) + new_it.append(0) + + for pt_idx, (m, i) in enumerate(zip(scan.mzs, scan.intensities)): + interp_mz.append(np.linspace(m - 0.021, m + 0.021, 10)) + + new_mz.extend([m - 0.02, m, m + 0.02]) + new_it.extend([0, i, 0]) + + if pt_idx > 0 and pt_idx < num_points - 1: + interp_mz.append(np.arange(m + 0.25, scan.mzs[pt_idx + 1] - 0.25, 0.2)) + + if num_points: + new_mz.append(scan.mzs[-1] + 0.1) + new_it.append(0) + + if not len(scan.mzs): + new_scans.append(deepcopy( scan )) + continue + + interp_mz = np.concatenate(interp_mz) + interp_mz = sorted(interp_mz) + + interp_func = interp1d(new_mz, new_it, kind="linear") + new_x = np.linspace(np.min(new_mz), np.max(new_mz), 1000) + interp_it = interp_func(interp_mz) + interp_it[interp_it < 0] = 0 + + new_scan = deepcopy(scan) + new_scan.mzs = interp_mz + new_scan.intensities = interp_it + new_scans.append(new_scan) + + controller.scans[1] = new_scans + + assert len(controller.scans[1]) == old_len + + return controller + + +def valid_mzs(mzs): + for idx in range(1, len(mzs)): + assert mzs[idx] >= mzs[idx - 1], f"{mzs[idx-1]}, {mzs[idx]}" + + +def add_noise(controller, noise): + if noise == 0: + return controller + + def max_it( scan ): + if len(scan.intensities): + return np.max( scan.intensities ) + else: + return 0 + + old_len = len(controller.scans[1]) + + max_it = np.max( + max(controller.scans[1], key=max_it ).intensities + ) + max_noise = 
float(max_it) * (float(noise) / 100.0) + new_scans = [] + + for idx, scan in enumerate(controller.scans[1]): + valid_mzs(scan.mzs) + scan_noise = list( + map(lambda it: np.random.rand() * max_noise, range(len(scan.mzs))) + ) + new_its = [] + for ct, nz in zip(scan.intensities, scan_noise): + if ct > 0: + new_its.append(ct) + else: + new_its.append(nz) + + new_scan = deepcopy(scan) + assert len(scan.mzs) == len(scan.intensities) + new_scan.intensities = new_its + new_scans.append(new_scan) + + controller.scans[1] = new_scans + assert len(controller.scans[1]) == old_len + + return controller + + +def make_chem_dict( + group1_sig_orig, group1_isig_orig, group2_sig_orig, group2_isig_orig, rep_keys +): + # shuffle up and deal! + group1_sig_orig = group1_sig_orig.sample(frac=1).reset_index(drop=True) + group2_sig_orig = group2_sig_orig.sample(frac=1).reset_index(drop=True) + group1_isig_orig = group1_isig_orig.sample(frac=1).reset_index(drop=True) + group2_isig_orig = group2_isig_orig.sample(frac=1).reset_index(drop=True) + + chem_dict = dict() + for g_idx, (grp_sig, grp_isig) in enumerate( + [(group1_sig_orig, group1_isig_orig), (group2_sig_orig, group2_isig_orig)] + ): + for idx, r_key in enumerate(rep_keys): + # what do we care about now? 
need to locate + # the part where noise is actually added + chems_sig, chems_isig = ( + deepcopy(grp_sig[r_key]).to_list(), + deepcopy(grp_isig[r_key]).to_list(), + ) + chems_sig.extend(chems_isig) + chem_dict[f"G{g_idx}-R{idx}"] = deepcopy(chems_sig) + return chem_dict + + +def num_zeros(df): + ct = 0 + + for c in df.columns: + ct += sum(list(map(lambda ch: ch.max_intensity == 0, df[c]))) + + return ct + + +def adjust_groups(group1_sig, group2_sig, peak_values): + # print( peak_values.keys() ) + # print( group1_sig ) + + def col_to_key(colname): + return str(int(colname.split("_")[-1]) + 1) + + for g1_col in group1_sig.columns: + key_name = f"G0-R{col_to_key(g1_col)}" + c_dict = peak_values[key_name] + print(c_dict) + exit(0) + # print( [t.formula for t in group1_sig[ g1_col ]] ) + for t_chem in group1_sig[g1_col]: + print(t_chem.formula in c_dict) + # if t_chem.formula in c_dict: + # print( t_chem.formula ) + # print( c_dict[t_chem.formula] ) + exit(0) + print(key_name) + assert key_name in peak_values + # need to convert the g1_col to the repname + print(g1_col) + + for g2_col in group2_sig.columns: + key_name = f"G1-R{col_to_key(g2_col)}" + print(key_name) + assert key_name in peak_values # need to convert the g1_col to the repname + print(g1_col) + + exit(0) + +def make_master_replicates( chem_dict, POSITIVE, ps, params, sig ): + replicates = dict() + for rep_name, chems in chem_dict.items(): + print( rep_name ) + filt_chems = list(filter(lambda c: c is not None, chems)) + filt_chems = sorted( filt_chems, key=lambda c: str(c.formula)) + + mass_spec = IndependentMassSpectrometer( + POSITIVE, filt_chems, ps, sig, None, True + ) + controller = SimpleMs1Controller(mass_spec, params.mz_upper) + controller.run(params.rt_lower, params.rt_upper, False) + replicates[ rep_name ] = controller + + return replicates + + +def validate_cd( chem_dict ): + counter = defaultdict( int ) + num_reps = len(chem_dict.keys()) + for rep_chems in chem_dict.values(): + for c in 
rep_chems: + counter[ str(c.formula) ] += 1 + for cname, v in counter.items(): + assert v == num_reps, cname + +def validate_reps( replicates ): + print( replicates.keys() ) + counter = defaultdict( int ) + num_reps = len(replicates.keys()) + + for rep_chems in replicates.values(): + for c in rep_chems.mass_spec.chemicals: + counter[ str(c.formula) ] += 1 + for cname, v in counter.items(): + assert v == num_reps, cname + +def roughly_equal( a, b, tol=0.001 ): + return abs( a - b ) <= tol + +class RtMapper: + + def __init__( self, fset ): + self.rt_to_idx_ = list( ) + self.idx_to_rt_ = dict( ) + + self.rts_ = list( fset ) + + for idx, rt in enumerate( self.rts_ ): + self.rt_to_idx_.append((rt, idx)) + self.idx_to_rt_[ idx ] = rt + + self.length_ = len( self.rts_ ) + + def rt_to_idx(self, nrt ): + assert isinstance(nrt, float) + + for (rt, idx) in self.rt_to_idx_: + if roughly_equal(rt, nrt): + return idx + else: + raise TypeError('no match') + + def rts( self ): + return self.rts_ + + def idx_to_rt( self, idx ): + return self.idx_to_rt_[ idx ] + + def length( self ): + return self.length_ + + +def get_all_rts( controller ): + rts = [ sc.rt for sc in controller.scans[1] ] + rts = sorted( rts ) + return RtMapper( rts ) + +def determine_significant( eics ): + eicnames = list(set(list(map(lambda e: e.name, eics)))) + np.random.shuffle( eicnames ) + mapper = dict( ) + for idx, name in enumerate( eicnames ): + mapper[ name ] = idx < 800 + return mapper + + +def get_multiplier_mapper( sigmapper ): + result = dict( ) + for chem, is_sig in sigmapper.items( ): + mults1, mults2 = get_mults(is_sig) + mults1 = np.array( mults1 ) + mults2 = np.array( mults2 ) + fc = get_fc( is_sig ) + if random_bool( ): + result[ chem ] = np.concatenate(( + mults1*fc, mults2 + )) + else: + result[ chem ] = np.concatenate(( + mults2*fc, mults1 + )) + return result + + +def get_removal_mapper( sigmapper, pct_missing ): + num_sig = sum(sigmapper.values()) + num_to_remove = 
int(pct_missing/100*num_sig) + chemnames = list(sigmapper.keys()) + np.random.shuffle( chemnames ) + + result = dict( ) + for idx, cname in enumerate( chemnames ): + result[ cname ] = idx < num_to_remove + + return result + +EIC = namedtuple('EIC', 'name mzs rts its') + +def export_summary( summary, target_dir ): + all_chems = set() + for rep, chems in summary.items(): + for chemname in chems.keys(): + all_chems.add( chemname ) + repnames = summary.keys() + data = dict( ) + data['rep'] = repnames + for ac in all_chems: + col_vals = [] + for r in repnames: + if ac in summary[r]: + col_vals.append( summary[r][ac] ) + else: + col_vals.append( None ) + data[ ac ] = col_vals + + df = pd.DataFrame( data ) + mapper = dict( ) + for col in df.columns: + if col == 'rep': + continue + + rts, mzs, its = [], [], [] + for row in df[col]: + if row is None: + its.append( 0 ) + continue + (rt, mz, ct ) = row + rts.append( rt ) + mzs.append( mz ) + its.append( ct ) + rts = np.array( rts ) + mzs = np.array( mzs ) + df[col] = its + mapper[ col ] = f"{np.mean(rts):.2f}_{np.mean(mzs):.4f}" + df.rename( mapper, axis=1, inplace=True) + df.to_csv( f"{target_dir}/summary.csv", index=False ) + +def make_plot( controller ): + scan = controller.scans[1][100] + + plt.plot(scan.mzs, scan.intensities, c='k') + plt.ylabel('intensity (A.U.)') + plt.xlabel('mz (daltons)') + plt.show() + +def create_condition( controller, pct_missing, noise, eics, mapper, sigmapper, basedir='mvapack-data' ): + target_dir = f"{basedir}/{int(pct_missing)}_missing/{noise:.2f}_noise" + mult_mapper = get_multiplier_mapper( sigmapper ) + pickle.dump(mult_mapper, open('mult_mapper.pickle', 'wb')) + exit( 0 ) + assert os.path.isdir( target_dir ), target_dir + allchemnames = list(map(lambda pr: pr.name, eics )) + summary = dict( ) + for r_idx in range( 20 ): + print( r_idx ) + features = dict( ) + should_remove = get_removal_mapper( sigmapper, pct_missing ) + group = "G0" if r_idx < 10 else "G1" + rep = f"R{r_idx%10}" + 
full_name = group + '-' + rep + '.mzML' + local_eics = [deepcopy(ee) for ee in eics ] + + finalized_eics = [] + for le in local_eics: + rts, its, name = le.rts, le.its, le.name + mult = mult_mapper[ name ][ r_idx ] + its *= mult + + if should_remove[ name ]: + continue + + finalized_eics.append( + EIC(name=name, mzs=le.mzs, rts=rts, its=its) + ) + controller.scans = eics_to_scans( finalized_eics ) + #controller.update_scans( finalized_eics, mapper ) + #make_plot( controller ) + #controller = increase_resolution( controller ) + #controller = add_noise( controller, noise ) + #make_plot( controller ) + controller.write_mzML( target_dir, target_dir + '/' + full_name ) + + for acn in allchemnames: + if should_remove[acn]: + continue + filtered = list(filter( lambda pr: pr.name == acn, finalized_eics )) + max_mz, max_it, max_rt = 0, 0, 0 + for feic in filtered: + mzs, rts, its = feic.mzs, feic.rts, feic.its + + max_idx = np.argmax( its ) + if its[ max_idx ] > max_it: + max_it = its[ max_idx ] + max_mz = mzs[ max_idx ] + max_rt = rts[ max_idx ] + + features[ acn ] = ( max_rt, max_mz, max_it ) + + summary[ full_name ] = features + + export_summary( summary, target_dir ) + +def setup_params( ): + parser = argparse.ArgumentParser( ) + #parser.add_argument('--mode', required=True, type=str ) + parser.add_argument('--pct_missing', required=False, type=int, default=0 ) + parser.add_argument('--noise', required=False, type=float, default=0 ) + parser.add_argument('--idx', required=False, type=int, default=0 ) + + return parser.parse_args( ) + +def eics_to_scans( eics ): + eics = sorted( eics, key=lambda e: e.mzs[0] ) + scans = [] + avg_dur = np.mean(np.array([abs(t1 - t2) for t1, t2 in zip( eics[0].rts[0:-1], eics[0].rts[1:])])) + rt_len = len( eics[0].rts ) + mzs = list(map(lambda idx: [],range(rt_len))) + its = list(map(lambda idx: [],range(rt_len))) + + for idx in range( rt_len ): + for e in eics: + mzs[idx].append( e.mzs[idx] ) + its[idx].append( e.its[idx] ) + + for idx,rt 
in enumerate( eics[0].rts ): + if idx != rt_len-1: + duration = eics[0].rts[idx+1]-rt + else: + duration = avg_dur + assert duration > 0 + scans.append(Scan( + idx, np.array(mzs[idx]), np.array(its[idx]), 1, rt, duration + )) + + return {1: scans} + +def main( params ): + + np.random.seed( 100 ) + params = Params() + #params.rt_upper = 120 # TODO this is just for debugging + #dgs = setup_dirs(params) + (ROI_Sources, ps, hmdb) = load_dbs() + # can probably make the smallest peaks a bit smaller here + if False: + set_log_level_debug() + (ROI_Sources, ps, hmdb) = load_dbs() + orig_chems = make_dataset(ROI_Sources, ps, hmdb, 10000) + pickle.dump(orig_chems, open('chems.pickle', 'wb')) + else: + orig_chems = pickle.load(open('chems.pickle', 'rb')) + #print(len(orig_chems)) + orig_chems = list(filter(lambda oc: oc.rt > 60, orig_chems )) + sig = False + mass_spec = IndependentMassSpectrometer( + POSITIVE, orig_chems, ps, sig, None, True + ) + controller = SimpleMs1Controller(mass_spec, params.mz_upper) + #controller.run(params.rt_lower, params.rt_upper, False) + ### ok so at this point we have all of the data points. + ### Basically want to take the peak_recorder and then + ### return a list of tuples (chem name (mzs, rts, its )). 
+ mapper = get_all_rts( controller ) + ##eics = controller.create_eics( mapper ) + ##pickle.dump(eics, open('eics.pickle', 'wb')) + ##exit( 0 ) + eics = pickle.load(open('eics-ideal.pickle', 'rb')) + controller.scans = eics_to_scans( eics ) + #controller.write_mzML('idk', 'test.mzML') + #print( scans ) + sigmapper = determine_significant( eics ) + + for pct_missing in params.pct_missing: + for noise in params.noise_level: + create_condition( controller, pct_missing, 0.0025, eics, mapper, sigmapper ) + + +if __name__ == "__main__": + main( setup_params() ) diff --git a/Synthetic data creation scripts/vimms_data_generation/mk.py b/Synthetic data creation scripts/vimms_data_generation/mk.py new file mode 100644 index 00000000..19550a93 --- /dev/null +++ b/Synthetic data creation scripts/vimms_data_generation/mk.py @@ -0,0 +1,542 @@ +#!/usr/bin/env python3 +# coding: utf-8 +import json +import sys, os +import shutil +import pandas as pd +import matplotlib.pyplot as plt +from collections import namedtuple, defaultdict + +sys.path.append("..") +import pickle +import numpy as np +from glob import glob +from pathlib import Path +from itertools import product +from vimms.Chemicals import ChemicalCreator +from vimms.MassSpec import IndependentMassSpectrometer +from vimms.Controller import SimpleMs1Controller +from vimms.Common import * +import vimms +from scipy.interpolate import interp1d + +# TODO need to document what each of the replicates does and doesn't have +# TODO need to set it up so that we're using the same multipliers for the +# significant compounds +from copy import deepcopy +import matplotlib.pyplot as plt + +DATA = json.load(open("multipliers.json", "r")) + +GOOD_MULTS = DATA["good"] +BAD_MULTS = DATA["bad"] + + +class Params: + rt_lower = 0 + rt_upper = 1440 + mz_lower = 0 + mz_upper = 2000 + pct_missing = [0, 10, 20] + noise_level = [0, 0.01, 0.1] + + +def safe_mkdir(dirname): + if not os.path.exists(dirname): + os.mkdir(dirname) + + +def remove_x_pct(df: 
pd.DataFrame, x: int): + """Function that randomly removes x pct of entries from the supplied list of vals. Makes a copy of the inputted vals""" + assert x >= 0 and x <= 100, f"x must be a percentage on the range 0 < x < 100" + # print(float(chems.size)*(1.-float(x)/100.)) + num_cols, num_rows = len(df.columns), len(df[df.columns[0]]) + n_chems = num_cols * num_rows + n_remove = int(n_chems * x / 100.0 + 0.5) + rm_per_col = np.zeros(num_cols, np.int32) + for idx in range(n_remove): + rm_per_col[idx % num_cols] += 1 + + np.random.shuffle(rm_per_col) + row_idxs = np.arange(0, num_rows, 1) + + result = pd.DataFrame() + for c_idx, col in enumerate(df.columns): + + new_col = list(map(lambda x: deepcopy(x), df[col])) + row_idxs = np.sort(row_idxs) + np.random.shuffle(row_idxs) + + col_remove = rm_per_col[c_idx] + + for cr_idx in range(col_remove): + new_col[row_idxs[cr_idx]] = None + + result[col] = new_col + + return result + + +def cv(vals): + return np.std(vals) / np.mean(vals) + + +def setup_dirs(params: Params, basedir="mvapack-data"): + # layout of the directories are: + # basedir / pct_missing / noise / group + # maybe should return the directories too? + for edir in glob(f"{basedir}/*"): + shutil.rmtree(edir) + + for pct in params.pct_missing: + for noise in params.noise_level: + dirname = "/".join([basedir, f"{pct}_missing", f"{noise:.2f}_noise"]) + if not os.path.isdir(dirname): + Path(dirname).mkdir(parents=True, exist_ok=True) + + +def load_dbs(): + base_dir = os.path.abspath("example_data") + ps = load_obj(Path(base_dir, "peak_sampler_mz_rt_int_19_beers_fullscan.p")) + hmdb = load_obj(Path(base_dir, "hmdb_compounds.p")) + out_dir = Path(base_dir, "results", "MS1_single") + # the list of ROI sources created in the previous notebook '01. 
Download Data.ipynb' + ROI_Sources = [str(Path(base_dir, "DsDA", "DsDA_Beer", "beer_t10_simulator_files"))] + + return (ROI_Sources, ps, hmdb) + + +def make_datasets(ROI_Sources, ps, hmdb, g1_size=10, g2_size=10): + min_ms1_intensity = 1.75e5 # TODO make this into a global + + # m/z and RT range of chemicals + + # the number of chemicals in the sample + # n_chems = 2000 + n_chems = 2000 + + # maximum MS level (we do not generate fragmentation peaks when this value is 1) + ms_level = 1 + + chems = ChemicalCreator(ps, ROI_Sources, hmdb) + dataset = chems.sample(mz_range, rt_range, min_ms1_intensity, n_chems, ms_level) + # out_dir = 'mvapack-data' + # out_dir = 'demo' + # save_obj(dataset, Path(out_dir, 'dataset.p')) + + group1_reps, group2_reps = [], [] + + for g1 in range(g1_size): + group1_reps.append( + chems.sample(mz_range, rt_range, min_ms1_intensity, n_chems, ms_level) + ) + # this is the part where we change the intensities + for g2 in range(g2_size): + group2_reps.append( + chems.sample(mz_range, rt_range, min_ms1_intensity, n_chems, ms_level) + ) + + return group1_reps, group2_reps + + +def merge_chems(cdict, chems): + + for cd_key, chem in zip(cdict.keys(), chems): + cdict[cd_key].append(chem) + + +def get_fc(is_sig): + + if is_sig: + return np.random.random() * 2 + 2 + else: + return np.random.random() * 1 + + +def get_mults(is_sig): + if is_sig: + return GOOD_MULTS.pop(), GOOD_MULTS.pop() + else: + return BAD_MULTS.pop(), BAD_MULTS.pop() + + +def get_t_offset(is_sig): + if is_sig: + return (np.random.random() * 10) - 5 + else: + return (np.random.random() * 60) - 30 + + +def get_t_base(is_sig): + if is_sig: + return (np.random.random() * 30) - 15 + else: + return (np.random.random() * 60) - 30 + + +def get_chemical_lists(chems, NUM_REPS=10): + + SIGNIFICANT = int(len(chems) * 0.40) + keys = list(map(lambda idx: f"rep_{idx}", range(NUM_REPS))) + values = list(map(lambda idx: deepcopy([]), range(NUM_REPS))) + clean_dict = lambda: deepcopy(dict(zip(keys, 
values))) + group1, group2 = clean_dict(), clean_dict() + np.random.shuffle(chems) + g1_sig, g1_isig, g2_sig, g2_isig = None, None, None, None + + for idx, chem in enumerate(chems): + # first figure out the fold change.... between 2 and 6? + is_sig = idx < SIGNIFICANT + if idx == (SIGNIFICANT - 1): + g1_sig = pd.DataFrame(group1) + g2_sig = pd.DataFrame(group2) + group1, group2 = clean_dict(), clean_dict() + + fc = get_fc(is_sig) + greater, lesser = ( + list(map(lambda idx: deepcopy(chem), range(NUM_REPS))), + list(map(lambda idx: deepcopy(chem), range(NUM_REPS))), + ) + mults1, mults2 = get_mults(is_sig) + + greater_base, lesser_base = get_t_base(is_sig), get_t_base(is_sig) + + for ii, r_key in enumerate(keys): + greater[ii].max_intensity *= mults1[ii] + greater[ii].max_intensity *= fc + greater[ii].rt += greater_base + get_t_offset(is_sig) + lesser[ii].max_intensity *= mults2[ii] + lesser[ii].rt += lesser_base + get_t_offset(is_sig) + # for some reason we can actually get negative peaks... 
that's a problem + greater[ii].rt = max(greater[ii].rt, 10) + lesser[ii].rt = max(lesser[ii].rt, 10) + + g1_greater = bool(np.random.randint(0, 2)) + + merge_chems(group1 if g1_greater else group2, greater) + merge_chems(group2 if g1_greater else group1, lesser) + + return g1_sig, pd.DataFrame(group1), g2_sig, pd.DataFrame(group2), keys + + +def write_summary(g1, g2, dirname, peak_values): + + unique_formulas = set() + summary_info = dict() + for idx, c in enumerate(g1.columns): + summary_info[f"G0-R{idx}"] = deepcopy(g1[c]) + + for idx, c in enumerate(g2.columns): + summary_info[f"G1-R{idx}"] = deepcopy(g2[c]) + + for k, v in summary_info.items(): + pvs = peak_values[k] + for chem in v: + print(str(chem.formula) in pvs) + break + print(peak_values.keys()) + exit(0) + + for clist in summary_info.values(): + _ = list( + map( + lambda c: unique_formulas.add(str(c.formula)), + list(filter(lambda chem: chem is not None, clist)), + ) + ) + summary = dict(chemical=deepcopy(list(unique_formulas))) + + for rname, rchems in summary_info.items(): + rchem_list = [] + rep_mapper = dict() + for rc in rchems: + if rc is None: + continue + rep_mapper[str(rc.formula)] = deepcopy(rc) + + for uf in unique_formulas: + rchem_list.append(rep_mapper.get(uf, None)) + + summary[rname] = rchem_list + + summary = pd.DataFrame(summary) + summary.set_index("chemical", inplace=True) + summary = summary.transpose() + mapper = dict() + for col in summary.columns: + sum_col = summary[col] + for c in sum_col: + print(c) + # print( sum_col[0] ) + # print( sum_col[0].__dict__ ) + exit(0) + times = np.array( + list(map(lambda chem: chem.rt if chem is not None else None, sum_col)) + ) + mzs = np.array( + list( + map( + lambda chem: chem.isotopes[0][0] if chem is not None else None, + sum_col, + ) + ) + ) + times = times[times != None] + mzs = mzs[mzs != None] + avg_mz, avg_rt = np.mean(mzs), np.mean(times) + mapper[col] = f"{avg_rt:.2f}_{avg_mz:.4f}" + + summary.rename(mapper=mapper, inplace=True, 
axis=1) + for col in summary.columns: + summary[col] = summary.apply( + lambda row: row[col].max_intensity if row[col] else 0, axis=1 + ) + + summary.to_csv(f"{dirname}/summary.csv") + summary.to_pickle(f"{dirname}/summary.pickle") + print(f"{dirname}/summary.csv") + + +def increase_resolution(controller): + new_scans = [] + old_len = len(controller.scans[1]) + + for idx, scan in enumerate(controller.scans[1]): + # loop through each scan + new_mz, new_it = [], [] + interp_mz = [] + num_points = len(scan.mzs) + + # need to check that there are points + if num_points: + new_mz.append(scan.mzs[0] - 0.1) + new_it.append(0) + + for pt_idx, (m, i) in enumerate(zip(scan.mzs, scan.intensities)): + interp_mz.append(np.linspace(m - 0.021, m + 0.021, 10)) + + new_mz.extend([m - 0.02, m, m + 0.02]) + new_it.extend([0, i, 0]) + + if pt_idx > 0 and pt_idx < num_points - 1: + interp_mz.append(np.arange(m + 0.25, scan.mzs[pt_idx + 1] - 0.25, 0.2)) + + if num_points: + new_mz.append(scan.mzs[-1] + 0.1) + new_it.append(0) + + if not len(scan.mzs): + new_scans.append(deepcopy( scan )) + continue + + interp_mz = np.concatenate(interp_mz) + interp_mz = sorted(interp_mz) + + interp_func = interp1d(new_mz, new_it, kind="linear") + new_x = np.linspace(np.min(new_mz), np.max(new_mz), 1000) + interp_it = interp_func(interp_mz) + interp_it[interp_it < 0] = 0 + + new_scan = deepcopy(scan) + new_scan.mzs = interp_mz + new_scan.intensities = interp_it + new_scans.append(new_scan) + + controller.scans[1] = new_scans + + assert len(controller.scans[1]) == old_len + + return controller + + +def valid_mzs(mzs): + for idx in range(1, len(mzs)): + assert mzs[idx] >= mzs[idx - 1], f"{mzs[idx-1]}, {mzs[idx]}" + + +def add_noise(controller, noise): + # if noise == 0: + # return controller + + def max_it( scan ): + if len(scan.intensities): + return np.max( scan.intensities ) + else: + return 0 + old_len = len(controller.scans[1]) + + max_it = np.max( + max(controller.scans[1], key=max_it ).intensities + 
) + max_noise = float(max_it) * (float(noise) / 100.0) + new_scans = [] + + for idx, scan in enumerate(controller.scans[1]): + valid_mzs(scan.mzs) + scan_noise = list( + map(lambda it: np.random.rand() * max_noise, range(len(scan.mzs))) + ) + new_its = [] + for ct, nz in zip(scan.intensities, scan_noise): + if ct > 0: + new_its.append(ct) + else: + new_its.append(nz) + + new_scan = deepcopy(scan) + assert len(scan.mzs) == len(scan.intensities) + new_scan.intensities = new_its + new_scans.append(new_scan) + + controller.scans[1] = new_scans + assert len(controller.scans[1]) == old_len + + return controller + + +def make_chem_dict( + group1_sig_orig, group1_isig_orig, group2_sig_orig, group2_isig_orig, rep_keys +): + # shuffle up and deal! + group1_sig_orig = group1_sig_orig.sample(frac=1).reset_index(drop=True) + group2_sig_orig = group2_sig_orig.sample(frac=1).reset_index(drop=True) + group1_isig_orig = group1_isig_orig.sample(frac=1).reset_index(drop=True) + group2_isig_orig = group2_isig_orig.sample(frac=1).reset_index(drop=True) + + chem_dict = dict() + for g_idx, (grp_sig, grp_isig) in enumerate( + [(group1_sig_orig, group1_isig_orig), (group2_sig_orig, group2_isig_orig)] + ): + for idx, r_key in enumerate(rep_keys): + # what do we care about now? 
need to locate + # the part where noise is actually added + chems_sig, chems_isig = ( + deepcopy(grp_sig[r_key]).to_list(), + deepcopy(grp_isig[r_key]).to_list(), + ) + chems_sig.extend(chems_isig) + chem_dict[f"G{g_idx}-R{idx}"] = deepcopy(chems_sig) + return chem_dict + + +def num_zeros(df): + ct = 0 + + for c in df.columns: + ct += sum(list(map(lambda ch: ch.max_intensity == 0, df[c]))) + + return ct + + +def adjust_groups(group1_sig, group2_sig, peak_values): + # print( peak_values.keys() ) + # print( group1_sig ) + + def col_to_key(colname): + return str(int(colname.split("_")[-1]) + 1) + + for g1_col in group1_sig.columns: + key_name = f"G0-R{col_to_key(g1_col)}" + c_dict = peak_values[key_name] + print(c_dict) + exit(0) + # print( [t.formula for t in group1_sig[ g1_col ]] ) + for t_chem in group1_sig[g1_col]: + print(t_chem.formula in c_dict) + # if t_chem.formula in c_dict: + # print( t_chem.formula ) + # print( c_dict[t_chem.formula] ) + exit(0) + print(key_name) + assert key_name in peak_values + # need to convert the g1_col to the repname + print(g1_col) + + for g2_col in group2_sig.columns: + key_name = f"G1-R{col_to_key(g2_col)}" + print(key_name) + assert key_name in peak_values # need to convert the g1_col to the repname + print(g1_col) + + exit(0) + + +def main(): + + np.random.seed(100) + params = Params() + #params.rt_upper = 120 # TODO this is just for debugging + dgs = setup_dirs(params) + (ROI_Sources, ps, hmdb) = load_dbs() + if False: + (ROI_Sources, ps, hmdb) = load_dbs() + + set_log_level_debug() + # can probably make the smallest peaks a bit smaller here + (g1, g2) = make_datasets(ROI_Sources, ps, hmdb) + pickle.dump((g1, g2), open("data.p", "wb")) + + else: + (g1, g2) = pickle.load(open("data.p", "rb")) + # ok for these metabolites to be selected they need to vary between groups and not vary much inside of their own + ( + group1_sig_orig, + group1_isig_orig, + group2_sig_orig, + group2_isig_orig, + rep_keys, + ) = 
get_chemical_lists(g1[0]) + + # this is the part where the different replicates need to be made + for pct_missing in params.pct_missing: + group1_sig = remove_x_pct(group1_sig_orig, pct_missing) + group2_sig = remove_x_pct(group2_sig_orig, pct_missing) + chem_dict = make_chem_dict( + deepcopy(group1_sig), + group1_isig_orig, + deepcopy(group2_sig), + group2_isig_orig, + rep_keys, + ) + ct = defaultdict( int ) + for noise in params.noise_level: + dirname = f"mvapack-data/{pct_missing}_missing/{noise:.2f}_noise/" + safe_mkdir(dirname) + assert os.path.isdir(dirname), f"The directory {dirname} does not exist" + + peak_values = dict() + + for rep_name, chems in chem_dict.items(): + print( rep_name ) + filt_chems = list(filter(lambda c: c is not None, chems)) + for fc in filt_chems: + ct[ str(fc.formula) ] += 1 + continue + print(len(filt_chems)) + mass_spec = IndependentMassSpectrometer( + POSITIVE, filt_chems, ps, None, True + ) + controller = SimpleMs1Controller(mass_spec, params.mz_upper) + controller.run(params.rt_lower, params.rt_upper, False) + + peak_values[rep_name] = deepcopy(controller.peak_recorder()) + controller = increase_resolution(controller) + controller = add_noise(controller, noise) + mzml_filename = Path(dirname, f"{rep_name}.mzML") + controller.write_mzML( dirname , mzml_filename) + + plt.hist( ct.values() ) + plt.show() + for k,v in ct.items(): + print(k, v) + #pickle.dump( peak_values, open('peak_values.pickle', 'wb')) + exit( 0 ) + print(peak_values) + # group1_sig, group2_sig = adjust_groups( group1_sig, group2_sig, peak_values ) + # exit( 0 ) + write_summary(group1_sig, group2_sig, dirname, peak_values) + exit(0) + + +if __name__ == "__main__": + main() diff --git a/Synthetic data creation scripts/vimms_data_generation/multiple_samples_example.ipynb b/Synthetic data creation scripts/vimms_data_generation/multiple_samples_example.ipynb new file mode 100644 index 00000000..bfac0ccd --- /dev/null +++ b/Synthetic data creation 
scripts/vimms_data_generation/multiple_samples_example.ipynb @@ -0,0 +1,865 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import sys\n", + "import scipy.stats\n", + "import pylab as plt\n", + "from IPython import display\n", + "import pylab as plt\n", + "import glob\n", + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": {}, + "outputs": [], + "source": [ + "sys.path.append('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [], + "source": [ + "from vimms.Chemicals import *\n", + "from vimms.Chromatograms import *\n", + "from vimms.MassSpec import *\n", + "from vimms.Controller import *\n", + "from vimms.Common import *\n", + "from vimms.DataGenerator import *\n", + "from vimms.DsDA import *" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [], + "source": [ + " set_log_level_warning()\n", + "# set_log_level_info()\n", + "# set_log_level_debug()" + ] + }, + { + "cell_type": "code", + "execution_count": 182, + "metadata": {}, + "outputs": [], + "source": [ + "# base_dir = '..\\\\data'\n", + "# base_dir = 'C:\\\\Users\\\\joewa\\\\University of Glasgow\\\\Vinny Davies - CLDS Metabolomics Project\\\\Trained Models'\n", + "base_dir = '/Users/simon/University of Glasgow/Vinny Davies - CLDS Metabolomics Project/Trained Models'" + ] + 
}, + { + "cell_type": "code", + "execution_count": 183, + "metadata": {}, + "outputs": [], + "source": [ + "ps = load_obj(os.path.join(base_dir, 'peak_sampler_mz_rt_int_19_beers_fullscan.p'))" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "metadata": {}, + "outputs": [], + "source": [ + "hmdb = load_obj(os.path.join(base_dir, 'hmdb_compounds.p'))" + ] + }, + { + "cell_type": "code", + "execution_count": 185, + "metadata": {}, + "outputs": [], + "source": [ + "out_dir = '/Users/simon/vimms_data'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Initial Chemical" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /Users/simon/vimms_data/BaseDataset/dataset.p\n" + ] + } + ], + "source": [ + "ROI_Sources = [\"/Users/simon/vimms_data/beer_t10_simulator_files\"]\n", + "min_ms1_intensity = 1.75E5\n", + "rt_range = [(400, 500)]\n", + "mz_range = [(200, 400)]\n", + "n_peaks = 50\n", + "roi_rt_range = [20, 40]\n", + "chems = ChemicalCreator(ps, ROI_Sources, hmdb)\n", + "dataset = chems.sample(mz_range, rt_range, min_ms1_intensity, n_peaks, 1, use_database=True, \n", + " fixed_mz=False, roi_rt_range=roi_rt_range)\n", + "save_obj(dataset, os.path.join(out_dir, 'BaseDataset/dataset.p'))" + ] + }, + { + "cell_type": "code", + "execution_count": 187, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "23.639999999999986\n", + "30.91300000000001\n", + "34.07000000000005\n", + "33.34400000000005\n", + "27.58499999999998\n", + "30.91192\n", + "24.62999999999988\n", + "36.65499999999997\n", + "36.42000000000007\n", + "30.340000000000146\n", + "23.268\n", + "21.972999999999956\n", + "36.011000000000024\n", + "20.067999999999984\n", + "25.733999999999924\n", + "20.24000000000001\n", + "24.460000000000036\n", + "23.871999999999957\n", + 
"24.733999999999924\n", + "25.93599999999998\n", + "32.3900000000001\n", + "22.5\n", + "29.205000000000013\n", + "22.774999999999977\n", + "28.649\n", + "30.10300000000001\n", + "31.930000000000064\n", + "29.180999999999926\n", + "34.6412\n", + "26.58100000000001\n", + "27.55400000000003\n", + "29.246999999999957\n", + "23.357999999999947\n", + "21.681999999999988\n", + "21.232000000000085\n", + "20.4762\n", + "37.3599999999999\n", + "24.817999999999984\n", + "23.531000000000006\n", + "31.873000000000047\n", + "24.039000000000044\n", + "27.186000000000035\n", + "28.537999999999897\n", + "36.95600000000002\n", + "38.460000000000036\n", + "20.876000000000005\n", + "38.54899999999998\n", + "37.150000000000006\n", + "26.47999999999999\n", + "20.381999999999948\n" + ] + } + ], + "source": [ + "for chem in dataset:\n", + " print(np.abs(chem.chromatogram.min_rt - chem.chromatogram.max_rt))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Multiple Samples" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "metadata": {}, + "outputs": [], + "source": [ + "n_samples = [50,50] # number of files per class\n", + "classes = [\"class%d\" % i for i in range(len(n_samples))] # creates default list of classes\n", + "intensity_noise_sd = [1000] # noise on max intensity" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['class0', 'class1']" + ] + }, + "execution_count": 199, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "classes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add intensity changes between different classes" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "metadata": {}, + "outputs": [], + "source": [ + "change_probabilities = [0 for i in range(len(n_samples))] # probability of intensity changes between different classes\n", + "change_differences_means = [0 for i 
in range(len(n_samples))] # mean of those intensity changes\n", + "change_differences_sds = [0 for i in range(len(n_samples))] # SD of those intensity changes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add experimental variables (examples in comments)" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [], + "source": [ + "experimental_classes = None # [[\"male\",\"female\"],[\"Positive\",\"Negative\",\"Unknown\"]]\n", + "experimental_probabilitities = None # [[0.5,0.5],[0.33,0.33,0.34]]\n", + "experimental_sds = None # [[250],[250]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop out chemicals from different classes" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "metadata": {}, + "outputs": [], + "source": [ + "dropout_probability = 0.2\n", + "dropout_probabilities = [dropout_probability for i in range(len(n_samples))]\n", + "# dropout_probabilities = None\n", + "# dropout_numbers = 2 # number of chemicals dropped out in each class\n", + "dropout_numbers = None" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set save location" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "metadata": {}, + "outputs": [], + "source": [ + "save_location = os.path.join(out_dir, 'ChemicalFiles')" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_0.p\n", + "21\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_1.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_2.p\n", + "21\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_3.p\n", + "25\n", + "Saving <class 'list'> to 
/Users/simon/vimms_data/ChemicalFiles/sample_4.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_5.p\n", + "23\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_6.p\n", + "30\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_7.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_8.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_9.p\n", + "21\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_10.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_11.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_12.p\n", + "20\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_13.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_14.p\n", + "18\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_15.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_16.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_17.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_18.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_19.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_20.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_21.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_22.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_23.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_24.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_25.p\n", + "24\n", + "Saving 
<class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_26.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_27.p\n", + "23\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_28.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_29.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_30.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_31.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_32.p\n", + "23\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_33.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_34.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_35.p\n", + "23\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_36.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_37.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_38.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_39.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_40.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_41.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_42.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_43.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_44.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_45.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_46.p\n", + "26\n", + "Saving <class 'list'> to 
/Users/simon/vimms_data/ChemicalFiles/sample_47.p\n", + "21\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_48.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_49.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_50.p\n", + "20\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_51.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_52.p\n", + "23\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_53.p\n", + "29\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_54.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_55.p\n", + "32\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_56.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_57.p\n", + "29\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_58.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_59.p\n", + "29\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_60.p\n", + "30\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_61.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_62.p\n", + "31\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_63.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_64.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_65.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_66.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_67.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_68.p\n", + "28\n", + 
"Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_69.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_70.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_71.p\n", + "24\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_72.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_73.p\n", + "21\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_74.p\n", + "30\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_75.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_76.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_77.p\n", + "29\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_78.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_79.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_80.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_81.p\n", + "25\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_82.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_83.p\n", + "32\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_84.p\n", + "23\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_85.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_86.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_87.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_88.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_89.p\n", + "31\n", + "Saving <class 'list'> to 
/Users/simon/vimms_data/ChemicalFiles/sample_90.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_91.p\n", + "26\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_92.p\n", + "22\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_93.p\n", + "31\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_94.p\n", + "29\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_95.p\n", + "28\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_96.p\n", + "29\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_97.p\n", + "27\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_98.p\n", + "30\n", + "Saving <class 'list'> to /Users/simon/vimms_data/ChemicalFiles/sample_99.p\n" + ] + } + ], + "source": [ + "multiple_samples = MultiSampleCreator(dataset, n_samples, classes, intensity_noise_sd, \n", + " change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities, dropout_numbers,\n", + " experimental_classes, experimental_probabilitities, experimental_sds, save_location=save_location)" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'C11H11N3O2S', 'C13H18O2', 'C18H27NO3', 'C14H11Cl2NO4', 'C5H6Cl6N2O3', 'C4H7Cl2O4P', 'C18H37NO3', 'C10H12ClN3O3S', 'C15H10O7'}\n" + ] + }, + { + "data": { + "text/plain": [ + "100" + ] + }, + "execution_count": 205, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check the number of identical formulas in two objects of the same class\n", + "formulas0 = set([str(a.formula) for a in multiple_samples.samples[0]])\n", + "formulas1 = set([str(a.formula) for a in multiple_samples.samples[1]])\n", + "print(formulas0-formulas1)\n", + "\n", + "total_samples = 
np.sum(multiple_samples.n_samples)\n", + "total_samples" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving <class 'list'> to /Users/simon/vimms_data/MissingChemicals/missing_chemicals.p\n" + ] + }, + { + "data": { + "text/plain": [ + "[[KnownChemical - 'C15H16Cl3N3O2' rt=409.99 max_intensity=1725954.34,\n", + " KnownChemical - 'C16H28O' rt=431.85 max_intensity=1059507.71,\n", + " KnownChemical - 'C19H18ClN5' rt=423.53 max_intensity=329447.21,\n", + " KnownChemical - 'C17H23NO2' rt=415.68 max_intensity=251532.13,\n", + " KnownChemical - 'C11H13N3O3S' rt=421.16 max_intensity=1558628.97,\n", + " KnownChemical - 'C11H28N4' rt=416.56 max_intensity=2005289.30,\n", + " KnownChemical - 'C13H9NO2S' rt=401.56 max_intensity=239654.60,\n", + " KnownChemical - 'C13H9NO2S' rt=428.33 max_intensity=13229609.72,\n", + " KnownChemical - 'C10H16N2O4' rt=414.90 max_intensity=49733322.17,\n", + " KnownChemical - 'C8H11NO5S' rt=412.33 max_intensity=342886.18,\n", + " KnownChemical - 'C13H11N3O5S2' rt=450.38 max_intensity=267161.99,\n", + " KnownChemical - 'C7H9NO4S' rt=428.11 max_intensity=246202.61,\n", + " KnownChemical - 'C21H29NO' rt=425.12 max_intensity=245703.10,\n", + " KnownChemical - 'C8H12O9' rt=459.26 max_intensity=1531653.32,\n", + " KnownChemical - 'C16H34' rt=427.40 max_intensity=76844746.48],\n", + " [KnownChemical - 'C15H16Cl3N3O2' rt=409.99 max_intensity=1725954.34,\n", + " KnownChemical - 'C11H11N3O2S' rt=424.50 max_intensity=246727.63,\n", + " KnownChemical - 'C10H12ClN3O3S' rt=476.40 max_intensity=1936392.25,\n", + " KnownChemical - 'C9H9Cl2N3O' rt=422.93 max_intensity=547923.51,\n", + " KnownChemical - 'C14H30O3' rt=445.79 max_intensity=1703073.85,\n", + " KnownChemical - 'C5H6Cl6N2O3' rt=425.01 max_intensity=1033219.33,\n", + " KnownChemical - 'C16H26' rt=461.36 max_intensity=6975922.33,\n", + " KnownChemical - 'C13H9NO2S' rt=428.33 
max_intensity=13229609.72,\n", + " KnownChemical - 'C13H11N3O5S2' rt=450.38 max_intensity=267161.99,\n", + " KnownChemical - 'C21H29NO' rt=425.12 max_intensity=245703.10,\n", + " KnownChemical - 'C16H26' rt=427.32 max_intensity=674172.61]]" + ] + }, + "execution_count": 206, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "save_obj(multiple_samples.missing_chemicals, os.path.join(out_dir, 'MissingChemicals','missing_chemicals.p'))\n", + "multiple_samples.missing_chemicals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run MS1 controller and save out .mzML files" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "101.0766999999995it [00:00, 585.06it/s] \n", + "101.0076000000006it [00:00, 756.03it/s] \n", + "100.65469999999988it [00:00, 661.05it/s] \n", + "100.08256000000006it [00:00, 764.81it/s] \n", + "100.82959000000005it [00:00, 658.77it/s] \n", + "100.91009999999983it [00:00, 664.08it/s] \n", + "100.93500000000017it [00:00, 662.31it/s] \n", + "101.21058000000016it [00:00, 586.26it/s] \n", + "100.69709999999975it [00:00, 725.90it/s] \n", + "101.0682000000005it [00:00, 662.61it/s] \n", + "100.94901999999905it [00:00, 725.71it/s] \n", + "100.52189999999928it [00:00, 752.98it/s] \n", + "101.13919999999996it [00:00, 660.83it/s] \n", + "100.76020000000017it [00:00, 772.24it/s] \n", + "100.29280000000011it [00:00, 671.33it/s] \n", + "100.12770000000052it [00:00, 850.93it/s] \n", + "100.89423999999963it [00:00, 659.05it/s] \n", + "100.00509999999986it [00:00, 663.76it/s] \n", + "100.21280000000041it [00:00, 647.97it/s] \n", + "100.29063000000008it [00:00, 735.42it/s] \n", + "101.06017999999972it [00:00, 634.71it/s] \n", + "100.28120000000047it [00:00, 634.15it/s] \n", + "100.2417999999991it [00:00, 699.44it/s] \n", + "100.6541000000002it [00:00, 749.57it/s] \n", + "100.80729999999954it [00:00, 730.74it/s] 
\n", + "100.61380000000008it [00:00, 605.76it/s] \n", + "100.43969999999973it [00:00, 671.23it/s] \n", + "100.09509999999972it [00:00, 528.67it/s] \n", + "100.90539999999976it [00:00, 704.68it/s] \n", + "101.12946999999963it [00:00, 744.37it/s] \n", + "100.98500000000013it [00:00, 727.62it/s] \n", + "100.83409999999947it [00:00, 680.47it/s] \n", + "100.81709999999998it [00:00, 744.30it/s] \n", + "100.31147999999962it [00:00, 711.94it/s] \n", + "100.18090000000007it [00:00, 710.33it/s] \n", + "100.83951000000047it [00:00, 673.48it/s] \n", + "100.25170999999955it [00:00, 711.93it/s] \n", + "100.84059999999982it [00:00, 596.15it/s] \n", + "101.03260000000074it [00:00, 730.29it/s] \n", + "100.18609999999956it [00:00, 740.38it/s] \n", + "101.0018999999997it [00:00, 680.59it/s] \n", + "100.86064999999934it [00:00, 661.15it/s] \n", + "100.57273000000066it [00:00, 637.58it/s] \n", + "100.27547999999979it [00:00, 665.58it/s] \n", + "100.98659999999938it [00:00, 684.00it/s] \n", + "100.82990000000001it [00:00, 692.23it/s] \n", + "101.16020000000037it [00:00, 653.11it/s] \n", + "101.03670000000022it [00:00, 689.48it/s] \n", + "100.5354999999995it [00:00, 744.45it/s] \n", + "101.03590000000048it [00:00, 678.34it/s] \n", + "100.88130000000035it [00:00, 668.63it/s] \n", + "101.37419999999992it [00:00, 769.19it/s] \n", + "100.01150000000007it [00:00, 625.90it/s] \n", + "100.1823000000013it [00:00, 685.04it/s] \n", + "100.93100000000044it [00:00, 635.23it/s] \n", + "100.32763000000011it [00:00, 564.93it/s] \n", + "100.86940000000044it [00:00, 604.78it/s] \n", + "101.10580000000095it [00:00, 669.84it/s] \n", + "100.90390000000008it [00:00, 633.06it/s] \n", + "101.33621000000164it [00:00, 671.74it/s] \n", + "100.46230000000043it [00:00, 648.29it/s] \n", + "100.00379999999984it [00:00, 629.58it/s] \n", + "100.70320000000038it [00:00, 712.30it/s] \n", + "100.32900000000166it [00:00, 595.87it/s] \n", + "100.39048999999966it [00:00, 709.57it/s] \n", + "100.27794999999986it [00:00, 
683.80it/s] \n", + "100.46116300000017it [00:00, 661.67it/s] \n", + "101.46010000000024it [00:00, 640.70it/s] \n", + "100.08272000000034it [00:00, 757.71it/s] \n", + "100.0920299999998it [00:00, 511.43it/s] \n", + "100.18530000000004it [00:00, 423.14it/s] \n", + "100.42677000000026it [00:00, 545.44it/s] \n", + "101.02339999999987it [00:00, 575.67it/s] \n", + "100.53609999999924it [00:00, 552.46it/s] \n", + "101.23380000000026it [00:00, 671.33it/s] \n", + "100.07399999999944it [00:00, 572.10it/s] \n", + "100.51989999999978it [00:00, 658.79it/s] \n", + "100.1208600000001it [00:00, 426.95it/s] \n", + "100.58970000000045it [00:00, 584.11it/s] \n", + "100.3201899999994it [00:00, 614.86it/s] \n", + "100.82249999999982it [00:00, 544.32it/s] \n", + "100.97380000000055it [00:00, 512.93it/s] \n", + "100.72855099999964it [00:00, 593.04it/s] \n", + "100.54971000000108it [00:00, 562.24it/s] \n", + "101.33460000000008it [00:00, 470.71it/s] \n", + "100.08457999999865it [00:00, 562.77it/s] \n", + "101.13069999999931it [00:00, 485.30it/s] \n", + "100.06760000000037it [00:00, 435.65it/s] \n", + "100.25849999999991it [00:00, 629.32it/s] \n", + "100.3793079999997it [00:00, 721.45it/s] \n", + "100.44389999999964it [00:00, 566.47it/s] \n", + "100.19509999999968it [00:00, 623.98it/s] \n", + "100.83597999999984it [00:00, 683.44it/s] \n", + "101.16990999999996it [00:00, 726.61it/s] \n", + "100.33540000000096it [00:00, 571.94it/s] \n", + "100.70130000000023it [00:00, 538.94it/s] \n", + "100.94689999999974it [00:00, 548.05it/s] \n", + "100.83749999999907it [00:00, 610.77it/s] \n", + "101.07420000000008it [00:00, 630.68it/s] \n", + "101.04830000000038it [00:00, 614.66it/s] \n" + ] + } + ], + "source": [ + "min_rt = rt_range[0][0]\n", + "max_rt = rt_range[0][1]\n", + "controllers = defaultdict(list)\n", + "controller_to_mzml = {}\n", + "\n", + "mzml_dir = os.path.join(out_dir, 'mzmlFiles')\n", + "num_classes = len(n_samples)\n", + "sample_idx = 0\n", + "for j in range(num_classes):\n", + " 
num_samples = n_samples[j]\n", + " for i in range(num_samples):\n", + " fname = os.path.join(save_location, 'sample_%d.p' % sample_idx) \n", + " sample = load_obj(fname)\n", + " sample_idx += 1\n", + " \n", + " mass_spec = IndependentMassSpectrometer(POSITIVE, sample, density=ps.density_estimator)\n", + " mzml_filename = os.path.join(mzml_dir,'sample_id_0_number_%d' % i + '_class_%d.mzML' % j)\n", + " controller = SimpleMs1Controller(mass_spec)\n", + " controller.run(min_rt,max_rt)\n", + " controller.write_mzML('my_analysis', mzml_filename)\n", + " \n", + " controllers[j].append(controller)\n", + " controller_to_mzml[controller] = (j, mzml_filename, )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print out the missing peaks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_chem_to_peaks(controller):\n", + " chem_to_peaks = defaultdict(list)\n", + " frag_events = controller.mass_spec.fragmentation_events\n", + " for frag_event in frag_events:\n", + " chem = frag_event.chem\n", + " peaks = frag_event.peaks\n", + " chem_to_peaks[chem].extend(peaks)\n", + " return chem_to_peaks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for controller, (current_class, mzml_filename) in controller_to_mzml.items():\n", + " controller_peaks = get_chem_to_peaks(controller)\n", + " basename = os.path.basename(mzml_filename)\n", + " front, back = os.path.splitext(mzml_filename)\n", + " outfile = front + '.csv'\n", + "\n", + " missing_peaks = [] \n", + " for other_class in range(num_classes):\n", + " if current_class == other_class:\n", + " continue\n", + "\n", + " # get the peaks that are present in current_class but missing in other_class\n", + " missing_chems = multiple_samples.missing_chemicals[other_class]\n", + " for chem in missing_chems:\n", + " peaks = controller_peaks[chem]\n", + " for peak in peaks:\n", + " 
row = (chem.formula.formula_string, current_class, other_class, peak.mz, peak.rt, peak.intensity)\n", + " missing_peaks.append(row)\n", + " \n", + " # convert to dataframe\n", + " columns = ['formula', 'present_in', 'missing_in', 'mz', 'RT', 'intensity']\n", + " missing_df = pd.DataFrame(missing_peaks, columns=columns)\n", + " missing_df.to_csv(os.path.join(out_dir, 'MissingChemicals', os.path.basename(outfile)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
diff --git a/Synthetic data creation scripts/vimms_data_generation/prepare-eics b/Synthetic data creation scripts/vimms_data_generation/prepare-eics
new file mode 100644
index 00000000..276d7cdf
--- /dev/null
+++ b/Synthetic data creation scripts/vimms_data_generation/prepare-eics
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""Fit an idealized Gaussian to each kept EIC: eics.pickle -> eics-ideal.pickle."""
+import os
+import pickle
+import sys
+from collections import defaultdict, namedtuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy import sparse
+from scipy.optimize import curve_fit
+from scipy.signal import find_peaks
+from scipy.sparse.linalg import spsolve
+
+EIC = namedtuple('EIC', 'name mzs rts its')
+
+
+def baseline_als(y, lam, p, niter=10):
+    """Return an asymmetric-least-squares baseline estimate for the signal `y`.
+
+    `lam` scales the second-difference (smoothness) penalty; `p` is the weight
+    of points above the current baseline, `1 - p` that of points below it.
+    """
+    L = len(y)
+    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L - 2))
+    penalty = lam * D.dot(D.transpose())  # loop-invariant, so build it once
+    w = np.ones(L)
+    for _ in range(niter):
+        W = sparse.spdiags(w, 0, L, L)
+        z = spsolve(W + penalty, w * y)
+        w = p * (y > z) + (1 - p) * (y < z)
+    return z
+
+
+def get_lines(fname):
+    """Return the lines of `fname` (newlines stripped), or [] if it is missing."""
+    if not os.path.exists(fname):
+        return []
+    with open(fname, 'r') as fh:  # closed even if the read raises
+        return fh.read().splitlines()
+
+
+def get_name(eic):
+    """Return the `<name>,<first m/z, 4 decimals>` key used to match keep.txt."""
+    return f"{eic.name},{eic.mzs[0]:.4f}"
+
+
+def select_compounds(rawnames):
+    """Return the set of raw names whose compound name occurs more than once.
+
+    Each raw name is `<compound>,<m/z>`. It is split on the LAST comma so that
+    compound names containing commas parse; blank lines are skipped.
+
+    Raises ValueError for a non-blank entry that contains no comma.
+    """
+    holder = defaultdict(list)
+    for rn in rawnames:
+        if not rn.strip():
+            continue
+        name, sep, _ = rn.rpartition(',')
+        if not sep:
+            raise ValueError(f"expected '<name>,<mz>', got {rn!r}")
+        holder[name].append(rn)
+    return {rn for clist in holder.values() if len(clist) > 1 for rn in clist}
+
+
+def has_baseline(eic, n_points=10):
+    """Return (left, right) mean absolute edge slopes of the EIC.
+
+    Only positive-intensity points are used; up to `n_points` segments are taken
+    inward from each end, stopping before the apex. A side with no usable
+    segment (or an EIC with no positive intensity) yields NaN.
+    """
+    mask = eic.its > 0
+    its, rts = eic.its[mask], eic.rts[mask]
+    if its.size == 0:
+        return float('nan'), float('nan')
+    max_idx = np.argmax(its)
+    left_slope, right_slope = [], []
+    for offset in range(n_points):
+        left, right = offset, len(its) - 1 - offset
+        if left + 1 < max_idx:
+            left_slope.append(abs((its[left + 1] - its[left]) / (rts[left + 1] - rts[left])))
+        if right - 1 > max_idx:
+            right_slope.append(abs((its[right] - its[right - 1]) / (rts[right] - rts[right - 1])))
+    # np.mean([]) would also give NaN, but with a RuntimeWarning
+    return (np.mean(left_slope) if left_slope else float('nan'),
+            np.mean(right_slope) if right_slope else float('nan'))
+
+
+def gauss(x, a, b, c, d):
+    """Gaussian of height `a` centred at `b` with width term `c`, plus offset `d`."""
+    return a * np.exp(-((x - b) ** 2) / c) + d
+
+
+def idealize_eic(eic):
+    """Return `eic` with its intensities replaced by a Gaussian fitted to them.
+
+    Only positive-intensity points are fitted. A fitted offset above 1% of the
+    apex is lowered to 1% (the excess moves into the height); a negative one
+    is replaced by 1%. Errors raised by `curve_fit` propagate to the caller.
+    """
+    mask = eic.its > 0
+    rts, its = eic.rts[mask], eic.its[mask]
+    apex = np.argmax(its)
+    floor = 0.01 * its[apex]
+    params = curve_fit(gauss, rts, its, p0=(its[apex], rts[apex], 1, np.min(its)))[0]
+    if params[-1] > floor:
+        params[0] += abs(params[-1] - floor)
+        params[-1] = floor
+    if params[-1] < 0:
+        params[-1] = floor
+    return EIC(name=eic.name, its=gauss(eic.rts, *params), mzs=eic.mzs, rts=eic.rts)
+
+
+def main():
+    """Idealize every kept EIC and pickle the results to eics-ideal.pickle."""
+    kept_eics = select_compounds(get_lines('keep.txt'))
+    # NOTE: pickle can run arbitrary code on load; only read trusted files.
+    with open('eics.pickle', 'rb') as fh:
+        eics = pickle.load(fh)
+    filt = sorted((e for e in eics if get_name(e) in kept_eics), key=lambda ee: ee.name)
+    # Many of these EICs have no baseline, so the fit can fail. Skip those,
+    # but report them rather than swallowing every exception.
+    idealized = []
+    for f in filt:
+        try:
+            idealized.append(idealize_eic(f))
+        except (RuntimeError, TypeError, ValueError) as err:
+            print(f"skipping {get_name(f)}: {err}", file=sys.stderr)
+    with open('eics-ideal.pickle', 'wb') as fh:
+        pickle.dump(idealized, fh)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/Synthetic data creation scripts/vimms_data_generation/validate b/Synthetic data creation scripts/vimms_data_generation/validate
new file mode 100644
index 00000000..506a0988
--- /dev/null
+++ b/Synthetic data creation scripts/vimms_data_generation/validate
@@ -0,0 +1,21 @@
+#!/usr/bin/env python3
+"""Count mult_mapper.pickle entries whose two groups both have a CV below 20%."""
+import pickle
+
+import numpy as np
+
+
+def cv(vals):
+    """Return the coefficient of variation (std / mean) of `vals`."""
+    return np.std(vals) / np.mean(vals)
+
+
+# NOTE: pickle can run arbitrary code on load; only read trusted files.
+with open('mult_mapper.pickle', 'rb') as fh:  # do not leak the file handle
+    mm = pickle.load(fh)
+
+good = 0
+for vv in mm.values():
+    g1, g2 = vv[0:10], vv[10:]  # first ten values are group 1, the rest group 2
+    good += (cv(g1) < 0.20) and (cv(g2) < 0.20)
+print(good)
-- 
GitLab