diff --git a/environment-dev.yml b/environment-dev.yml index b1f4815ce..504cc47a2 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -22,6 +22,7 @@ dependencies: - lmfit>=1.0.2 - psutil>=5.4.8 - joblib>=1.0.0 + - emcee>=3.0.2 # for testing - pytest<8.0 - pytest-cov diff --git a/environment.yml b/environment.yml index 38d783a63..af3b0a6c8 100644 --- a/environment.yml +++ b/environment.yml @@ -23,3 +23,4 @@ dependencies: - lmfit>=1.0.2 - psutil>=5.4.8 - nmrglue>=0.9 + - emcee>=3.0.2 diff --git a/requirements-dev.txt b/requirements-dev.txt index 80e9b20a0..e1b67cf95 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,6 +11,7 @@ pandas>=1.1.3 numexpr==2.8.4 psutil>=5.4.8 joblib>=1.0.0 +emcee>=3.0.2 nmrglue>=0.9 cython>=0.29.11 diff --git a/requirements.txt b/requirements.txt index daca403e3..96eb781c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ pandas>=1.1.3 numexpr==2.8.4 psutil>=5.4.8 joblib>=1.0.0 +emcee>=3.0.2 nmrglue>=0.9 diff --git a/src/mrsimulator/utils/monte_carlo.py b/src/mrsimulator/utils/monte_carlo.py new file mode 100644 index 000000000..96f4a8b20 --- /dev/null +++ b/src/mrsimulator/utils/monte_carlo.py @@ -0,0 +1,351 @@ +# -*- coding: utf-8 -*- +import copy +import re + +import emcee +import numpy as np +from mrsimulator.utils.spectral_fitting import get_correct_data_order + +__author__ = ["He Sun", "Deepansh Srivastava"] +__email__ = ["wushanyun64@gmail.com", "srivastava.89@osu.edu"] + + +def name_abbrev(params): + """ + The limfit parameters obj created by function make_LMFIT_params contains detailed + but lengthy names which could be hard for ploting. + This small function simplifys the long default param name for substream ploting. + + Example (default name --> abbreviated name) + ----------------------------- + sys_0_site_0_isotropic_chemical_shift --> delta_0_0 + sys_1_site_0_shielding_symmetric_eta-->etaCS_1_0 + mth_0_rotor_frequency-->spin_freq_0 + SP_0_operation_2_Exponential_FWHM -->expo_fwhm_0_2 + + Paramsters + ----------------------------- + params: LMFIT Parameters. + The lmfit parameters obj with default mrsimulator param name. + + Return + ----------------------------- + name_abbrev_list: list + A list of abbreviated parameters names. + """ + abbreviation_pairs = { + r"sys_[0-9]+_site_[0-9]+_isotropic_chemical_shift": "delta_iso", + r"sys_[0-9]+_site_[0-9]+_shielding_symmetric_zeta": "zeta", + r"sys_[0-9]+_site_[0-9]+_shielding_symmetric_eta": "etaCS", + r"sys_[0-9]+_site_[0-9]+_quadrupolar_Cq": "Cq", + r"sys_[0-9]+_site_[0-9]+_quadrupolar_eta": "etaQ", + r"mth_[0-9]+_rotor_frequency": "spin_freq", + r"sys_[0-9]+_abundance": "abundance", + r"SP_[0-9]+_operation_[0-9]+_Exponential_FWHM": "expo_fwhm", + r"SP_[0-9]+_operation_[0-9]+_Gaussian_FWHM": "gauss_fwhm", + r"SP_[0-9]+_operation_[0-9]+_Scale_factor": "scale", + r"SP_[0-9]+_operation_[0-9]+_ConstantOffset_offset": "offset", + r"SP_[0-9]+_operation_[0-9]+_Linear_amplitude": "linear", + } + + name_abbrev_list = [] + for name in params.keys(): + for pattern, abbrev in abbreviation_pairs.items(): + pattern = re.compile(pattern) + if pattern.fullmatch(name): + numbers = re.findall(r"(_[0-9]+)_", name) + name_abbrev = abbrev + for num in numbers: + name_abbrev += num + name_abbrev_list.append(name_abbrev) + return name_abbrev_list + + +class mrsim_emcee: + """ + A utility class to sample the posterior distribution function (PDF) of NMR + paramters with Markov Chain Monte Carlo (MCMC) method. + + This is a class depends on the emcee package. + https://emcee.readthedocs.io/en/stable/# + + For more information on Markov Chain Monte Carlo, check: + https://knac.middlebury.edu/symp_sessions/2015/flaherty_MCMC_2015.pdf + + Parameters + -------------------------- + params: LMFIT parameters + Initial guess of the parameters to start MCMC walkers. User can use + mrsimulator.utils.spectral_fitting.make_LMFIT_params to generate LMFIT + parameters for mrsimulator. Please note mrsim_emcee is not a fitting + package, the aim here to sample the posterior distribution of the NMR + parameters AFTER fitting. We recommend user to first get a proper fitting + with spectral_fitting model then start this process with the fitted params. + simulator: MRsimulator simulator + The simulator object constructed with NMR paramters. + processor: MRsimulator processor + The processor object storing NMR signal processing parameters. + sigma: float + The noise level of the spectrum baseline. + """ + + def __init__(self, params, simulator, processor, sigma=None): + params = copy.deepcopy(params) + self.params = self._vary_only(params) + self.nvarys = len(self.params) + self.simulator = simulator + self.processor = processor + self.sigma = sigma + + @staticmethod + def _vary_only(params): + """ + Filter the parameter list, leave only variables. + + Note: abundance should not a a variable, it will be removed from the params + even if included by user with Vary = True. + + Parameters + ------------------------------ + params: Parameters + LMFIT parameters obj that store the variables. + """ + for k, v in list(params.items()): + if not v.vary or "abundance" in k: + params.pop(k) + return params + + def mcmc(self, steps=1000, nwalkers=100, burn=100, thin=10, progress=True): + """ + Bayesian sampling of the posterior distribution. + + This method uses the emcee package 'https://emcee.readthedocs.io/en/stable/' + to sample the posterior distribution of NMR parameters with Marcov Chain + Monte Carlo (MCMC) method. + + Parameters + ----------------------- + steps: int + The total number of samples to draw from the posterior distribution + for each walkers. + nwalkers: int + The numbers of walkers in the emnsemble. + burn: + Number of steps to discard at the begining of the sampling process. + The first few steps when the chain is making its way towards the + center of the PDF, are called ‘burn in’ and are removed since they + do not represent the final probability density function. + thin: int + Only accept 1 in every `thin` samples. + progress: bool + If True, show a progress bar while running. + + Return + --------------------------- + Result: dictionary + Result have 4 keys. + 'chain' contains the original chain of the random walker for each paramters + and walkers, dimention [(steps - burn) // thin, nwalkers, nvarys]. + 'flat_chain' contians the faltened chain the chain for all the walkers + on each parameters. + 'log_prob' contains output _log_probability function. + 'variables' contains all the variables' (none variables not included) + values, std, relative error, initial values etc. + """ + # randstate + seed = 100 + rand = np.random.RandomState(seed) + # initial guess + var = np.zeros(self.nvarys) + for i, v in enumerate(self.params): + param = self.params[v] + var[i] = param.value + p0 = 1 + rand.randn(nwalkers, self.nvarys) * 1.0e-4 + p0 *= var + + # run mcmc sampler + sampler = emcee.EnsembleSampler( + nwalkers, + self.nvarys, + self._log_probability, + args=(self.params, self.simulator, self.processor, self.sigma), + ) + + sampler.run_mcmc(p0, steps, progress=progress) + + result = {} + result["chain"] = sampler.get_chain(thin=thin, discard=burn, flat=False) + result["flat_chain"] = sampler.get_chain(thin=thin, discard=burn, flat=True) + result["log_prob"] = sampler.get_log_prob(thin=thin, discard=burn) + result["accept_frac"] = sampler.acceptance_fraction + quantiles = np.percentile(result["flat_chain"], [15.87, 50, 84.13], axis=0) + result["variables"] = self.params + + for i, k in enumerate(result["variables"]): + std_l, median, std_u = quantiles[:, i] + result["variables"][k].value = median + result["variables"][k].stderr = 0.5 * (std_u - std_l) + + return result + + @staticmethod + def _log_prior(theta, bounds): + """ + Calculate the log prior for the parameters given. + If the values in params are out side its bound, return -inf, else return 0. + + Parameters + ----------------------- + theta: sequence + sequence of the variables. + bounds: array + Maximum and minimum of the variables. + """ + if np.any(theta > bounds[0, :]) or np.any(theta < bounds[1, :]): + return -np.inf + return 0.0 + + def _log_probability(self, theta, params, simulator, processor, sigma): + """ + Calculate the log posterior probability function of the nmr parameters. + """ + # calculate log prior. + max_ = [params[k].max for k in params] + min_ = [params[k].min for k in params] + bounds = np.array([max_, min_]) + lp = self._log_prior(theta, bounds) + if not np.isfinite(lp): + return -np.inf + + for k, v in zip(params.keys(), theta): + params[k].value = v + fxn_output = self.minimization_function(params, simulator, processor, sigma) + lnprob = np.asarray(fxn_output).ravel() + lnprob = -0.5 * (lnprob * lnprob).sum() + return lnprob + + @staticmethod + def _update_sim_params(simulator, params): + """ + update the simulation object with the NMR parameters provided. + """ + simulator = copy.deepcopy(simulator) + values = params.valuesdict() + + NMR_params = { + "shielding_symmetric": ["zeta", "eta"], + "quadrupolar": ["Cq", "eta", "beta"], + } + + # Update simulation parameters iso, eta, and zeta for the site object + num_spin = len(simulator.spin_systems) + for i in range(num_spin): + num_site = len(simulator.spin_systems[i].sites) + for j in range(num_site): + site = simulator.spin_systems[i].sites[j] + # Update NMR params + for k, v in NMR_params.items(): + if getattr(site, k): + for p in v: + name = f"sys_{i}_site_{j}_{k}_{p}" + if getattr(getattr(site, k), p) and name in values.keys(): + setattr(getattr(site, k), p, values[name]) + # Update isotropic chemical shift + name = f"sys_{i}_site_{j}_isotropic_chemical_shift" + if site.isotropic_chemical_shift and name in values.keys(): + site.isotropic_chemical_shift = values[name] + return simulator + + @staticmethod + def _update_methods(simulator, params): + """ + update the simulation object with the method parameters provided. + """ + simulator = copy.deepcopy(simulator) + values = params.valuesdict() + # Update the spinning freq here. + for i, method in enumerate(simulator.methods): + for sd in method.spectral_dimensions: + for e in sd.events: + if f"mth_{i}_rotor_frequency" in values: + e.rotor_frequency = values[f"mth_{i}_rotor_frequency"] + return simulator + + @staticmethod + def _update_signal_processors(processors, params): + """ + update the MP signal processor object with the parameters provided. + """ + values = params.valuesdict() + + # Define the dict of possible signal processors. + processors_dict = { + "Gaussian": "FWHM", + "Exponential": "FWHM", + "Scale": "factor", + "ConstantOffset": "offset", + "Linear": ["amplitude", "offset"], + } + + # update the SignalProcessor parameter and apply line broadening. + if isinstance(processors, list): + processors = processors + else: + processors = [processors] + + for i, proc in enumerate(processors): + for j, oper in enumerate(proc.operations): + oper_name = oper.__class__.__name__ + if oper_name in processors_dict: + if oper_name == "Linear": + for attr in processors_dict[oper_name]: + setattr( + oper, + attr, + values[f"SP_{i}_operation_{j}_{oper_name}_{attr}"], + ) + else: + attr = processors_dict[oper_name] + setattr( + oper, + attr, + values[f"SP_{i}_operation_{j}_{oper_name}_{attr}"], + ) + return processors + + def minimization_function(self, params, simulator, processors, sigma): + """ + Definition of the minimization function used to fit the experimental spectrum + """ + + # Update simulation parameters + simulator = self._update_sim_params(simulator, params) + simulator = self._update_methods(simulator, params) + + # run the simulation + simulator.run() + + # update processors + processors = self._update_signal_processors(processors, params) + + # apply signal processing + processed_data = [ + proc.apply_operations(method.simulation) + for proc, method in zip(processors, simulator.methods) + ] + # set sigma as 1 if not defined + sigma = [1.0 for _ in simulator.methods] if sigma is None else sigma + sigma = sigma if isinstance(sigma, list) else [sigma] + + # calculate the residual. + diff = np.asarray([]) + for processed_datum, mth, sigma_ in zip( + processed_data, simulator.methods, sigma + ): + datum = 0 + for decomposed_datum in processed_datum.y: + datum += decomposed_datum.components[0].real + + exp_data = get_correct_data_order(mth) + diff = np.append(diff, (exp_data - datum) / sigma_) + return diff diff --git a/src/mrsimulator/utils/tests/test_monte_carlo.py b/src/mrsimulator/utils/tests/test_monte_carlo.py new file mode 100644 index 000000000..cee7def67 --- /dev/null +++ b/src/mrsimulator/utils/tests/test_monte_carlo.py @@ -0,0 +1,250 @@ +# -*- coding: utf-8 -*- +import copy + +import csdmpy as cp +import mrsimulator.utils.monte_carlo as mc +import mrsimulator.utils.spectral_fitting as sf +import numpy as np +import pytest # noqa +from mrsimulator import signal_processing as sp +from mrsimulator import Simulator +from mrsimulator import Site +from mrsimulator import SpinSystem +from mrsimulator.methods import BlochDecayCTSpectrum +from mrsimulator.methods import BlochDecaySpectrum + + +def test_name_abbrev(): + params = setup_params() + + abv_0 = [ + "delta_iso_0_0", + "zeta_0_0", + "etaCS_0_0", + "Cq_0_0", + "etaQ_0_0", + "abundance_0", + "spin_freq_0", + "spin_freq_1", + "expo_fwhm_0_1", + "gauss_fwhm_0_2", + "scale_0_4", + "offset_0_5", + "linear_0_6", + "scale_1_0", + "offset_1_1", + "linear_1_2", + ] + + abv_1 = mc.name_abbrev(params) + + assert abv_0 == abv_1 + + +def test_vary_only(): + params = setup_params() + + params["sys_0_abundance"].vary = True + params["SP_0_operation_1_Exponential_FWHM"].vary = False + + variables = copy.deepcopy(params) + variables.pop("sys_0_abundance") + variables.pop("SP_0_operation_1_Exponential_FWHM") + + assert mc.mrsim_emcee._vary_only(params) == variables + + +def test_log_prior(): + maximum = [1, np.inf] + minimum = [0, -np.inf] + bounds = np.array([maximum, minimum]) + theta_1 = [0.5, 10] + theta_2 = [10, 10] + + assert mc.mrsim_emcee._log_prior(theta_1, bounds) == 0.0 + assert mc.mrsim_emcee._log_prior(theta_2, bounds) == -np.inf + + +def test_log_probability(): + sim = setup_simulator() + processor = setup_signal_processor() + params = sf.make_LMFIT_params(sim, processor, include="rotor_frequency") + theta_in = np.zeros(len(params)) + 0.5 + theta_in[7] = 1000 + theta_in[8] = 12500 + theta_out = np.zeros(len(params)) + 5 + emcee_obj = mc.mrsim_emcee(params, sim, processor) + + log_prob = emcee_obj._log_probability(theta_in, params, sim, processor, None) + assert log_prob == -2304.0 + + log_prob = emcee_obj._log_probability(theta_out, params, sim, processor, None) + assert log_prob == -np.inf + + +def test_update_sim_params(): + sim_2 = setup_second_simulator() + params_1 = setup_params() + sim_2 = mc.mrsim_emcee._update_sim_params(sim_2, params_1) + + assert sim_2.spin_systems[0].sites[0].isotropic_chemical_shift == 32.0 + assert sim_2.spin_systems[0].sites[0].shielding_symmetric.zeta == -120 + assert sim_2.spin_systems[0].sites[0].shielding_symmetric.eta == 0.1 + assert sim_2.spin_systems[0].sites[0].quadrupolar.Cq == 1e5 + assert sim_2.spin_systems[0].sites[0].quadrupolar.eta == 0.31 + assert sim_2.spin_systems[0].sites[0].quadrupolar.beta == 5.12 + + +def test_update_methods(): + sim_2 = setup_second_simulator() + params_1 = setup_params() + sim_2 = mc.mrsim_emcee._update_methods(sim_2, params_1) + + assert sim_2.methods[0].spectral_dimensions[0].events[0].rotor_frequency == 1000.0 + assert sim_2.methods[1].spectral_dimensions[0].events[0].rotor_frequency == 12500.0 + + +def test_update_signal_processors(): + processor_1 = setup_signal_processor() + processor_2 = setup_second_signal_processor() + params_1 = setup_params() + processor_2 = mc.mrsim_emcee._update_signal_processors(processor_2, params_1) + + for p in enumerate(processor_1): + assert processor_1[p[0]].operations == processor_2[p[0]].operations + + processor_3 = setup_signal_processor()[0] + processor_4 = setup_second_signal_processor()[0] + sim = setup_simulator() + params_3 = sf.make_LMFIT_params(sim, processor_3, include="rotor_frequency") + processor_4 = mc.mrsim_emcee._update_signal_processors(processor_4, params_3)[0] + + assert processor_3.operations == processor_4.operations + + +def test_mcmc(): + sim = setup_simulator() + processor = setup_signal_processor() + params = sf.make_LMFIT_params(sim, processor, include="rotor_frequency") + emcee_obj = mc.mrsim_emcee(params, sim, processor) + + result = emcee_obj.mcmc(steps=50, nwalkers=50, burn=10, thin=5) + assert ((result["accept_frac"] < 1) & (result["accept_frac"] > 0)).sum() == 50 + + +def setup_simulator(): + site = Site( + isotope="23Na", + isotropic_chemical_shift=32, + shielding_symmetric={"zeta": -120, "eta": 0.1}, + quadrupolar={"Cq": 1e5, "eta": 0.31, "beta": 5.12}, + ) + sys = SpinSystem(sites=[site], abundance=0.123) + sim = Simulator() + spectral_dims = [ + { + "count": 4096, + "spectral_width": 20000.0, + "reference_offset": 1220.7031, + "origin_offset": 54237080.0, + } + ] + experiment = cp.as_csdm(np.zeros(4096)) + sim.spin_systems.append(sys) + sim.methods.append( + BlochDecayCTSpectrum( + channels=["2H"], + rotor_frequency=1e3, + spectral_dimensions=spectral_dims, + experiment=experiment, + ) + ) + sim.methods.append( + BlochDecaySpectrum( + channels=["2H"], + rotor_frequency=12.5e3, + spectral_dimensions=spectral_dims, + experiment=experiment, + ) + ) + return sim + + +def setup_second_simulator(): + site = Site( + isotope="27Al", + isotropic_chemical_shift=40, + shielding_symmetric={"zeta": -100, "eta": 0.45}, + quadrupolar={"Cq": 1e6, "eta": 0.31, "beta": 6.37}, + ) + sys = SpinSystem(sites=[site], abundance=0.455) + sim = Simulator() + spectral_dims = [ + { + "count": 1024, + "spectral_width": 20000.0, + "reference_offset": 1220.7031, + "origin_offset": 54237080.0, + } + ] + experiment = cp.as_csdm(np.zeros(1024)) + sim.spin_systems.append(sys) + sim.methods.append( + BlochDecayCTSpectrum( + channels=["2H"], + rotor_frequency=5e3, + spectral_dimensions=spectral_dims, + experiment=experiment, + ) + ) + sim.methods.append( + BlochDecaySpectrum( + channels=["2H"], + rotor_frequency=10e3, + spectral_dimensions=spectral_dims, + experiment=experiment, + ) + ) + return sim + + +def setup_signal_processor(): + op_list1 = [ + sp.IFFT(dim_index=0), + sp.apodization.Exponential(FWHM="100 Hz"), + sp.apodization.Gaussian(FWHM="200 Hz"), + sp.FFT(dim_index=0), + sp.Scale(factor=10), + sp.baseline.ConstantOffset(offset=43.1), + sp.Linear(amplitude=32.9, offset=13.4), + ] + op_list2 = [ + sp.Scale(factor=20), + sp.baseline.ConstantOffset(offset=-43.1), + sp.Linear(amplitude=1.2, offset=0.4), + ] + return [sp.SignalProcessor(operations=op) for op in [op_list1, op_list2]] + + +def setup_second_signal_processor(): + op_list1 = [ + sp.IFFT(dim_index=0), + sp.apodization.Exponential(FWHM="150 Hz"), + sp.apodization.Gaussian(FWHM="250 Hz"), + sp.FFT(dim_index=0), + sp.Scale(factor=8), + sp.baseline.ConstantOffset(offset=33.0), + sp.Linear(amplitude=15.8, offset=11.1), + ] + op_list2 = [ + sp.Scale(factor=15), + sp.baseline.ConstantOffset(offset=-33.0), + sp.Linear(amplitude=1.1, offset=0.4), + ] + return [sp.SignalProcessor(operations=op) for op in [op_list1, op_list2]] + + +def setup_params(): + sim = setup_simulator() + processor = setup_signal_processor() + return sf.make_LMFIT_params(sim, processor, include="rotor_frequency")