diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f99bbd..36094e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.4.0] - 2026-05-?? +- [#27](https://github.com/php1ic/nuclearmasses/pull/27) + * Refactor Converter class in utils into separate modules + * Update how the column locations of the parameters are set and accessed + ## [0.3.0] - 2026-04-28 - [#26](https://github.com/php1ic/nuclearmasses/pull/26) * Optimise initial parsing of the files. diff --git a/src/nuclearmasses/io/ame.py b/src/nuclearmasses/io/ame.py index 4a3422a..9cfaadd 100644 --- a/src/nuclearmasses/io/ame.py +++ b/src/nuclearmasses/io/ame.py @@ -7,8 +7,8 @@ import pandas as pd from nuclearmasses.io.ame_mass_parse import AMEMassParser -from nuclearmasses.io.ame_reaction_1_parse import AMEReactionParserOne -from nuclearmasses.io.ame_reaction_2_parse import AMEReactionParserTwo +from nuclearmasses.io.ame_reaction_1_parse import AMEReactionOneParser +from nuclearmasses.io.ame_reaction_2_parse import AMEReactionTwoParser class AME: @@ -89,8 +89,8 @@ def parse_year(self, year: int) -> pd.DataFrame: ame_mass, ame_reaction_1, ame_reaction_2 = self.get_datafiles(year) mass_df = AMEMassParser(filename=ame_mass, year=year).read_file() - rct1_df = AMEReactionParserOne(filename=ame_reaction_1, year=year).read_file() - rct2_df = AMEReactionParserTwo(filename=ame_reaction_2, year=year).read_file() + rct1_df = AMEReactionOneParser(filename=ame_reaction_1, year=year).read_file() + rct2_df = AMEReactionTwoParser(filename=ame_reaction_2, year=year).read_file() # Merge all 3 of the AME dataframes into one common_columns = ["A", "Z", "N", "TableYear", "Symbol", "DataSource"] diff --git a/src/nuclearmasses/io/ame_mass_file.py b/src/nuclearmasses/io/ame_mass_file.py index 550f0e2..943a8b8 100644 --- a/src/nuclearmasses/io/ame_mass_file.py +++ b/src/nuclearmasses/io/ame_mass_file.py @@ -1,23 +1,27 @@ """ -The ame_mass_file module defines the ``AMEMassFile`` class. 
This class stores the column positions of the start and -finish location of the different parameters recorded in the AME mass data file. The positions have changed between -years so the year of the table is given as a parameter at construction. +The ame_mass_file module defines the ``AMEMassLayout`` and ``AMEMassFile`` classes. The ``AMEMassLayout`` class acts +like a base class, storing the common column names and the start and end positions of the values within the AME data +file. The positions change as time progresses so the ``AMEMassFile`` class uses the year, passed as a parameter, to +update the values as required. + +The years 2003, 2012 and 2016 have identical formatting so are used as the base, not the 1983 format. """ +import dataclasses -class AMEMassFile: + +@dataclasses.dataclass(kw_only=True) +class AMEMassLayout: """ - Storage class for the data in the AME mass data file. + Storage class for the most common data in the AME mass data file. The AME mass data file is fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -29,149 +33,158 @@ class AMEMassFile: The first column of parameter X. END_X : int or None The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + columns : list[str] + The list of columns that appear in the file + positions : list[tuple[str, str, str]] + A list of tuples giving each column name alongside the names of the attributes holding its start and end position in the line. 
""" - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1983: - self.HEADER = 35 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 39 - self.START_DME = 41 - self.END_DME = 48 - self.START_BE_PER_A = 49 - self.END_BE_PER_A = 59 - self.START_DBE_PER_A = 61 - self.END_DBE_PER_A = 68 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 85 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 94 - self.START_AM = 97 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 110 - self.START_MICRO_DU = 113 - self.END_MICRO_DU = 120 - case 1993: - self.HEADER = 40 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 39 - self.START_DME = 41 - self.END_DME = 48 - self.START_BE_PER_A = 49 - self.END_BE_PER_A = 59 - self.START_DBE_PER_A = 61 - self.END_DBE_PER_A = 68 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 85 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 94 - self.START_AM = 97 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 110 - self.START_MICRO_DU = 112 - self.END_MICRO_DU = 120 - case 1995: - self.HEADER = 39 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 39 - self.START_DME = 41 - self.END_DME = 48 - self.START_BE_PER_A = 49 - self.END_BE_PER_A = 59 - self.START_DBE_PER_A = 61 - self.END_DBE_PER_A = 68 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 85 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 94 - self.START_AM = 97 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 110 - self.START_MICRO_DU = 112 - self.END_MICRO_DU = 120 - case 2020: - self.HEADER = 36 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A 
= 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 42 - self.START_DME = 43 - self.END_DME = 53 - self.START_BE_PER_A = 56 - self.END_BE_PER_A = 66 - self.START_DBE_PER_A = 69 - self.END_DBE_PER_A = 77 - self.START_BETA_DECAY_ENERGY = 82 - self.END_BETA_DECAY_ENERGY = 93 - self.START_DBETA_DECAY_ENERGY = 95 - self.END_DBETA_DECAY_ENERGY = 104 - self.START_AM = 106 - self.END_AM = 109 - self.START_MICRO_U = 110 - self.END_MICRO_U = 120 - self.START_MICRO_DU = 124 - self.END_MICRO_DU = 135 - case _: - self.HEADER = 39 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 41 - self.START_DME = 42 - self.END_DME = 53 - self.START_BE_PER_A = 54 - self.END_BE_PER_A = 64 - self.START_DBE_PER_A = 65 - self.END_DBE_PER_A = 72 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 86 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 95 - self.START_AM = 96 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 112 - self.START_MICRO_DU = 113 - self.END_MICRO_DU = 120 - - self.column_limits = [ - (self.START_Z, self.END_Z), - (self.START_A, self.END_A), - (self.START_ME, self.END_ME), - (self.START_DME, self.END_DME), - (self.START_BE_PER_A, self.END_BE_PER_A), - (self.START_DBE_PER_A, self.END_DBE_PER_A), - (self.START_BETA_DECAY_ENERGY, self.END_BETA_DECAY_ENERGY), - (self.START_DBETA_DECAY_ENERGY, self.END_DBETA_DECAY_ENERGY), - (self.START_AM, self.END_AM), - (self.START_MICRO_U, self.END_MICRO_U), - (self.START_MICRO_DU, self.END_MICRO_DU), + HEADER: int = 39 + FOOTER: int = 0 + START_Z: int = 11 + END_Z: int = 14 + START_A: int = 16 + END_A: int = 19 + START_AMEMassExcess: int = 29 + END_AMEMassExcess: int = 41 + START_AMEMassExcessError: int = 42 + END_AMEMassExcessError: int = 53 + START_BindingEnergyPerA: int = 54 + END_BindingEnergyPerA: int = 64 + START_BindingEnergyPerAError: int = 65 + END_BindingEnergyPerAError: int = 72 + 
START_BetaDecayEnergy: int = 76 + END_BetaDecayEnergy: int = 86 + START_BetaDecayEnergyError: int = 87 + END_BetaDecayEnergyError: int = 95 + START_AtomicNumber: int = 96 + END_AtomicNumber: int = 99 + START_AtomicMass: int = 100 + END_AtomicMass: int = 112 + START_AtomicMassError: int = 113 + END_AtomicMassError: int = 120 + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "Z", + "A", + "AMEMassExcess", + "AMEMassExcessError", + "BindingEnergyPerA", + "BindingEnergyPerAError", + "BetaDecayEnergy", + "BetaDecayEnergyError", + "AtomicNumber", + "AtomicMass", + "AtomicMassError", ] + + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class AMEMassFile: + """ + Storage class for the year specific data in the AME mass data file. + + The base ``AMEMassLayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + AME_MASS_YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``AMEMassLayout``. + layout : AMEMassLayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + AME_MASS_YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, + 1983: { + "HEADER": 35, + "END_AMEMassExcess": 39, + "START_AMEMassExcessError": 41, + "END_AMEMassExcessError": 48, + "START_BindingEnergyPerA": 49, + "END_BindingEnergyPerA": 59, + "START_BindingEnergyPerAError": 61, + "END_BindingEnergyPerAError": 68, + "START_BetaDecayEnergy": 76, + "END_BetaDecayEnergy": 85, + "START_BetaDecayEnergyError": 87, + "END_BetaDecayEnergyError": 94, + "START_AtomicNumber": 97, + "END_AtomicNumber": 99, + "START_AtomicMass": 100, + "END_AtomicMass": 110, + }, + 1993: { + "HEADER": 40, + "END_AMEMassExcess": 39, + "START_AMEMassExcessError": 41, + "END_AMEMassExcessError": 48, + "START_BindingEnergyPerA": 49, + "END_BindingEnergyPerA": 59, + "START_BindingEnergyPerAError": 61, + "END_BindingEnergyPerAError": 68, + "START_BetaDecayEnergy": 76, + "END_BetaDecayEnergy": 85, + "START_BetaDecayEnergyError": 87, + "END_BetaDecayEnergyError": 94, + "START_AtomicNumber": 97, + "END_AtomicNumber": 99, + "START_AtomicMass": 100, + "END_AtomicMass": 110, + "START_AtomicMassError": 112, + }, + 1995: { + "END_AMEMassExcess": 39, + "START_AMEMassExcessError": 41, + "END_AMEMassExcessError": 48, + "START_BindingEnergyPerA": 49, + "END_BindingEnergyPerA": 59, + "START_BindingEnergyPerAError": 61, + "END_BindingEnergyPerAError": 68, + "START_BetaDecayEnergy": 76, + "END_BetaDecayEnergy": 85, + "START_BetaDecayEnergyError": 87, + "END_BetaDecayEnergyError": 94, + "START_AtomicNumber": 97, + "END_AtomicNumber": 99, + "START_AtomicMass": 100, + "END_AtomicMass": 110, + "START_AtomicMassError": 112, + }, + # The years 2003, 2012 and 2016 have identical formatting so are used as the base + 2003: {}, + 2012: {}, + 2016: {}, + 2020: { + "HEADER": 36, + "END_AMEMassExcess": 42, + "START_AMEMassExcessError": 43, + "END_AMEMassExcessError": 53, + "START_BindingEnergyPerA": 56, + "END_BindingEnergyPerA": 66, + "START_BindingEnergyPerAError": 69, + 
"END_BindingEnergyPerAError": 77, + "START_BetaDecayEnergy": 82, + "END_BetaDecayEnergy": 93, + "START_BetaDecayEnergyError": 95, + "END_BetaDecayEnergyError": 104, + "START_AtomicNumber": 106, + "END_AtomicNumber": 109, + "START_AtomicMass": 110, + "END_AtomicMass": 120, + "START_AtomicMassError": 124, + "END_AtomicMassError": 135, + }, + } + + def __init__(self, year: int) -> None: + self.layout = AMEMassLayout( + **AMEMassFile.AME_MASS_YEAR_OVERRIDES.get(year, AMEMassFile.AME_MASS_YEAR_OVERRIDES["default"]) + ) diff --git a/src/nuclearmasses/io/ame_mass_parse.py b/src/nuclearmasses/io/ame_mass_parse.py index 98ae737..1bf135c 100644 --- a/src/nuclearmasses/io/ame_mass_parse.py +++ b/src/nuclearmasses/io/ame_mass_parse.py @@ -7,10 +7,12 @@ import pandas as pd from nuclearmasses.io.ame_mass_file import AMEMassFile -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.utils.dataframe_utils import calculate_relative_error, read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput -class AMEMassParser(AMEMassFile, Converter): +class AMEMassParser: """ Parse the AME mass file, doing the necessary preparation and clean ups of data. @@ -33,9 +35,13 @@ class AMEMassParser(AMEMassFile, Converter): """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year: int = year + self.layout = AMEMassFile(year=year).layout + + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] def _column_names(self) -> list[str]: """ @@ -46,19 +52,7 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. 
""" - return [ - "Z", - "A", - "AMEMassExcess", - "AMEMassExcessError", - "BindingEnergyPerA", - "BindingEnergyPerAError", - "BetaDecayEnergy", - "BetaDecayEnergyError", - "AtomicNumber", - "AtomicMass", - "AtomicMassError", - ] + return self.layout.columns def _data_types(self) -> dict: """ @@ -106,30 +100,6 @@ def _na_values(self) -> dict: return na_vals - def calculate_relative_error(self, raw_df) -> pd.DataFrame: - """ - Calculate the relative error of the mass excess. - - 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN - value in the column for 12C so we will manually correct and set to 0.0. - - Parameters - ---------- - raw_df : pandas.DataFrame - The raw dataframe upon which we will act. - - Returns - ------- - pandas.DataFrame - The updated dataframe with a new relative mass excess column. - """ - raw_df["AMERelativeError"] = abs( - raw_df["AMEMassExcessError"].astype(float) / raw_df["AMEMassExcess"].astype(float) - ) - raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), "AMERelativeError"] = 0.0 - - return raw_df - def read_file(self) -> pd.DataFrame: """ Read the file-like object ``self.filename`` into a dataframe @@ -142,19 +112,19 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the AME mass data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = self.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 
table @@ -180,11 +150,11 @@ def read_file(self) -> pd.DataFrame: # We need to rescale the error value because we combined the two columns above df = df.assign(AtomicMassError=df["AtomicMassError"].astype(float) / 1.0e6) - df = self.calculate_relative_error(df) + df = calculate_relative_error(df, "AME") df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/io/ame_reaction_1_file.py b/src/nuclearmasses/io/ame_reaction_1_file.py index c2c16d7..abb3bf9 100644 --- a/src/nuclearmasses/io/ame_reaction_1_file.py +++ b/src/nuclearmasses/io/ame_reaction_1_file.py @@ -1,23 +1,27 @@ """ -The ame_reaction_1_file module defines the ``AMEReactionFileOne`` class. This class stores the column positions of the -start and finish location of the different parameters recorded in the AME reaction 1 data file. The positions have -changed between years so the year of the table is given as a parameter at construction. +The ame_reaction_1_file module defines the ``AMEReactionOneLayout`` and ``AMEReactionOneFile`` classes. +The ``AMEReactionOneLayout`` class acts like a base class, storing the common column names and the start and end +positions of the values within the AME data file. The positions change as time progresses so the ``AMEReactionOneFile`` +class uses the year, passed as a parameter, to update the values as required. + +The years 1995, 2003, 2012 and 2016 have identical formatting so are used as the base, not the 1983 format. """ +import dataclasses + -class AMEReactionFileOne: +@dataclasses.dataclass(kw_only=True) +class AMEReactionOneLayout: """ - Storage class for the data in the AME reaction 1 data file. + Storage class for the most common data in the AME Reaction 1 data file. 
- The AME reaction 1 data file is fixed-width file format so we will store the format details in this class. + The AME Reaction 1 data file is a fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -29,124 +33,146 @@ class AMEReactionFileOne: The first column of parameter X. END_X : int or None The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + columns : list[str] + The list of columns that appear in the file + positions : list[tuple[str, str, str]] + A list of tuples giving each column name alongside the names of the attributes holding its start and end position in the line. 
""" - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1983: - self.HEADER = 30 - self.FOOTER = 0 - self.START_R1_A = 1 - self.END_R1_A = 4 - self.START_R1_Z = 8 - self.END_R1_Z = 11 - self.START_S2N = 14 - self.END_S2N = 22 - self.START_DS2N = 24 - self.END_DS2N = 30 - self.START_S2P = 32 - self.END_S2P = 39 - self.START_DS2P = 43 - self.END_DS2P = 47 - self.START_QA = 50 - self.END_QA = 57 - self.START_DQA = 60 - self.END_DQA = 65 - self.START_Q2B = 68 - self.END_Q2B = 75 - self.START_DQ2B = 78 - self.END_DQ2B = 83 - self.START_QEP = 86 - self.END_QEP = 93 - self.START_DQEP = 96 - self.END_DQEP = 101 - self.START_QBN = 103 - self.END_QBN = 111 - self.START_DQBN = 114 - self.END_DQBN = 119 - case 2020: - self.HEADER = 35 - self.FOOTER = 0 - self.START_R1_A = 1 - self.END_R1_A = 4 - self.START_R1_Z = 8 - self.END_R1_Z = 11 - self.START_S2N = 14 - self.END_S2N = 24 - self.START_DS2N = 25 - self.END_DS2N = 34 - self.START_S2P = 36 - self.END_S2P = 46 - self.START_DS2P = 47 - self.END_DS2P = 56 - self.START_QA = 58 - self.END_QA = 68 - self.START_DQA = 69 - self.END_DQA = 78 - self.START_Q2B = 79 - self.END_Q2B = 90 - self.START_DQ2B = 91 - self.END_DQ2B = 100 - self.START_QEP = 101 - self.END_QEP = 112 - self.START_DQEP = 113 - self.END_DQEP = 122 - self.START_QBN = 123 - self.END_QBN = 134 - self.START_DQBN = 135 - self.END_DQBN = 144 - case _: - match year: - case 1995 | 2003 | 2012 | 2016: - self.HEADER = 39 - case 1993: - self.HEADER = 40 - self.FOOTER = 0 - self.START_R1_A = 1 - self.END_R1_A = 4 - self.START_R1_Z = 8 - self.END_R1_Z = 11 - self.START_S2N = 14 - self.END_S2N = 22 - self.START_DS2N = 23 - self.END_DS2N = 30 - self.START_S2P = 32 - self.END_S2P = 40 - self.START_DS2P = 41 - self.END_DS2P = 48 - self.START_QA = 50 - self.END_QA = 58 - self.START_DQA = 59 - self.END_DQA = 66 - self.START_Q2B = 67 - self.END_Q2B = 76 - self.START_DQ2B = 77 - self.END_DQ2B = 84 - self.START_QEP = 85 - self.END_QEP = 94 - 
self.START_DQEP = 95 - self.END_DQEP = 102 - self.START_QBN = 103 - self.END_QBN = 112 - self.START_DQBN = 113 - self.END_DQBN = 125 - - self.column_limits = [ - (self.START_R1_A, self.END_R1_A), - (self.START_R1_Z, self.END_R1_Z), - (self.START_S2N, self.END_S2N), - (self.START_DS2N, self.END_DS2N), - (self.START_S2P, self.END_S2P), - (self.START_DS2P, self.END_DS2P), - (self.START_QA, self.END_QA), - (self.START_DQA, self.END_DQA), - (self.START_Q2B, self.END_Q2B), - (self.START_DQ2B, self.END_DQ2B), - (self.START_QEP, self.END_QEP), - (self.START_DQEP, self.END_DQEP), - (self.START_QBN, self.END_QBN), - (self.START_DQBN, self.END_DQBN), + HEADER: int = 39 + FOOTER: int = 0 + START_A: int = 1 + END_A: int = 4 + START_Z: int = 8 + END_Z: int = 11 + START_TwoNeutronSeparationEnergy: int = 14 + END_TwoNeutronSeparationEnergy: int = 22 + START_TwoNeutronSeparationEnergyError: int = 23 + END_TwoNeutronSeparationEnergyError: int = 30 + START_TwoProtonSeparationEnergy: int = 32 + END_TwoProtonSeparationEnergy: int = 40 + START_TwoProtonSeparationEnergyError: int = 41 + END_TwoProtonSeparationEnergyError: int = 48 + START_QAlpha: int = 50 + END_QAlpha: int = 58 + START_QAlphaError: int = 59 + END_QAlphaError: int = 66 + START_QTwoBeta: int = 67 + END_QTwoBeta: int = 76 + START_QTwoBetaError: int = 77 + END_QTwoBetaError: int = 84 + START_QEpsilon: int = 85 + END_QEpsilon: int = 94 + START_QEpsilonError: int = 95 + END_QEpsilonError: int = 102 + START_QBetaNeutron: int = 103 + END_QBetaNeutron: int = 112 + START_QBetaNeutronError: int = 113 + END_QBetaNeutronError: int = 125 + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "A", + "Z", + "TwoNeutronSeparationEnergy", + "TwoNeutronSeparationEnergyError", + "TwoProtonSeparationEnergy", + "TwoProtonSeparationEnergyError", + "QAlpha", + "QAlphaError", + "QTwoBeta", + "QTwoBetaError", + "QEpsilon", + "QEpsilonError", + "QBetaNeutron", + "QBetaNeutronError", ] + + self.positions: list[tuple[str, str, str]] = 
[(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class AMEReactionOneFile: + """ + Storage class for the year specific data in the AME reaction 1 data file. + + The base ``AMEReactionOneLayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``AMEReactionOneLayout``. + layout : AMEReactionOneLayout + A storage class containing details of parameters and their locations in the line. + """ + + YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, + 1983: { + "HEADER": 30, + "START_TwoNeutronSeparationEnergyError": 24, + "END_TwoNeutronSeparationEnergyError": 30, + "START_TwoProtonSeparationEnergy": 32, + "END_TwoProtonSeparationEnergy": 39, + "START_TwoProtonSeparationEnergyError": 43, + "END_TwoProtonSeparationEnergyError": 47, + "START_QAlpha": 50, + "END_QAlpha": 57, + "START_QAlphaError": 60, + "END_QAlphaError": 65, + "START_QTwoBeta": 68, + "END_QTwoBeta": 75, + "START_QTwoBetaError": 78, + "END_QTwoBetaError": 83, + "START_QEpsilon": 86, + "END_QEpsilon": 93, + "START_QEpsilonError": 96, + "END_QEpsilonError": 101, + "START_QBetaNeutron": 103, + "END_QBetaNeutron": 111, + "START_QBetaNeutronError": 114, + "END_QBetaNeutronError": 119, + }, + 1993: { + "HEADER": 40, + }, + # 1995 - 2016 are the base years + 1995: {}, + 2003: {}, + 2012: {}, + 2016: {}, + 2020: { + "HEADER": 35, + "END_TwoNeutronSeparationEnergy": 24, + "START_TwoNeutronSeparationEnergyError": 25, + "END_TwoNeutronSeparationEnergyError": 34, + "START_TwoProtonSeparationEnergy": 36, + "END_TwoProtonSeparationEnergy": 46, + "START_TwoProtonSeparationEnergyError": 47, + "END_TwoProtonSeparationEnergyError": 56, + "START_QAlpha": 58, + "END_QAlpha": 68, + "START_QAlphaError": 69, + "END_QAlphaError": 78, + "START_QTwoBeta": 79, + "END_QTwoBeta": 90, + 
"START_QTwoBetaError": 91, + "END_QTwoBetaError": 100, + "START_QEpsilon": 101, + "END_QEpsilon": 112, + "START_QEpsilonError": 113, + "END_QEpsilonError": 122, + "START_QBetaNeutron": 123, + "END_QBetaNeutron": 134, + "START_QBetaNeutronError": 135, + "END_QBetaNeutronError": 144, + }, + } + + def __init__(self, year: int) -> None: + self.layout = AMEReactionOneLayout( + **AMEReactionOneFile.YEAR_OVERRIDES.get(year, AMEReactionOneFile.YEAR_OVERRIDES["default"]) + ) diff --git a/src/nuclearmasses/io/ame_reaction_1_parse.py b/src/nuclearmasses/io/ame_reaction_1_parse.py index 91b07cb..2559496 100644 --- a/src/nuclearmasses/io/ame_reaction_1_parse.py +++ b/src/nuclearmasses/io/ame_reaction_1_parse.py @@ -1,16 +1,18 @@ """ -The ame_reaction_1_parse module defines the ``AMEReactionParserOne`` class. This class contains the logic required to +The ame_reaction_1_parse module defines the ``AMEReactionOneParser`` class. This class contains the logic required to sort and organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and inconsistencies are cleaned from the resultant dataframe. """ import pandas as pd -from nuclearmasses.io.ame_reaction_1_file import AMEReactionFileOne -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.io.ame_reaction_1_file import AMEReactionOneFile +from nuclearmasses.utils.dataframe_utils import read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput -class AMEReactionParserOne(AMEReactionFileOne, Converter): +class AMEReactionOneParser: """ Parse the first AME reaction file, doing the necessary preparation and clean ups of data. 
@@ -33,9 +35,13 @@ class AMEReactionParserOne(AMEReactionFileOne, Converter): """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year = year + self.layout = AMEReactionOneFile(year).layout + + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] def _column_names(self) -> list[str]: """ @@ -46,22 +52,7 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. """ - return [ - "A", - "Z", - "TwoNeutronSeparationEnergy", - "TwoNeutronSeparationEnergyError", - "TwoProtonSeparationEnergy", - "TwoProtonSeparationEnergyError", - "QAlpha", - "QAlphaError", - "QTwoBeta", - "QTwoBetaError", - "QEpsilon", - "QEpsilonError", - "QBetaNeutron", - "QBetaNeutronError", - ] + return self.layout.columns def _data_types(self) -> dict: """ @@ -130,19 +121,19 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the first AME reaction data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = self.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -152,7 +143,7 @@ def read_file(self) -> pd.DataFrame: df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = 
pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/io/ame_reaction_2_file.py b/src/nuclearmasses/io/ame_reaction_2_file.py index c3158a3..9568485 100644 --- a/src/nuclearmasses/io/ame_reaction_2_file.py +++ b/src/nuclearmasses/io/ame_reaction_2_file.py @@ -1,23 +1,27 @@ """ -The ame_reaction_2_file module defines the ``AMEReactionFileTwo`` class. This class stores the column positions of the -start and finish location of the different parameters recorded in the AME reaction 2 data file. The positions have -changed between years so the year of the table is given as a parameter at construction. +The ame_reaction_2_file module defines the ``AMEReactionTwoLayout`` and ``AMEReactionTwoFile`` classes. +The ``AMEReactionTwoLayout`` class acts like a base class, storing the common column names and the start and end +positions of the values within the AME data file. The positions change as time progresses so the ``AMEReactionTwoFile`` +class uses the year, passed as a parameter, to update the values as required. + +The years 1995, 2003, 2012 and 2016 have identical formatting so are used as the base, not the 1983 format. """ +import dataclasses + -class AMEReactionFileTwo: +@dataclasses.dataclass(kw_only=True) +class AMEReactionTwoLayout: """ - Storage class for the data in the AME reaction 2 data file. + Storage class for the most common data in the AME reaction 2 data file. The AME reaction 2 data file is fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. 
Attributes ---------- @@ -29,124 +33,148 @@ class AMEReactionFileTwo: The first column of parameter X. END_X : int or None The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + columns : list[str] + The list of columns that appear in the file + positions : list[tuple[str, str, str]] + A list of tuples giving each column name alongside the names of the attributes holding its start and end position in the line. """ - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1983: - self.HEADER = 30 - self.FOOTER = 0 - self.START_R2_A = 1 - self.END_R2_A = 4 - self.START_R2_Z = 8 - self.END_R2_Z = 11 - self.START_SN = 14 - self.END_SN = 22 - self.START_DSN = 24 - self.END_DSN = 28 - self.START_SP = 30 - self.END_SP = 40 - self.START_DSP = 42 - self.END_DSP = 48 - self.START_Q4B = 49 - self.END_Q4B = 57 - self.START_DQ4B = 60 - self.END_DQ4B = 65 - self.START_QDA = 68 - self.END_QDA = 76 - self.START_DQDA = 78 - self.END_DQDA = 84 - self.START_QPA = 86 - self.END_QPA = 94 - self.START_DQPA = 96 - self.END_DQPA = 102 - self.START_QNA = 103 - self.END_QNA = 112 - self.START_DQNA = 114 - self.END_DQNA = 120 - case 2020: - self.HEADER = 37 - self.FOOTER = 15 - self.START_R2_A = 1 - self.END_R2_A = 4 - self.START_R2_Z = 8 - self.END_R2_Z = 11 - self.START_SN = 14 - self.END_SN = 24 - self.START_DSN = 25 - self.END_DSN = 34 - self.START_SP = 36 - self.END_SP = 46 - self.START_DSP = 47 - self.END_DSP = 56 - self.START_Q4B = 57 - self.END_Q4B = 68 - self.START_DQ4B = 69 - self.END_DQ4B = 78 - self.START_QDA = 79 - self.END_QDA = 90 - self.START_DQDA = 91 - self.END_DQDA = 100 - self.START_QPA = 101 - self.END_QPA = 112 - self.START_DQPA = 113 - self.END_DQPA = 122 - self.START_QNA = 123 - self.END_QNA = 134 - self.START_DQNA = 135 - self.END_DQNA = 144 - case _: - match year: - case 1995 | 2003 | 2012 | 2016: - 
self.HEADER = 39 - case 1993: - self.HEADER = 40 - self.FOOTER = 0 - self.START_R2_A = 1 - self.END_R2_A = 4 - self.START_R2_Z = 8 - self.END_R2_Z = 11 - self.START_SN = 14 - self.END_SN = 22 - self.START_DSN = 23 - self.END_DSN = 30 - self.START_SP = 32 - self.END_SP = 40 - self.START_DSP = 41 - self.END_DSP = 48 - self.START_Q4B = 49 - self.END_Q4B = 58 - self.START_DQ4B = 59 - self.END_DQ4B = 66 - self.START_QDA = 67 - self.END_QDA = 76 - self.START_DQDA = 77 - self.END_DQDA = 84 - self.START_QPA = 85 - self.END_QPA = 94 - self.START_DQPA = 95 - self.END_DQPA = 102 - self.START_QNA = 103 - self.END_QNA = 112 - self.START_DQNA = 113 - self.END_DQNA = 125 - - self.column_limits = [ - (self.START_R2_A, self.END_R2_A), - (self.START_R2_Z, self.END_R2_Z), - (self.START_SN, self.END_SN), - (self.START_DSN, self.END_DSN), - (self.START_SP, self.END_SP), - (self.START_DSP, self.END_DSP), - (self.START_Q4B, self.END_Q4B), - (self.START_DQ4B, self.END_DQ4B), - (self.START_QDA, self.END_QDA), - (self.START_DQDA, self.END_DQDA), - (self.START_QPA, self.END_QPA), - (self.START_DQPA, self.END_DQPA), - (self.START_QNA, self.END_QNA), - (self.START_DQNA, self.END_DQNA), + HEADER: int = 39 + FOOTER: int = 0 + START_A: int = 1 + END_A: int = 4 + START_Z: int = 8 + END_Z: int = 11 + START_OneNeutronSeparationEnergy: int = 14 + END_OneNeutronSeparationEnergy: int = 22 + START_OneNeutronSeparationEnergyError: int = 23 + END_OneNeutronSeparationEnergyError: int = 30 + START_OneProtonSeparationEnergy: int = 32 + END_OneProtonSeparationEnergy: int = 40 + START_OneProtonSeparationEnergyError: int = 41 + END_OneProtonSeparationEnergyError: int = 48 + START_QFourBeta: int = 49 + END_QFourBeta: int = 58 + START_QFourBetaError: int = 59 + END_QFourBetaError: int = 66 + START_QDeuteronAlpha: int = 67 + END_QDeuteronAlpha: int = 76 + START_QDeuteronAlphaError: int = 77 + END_QDeuteronAlphaError: int = 84 + START_QProtonAlpha: int = 85 + END_QProtonAlpha: int = 94 + START_QProtonAlphaError: 
int = 95 + END_QProtonAlphaError: int = 102 + START_QNeutronAlpha: int = 103 + END_QNeutronAlpha: int = 112 + START_QNeutronAlphaError: int = 113 + END_QNeutronAlphaError: int = 125 + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "A", + "Z", + "OneNeutronSeparationEnergy", + "OneNeutronSeparationEnergyError", + "OneProtonSeparationEnergy", + "OneProtonSeparationEnergyError", + "QFourBeta", + "QFourBetaError", + "QDeuteronAlpha", + "QDeuteronAlphaError", + "QProtonAlpha", + "QProtonAlphaError", + "QNeutronAlpha", + "QNeutronAlphaError", ] + + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class AMEReactionTwoFile: + """ + Storage class for the year specific data in the AME reaction 2 data file. + + The base ``AMEReactionTwoLayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``AMEReactionTwoLayout``. + layout : AMEReactionTwoLayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, + 1983: { + "HEADER": 30, + "START_OneNeutronSeparationEnergyError": 24, + "END_OneNeutronSeparationEnergyError": 28, + "START_OneProtonSeparationEnergy": 30, + "END_OneProtonSeparationEnergy": 40, + "START_OneProtonSeparationEnergyError": 42, + "END_OneProtonSeparationEnergyError": 48, + "START_QFourBeta": 49, + "END_QFourBeta": 57, + "START_QFourBetaError": 60, + "END_QFourBetaError": 65, + "START_QDeuteronAlpha": 68, + "END_QDeuteronAlpha": 76, + "START_QDeuteronAlphaError": 78, + "END_QDeuteronAlphaError": 84, + "START_QProtonAlpha": 86, + "END_QProtonAlpha": 94, + "START_QProtonAlphaError": 96, + "END_QProtonAlphaError": 102, + "START_QNeutronAlpha": 103, + "END_QNeutronAlpha": 112, + "START_QNeutronAlphaError": 114, + "END_QNeutronAlphaError": 120, + }, + 1993: { + "HEADER": 40, + }, + # 1995 - 2016 are the base years + 1995: {}, + 2003: {}, + 2012: {}, + 2016: {}, + 2020: { + "HEADER": 37, + "FOOTER": 15, + "END_OneNeutronSeparationEnergy": 24, + "START_OneNeutronSeparationEnergyError": 25, + "END_OneNeutronSeparationEnergyError": 34, + "START_OneProtonSeparationEnergy": 36, + "END_OneProtonSeparationEnergy": 46, + "START_OneProtonSeparationEnergyError": 47, + "END_OneProtonSeparationEnergyError": 56, + "START_QFourBeta": 57, + "END_QFourBeta": 68, + "START_QFourBetaError": 69, + "END_QFourBetaError": 78, + "START_QDeuteronAlpha": 79, + "END_QDeuteronAlpha": 90, + "START_QDeuteronAlphaError": 91, + "END_QDeuteronAlphaError": 100, + "START_QProtonAlpha": 101, + "END_QProtonAlpha": 112, + "START_QProtonAlphaError": 113, + "END_QProtonAlphaError": 122, + "START_QNeutronAlpha": 123, + "END_QNeutronAlpha": 134, + "START_QNeutronAlphaError": 134, + "END_QNeutronAlphaError": 144, + }, + } + + def __init__(self, year: int) -> None: + self.layout = AMEReactionTwoLayout( + **AMEReactionTwoFile.YEAR_OVERRIDES.get(year, AMEReactionTwoFile.YEAR_OVERRIDES["default"]) + ) diff --git 
a/src/nuclearmasses/io/ame_reaction_2_parse.py b/src/nuclearmasses/io/ame_reaction_2_parse.py index 11bd258..4a6f49b 100644 --- a/src/nuclearmasses/io/ame_reaction_2_parse.py +++ b/src/nuclearmasses/io/ame_reaction_2_parse.py @@ -1,16 +1,18 @@ """ -The ame_reaction_2_parse module defines the ``AMEReactionParserTwo`` class. This class contains the logic required to +The ame_reaction_2_parse module defines the ``AMEReactionTwoParser`` class. This class contains the logic required to sort and organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and inconsistencies are cleaned from the resultant dataframe. """ import pandas as pd -from nuclearmasses.io.ame_reaction_2_file import AMEReactionFileTwo -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.io.ame_reaction_2_file import AMEReactionTwoFile +from nuclearmasses.utils.dataframe_utils import read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput -class AMEReactionParserTwo(AMEReactionFileTwo, Converter): +class AMEReactionTwoParser: """ Parse the second AME reaction file, doing the necessary preparation and clean ups of data. @@ -33,9 +35,13 @@ class AMEReactionParserTwo(AMEReactionFileTwo, Converter): """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year = year + self.layout = AMEReactionTwoFile(year).layout + + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] def _column_names(self) -> list[str]: """ @@ -46,22 +52,7 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. 
""" - return [ - "A", - "Z", - "OneNeutronSeparationEnergy", - "OneNeutronSeparationEnergyError", - "OneProtonSeparationEnergy", - "OneProtonSeparationEnergyError", - "QFourBeta", - "QFourBetaError", - "QDeuteronAlpha", - "QDeuteronAlphaError", - "QProtonAlpha", - "QProtonAlphaError", - "QNeutronAlpha", - "QNeutronAlphaError", - ] + return self.layout.columns def _data_types(self) -> dict: """ @@ -130,19 +121,19 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the second AME reaction data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = self.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -156,7 +147,7 @@ def read_file(self) -> pd.DataFrame: # Repeated column heading also means we have to cast to create new columns df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/io/nubase_file.py b/src/nuclearmasses/io/nubase_file.py index dd3abf4..54ecc9c 100644 --- a/src/nuclearmasses/io/nubase_file.py +++ b/src/nuclearmasses/io/nubase_file.py @@ -1,23 +1,25 @@ """ -The nubase_file module defines the ``NUBASEFile`` class. 
This class stores the column positions of the start and finish -location of the different parameters recorded in the NUBASE data file. The positions have changed between years so the -year of the table is given as a parameter at construction. +The nubase_file module defines the ``NUBASELayout`` and ``NUBASEFile`` classes. The ``NUBASELayout`` class acts like a +base class, storing the original column names and the start and end positions of the values within the NUBASE data file. +The positions change as time progress so the ``NUBASEFile`` class uses the year, passed as a parameter, to update the +values as required. """ +import dataclasses -class NUBASEFile: + +@dataclasses.dataclass(kw_only=True) +class NUBASELayout: """ - Storage class for the data in the NUBASE data file. + Storage class for the original data in the NUBASE data file. The NUBASE data file is fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -26,145 +28,135 @@ class NUBASEFile: FOOTER : int The number of lines in the file to be interpreted as the footer. START_X : int - The first column of parameter X. + The first column of parameter X or None if X is not in the datafile for that year. END_X : int or None - The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + The last column of parameter X or None if X is not in the datafile for that year. 
+ columns : list[str] + The list of columns that appear in the file. + positions : list[tuple[str, str, str]] + A list of tuples detailing column name alongside start and end position in the line. """ - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1995: - self.HEADER = 0 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 29 - self.START_DME = 29 - self.END_DME = 38 - self.START_ISOMER = 39 - self.END_ISOMER = 46 - self.START_DISOMER = 48 - self.END_DISOMER = 56 - self.START_HALFLIFEVALUE = 60 - self.END_HALFLIFEVALUE = 68 - self.START_HALFLIFEUNIT = 69 - self.END_HALFLIFEUNIT = 71 - self.START_HALFLIFEERROR = 72 - self.END_HALFLIFEERROR = 77 - self.START_SPIN = 79 - self.END_SPIN = 93 - self.START_DECAYSTRING = 106 - self.END_DECAYSTRING = None - case 2003: - self.HEADER = 0 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 29 - self.START_DME = 29 - self.END_DME = 38 - self.START_ISOMER = 39 - self.END_ISOMER = 46 - self.START_DISOMER = 48 - self.END_DISOMER = 56 - self.START_HALFLIFEVALUE = 60 - self.END_HALFLIFEVALUE = 68 - self.START_HALFLIFEUNIT = 69 - self.END_HALFLIFEUNIT = 71 - self.START_HALFLIFEERROR = 72 - self.END_HALFLIFEERROR = 77 - self.START_SPIN = 79 - self.END_SPIN = 93 - self.START_DECAYSTRING = 106 - self.END_DECAYSTRING = None - case 2020: - self.HEADER = 25 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 31 - self.START_DME = 31 - self.END_DME = 42 - self.START_ISOMER = 43 - self.END_ISOMER = 53 - self.START_DISOMER = 54 - self.END_DISOMER = 64 - self.START_HALFLIFEVALUE = 69 - self.END_HALFLIFEVALUE = 77 - self.START_HALFLIFEUNIT = 78 - 
self.END_HALFLIFEUNIT = 80 - self.START_HALFLIFEERROR = 81 - self.END_HALFLIFEERROR = 87 - self.START_SPIN = 88 - self.END_SPIN = 101 - self.START_ENSDF = 102 - self.END_ENSDF = 103 - self.START_YEAR = 114 - self.END_YEAR = 118 - self.START_DECAYSTRING = 119 - self.END_DECAYSTRING = None - case _: - self.HEADER = 0 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 29 - self.START_DME = 29 - self.END_DME = 38 - self.START_ISOMER = 39 - self.END_ISOMER = 46 - self.START_DISOMER = 48 - self.END_DISOMER = 56 - self.START_HALFLIFEVALUE = 60 - self.END_HALFLIFEVALUE = 68 - self.START_HALFLIFEUNIT = 69 - self.END_HALFLIFEUNIT = 71 - self.START_HALFLIFEERROR = 72 - self.END_HALFLIFEERROR = 77 - self.START_SPIN = 79 - self.END_SPIN = 93 - self.START_YEAR = 105 - self.END_YEAR = 109 - self.START_DECAYSTRING = 110 - self.END_DECAYSTRING = None - - self.column_limits = [ - (self.START_A, self.END_A), - (self.START_Z, self.END_Z), - (self.START_STATE, self.END_STATE), - (self.START_ME, self.END_ME), - (self.START_DME, self.END_DME), - (self.START_ISOMER, self.END_ISOMER), - (self.START_DISOMER, self.END_DISOMER), - (self.START_HALFLIFEVALUE, self.END_HALFLIFEVALUE), - (self.START_HALFLIFEUNIT, self.END_HALFLIFEUNIT), - (self.START_HALFLIFEERROR, self.END_HALFLIFEERROR), - (self.START_SPIN, self.END_SPIN), - (self.START_DECAYSTRING, self.END_DECAYSTRING), + HEADER: int = 0 + FOOTER: int = 0 + START_A: int = 0 + END_A: int = 3 + START_Z: int = 4 + END_Z: int = 7 + START_State: int = 7 + END_State: int = 8 + START_NUBASEMassExcess: int = 18 + END_NUBASEMassExcess: int = 29 + START_NUBASEMassExcessError: int = 29 + END_NUBASEMassExcessError: int = 38 + START_IsomerEnergy: int = 39 + END_IsomerEnergy: int = 46 + START_IsomerEnergyError: int = 48 + END_IsomerEnergyError: int = 56 + START_HalfLifeValue: int = 60 + END_HalfLifeValue: int = 68 + START_HalfLifeUnit: 
int = 69 + END_HalfLifeUnit: int = 71 + START_HalfLifeError: int = 72 + END_HalfLifeError: int = 77 + START_Spin: int = 79 + END_Spin: int = 93 + START_DecayModes: int = 106 + END_DecayModes: int | None = None + + # Columns that weren't in the first file so are not part of the default + START_DiscoveryYear: int | None = None + END_DiscoveryYear: int | None = None + START_ENSDF: int | None = None + END_ENSDF: int | None = None + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "A", + "Z", + "State", + "NUBASEMassExcess", + "NUBASEMassExcessError", + "IsomerEnergy", + "IsomerEnergyError", + "HalfLifeValue", + "HalfLifeUnit", + "HalfLifeError", + "Spin", + # "ENSDF", + # "DiscoveryYear", + "DecayModes", ] - if year > 2003: - self.column_limits.insert(-1, (self.START_YEAR, self.END_YEAR)) + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class NUBASEFile: + """ + Storage class for the year specific data in the NUBASE data file. + + The base ``NUBASELayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + NUBASE_YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``NUBASELayout``. + layout : NUBASELayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + NUBASE_YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, # Use this to appease mypy by not returning None for a non-existent value + # Original columns and their positions are based of the 1995 file + 1995: {}, + # No changes in 2000 + 2003: {}, + # New discovery year column in 2012 which pushed the decay modes to the right + 2012: { + "START_DiscoveryYear": 105, + "END_DiscoveryYear": 109, + "START_DecayModes": 110, + }, + # This is the same as 2012 and copy pasting seems to be the simplest way to have the same values + 2016: { + "START_DiscoveryYear": 105, + "END_DiscoveryYear": 109, + "START_DecayModes": 110, + }, + # Big update in 2020 + # Added a header block + # Increased total digits in various values, pushing almost all columns to the right + # New column representing the year isotopes details were last updated in ENSDF + 2020: { + "HEADER": 25, + "START_NUBASEMassExcessError": 31, + "END_NUBASEMassExcessError": 42, + "START_IsomerEnergy": 43, + "END_IsomerEnergy": 53, + "START_IsomerEnergyError": 54, + "END_IsomerEnergyError": 64, + "START_HalfLifeValue": 69, + "END_HalfLifeValue": 77, + "START_HalfLifeUnit": 78, + "END_HalfLifeUnit": 80, + "START_HalfLifeError": 81, + "END_HalfLifeError": 87, + "START_Spin": 88, + "END_Spin": 101, + "START_ENSDF": 102, + "END_ENSDF": 103, + "START_DiscoveryYear": 114, + "END_DiscoveryYear": 118, + "START_DecayModes": 119, + }, + } + + def __init__(self, year: int) -> None: + self.layout = NUBASELayout( + **NUBASEFile.NUBASE_YEAR_OVERRIDES.get(year, NUBASEFile.NUBASE_YEAR_OVERRIDES["default"]) + ) diff --git a/src/nuclearmasses/io/nubase_parse.py b/src/nuclearmasses/io/nubase_parse.py index f3181c3..cd4371a 100644 --- a/src/nuclearmasses/io/nubase_parse.py +++ b/src/nuclearmasses/io/nubase_parse.py @@ -4,15 +4,16 @@ are cleaned from the resultant dataframe. 
""" -import typing - import pandas as pd from nuclearmasses.io.nubase_file import NUBASEFile -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.utils.dataframe_utils import calculate_relative_error, read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput +from nuclearmasses.utils.units import unit_to_seconds -class NUBASEParser(NUBASEFile, Converter): +class NUBASEParser: """ Parse the NUBASE file, doing the necessary preparations and clean up of data. @@ -32,18 +33,28 @@ class NUBASEParser(NUBASEFile, Converter): The file-like object to parse. year : int The published year of the data file. + layout : NUBASEFile + A storage class containing details of parameters and their location in the line. unit_replacements : dict[str, str] A dictionary used to tidy up time units from NUBASE format to one the module recognises. + column_limits : list[tuple[int, int]] + The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year: int = year + self.layout = NUBASEFile(year=year).layout self.unit_replacements: dict[str, str] = { r"y$": "yr", r"^m$": "min", } + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] + + if year > 2003: + self.column_limits.insert(-1, (self.layout.START_DiscoveryYear, self.layout.END_DiscoveryYear)) def _column_names(self) -> list[str]: """ @@ -54,30 +65,14 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. 
""" - col_names = [ - "A", - "Z", - "State", - "NUBASEMassExcess", - "NUBASEMassExcessError", - "IsomerEnergy", - "IsomerEnergyError", - "HalfLifeValue", - "HalfLifeUnit", - "HalfLifeError", - "Spin", - "DiscoveryYear", - "DecayModes", - ] + col_names = self.layout.columns - # The discovery year was added after 2003, and I assume it will be there in the future, so we will set up - # as if it is always present and delete for the first two tables. - if self.year == 1995 or self.year == 2003: - col_names.remove("DiscoveryYear") + if self.year > 2003: + col_names.insert(-1, "DiscoveryYear") return col_names - def _data_types(self) -> dict: + def _data_types(self) -> dict[str, str]: """ Set the column data types depending on the year. @@ -92,11 +87,8 @@ def _data_types(self) -> dict: "Z": "Int64", "N": "Int64", "Experimental": "boolean", - # "State": "Int64", "NUBASEMassExcess": "float64", "NUBASEMassExcessError": "float64", - # "IsomerEnergy": "float64", - # "IsomerEnergyError": "float64", "HalfLifeValue": "float64", "HalfLifeUnit": "string", "HalfLifeError": "float64", @@ -106,11 +98,15 @@ def _data_types(self) -> dict: "DiscoveryYear": "Int64", "DecayModes": "string", "DataSource": "Int64", + # We will need these one day + # "State": "Int64", + # "IsomerEnergy": "float64", + # "IsomerEnergyError": "float64", } # The discovery year was added after 2003, and I assume it will be there in the future, so we will set up # as if it is always present and delete for the first two tables. 
- if self.year == 1995 or self.year == 2003: + if self.year <= 2003: data_types.pop("DiscoveryYear") return data_types @@ -186,7 +182,7 @@ def parse_half_life(self, raw_df) -> pd.DataFrame: raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce") # Pre-compute unit -> second conversion - unit_map = raw_df["HalfLifeUnit"].map(self.unit_to_seconds) + unit_map = raw_df["HalfLifeUnit"].map(unit_to_seconds) raw_df["HalfLifeSeconds"] = raw_df["HalfLifeValue"] * unit_map raw_df["HalfLifeErrorSeconds"] = raw_df["HalfLifeError"] * unit_map @@ -217,30 +213,6 @@ def parse_state(self, raw_df) -> pd.DataFrame: return raw_df - def calculate_relative_error(self, raw_df) -> pd.DataFrame: - """ - Calculate the relative error of the mass excess. - - 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN - value in the column for 12C so we will manually correct and set to 0.0. - - Parameters - ---------- - raw_df : pandas.DataFrame - The raw dataframe upon which we will act. - - Returns - ------- - pandas.DataFrame - The updated dataframe with a new relative mass excess column. 
- """ - raw_df["NUBASERelativeError"] = abs( - raw_df["NUBASEMassExcessError"].astype(float) / raw_df["NUBASEMassExcess"].astype(float) - ) - raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), "NUBASERelativeError"] = 0.0 - - return raw_df - def read_file(self) -> pd.DataFrame: """ Read the file-like object ``self.filename`` into a dataframe @@ -253,15 +225,15 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the NUBASE data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, - colspecs=typing.cast(typing.Sequence[tuple[int, int]], self.column_limits), # appease mypy + colspecs=self.column_limits, names=self._column_names(), na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) df = self.parse_state(df) @@ -269,10 +241,10 @@ def read_file(self) -> pd.DataFrame: # We use the NUBASE data to define whether or not an isotope is experimentally measured, df["Experimental"] = ~df["NUBASEMassExcess"].astype("string").str.contains("#", na=False) # Once we have used the '#' to determine if it's experimental or not, we can remove all instances of it - df = self.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") df = self.parse_half_life(df) - df = self.calculate_relative_error(df) + df = calculate_relative_error(df, "NUBASE") if self.year == 2012: # 198Au has a typo in it's decay mode in the 2012 table. 
It is recorded as '-' @@ -280,7 +252,7 @@ def read_file(self) -> pd.DataFrame: df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/mass_table.py b/src/nuclearmasses/mass_table.py index ff0e349..1980412 100644 --- a/src/nuclearmasses/mass_table.py +++ b/src/nuclearmasses/mass_table.py @@ -14,7 +14,7 @@ from nuclearmasses.io.ame import AME from nuclearmasses.io.nubase import NUBASE -from nuclearmasses.utils.converter import Converter +from nuclearmasses.utils.periodic import get_symbol class MassTable: @@ -103,7 +103,7 @@ def add_user_data( user_columns = set(user_df.columns) # The symbol is commonly used so if it wasn't in the file, create it as a column if "Symbol" not in user_columns: - user_df["Symbol"] = pd.to_numeric(user_df["Z"]).map(Converter().get_symbol) + user_df["Symbol"] = pd.to_numeric(user_df["Z"]).map(get_symbol) # Set the source value using the function parameter if it hasn't already been set if "DataSource" not in user_columns: diff --git a/src/nuclearmasses/utils/converter.py b/src/nuclearmasses/utils/converter.py deleted file mode 100644 index bcfb2a7..0000000 --- a/src/nuclearmasses/utils/converter.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -The converter module defines the ``Converter`` class that is used to store lookup dictionaries to allow simple and fast -conversions between scientific units and seconds, and element symbol and Z value. The dictionaries are defined on the -class level so any instance should share a single copy. 
-""" - -import importlib -from importlib.resources.abc import Traversable -import os -import typing - -import pandas as pd - -# Typing hint Union for the different ways a file or data can be represented -DataInput = Traversable | os.PathLike[str] | str | typing.TextIO - - -class Converter: - """ - Utility class to convert between various physical properties. - - All methods are static so it is not necessary to create an instance of the class. - - Internal dictionaries allow bidirectional conversion between element symbol and Z, as well as the conversion of an - time unit in SI format into the equivalent number of seconds (e.g. min -> 60.0). - """ - - UNIT_TO_SECONDS: dict[str, float] = { - "s": 1.0, - "ms": 1e-3, - "us": 1e-6, - "ns": 1e-9, - "ps": 1e-12, - "as": 1e-18, - "zs": 1e-21, - "ys": 1e-24, - "min": 60.0, - "h": 3600.0, - "d": 86400.0, - "yr": 31_557_600.0, # 365.25 days - "kyr": 3.15576e10, - "myr": 3.15576e13, - "gyr": 3.15576e16, - "zyr": 3.15576e21, - "eyr": 3.15576e18, - "pyr": 3.15576e15, - "tyr": 3.15576e12, - "yyr": 3.15576e24, - } - - # fmt: off - # Formatter wants to put each item on it's own line, I don't - Z_TO_SYMBOL: dict[int, str] = { - 0: "n", 1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", - 10: "Ne", 11: "Na", 12: "Mg", 13: "Al", 14: "Si", 15: "P", 16: "S", 17: "Cl", 18: "Ar", 19: "K", - 20: "Ca", 21: "Sc", 22: "Ti", 23: "V", 24: "Cr", 25: "Mn", 26: "Fe", 27: "Co", 28: "Ni", 29: "Cu", - 30: "Zn", 31: "Ga", 32: "Ge", 33: "As", 34: "Se", 35: "Br", 36: "Kr", 37: "Rb", 38: "Sr", 39: "Y", - 40: "Zr", 41: "Nb", 42: "Mo", 43: "Tc", 44: "Ru", 45: "Rh", 46: "Pd", 47: "Ag", 48: "Cd", 49: "In", - 50: "Sn", 51: "Sb", 52: "Te", 53: "I", 54: "Xe", 55: "Cs", 56: "Ba", 57: "La", 58: "Ce", 59: "Pr", - 60: "Nd", 61: "Pm", 62: "Sm", 63: "Eu", 64: "Gd", 65: "Tb", 66: "Dy", 67: "Ho", 68: "Er", 69: "Tm", - 70: "Yb", 71: "Lu", 72: "Hf", 73: "Ta", 74: "W", 75: "Re", 76: "Os", 77: "Ir", 78: "Pt", 79: "Au", - 80: "Hg", 81: "Tl", 82: 
"Pb", 83: "Bi", 84: "Po", 85: "At", 86: "Rn", 87: "Fr", 88: "Ra", 89: "Ac", - 90: "Th", 91: "Pa", 92: "U", 93: "Np", 94: "Pu", 95: "Am", 96: "Cm", 97: "Bk", 98: "Cf", 99: "Es", - 100: "Fm", 101: "Md", 102: "No", 103: "Lr", 104: "Rf", 105: "Db", 106: "Sg", 107: "Bh", 108: "Hs", 109: "Mt", - 110: "Ds", 111: "Rg", 112: "Cn", 113: "Ed", 114: "Fl", 115: "Ef", 116: "Lv", 117: "Ts", 118: "Og" - } - # fmt: on - - # Switch the keys and values of the z_to_symbol dictionary so a user can access the Z value using the symbol - SYMBOL_TO_Z: dict[str, int] = {val: key for key, val in Z_TO_SYMBOL.items()} - - def __init__(self, **kwargs) -> None: - # We are using multiple inheritance, so need this for MRO - super().__init__(**kwargs) - - @staticmethod - def get_symbol(z: int) -> str | None: - """ - Get the symbol representing ``z``. - - This is a nicely named, very thin wrapper around the inbuilt dictionary get. - - Parameters - ---------- - z : int - The Z value to get the symbol for. - - Returns - ------- - str or None - The string representing the ``z` value or None if the ``z` value is invalid. - """ - return Converter.Z_TO_SYMBOL.get(z, None) - - @staticmethod - def get_z(symbol: str) -> int | None: - """ - Get the z (proton number) representing ``symbol``. - - This is a nicely named, very thin wrapper around the inbuilt dictionary. - - Parameters - ---------- - symbol : str - The elemental symbol to get the Z for. - - Returns - ------- - int or None - The Z value representing ``symbol`` or None if ``symbol`` is invalid. - """ - return Converter.SYMBOL_TO_Z.get(symbol, None) - - @staticmethod - def normalise_symbol(symbol: str) -> str: - """ - Validate format of ``symbol`` to allow simpler conversions. - - Element symbols always have a capital first letter and lower case second, if it exists. We store all symbols - like this so want any user input to be of this format. In typesetting, this is known as title case so we can - leverage that conversion function. 
- - No checking is done on the validity of the symbol. - - Parameters - ---------- - symbol : str - The elemental symbol to validate. - - Returns - ------- - str - The elemental symbol with the correct casing. - """ - return symbol.strip().title() - - @staticmethod - def unit_to_seconds(unit_str: str) -> float | None: - """Convert a time unit to a scale factor in seconds. - - Parameters - ---------- - unit_str : str - The time unit to convert into seconds. - - Returns - ------- - float or None - The time unit represented in seconds or None if the unit does not represent time. - - Examples - -------- - >>> from nuclearmasses.utils.converter import Converter - >>> Converter.unit_to_seconds("s") - 1.0 - >>> Converter.unit_to_seconds("min") - 60.0 - >>> Converter.unit_to_seconds("keV") - >>> Converter.unit_to_seconds(2) - >>> - """ - if pd.isna(unit_str) or not isinstance(unit_str, str): - return None - - # Remove white space and make lower case to be consistent - cleaned_unit = unit_str.strip().lower() - if not cleaned_unit: - return None - - return Converter.UNIT_TO_SECONDS.get(cleaned_unit, None) - - @staticmethod - def read_fwf(base: DataInput, **kwargs): - """ - Overloaded version of :meth:`pandas.read_fwf` that accepts additional types. - - The use of importlib.resource means we have types that the pandas version of read_fwf does not accept. - It can still be used but some work is required. This function does that work, as well as some other checking - to make sure we can pass the necessary types into our parser classes. - - Parameters - ---------- - base : DataInput - The file-like object to read. - - Returns - ------- - pandas.DataFrame - The file-like object parsed into a pandas dataframe. 
- """ - # A file like object - if hasattr(base, "read"): - return pd.read_fwf(base, **kwargs) # type: ignore[arg-type] - - # importlib.resource Traversable - if isinstance(base, Traversable): - with importlib.resources.as_file(base) as the_file: - return pd.read_fwf(the_file, **kwargs) - - # Filesystem path - return pd.read_fwf(base, **kwargs) - - @staticmethod - def strip_char_from_string_columns(df: pd.DataFrame, char: str) -> pd.DataFrame: - """ - Remove ``char`` from columns that are of known string type - - Helper method to optimise the removal of the ``char`` character from columns in ``df`` that are of string type. - This function is specific to this module; we know, after parsing a file with :meth:'pandas.read_fwf`, columns - that contain purely floats will be of type float, but if a value on one line is e.g. 1234.56# the column will - be of type string (or object). As we wish to remove the '#' character, we can use this detail to only apply the - removal algorithm to those columns of type string and save some unnecessary processing. - - Parameters - ---------- - df : pandas.DataFrame - The dataframe we are removing the character from. - char : str - The character we want to remove. - - Returns - ------- - pandas.DataFrame - The original dataframe with all instance of ``char`` removed from string type columns. - """ - cols = df.select_dtypes(include=["object", "string"]).columns - df[cols] = df[cols].apply(lambda s: s.str.replace(char, "", regex=False)) - return df diff --git a/src/nuclearmasses/utils/dataframe_utils.py b/src/nuclearmasses/utils/dataframe_utils.py new file mode 100644 index 0000000..44ed249 --- /dev/null +++ b/src/nuclearmasses/utils/dataframe_utils.py @@ -0,0 +1,96 @@ +""" +The module dataframe_utils contains functionality to extend that available via pandas or apply a common transformation +to a dataframe that is used throughout the repository. 
+""" + +import importlib +from importlib.resources.abc import Traversable + +import pandas as pd + +from nuclearmasses.utils.type_defs import DataInput + + +def read_fwf(base: DataInput, **kwargs): + """ + Overloaded version of :meth:`pandas.read_fwf` that accepts additional types. + + The use of importlib.resource means we have types that the pandas version of read_fwf does not accept. + It can still be used but some work is required. This function does that work, as well as some other checking + to make sure we can pass the necessary types into our parser classes. + + Parameters + ---------- + base : DataInput + The file-like object to read. + + Returns + ------- + pandas.DataFrame + The file-like object parsed into a pandas dataframe. + """ + # A file like object + if hasattr(base, "read"): + return pd.read_fwf(base, **kwargs) # type: ignore[arg-type] + + # importlib.resource Traversable + if isinstance(base, Traversable): + with importlib.resources.as_file(base) as the_file: + return pd.read_fwf(the_file, **kwargs) + + # Filesystem path + return pd.read_fwf(base, **kwargs) + + +def strip_char_from_string_columns(df: pd.DataFrame, char: str) -> pd.DataFrame: + """ + Remove ``char`` from columns that are of known string type + + Helper method to optimise the removal of the ``char`` character from columns in ``df`` that are of string type. + This function is specific to this module; we know, after parsing a file with :meth:'pandas.read_fwf`, columns + that contain purely floats will be of type float, but if a value on one line is e.g. 1234.56# the column will + be of type string (or object). As we wish to remove the '#' character, we can use this detail to only apply the + removal algorithm to those columns of type string and save some unnecessary processing. + + Parameters + ---------- + df : pandas.DataFrame + The dataframe we are removing the character from. + char : str + The character we want to remove. 
+ + Returns + ------- + pandas.DataFrame + The original dataframe with all instances of ``char`` removed from string type columns. + """ + cols = df.select_dtypes(include=["object", "string"]).columns + df[cols] = df[cols].apply(lambda s: s.str.replace(char, "", regex=False)) + return df + + +def calculate_relative_error(raw_df: pd.DataFrame, source: str) -> pd.DataFrame: + """ + Calculate the relative error of the mass excess. + + 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN + value in the column for 12C so we will manually correct and set to 0.0. + + Parameters + ---------- + raw_df : pandas.DataFrame + The raw dataframe upon which we will act. + source : str + Which table's data are we working with + + Returns + ------- + pandas.DataFrame + The updated dataframe with a new relative mass excess column. + """ + raw_df[f"{source}RelativeError"] = abs( + raw_df[f"{source}MassExcessError"].astype(float) / raw_df[f"{source}MassExcess"].astype(float) + ) + raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), f"{source}RelativeError"] = 0.0 + + return raw_df diff --git a/src/nuclearmasses/utils/periodic.py b/src/nuclearmasses/utils/periodic.py new file mode 100644 index 0000000..f5569d4 --- /dev/null +++ b/src/nuclearmasses/utils/periodic.py @@ -0,0 +1,85 @@ +""" +The module periodic contains functionality to convert between elemental symbol and atomic number.
+""" + +# fmt: off +# Formatter wants to put each item on its own line, I don't +Z_TO_SYMBOL: dict[int, str] = { + 0: "n", 1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", + 10: "Ne", 11: "Na", 12: "Mg", 13: "Al", 14: "Si", 15: "P", 16: "S", 17: "Cl", 18: "Ar", 19: "K", + 20: "Ca", 21: "Sc", 22: "Ti", 23: "V", 24: "Cr", 25: "Mn", 26: "Fe", 27: "Co", 28: "Ni", 29: "Cu", + 30: "Zn", 31: "Ga", 32: "Ge", 33: "As", 34: "Se", 35: "Br", 36: "Kr", 37: "Rb", 38: "Sr", 39: "Y", + 40: "Zr", 41: "Nb", 42: "Mo", 43: "Tc", 44: "Ru", 45: "Rh", 46: "Pd", 47: "Ag", 48: "Cd", 49: "In", + 50: "Sn", 51: "Sb", 52: "Te", 53: "I", 54: "Xe", 55: "Cs", 56: "Ba", 57: "La", 58: "Ce", 59: "Pr", + 60: "Nd", 61: "Pm", 62: "Sm", 63: "Eu", 64: "Gd", 65: "Tb", 66: "Dy", 67: "Ho", 68: "Er", 69: "Tm", + 70: "Yb", 71: "Lu", 72: "Hf", 73: "Ta", 74: "W", 75: "Re", 76: "Os", 77: "Ir", 78: "Pt", 79: "Au", + 80: "Hg", 81: "Tl", 82: "Pb", 83: "Bi", 84: "Po", 85: "At", 86: "Rn", 87: "Fr", 88: "Ra", 89: "Ac", + 90: "Th", 91: "Pa", 92: "U", 93: "Np", 94: "Pu", 95: "Am", 96: "Cm", 97: "Bk", 98: "Cf", 99: "Es", + 100: "Fm", 101: "Md", 102: "No", 103: "Lr", 104: "Rf", 105: "Db", 106: "Sg", 107: "Bh", 108: "Hs", 109: "Mt", + 110: "Ds", 111: "Rg", 112: "Cn", 113: "Nh", 114: "Fl", 115: "Mc", 116: "Lv", 117: "Ts", 118: "Og" +} +# fmt: on + +# Invert Z_TO_SYMBOL so the Z value can be looked up from the symbol; legacy placeholders Ed/Ef stay resolvable +SYMBOL_TO_Z: dict[str, int] = {val: key for key, val in Z_TO_SYMBOL.items()} | {"Ed": 113, "Ef": 115} + + +def get_symbol(z: int) -> str | None: + """ + Get the symbol representing ``z``. + + This is a nicely named, very thin wrapper around the inbuilt dictionary get. + + Parameters + ---------- + z : int + The Z value to get the symbol for. + + Returns + ------- + str or None + The string representing the ``z`` value or None if the ``z`` value is invalid.
+ """ + return Z_TO_SYMBOL.get(z, None) + + +def get_z(symbol: str) -> int | None: + """ + Get the z (proton number) representing ``symbol``. + + This is a nicely named, very thin wrapper around the inbuilt dictionary. + + Parameters + ---------- + symbol : str + The elemental symbol to get the Z for. + + Returns + ------- + int or None + The Z value representing ``symbol`` or None if ``symbol`` is invalid. + """ + return SYMBOL_TO_Z.get(symbol, None) + + +def normalise_symbol(symbol: str) -> str: + """ + Validate format of ``symbol`` to allow simpler conversions. + + Element symbols always have a capital first letter and lower case second, if it exists. We store all symbols + like this so want any user input to be of this format. In typesetting, this is known as title case so we can + leverage that conversion function. + + No checking is done on the validity of the symbol. + + Parameters + ---------- + symbol : str + The elemental symbol to validate. + + Returns + ------- + str + The elemental symbol with the correct casing. + """ + return symbol.strip().title() diff --git a/src/nuclearmasses/utils/type_defs.py b/src/nuclearmasses/utils/type_defs.py new file mode 100644 index 0000000..cc85d63 --- /dev/null +++ b/src/nuclearmasses/utils/type_defs.py @@ -0,0 +1,10 @@ +""" +The module type_defs contains additional type definitions used by the main repository. +""" + +from importlib.resources.abc import Traversable +import os +import typing + +# Typing hint Union for the different ways a file or data can be represented +DataInput: typing.TypeAlias = Traversable | os.PathLike[str] | str | typing.TextIO diff --git a/src/nuclearmasses/utils/units.py b/src/nuclearmasses/utils/units.py new file mode 100644 index 0000000..c610e69 --- /dev/null +++ b/src/nuclearmasses/utils/units.py @@ -0,0 +1,62 @@ +""" +The module units contains functionality to convert from human readable strings into SI units. For example the time units +mins, hr or Gyr would be converted into seconds. 
+""" + +UNIT_TO_SECONDS: dict[str, float] = { + "ys": 1e-24, + "zs": 1e-21, + "as": 1e-18, + "ps": 1e-12, + "ns": 1e-9, + "us": 1e-6, + "ms": 1e-3, + "s": 1.0, + "min": 60.0, + "h": 3600.0, + "d": 86400.0, + "yr": 31_557_600.0, # 365.25 days + "kyr": 3.15576e10, # 1e3 yr + "tyr": 3.15576e19, # 1e12 yr + "myr": 3.15576e13, # 1e6 yr (mega, not milli) + "pyr": 3.15576e22, # 1e15 yr + "gyr": 3.15576e16, # 1e9 yr + "eyr": 3.15576e25, # 1e18 yr + "zyr": 3.15576e28, # 1e21 yr + "yyr": 3.15576e31, # 1e24 yr +} + + +def unit_to_seconds(unit_str: str) -> float | None: + """Convert a time unit to a scale factor in seconds. + + Parameters + ---------- + unit_str : str + The time unit to convert into seconds. + + Returns + ------- + float or None + The time unit represented in seconds or None if the unit does not represent time. + + Examples + -------- + >>> from nuclearmasses.utils.units import unit_to_seconds + >>> unit_to_seconds("s") + 1.0 + >>> unit_to_seconds("min") + 60.0 + >>> unit_to_seconds("keV") + >>> unit_to_seconds(2) + >>> + """ + if not isinstance(unit_str, str): + return None + + # Remove white space and make lower case to be consistent + cleaned_unit = unit_str.strip().lower() + if not cleaned_unit: + return None + + return UNIT_TO_SECONDS.get(cleaned_unit, None) diff --git a/tests/test_ame.py b/tests/io/test_ame.py similarity index 100% rename from tests/test_ame.py rename to tests/io/test_ame.py diff --git a/tests/test_ame_mass_parse.py b/tests/io/test_ame_mass_parse.py similarity index 94% rename from tests/test_ame_mass_parse.py rename to tests/io/test_ame_mass_parse.py index 7ce15e4..a14f904 100644 --- a/tests/test_ame_mass_parse.py +++ b/tests/io/test_ame_mass_parse.py @@ -12,8 +12,8 @@ def test_1983_mass(): "0 11 39 28 67 Ni +n2p -63742.471 19.056 582618.683 19.066 B- 3560.871 20.646 66 931570.167 20.457 -.0" ) parser = AMEMassParser(line, 1983) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -45,8 +45,8 @@ def
test_1993_mass(): "0 15 41 26 67 Fe x -46574.693 465.747 567012.139 465.747 B- 8746.727 543.150 66 950000.000 500.000" ) parser = AMEMassParser(line, 1993) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -79,8 +79,8 @@ def test_1995_mass(): " 15 41 26 67 Fe x -46574.693 465.747 567012.133 465.747 B- 8746.727 543.150 66 950000.000 500.000" ) parser = AMEMassParser(line, 1995) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -112,8 +112,8 @@ def test_2003_mass(): " 15 41 26 67 Fe x -45692.348 415.570 8449.695 6.203 B- 9368.702 523.438 66 950947.244 446.132" ) parser = AMEMassParser(line, 2003) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -145,8 +145,8 @@ def test_2012_mass(): " 15 41 26 67 Fe x -46068.530 217.972 8455.310 3.253 B- 9253.245 218.067 66 950543.395 234.002" ) parser = AMEMassParser(line, 2012) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -178,8 +178,8 @@ def test_2016_mass(): " 15 41 26 67 Fe x -45610.155 270.285 8448.469 4.034 B- 9711.620 270.362 66 951035.482 290.163" ) parser = AMEMassParser(line, 2016) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -211,8 +211,8 @@ def test_2020_mass(): " 15 41 26 67 Fe x -45708.416 3.819 8449.9359 0.0570 B- 9613.3678 7.4900 66 950930.000 4.100" ) parser = AMEMassParser(line, 2020) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( diff --git a/tests/test_ame_reaction_1_parse.py b/tests/io/test_ame_reaction_1_parse.py similarity index 90% 
rename from tests/test_ame_reaction_1_parse.py rename to tests/io/test_ame_reaction_1_parse.py index d7825e4..6e56f24 100644 --- a/tests/test_ame_reaction_1_parse.py +++ b/tests/io/test_ame_reaction_1_parse.py @@ -3,7 +3,7 @@ import pandas as pd import pandas.testing as pdt -from nuclearmasses.io.ame_reaction_1_parse import AMEReactionParserOne +from nuclearmasses.io.ame_reaction_1_parse import AMEReactionOneParser def test_1983_rct1(): @@ -12,9 +12,9 @@ def test_1983_rct1(): line = io.StringIO( " 186 Ir 77 15780 250 9536 20 3850 100 -7600# 300# -2639 20 -10640# 200#" ) - parser = AMEReactionParserOne(line, 1983) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 1983) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -48,9 +48,9 @@ def test_1993_rct1(): line = io.StringIO( " 186 Ir 77 15618.44 270.74 9522.98 20.49 3852.98 103.94 -7419.61 145.57 -2635.85 20.03 -10622# 230#" ) - parser = AMEReactionParserOne(line, 1993) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 1993) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -84,9 +84,9 @@ def test_1995_rct1(): line = io.StringIO( " 186 Ir 77 15618.41 270.74 9522.89 20.49 3853.04 103.94 -7495.33 145.56 -2635.83 20.03 -10682.00 207.60" ) - parser = AMEReactionParserOne(line, 1995) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 1995) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -120,9 +120,9 @@ def test_2003_rct1(): line = io.StringIO( " 186 Ir 77 15704.74 32.47 9524.26 17.08 3849.65 103.31 -7458.10 26.70 -2639.77 16.57 -10561.10 44.19" ) - parser = AMEReactionParserOne(line, 2003) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 2003) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( 
@@ -156,9 +156,9 @@ def test_2012_rct1(): line = io.StringIO( " 186 Ir 77 15706.55 32.47 9527.99 17.09 3848.03 103.31 -7459.92 26.70 -2641.13 16.57 -10557.95 30.67" ) - parser = AMEReactionParserOne(line, 2012) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 2012) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -192,9 +192,9 @@ def test_2016_rct1(): line = io.StringIO( " 186 Ir 77 15704.13 32.47 9530.65 17.07 3848.80 103.31 -7457.49 26.70 -2642.29 16.55 -10555.52 30.67" ) - parser = AMEReactionParserOne(line, 2016) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 2016) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -228,9 +228,9 @@ def test_2020_rct1(): line = io.StringIO( " 186 Ir 77 15704.1312 32.4655 9530.4731 17.0698 3848.8777 103.3133 -7457.4943 26.6968 -2642.2739 16.5459 -10555.5245 30.6658" ) - parser = AMEReactionParserOne(line, 2020) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionOneParser(line, 2020) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( diff --git a/tests/test_ame_reaction_2_parse.py b/tests/io/test_ame_reaction_2_parse.py similarity index 90% rename from tests/test_ame_reaction_2_parse.py rename to tests/io/test_ame_reaction_2_parse.py index a104b57..297cb3f 100644 --- a/tests/test_ame_reaction_2_parse.py +++ b/tests/io/test_ame_reaction_2_parse.py @@ -3,16 +3,16 @@ import pandas as pd import pandas.testing as pdt -from nuclearmasses.io.ame_reaction_2_parse import AMEReactionParserTwo +from nuclearmasses.io.ame_reaction_2_parse import AMEReactionTwoParser def test_1983_rct2(): line = io.StringIO( " 204 Tl 81 7853 17 5702.8 1.7 -13480 120 12613.7 1.8 8608.4 1.8 7180 50" ) - parser = AMEReactionParserTwo(line, 1983) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 1983) + 
parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -47,9 +47,9 @@ def test_1993_rct2(): line = io.StringIO( " 204 Tl 81 6655.82 0.29 6365.32 1.26 -12492.85 71.39 13712.87 1.23 8183.14 1.25 7690.59 15.05" ) - parser = AMEReactionParserTwo(line, 1993) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 1993) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -84,9 +84,9 @@ def test_1995_rct2(): line = io.StringIO( " 204 Tl 81 6655.86 0.29 6365.35 1.26 -12494.05 92.85 13713.05 1.23 8183.32 1.24 7702.97 3.35" ) - parser = AMEReactionParserTwo(line, 1995) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 1995) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -121,9 +121,9 @@ def test_2003_rct2(): line = io.StringIO( " 204 Tl 81 6656.10 0.29 6365.82 1.25 -12470.66 24.01 13710.69 1.15 8181.34 1.16 7701.54 3.34" ) - parser = AMEReactionParserTwo(line, 2003) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2003) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -158,9 +158,9 @@ def test_2012_rct2(): line = io.StringIO( " 204 Tl 81 6656.09 0.29 6365.80 1.25 -12470.19 22.31 13710.68 1.14 8181.16 1.15 7701.67 3.33" ) - parser = AMEReactionParserTwo(line, 2012) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2012) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -195,9 +195,9 @@ def test_2016_rct2(): line = io.StringIO( " 204 Tl 81 6656.08 0.29 6365.85 1.25 -12470.71 22.32 13709.99 1.06 8180.45 1.07 7700.97 3.31" ) - parser = AMEReactionParserTwo(line, 2016) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2016) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 
df = parser.read_file() expected = pd.DataFrame( @@ -232,9 +232,9 @@ def test_2020_rct2(): line = io.StringIO( " 204 Tl 81 6656.0787 0.2907 6365.8379 1.2542 -12470.8182 22.6974 13710.0469 1.0612 8180.5147 1.0721 7701.0380 3.3084" ) - parser = AMEReactionParserTwo(line, 2020) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2020) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( diff --git a/tests/test_nubase.py b/tests/io/test_nubase.py similarity index 100% rename from tests/test_nubase.py rename to tests/io/test_nubase.py diff --git a/tests/test_nubase_parse.py b/tests/io/test_nubase_parse.py similarity index 99% rename from tests/test_nubase_parse.py rename to tests/io/test_nubase_parse.py index 20a7d0b..80dc954 100644 --- a/tests/test_nubase_parse.py +++ b/tests/io/test_nubase_parse.py @@ -149,7 +149,7 @@ def test_2020_nubase(): "168 0670 168Ho -60060 30 2.99 m 0.07 3+ 10 1960 B-=100" ) parser = NUBASEParser(line, 2020) - parser.HEADER = 0 + parser.layout.HEADER = 0 df = parser.read_file() expected = pd.DataFrame( diff --git a/tests/test_converter.py b/tests/test_converter.py deleted file mode 100644 index 8201f84..0000000 --- a/tests/test_converter.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest - -from nuclearmasses.utils.converter import Converter - - -@pytest.fixture -def converter(): - return Converter() - - -def test_z_to_symbol(converter): - assert converter.get_symbol(0) == "n" - assert converter.get_symbol(6) == "C" - assert converter.get_symbol(104) == "Rf" - - -def test_symbol_to_z(converter): - assert converter.get_z("Al") == 13 - assert converter.get_z("Fe") == 26 - assert converter.get_z("Po") == 84 - - -def test_normalise_symbol(converter): - # These inputs shouldn't change - assert converter.normalise_symbol("H") == "H" - assert converter.normalise_symbol("Os") == "Os" - - # These inputs should change - assert converter.normalise_symbol("h") == "H" - assert 
converter.normalise_symbol("mg") == "Mg" - assert converter.normalise_symbol("RN") == "Rn" - - -def test_units_to_seconds(converter): - assert converter.unit_to_seconds("ms") == 1.0e-3 - assert converter.unit_to_seconds("s") == 1.0 - assert converter.unit_to_seconds("min") == 60.0 - assert converter.unit_to_seconds("h") == 3600.0 - assert converter.unit_to_seconds("d") == 86400.0 - assert converter.unit_to_seconds("yr") == 31557600.0 - - -@pytest.mark.parametrize("unit", [5, "m", "Hz", "", " "]) -def test_nontime_unit_return_nan(converter, unit): - assert converter.unit_to_seconds(unit) is None diff --git a/tests/utils/test_dataframe_utils.py b/tests/utils/test_dataframe_utils.py new file mode 100644 index 0000000..3dc70be --- /dev/null +++ b/tests/utils/test_dataframe_utils.py @@ -0,0 +1,57 @@ +import pandas as pd + +from nuclearmasses.utils.dataframe_utils import calculate_relative_error, strip_char_from_string_columns + + +def test_12C_relative_error(): + df = pd.DataFrame( + { + "A": [12], + "Z": [6], + "NUBASEMassExcess": [-12345.6], + "NUBASEMassExcessError": [1.2], + } + ) + + df = calculate_relative_error(df, "NUBASE") + assert df["NUBASERelativeError"][0] == 0.0 + + +def test_relative_error(): + df = pd.DataFrame( + { + "A": [123], + "Z": [50], + "NUBASEMassExcess": [100000.0], + "NUBASEMassExcessError": [10.0], + } + ) + + df = calculate_relative_error(df, "NUBASE") + assert df["NUBASERelativeError"][0] == 10.0 / 100000.0 + + +def test_remove_hash_from_column(): + df = pd.DataFrame( + { + "X": ["Random#"], + "Y": ["Clean"], + } + ) + + df = strip_char_from_string_columns(df, "#") + assert df["X"][0] == "Random" + assert df["Y"][0] == "Clean" + + +def test_remove_decimal_from_column(): + df = pd.DataFrame( + { + "X": ["Random.String"], + "Y": ["Clean"], + } + ) + + df = strip_char_from_string_columns(df, ".") + assert df["X"][0] == "RandomString" + assert df["Y"][0] == "Clean" diff --git a/tests/utils/test_periodic.py b/tests/utils/test_periodic.py new 
file mode 100644 index 0000000..4d455d5 --- /dev/null +++ b/tests/utils/test_periodic.py @@ -0,0 +1,24 @@ +from nuclearmasses.utils.periodic import get_symbol, get_z, normalise_symbol + + +def test_z_to_symbol(): + assert get_symbol(0) == "n" + assert get_symbol(6) == "C" + assert get_symbol(104) == "Rf" + + +def test_symbol_to_z(): + assert get_z("Al") == 13 + assert get_z("Fe") == 26 + assert get_z("Po") == 84 + + +def test_normalise_symbol(): + # These inputs shouldn't change + assert normalise_symbol("H") == "H" + assert normalise_symbol("Os") == "Os" + + # These inputs should change + assert normalise_symbol("h") == "H" + assert normalise_symbol("mg") == "Mg" + assert normalise_symbol("RN") == "Rn" diff --git a/tests/utils/test_units.py b/tests/utils/test_units.py new file mode 100644 index 0000000..2e9c83e --- /dev/null +++ b/tests/utils/test_units.py @@ -0,0 +1,17 @@ +import pytest + +from nuclearmasses.utils.units import unit_to_seconds + + +def test_units_to_seconds(): + assert unit_to_seconds("ms") == 1.0e-3 + assert unit_to_seconds("s") == 1.0 + assert unit_to_seconds("min") == 60.0 + assert unit_to_seconds("h") == 3600.0 + assert unit_to_seconds("d") == 86400.0 + assert unit_to_seconds("yr") == 31557600.0 + + +@pytest.mark.parametrize("unit", [5, "m", "Hz", "", " "]) +def test_nontime_unit_return_nan(unit): + assert unit_to_seconds(unit) is None