From 8786bcb76e9504bb8e60256b51ebfde18365b806 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Thu, 30 Apr 2026 15:50:56 +0100 Subject: [PATCH 01/12] Refactor common function The algorithm is identical between NUBASE and AME so a minor update to also take the name means we can share it. --- src/nuclearmasses/io/ame_mass_parse.py | 26 +------------------------ src/nuclearmasses/io/nubase_parse.py | 26 +------------------------ src/nuclearmasses/utils/converter.py | 27 ++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 50 deletions(-) diff --git a/src/nuclearmasses/io/ame_mass_parse.py b/src/nuclearmasses/io/ame_mass_parse.py index 98ae737..350c9a3 100644 --- a/src/nuclearmasses/io/ame_mass_parse.py +++ b/src/nuclearmasses/io/ame_mass_parse.py @@ -106,30 +106,6 @@ def _na_values(self) -> dict: return na_vals - def calculate_relative_error(self, raw_df) -> pd.DataFrame: - """ - Calculate the relative error of the mass excess. - - 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN - value in the column for 12C so we will manually correct and set to 0.0. - - Parameters - ---------- - raw_df : pandas.DataFrame - The raw dataframe upon which we will act. - - Returns - ------- - pandas.DataFrame - The updated dataframe with a new relative mass excess column. 
- """ - raw_df["AMERelativeError"] = abs( - raw_df["AMEMassExcessError"].astype(float) / raw_df["AMEMassExcess"].astype(float) - ) - raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), "AMERelativeError"] = 0.0 - - return raw_df - def read_file(self) -> pd.DataFrame: """ Read the file-like object ``self.filename`` into a dataframe @@ -180,7 +156,7 @@ def read_file(self) -> pd.DataFrame: # We need to rescale the error value because we combined the two columns above df = df.assign(AtomicMassError=df["AtomicMassError"].astype(float) / 1.0e6) - df = self.calculate_relative_error(df) + df = self.calculate_relative_error(df, "AME") df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) diff --git a/src/nuclearmasses/io/nubase_parse.py b/src/nuclearmasses/io/nubase_parse.py index f3181c3..983e064 100644 --- a/src/nuclearmasses/io/nubase_parse.py +++ b/src/nuclearmasses/io/nubase_parse.py @@ -217,30 +217,6 @@ def parse_state(self, raw_df) -> pd.DataFrame: return raw_df - def calculate_relative_error(self, raw_df) -> pd.DataFrame: - """ - Calculate the relative error of the mass excess. - - 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN - value in the column for 12C so we will manually correct and set to 0.0. - - Parameters - ---------- - raw_df : pandas.DataFrame - The raw dataframe upon which we will act. - - Returns - ------- - pandas.DataFrame - The updated dataframe with a new relative mass excess column. 
- """ - raw_df["NUBASERelativeError"] = abs( - raw_df["NUBASEMassExcessError"].astype(float) / raw_df["NUBASEMassExcess"].astype(float) - ) - raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), "NUBASERelativeError"] = 0.0 - - return raw_df - def read_file(self) -> pd.DataFrame: """ Read the file-like object ``self.filename`` into a dataframe @@ -272,7 +248,7 @@ def read_file(self) -> pd.DataFrame: df = self.strip_char_from_string_columns(df, "#") df = self.parse_half_life(df) - df = self.calculate_relative_error(df) + df = self.calculate_relative_error(df, "NUBASE") if self.year == 2012: # 198Au has a typo in it's decay mode in the 2012 table. It is recorded as '-' diff --git a/src/nuclearmasses/utils/converter.py b/src/nuclearmasses/utils/converter.py index bcfb2a7..afa787d 100644 --- a/src/nuclearmasses/utils/converter.py +++ b/src/nuclearmasses/utils/converter.py @@ -226,3 +226,30 @@ def strip_char_from_string_columns(df: pd.DataFrame, char: str) -> pd.DataFrame: cols = df.select_dtypes(include=["object", "string"]).columns df[cols] = df[cols].apply(lambda s: s.str.replace(char, "", regex=False)) return df + + @staticmethod + def calculate_relative_error(raw_df: pd.DataFrame, source: str) -> pd.DataFrame: + """ + Calculate the relative error of the mass excess. + + 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN + value in the column for 12C so we will manually correct and set to 0.0. + + Parameters + ---------- + raw_df : pandas.DataFrame + The raw dataframe upon which we will act. + source : str + Which table's data are we working with + + Returns + ------- + pandas.DataFrame + The updated dataframe with a new relative mass excess column. 
+ """ + raw_df[f"{source}RelativeError"] = abs( + raw_df[f"{source}MassExcessError"].astype(float) / raw_df[f"{source}MassExcess"].astype(float) + ) + raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), f"{source}RelativeError"] = 0.0 + + return raw_df From f1909183f5fd5f027b06487fab0e766cc17e1ad0 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Fri, 1 May 2026 22:00:23 +0100 Subject: [PATCH 02/12] Replace use of match statement to configure column positions A few major changes here: - Inheritance was not the correct technique to use so stop doing that and use the static Converter methods and make the parameter locations in the datafile a member so we can still access. - Instead of a large match case on the year to set the majority of the members to the same value most of the time, create a dataclass to store the values and tweak as required for each year by creating a new instance. - The use of the dataclass also allows us to set the column names at the same time as the parameter start and end line positions and automate the creation of other variables. Ideally we would remove all manual setting of anything related to columns, but we often do different things with different subsets so it's not clear how to achieve this. --- src/nuclearmasses/io/nubase_file.py | 284 +++++++++++++-------------- src/nuclearmasses/io/nubase_parse.py | 63 +++--- tests/test_nubase_parse.py | 2 +- 3 files changed, 167 insertions(+), 182 deletions(-) diff --git a/src/nuclearmasses/io/nubase_file.py b/src/nuclearmasses/io/nubase_file.py index dd3abf4..11c54ae 100644 --- a/src/nuclearmasses/io/nubase_file.py +++ b/src/nuclearmasses/io/nubase_file.py @@ -1,23 +1,25 @@ """ -The nubase_file module defines the ``NUBASEFile`` class. This class stores the column positions of the start and finish -location of the different parameters recorded in the NUBASE data file. The positions have changed between years so the -year of the table is given as a parameter at construction. 
+The nubase_file module defines the ``NUBASELayout`` and ``NUBASEFile`` classes. The ``NUBASELayout`` class acts like a +base class, storing the original column names and the start and end positionsof the values within the NUBASE data file. +The positions change as time progress so the ``NUBASEFile`` class uses the year, passed as a parameter, to update the +values as required. """ +import dataclasses -class NUBASEFile: + +@dataclasses.dataclass(kw_only=True) +class NUBASELayout: """ - Storage class for the data in the NUBASE data file. + Storage class for the original data in the NUBASE data file. The NUBASE data file is fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -26,145 +28,135 @@ class NUBASEFile: FOOTER : int The number of lines in the file to be interpreted as the footer. START_X : int - The first column of parameter X. + The first column of parameter X or None if X is not in the datafile for that year. END_X : int or None - The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + The last column of parameter X or None if X is not in the datafile for that year. + columns : list[str] + The list of columns that appear in the file. + positions : list[tuple(str, str, str)] + A list of tuples detailing column name alongside start and end position in the line. 
""" - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1995: - self.HEADER = 0 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 29 - self.START_DME = 29 - self.END_DME = 38 - self.START_ISOMER = 39 - self.END_ISOMER = 46 - self.START_DISOMER = 48 - self.END_DISOMER = 56 - self.START_HALFLIFEVALUE = 60 - self.END_HALFLIFEVALUE = 68 - self.START_HALFLIFEUNIT = 69 - self.END_HALFLIFEUNIT = 71 - self.START_HALFLIFEERROR = 72 - self.END_HALFLIFEERROR = 77 - self.START_SPIN = 79 - self.END_SPIN = 93 - self.START_DECAYSTRING = 106 - self.END_DECAYSTRING = None - case 2003: - self.HEADER = 0 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 29 - self.START_DME = 29 - self.END_DME = 38 - self.START_ISOMER = 39 - self.END_ISOMER = 46 - self.START_DISOMER = 48 - self.END_DISOMER = 56 - self.START_HALFLIFEVALUE = 60 - self.END_HALFLIFEVALUE = 68 - self.START_HALFLIFEUNIT = 69 - self.END_HALFLIFEUNIT = 71 - self.START_HALFLIFEERROR = 72 - self.END_HALFLIFEERROR = 77 - self.START_SPIN = 79 - self.END_SPIN = 93 - self.START_DECAYSTRING = 106 - self.END_DECAYSTRING = None - case 2020: - self.HEADER = 25 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 31 - self.START_DME = 31 - self.END_DME = 42 - self.START_ISOMER = 43 - self.END_ISOMER = 53 - self.START_DISOMER = 54 - self.END_DISOMER = 64 - self.START_HALFLIFEVALUE = 69 - self.END_HALFLIFEVALUE = 77 - self.START_HALFLIFEUNIT = 78 - self.END_HALFLIFEUNIT = 80 - self.START_HALFLIFEERROR = 81 - self.END_HALFLIFEERROR = 87 - self.START_SPIN = 88 - self.END_SPIN = 101 - self.START_ENSDF = 102 - self.END_ENSDF = 103 - self.START_YEAR 
= 114 - self.END_YEAR = 118 - self.START_DECAYSTRING = 119 - self.END_DECAYSTRING = None - case _: - self.HEADER = 0 - self.FOOTER = 0 - self.START_A = 0 - self.END_A = 3 - self.START_Z = 4 - self.END_Z = 7 - self.START_STATE = 7 - self.END_STATE = 8 - self.START_ME = 18 - self.END_ME = 29 - self.START_DME = 29 - self.END_DME = 38 - self.START_ISOMER = 39 - self.END_ISOMER = 46 - self.START_DISOMER = 48 - self.END_DISOMER = 56 - self.START_HALFLIFEVALUE = 60 - self.END_HALFLIFEVALUE = 68 - self.START_HALFLIFEUNIT = 69 - self.END_HALFLIFEUNIT = 71 - self.START_HALFLIFEERROR = 72 - self.END_HALFLIFEERROR = 77 - self.START_SPIN = 79 - self.END_SPIN = 93 - self.START_YEAR = 105 - self.END_YEAR = 109 - self.START_DECAYSTRING = 110 - self.END_DECAYSTRING = None - - self.column_limits = [ - (self.START_A, self.END_A), - (self.START_Z, self.END_Z), - (self.START_STATE, self.END_STATE), - (self.START_ME, self.END_ME), - (self.START_DME, self.END_DME), - (self.START_ISOMER, self.END_ISOMER), - (self.START_DISOMER, self.END_DISOMER), - (self.START_HALFLIFEVALUE, self.END_HALFLIFEVALUE), - (self.START_HALFLIFEUNIT, self.END_HALFLIFEUNIT), - (self.START_HALFLIFEERROR, self.END_HALFLIFEERROR), - (self.START_SPIN, self.END_SPIN), - (self.START_DECAYSTRING, self.END_DECAYSTRING), + HEADER: int = 0 + FOOTER: int = 0 + START_A: int = 0 + END_A: int = 3 + START_Z: int = 4 + END_Z: int = 7 + START_State: int = 7 + END_State: int = 8 + START_NUBASEMassExcess: int = 18 + END_NUBASEMassExcess: int = 29 + START_NUBASEMassExcessError: int = 29 + END_NUBASEMassExcessError: int = 38 + START_IsomerEnergy: int = 39 + END_IsomerEnergy: int = 46 + START_IsomerEnergyError: int = 48 + END_IsomerEnergyError: int = 56 + START_HalfLifeValue: int = 60 + END_HalfLifeValue: int = 68 + START_HalfLifeUnit: int = 69 + END_HalfLifeUnit: int = 71 + START_HalfLifeError: int = 72 + END_HalfLifeError: int = 77 + START_Spin: int = 79 + END_Spin: int = 93 + START_DecayModes: int = 106 + END_DecayModes: int | None 
= None + + # Columns that weren't in the first file so are not part of the default + START_DiscoveryYear: int | None = None + END_DiscoveryYear: int | None = None + START_ENSDF: int | None = None + END_ENSDF: int | None = None + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "A", + "Z", + "State", + "NUBASEMassExcess", + "NUBASEMassExcessError", + "IsomerEnergy", + "IsomerEnergyError", + "HalfLifeValue", + "HalfLifeUnit", + "HalfLifeError", + "Spin", + # "ENSDF", + # "DiscoveryYear", + "DecayModes", ] - if year > 2003: - self.column_limits.insert(-1, (self.START_YEAR, self.END_YEAR)) + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class NUBASEFile: + """ + Storage class for the year specific data in the NUBASE data file. + + The base ``NUBASELayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + NUBASE_YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``NUBASELayout``. + layout : NUBASELayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + NUBASE_YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, # Use this to appease mypy by not returning None for a non-existent value + # Original columns and their positions are based of the 1995 file + 1995: {}, + # No changes in 2000 + 2003: {}, + # New discovery year column in 2012 which pushed the decay modes to the right + 2012: { + "START_DiscoveryYear": 105, + "END_DiscoveryYear": 109, + "START_DecayModes": 110, + }, + # This is the same as 2012 and copy pasting seems to be the simplest way to have the same values + 2016: { + "START_DiscoveryYear": 105, + "END_DiscoveryYear": 109, + "START_DecayModes": 110, + }, + # Big update in 2020 + # Added a header block + # Increased total digits in various values, pushing almost all columns to the right + # New column representing the year isotopes details were last updated in ENSDF + 2020: { + "HEADER": 25, + "START_NUBASEMassExcessError": 31, + "END_NUBASEMassExcessError": 42, + "START_IsomerEnergy": 43, + "END_IsomerEnergy": 53, + "START_IsomerEnergyError": 54, + "END_IsomerEnergyError": 64, + "START_HalfLifeValue": 69, + "END_HalfLifeValue": 77, + "START_HalfLifeUnit": 78, + "END_HalfLifeUnit": 80, + "START_HalfLifeError": 81, + "END_HalfLifeError": 87, + "START_Spin": 88, + "END_Spin": 101, + "START_ENSDF": 102, + "END_ENSDF": 103, + "START_DiscoveryYear": 114, + "END_DiscoveryYear": 118, + "START_DecayModes": 119, + }, + } + + def __init__(self, year: int) -> None: + self.layout = NUBASELayout( + **NUBASEFile.NUBASE_YEAR_OVERRIDES.get(year, NUBASEFile.NUBASE_YEAR_OVERRIDES["default"]) + ) diff --git a/src/nuclearmasses/io/nubase_parse.py b/src/nuclearmasses/io/nubase_parse.py index 983e064..02c7297 100644 --- a/src/nuclearmasses/io/nubase_parse.py +++ b/src/nuclearmasses/io/nubase_parse.py @@ -4,15 +4,13 @@ are cleaned from the resultant dataframe. 
""" -import typing - import pandas as pd from nuclearmasses.io.nubase_file import NUBASEFile from nuclearmasses.utils.converter import Converter, DataInput -class NUBASEParser(NUBASEFile, Converter): +class NUBASEParser: """ Parse the NUBASE file, doing the necessary preparations and clean up of data. @@ -32,18 +30,28 @@ class NUBASEParser(NUBASEFile, Converter): The file-like object to parse. year : int The published year of the data file. + layout : NUBASEFile + A storage class containing details of parameters and their location in the line. unit_replacements : dict[str, str] A dictionary used to tidy up time units from NUBASE format to one the module recognises. + column_limits : list[tuple[int, int]] + The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year: int = year + self.layout = NUBASEFile(year=year).layout self.unit_replacements: dict[str, str] = { r"y$": "yr", r"^m$": "min", } + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] + + if year > 2003: + self.column_limits.insert(-1, (self.layout.START_DiscoveryYear, self.layout.END_DiscoveryYear)) def _column_names(self) -> list[str]: """ @@ -54,30 +62,14 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. """ - col_names = [ - "A", - "Z", - "State", - "NUBASEMassExcess", - "NUBASEMassExcessError", - "IsomerEnergy", - "IsomerEnergyError", - "HalfLifeValue", - "HalfLifeUnit", - "HalfLifeError", - "Spin", - "DiscoveryYear", - "DecayModes", - ] + col_names = self.layout.columns - # The discovery year was added after 2003, and I assume it will be there in the future, so we will set up - # as if it is always present and delete for the first two tables. 
- if self.year == 1995 or self.year == 2003: - col_names.remove("DiscoveryYear") + if self.year > 2003: + col_names.insert(-1, "DiscoveryYear") return col_names - def _data_types(self) -> dict: + def _data_types(self) -> dict[str, str]: """ Set the column data types depending on the year. @@ -92,11 +84,8 @@ def _data_types(self) -> dict: "Z": "Int64", "N": "Int64", "Experimental": "boolean", - # "State": "Int64", "NUBASEMassExcess": "float64", "NUBASEMassExcessError": "float64", - # "IsomerEnergy": "float64", - # "IsomerEnergyError": "float64", "HalfLifeValue": "float64", "HalfLifeUnit": "string", "HalfLifeError": "float64", @@ -106,11 +95,15 @@ def _data_types(self) -> dict: "DiscoveryYear": "Int64", "DecayModes": "string", "DataSource": "Int64", + # We will need these one day + # "State": "Int64", + # "IsomerEnergy": "float64", + # "IsomerEnergyError": "float64", } # The discovery year was added after 2003, and I assume it will be there in the future, so we will set up # as if it is always present and delete for the first two tables. 
- if self.year == 1995 or self.year == 2003: + if self.year <= 2003: data_types.pop("DiscoveryYear") return data_types @@ -186,7 +179,7 @@ def parse_half_life(self, raw_df) -> pd.DataFrame: raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce") # Pre-compute unit -> second conversion - unit_map = raw_df["HalfLifeUnit"].map(self.unit_to_seconds) + unit_map = raw_df["HalfLifeUnit"].map(Converter.unit_to_seconds) raw_df["HalfLifeSeconds"] = raw_df["HalfLifeValue"] * unit_map raw_df["HalfLifeErrorSeconds"] = raw_df["HalfLifeError"] * unit_map @@ -231,13 +224,13 @@ def read_file(self) -> pd.DataFrame: """ df = Converter.read_fwf( self.filename, - colspecs=typing.cast(typing.Sequence[tuple[int, int]], self.column_limits), # appease mypy + colspecs=self.column_limits, names=self._column_names(), na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) df = self.parse_state(df) @@ -245,10 +238,10 @@ def read_file(self) -> pd.DataFrame: # We use the NUBASE data to define whether or not an isotope is experimentally measured, df["Experimental"] = ~df["NUBASEMassExcess"].astype("string").str.contains("#", na=False) # Once we have used the '#' to determine if it's experimental or not, we can remove all instances of it - df = self.strip_char_from_string_columns(df, "#") + df = Converter.strip_char_from_string_columns(df, "#") df = self.parse_half_life(df) - df = self.calculate_relative_error(df, "NUBASE") + df = Converter.calculate_relative_error(df, "NUBASE") if self.year == 2012: # 198Au has a typo in it's decay mode in the 2012 table. 
It is recorded as '-' @@ -256,7 +249,7 @@ def read_file(self) -> pd.DataFrame: df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/tests/test_nubase_parse.py b/tests/test_nubase_parse.py index 20a7d0b..80dc954 100644 --- a/tests/test_nubase_parse.py +++ b/tests/test_nubase_parse.py @@ -149,7 +149,7 @@ def test_2020_nubase(): "168 0670 168Ho -60060 30 2.99 m 0.07 3+ 10 1960 B-=100" ) parser = NUBASEParser(line, 2020) - parser.HEADER = 0 + parser.layout.HEADER = 0 df = parser.read_file() expected = pd.DataFrame( From de7bd6b9ac864e623cdaa38c775f013a8c44e305 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 11:43:01 +0100 Subject: [PATCH 03/12] Correct type hinting in docstring --- src/nuclearmasses/io/nubase_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nuclearmasses/io/nubase_file.py b/src/nuclearmasses/io/nubase_file.py index 11c54ae..649acc0 100644 --- a/src/nuclearmasses/io/nubase_file.py +++ b/src/nuclearmasses/io/nubase_file.py @@ -33,7 +33,7 @@ class NUBASELayout: The last column of parameter X or None if X is not in the datafile for that year. columns : list[str] The list of columns that appear in the file. - positions : list[tuple(str, str, str)] + positions : list[tuple[str, str, str]] A list of tuples detailing column name alongside start and end position in the line. 
""" From f0804240ffcf1bc4a6243f4ddadea692d1b726a9 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 11:56:12 +0100 Subject: [PATCH 04/12] Fix typo --- src/nuclearmasses/io/nubase_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nuclearmasses/io/nubase_file.py b/src/nuclearmasses/io/nubase_file.py index 649acc0..54ecc9c 100644 --- a/src/nuclearmasses/io/nubase_file.py +++ b/src/nuclearmasses/io/nubase_file.py @@ -1,6 +1,6 @@ """ The nubase_file module defines the ``NUBASELayout`` and ``NUBASEFile`` classes. The ``NUBASELayout`` class acts like a -base class, storing the original column names and the start and end positionsof the values within the NUBASE data file. +base class, storing the original column names and the start and end positions of the values within the NUBASE data file. The positions change as time progress so the ``NUBASEFile`` class uses the year, passed as a parameter, to update the values as required. """ From 1c4918b35083969eca4a48962a571796b021177e Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 11:56:53 +0100 Subject: [PATCH 05/12] Replace match statement for AME mass tables See f190918 for details. --- src/nuclearmasses/io/ame_mass_file.py | 317 +++++++++++++------------ src/nuclearmasses/io/ame_mass_parse.py | 32 +-- tests/test_ame_mass_parse.py | 28 +-- 3 files changed, 191 insertions(+), 186 deletions(-) diff --git a/src/nuclearmasses/io/ame_mass_file.py b/src/nuclearmasses/io/ame_mass_file.py index 550f0e2..943a8b8 100644 --- a/src/nuclearmasses/io/ame_mass_file.py +++ b/src/nuclearmasses/io/ame_mass_file.py @@ -1,23 +1,27 @@ """ -The ame_mass_file module defines the ``AMEMassFile`` class. This class stores the column positions of the start and -finish location of the different parameters recorded in the AME mass data file. The positions have changed between -years so the year of the table is given as a parameter at construction. 
+The ame_mass_file module defines the ``AMEMassLayout`` and ``AMEMassFile`` classes. The ``AMEMassLayout`` class acts +like a base class, storing the common column names and the start and end positions of the values within the AME data +file. The positions change as time progress so the ``AMEMassFile`` class uses the year, passed as a parameter, to +update the values as required. + +The years 2003, 2012 and 2016 have identical formatting so are used as the base, not the 1983 format. """ +import dataclasses -class AMEMassFile: + +@dataclasses.dataclass(kw_only=True) +class AMEMassLayout: """ - Storage class for the data in the AME mass data file. + Storage class for the most common data in the AME mass data file. The AME mass data file is fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -29,149 +33,158 @@ class AMEMassFile: The first column of parameter X. END_X : int or None The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + column : list[str] + The list of columns that appear in the file + positions : list[tuple[str, str, str]] + A list of tuples detailing column name alongside start and end position in the line. 
""" - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1983: - self.HEADER = 35 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 39 - self.START_DME = 41 - self.END_DME = 48 - self.START_BE_PER_A = 49 - self.END_BE_PER_A = 59 - self.START_DBE_PER_A = 61 - self.END_DBE_PER_A = 68 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 85 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 94 - self.START_AM = 97 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 110 - self.START_MICRO_DU = 113 - self.END_MICRO_DU = 120 - case 1993: - self.HEADER = 40 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 39 - self.START_DME = 41 - self.END_DME = 48 - self.START_BE_PER_A = 49 - self.END_BE_PER_A = 59 - self.START_DBE_PER_A = 61 - self.END_DBE_PER_A = 68 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 85 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 94 - self.START_AM = 97 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 110 - self.START_MICRO_DU = 112 - self.END_MICRO_DU = 120 - case 1995: - self.HEADER = 39 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 39 - self.START_DME = 41 - self.END_DME = 48 - self.START_BE_PER_A = 49 - self.END_BE_PER_A = 59 - self.START_DBE_PER_A = 61 - self.END_DBE_PER_A = 68 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 85 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 94 - self.START_AM = 97 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 110 - self.START_MICRO_DU = 112 - self.END_MICRO_DU = 120 - case 2020: - self.HEADER = 36 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A 
= 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 42 - self.START_DME = 43 - self.END_DME = 53 - self.START_BE_PER_A = 56 - self.END_BE_PER_A = 66 - self.START_DBE_PER_A = 69 - self.END_DBE_PER_A = 77 - self.START_BETA_DECAY_ENERGY = 82 - self.END_BETA_DECAY_ENERGY = 93 - self.START_DBETA_DECAY_ENERGY = 95 - self.END_DBETA_DECAY_ENERGY = 104 - self.START_AM = 106 - self.END_AM = 109 - self.START_MICRO_U = 110 - self.END_MICRO_U = 120 - self.START_MICRO_DU = 124 - self.END_MICRO_DU = 135 - case _: - self.HEADER = 39 - self.FOOTER = 0 - self.START_Z = 11 - self.END_Z = 14 - self.START_A = 16 - self.END_A = 19 - self.START_ME = 29 - self.END_ME = 41 - self.START_DME = 42 - self.END_DME = 53 - self.START_BE_PER_A = 54 - self.END_BE_PER_A = 64 - self.START_DBE_PER_A = 65 - self.END_DBE_PER_A = 72 - self.START_BETA_DECAY_ENERGY = 76 - self.END_BETA_DECAY_ENERGY = 86 - self.START_DBETA_DECAY_ENERGY = 87 - self.END_DBETA_DECAY_ENERGY = 95 - self.START_AM = 96 - self.END_AM = 99 - self.START_MICRO_U = 100 - self.END_MICRO_U = 112 - self.START_MICRO_DU = 113 - self.END_MICRO_DU = 120 - - self.column_limits = [ - (self.START_Z, self.END_Z), - (self.START_A, self.END_A), - (self.START_ME, self.END_ME), - (self.START_DME, self.END_DME), - (self.START_BE_PER_A, self.END_BE_PER_A), - (self.START_DBE_PER_A, self.END_DBE_PER_A), - (self.START_BETA_DECAY_ENERGY, self.END_BETA_DECAY_ENERGY), - (self.START_DBETA_DECAY_ENERGY, self.END_DBETA_DECAY_ENERGY), - (self.START_AM, self.END_AM), - (self.START_MICRO_U, self.END_MICRO_U), - (self.START_MICRO_DU, self.END_MICRO_DU), + HEADER: int = 39 + FOOTER: int = 0 + START_Z: int = 11 + END_Z: int = 14 + START_A: int = 16 + END_A: int = 19 + START_AMEMassExcess: int = 29 + END_AMEMassExcess: int = 41 + START_AMEMassExcessError: int = 42 + END_AMEMassExcessError: int = 53 + START_BindingEnergyPerA: int = 54 + END_BindingEnergyPerA: int = 64 + START_BindingEnergyPerAError: int = 65 + END_BindingEnergyPerAError: int = 72 + 
START_BetaDecayEnergy: int = 76 + END_BetaDecayEnergy: int = 86 + START_BetaDecayEnergyError: int = 87 + END_BetaDecayEnergyError: int = 95 + START_AtomicNumber: int = 96 + END_AtomicNumber: int = 99 + START_AtomicMass: int = 100 + END_AtomicMass: int = 112 + START_AtomicMassError: int = 113 + END_AtomicMassError: int = 120 + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "Z", + "A", + "AMEMassExcess", + "AMEMassExcessError", + "BindingEnergyPerA", + "BindingEnergyPerAError", + "BetaDecayEnergy", + "BetaDecayEnergyError", + "AtomicNumber", + "AtomicMass", + "AtomicMassError", ] + + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class AMEMassFile: + """ + Storage class for the year specific data in the AME mass data file. + + The base ``AMEMassLayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + AME_MASS_YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``AMEMassLayout``. + layout : AMEMassLayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + AME_MASS_YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, + 1983: { + "HEADER": 35, + "END_AMEMassExcess": 39, + "START_AMEMassExcessError": 41, + "END_AMEMassExcessError": 48, + "START_BindingEnergyPerA": 49, + "END_BindingEnergyPerA": 59, + "START_BindingEnergyPerAError": 61, + "END_BindingEnergyPerAError": 68, + "START_BetaDecayEnergy": 76, + "END_BetaDecayEnergy": 85, + "START_BetaDecayEnergyError": 87, + "END_BetaDecayEnergyError": 94, + "START_AtomicNumber": 97, + "END_AtomicNumber": 99, + "START_AtomicMass": 100, + "END_AtomicMass": 110, + }, + 1993: { + "HEADER": 40, + "END_AMEMassExcess": 39, + "START_AMEMassExcessError": 41, + "END_AMEMassExcessError": 48, + "START_BindingEnergyPerA": 49, + "END_BindingEnergyPerA": 59, + "START_BindingEnergyPerAError": 61, + "END_BindingEnergyPerAError": 68, + "START_BetaDecayEnergy": 76, + "END_BetaDecayEnergy": 85, + "START_BetaDecayEnergyError": 87, + "END_BetaDecayEnergyError": 94, + "START_AtomicNumber": 97, + "END_AtomicNumber": 99, + "START_AtomicMass": 100, + "END_AtomicMass": 110, + "START_AtomicMassError": 112, + }, + 1995: { + "END_AMEMassExcess": 39, + "START_AMEMassExcessError": 41, + "END_AMEMassExcessError": 48, + "START_BindingEnergyPerA": 49, + "END_BindingEnergyPerA": 59, + "START_BindingEnergyPerAError": 61, + "END_BindingEnergyPerAError": 68, + "START_BetaDecayEnergy": 76, + "END_BetaDecayEnergy": 85, + "START_BetaDecayEnergyError": 87, + "END_BetaDecayEnergyError": 94, + "START_AtomicNumber": 97, + "END_AtomicNumber": 99, + "START_AtomicMass": 100, + "END_AtomicMass": 110, + "START_AtomicMassError": 112, + }, + # The years 2003, 2012 and 2016 have identical formatting so are used as the base + 2003: {}, + 2012: {}, + 2016: {}, + 2020: { + "HEADER": 36, + "END_AMEMassExcess": 42, + "START_AMEMassExcessError": 43, + "END_AMEMassExcessError": 53, + "START_BindingEnergyPerA": 56, + "END_BindingEnergyPerA": 66, + "START_BindingEnergyPerAError": 69, + 
"END_BindingEnergyPerAError": 77, + "START_BetaDecayEnergy": 82, + "END_BetaDecayEnergy": 93, + "START_BetaDecayEnergyError": 95, + "END_BetaDecayEnergyError": 104, + "START_AtomicNumber": 106, + "END_AtomicNumber": 109, + "START_AtomicMass": 110, + "END_AtomicMass": 120, + "START_AtomicMassError": 124, + "END_AtomicMassError": 135, + }, + } + + def __init__(self, year: int) -> None: + self.layout = AMEMassLayout( + **AMEMassFile.AME_MASS_YEAR_OVERRIDES.get(year, AMEMassFile.AME_MASS_YEAR_OVERRIDES["default"]) + ) diff --git a/src/nuclearmasses/io/ame_mass_parse.py b/src/nuclearmasses/io/ame_mass_parse.py index 350c9a3..e066138 100644 --- a/src/nuclearmasses/io/ame_mass_parse.py +++ b/src/nuclearmasses/io/ame_mass_parse.py @@ -10,7 +10,7 @@ from nuclearmasses.utils.converter import Converter, DataInput -class AMEMassParser(AMEMassFile, Converter): +class AMEMassParser: """ Parse the AME mass file, doing the necessary preparation and clean ups of data. @@ -33,9 +33,13 @@ class AMEMassParser(AMEMassFile, Converter): """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year: int = year + self.layout = AMEMassFile(year=year).layout + + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] def _column_names(self) -> list[str]: """ @@ -46,19 +50,7 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. 
""" - return [ - "Z", - "A", - "AMEMassExcess", - "AMEMassExcessError", - "BindingEnergyPerA", - "BindingEnergyPerAError", - "BetaDecayEnergy", - "BetaDecayEnergyError", - "AtomicNumber", - "AtomicMass", - "AtomicMassError", - ] + return self.layout.columns def _data_types(self) -> dict: """ @@ -125,12 +117,12 @@ def read_file(self) -> pd.DataFrame: na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = self.strip_char_from_string_columns(df, "#") + df = Converter.strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -156,11 +148,11 @@ def read_file(self) -> pd.DataFrame: # We need to rescale the error value because we combined the two columns above df = df.assign(AtomicMassError=df["AtomicMassError"].astype(float) / 1.0e6) - df = self.calculate_relative_error(df, "AME") + df = Converter.calculate_relative_error(df, "AME") df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/tests/test_ame_mass_parse.py b/tests/test_ame_mass_parse.py index 7ce15e4..a14f904 100644 --- a/tests/test_ame_mass_parse.py +++ b/tests/test_ame_mass_parse.py @@ -12,8 +12,8 @@ def test_1983_mass(): "0 11 39 28 67 Ni +n2p -63742.471 19.056 582618.683 19.066 B- 3560.871 20.646 66 931570.167 20.457 -.0" ) parser = AMEMassParser(line, 1983) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -45,8 +45,8 @@ def 
test_1993_mass(): "0 15 41 26 67 Fe x -46574.693 465.747 567012.139 465.747 B- 8746.727 543.150 66 950000.000 500.000" ) parser = AMEMassParser(line, 1993) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -79,8 +79,8 @@ def test_1995_mass(): " 15 41 26 67 Fe x -46574.693 465.747 567012.133 465.747 B- 8746.727 543.150 66 950000.000 500.000" ) parser = AMEMassParser(line, 1995) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -112,8 +112,8 @@ def test_2003_mass(): " 15 41 26 67 Fe x -45692.348 415.570 8449.695 6.203 B- 9368.702 523.438 66 950947.244 446.132" ) parser = AMEMassParser(line, 2003) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -145,8 +145,8 @@ def test_2012_mass(): " 15 41 26 67 Fe x -46068.530 217.972 8455.310 3.253 B- 9253.245 218.067 66 950543.395 234.002" ) parser = AMEMassParser(line, 2012) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -178,8 +178,8 @@ def test_2016_mass(): " 15 41 26 67 Fe x -45610.155 270.285 8448.469 4.034 B- 9711.620 270.362 66 951035.482 290.163" ) parser = AMEMassParser(line, 2016) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -211,8 +211,8 @@ def test_2020_mass(): " 15 41 26 67 Fe x -45708.416 3.819 8449.9359 0.0570 B- 9613.3678 7.4900 66 950930.000 4.100" ) parser = AMEMassParser(line, 2020) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( From 3797bbdba494ad392a9501578928e1d6fc8f4b77 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 
13:15:22 +0100 Subject: [PATCH 06/12] Replace match statement in AME reaction 1 parsing See f190918 for details --- src/nuclearmasses/io/ame_reaction_1_file.py | 282 ++++++++++--------- src/nuclearmasses/io/ame_reaction_1_parse.py | 33 +-- tests/test_ame_reaction_1_parse.py | 28 +- 3 files changed, 179 insertions(+), 164 deletions(-) diff --git a/src/nuclearmasses/io/ame_reaction_1_file.py b/src/nuclearmasses/io/ame_reaction_1_file.py index c2c16d7..4002c0d 100644 --- a/src/nuclearmasses/io/ame_reaction_1_file.py +++ b/src/nuclearmasses/io/ame_reaction_1_file.py @@ -1,23 +1,27 @@ """ -The ame_reaction_1_file module defines the ``AMEReactionFileOne`` class. This class stores the column positions of the -start and finish location of the different parameters recorded in the AME reaction 1 data file. The positions have -changed between years so the year of the table is given as a parameter at construction. +The ame_reaction_1_file module defines the ``AMEReactionOneLayout`` and ``AMEReactionOneFile`` classes. +The ``AMEReactionOneLayout`` class acts like a base class, storing the common column names and the start and end +positions of the values within the AME data file. The positions change as time progresses so the ``AMEReactionOneFile`` +class uses the year, passed as a parameter, to update the values as required. + +The years 1995, 2003, 2012 and 2016 have identical formatting so are used as the base, not the 1983 format. """ +import dataclasses -class AMEReactionFileOne: + +@dataclasses.dataclass(kw_only=True) +class AMEReactionOneLayout: """ - Storage class for the data in the AME reaction 1 data file. + Storage class for the most common data in the AME Reaction 1 data file. - The AME reaction 1 data file is fixed-width file format so we will store the format details in this class. + The AME Reaction 1 data file is fixed-width file format so we will store the format details in this class. 
Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. - Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -29,124 +33,146 @@ class AMEReactionFileOne: The first column of parameter X. END_X : int or None The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + column : list[str] + The list of columns that appear in the file + positions : list[tuple[str, str, str]] + A list of tuples detailing column name alongside start and end position in the line. """ - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1983: - self.HEADER = 30 - self.FOOTER = 0 - self.START_R1_A = 1 - self.END_R1_A = 4 - self.START_R1_Z = 8 - self.END_R1_Z = 11 - self.START_S2N = 14 - self.END_S2N = 22 - self.START_DS2N = 24 - self.END_DS2N = 30 - self.START_S2P = 32 - self.END_S2P = 39 - self.START_DS2P = 43 - self.END_DS2P = 47 - self.START_QA = 50 - self.END_QA = 57 - self.START_DQA = 60 - self.END_DQA = 65 - self.START_Q2B = 68 - self.END_Q2B = 75 - self.START_DQ2B = 78 - self.END_DQ2B = 83 - self.START_QEP = 86 - self.END_QEP = 93 - self.START_DQEP = 96 - self.END_DQEP = 101 - self.START_QBN = 103 - self.END_QBN = 111 - self.START_DQBN = 114 - self.END_DQBN = 119 - case 2020: - self.HEADER = 35 - self.FOOTER = 0 - self.START_R1_A = 1 - self.END_R1_A = 4 - self.START_R1_Z = 8 - self.END_R1_Z = 11 - self.START_S2N = 14 - self.END_S2N = 24 - self.START_DS2N = 25 - self.END_DS2N = 34 - self.START_S2P = 36 - self.END_S2P = 46 - self.START_DS2P = 47 - self.END_DS2P = 56 - self.START_QA = 
58 - self.END_QA = 68 - self.START_DQA = 69 - self.END_DQA = 78 - self.START_Q2B = 79 - self.END_Q2B = 90 - self.START_DQ2B = 91 - self.END_DQ2B = 100 - self.START_QEP = 101 - self.END_QEP = 112 - self.START_DQEP = 113 - self.END_DQEP = 122 - self.START_QBN = 123 - self.END_QBN = 134 - self.START_DQBN = 135 - self.END_DQBN = 144 - case _: - match year: - case 1995 | 2003 | 2012 | 2016: - self.HEADER = 39 - case 1993: - self.HEADER = 40 - self.FOOTER = 0 - self.START_R1_A = 1 - self.END_R1_A = 4 - self.START_R1_Z = 8 - self.END_R1_Z = 11 - self.START_S2N = 14 - self.END_S2N = 22 - self.START_DS2N = 23 - self.END_DS2N = 30 - self.START_S2P = 32 - self.END_S2P = 40 - self.START_DS2P = 41 - self.END_DS2P = 48 - self.START_QA = 50 - self.END_QA = 58 - self.START_DQA = 59 - self.END_DQA = 66 - self.START_Q2B = 67 - self.END_Q2B = 76 - self.START_DQ2B = 77 - self.END_DQ2B = 84 - self.START_QEP = 85 - self.END_QEP = 94 - self.START_DQEP = 95 - self.END_DQEP = 102 - self.START_QBN = 103 - self.END_QBN = 112 - self.START_DQBN = 113 - self.END_DQBN = 125 - - self.column_limits = [ - (self.START_R1_A, self.END_R1_A), - (self.START_R1_Z, self.END_R1_Z), - (self.START_S2N, self.END_S2N), - (self.START_DS2N, self.END_DS2N), - (self.START_S2P, self.END_S2P), - (self.START_DS2P, self.END_DS2P), - (self.START_QA, self.END_QA), - (self.START_DQA, self.END_DQA), - (self.START_Q2B, self.END_Q2B), - (self.START_DQ2B, self.END_DQ2B), - (self.START_QEP, self.END_QEP), - (self.START_DQEP, self.END_DQEP), - (self.START_QBN, self.END_QBN), - (self.START_DQBN, self.END_DQBN), + HEADER: int = 39 + FOOTER: int = 0 + START_A: int = 1 + END_A: int = 4 + START_Z: int = 8 + END_Z: int = 11 + START_TwoNeutronSeparationEnergy: int = 14 + END_TwoNeutronSeparationEnergy: int = 22 + START_TwoNeutronSeparationEnergyError: int = 23 + END_TwoNeutronSeparationEnergyError: int = 30 + START_TwoProtonSeparationEnergy: int = 32 + END_TwoProtonSeparationEnergy: int = 40 + START_TwoProtonSeparationEnergyError: 
int = 41 + END_TwoProtonSeparationEnergyError: int = 48 + START_QAlpha: int = 50 + END_QAlpha: int = 58 + START_QAlphaError: int = 59 + END_QAlphaError: int = 66 + START_QTwoBeta: int = 67 + END_QTwoBeta: int = 76 + START_QTwoBetaError: int = 77 + END_QTwoBetaError: int = 84 + START_QEpsilon: int = 85 + END_QEpsilon: int = 94 + START_QEpsilonError: int = 95 + END_QEpsilonError: int = 102 + START_QBetaNeutron: int = 103 + END_QBetaNeutron: int = 112 + START_QBetaNeutronError: int = 113 + END_QBetaNeutronError: int = 125 + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "A", + "Z", + "TwoNeutronSeparationEnergy", + "TwoNeutronSeparationEnergyError", + "TwoProtonSeparationEnergy", + "TwoProtonSeparationEnergyError", + "QAlpha", + "QAlphaError", + "QTwoBeta", + "QTwoBetaError", + "QEpsilon", + "QEpsilonError", + "QBetaNeutron", + "QBetaNeutronError", ] + + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class AMEReactionFileOne: + """ + Storage class for the year specific data in the AME reaction 1 data file. + + The base `AMEReactionOneLayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``AMEReactionOneLayout``. + layout : AMEReactionOneLayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, + 1983: { + "HEADER": 30, + "START_TwoNeutronSeparationEnergyError": 24, + "END_TwoNeutronSeparationEnergyError": 30, + "START_TwoProtonSeparationEnergy": 32, + "END_TwoProtonSeparationEnergy": 39, + "START_TwoProtonSeparationEnergyError": 43, + "END_TwoProtonSeparationEnergyError": 47, + "START_QAlpha": 50, + "END_QAlpha": 57, + "START_QAlphaError": 60, + "END_QAlphaError": 65, + "START_QTwoBeta": 68, + "END_QTwoBeta": 75, + "START_QTwoBetaError": 78, + "END_QTwoBetaError": 83, + "START_QEpsilon": 86, + "END_QEpsilon": 93, + "START_QEpsilonError": 96, + "END_QEpsilonError": 101, + "START_QBetaNeutron": 103, + "END_QBetaNeutron": 111, + "START_QBetaNeutronError": 114, + "END_QBetaNeutronError": 119, + }, + 1993: { + "HEADER": 40, + }, + # 1995 - 2016 are the base years + 1995: {}, + 2003: {}, + 2012: {}, + 2016: {}, + 2020: { + "HEADER": 35, "END_TwoNeutronSeparationEnergy": 24, + "START_TwoNeutronSeparationEnergyError": 25, + "END_TwoNeutronSeparationEnergyError": 34, + "START_TwoProtonSeparationEnergy": 36, + "END_TwoProtonSeparationEnergy": 46, + "START_TwoProtonSeparationEnergyError": 47, + "END_TwoProtonSeparationEnergyError": 56, + "START_QAlpha": 58, + "END_QAlpha": 68, + "START_QAlphaError": 69, + "END_QAlphaError": 78, + "START_QTwoBeta": 79, + "END_QTwoBeta": 90, + "START_QTwoBetaError": 91, + "END_QTwoBetaError": 100, + "START_QEpsilon": 101, + "END_QEpsilon": 112, + "START_QEpsilonError": 113, + "END_QEpsilonError": 122, + "START_QBetaNeutron": 123, + "END_QBetaNeutron": 134, + "START_QBetaNeutronError": 135, + "END_QBetaNeutronError": 144, + }, + } + + def __init__(self, year: int) -> None: + self.layout = AMEReactionOneLayout( + **AMEReactionFileOne.YEAR_OVERRIDES.get(year, AMEReactionFileOne.YEAR_OVERRIDES["default"]) + ) diff --git a/src/nuclearmasses/io/ame_reaction_1_parse.py b/src/nuclearmasses/io/ame_reaction_1_parse.py index 91b07cb..a5eef83 100644 --- 
a/src/nuclearmasses/io/ame_reaction_1_parse.py +++ b/src/nuclearmasses/io/ame_reaction_1_parse.py @@ -10,7 +10,7 @@ from nuclearmasses.utils.converter import Converter, DataInput -class AMEReactionParserOne(AMEReactionFileOne, Converter): +class AMEReactionParserOne: """ Parse the first AME reaction file, doing the necessary preparation and clean ups of data. @@ -33,9 +33,13 @@ class AMEReactionParserOne(AMEReactionFileOne, Converter): """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year = year + self.layout = AMEReactionFileOne(year).layout + + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] def _column_names(self) -> list[str]: """ @@ -46,22 +50,7 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. """ - return [ - "A", - "Z", - "TwoNeutronSeparationEnergy", - "TwoNeutronSeparationEnergyError", - "TwoProtonSeparationEnergy", - "TwoProtonSeparationEnergyError", - "QAlpha", - "QAlphaError", - "QTwoBeta", - "QTwoBetaError", - "QEpsilon", - "QEpsilonError", - "QBetaNeutron", - "QBetaNeutronError", - ] + return self.layout.columns def _data_types(self) -> dict: """ @@ -137,12 +126,12 @@ def read_file(self) -> pd.DataFrame: na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = self.strip_char_from_string_columns(df, "#") + df = Converter.strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -152,7 +141,7 @@ def read_file(self) -> pd.DataFrame: df["TableYear"] = self.year df["N"] = 
pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/tests/test_ame_reaction_1_parse.py b/tests/test_ame_reaction_1_parse.py index d7825e4..b1ceb81 100644 --- a/tests/test_ame_reaction_1_parse.py +++ b/tests/test_ame_reaction_1_parse.py @@ -13,8 +13,8 @@ def test_1983_rct1(): " 186 Ir 77 15780 250 9536 20 3850 100 -7600# 300# -2639 20 -10640# 200#" ) parser = AMEReactionParserOne(line, 1983) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -49,8 +49,8 @@ def test_1993_rct1(): " 186 Ir 77 15618.44 270.74 9522.98 20.49 3852.98 103.94 -7419.61 145.57 -2635.85 20.03 -10622# 230#" ) parser = AMEReactionParserOne(line, 1993) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -85,8 +85,8 @@ def test_1995_rct1(): " 186 Ir 77 15618.41 270.74 9522.89 20.49 3853.04 103.94 -7495.33 145.56 -2635.83 20.03 -10682.00 207.60" ) parser = AMEReactionParserOne(line, 1995) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -121,8 +121,8 @@ def test_2003_rct1(): " 186 Ir 77 15704.74 32.47 9524.26 17.08 3849.65 103.31 -7458.10 26.70 -2639.77 16.57 -10561.10 44.19" ) parser = AMEReactionParserOne(line, 2003) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -157,8 +157,8 @@ def test_2012_rct1(): " 186 Ir 77 15706.55 32.47 9527.99 17.09 3848.03 103.31 -7459.92 26.70 -2641.13 16.57 -10557.95 30.67" ) parser = AMEReactionParserOne(line, 2012) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + 
parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -193,8 +193,8 @@ def test_2016_rct1(): " 186 Ir 77 15704.13 32.47 9530.65 17.07 3848.80 103.31 -7457.49 26.70 -2642.29 16.55 -10555.52 30.67" ) parser = AMEReactionParserOne(line, 2016) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -229,8 +229,8 @@ def test_2020_rct1(): " 186 Ir 77 15704.1312 32.4655 9530.4731 17.0698 3848.8777 103.3133 -7457.4943 26.6968 -2642.2739 16.5459 -10555.5245 30.6658" ) parser = AMEReactionParserOne(line, 2020) - parser.HEADER = 0 - parser.FOOTER = 0 + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( From 1042e21ba62237924687ed955c3326d8ad1b1704 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 14:29:17 +0100 Subject: [PATCH 07/12] Replace match statement in reaction 2 parsing See f190918 for details --- src/nuclearmasses/io/ame.py | 4 +- src/nuclearmasses/io/ame_reaction_2_file.py | 282 ++++++++++--------- src/nuclearmasses/io/ame_reaction_2_parse.py | 37 +-- tests/test_ame_reaction_2_parse.py | 44 +-- 4 files changed, 192 insertions(+), 175 deletions(-) diff --git a/src/nuclearmasses/io/ame.py b/src/nuclearmasses/io/ame.py index 4a3422a..2aaf881 100644 --- a/src/nuclearmasses/io/ame.py +++ b/src/nuclearmasses/io/ame.py @@ -8,7 +8,7 @@ from nuclearmasses.io.ame_mass_parse import AMEMassParser from nuclearmasses.io.ame_reaction_1_parse import AMEReactionParserOne -from nuclearmasses.io.ame_reaction_2_parse import AMEReactionParserTwo +from nuclearmasses.io.ame_reaction_2_parse import AMEReactionTwoParser class AME: @@ -90,7 +90,7 @@ def parse_year(self, year: int) -> pd.DataFrame: mass_df = AMEMassParser(filename=ame_mass, year=year).read_file() rct1_df = AMEReactionParserOne(filename=ame_reaction_1, year=year).read_file() - rct2_df = AMEReactionParserTwo(filename=ame_reaction_2, 
year=year).read_file() + rct2_df = AMEReactionTwoParser(filename=ame_reaction_2, year=year).read_file() # Merge all 3 of the AME dataframes into one common_columns = ["A", "Z", "N", "TableYear", "Symbol", "DataSource"] diff --git a/src/nuclearmasses/io/ame_reaction_2_file.py b/src/nuclearmasses/io/ame_reaction_2_file.py index c3158a3..9568485 100644 --- a/src/nuclearmasses/io/ame_reaction_2_file.py +++ b/src/nuclearmasses/io/ame_reaction_2_file.py @@ -1,23 +1,27 @@ """ -The ame_reaction_2_file module defines the ``AMEReactionFileTwo`` class. This class stores the column positions of the -start and finish location of the different parameters recorded in the AME reaction 2 data file. The positions have -changed between years so the year of the table is given as a parameter at construction. +The ame_reaction_2_file module defines the ``AMEReactionTwoLayout`` and ``AMEReactionTwoFile`` classes. +The ``AMEReactionTwoLayout`` class acts like a base class, storing the common column names and the start and end +positions of the values within the AME data file. The positions change as time progresses so the ``AMEReactionTwoFile`` +class uses the year, passed as a parameter, to update the values as required. + +The years 1995, 2003, 2012 and 2016 have identical formatting so are used as the base, not the 1983 format. """ +import dataclasses + -class AMEReactionFileTwo: +@dataclasses.dataclass(kw_only=True) +class AMEReactionTwoLayout: """ - Storage class for the data in the AME reaction 2 data file. + Storage class for the most common data in the AME reaction 2 data file. The AME reaction 2 data file is fixed-width file format so we will store the format details in this class. Note we have not listed all parameters in the attributes section as there are so many. The naming convention is however shown, along with a description. 
- Parameters - ---------- - year : int - The year the file being parsed was published + The attribute names align with column names as a string to allow dynamic creation of other variables and attributes + in other parts of the code. Attributes ---------- @@ -29,124 +33,148 @@ class AMEReactionFileTwo: The first column of parameter X. END_X : int or None The last column of parameter X or None to represent the end of the line. - column_limits : list[tuple[int, int]] - The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + column : list[str] + The list of columns that appear in the file + positions : list[tuple[str, str, str]] + A list of tuples detailing column name alongside start and end position in the line. """ - def __init__(self, year: int, **kwargs): - super().__init__(**kwargs) - match year: - case 1983: - self.HEADER = 30 - self.FOOTER = 0 - self.START_R2_A = 1 - self.END_R2_A = 4 - self.START_R2_Z = 8 - self.END_R2_Z = 11 - self.START_SN = 14 - self.END_SN = 22 - self.START_DSN = 24 - self.END_DSN = 28 - self.START_SP = 30 - self.END_SP = 40 - self.START_DSP = 42 - self.END_DSP = 48 - self.START_Q4B = 49 - self.END_Q4B = 57 - self.START_DQ4B = 60 - self.END_DQ4B = 65 - self.START_QDA = 68 - self.END_QDA = 76 - self.START_DQDA = 78 - self.END_DQDA = 84 - self.START_QPA = 86 - self.END_QPA = 94 - self.START_DQPA = 96 - self.END_DQPA = 102 - self.START_QNA = 103 - self.END_QNA = 112 - self.START_DQNA = 114 - self.END_DQNA = 120 - case 2020: - self.HEADER = 37 - self.FOOTER = 15 - self.START_R2_A = 1 - self.END_R2_A = 4 - self.START_R2_Z = 8 - self.END_R2_Z = 11 - self.START_SN = 14 - self.END_SN = 24 - self.START_DSN = 25 - self.END_DSN = 34 - self.START_SP = 36 - self.END_SP = 46 - self.START_DSP = 47 - self.END_DSP = 56 - self.START_Q4B = 57 - self.END_Q4B = 68 - self.START_DQ4B = 69 - self.END_DQ4B = 78 - self.START_QDA = 79 - self.END_QDA = 90 - self.START_DQDA = 91 - self.END_DQDA = 100 - 
self.START_QPA = 101 - self.END_QPA = 112 - self.START_DQPA = 113 - self.END_DQPA = 122 - self.START_QNA = 123 - self.END_QNA = 134 - self.START_DQNA = 135 - self.END_DQNA = 144 - case _: - match year: - case 1995 | 2003 | 2012 | 2016: - self.HEADER = 39 - case 1993: - self.HEADER = 40 - self.FOOTER = 0 - self.START_R2_A = 1 - self.END_R2_A = 4 - self.START_R2_Z = 8 - self.END_R2_Z = 11 - self.START_SN = 14 - self.END_SN = 22 - self.START_DSN = 23 - self.END_DSN = 30 - self.START_SP = 32 - self.END_SP = 40 - self.START_DSP = 41 - self.END_DSP = 48 - self.START_Q4B = 49 - self.END_Q4B = 58 - self.START_DQ4B = 59 - self.END_DQ4B = 66 - self.START_QDA = 67 - self.END_QDA = 76 - self.START_DQDA = 77 - self.END_DQDA = 84 - self.START_QPA = 85 - self.END_QPA = 94 - self.START_DQPA = 95 - self.END_DQPA = 102 - self.START_QNA = 103 - self.END_QNA = 112 - self.START_DQNA = 113 - self.END_DQNA = 125 - - self.column_limits = [ - (self.START_R2_A, self.END_R2_A), - (self.START_R2_Z, self.END_R2_Z), - (self.START_SN, self.END_SN), - (self.START_DSN, self.END_DSN), - (self.START_SP, self.END_SP), - (self.START_DSP, self.END_DSP), - (self.START_Q4B, self.END_Q4B), - (self.START_DQ4B, self.END_DQ4B), - (self.START_QDA, self.END_QDA), - (self.START_DQDA, self.END_DQDA), - (self.START_QPA, self.END_QPA), - (self.START_DQPA, self.END_DQPA), - (self.START_QNA, self.END_QNA), - (self.START_DQNA, self.END_DQNA), + HEADER: int = 39 + FOOTER: int = 0 + START_A: int = 1 + END_A: int = 4 + START_Z: int = 8 + END_Z: int = 11 + START_OneNeutronSeparationEnergy: int = 14 + END_OneNeutronSeparationEnergy: int = 22 + START_OneNeutronSeparationEnergyError: int = 23 + END_OneNeutronSeparationEnergyError: int = 30 + START_OneProtonSeparationEnergy: int = 32 + END_OneProtonSeparationEnergy: int = 40 + START_OneProtonSeparationEnergyError: int = 41 + END_OneProtonSeparationEnergyError: int = 48 + START_QFourBeta: int = 49 + END_QFourBeta: int = 58 + START_QFourBetaError: int = 59 + 
END_QFourBetaError: int = 66 + START_QDeuteronAlpha: int = 67 + END_QDeuteronAlpha: int = 76 + START_QDeuteronAlphaError: int = 77 + END_QDeuteronAlphaError: int = 84 + START_QProtonAlpha: int = 85 + END_QProtonAlpha: int = 94 + START_QProtonAlphaError: int = 95 + END_QProtonAlphaError: int = 102 + START_QNeutronAlpha: int = 103 + END_QNeutronAlpha: int = 112 + START_QNeutronAlphaError: int = 113 + END_QNeutronAlphaError: int = 125 + + def __post_init__(self) -> None: + self.columns: list[str] = [ + "A", + "Z", + "OneNeutronSeparationEnergy", + "OneNeutronSeparationEnergyError", + "OneProtonSeparationEnergy", + "OneProtonSeparationEnergyError", + "QFourBeta", + "QFourBetaError", + "QDeuteronAlpha", + "QDeuteronAlphaError", + "QProtonAlpha", + "QProtonAlphaError", + "QNeutronAlpha", + "QNeutronAlphaError", ] + + self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] + + +class AMEReactionTwoFile: + """ + Storage class for the year specific data in the AME reaction 2 data file. + + The base ``AMEReactionTwoLayout`` class is constructed and updated as required for the given ``year``. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + YEAR_OVERRIDES : dict[int | str, dict[str, int]] + Year specific updates and changes required to ``AMEReactionTwoLayout``. + layout : AMEReactionTwoLayout + A storage class containing details of parameters and their locations in the line. 
+ """ + + YEAR_OVERRIDES: dict[int | str, dict[str, int]] = { + "default": {}, + 1983: { + "HEADER": 30, + "START_OneNeutronSeparationEnergyError": 24, + "END_OneNeutronSeparationEnergyError": 28, + "START_OneProtonSeparationEnergy": 30, + "END_OneProtonSeparationEnergy": 40, + "START_OneProtonSeparationEnergyError": 42, + "END_OneProtonSeparationEnergyError": 48, + "START_QFourBeta": 49, + "END_QFourBeta": 57, + "START_QFourBetaError": 60, + "END_QFourBetaError": 65, + "START_QDeuteronAlpha": 68, + "END_QDeuteronAlpha": 76, + "START_QDeuteronAlphaError": 78, + "END_QDeuteronAlphaError": 84, + "START_QProtonAlpha": 86, + "END_QProtonAlpha": 94, + "START_QProtonAlphaError": 96, + "END_QProtonAlphaError": 102, + "START_QNeutronAlpha": 103, + "END_QNeutronAlpha": 112, + "START_QNeutronAlphaError": 114, + "END_QNeutronAlphaError": 120, + }, + 1993: { + "HEADER": 40, + }, + # 1995 - 2016 are the base years + 1995: {}, + 2003: {}, + 2012: {}, + 2016: {}, + 2020: { + "HEADER": 37, + "FOOTER": 15, + "END_OneNeutronSeparationEnergy": 24, + "START_OneNeutronSeparationEnergyError": 25, + "END_OneNeutronSeparationEnergyError": 34, + "START_OneProtonSeparationEnergy": 36, + "END_OneProtonSeparationEnergy": 46, + "START_OneProtonSeparationEnergyError": 47, + "END_OneProtonSeparationEnergyError": 56, + "START_QFourBeta": 57, + "END_QFourBeta": 68, + "START_QFourBetaError": 69, + "END_QFourBetaError": 78, + "START_QDeuteronAlpha": 79, + "END_QDeuteronAlpha": 90, + "START_QDeuteronAlphaError": 91, + "END_QDeuteronAlphaError": 100, + "START_QProtonAlpha": 101, + "END_QProtonAlpha": 112, + "START_QProtonAlphaError": 113, + "END_QProtonAlphaError": 122, + "START_QNeutronAlpha": 123, + "END_QNeutronAlpha": 134, + "START_QNeutronAlphaError": 135, + "END_QNeutronAlphaError": 144, + }, + } + + def __init__(self, year: int) -> None: + self.layout = AMEReactionTwoLayout( + **AMEReactionTwoFile.YEAR_OVERRIDES.get(year, AMEReactionTwoFile.YEAR_OVERRIDES["default"]) + ) diff --git 
a/src/nuclearmasses/io/ame_reaction_2_parse.py b/src/nuclearmasses/io/ame_reaction_2_parse.py index 11bd258..978bb29 100644 --- a/src/nuclearmasses/io/ame_reaction_2_parse.py +++ b/src/nuclearmasses/io/ame_reaction_2_parse.py @@ -1,16 +1,16 @@ """ -The ame_reaction_2_parse module defines the ``AMEReactionParserTwo`` class. This class contains the logic required to +The ame_reaction_2_parse module defines the ``AMEReactionTwoParser`` class. This class contains the logic required to sort and organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and inconsistencies are cleaned from the resultant dataframe. """ import pandas as pd -from nuclearmasses.io.ame_reaction_2_file import AMEReactionFileTwo +from nuclearmasses.io.ame_reaction_2_file import AMEReactionTwoFile from nuclearmasses.utils.converter import Converter, DataInput -class AMEReactionParserTwo(AMEReactionFileTwo, Converter): +class AMEReactionTwoParser: """ Parse the second AME reaction file, doing the necessary preparation and clean ups of data. @@ -33,9 +33,13 @@ class AMEReactionParserTwo(AMEReactionFileTwo, Converter): """ def __init__(self, filename: DataInput, year: int): - super().__init__(year=year) self.filename: DataInput = filename self.year = year + self.layout = AMEReactionTwoFile(year).layout + + self.column_limits = [ + (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions + ] def _column_names(self) -> list[str]: """ @@ -46,22 +50,7 @@ def _column_names(self) -> list[str]: list[str] An ordered list of the columns that exist in the file. 
""" - return [ - "A", - "Z", - "OneNeutronSeparationEnergy", - "OneNeutronSeparationEnergyError", - "OneProtonSeparationEnergy", - "OneProtonSeparationEnergyError", - "QFourBeta", - "QFourBetaError", - "QDeuteronAlpha", - "QDeuteronAlphaError", - "QProtonAlpha", - "QProtonAlphaError", - "QNeutronAlpha", - "QNeutronAlphaError", - ] + return self.layout.columns def _data_types(self) -> dict: """ @@ -137,12 +126,12 @@ def read_file(self) -> pd.DataFrame: na_values=self._na_values(), keep_default_na=False, on_bad_lines="warn", - skiprows=self.HEADER, - skipfooter=self.FOOTER, + skiprows=self.layout.HEADER, + skipfooter=self.layout.FOOTER, ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = self.strip_char_from_string_columns(df, "#") + df = Converter.strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -156,7 +145,7 @@ def read_file(self) -> pd.DataFrame: # Repeated column heading also means we have to cast to create new columns df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(self.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/tests/test_ame_reaction_2_parse.py b/tests/test_ame_reaction_2_parse.py index a104b57..297cb3f 100644 --- a/tests/test_ame_reaction_2_parse.py +++ b/tests/test_ame_reaction_2_parse.py @@ -3,16 +3,16 @@ import pandas as pd import pandas.testing as pdt -from nuclearmasses.io.ame_reaction_2_parse import AMEReactionParserTwo +from nuclearmasses.io.ame_reaction_2_parse import AMEReactionTwoParser def test_1983_rct2(): line = io.StringIO( " 204 Tl 81 7853 17 5702.8 1.7 -13480 120 12613.7 1.8 8608.4 1.8 7180 50" ) - parser = AMEReactionParserTwo(line, 1983) - parser.HEADER = 0 - parser.FOOTER = 
0 + parser = AMEReactionTwoParser(line, 1983) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -47,9 +47,9 @@ def test_1993_rct2(): line = io.StringIO( " 204 Tl 81 6655.82 0.29 6365.32 1.26 -12492.85 71.39 13712.87 1.23 8183.14 1.25 7690.59 15.05" ) - parser = AMEReactionParserTwo(line, 1993) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 1993) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -84,9 +84,9 @@ def test_1995_rct2(): line = io.StringIO( " 204 Tl 81 6655.86 0.29 6365.35 1.26 -12494.05 92.85 13713.05 1.23 8183.32 1.24 7702.97 3.35" ) - parser = AMEReactionParserTwo(line, 1995) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 1995) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -121,9 +121,9 @@ def test_2003_rct2(): line = io.StringIO( " 204 Tl 81 6656.10 0.29 6365.82 1.25 -12470.66 24.01 13710.69 1.15 8181.34 1.16 7701.54 3.34" ) - parser = AMEReactionParserTwo(line, 2003) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2003) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -158,9 +158,9 @@ def test_2012_rct2(): line = io.StringIO( " 204 Tl 81 6656.09 0.29 6365.80 1.25 -12470.19 22.31 13710.68 1.14 8181.16 1.15 7701.67 3.33" ) - parser = AMEReactionParserTwo(line, 2012) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2012) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -195,9 +195,9 @@ def test_2016_rct2(): line = io.StringIO( " 204 Tl 81 6656.08 0.29 6365.85 1.25 -12470.71 22.32 13709.99 1.06 8180.45 1.07 7700.97 3.31" ) - parser = AMEReactionParserTwo(line, 2016) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2016) + 
parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( @@ -232,9 +232,9 @@ def test_2020_rct2(): line = io.StringIO( " 204 Tl 81 6656.0787 0.2907 6365.8379 1.2542 -12470.8182 22.6974 13710.0469 1.0612 8180.5147 1.0721 7701.0380 3.3084" ) - parser = AMEReactionParserTwo(line, 2020) - parser.HEADER = 0 - parser.FOOTER = 0 + parser = AMEReactionTwoParser(line, 2020) + parser.layout.HEADER = 0 + parser.layout.FOOTER = 0 df = parser.read_file() expected = pd.DataFrame( From 0cd930cee222bd970f746952ac46ee3539b314e2 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 14:34:45 +0100 Subject: [PATCH 08/12] Be consistent with class naming --- src/nuclearmasses/io/ame.py | 4 ++-- src/nuclearmasses/io/ame_reaction_1_file.py | 4 ++-- src/nuclearmasses/io/ame_reaction_1_parse.py | 8 ++++---- tests/test_ame_reaction_1_parse.py | 16 ++++++++-------- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/nuclearmasses/io/ame.py b/src/nuclearmasses/io/ame.py index 2aaf881..9cfaadd 100644 --- a/src/nuclearmasses/io/ame.py +++ b/src/nuclearmasses/io/ame.py @@ -7,7 +7,7 @@ import pandas as pd from nuclearmasses.io.ame_mass_parse import AMEMassParser -from nuclearmasses.io.ame_reaction_1_parse import AMEReactionParserOne +from nuclearmasses.io.ame_reaction_1_parse import AMEReactionOneParser from nuclearmasses.io.ame_reaction_2_parse import AMEReactionTwoParser @@ -89,7 +89,7 @@ def parse_year(self, year: int) -> pd.DataFrame: ame_mass, ame_reaction_1, ame_reaction_2 = self.get_datafiles(year) mass_df = AMEMassParser(filename=ame_mass, year=year).read_file() - rct1_df = AMEReactionParserOne(filename=ame_reaction_1, year=year).read_file() + rct1_df = AMEReactionOneParser(filename=ame_reaction_1, year=year).read_file() rct2_df = AMEReactionTwoParser(filename=ame_reaction_2, year=year).read_file() # Merge all 3 of the AME dataframes into one diff --git a/src/nuclearmasses/io/ame_reaction_1_file.py 
b/src/nuclearmasses/io/ame_reaction_1_file.py index 4002c0d..abb3bf9 100644 --- a/src/nuclearmasses/io/ame_reaction_1_file.py +++ b/src/nuclearmasses/io/ame_reaction_1_file.py @@ -91,7 +91,7 @@ def __post_init__(self) -> None: self.positions: list[tuple[str, str, str]] = [(f"{c}", f"START_{c}", f"END_{c}") for c in self.columns] -class AMEReactionFileOne: +class AMEReactionOneFile: """ Storage class for the year specific data in the AME reaction 1 data file. @@ -174,5 +174,5 @@ class AMEReactionFileOne: def __init__(self, year: int) -> None: self.layout = AMEReactionOneLayout( - **AMEReactionFileOne.YEAR_OVERRIDES.get(year, AMEReactionFileOne.YEAR_OVERRIDES["default"]) + **AMEReactionOneFile.YEAR_OVERRIDES.get(year, AMEReactionOneFile.YEAR_OVERRIDES["default"]) ) diff --git a/src/nuclearmasses/io/ame_reaction_1_parse.py b/src/nuclearmasses/io/ame_reaction_1_parse.py index a5eef83..cfcb26d 100644 --- a/src/nuclearmasses/io/ame_reaction_1_parse.py +++ b/src/nuclearmasses/io/ame_reaction_1_parse.py @@ -1,16 +1,16 @@ """ -The ame_reaction_1_parse module defines the ``AMEReactionParserOne`` class. This class contains the logic required to +The ame_reaction_1_parse module defines the ``AMEReactionOneParser`` class. This class contains the logic required to sort and organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and inconsistencies are cleaned from the resultant dataframe. """ import pandas as pd -from nuclearmasses.io.ame_reaction_1_file import AMEReactionFileOne +from nuclearmasses.io.ame_reaction_1_file import AMEReactionOneFile from nuclearmasses.utils.converter import Converter, DataInput -class AMEReactionParserOne: +class AMEReactionOneParser: """ Parse the first AME reaction file, doing the necessary preparation and clean ups of data. 
@@ -35,7 +35,7 @@ class AMEReactionParserOne: def __init__(self, filename: DataInput, year: int): self.filename: DataInput = filename self.year = year - self.layout = AMEReactionFileOne(year).layout + self.layout = AMEReactionOneFile(year).layout self.column_limits = [ (getattr(self.layout, start), getattr(self.layout, end)) for _, start, end in self.layout.positions diff --git a/tests/test_ame_reaction_1_parse.py b/tests/test_ame_reaction_1_parse.py index b1ceb81..6e56f24 100644 --- a/tests/test_ame_reaction_1_parse.py +++ b/tests/test_ame_reaction_1_parse.py @@ -3,7 +3,7 @@ import pandas as pd import pandas.testing as pdt -from nuclearmasses.io.ame_reaction_1_parse import AMEReactionParserOne +from nuclearmasses.io.ame_reaction_1_parse import AMEReactionOneParser def test_1983_rct1(): @@ -12,7 +12,7 @@ def test_1983_rct1(): line = io.StringIO( " 186 Ir 77 15780 250 9536 20 3850 100 -7600# 300# -2639 20 -10640# 200#" ) - parser = AMEReactionParserOne(line, 1983) + parser = AMEReactionOneParser(line, 1983) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() @@ -48,7 +48,7 @@ def test_1993_rct1(): line = io.StringIO( " 186 Ir 77 15618.44 270.74 9522.98 20.49 3852.98 103.94 -7419.61 145.57 -2635.85 20.03 -10622# 230#" ) - parser = AMEReactionParserOne(line, 1993) + parser = AMEReactionOneParser(line, 1993) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() @@ -84,7 +84,7 @@ def test_1995_rct1(): line = io.StringIO( " 186 Ir 77 15618.41 270.74 9522.89 20.49 3853.04 103.94 -7495.33 145.56 -2635.83 20.03 -10682.00 207.60" ) - parser = AMEReactionParserOne(line, 1995) + parser = AMEReactionOneParser(line, 1995) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() @@ -120,7 +120,7 @@ def test_2003_rct1(): line = io.StringIO( " 186 Ir 77 15704.74 32.47 9524.26 17.08 3849.65 103.31 -7458.10 26.70 -2639.77 16.57 -10561.10 44.19" ) - parser = AMEReactionParserOne(line, 2003) + parser = 
AMEReactionOneParser(line, 2003) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() @@ -156,7 +156,7 @@ def test_2012_rct1(): line = io.StringIO( " 186 Ir 77 15706.55 32.47 9527.99 17.09 3848.03 103.31 -7459.92 26.70 -2641.13 16.57 -10557.95 30.67" ) - parser = AMEReactionParserOne(line, 2012) + parser = AMEReactionOneParser(line, 2012) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() @@ -192,7 +192,7 @@ def test_2016_rct1(): line = io.StringIO( " 186 Ir 77 15704.13 32.47 9530.65 17.07 3848.80 103.31 -7457.49 26.70 -2642.29 16.55 -10555.52 30.67" ) - parser = AMEReactionParserOne(line, 2016) + parser = AMEReactionOneParser(line, 2016) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() @@ -228,7 +228,7 @@ def test_2020_rct1(): line = io.StringIO( " 186 Ir 77 15704.1312 32.4655 9530.4731 17.0698 3848.8777 103.3133 -7457.4943 26.6968 -2642.2739 16.5459 -10555.5245 30.6658" ) - parser = AMEReactionParserOne(line, 2020) + parser = AMEReactionOneParser(line, 2020) parser.layout.HEADER = 0 parser.layout.FOOTER = 0 df = parser.read_file() From 7c168b3e85effd979d19bfd953ced8cc9f0711e7 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 21:03:49 +0100 Subject: [PATCH 09/12] Refactor Converter class into separate modules The utils/ directory was created to allow the separation of functionality but everything ended up in a single class for ease. Before the repo grows too large let's split up as planned. 
--- src/nuclearmasses/io/ame_mass_parse.py | 12 +- src/nuclearmasses/io/ame_reaction_1_parse.py | 10 +- src/nuclearmasses/io/ame_reaction_2_parse.py | 10 +- src/nuclearmasses/io/nubase_parse.py | 15 +- src/nuclearmasses/mass_table.py | 4 +- src/nuclearmasses/utils/converter.py | 255 ------------------- src/nuclearmasses/utils/dataframe_utils.py | 96 +++++++ src/nuclearmasses/utils/periodic.py | 85 +++++++ src/nuclearmasses/utils/type_defs.py | 10 + src/nuclearmasses/utils/units.py | 62 +++++ 10 files changed, 283 insertions(+), 276 deletions(-) delete mode 100644 src/nuclearmasses/utils/converter.py create mode 100644 src/nuclearmasses/utils/dataframe_utils.py create mode 100644 src/nuclearmasses/utils/periodic.py create mode 100644 src/nuclearmasses/utils/type_defs.py create mode 100644 src/nuclearmasses/utils/units.py diff --git a/src/nuclearmasses/io/ame_mass_parse.py b/src/nuclearmasses/io/ame_mass_parse.py index e066138..1bf135c 100644 --- a/src/nuclearmasses/io/ame_mass_parse.py +++ b/src/nuclearmasses/io/ame_mass_parse.py @@ -7,7 +7,9 @@ import pandas as pd from nuclearmasses.io.ame_mass_file import AMEMassFile -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.utils.dataframe_utils import calculate_relative_error, read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput class AMEMassParser: @@ -110,7 +112,7 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the AME mass data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), @@ -122,7 +124,7 @@ def read_file(self) -> pd.DataFrame: ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = Converter.strip_char_from_string_columns(df, "#") + df = 
strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -148,11 +150,11 @@ def read_file(self) -> pd.DataFrame: # We need to rescale the error value because we combined the two columns above df = df.assign(AtomicMassError=df["AtomicMassError"].astype(float) / 1.0e6) - df = Converter.calculate_relative_error(df, "AME") + df = calculate_relative_error(df, "AME") df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/io/ame_reaction_1_parse.py b/src/nuclearmasses/io/ame_reaction_1_parse.py index cfcb26d..2559496 100644 --- a/src/nuclearmasses/io/ame_reaction_1_parse.py +++ b/src/nuclearmasses/io/ame_reaction_1_parse.py @@ -7,7 +7,9 @@ import pandas as pd from nuclearmasses.io.ame_reaction_1_file import AMEReactionOneFile -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.utils.dataframe_utils import read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput class AMEReactionOneParser: @@ -119,7 +121,7 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the first AME reaction data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), @@ -131,7 +133,7 @@ def read_file(self) -> pd.DataFrame: ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = Converter.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in 
the 1983 table @@ -141,7 +143,7 @@ def read_file(self) -> pd.DataFrame: df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/io/ame_reaction_2_parse.py b/src/nuclearmasses/io/ame_reaction_2_parse.py index 978bb29..4a6f49b 100644 --- a/src/nuclearmasses/io/ame_reaction_2_parse.py +++ b/src/nuclearmasses/io/ame_reaction_2_parse.py @@ -7,7 +7,9 @@ import pandas as pd from nuclearmasses.io.ame_reaction_2_file import AMEReactionTwoFile -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.utils.dataframe_utils import read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput class AMEReactionTwoParser: @@ -119,7 +121,7 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the second AME reaction data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), @@ -131,7 +133,7 @@ def read_file(self) -> pd.DataFrame: ) # We use the NUBASE data to define whether or not an isotope is experimentally measured, # so for this data we'll just drop any and all '#' characters - df = Converter.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") if self.year == 1983: # The column headers and units are repeated in the 1983 table @@ -145,7 +147,7 @@ def read_file(self) -> pd.DataFrame: # Repeated column heading also means we have to cast to create new columns df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) 
df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/io/nubase_parse.py b/src/nuclearmasses/io/nubase_parse.py index 02c7297..cd4371a 100644 --- a/src/nuclearmasses/io/nubase_parse.py +++ b/src/nuclearmasses/io/nubase_parse.py @@ -7,7 +7,10 @@ import pandas as pd from nuclearmasses.io.nubase_file import NUBASEFile -from nuclearmasses.utils.converter import Converter, DataInput +from nuclearmasses.utils.dataframe_utils import calculate_relative_error, read_fwf, strip_char_from_string_columns +from nuclearmasses.utils.periodic import get_symbol +from nuclearmasses.utils.type_defs import DataInput +from nuclearmasses.utils.units import unit_to_seconds class NUBASEParser: @@ -179,7 +182,7 @@ def parse_half_life(self, raw_df) -> pd.DataFrame: raw_df[col] = pd.to_numeric(raw_df[col], errors="coerce") # Pre-compute unit -> second conversion - unit_map = raw_df["HalfLifeUnit"].map(Converter.unit_to_seconds) + unit_map = raw_df["HalfLifeUnit"].map(unit_to_seconds) raw_df["HalfLifeSeconds"] = raw_df["HalfLifeValue"] * unit_map raw_df["HalfLifeErrorSeconds"] = raw_df["HalfLifeError"] * unit_map @@ -222,7 +225,7 @@ def read_file(self) -> pd.DataFrame: pandas.DataFrame A dataframe containing the parsed and organised contents of the NUBASE data file """ - df = Converter.read_fwf( + df = read_fwf( self.filename, colspecs=self.column_limits, names=self._column_names(), @@ -238,10 +241,10 @@ def read_file(self) -> pd.DataFrame: # We use the NUBASE data to define whether or not an isotope is experimentally measured, df["Experimental"] = ~df["NUBASEMassExcess"].astype("string").str.contains("#", na=False) # Once we have used the '#' to determine if it's experimental or not, we can remove all instances of it - df = Converter.strip_char_from_string_columns(df, "#") + df = strip_char_from_string_columns(df, "#") df = self.parse_half_life(df) - df = Converter.calculate_relative_error(df, "NUBASE") + df = calculate_relative_error(df, "NUBASE") if 
self.year == 2012: # 198Au has a typo in it's decay mode in the 2012 table. It is recorded as '-' @@ -249,7 +252,7 @@ def read_file(self) -> pd.DataFrame: df["TableYear"] = self.year df["N"] = pd.to_numeric(df["A"]) - pd.to_numeric(df["Z"]) - df["Symbol"] = pd.to_numeric(df["Z"]).map(Converter.get_symbol) + df["Symbol"] = pd.to_numeric(df["Z"]).map(get_symbol) df["DataSource"] = 0 return df.astype(self._data_types()) diff --git a/src/nuclearmasses/mass_table.py b/src/nuclearmasses/mass_table.py index ff0e349..1980412 100644 --- a/src/nuclearmasses/mass_table.py +++ b/src/nuclearmasses/mass_table.py @@ -14,7 +14,7 @@ from nuclearmasses.io.ame import AME from nuclearmasses.io.nubase import NUBASE -from nuclearmasses.utils.converter import Converter +from nuclearmasses.utils.periodic import get_symbol class MassTable: @@ -103,7 +103,7 @@ def add_user_data( user_columns = set(user_df.columns) # The symbol is commonly used so if it wasn't in the file, create it as a column if "Symbol" not in user_columns: - user_df["Symbol"] = pd.to_numeric(user_df["Z"]).map(Converter().get_symbol) + user_df["Symbol"] = pd.to_numeric(user_df["Z"]).map(get_symbol) # Set the source value using the function parameter if it hasn't already been set if "DataSource" not in user_columns: diff --git a/src/nuclearmasses/utils/converter.py b/src/nuclearmasses/utils/converter.py deleted file mode 100644 index afa787d..0000000 --- a/src/nuclearmasses/utils/converter.py +++ /dev/null @@ -1,255 +0,0 @@ -""" -The converter module defines the ``Converter`` class that is used to store lookup dictionaries to allow simple and fast -conversions between scientific units and seconds, and element symbol and Z value. The dictionaries are defined on the -class level so any instance should share a single copy. 
-""" - -import importlib -from importlib.resources.abc import Traversable -import os -import typing - -import pandas as pd - -# Typing hint Union for the different ways a file or data can be represented -DataInput = Traversable | os.PathLike[str] | str | typing.TextIO - - -class Converter: - """ - Utility class to convert between various physical properties. - - All methods are static so it is not necessary to create an instance of the class. - - Internal dictionaries allow bidirectional conversion between element symbol and Z, as well as the conversion of an - time unit in SI format into the equivalent number of seconds (e.g. min -> 60.0). - """ - - UNIT_TO_SECONDS: dict[str, float] = { - "s": 1.0, - "ms": 1e-3, - "us": 1e-6, - "ns": 1e-9, - "ps": 1e-12, - "as": 1e-18, - "zs": 1e-21, - "ys": 1e-24, - "min": 60.0, - "h": 3600.0, - "d": 86400.0, - "yr": 31_557_600.0, # 365.25 days - "kyr": 3.15576e10, - "myr": 3.15576e13, - "gyr": 3.15576e16, - "zyr": 3.15576e21, - "eyr": 3.15576e18, - "pyr": 3.15576e15, - "tyr": 3.15576e12, - "yyr": 3.15576e24, - } - - # fmt: off - # Formatter wants to put each item on it's own line, I don't - Z_TO_SYMBOL: dict[int, str] = { - 0: "n", 1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", - 10: "Ne", 11: "Na", 12: "Mg", 13: "Al", 14: "Si", 15: "P", 16: "S", 17: "Cl", 18: "Ar", 19: "K", - 20: "Ca", 21: "Sc", 22: "Ti", 23: "V", 24: "Cr", 25: "Mn", 26: "Fe", 27: "Co", 28: "Ni", 29: "Cu", - 30: "Zn", 31: "Ga", 32: "Ge", 33: "As", 34: "Se", 35: "Br", 36: "Kr", 37: "Rb", 38: "Sr", 39: "Y", - 40: "Zr", 41: "Nb", 42: "Mo", 43: "Tc", 44: "Ru", 45: "Rh", 46: "Pd", 47: "Ag", 48: "Cd", 49: "In", - 50: "Sn", 51: "Sb", 52: "Te", 53: "I", 54: "Xe", 55: "Cs", 56: "Ba", 57: "La", 58: "Ce", 59: "Pr", - 60: "Nd", 61: "Pm", 62: "Sm", 63: "Eu", 64: "Gd", 65: "Tb", 66: "Dy", 67: "Ho", 68: "Er", 69: "Tm", - 70: "Yb", 71: "Lu", 72: "Hf", 73: "Ta", 74: "W", 75: "Re", 76: "Os", 77: "Ir", 78: "Pt", 79: "Au", - 80: "Hg", 81: "Tl", 82: 
"Pb", 83: "Bi", 84: "Po", 85: "At", 86: "Rn", 87: "Fr", 88: "Ra", 89: "Ac", - 90: "Th", 91: "Pa", 92: "U", 93: "Np", 94: "Pu", 95: "Am", 96: "Cm", 97: "Bk", 98: "Cf", 99: "Es", - 100: "Fm", 101: "Md", 102: "No", 103: "Lr", 104: "Rf", 105: "Db", 106: "Sg", 107: "Bh", 108: "Hs", 109: "Mt", - 110: "Ds", 111: "Rg", 112: "Cn", 113: "Ed", 114: "Fl", 115: "Ef", 116: "Lv", 117: "Ts", 118: "Og" - } - # fmt: on - - # Switch the keys and values of the z_to_symbol dictionary so a user can access the Z value using the symbol - SYMBOL_TO_Z: dict[str, int] = {val: key for key, val in Z_TO_SYMBOL.items()} - - def __init__(self, **kwargs) -> None: - # We are using multiple inheritance, so need this for MRO - super().__init__(**kwargs) - - @staticmethod - def get_symbol(z: int) -> str | None: - """ - Get the symbol representing ``z``. - - This is a nicely named, very thin wrapper around the inbuilt dictionary get. - - Parameters - ---------- - z : int - The Z value to get the symbol for. - - Returns - ------- - str or None - The string representing the ``z` value or None if the ``z` value is invalid. - """ - return Converter.Z_TO_SYMBOL.get(z, None) - - @staticmethod - def get_z(symbol: str) -> int | None: - """ - Get the z (proton number) representing ``symbol``. - - This is a nicely named, very thin wrapper around the inbuilt dictionary. - - Parameters - ---------- - symbol : str - The elemental symbol to get the Z for. - - Returns - ------- - int or None - The Z value representing ``symbol`` or None if ``symbol`` is invalid. - """ - return Converter.SYMBOL_TO_Z.get(symbol, None) - - @staticmethod - def normalise_symbol(symbol: str) -> str: - """ - Validate format of ``symbol`` to allow simpler conversions. - - Element symbols always have a capital first letter and lower case second, if it exists. We store all symbols - like this so want any user input to be of this format. In typesetting, this is known as title case so we can - leverage that conversion function. 
- - No checking is done on the validity of the symbol. - - Parameters - ---------- - symbol : str - The elemental symbol to validate. - - Returns - ------- - str - The elemental symbol with the correct casing. - """ - return symbol.strip().title() - - @staticmethod - def unit_to_seconds(unit_str: str) -> float | None: - """Convert a time unit to a scale factor in seconds. - - Parameters - ---------- - unit_str : str - The time unit to convert into seconds. - - Returns - ------- - float or None - The time unit represented in seconds or None if the unit does not represent time. - - Examples - -------- - >>> from nuclearmasses.utils.converter import Converter - >>> Converter.unit_to_seconds("s") - 1.0 - >>> Converter.unit_to_seconds("min") - 60.0 - >>> Converter.unit_to_seconds("keV") - >>> Converter.unit_to_seconds(2) - >>> - """ - if pd.isna(unit_str) or not isinstance(unit_str, str): - return None - - # Remove white space and make lower case to be consistent - cleaned_unit = unit_str.strip().lower() - if not cleaned_unit: - return None - - return Converter.UNIT_TO_SECONDS.get(cleaned_unit, None) - - @staticmethod - def read_fwf(base: DataInput, **kwargs): - """ - Overloaded version of :meth:`pandas.read_fwf` that accepts additional types. - - The use of importlib.resource means we have types that the pandas version of read_fwf does not accept. - It can still be used but some work is required. This function does that work, as well as some other checking - to make sure we can pass the necessary types into our parser classes. - - Parameters - ---------- - base : DataInput - The file-like object to read. - - Returns - ------- - pandas.DataFrame - The file-like object parsed into a pandas dataframe. 
- """ - # A file like object - if hasattr(base, "read"): - return pd.read_fwf(base, **kwargs) # type: ignore[arg-type] - - # importlib.resource Traversable - if isinstance(base, Traversable): - with importlib.resources.as_file(base) as the_file: - return pd.read_fwf(the_file, **kwargs) - - # Filesystem path - return pd.read_fwf(base, **kwargs) - - @staticmethod - def strip_char_from_string_columns(df: pd.DataFrame, char: str) -> pd.DataFrame: - """ - Remove ``char`` from columns that are of known string type - - Helper method to optimise the removal of the ``char`` character from columns in ``df`` that are of string type. - This function is specific to this module; we know, after parsing a file with :meth:'pandas.read_fwf`, columns - that contain purely floats will be of type float, but if a value on one line is e.g. 1234.56# the column will - be of type string (or object). As we wish to remove the '#' character, we can use this detail to only apply the - removal algorithm to those columns of type string and save some unnecessary processing. - - Parameters - ---------- - df : pandas.DataFrame - The dataframe we are removing the character from. - char : str - The character we want to remove. - - Returns - ------- - pandas.DataFrame - The original dataframe with all instance of ``char`` removed from string type columns. - """ - cols = df.select_dtypes(include=["object", "string"]).columns - df[cols] = df[cols].apply(lambda s: s.str.replace(char, "", regex=False)) - return df - - @staticmethod - def calculate_relative_error(raw_df: pd.DataFrame, source: str) -> pd.DataFrame: - """ - Calculate the relative error of the mass excess. - - 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN - value in the column for 12C so we will manually correct and set to 0.0. - - Parameters - ---------- - raw_df : pandas.DataFrame - The raw dataframe upon which we will act. 
- source : str - Which table's data are we working with - - Returns - ------- - pandas.DataFrame - The updated dataframe with a new relative mass excess column. - """ - raw_df[f"{source}RelativeError"] = abs( - raw_df[f"{source}MassExcessError"].astype(float) / raw_df[f"{source}MassExcess"].astype(float) - ) - raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), f"{source}RelativeError"] = 0.0 - - return raw_df diff --git a/src/nuclearmasses/utils/dataframe_utils.py b/src/nuclearmasses/utils/dataframe_utils.py new file mode 100644 index 0000000..44ed249 --- /dev/null +++ b/src/nuclearmasses/utils/dataframe_utils.py @@ -0,0 +1,96 @@ +""" +The module dataframe_utils contains functionality to extend that available via pandas or apply a common transformation +to a dataframe that is used throughout the repository. +""" + +import importlib +from importlib.resources.abc import Traversable + +import pandas as pd + +from nuclearmasses.utils.type_defs import DataInput + + +def read_fwf(base: DataInput, **kwargs) -> pd.DataFrame: + """ + Overloaded version of :meth:`pandas.read_fwf` that accepts additional types. + + The use of importlib.resources means we have types that the pandas version of read_fwf does not accept. + It can still be used but some work is required. This function does that work, as well as some other checking + to make sure we can pass the necessary types into our parser classes. + + Parameters + ---------- + base : DataInput + The file-like object to read. + + Returns + ------- + pandas.DataFrame + The file-like object parsed into a pandas dataframe.
+ """ + # A file-like object + if hasattr(base, "read"): + return pd.read_fwf(base, **kwargs) # type: ignore[arg-type] + + # importlib.resources Traversable + if isinstance(base, Traversable): + with importlib.resources.as_file(base) as the_file: + return pd.read_fwf(the_file, **kwargs) + + # Filesystem path + return pd.read_fwf(base, **kwargs) + + +def strip_char_from_string_columns(df: pd.DataFrame, char: str) -> pd.DataFrame: + """ + Remove ``char`` from columns that are of known string type. + + Helper function to optimise the removal of the ``char`` character from columns in ``df`` that are of string type. + This function is specific to this module; we know, after parsing a file with :meth:`pandas.read_fwf`, columns + that contain purely floats will be of type float, but if a value on one line is e.g. 1234.56# the column will + be of type string (or object). As we wish to remove the '#' character, we can use this detail to only apply the + removal algorithm to those columns of type string and save some unnecessary processing. + + Parameters + ---------- + df : pandas.DataFrame + The dataframe we are removing the character from. + char : str + The character we want to remove. + + Returns + ------- + pandas.DataFrame + The original dataframe with all instances of ``char`` removed from string type columns. + """ + cols = df.select_dtypes(include=["object", "string"]).columns + df[cols] = df[cols].apply(lambda s: s.str.replace(char, "", regex=False)) + return df + + +def calculate_relative_error(raw_df: pd.DataFrame, source: str) -> pd.DataFrame: + """ + Calculate the relative error of the mass excess. + + 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN + value in the column for 12C so we will manually correct and set to 0.0. + + Parameters + ---------- + raw_df : pandas.DataFrame + The raw dataframe upon which we will act.
+ source : str + Which table's data are we working with + + Returns + ------- + pandas.DataFrame + The updated dataframe with a new relative mass excess column. + """ + raw_df[f"{source}RelativeError"] = abs( + raw_df[f"{source}MassExcessError"].astype(float) / raw_df[f"{source}MassExcess"].astype(float) + ) + raw_df.loc[(raw_df.Z == 6) & (raw_df.A == 12), f"{source}RelativeError"] = 0.0 + + return raw_df diff --git a/src/nuclearmasses/utils/periodic.py b/src/nuclearmasses/utils/periodic.py new file mode 100644 index 0000000..f5569d4 --- /dev/null +++ b/src/nuclearmasses/utils/periodic.py @@ -0,0 +1,85 @@ +""" +The module periodic contains functionality to convert between elemental symbol and atomic number. +""" + +# fmt: off +# Formatter wants to put each item on its own line, I don't +Z_TO_SYMBOL: dict[int, str] = { + 0: "n", 1: "H", 2: "He", 3: "Li", 4: "Be", 5: "B", 6: "C", 7: "N", 8: "O", 9: "F", + 10: "Ne", 11: "Na", 12: "Mg", 13: "Al", 14: "Si", 15: "P", 16: "S", 17: "Cl", 18: "Ar", 19: "K", + 20: "Ca", 21: "Sc", 22: "Ti", 23: "V", 24: "Cr", 25: "Mn", 26: "Fe", 27: "Co", 28: "Ni", 29: "Cu", + 30: "Zn", 31: "Ga", 32: "Ge", 33: "As", 34: "Se", 35: "Br", 36: "Kr", 37: "Rb", 38: "Sr", 39: "Y", + 40: "Zr", 41: "Nb", 42: "Mo", 43: "Tc", 44: "Ru", 45: "Rh", 46: "Pd", 47: "Ag", 48: "Cd", 49: "In", + 50: "Sn", 51: "Sb", 52: "Te", 53: "I", 54: "Xe", 55: "Cs", 56: "Ba", 57: "La", 58: "Ce", 59: "Pr", + 60: "Nd", 61: "Pm", 62: "Sm", 63: "Eu", 64: "Gd", 65: "Tb", 66: "Dy", 67: "Ho", 68: "Er", 69: "Tm", + 70: "Yb", 71: "Lu", 72: "Hf", 73: "Ta", 74: "W", 75: "Re", 76: "Os", 77: "Ir", 78: "Pt", 79: "Au", + 80: "Hg", 81: "Tl", 82: "Pb", 83: "Bi", 84: "Po", 85: "At", 86: "Rn", 87: "Fr", 88: "Ra", 89: "Ac", + 90: "Th", 91: "Pa", 92: "U", 93: "Np", 94: "Pu", 95: "Am", 96: "Cm", 97: "Bk", 98: "Cf", 99: "Es", + 100: "Fm", 101: "Md", 102: "No", 103: "Lr", 104: "Rf", 105: "Db", 106: "Sg", 107: "Bh", 108: "Hs", 109: "Mt", + 110: "Ds", 111: "Rg", 112: "Cn", 113: "Nh", 114: "Fl", 115:
"Mc", 116: "Lv", 117: "Ts", 118: "Og" +} +# fmt: on + +# Invert Z_TO_SYMBOL for symbol -> Z lookups; the former placeholder symbols Ed (113) and Ef (115) remain valid aliases +SYMBOL_TO_Z: dict[str, int] = {val: key for key, val in Z_TO_SYMBOL.items()} | {"Ed": 113, "Ef": 115} + + +def get_symbol(z: int) -> str | None: + """ + Get the symbol representing ``z``. + + This is a nicely named, very thin wrapper around the inbuilt dictionary get. + + Parameters + ---------- + z : int + The Z value to get the symbol for. + + Returns + ------- + str or None + The string representing the ``z`` value or None if the ``z`` value is invalid. + """ + return Z_TO_SYMBOL.get(z, None) + + +def get_z(symbol: str) -> int | None: + """ + Get the z (proton number) representing ``symbol``. + + This is a nicely named, very thin wrapper around the inbuilt dictionary. + + Parameters + ---------- + symbol : str + The elemental symbol to get the Z for. + + Returns + ------- + int or None + The Z value representing ``symbol`` or None if ``symbol`` is invalid. + """ + return SYMBOL_TO_Z.get(symbol, None) + + +def normalise_symbol(symbol: str) -> str: + """ + Validate format of ``symbol`` to allow simpler conversions. + + Element symbols always have a capital first letter and lower case second, if it exists. We store all symbols + like this so want any user input to be of this format. In typesetting, this is known as title case so we can + leverage that conversion function. + + No checking is done on the validity of the symbol. + + Parameters + ---------- + symbol : str + The elemental symbol to validate. + + Returns + ------- + str + The elemental symbol with the correct casing. + """ + return symbol.strip().title() diff --git a/src/nuclearmasses/utils/type_defs.py b/src/nuclearmasses/utils/type_defs.py new file mode 100644 index 0000000..cc85d63 --- /dev/null +++ b/src/nuclearmasses/utils/type_defs.py @@ -0,0 +1,10 @@ +""" +The module type_defs contains additional type definitions used by the main repository.
+""" + +from importlib.resources.abc import Traversable +import os +import typing + +# Typing hint Union for the different ways a file or data can be represented +DataInput: typing.TypeAlias = Traversable | os.PathLike[str] | str | typing.TextIO diff --git a/src/nuclearmasses/utils/units.py b/src/nuclearmasses/utils/units.py new file mode 100644 index 0000000..c610e69 --- /dev/null +++ b/src/nuclearmasses/utils/units.py @@ -0,0 +1,62 @@ +""" +The module units contains functionality to convert from human readable strings into SI units. For example the time units +mins, hr or Gyr would be converted into seconds. +""" + +UNIT_TO_SECONDS: dict[str, float] = { + "ys": 1e-24, + "zs": 1e-21, + "as": 1e-18, + "ps": 1e-12, + "ns": 1e-9, + "us": 1e-6, + "ms": 1e-3, + "s": 1.0, + "min": 60.0, + "h": 3600.0, + "d": 86400.0, + "yr": 31_557_600.0, # 365.25 days + "kyr": 3.15576e10, + "tyr": 3.15576e12, + "myr": 3.15576e13, + "pyr": 3.15576e15, + "gyr": 3.15576e16, + "eyr": 3.15576e18, + "zyr": 3.15576e21, + "yyr": 3.15576e24, +} + + +def unit_to_seconds(unit_str: str) -> float | None: + """Convert a time unit to a scale factor in seconds. + + Parameters + ---------- + unit_str : str + The time unit to convert into seconds. + + Returns + ------- + float or None + The time unit represented in seconds or None if the unit does not represent time. 
+ + Examples + -------- + >>> from nuclearmasses.utils.converter import Converter + >>> Converter.unit_to_seconds("s") + 1.0 + >>> Converter.unit_to_seconds("min") + 60.0 + >>> Converter.unit_to_seconds("keV") + >>> Converter.unit_to_seconds(2) + >>> + """ + if not isinstance(unit_str, str): + return None + + # Remove white space and make lower case to be consistent + cleaned_unit = unit_str.strip().lower() + if not cleaned_unit: + return None + + return UNIT_TO_SECONDS.get(cleaned_unit, None) From 8865c6e56449a9c719855ac7834721acea025fb2 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 21:05:29 +0100 Subject: [PATCH 10/12] Update and organise tests after Converter class split --- tests/{ => io}/test_ame.py | 0 tests/{ => io}/test_ame_mass_parse.py | 0 tests/{ => io}/test_ame_reaction_1_parse.py | 0 tests/{ => io}/test_ame_reaction_2_parse.py | 0 tests/{ => io}/test_nubase.py | 0 tests/{ => io}/test_nubase_parse.py | 0 tests/test_converter.py | 45 ---------------- tests/utils/test_dataframe_utils.py | 57 +++++++++++++++++++++ tests/utils/test_periodic.py | 25 +++++++++ tests/utils/test_units.py | 17 ++++++ 10 files changed, 99 insertions(+), 45 deletions(-) rename tests/{ => io}/test_ame.py (100%) rename tests/{ => io}/test_ame_mass_parse.py (100%) rename tests/{ => io}/test_ame_reaction_1_parse.py (100%) rename tests/{ => io}/test_ame_reaction_2_parse.py (100%) rename tests/{ => io}/test_nubase.py (100%) rename tests/{ => io}/test_nubase_parse.py (100%) delete mode 100644 tests/test_converter.py create mode 100644 tests/utils/test_dataframe_utils.py create mode 100644 tests/utils/test_periodic.py create mode 100644 tests/utils/test_units.py diff --git a/tests/test_ame.py b/tests/io/test_ame.py similarity index 100% rename from tests/test_ame.py rename to tests/io/test_ame.py diff --git a/tests/test_ame_mass_parse.py b/tests/io/test_ame_mass_parse.py similarity index 100% rename from tests/test_ame_mass_parse.py rename to 
tests/io/test_ame_mass_parse.py diff --git a/tests/test_ame_reaction_1_parse.py b/tests/io/test_ame_reaction_1_parse.py similarity index 100% rename from tests/test_ame_reaction_1_parse.py rename to tests/io/test_ame_reaction_1_parse.py diff --git a/tests/test_ame_reaction_2_parse.py b/tests/io/test_ame_reaction_2_parse.py similarity index 100% rename from tests/test_ame_reaction_2_parse.py rename to tests/io/test_ame_reaction_2_parse.py diff --git a/tests/test_nubase.py b/tests/io/test_nubase.py similarity index 100% rename from tests/test_nubase.py rename to tests/io/test_nubase.py diff --git a/tests/test_nubase_parse.py b/tests/io/test_nubase_parse.py similarity index 100% rename from tests/test_nubase_parse.py rename to tests/io/test_nubase_parse.py diff --git a/tests/test_converter.py b/tests/test_converter.py deleted file mode 100644 index 8201f84..0000000 --- a/tests/test_converter.py +++ /dev/null @@ -1,45 +0,0 @@ -import pytest - -from nuclearmasses.utils.converter import Converter - - -@pytest.fixture -def converter(): - return Converter() - - -def test_z_to_symbol(converter): - assert converter.get_symbol(0) == "n" - assert converter.get_symbol(6) == "C" - assert converter.get_symbol(104) == "Rf" - - -def test_symbol_to_z(converter): - assert converter.get_z("Al") == 13 - assert converter.get_z("Fe") == 26 - assert converter.get_z("Po") == 84 - - -def test_normalise_symbol(converter): - # These inputs shouldn't change - assert converter.normalise_symbol("H") == "H" - assert converter.normalise_symbol("Os") == "Os" - - # These inputs should change - assert converter.normalise_symbol("h") == "H" - assert converter.normalise_symbol("mg") == "Mg" - assert converter.normalise_symbol("RN") == "Rn" - - -def test_units_to_seconds(converter): - assert converter.unit_to_seconds("ms") == 1.0e-3 - assert converter.unit_to_seconds("s") == 1.0 - assert converter.unit_to_seconds("min") == 60.0 - assert converter.unit_to_seconds("h") == 3600.0 - assert 
def test_12C_relative_error():
    """A row with A=12, Z=6 must come back with a relative error of exactly 0.0, even with a non-zero input error."""
    carbon = pd.DataFrame(
        {
            "A": [12],
            "Z": [6],
            "NUBASEMassExcess": [-12345.6],
            "NUBASEMassExcessError": [1.2],
        }
    )

    result = calculate_relative_error(carbon, "NUBASE")

    assert result["NUBASERelativeError"][0] == 0.0


def test_relative_error():
    """For any other row the relative error is the error divided by the value."""
    mass_excess = 100000.0
    mass_excess_error = 10.0
    tin = pd.DataFrame(
        {
            "A": [123],
            "Z": [50],
            "NUBASEMassExcess": [mass_excess],
            "NUBASEMassExcessError": [mass_excess_error],
        }
    )

    result = calculate_relative_error(tin, "NUBASE")

    assert result["NUBASERelativeError"][0] == mass_excess_error / mass_excess


def test_remove_hash_from_column():
    """A trailing '#' is stripped; a column without the character is left alone."""
    raw = pd.DataFrame({"X": ["Random#"], "Y": ["Clean"]})

    cleaned = strip_char_from_string_columns(raw, "#")

    assert cleaned["X"][0] == "Random"
    assert cleaned["Y"][0] == "Clean"


def test_remove_decimal_from_column():
    """A '.' in the middle of a string is removed; it must be treated literally, not as a regex wildcard."""
    raw = pd.DataFrame({"X": ["Random.String"], "Y": ["Clean"]})

    cleaned = strip_char_from_string_columns(raw, ".")

    assert cleaned["X"][0] == "RandomString"
    assert cleaned["Y"][0] == "Clean"
def test_units_to_seconds():
    """Each known time unit maps onto its exact scale factor in seconds."""
    expected = {
        "ms": 1.0e-3,
        "s": 1.0,
        "min": 60.0,
        "h": 3600.0,
        "d": 86400.0,
        "yr": 31557600.0,
    }

    for unit, seconds in expected.items():
        assert unit_to_seconds(unit) == seconds


@pytest.mark.parametrize("unit", [5, "m", "Hz", "", " "])
def test_nontime_unit_return_nan(unit):
    """Non-string, unknown and blank input all give None (the 'nan' in the test name is historical)."""
    converted = unit_to_seconds(unit)

    assert converted is None
df["NUBASERelativeError"][0] == 10.0 / 100000.0 def test_remove_hash_from_column(): diff --git a/tests/utils/test_periodic.py b/tests/utils/test_periodic.py index 4b592e9..4d455d5 100644 --- a/tests/utils/test_periodic.py +++ b/tests/utils/test_periodic.py @@ -22,4 +22,3 @@ def test_normalise_symbol(): assert normalise_symbol("h") == "H" assert normalise_symbol("mg") == "Mg" assert normalise_symbol("RN") == "Rn" - From d495383e654c3134d060b19556635d9f5a9b8602 Mon Sep 17 00:00:00 2001 From: Ian Cullen Date: Sat, 2 May 2026 21:50:14 +0100 Subject: [PATCH 12/12] Update CHANGELOG with refactor details --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f99bbd..36094e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Changelog +## [0.4.0] - 2026-05-?? +- [#27](https://github.com/php1ic/nuclearmasses/pull/27) + * Refactor Converter class in utils into separate modules + * Update how the column locations of the parameters are set and accessed + ## [0.3.0] - 2026-04-28 - [#26](https://github.com/php1ic/nuclearmasses/pull/26) * Optimise initial parsing of the files.