diff --git a/src/nuclearmasses/io/ame.py b/src/nuclearmasses/io/ame.py index 30afdf0..4a3422a 100644 --- a/src/nuclearmasses/io/ame.py +++ b/src/nuclearmasses/io/ame.py @@ -1,3 +1,7 @@ +""" +The ame module defines the ``AME`` class to work on and store details related to the AME data. +""" + from importlib.resources.abc import Traversable import pandas as pd @@ -8,7 +12,31 @@ class AME: - """Top level storage and functionality for AME data""" + """ + Container class to store details related to the years an AME table was published. + + This is a high level class that tracks details, but delegates the parsing of the files. + + Parameters + ---------- + data_path : Traversable + Absolute path to the data files location. Not an actual file, rather the top level directory that contains from + where we access the year then individual file + + Attributes + ---------- + data_path : Traversable + Absolute path to the data files location. Not an actual file, rather the top level directory that contains from + where we access the year then individual file. + years : list[int] + An ordered list of the years in which an AME table was published. + ame_files : list[tuple[str, str, str]] + The filenames of each AME data file in year order. + files : dict[int, tuple[str, str, str]] + A dictionary mapping published year to filenames. + ame_df : pandas.DataFrame + A dataframe containing the AME data from all published years. + """ def __init__(self, data_path: Traversable): self.data_path = data_path @@ -26,14 +54,38 @@ def __init__(self, data_path: Traversable): self.ame_df: pd.DataFrame = self.parse_all_years() def get_datafiles(self, year: int) -> tuple[Traversable, Traversable, Traversable]: - """Use the given year to locate the 3 AME data file and return the absolute paths.""" + """ + Construct the absolute paths to the files for the given ``year``. + + Parameters + ---------- + year : int + The published year to get the files for. 
+ + Returns + ------- + tuple[Traversable,Traversable,Traversable] + The absolute paths to the three AME data files. + """ root = self.data_path / str(year) mass, rct1, rct2 = self.files[year] return root / mass, root / rct1, root / rct2 def parse_year(self, year: int) -> pd.DataFrame: - """Combine all the AME files from the given ``year``""" + """ + Parse the data from the given ``year``. + + Parameters + ---------- + year : int + The published year to get the data for. + + Returns + ------- + pandas.DataFrame + The data from ``year`` as a dataframe + """ ame_mass, ame_reaction_1, ame_reaction_2 = self.get_datafiles(year) mass_df = AMEMassParser(filename=ame_mass, year=year).read_file() @@ -45,5 +97,12 @@ def parse_year(self, year: int) -> pd.DataFrame: return mass_df.merge(rct1_df, on=common_columns, how="outer").merge(rct2_df, on=common_columns, how="outer") def parse_all_years(self) -> pd.DataFrame: - """Parse the files for all available years""" + """ + Parse the files for all available years. + + Returns + ------- + pandas.DataFrame + The data from all published years as a single dataframe. + """ return pd.concat((self.parse_year(y) for y in self.years), ignore_index=True) diff --git a/src/nuclearmasses/io/ame_mass_file.py b/src/nuclearmasses/io/ame_mass_file.py index 0e9b7fc..550f0e2 100644 --- a/src/nuclearmasses/io/ame_mass_file.py +++ b/src/nuclearmasses/io/ame_mass_file.py @@ -1,5 +1,37 @@ +""" +The ame_mass_file module defines the ``AMEMassFile`` class. This class stores the column positions of the start and +finish location of the different parameters recorded in the AME mass data file. The positions have changed between +years so the year of the table is given as a parameter at construction. +""" + + class AMEMassFile: - """Easy access to the variables in the AME mass file.""" + """ + Storage class for the data in the AME mass data file. + + The AME mass data file is fixed-width file format so we will store the format details in this class. 
+ + Note we have not listed all parameters in the attributes section as there are so many. The naming convention is + however shown, along with a description. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + HEADER : int + The number of lines in the file to be interpreted as the header. + FOOTER : int + The number of lines in the file to be interpreted as the footer. + START_X : int + The first column of parameter X. + END_X : int or None + The last column of parameter X or None to represent the end of the line. + column_limits : list[tuple[int, int]] + The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + """ def __init__(self, year: int, **kwargs): super().__init__(**kwargs) diff --git a/src/nuclearmasses/io/ame_mass_parse.py b/src/nuclearmasses/io/ame_mass_parse.py index ec64dbe..98ae737 100644 --- a/src/nuclearmasses/io/ame_mass_parse.py +++ b/src/nuclearmasses/io/ame_mass_parse.py @@ -1,3 +1,9 @@ +""" +The ame_mass_parse module defines the ``AMEMassParser`` class. This class contains the logic required to sort and +organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and +inconsistencies are cleaned from the resultant dataframe. +""" + import pandas as pd from nuclearmasses.io.ame_mass_file import AMEMassFile @@ -5,21 +11,41 @@ class AMEMassParser(AMEMassFile, Converter): - """Parse the AME mass file. - - The format is known but the provided string does not match all lines. - We will therefore use START and END markers, which are inherited, and - read the columns are interested in. + """ + Parse the AME mass file, doing the necessary preparation and clean ups of data. + + There are some quirks to the format used in the file. It's based on fixed-width format, but deviates in various + places so additional work is required once the file is parsed. 
+ + Parameters + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. + + Attributes + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. """ def __init__(self, filename: DataInput, year: int): - """Set the file to read and table year""" super().__init__(year=year) self.filename: DataInput = filename self.year: int = year def _column_names(self) -> list[str]: - """Set the column name depending on the year""" + """ + Set the column name depending on the year. + + Returns + ------- + list[str] + An ordered list of the columns that exist in the file. + """ return [ "Z", "A", @@ -35,7 +61,14 @@ def _column_names(self) -> list[str]: ] def _data_types(self) -> dict: - """Set the data type depending on the year""" + """ + Set the column data types depending on the year. + + Returns + ------- + dict[str, str] + A dictionary of the columns that exist and their data type + """ return { "TableYear": "Int64", "Symbol": "string", @@ -54,7 +87,14 @@ def _data_types(self) -> dict: } def _na_values(self) -> dict: - """Set the columns that have placeholder values""" + """ + Set the columns that have empty fields that should be NaN'd depending on the year. + + Returns + ------- + dict[str, list[str]] + A dictionary of the columns that will have values that should be interpreted as NaN. + """ na_vals = { "A": [""], "BetaDecayEnergy": ["", "*"], @@ -67,9 +107,21 @@ def _na_values(self) -> dict: return na_vals def calculate_relative_error(self, raw_df) -> pd.DataFrame: - """Calculate the relative error of the mass excess + """ + Calculate the relative error of the mass excess. - 12C has a 0.0 +/- 0.0 mass excess definition by definition so ensure that is still true. + 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN + value in the column for 12C so we will manually correct and set to 0.0. 
+ + Parameters + ---------- + raw_df : pandas.DataFrame + The raw dataframe upon which we will act. + + Returns + ------- + pandas.DataFrame + The updated dataframe with a new ``AMERelativeError`` column. """ raw_df["AMERelativeError"] = abs( raw_df["AMEMassExcessError"].astype(float) / raw_df["AMEMassExcess"].astype(float) @@ -79,11 +131,16 @@ def calculate_relative_error(self, raw_df) -> pd.DataFrame: return raw_df def read_file(self) -> pd.DataFrame: - """Read the file using it's known format + """ + Read the file-like object ``self.filename`` into a dataframe + + The ``AMEMassFile`` and other functions in this class have hopefully sanitized the column names, data types and + locations of the data so we can now make the generic call to parse the file. - The AMEMassFile and other functions in this class have hopefully sanitized the - column names, data types and locations of the date so we can now make the generic - call to parse the file. + Returns + ------- + pandas.DataFrame + A dataframe containing the parsed and organised contents of the AME mass data file """ df = Converter.read_fwf( self.filename, diff --git a/src/nuclearmasses/io/ame_reaction_1_file.py b/src/nuclearmasses/io/ame_reaction_1_file.py index 9a0de75..c2c16d7 100644 --- a/src/nuclearmasses/io/ame_reaction_1_file.py +++ b/src/nuclearmasses/io/ame_reaction_1_file.py @@ -1,8 +1,39 @@ +""" +The ame_reaction_1_file module defines the ``AMEReactionFileOne`` class. This class stores the column positions of the +start and finish location of the different parameters recorded in the AME reaction 1 data file. The positions have +changed between years so the year of the table is given as a parameter at construction. +""" + + class AMEReactionFileOne: - """Easy access to the variables in the first AME reaction file.""" + """ + Storage class for the data in the AME reaction 1 data file. + + The AME reaction 1 data file is fixed-width file format so we will store the format details in this class. 
+ + Note we have not listed all parameters in the attributes section as there are so many. The naming convention is + however shown, along with a description. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + HEADER : int + The number of lines in the file to be interpreted as the header. + FOOTER : int + The number of lines in the file to be interpreted as the footer. + START_X : int + The first column of parameter X. + END_X : int or None + The last column of parameter X or None to represent the end of the line. + column_limits : list[tuple[int, int]] + The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. + """ def __init__(self, year: int, **kwargs): - """Setup the values that locate the variable.""" super().__init__(**kwargs) match year: case 1983: diff --git a/src/nuclearmasses/io/ame_reaction_1_parse.py b/src/nuclearmasses/io/ame_reaction_1_parse.py index c113684..91b07cb 100644 --- a/src/nuclearmasses/io/ame_reaction_1_parse.py +++ b/src/nuclearmasses/io/ame_reaction_1_parse.py @@ -1,3 +1,9 @@ +""" +The ame_reaction_1_parse module defines the ``AMEReactionParserOne`` class. This class contains the logic required to +sort and organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and +inconsistencies are cleaned from the resultant dataframe. +""" + import pandas as pd from nuclearmasses.io.ame_reaction_1_file import AMEReactionFileOne @@ -5,19 +11,41 @@ class AMEReactionParserOne(AMEReactionFileOne, Converter): - """Parse the first AME reaction file. + """ + Parse the first AME reaction file, doing the necessary preparation and clean ups of data. + + There are some quirks to the format used in the file. It's based on fixed-width format, but deviates in various + places so additional work is required once the file is parsed. 
+ + Parameters + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. - The format is known but I don't think python can easily parse it. + Attributes + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. """ def __init__(self, filename: DataInput, year: int): - """Set the file to read and table year.""" super().__init__(year=year) self.filename: DataInput = filename self.year = year def _column_names(self) -> list[str]: - """Set the column name depending on the year""" + """ + Set the column name depending on the year. + + Returns + ------- + list[str] + An ordered list of the columns that exist in the file. + """ return [ "A", "Z", @@ -36,7 +64,14 @@ def _column_names(self) -> list[str]: ] def _data_types(self) -> dict: - """Set the data type depending on the year""" + """ + Set the column data types depending on the year. + + Returns + ------- + dict[str, str] + A dictionary of the columns that exist and their data type + """ return { "TableYear": "Int64", "Symbol": "string", @@ -59,7 +94,14 @@ def _data_types(self) -> dict: } def _na_values(self) -> dict: - """Set the columns that have placeholder values""" + """ + Set the columns that have empty fields that should be NaN'd depending on the year. + + Returns + ------- + dict[str, list[str]] + A dictionary of the columns that will have values that should be interpreted as NaN. + """ return { "A": [""], "TwoNeutronSeparationEnergy": ["", "*"], @@ -77,11 +119,16 @@ def _na_values(self) -> dict: } def read_file(self) -> pd.DataFrame: - """Read the file using it's known format + """ + Read the file-like object ``self.filename`` into a dataframe + + The ``AMEReactionFileOne`` and other functions in this class have hopefully sanitized the column names, data + types and locations of the data so we can now make the generic call to parse the file. 
- The AMEReactionFileOne and other functions in this class have hopefully sanitized the - column names, data types and locations of the date so we can now make the generic - call to parse the file. + Returns + ------- + pandas.DataFrame + A dataframe containing the parsed and organised contents of the first AME reaction data file """ df = Converter.read_fwf( self.filename, diff --git a/src/nuclearmasses/io/ame_reaction_2_file.py b/src/nuclearmasses/io/ame_reaction_2_file.py index 8aa5263..c3158a3 100644 --- a/src/nuclearmasses/io/ame_reaction_2_file.py +++ b/src/nuclearmasses/io/ame_reaction_2_file.py @@ -1,5 +1,37 @@ +""" +The ame_reaction_2_file module defines the ``AMEReactionFileTwo`` class. This class stores the column positions of the +start and finish location of the different parameters recorded in the AME reaction 2 data file. The positions have +changed between years so the year of the table is given as a parameter at construction. +""" + + class AMEReactionFileTwo: - """Easy access to the variables in the second AME reaction file.""" + """ + Storage class for the data in the AME reaction 2 data file. + + The AME reaction 2 data file is fixed-width file format so we will store the format details in this class. + + Note we have not listed all parameters in the attributes section as there are so many. The naming convention is + however shown, along with a description. + + Parameters + ---------- + year : int + The year the file being parsed was published + + Attributes + ---------- + HEADER : int + The number of lines in the file to be interpreted as the header. + FOOTER : int + The number of lines in the file to be interpreted as the footer. + START_X : int + The first column of parameter X. + END_X : int or None + The last column of parameter X or None to represent the end of the line. + column_limits : list[tuple[int, int]] + The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. 
+ """ def __init__(self, year: int, **kwargs): super().__init__(**kwargs) diff --git a/src/nuclearmasses/io/ame_reaction_2_parse.py b/src/nuclearmasses/io/ame_reaction_2_parse.py index 8e25d64..11bd258 100644 --- a/src/nuclearmasses/io/ame_reaction_2_parse.py +++ b/src/nuclearmasses/io/ame_reaction_2_parse.py @@ -1,3 +1,9 @@ +""" +The ame_reaction_2_parse module defines the ``AMEReactionParserTwo`` class. This class contains the logic required to +sort and organise the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and +inconsistencies are cleaned from the resultant dataframe. +""" + import pandas as pd from nuclearmasses.io.ame_reaction_2_file import AMEReactionFileTwo @@ -5,19 +11,41 @@ class AMEReactionParserTwo(AMEReactionFileTwo, Converter): - """Parse the second AME reaction file. + """ + Parse the second AME reaction file, doing the necessary preparation and clean ups of data. + + There are some quirks to the format used in the file. It's based on fixed-width format, but deviates in various + places so additional work is required once the file is parsed. + + Parameters + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. - The format is known but I don't think python can easily parse it. + Attributes + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. """ def __init__(self, filename: DataInput, year: int): - """Set the file to read and table year.""" super().__init__(year=year) self.filename: DataInput = filename self.year = year def _column_names(self) -> list[str]: - """Set the column name depending on the year""" + """ + Set the column name depending on the year. + + Returns + ------- + list[str] + An ordered list of the columns that exist in the file. 
+ """ return [ "A", "Z", @@ -36,7 +64,14 @@ def _column_names(self) -> list[str]: ] def _data_types(self) -> dict: - """Set the data type depending on the year""" + """ + Set the column data types depending on the year. + + Returns + ------- + dict[str, str] + A dictionary of the columns that exist and their data type + """ return { "TableYear": "Int64", "Symbol": "string", @@ -59,7 +94,14 @@ def _data_types(self) -> dict: } def _na_values(self) -> dict: - """Set the columns that have placeholder values""" + """ + Set the columns that have empty fields that should be NaN'd depending on the year. + + Returns + ------- + dict[str, list[str]] + A dictionary of the columns that will have values that should be interpreted as NaN. + """ return { "A": [""], "OneNeutronSeparationEnergy": ["", "*"], @@ -77,11 +119,16 @@ def _na_values(self) -> dict: } def read_file(self) -> pd.DataFrame: - """Read the file using it's known format + """ + Read the file-like object ``self.filename`` into a dataframe + + The ``AMEReactionFileTwo`` and other functions in this class have hopefully sanitized the column names, data + types and locations of the data so we can now make the generic call to parse the file. - The AMEReactionFileTwo and other functions in this class have hopefully sanitized the - column names, data types and locations of the date so we can now make the generic - call to parse the file. + Returns + ------- + pandas.DataFrame + A dataframe containing the parsed and organised contents of the second AME reaction data file """ df = Converter.read_fwf( self.filename, diff --git a/src/nuclearmasses/io/nubase.py b/src/nuclearmasses/io/nubase.py index 98d8e00..cd0054f 100644 --- a/src/nuclearmasses/io/nubase.py +++ b/src/nuclearmasses/io/nubase.py @@ -1,3 +1,7 @@ +""" +The nubase module defines the ``NUBASE`` class to work on and store details related to the NUBASE data. 
+""" + from importlib.resources.abc import Traversable import pandas as pd @@ -6,7 +10,31 @@ class NUBASE: - """Top level storage and functionality for NUBASE data""" + """ + Container class to store details related to the years a NUBASE table was published. + + This is a high level class that tracks details, but delegates the parsing of the files. + + Parameters + ---------- + data_path : Traversable + Absolute path to the data files location. Not an actual file, rather the top level directory that contains from + where we access the year then individual file + + Attributes + ---------- + data_path : Traversable + Absolute path to the data files location. Not an actual file, rather the top level directory that contains from + where we access the year then individual file. + years : list[ints] + An ordered list of the years in which a NUBASE table was published. + nubase_files : list[str] + The filenames of the NUBASE data files in year order. + files : dict[int, str] + A dictionary mapping published year to filename. + nubase_df : pandas.DataFrame + A dataframe containing the NUBASE data from all published years. + """ def __init__(self, data_path: Traversable): self.data_path = data_path @@ -22,13 +50,44 @@ def __init__(self, data_path: Traversable): self.nubase_df: pd.DataFrame = self.parse_all_files() def get_datafile(self, year: int) -> Traversable: - """Use the given ``year`` to locate the NUBASE mass table file and return the absolute path.""" + """ + Construct the absolute path to the mass table file for the given ``year``. + + Parameters + ---------- + year : int + The published year to get the file for. + + Returns + ------- + Traversable + The absolute path to the data file. + """ return self.data_path / str(year) / self.files[year] def parse_year(self, year: int) -> pd.DataFrame: - """Parse the file of the given ``year``""" + """ + Parse the data of the given ``year``. + + Parameters + ---------- + year : int + The published year to get the data for. 
+ + Returns + ------- + pandas.DataFrame + The data from ``year`` as a dataframe + """ return NUBASEParser(filename=self.get_datafile(year), year=year).read_file() def parse_all_files(self) -> pd.DataFrame: - """Parse the files for all available years""" + """ + Parse the files for all available years. + + Returns + ------- + pandas.DataFrame + The data from all published years as a single dataframe. + """ return pd.concat((self.parse_year(y) for y in self.years), ignore_index=True) diff --git a/src/nuclearmasses/io/nubase_file.py b/src/nuclearmasses/io/nubase_file.py index 157a3e9..dd3abf4 100644 --- a/src/nuclearmasses/io/nubase_file.py +++ b/src/nuclearmasses/io/nubase_file.py @@ -1,14 +1,39 @@ +""" +The nubase_file module defines the ``NUBASEFile`` class. This class stores the column positions of the start and finish +location of the different parameters recorded in the NUBASE data file. The positions have changed between years so the +year of the table is given as a parameter at construction. +""" + + class NUBASEFile: - """Easy access to the variables in the NUBASE file. + """ + Storage class for the data in the NUBASE data file. + + The NUBASE data file is fixed-width file format so we will store the format details in this class. + + Note we have not listed all parameters in the attributes section as there are so many. The naming convention is + however shown, along with a description. + + Parameters + ---------- + year : int + The year the file being parsed was published - The NUBASE data file is formatted by location in the line, values exist - between 2 specific columns in the line. Store the start and end locations - in this class to allow simple access and stop the NUBASE parser having - magic numbers. + Attributes + ---------- + HEADER : int + The number of lines in the file to be interpreted as the header. + FOOTER : int + The number of lines in the file to be interpreted as the footer. + START_X : int + The first column of parameter X. 
+ END_X : int or None + The last column of parameter X or None to represent the end of the line. + column_limits : list[tuple[int, int]] + The start and end positions of all parameters as a list of tuples that can be passed to :meth:`pandas.read_fwf`. """ def __init__(self, year: int, **kwargs): - """Setup the values that locate the variable.""" super().__init__(**kwargs) match year: case 1995: diff --git a/src/nuclearmasses/io/nubase_parse.py b/src/nuclearmasses/io/nubase_parse.py index bf653a8..f3181c3 100644 --- a/src/nuclearmasses/io/nubase_parse.py +++ b/src/nuclearmasses/io/nubase_parse.py @@ -1,3 +1,9 @@ +""" +The nubase_parse module defines the ``NUBASEParser`` class. This class contains the logic required to sort and organise +the inputs to :meth:`pandas.read_fwf` dependent on the year of the file. Once parsed, known typos and inconsistencies +are cleaned from the resultant dataframe. +""" + import typing import pandas as pd @@ -7,13 +13,30 @@ class NUBASEParser(NUBASEFile, Converter): - """Parse the NUBASE data file. - - A collection of functions to parse the weird format of the NUBASE file. + """ + Parse the NUBASE file, doing the necessary preparations and clean up of data. + + There are some quirks to the format used in the file. It's based on fixed-width format, but deviates in various + places so additional work is required once the file is parsed. + + Parameters + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. + + Attributes + ---------- + filename : DataInput + The file-like object to parse. + year : int + The published year of the data file. + unit_replacements : dict[str, str] + A dictionary used to tidy up time units from NUBASE format to one the module recognises. 
""" def __init__(self, filename: DataInput, year: int): - """Set the file to read and the table year.""" super().__init__(year=year) self.filename: DataInput = filename self.year: int = year @@ -23,7 +46,14 @@ def __init__(self, filename: DataInput, year: int): } def _column_names(self) -> list[str]: - """Set the column name depending on the year""" + """ + Set the column name depending on the year. + + Returns + ------- + list[str] + An ordered list of the columns that exist in the file. + """ col_names = [ "A", "Z", @@ -48,7 +78,14 @@ def _column_names(self) -> list[str]: return col_names def _data_types(self) -> dict: - """Set the data type depending on the year""" + """ + Set the column data types depending on the year. + + Returns + ------- + dict[str, str] + A dictionary of the columns that exist and their data type + """ data_types = { "Symbol": "string", "A": "Int64", @@ -79,7 +116,14 @@ def _data_types(self) -> dict: return data_types def _na_values(self) -> dict: - """Set the columns that have placeholder values""" + """ + Set the columns that have empty fields that should be NaN'd depending on the year. + + Returns + ------- + dict[str, list[str]] + A dictionary of the columns that will have values that should be interpreted as NaN. + """ na_values = { "State": [""], "NUBASEMassExcess": [""], @@ -99,10 +143,21 @@ def _na_values(self) -> dict: return na_values def parse_half_life(self, raw_df) -> pd.DataFrame: - """Create half-life columns with SI units + """ + Create additional half-life columns with values in seconds - The half-life is stored as a human readable value, e.g. 2ms, 4Gyr, 5mins, this is fine to read but not to do + The half-life is stored as a human readable value, e.g. 2ms, 4Gyr, 5mins. This is fine to read but not to do any type of sorting or algorithm. Convert to the SI unit of seconds, but don't overwrite original columns. 
+ + Parameters + ---------- + raw_df : pandas.DataFrame + The dataframe to use to convert raw half-life records into values in seconds. + + Returns + ------- + pandas.DataFrame + The updated dataframe with new columns containing half-life values in seconds. """ # Convert stable isotopes into ones with enormous lifetimes with zero error so we can cast raw_df["HalfLifeValue"] = raw_df["HalfLifeValue"].astype("object") @@ -139,9 +194,21 @@ def parse_half_life(self, raw_df) -> pd.DataFrame: return raw_df def parse_state(self, raw_df) -> pd.DataFrame: - """Interpret the state of the isotope + """ + Interpret the state of the isotope + + Currently we are only interested in ground states so drop any other row that is not that. + In the future we will care about isomers. + + Parameters + ---------- + raw_df : pandas.DataFrame + The raw dataframe with all states of isotopes in. - Currently we are only interested in ground states, but in the future we will care about isomers. + Returns + ------- + pandas.DataFrame + The updated dataframe containing only ground state data """ # Ignore anything this is not the ground state raw_df = raw_df[raw_df["State"] == 0] @@ -151,9 +218,21 @@ def parse_state(self, raw_df) -> pd.DataFrame: return raw_df def calculate_relative_error(self, raw_df) -> pd.DataFrame: - """Calculate the relative error of the mass excess + """ + Calculate the relative error of the mass excess. + + 12C has a 0.0 +/- 0.0 mass excess by definition, so relative error is 0.0. The division by zero will put a NaN + value in the column for 12C so we will manually correct and set to 0.0. + + Parameters + ---------- + raw_df : pandas.DataFrame + The raw dataframe upon which we will act. - 12C has a 0.0 +/- 0.0 mass excess definition by definition so ensure that is still true. + Returns + ------- + pandas.DataFrame + The updated dataframe with a new relative mass excess column. 
""" raw_df["NUBASERelativeError"] = abs( raw_df["NUBASEMassExcessError"].astype(float) / raw_df["NUBASEMassExcess"].astype(float) @@ -163,11 +242,16 @@ def calculate_relative_error(self, raw_df) -> pd.DataFrame: return raw_df def read_file(self) -> pd.DataFrame: - """Read the file using it's known format + """ + Read the file-like object ``self.filename`` into a dataframe + + The ``NUBASEFile`` and other functions in this class have hopefully sanitized the column names, data types and + locations of the date so we can now make the generic call to parse the file. - The NUBASEFile and other functions in this class have hopefully sanitized the - column names, data types and locations of the date so we can now make the generic - call to parse the file. + Returns + ------- + pandas.DataFrame + A dataframe containing the parsed and organised contents of the NUBASE data file """ df = Converter.read_fwf( self.filename, diff --git a/src/nuclearmasses/mass_table.py b/src/nuclearmasses/mass_table.py index 745c13a..ff0e349 100644 --- a/src/nuclearmasses/mass_table.py +++ b/src/nuclearmasses/mass_table.py @@ -1,3 +1,9 @@ +""" +The mass_table module defines the ``MassTable`` class that is used to store all the data published by the AME and +NUBASE papers. Once an instance of the class is instantiated, the ``data`` attribute can be used to access the +complete mass table as a pandas Dataframe. +""" + from difflib import get_close_matches import importlib.resources import io @@ -12,15 +18,34 @@ class MassTable: - """Class for all of the mass data. + """ + Container class for the complete mass table. - Internally there are separate dataframes for the NUBASE and AME data as well as a combined one for all data + Any ``MassTable`` instance parses all data files on construction, and has its own copy of the mass table dataframe. + The dataframe is accessed via the ``data`` attribute, but functionality that manipulates the mass table is generally + done on the class instance level. 
+ + Attributes + ---------- + data : pandas.DataFrame + The parsed mass table and any additional user data. """ def __init__(self) -> None: self._complete_df: pd.DataFrame = self._parse_files() def _parse_files(self) -> pd.DataFrame: + """ + Parse all the published data files and merge into a single dataframe. + + The merge is carried out on values unique to an isotope, and the published year, to remove duplicated columns. + No indexing or slicing is done, so the dataframe is in a relatively raw form. + + Returns + ------- + pandas.DataFrame + The complete mass table as a pandas dataframe. + """ data_path = importlib.resources.files("nuclearmasses").joinpath("data") common_columns = ["A", "Z", "N", "TableYear", "Symbol", "DataSource"] @@ -33,7 +58,25 @@ def add_user_data( source: int = 1, common_values: dict[str, typing.Any] | None = None, ) -> None: - """Merge user data into the mass table""" + """ + Add user data into the published mass table. + + Read json formatted ``data`` for isotope identification and values then add it to the existing mass table using + ``source`` to differentiate it from published values. If not present in ``data``, the dictionary + ``common_values`` can be used to set a single value for a property on all isotopes added. + + The ``data`` is added via :meth:`pandas.concat` to create new entries for each isotope, rather than overwriting. + It is not merged in via :meth:`pandas.merge` so any values not provided are set to NaN. + + Parameters + ---------- + data : str | pathlib.Path | typing.IO + The data, in json format, that will be added to the existing dataframe. + source : int, default 1 + The value used to identify where this data has originated. + common_values : dict[str, typing.Any] | None + Additional values, not provided in ``data`` but common to all entries. 
+ """ # We are going to force at least 3 columns in the user data # Two in the input file: A and Z to uniquely identify the isotope # One via code: DataSource to differentiate from the original table data @@ -48,7 +91,7 @@ def add_user_data( else: data = io.StringIO(data) - # Read the file, should be valid json so nice and simple + # Read the data, should be valid json so nice and simple user_df: pd.DataFrame = pd.read_json(data, dtype={"A": int, "Z": int}) # Add any additional data that is constant for the user data, e.g. TableYear @@ -100,5 +143,14 @@ def add_user_data( @property def data(self) -> pd.DataFrame: - """Access the complete mass table dataframe""" + """ + Return the dataframe containing the complete mass table. + + Data from all available years and both AME and NUBASE sources is combined and collated into a single dataframe. + + Returns + ------- + pandas.DataFrame + The complete mass table as a pandas dataframe. + """ return self._complete_df diff --git a/src/nuclearmasses/utils/converter.py b/src/nuclearmasses/utils/converter.py index f5d2300..bcfb2a7 100644 --- a/src/nuclearmasses/utils/converter.py +++ b/src/nuclearmasses/utils/converter.py @@ -1,3 +1,9 @@ +""" +The converter module defines the ``Converter`` class that is used to store lookup dictionaries to allow simple and fast +conversions between scientific units and seconds, and element symbol and Z value. The dictionaries are defined on the +class level so any instance should share a single copy. +""" + import importlib from importlib.resources.abc import Traversable import os @@ -10,10 +16,13 @@ class Converter: - """A utility class for converting between symbol and Z value + """ + Utility class to convert between various physical properties. + + All methods are static so it is not necessary to create an instance of the class. - This class provides bidirectional lookup functionality via two dictionaries one mapping Z to symbol, - and the other symbol to Z. 
+ Internal dictionaries allow bidirectional conversion between element symbol and Z, as well as the conversion of a + time unit in SI format into the equivalent number of seconds (e.g. min -> 60.0). """ UNIT_TO_SECONDS: dict[str, float] = { @@ -61,37 +70,67 @@ class Converter: SYMBOL_TO_Z: dict[str, int] = {val: key for key, val in Z_TO_SYMBOL.items()} def __init__(self, **kwargs) -> None: - """Construct the symbol -> Z and Z -> symbol dictionaries.""" # We are using multiple inheritance, so need this for MRO super().__init__(**kwargs) @staticmethod def get_symbol(z: int) -> str | None: - """Get the symbol representing + """ + Get the symbol representing ``z``. - This is a nicely named, very thin wrapper around the inbuilt dictionary. - I'm sure I was going to do something else in this function beyond basic accessing, but don't recall. - Leave as is and hopefully I'll remember + This is a nicely named, very thin wrapper around the inbuilt dictionary get. + + Parameters + ---------- + z : int + The Z value to get the symbol for. + + Returns + ------- + str or None + The string representing the ``z`` value or None if the ``z`` value is invalid. """ return Converter.Z_TO_SYMBOL.get(z, None) @staticmethod def get_z(symbol: str) -> int | None: - """Get the z (proton number) representing + """ + Get the z (proton number) representing ``symbol``. This is a nicely named, very thin wrapper around the inbuilt dictionary. + + Parameters + ---------- + symbol : str + The elemental symbol to get the Z for. + + Returns + ------- + int or None + The Z value representing ``symbol`` or None if ``symbol`` is invalid. """ return Converter.SYMBOL_TO_Z.get(symbol, None) @staticmethod def normalise_symbol(symbol: str) -> str: - """Validate format of to allow simpler conversions + """ + Validate format of ``symbol`` to allow simpler conversions. Element symbols always have a capital first letter and lower case second, if it exists.
We store all symbols like this so want any user input to be of this format. In typesetting, this is known as title case so we can leverage that conversion function. No checking is done on the validity of the symbol. + + Parameters + ---------- + symbol : str + The elemental symbol to validate. + + Returns + ------- + str + The elemental symbol with the correct casing. """ return symbol.strip().title() @@ -99,13 +138,26 @@ def normalise_symbol(symbol: str) -> str: def unit_to_seconds(unit_str: str) -> float | None: """Convert a time unit to a scale factor in seconds. - Returns np.nan for non-valid time units. - - e.g. - "s" -> 1.0, - "min" -> 60.0, - "keV" -> np.nan - 2 -> np.nan + Parameters + ---------- + unit_str : str + The time unit to convert into seconds. + + Returns + ------- + float or None + The time unit represented in seconds or None if the unit does not represent time. + + Examples + -------- + >>> from nuclearmasses.utils.converter import Converter + >>> Converter.unit_to_seconds("s") + 1.0 + >>> Converter.unit_to_seconds("min") + 60.0 + >>> Converter.unit_to_seconds("keV") + >>> Converter.unit_to_seconds(2) + >>> """ if pd.isna(unit_str) or not isinstance(unit_str, str): return None @@ -119,11 +171,22 @@ def unit_to_seconds(unit_str: str) -> float | None: @staticmethod def read_fwf(base: DataInput, **kwargs): - """Overloaded version of pandas.read_fwf() that accepts more types + """ + Overloaded version of :meth:`pandas.read_fwf` that accepts additional types. - Our use of importlib.resource means we have types that the pandas version of read_fwf does not accept. + The use of importlib.resources means we have types that the pandas version of read_fwf does not accept. It can still be used but some work is required. This function does that work, as well as some other checking to make sure we can pass the necessary types into our parser classes. + + Parameters + ---------- + base : DataInput + The file-like object to read.
+ + Returns + ------- + pandas.DataFrame + The file-like object parsed into a pandas dataframe. """ # A file like object if hasattr(base, "read"): @@ -139,6 +202,27 @@ def read_fwf(base: DataInput, **kwargs): @staticmethod def strip_char_from_string_columns(df: pd.DataFrame, char: str) -> pd.DataFrame: + """ + Remove ``char`` from columns that are of known string type + + Helper method to optimise the removal of the ``char`` character from columns in ``df`` that are of string type. + This function is specific to this module; we know, after parsing a file with :meth:`pandas.read_fwf`, columns + that contain purely floats will be of type float, but if a value on one line is e.g. 1234.56# the column will + be of type string (or object). As we wish to remove the '#' character, we can use this detail to only apply the + removal algorithm to those columns of type string and save some unnecessary processing. + + Parameters + ---------- + df : pandas.DataFrame + The dataframe we are removing the character from. + char : str + The character we want to remove. + + Returns + ------- + pandas.DataFrame + The original dataframe with all instances of ``char`` removed from string type columns. + """ cols = df.select_dtypes(include=["object", "string"]).columns df[cols] = df[cols].apply(lambda s: s.str.replace(char, "", regex=False)) return df