Skip to content

Commit 780f90e

Browse files
committed
Optimise frame-wide character replacement
A large fraction of the time in the initial read was spent removing every instance of the '#' character. After parsing with read_fwf, only string and object columns can still contain this character, so we can restrict the replacement to those columns and use a literal (non-regex) replacement, saving time.
1 parent f761a56 commit 780f90e

4 files changed

Lines changed: 8 additions & 4 deletions

File tree

src/nuclearmasses/io/ame_mass_parse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ def read_file(self) -> pd.DataFrame:
9797
)
9898
# We use the NUBASE data to define whether or not an isotope is experimentally measured,
9999
# so for this data we'll just drop any and all '#' characters
100-
df = df.replace("#", "", regex=True)
100+
str_cols = df.select_dtypes(include=["object", "string"]).columns
101+
df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
101102

102103
if self.year == 1983:
103104
# The column headers and units are repeated in the 1983 table

src/nuclearmasses/io/ame_reaction_1_parse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ def read_file(self) -> pd.DataFrame:
9595
)
9696
# We use the NUBASE data to define whether or not an isotope is experimentally measured,
9797
# so for this data we'll just drop any and all '#' characters
98-
df = df.replace("#", "", regex=True)
98+
str_cols = df.select_dtypes(include=["object", "string"]).columns
99+
df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
99100

100101
if self.year == 1983:
101102
# The column headers and units are repeated in the 1983 table

src/nuclearmasses/io/ame_reaction_2_parse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ def read_file(self) -> pd.DataFrame:
9595
)
9696
# We use the NUBASE data to define whether or not an isotope is experimentally measured,
9797
# so for this data we'll just drop any and all '#' characters
98-
df = df.replace("#", "", regex=True)
98+
str_cols = df.select_dtypes(include=["object", "string"]).columns
99+
df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
99100

100101
if self.year == 1983:
101102
# The column headers and units are repeated in the 1983 table

src/nuclearmasses/io/nubase_parse.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,8 @@ def read_file(self) -> pd.DataFrame:
185185
# We use the NUBASE data to define whether or not an isotope is experimentally measured.
186186
df["Experimental"] = ~df["NUBASEMassExcess"].astype("string").str.contains("#", na=False)
187187
# Once we have used the '#' to determine if it's experimental or not, we can remove all instances of it
188-
df = df.replace("#", "", regex=True)
188+
str_cols = df.select_dtypes(include=["object", "string"]).columns
189+
df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
189190

190191
df = self.parse_half_life(df)
191192
df = self.calculate_relative_error(df)

0 commit comments

Comments
 (0)