Optimise frame-wide character replacement

php1ic · php1ic · commit 780f90ee4440 · 2026-04-20T22:40:18.000+01:00
A large fraction of the time in the initial read was spent removing
any and all instances of the '#' character. After parsing with read_fwf,
only string and object columns can contain this character, so we can
restrict the replacement to those columns and use a plain (non-regex)
string replacement, saving time.
diff --git a/src/nuclearmasses/io/ame_mass_parse.py b/src/nuclearmasses/io/ame_mass_parse.py
@@ -97,7 +97,8 @@ def read_file(self) -> pd.DataFrame:
         )
         # We use the NUBASE data to define whether or not an isotope is experimentally measured,
         # so for this data we'll just drop any and all '#' characters
-        df = df.replace("#", "", regex=True)
+        str_cols = df.select_dtypes(include=["object", "string"]).columns
+        df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
 
         if self.year == 1983:
             # The column headers and units are repeated in the 1983 table
diff --git a/src/nuclearmasses/io/ame_reaction_1_parse.py b/src/nuclearmasses/io/ame_reaction_1_parse.py
@@ -95,7 +95,8 @@ def read_file(self) -> pd.DataFrame:
         )
         # We use the NUBASE data to define whether or not an isotope is experimentally measured,
         # so for this data we'll just drop any and all '#' characters
-        df = df.replace("#", "", regex=True)
+        str_cols = df.select_dtypes(include=["object", "string"]).columns
+        df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
 
         if self.year == 1983:
             # The column headers and units are repeated in the 1983 table
diff --git a/src/nuclearmasses/io/ame_reaction_2_parse.py b/src/nuclearmasses/io/ame_reaction_2_parse.py
@@ -95,7 +95,8 @@ def read_file(self) -> pd.DataFrame:
         )
         # We use the NUBASE data to define whether or not an isotope is experimentally measured,
         # so for this data we'll just drop any and all '#' characters
-        df = df.replace("#", "", regex=True)
+        str_cols = df.select_dtypes(include=["object", "string"]).columns
+        df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
 
         if self.year == 1983:
             # The column headers and units are repeated in the 1983 table
diff --git a/src/nuclearmasses/io/nubase_parse.py b/src/nuclearmasses/io/nubase_parse.py
@@ -185,7 +185,8 @@ def read_file(self) -> pd.DataFrame:
         # We use the NUBASE data to define whether or not an isotope is experimentally measured,
         df["Experimental"] = ~df["NUBASEMassExcess"].astype("string").str.contains("#", na=False)
         # Once we have used the '#' to determine if it's experimental or not, we can remove all instances of it
-        df = df.replace("#", "", regex=True)
+        str_cols = df.select_dtypes(include=["object", "string"]).columns
+        df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))
 
         df = self.parse_half_life(df)
         df = self.calculate_relative_error(df)

Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,8 @@ def read_file(self) -> pd.DataFrame:`
`97`	`97`	`)`
`98`	`98`	`# We use the NUBASE data to define whether or not an isotope is experimentally measured,`
`99`	`99`	`# so for this data we'll just drop any and all '#' characters`
`100`		`- df = df.replace("#", "", regex=True)`
	`100`	`+ str_cols = df.select_dtypes(include=["object", "string"]).columns`
	`101`	`+ df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))`
`101`	`102`
`102`	`103`	`if self.year == 1983:`
`103`	`104`	`# The column headers and units are repeated in the 1983 table`
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,8 @@ def read_file(self) -> pd.DataFrame:`
`95`	`95`	`)`
`96`	`96`	`# We use the NUBASE data to define whether or not an isotope is experimentally measured,`
`97`	`97`	`# so for this data we'll just drop any and all '#' characters`
`98`		`- df = df.replace("#", "", regex=True)`
	`98`	`+ str_cols = df.select_dtypes(include=["object", "string"]).columns`
	`99`	`+ df[str_cols] = df[str_cols].astype(str).apply(lambda s: s.str.replace("#", "", regex=False))`
`99`	`100`
`100`	`101`	`if self.year == 1983:`
`101`	`102`	`# The column headers and units are repeated in the 1983 table`