update pandas to 2.2.3 and other minor fixes

zmx721 · zmx721 · commit 23c7c3f8a87c · 2025-06-05T20:09:52.000+08:00
diff --git a/environment.yaml b/environment.yaml
@@ -5,7 +5,7 @@ channels:
 dependencies:
     - python>=3.11,<3.13
     - altair-all=5.5.*
-    - pandas=1.5.*
+    - pandas=2.2.*
     - scipy
     - matplotlib
     - jupyter
diff --git a/modules/module2/module2-14-column_arithmetic_questions.qmd b/modules/module2/module2-14-column_arithmetic_questions.qmd
@@ -38,11 +38,11 @@ df[['Column_A']] * df[['Column_B']]
         'Question 2',
         'What is the correct syntax to multiply <code>Column_A</code> and <code>Column_B</code> from dataframe <code>df</code> and save it as a new column named <code>new_column</code>?',
         {
-        '<code>df = df.assign(’new_column’=df[’Column_A’] * df[’Column_B’])</code></code>': 'Do you need to put your new column name in between quotations?',
-        '<code>df = df.assign(new_column=df[’Column_A’] * df[’Column_B’])</code>': 'You must have been paying attention.',
-        '<code>df = df.assign[new_column=df(’Column_A’) * df(’Column_B’)]</code>': 'Are you sure that you are using the correct parentheses for this?',
+        '<code>df = df.assign(\'new_column\'=df[\'Column_A\'] * df[\'Column_B\'])</code>': 'Do you need to put your new column name in between quotations?',
+        '<code>df = df.assign(new_column=df[\'Column_A\'] * df[\'Column_B\'])</code>': 'You must have been paying attention.',
+        '<code>df = df.assign[new_column=df(\'Column_A\') * df(\'Column_B\')]</code>': 'Are you sure that you are using the correct parentheses for this?',
         },
-        '<code>df = df.assign(new_column=df[’Column_A’] * df[’Column_B’])</code>',
+        '<code>df = df.assign(new_column=df[\'Column_A\'] * df[\'Column_B\'])</code>',
     );
 </script>
 
diff --git a/modules/module2/module2-17-filtering_question.qmd b/modules/module2/module2-17-filtering_question.qmd
@@ -19,7 +19,7 @@ df['location'] == 'Canada'
 is 
  
 ```out
-[ True, False, False, True]
+[True, False, False, True]
 ```
 
 <br>
@@ -218,7 +218,7 @@ mighty_pokemon
     generateQuiz(
         'mcq3',
         'Question',
-        'Which type has the most Pokemon with attack and defense scores greater than 100? <i>(Hint: Think about how we counted the frequency of categorical columns in module 1)</i>',
+        'Which type has the most Pokemon with attack and defense scores greater than 100? <i>(Hint: Think about how we counted the frequency of categorical columns in module 1.)</i>',
         {
         'Rock and Bug': 'Well done!',
         'Water and Rock': 'You can use <code>mighty_pokemon[\'type\'].value_counts()</code> to find out.',
diff --git a/modules/module2/module2-30-plotting_a_groupby_object.qmd b/modules/module2/module2-30-plotting_a_groupby_object.qmd
@@ -32,9 +32,9 @@ Create a plot by chaining the following actions.
 import pandas as pd
 import altair as alt
 
-pokemon = pd.read_csv('data/pokemon.csv').drop("name", axis=1)
+pokemon = pd.read_csv('data/pokemon.csv')
 
-____ = pd.DataFrame(____.____('____').____().____[:, '____'])
+____ = pd.DataFrame(____.____('____').____(numeric_only=True).____[:, '____'])
 
 ____ = ____.____()
 
@@ -52,8 +52,8 @@ ____ = ____.____()
 #| check: true
 from src.utils import assert_chart_equal, remove_keys_inplace
 
-pokemon = pd.read_csv('data/pokemon.csv').drop("name", axis=1)
-pokemon_type = pd.DataFrame(pokemon.groupby('type').mean().loc[:, 'attack']).reset_index()
+pokemon = pd.read_csv('data/pokemon.csv')
+pokemon_type = pd.DataFrame(pokemon.groupby('type').mean(numeric_only=True).loc[:, 'attack']).reset_index()
 solution = alt.Chart(pokemon_type, width=500,
                         height=300).mark_bar().encode(x=alt.X('type:N', sort='-y',
         title='Pokemon type'), y=alt.Y('attack:Q',
@@ -89,9 +89,9 @@ assert_chart_equal(solution, result)
 import pandas as pd
 import altair as alt
 
-pokemon = pd.read_csv('data/pokemon.csv').drop("name", axis=1)
+pokemon = pd.read_csv('data/pokemon.csv')
 
-pokemon_type = pd.DataFrame(pokemon.groupby('type').mean().loc[:, 'attack'])
+pokemon_type = pd.DataFrame(pokemon.groupby('type').mean(numeric_only=True).loc[:, 'attack'])
 
 pokemon_type = pokemon_type.reset_index()
 
diff --git a/modules/module2/slides/module2_25.qmd b/modules/module2/slides/module2_25.qmd
@@ -37,15 +37,15 @@ We found in Module 1 using `.value_counts()` that there are 7 different manufact
 Let's start with "K":
 
 ```{python}
-cereal[cereal['mfr'] == 'K'].mean()[['sugars']]
+cereal[cereal['mfr'] == 'K'].mean(numeric_only=True)[['sugars']]
 ```
 
 <br>
 
 Next "G":
 
 ```{python}
-cereal[cereal['mfr'] == 'G'].mean()[['sugars']]
+cereal[cereal['mfr'] == 'G'].mean(numeric_only=True)[['sugars']]
 ```
 
 
@@ -154,11 +154,12 @@ Similarly to how we made frequency tables using `.value_counts()`, we can now us
 ## Summary Statistics with Groups
 
 ```{python}
-# | inlcude: false
+# | include: false
 pd.set_option('display.max_rows', 4)
 ```
 
 ```{python}
+mfr_group = cereal.drop(columns=["name", "type"]).groupby(by='mfr')
 mfr_group.mean()
 ```
 
@@ -190,18 +191,18 @@ Of course, using groups is not limited to finding only the mean. We can do the s
 ## Aggregating dataframes
 
 ```{python}
-# | inlcude: false
+# | include: false
 pd.set_option('display.max_rows', 6)
 ```
 
 ```{python}
-cereal.agg('mean')
+cereal.select_dtypes(include=np.number).agg('mean')
 ```
 
 <br>
 
 ```{python}
-cereal.mean()
+cereal.mean(numeric_only=True)
 ```
 
 
@@ -216,7 +217,7 @@ Using `.agg()` with only a `mean` input is essentially the same thing as calling
 ---
 
 ```{python}
-cereal.agg(['max', 'min', 'median'])
+cereal.select_dtypes(include=np.number).agg(['max', 'min', 'median'])
 ```
 
 
diff --git a/modules/module2/slides/module2_29.qmd b/modules/module2/slides/module2_29.qmd
@@ -312,7 +312,7 @@ This is a big help for the clarity of our analysis.
 ---
 
 ```{python}
-mfr_mean = cereal.groupby(by='mfr').mean()
+mfr_mean = cereal.groupby(by='mfr').mean(numeric_only=True)
 mfr_mean
 ```