diff --git a/notebooks/02_04b.ipynb b/notebooks/02_04b.ipynb index 97411ce..65c41a2 100644 --- a/notebooks/02_04b.ipynb +++ b/notebooks/02_04b.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "5be0cfbf-e779-42b3-8bd6-f3dd46888ebb", "metadata": {}, "outputs": [], @@ -28,6 +28,915 @@ "source": [ "### Filling missing values using fillna(), replace() and interpolate()" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7a57182f", + "metadata": {}, + "outputs": [], + "source": [ + "data = {'name': ['steve','john','richard','sarah','randy'\n", + " ,'micheal','julie']\n", + " ,'age': [20,21,33,23,42,38,22] \n", + " ,'gender':['male','male','male','female','male'\n", + " ,'male','female']\n", + " ,'rank':[2,1,3,5,4,7,6] \n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d162fd07", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_df = DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8523dea0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20male2
1john21male1
2richard33male3
3sarah23female5
4randy42male4
5micheal38male7
6julie22female6
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20 male 2\n", + "1 john 21 male 1\n", + "2 richard 33 male 3\n", + "3 sarah 23 female 5\n", + "4 randy 42 male 4\n", + "5 micheal 38 male 7\n", + "6 julie 22 female 6" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3044327f", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_df.iloc[2:5,1] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d1a1cc35", + "metadata": {}, + "outputs": [], + "source": [ + "ranking_df.iloc[3:6,3] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "26e68549", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20.0male2.0
1john21.0male1.0
2richardNaNmale3.0
3sarahNaNfemaleNaN
4randyNaNmaleNaN
5micheal38.0maleNaN
6julie22.0female6.0
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20.0 male 2.0\n", + "1 john 21.0 male 1.0\n", + "2 richard NaN male 3.0\n", + "3 sarah NaN female NaN\n", + "4 randy NaN male NaN\n", + "5 micheal 38.0 male NaN\n", + "6 julie 22.0 female 6.0" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6f22e0fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "name 0\n", + "age 3\n", + "gender 0\n", + "rank 3\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fe95be53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "name 7\n", + "age 4\n", + "gender 7\n", + "rank 4\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df.notnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "997479d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
2richardNaNmale3.0
3sarahNaNfemaleNaN
4randyNaNmaleNaN
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "2 richard NaN male 3.0\n", + "3 sarah NaN female NaN\n", + "4 randy NaN male NaN" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bool_series = pd.isnull(ranking_df['age'])\n", + "ranking_df[bool_series]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0ed33a94", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20.0male2.0
1john21.0male1.0
2richardNaNmale3.0
3sarahNaNfemaleNaN
4randyNaNmaleNaN
5micheal38.0maleNaN
6julie22.0female6.0
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20.0 male 2.0\n", + "1 john 21.0 male 1.0\n", + "2 richard NaN male 3.0\n", + "3 sarah NaN female NaN\n", + "4 randy NaN male NaN\n", + "5 micheal 38.0 male NaN\n", + "6 julie 22.0 female 6.0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "25909be7", + "metadata": {}, + "outputs": [], + "source": [ + "test = ranking_df.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a37e370a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20.0male2.0
1john21.0male1.0
2richard38.0male3.0
3sarah38.0female6.0
4randy38.0male6.0
5micheal38.0male6.0
6julie22.0female6.0
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20.0 male 2.0\n", + "1 john 21.0 male 1.0\n", + "2 richard 38.0 male 3.0\n", + "3 sarah 38.0 female 6.0\n", + "4 randy 38.0 male 6.0\n", + "5 micheal 38.0 male 6.0\n", + "6 julie 22.0 female 6.0" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = ranking_df.bfill()\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e53a3117", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20.0male2.0
1john21.0male1.0
2richard21.0male3.0
3sarah21.0female3.0
4randy21.0male3.0
5micheal38.0male3.0
6julie22.0female6.0
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20.0 male 2.0\n", + "1 john 21.0 male 1.0\n", + "2 richard 21.0 male 3.0\n", + "3 sarah 21.0 female 3.0\n", + "4 randy 21.0 male 3.0\n", + "5 micheal 38.0 male 3.0\n", + "6 julie 22.0 female 6.0" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test = ranking_df.ffill()\n", + "test" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "a5eafedb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20.0male2.0
1john21.0male1.0
6julie22.0female6.0
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20.0 male 2.0\n", + "1 john 21.0 male 1.0\n", + "6 julie 22.0 female 6.0" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a82b2b07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
nameagegenderrank
0steve20.0male2.0
1john21.0male1.0
2richardNaNmale3.0
3sarahNaNfemaleNaN
4randyNaNmaleNaN
5micheal38.0maleNaN
6julie22.0female6.0
\n", + "
" + ], + "text/plain": [ + " name age gender rank\n", + "0 steve 20.0 male 2.0\n", + "1 john 21.0 male 1.0\n", + "2 richard NaN male 3.0\n", + "3 sarah NaN female NaN\n", + "4 randy NaN male NaN\n", + "5 micheal 38.0 male NaN\n", + "6 julie 22.0 female 6.0" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranking_df.dropna(how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a809d4c", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -46,7 +955,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/02_04e.ipynb b/notebooks/02_04e.ipynb index 66aa5ef..ee800ab 100644 --- a/notebooks/02_04e.ipynb +++ b/notebooks/02_04e.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "5be0cfbf-e779-42b3-8bd6-f3dd46888ebb", "metadata": {}, "outputs": [], @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -126,7 +126,7 @@ "6 julie 22.0 Female 6.0" ] }, - "execution_count": 4, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -146,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -241,7 +241,7 @@ "6 False False False False" ] }, - "execution_count": 5, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -252,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -347,7 +347,7 @@ "6 True True True True" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -358,7 +358,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -421,7 +421,7 @@ "4 randy NaN Male NaN" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -433,7 +433,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -528,7 +528,7 @@ "6 julie 22.0 Female 6.0" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -539,17 +539,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_29111/2647120121.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n", - " ranking_df.fillna(method='pad')\n" - ] - }, { "data": { "text/html": [ @@ -595,30 +587,30 @@ " \n", " 2\n", " richard\n", - " 22.0\n", + " 23.0\n", " Male\n", " 4.0\n", " \n", " \n", " 3\n", - " richard\n", - " 22.0\n", + " randy\n", + " 23.0\n", " Male\n", - " 4.0\n", + " 6.0\n", " \n", " \n", " 4\n", " randy\n", - " 22.0\n", + " 23.0\n", " Male\n", - " 4.0\n", + " 6.0\n", " \n", " \n", " 5\n", " micheal\n", " 23.0\n", " Male\n", - " 4.0\n", + " 6.0\n", " \n", " \n", " 6\n", @@ -635,35 +627,27 @@ " names age gender rank\n", "0 steve 20.0 Male 2.0\n", "1 john 22.0 Male 1.0\n", - "2 richard 22.0 Male 4.0\n", - "3 richard 22.0 Male 4.0\n", - "4 randy 22.0 Male 4.0\n", - "5 micheal 23.0 Male 4.0\n", + "2 richard 23.0 Male 4.0\n", + "3 randy 23.0 Male 6.0\n", + "4 randy 23.0 Male 6.0\n", + "5 micheal 23.0 Male 6.0\n", "6 julie 22.0 Female 6.0" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ranking_df.fillna(method='pad')" + "ranking_df.bfill()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_29111/3253257716.py:1: FutureWarning: DataFrame.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.\n", - " ranking_df.fillna(method='bfill')\n" - ] - }, { "data": { "text/html": [ @@ -709,30 +693,30 @@ " \n", " 2\n", " richard\n", - " 23.0\n", + " 22.0\n", " Male\n", " 4.0\n", " \n", " \n", " 3\n", - " randy\n", - " 23.0\n", + " richard\n", + " 22.0\n", " Male\n", - " 6.0\n", + " 4.0\n", " \n", " \n", " 4\n", " randy\n", - " 23.0\n", + " 22.0\n", " Male\n", - " 6.0\n", + " 4.0\n", " \n", " \n", " 5\n", " micheal\n", " 23.0\n", " Male\n", - " 6.0\n", + " 4.0\n", " \n", " \n", " 6\n", @@ -749,20 +733,20 @@ " names age gender rank\n", "0 steve 20.0 Male 2.0\n", "1 john 22.0 Male 1.0\n", - "2 richard 23.0 Male 4.0\n", - "3 randy 23.0 Male 6.0\n", - "4 randy 23.0 Male 6.0\n", - "5 micheal 23.0 Male 6.0\n", + "2 richard 22.0 Male 4.0\n", + "3 richard 22.0 Male 4.0\n", + "4 randy 22.0 Male 4.0\n", + "5 micheal 23.0 Male 4.0\n", "6 julie 22.0 Female 6.0" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ranking_df.fillna(method='bfill')" + "ranking_df.ffill()" ] }, { @@ -1218,7 +1202,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/02_05b.ipynb b/notebooks/02_05b.ipynb index 79444cd..f3583dd 100644 --- a/notebooks/02_05b.ipynb +++ b/notebooks/02_05b.ipynb @@ -19,6 +19,134 @@ "### Removing duplicates" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column 1column 2column 3
01aA
11aA
22bB
32bB
43cC
53cC
63cC
\n", + "
" + ], + "text/plain": [ + " column 1 column 2 column 3\n", + "0 1 a A\n", + "1 1 a A\n", + "2 2 b B\n", + "3 2 b B\n", + "4 3 c C\n", + "5 3 c C\n", + "6 3 c C" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DF_obj = DataFrame({'column 1': [1,1,2,2,3,3,3],\n", + " 'column 2':['a', 'a', 'b', 'b', 'c', 'c', 'c'],\n", + " 'column 3': ['A', 'A', 'B', 'B', 'C', 'C', 'C']})\n", + "DF_obj" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 True\n", + "4 False\n", + "5 True\n", + "6 True\n", + "dtype: bool" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DF_obj.duplicated()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -44,7 +172,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/02_06e.ipynb b/notebooks/02_06e.ipynb index 0c8562f..34bbb5c 100644 --- a/notebooks/02_06e.ipynb +++ b/notebooks/02_06e.ipynb @@ -1381,7 +1381,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/02_07b.ipynb b/notebooks/02_07b.ipynb index 8877c57..c769c56 100644 --- a/notebooks/02_07b.ipynb +++ b/notebooks/02_07b.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -17,6 +17,283 @@ "source": [ "### Grouping data by column index" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
car_namesmpgcyldisphpdratwtqsecvsamgearcarb
0Mazda RX421.06160.01103.902.62016.460144
1Mazda RX4 Wag21.06160.01103.902.87517.020144
2Datsun 71022.84108.0933.852.32018.611141
3Hornet 4 Drive21.46258.01103.083.21519.441031
4Hornet Sportabout18.78360.01753.153.44017.020032
\n", + "
" + ], + "text/plain": [ + " car_names mpg cyl disp hp drat wt qsec vs am gear \\\n", + "0 Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 \n", + "1 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 \n", + "2 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 \n", + "3 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 \n", + "4 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 \n", + "\n", + " carb \n", + "0 4 \n", + "1 4 \n", + "2 1 \n", + "3 1 \n", + "4 2 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "address = '/workspaces/python-for-data-science-and-machine-learning-essential-training-part-1-3006708/data/mtcars.csv'\n", + "\n", + "cars = pd.read_csv(address)\n", + "\n", + "cars.columns = ['car_names','mpg','cyl','disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']\n", + "\n", + "cars.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mpgdisphpdratwtqsecvsamgearcarb
cyl
426.663636105.13636482.6363644.0709092.28572719.1372730.9090910.7272734.0909091.545455
619.742857183.314286122.2857143.5857143.11714317.9771430.5714290.4285713.8571433.428571
815.100000353.100000209.2142863.2292863.99921416.7721430.0000000.1428573.2857143.500000
\n", + "
" + ], + "text/plain": [ + " mpg disp hp drat wt qsec \\\n", + "cyl \n", + "4 26.663636 105.136364 82.636364 4.070909 2.285727 19.137273 \n", + "6 19.742857 183.314286 122.285714 3.585714 3.117143 17.977143 \n", + "8 15.100000 353.100000 209.214286 3.229286 3.999214 16.772143 \n", + "\n", + " vs am gear carb \n", + "cyl \n", + "4 0.909091 0.727273 4.090909 1.545455 \n", + "6 0.571429 0.428571 3.857143 3.428571 \n", + "8 0.000000 0.142857 3.285714 3.500000 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cars_groups = cars.groupby(cars['cyl'])\n", + "cars_groups.mean(numeric_only=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -35,7 +312,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.12.1" } }, "nbformat": 4, diff --git a/notebooks/02_07e.ipynb b/notebooks/02_07e.ipynb index 57a29e5..f21c71d 100644 --- a/notebooks/02_07e.ipynb +++ b/notebooks/02_07e.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/notebooks/02__03b.ipynb b/notebooks/02__03b.ipynb index 75d4a75..1843e26 100644 --- a/notebooks/02__03b.ipynb +++ b/notebooks/02__03b.ipynb @@ -9,10 +9,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "acd063a4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pip in /usr/local/python/3.12.1/lib/python3.12/site-packages (25.3)\n", + "Collecting pip\n", + " Downloading pip-26.0.1-py3-none-any.whl.metadata (4.7 kB)\n", + "Downloading pip-26.0.1-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m36.5 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: pip\n", + " Attempting uninstall: pip\n", + " Found existing installation: pip 25.3\n", + " Uninstalling pip-25.3:\n", + " Successfully uninstalled pip-25.3\n", + "Successfully installed pip-26.0.1\n", + "Collecting pandas\n", + " Downloading pandas-3.0.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (79 kB)\n", + "Collecting numpy>=1.26.0 (from pandas)\n", + " Downloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.6 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /home/codespace/.local/lib/python3.12/site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: six>=1.5 in /home/codespace/.local/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", + "Downloading pandas-3.0.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (10.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.9/10.9 MB\u001b[0m \u001b[31m25.6 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m6m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading numpy-2.4.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.6/16.6 MB\u001b[0m \u001b[31m73.9 MB/s\u001b[0m \u001b[33m0:00:00\u001b[0m6m0:00:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: numpy, pandas\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2/2\u001b[0m [pandas]2m1/2\u001b[0m [pandas]\n", + "\u001b[1A\u001b[2KSuccessfully installed numpy-2.4.2 pandas-3.0.1\n" + ] + } + ], "source": [ "!pip install --upgrade pip \n", "!pip install pandas " @@ -25,6 +56,634 @@ "source": [ "#### Comparison operators (> < = <= => == !=) and Masking." ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6b6736e3", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0710cd0e", + "metadata": {}, + "outputs": [], + "source": [ + "from pandas import DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a3028931", + "metadata": {}, + "outputs": [], + "source": [ + "numbers_df = DataFrame(np.arange(0,90,3).reshape(10,3)\n", + " , index=['row 1','row 2','row 3','row 4','row 5','row 6','row 7','row 8','row 9','row 10']\n", + " , columns=['column 1','column 2','column 3'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5833bb4a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column 1column 2column 3
row 1036
row 291215
row 3182124
row 4273033
row 5363942
row 6454851
row 7545760
row 8636669
row 9727578
row 10818487
\n", + "
" + ], + "text/plain": [ + " column 1 column 2 column 3\n", + "row 1 0 3 6\n", + "row 2 9 12 15\n", + "row 3 18 21 24\n", + "row 4 27 30 33\n", + "row 5 36 39 42\n", + "row 6 45 48 51\n", + "row 7 54 57 60\n", + "row 8 63 66 69\n", + "row 9 72 75 78\n", + "row 10 81 84 87" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775fe46", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(3)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers_df.iloc[0,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bcecc000", + "metadata": {}, + "outputs": [], + "source": [ + "numbers_df.iloc[0,1] =20" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7c9b1337", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column 1column 2column 3
row 10206
row 291215
row 3182124
row 4273033
row 5363942
row 6454851
row 7545760
row 8636669
row 9727578
row 10818487
\n", + "
" + ], + "text/plain": [ + " column 1 column 2 column 3\n", + "row 1 0 20 6\n", + "row 2 9 12 15\n", + "row 3 18 21 24\n", + "row 4 27 30 33\n", + "row 5 36 39 42\n", + "row 6 45 48 51\n", + "row 7 54 57 60\n", + "row 8 63 66 69\n", + "row 9 72 75 78\n", + "row 10 81 84 87" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers_df" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7dca7a2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column 2column 3
row 21215
row 32124
row 43033
\n", + "
" + ], + "text/plain": [ + " column 2 column 3\n", + "row 2 12 15\n", + "row 3 21 24\n", + "row 4 30 33" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers_df.iloc[[1,2,3],[1,2]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2a39a682", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column 1column 2column 3
row 1FalseFalseFalse
row 2FalseFalseFalse
row 3FalseFalseFalse
row 4FalseFalseTrue
row 5TrueTrueTrue
row 6TrueTrueTrue
row 7TrueTrueTrue
row 8TrueTrueTrue
row 9TrueTrueTrue
row 10TrueTrueTrue
\n", + "
" + ], + "text/plain": [ + " column 1 column 2 column 3\n", + "row 1 False False False\n", + "row 2 False False False\n", + "row 3 False False False\n", + "row 4 False False True\n", + "row 5 True True True\n", + "row 6 True True True\n", + "row 7 True True True\n", + "row 8 True True True\n", + "row 9 True True True\n", + "row 10 True True True" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mask = numbers_df>30\n", + "mask" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "234ed7c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
column 1column 2column 3
row 1NaNNaNNaN
row 2NaNNaNNaN
row 3NaNNaNNaN
row 4NaNNaN33.0
row 536.039.042.0
row 645.048.051.0
row 754.057.060.0
row 863.066.069.0
row 972.075.078.0
row 1081.084.087.0
\n", + "
" + ], + "text/plain": [ + " column 1 column 2 column 3\n", + "row 1 NaN NaN NaN\n", + "row 2 NaN NaN NaN\n", + "row 3 NaN NaN NaN\n", + "row 4 NaN NaN 33.0\n", + "row 5 36.0 39.0 42.0\n", + "row 6 45.0 48.0 51.0\n", + "row 7 54.0 57.0 60.0\n", + "row 8 63.0 66.0 69.0\n", + "row 9 72.0 75.0 78.0\n", + "row 10 81.0 84.0 87.0" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numbers_df[mask]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6bdbad8", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8b88c21", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {