empa-scientific-it
diff --git a/‎30_introduction_data_exploration.ipynb‎
Lines changed: 19 additions & 17 deletions b/‎30_introduction_data_exploration.ipynb‎
Lines changed: 19 additions & 17 deletions
diff --git a/‎…/World-happiness-report-updated_2024.csv‎ ‎…/World-happiness-report-updated_2024.csv‎data/plotly_intro/World-happiness-report-updated_2024.csv renamed to data/data_exploration/World-happiness-report-updated_2024.csv b/‎…/World-happiness-report-updated_2024.csv‎ ‎…/World-happiness-report-updated_2024.csv‎data/plotly_intro/World-happiness-report-updated_2024.csv renamed to data/data_exploration/World-happiness-report-updated_2024.csv
diff --git a/‎data/plotly_intro/bubbly.py‎ ‎data/data_exploration/bubbly.py‎data/plotly_intro/bubbly.py renamed to data/data_exploration/bubbly.py b/‎data/plotly_intro/bubbly.py‎ ‎data/data_exploration/bubbly.py‎data/plotly_intro/bubbly.py renamed to data/data_exploration/bubbly.py
diff --git a/‎…/plotly_intro/country_region_mapping.csv‎ ‎…a_exploration/country_region_mapping.csv‎data/plotly_intro/country_region_mapping.csv renamed to data/data_exploration/country_region_mapping.csv b/‎…/plotly_intro/country_region_mapping.csv‎ ‎…a_exploration/country_region_mapping.csv‎data/plotly_intro/country_region_mapping.csv renamed to data/data_exploration/country_region_mapping.csv
diff --git a/‎data/plotly_intro/gapminder.tsv‎ ‎data/data_exploration/gapminder.tsv‎data/plotly_intro/gapminder.tsv renamed to data/data_exploration/gapminder.tsv b/‎data/plotly_intro/gapminder.tsv‎ ‎data/data_exploration/gapminder.tsv‎data/plotly_intro/gapminder.tsv renamed to data/data_exploration/gapminder.tsv
diff --git a/‎tutorial/intro_plotly_helper.py‎ ‎tutorial/data_exploration_helper.py‎tutorial/intro_plotly_helper.py renamed to tutorial/data_exploration_helper.py
Lines changed: 2 additions & 2 deletions b/‎tutorial/intro_plotly_helper.py‎ ‎tutorial/data_exploration_helper.py‎tutorial/intro_plotly_helper.py renamed to tutorial/data_exploration_helper.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎…sts/test_30_plotly_intro_to_libraries.py‎ ‎…test_30_introduction_data_exploration.py‎tutorial/tests/test_30_plotly_intro_to_libraries.py renamed to tutorial/tests/test_30_introduction_data_exploration.py
Lines changed: 12 additions & 6 deletions b/‎…sts/test_30_plotly_intro_to_libraries.py‎ ‎…test_30_introduction_data_exploration.py‎tutorial/tests/test_30_plotly_intro_to_libraries.py renamed to tutorial/tests/test_30_introduction_data_exploration.py
Lines changed: 12 additions & 6 deletions
@@ -54,7 +54,7 @@
     "import pandas as pd\n",
     "from tutorial.my_bubbly import bubbleplot \n",
     "from plotly.offline import iplot\n",
-    "path = \"data/plotly_intro\"\n",
+    "path = \"data/data_exploration\"\n",
     "gapminder_indicators = pd.read_csv(path + '/gapminder.tsv', delimiter='\\t')\n",
     "\n",
     "figure = bubbleplot(dataset=gapminder_indicators, x_column='gdpPercap', y_column='lifeExp', \n",
@@ -258,7 +258,7 @@
     "    Reads in a CSV file containing happiness data and returns it as a pandas DataFrame.\n",
     "\n",
     "    Instructions:\n",
-    "        - Use the `path_to_happiness` which will be `data/plotly_intro/World-happiness-report-updated_2024.csv`.\n",
+    "        - Use the `path_to_happiness` which will be `data/data_exploration/World-happiness-report-updated_2024.csv`.\n",
     "        - Read in the CSV into a DataFrame using `pd.read_csv`.\n",
     "        - Ensure the encoding is set to 'latin1' as the file is formatted accordingly.\n",
     "\n",
@@ -286,7 +286,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "happyness = pd.read_csv('data/plotly_intro/World-happiness-report-updated_2024.csv', encoding='latin1')\n",
+    "happyness = pd.read_csv('data/data_exploration/World-happiness-report-updated_2024.csv', encoding='latin1')\n",
     "happyness.describe()\n"
    ]
   },
@@ -347,7 +347,7 @@
    "source": [
     "import pandas as pd\n",
     "\n",
-    "happyness = pd.read_csv('data/plotly_intro/World-happiness-report-updated_2024.csv', encoding='latin1')\n",
+    "happyness = pd.read_csv('data/data_exploration/World-happiness-report-updated_2024.csv', encoding='latin1')\n",
     "\n",
     "# Assuming your dataframe is loaded into 'df'\n",
     "df = happyness\n",
@@ -437,7 +437,7 @@
     "import pandas as pd\n",
     "import matplotlib.pyplot as plt\n",
     "\n",
-    "happiness = pd.read_csv('data/plotly_intro/World-happiness-report-updated_2024.csv', encoding='latin1')\n",
+    "happiness = pd.read_csv('data/data_exploration/World-happiness-report-updated_2024.csv', encoding='latin1')\n",
     "years = happiness['year'].unique()\n",
     "print(f\"Unique years in the dataset: {sorted(years)}\")\n",
     "\n",
@@ -482,7 +482,7 @@
     "\n",
     "In this exercise we want to complete the dataframe with missing values. Complete the function below to \n",
     "\n",
-    "  1) Fill in missing years for every country (so we have an entry for every year between 2005 and 2023 and every country). Do this by initializing a DataFrame with `pd.DataFrame()` with a list.\n",
+    "  1) Fill in missing years for every country (so we have an entry for every year between 2005 and 2023 and every country). Do this by initializing a DataFrame with `pd.DataFrame()` with a list. Then left-merge the happiness dataframe onto it with `pd.merge()`.\n",
     "  2) Fill all missing values in the year 2005 with the value 1. Use the `.fillna()` function.\n",
     "  3) Forwardfill all the remaining years with the function `.ffill()`. (To forward fill the order of the dataframe is important! Make sure to sort first.)"
    ]
@@ -681,7 +681,7 @@
    "outputs": [],
    "source": [
     "# Define the dataset and the columns\n",
-    "from tutorial.intro_plotly_helper import get_happiness_data, get_clean_dataset_with_region\n",
+    "from tutorial.data_exploration_helper import get_happiness_data, get_clean_dataset_with_region\n",
     "from plotly.offline import iplot\n",
     "dataset = get_clean_dataset_with_region(get_happiness_data())\n",
     "x_column = 'Freedom to make life choices'\n",
@@ -698,7 +698,7 @@
     "    'frames': []\n",
     "}\n",
     "\n",
-    "# Get a random representative year\n",
+    "# Pick an example year that is present in the dataset\n",
     "year = 2010\n",
     "\n",
     "# Make the trace\n",
@@ -743,7 +743,7 @@
    },
    "outputs": [],
    "source": [
-    "from tutorial.intro_plotly_helper import get_happiness_data, get_clean_dataset_with_region, get_scatter_figure\n",
+    "from tutorial.data_exploration_helper import get_happiness_data, get_clean_dataset_with_region, get_scatter_figure\n",
     "from plotly.offline import iplot\n",
     "\n",
     "dataset = get_clean_dataset_with_region(get_happiness_data())\n",
@@ -806,7 +806,7 @@
    },
    "outputs": [],
    "source": [
-    "from tutorial.intro_plotly_helper import full_clean_dataset, get_scatter_figure_with_years\n",
+    "from tutorial.data_exploration_helper import full_clean_dataset, get_scatter_figure_with_years\n",
     "from plotly.offline import iplot\n",
     "\n",
     "dataset = full_clean_dataset()\n",
@@ -955,7 +955,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from tutorial.intro_plotly_helper import get_happiness_data, get_clean_dataset_with_region\n",
+    "from tutorial.data_exploration_helper import get_happiness_data, get_clean_dataset_with_region\n",
     "import pandas as pd\n",
     "import numpy as np\n",
     "\n",
@@ -993,7 +993,7 @@
    },
    "outputs": [],
    "source": [
-    "from tutorial.intro_plotly_helper import set_layout, full_clean_dataset\n",
+    "from tutorial.data_exploration_helper import set_layout, full_clean_dataset\n",
     "from plotly.offline import iplot\n",
     "\n",
     "\n",
@@ -1176,7 +1176,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from tutorial.intro_plotly_helper import load_full_happiness_figure\n",
+    "from tutorial.data_exploration_helper import load_full_happiness_figure\n",
     "from plotly.offline import iplot\n",
     "\n",
     "figure = load_full_happiness_figure()\n",
@@ -1201,6 +1201,8 @@
     "So as an exercise we exported the bubbly library as a file bubbly.py into the folder data/data_exploration. It is quite a short library, so quite manageable.\n",
     "Try to figure out what the error is exactly and then fix the library locally by modifying only the file `data/data_exploration/bubbly.py` until the same code below runs without errors.\n",
     "\n",
+    "Note: You will need to restart the kernel after each change to `bubbly.py`, because an already imported module is not reloaded automatically.\n",
+    "\n",
     "(If you are interested in a solution, we have a fixed version under tutorial.my_bubbly.py, feel free to check the differences.)\n"
    ]
   },
@@ -1212,9 +1214,9 @@
    "source": [
     "import pandas as pd\n",
     "# from bubbly.bubbly import bubbleplot\n",
-    "from data.plotly_intro.bubbly import bubbleplot \n",
+    "from data.data_exploration.bubbly import bubbleplot \n",
     "from plotly.offline import iplot\n",
-    "path = \"data/plotly_intro\"\n",
+    "path = \"data/data_exploration\"\n",
     "gapminder_indicators = pd.read_csv(path + '/gapminder.tsv', delimiter='\\t')\n",
     "\n",
     "figure = bubbleplot(dataset=gapminder_indicators, x_column='gdpPercap', y_column='lifeExp', \n",
@@ -1242,7 +1244,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "python-tutorial",
    "language": "python",
    "name": "python3"
   },
@@ -1256,7 +1258,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.10"
+   "version": "3.10.15"
   }
  },
  "nbformat": 4,
 
@@ -7,7 +7,7 @@
 def get_happiness_data():
     # Load the dataset
     happiness_df = pd.read_csv(
-        "data/plotly_intro/World-happiness-report-updated_2024.csv",
+        "data/data_exploration/World-happiness-report-updated_2024.csv",
         encoding="latin1",
         usecols=[
             "Freedom to make life choices",
@@ -61,7 +61,7 @@ def get_clean_dataset_with_region(happiness_df: pd.DataFrame) -> pd.DataFrame:
 
     # Load the region mapping
     region_df = pd.read_csv(
-        "data/plotly_intro/country_region_mapping.csv",
+        "data/data_exploration/country_region_mapping.csv",
         encoding="latin1",
         usecols=["Country name", "Regional indicator"],
     ).drop_duplicates()
 
@@ -1,8 +1,8 @@
 import numpy as np
 import pandas as pd
-import pytest
 
-from tutorial.intro_plotly_helper import (
+import pytest
+from tutorial.data_exploration_helper import (
     full_clean_dataset,
     get_clean_dataset,
     get_happiness_data,
@@ -24,7 +24,7 @@ def reference_read_in_dataframe(path_to_happiness: str) -> pd.DataFrame:
 def test_read_in_dataframe(input_arg, function_to_test):
     """The test case(s)"""
     # Get the path to the data
-    path_to_happiness = "data/plotly_intro/World-happiness-report-updated_2024.csv"
+    path_to_happiness = "data/data_exploration/World-happiness-report-updated_2024.csv"
 
     # Read in the data
     happiness_df = reference_read_in_dataframe(path_to_happiness)
@@ -71,9 +71,15 @@ def test_clean_dataset(input_arg, function_to_test):
 
     clean_ref = reference_clean_dataset(hapiness_df)
     clean_sol = function_to_test(hapiness_df)
+    clean_ref_sorted = clean_ref.sort_values(by=["Country name", "year"]).reset_index(
+        drop=True
+    )
+    clean_sol_sorted = clean_sol.sort_values(by=["Country name", "year"]).reset_index(
+        drop=True
+    )
 
-    # Check if the two DataFrames are equal
-    assert clean_ref.equals(clean_sol)
+    # Check if the two DataFrames are equal, ignoring the index
+    assert clean_ref_sorted.equals(clean_sol_sorted)
 
     import matplotlib.pyplot as plt
 
@@ -112,7 +118,7 @@ def test_add_regional_indicator(input_arg, function_to_test):
 
     # Load the region mapping
     region_df = pd.read_csv(
-        "data/plotly_intro/country_region_mapping.csv",
+        "data/data_exploration/country_region_mapping.csv",
         encoding="latin1",
         usecols=["Country name", "Regional indicator"],
     ).drop_duplicates()