Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
352 changes: 352 additions & 0 deletions magazine_author_types.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,352 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
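"# Label magazine authors by type\n",
"\n",
"Reads the merged magazine-author spreadsheet, adds an `author_type` column (`meeting`, `organization`, `person`, or `unknown`) based on which name column is filled in for each row, and writes the labelled table to CSV.\n"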
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Input: the merged author spreadsheet that is read with pd.read_excel.\n",
"# Output: the CSV that the table is written to after the author_type column is added.\n",
"magazine_authors_file = \"data/magazine_authors-2023-03-25-merged.xlsx\"\n",
"magazine_authors_export_file = \"data/magazine_authors-2023-03-25-merged-labelled.csv\""
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# duplicate_of_id has empty cells, so pandas loads it as float and the ids would\n",
"# export as e.g. 1108.0. The nullable Int64 dtype keeps them as whole numbers\n",
"# while still allowing missing values.\n",
"magazine_authors = pd.read_excel(magazine_authors_file).astype(\n",
"    {\"duplicate_of_id\": \"Int64\"}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3106, 9)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (rows, columns) of the loaded spreadsheet, as a quick check on the load.\n",
"magazine_authors.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def categorize_author(row):\n",
"    \"\"\"Label one author record by the kind of entity it names.\n",
"\n",
"    The name columns are checked in priority order: ``meeting_name``,\n",
"    ``organization_name``, then ``given_name`` / ``family_name``. The first\n",
"    column that holds a usable value decides the label.\n",
"\n",
"    Args:\n",
"        row: A mapping-like record (for example one DataFrame row) with the\n",
"            keys ``meeting_name``, ``organization_name``, ``given_name`` and\n",
"            ``family_name``.\n",
"\n",
"    Returns:\n",
"        ``\"meeting\"``, ``\"organization\"``, ``\"person\"`` or ``\"unknown\"``.\n",
"\n",
"    Raises:\n",
"        KeyError: If ``row`` lacks one of the columns that gets checked.\n",
"    \"\"\"\n",
"\n",
"    def has_value(column):\n",
"        value = row[column]\n",
"        if pd.isnull(value):\n",
"            return False\n",
"        # An empty or whitespace-only cell is not null but carries no name,\n",
"        # so it must not claim the row for its category.\n",
"        return not (isinstance(value, str) and not value.strip())\n",
"\n",
"    if has_value(\"meeting_name\"):\n",
"        return \"meeting\"\n",
"    if has_value(\"organization_name\"):\n",
"        return \"organization\"\n",
"    # Either half of a personal name is enough to call the author a person.\n",
"    if has_value(\"given_name\") or has_value(\"family_name\"):\n",
"        return \"person\"\n",
"    return \"unknown\"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Label every record: axis=1 makes apply pass each row to categorize_author in turn.\n",
"magazine_authors[\"author_type\"] = magazine_authors.apply(categorize_author, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>drupal_full_name</th>\n",
" <th>drupal_author_id</th>\n",
" <th>duplicate_of_id</th>\n",
" <th>given_name</th>\n",
" <th>family_name</th>\n",
" <th>organization_name</th>\n",
" <th>meeting_name</th>\n",
" <th>Notes</th>\n",
" <th>civicrm_id</th>\n",
" <th>author_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>A. N. Whitehead</td>\n",
" <td>1832</td>\n",
" <td>NaN</td>\n",
" <td>A. N.</td>\n",
" <td>Whitehead</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>A. S.</td>\n",
" <td>1597</td>\n",
" <td>1108.0</td>\n",
" <td>NaN</td>\n",
" <td>Anonymous</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>A. Sidney Wright</td>\n",
" <td>1789</td>\n",
" <td>NaN</td>\n",
" <td>A. Sidney</td>\n",
" <td>Wright</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A. Stanley Thompson</td>\n",
" <td>2801</td>\n",
" <td>NaN</td>\n",
" <td>A. Stanley</td>\n",
" <td>Thompson</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>A.F. Anderson</td>\n",
" <td>2615</td>\n",
" <td>2351.0</td>\n",
" <td>Alfred F.</td>\n",
" <td>Anderson</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>Ann Birch</td>\n",
" <td>4951</td>\n",
" <td>NaN</td>\n",
" <td>Ann</td>\n",
" <td>Birch</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>Ann Bishop</td>\n",
" <td>3191</td>\n",
" <td>NaN</td>\n",
" <td>Ann</td>\n",
" <td>Bishop</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>Ann Boone</td>\n",
" <td>219</td>\n",
" <td>NaN</td>\n",
" <td>Ann</td>\n",
" <td>Boone</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>Ann C. Stever</td>\n",
" <td>2618</td>\n",
" <td>2523.0</td>\n",
" <td>Ann C.</td>\n",
" <td>Stever</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>Ann Denham</td>\n",
" <td>2658</td>\n",
" <td>NaN</td>\n",
" <td>Ann</td>\n",
" <td>Denham</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>person</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 10 columns</p>\n",
"</div>"
],
"text/plain": [
" drupal_full_name drupal_author_id duplicate_of_id given_name \\\n",
"0 A. N. Whitehead 1832 NaN A. N. \n",
"1 A. S. 1597 1108.0 NaN \n",
"2 A. Sidney Wright 1789 NaN A. Sidney \n",
"3 A. Stanley Thompson 2801 NaN A. Stanley \n",
"4 A.F. Anderson 2615 2351.0 Alfred F. \n",
".. ... ... ... ... \n",
"95 Ann Birch 4951 NaN Ann \n",
"96 Ann Bishop 3191 NaN Ann \n",
"97 Ann Boone 219 NaN Ann \n",
"98 Ann C. Stever 2618 2523.0 Ann C. \n",
"99 Ann Denham 2658 NaN Ann \n",
"\n",
" family_name organization_name meeting_name Notes civicrm_id author_type \n",
"0 Whitehead NaN NaN NaN NaN person \n",
"1 Anonymous NaN NaN NaN NaN person \n",
"2 Wright NaN NaN NaN NaN person \n",
"3 Thompson NaN NaN NaN NaN person \n",
"4 Anderson NaN NaN NaN NaN person \n",
".. ... ... ... ... ... ... \n",
"95 Birch NaN NaN NaN NaN person \n",
"96 Bishop NaN NaN NaN NaN person \n",
"97 Boone NaN NaN NaN NaN person \n",
"98 Stever NaN NaN NaN NaN person \n",
"99 Denham NaN NaN NaN NaN person \n",
"\n",
"[100 rows x 10 columns]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check the first 100 rows, including the new author_type column.\n",
"magazine_authors.head(n=100)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Write the labelled table to CSV; index=False keeps the DataFrame index out of the file.\n",
"magazine_authors.to_csv(magazine_authors_export_file, index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}