diff --git a/magazine_author_types.ipynb b/magazine_author_types.ipynb new file mode 100644 index 0000000..308a571 --- /dev/null +++ b/magazine_author_types.ipynb @@ -0,0 +1,352 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Label magazine authors by type\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors_file = \"data/magazine_authors-2023-03-25-merged.xlsx\"\n", + "magazine_authors_export_file = \"data/magazine_authors-2023-03-25-merged-labelled.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors = pd.read_excel(magazine_authors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3106, 9)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "magazine_authors.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def categorize_author(row): \n", + " if pd.notnull(row[\"meeting_name\"]):\n", + " return \"meeting\"\n", + " elif pd.notnull(row[\"organization_name\"]):\n", + " return \"organization\"\n", + " elif pd.notnull(row[\"given_name\"]):\n", + " return \"person\"\n", + " elif pd.notnull(row[\"family_name\"]):\n", + " return \"person\"\n", + " else:\n", + " return \"unknown\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors[\"author_type\"] = magazine_authors.apply(lambda row: categorize_author(row), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | drupal_full_name | \n", + "drupal_author_id | \n", + "duplicate_of_id | \n", + "given_name | \n", + "family_name | \n", + "organization_name | \n", + "meeting_name | \n", + "Notes | \n", + "civicrm_id | \n", + "author_type | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "A. N. Whitehead | \n", + "1832 | \n", + "NaN | \n", + "A. N. | \n", + "Whitehead | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 1 | \n", + "A. S. | \n", + "1597 | \n", + "1108.0 | \n", + "NaN | \n", + "Anonymous | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 2 | \n", + "A. Sidney Wright | \n", + "1789 | \n", + "NaN | \n", + "A. Sidney | \n", + "Wright | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 3 | \n", + "A. Stanley Thompson | \n", + "2801 | \n", + "NaN | \n", + "A. Stanley | \n", + "Thompson | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 4 | \n", + "A.F. Anderson | \n", + "2615 | \n", + "2351.0 | \n", + "Alfred F. | \n", + "Anderson | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 95 | \n", + "Ann Birch | \n", + "4951 | \n", + "NaN | \n", + "Ann | \n", + "Birch | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 96 | \n", + "Ann Bishop | \n", + "3191 | \n", + "NaN | \n", + "Ann | \n", + "Bishop | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 97 | \n", + "Ann Boone | \n", + "219 | \n", + "NaN | \n", + "Ann | \n", + "Boone | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 98 | \n", + "Ann C. Stever | \n", + "2618 | \n", + "2523.0 | \n", + "Ann C. | \n", + "Stever | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
| 99 | \n", + "Ann Denham | \n", + "2658 | \n", + "NaN | \n", + "Ann | \n", + "Denham | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "person | \n", + "
100 rows × 10 columns
\n", + "