From ab67ff84b156d38b4293e529dbba318d1df4a485 Mon Sep 17 00:00:00 2001 From: Brylie Christopher Oxley Date: Fri, 7 Apr 2023 13:19:11 +0300 Subject: [PATCH] Add script to label magazine author types --- magazine_author_types.ipynb | 352 ++++++++++++++++++++++++++++++++++++ 1 file changed, 352 insertions(+) create mode 100644 magazine_author_types.ipynb diff --git a/magazine_author_types.ipynb b/magazine_author_types.ipynb new file mode 100644 index 0000000..308a571 --- /dev/null +++ b/magazine_author_types.ipynb @@ -0,0 +1,352 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Label magazine authors by type\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors_file = \"data/magazine_authors-2023-03-25-merged.xlsx\"\n", + "magazine_authors_export_file = \"data/magazine_authors-2023-03-25-merged-labelled.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors = pd.read_excel(magazine_authors_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3106, 9)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "magazine_authors.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def categorize_author(row):\n", + "    \"\"\"Label one author row as meeting, organization, person or unknown.\"\"\"\n", + "    # The first non-null field wins: meeting_name, then organization_name,\n", + "    # then either personal-name field (given_name or family_name).\n", + "    if pd.notnull(row[\"meeting_name\"]):\n", + "        return \"meeting\"\n", + "    if pd.notnull(row[\"organization_name\"]):\n", + "        return \"organization\"\n", + "    if pd.notnull(row[\"given_name\"]) or pd.notnull(row[\"family_name\"]):\n", + "        return \"person\"\n", 
+ "    return \"unknown\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors[\"author_type\"] = magazine_authors.apply(categorize_author, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
drupal_full_namedrupal_author_idduplicate_of_idgiven_namefamily_nameorganization_namemeeting_nameNotescivicrm_idauthor_type
0A. N. Whitehead1832NaNA. N.WhiteheadNaNNaNNaNNaNperson
1A. S.15971108.0NaNAnonymousNaNNaNNaNNaNperson
2A. Sidney Wright1789NaNA. SidneyWrightNaNNaNNaNNaNperson
3A. Stanley Thompson2801NaNA. StanleyThompsonNaNNaNNaNNaNperson
4A.F. Anderson26152351.0Alfred F.AndersonNaNNaNNaNNaNperson
.................................
95Ann Birch4951NaNAnnBirchNaNNaNNaNNaNperson
96Ann Bishop3191NaNAnnBishopNaNNaNNaNNaNperson
97Ann Boone219NaNAnnBooneNaNNaNNaNNaNperson
98Ann C. Stever26182523.0Ann C.SteverNaNNaNNaNNaNperson
99Ann Denham2658NaNAnnDenhamNaNNaNNaNNaNperson
\n", + "

100 rows × 10 columns

\n", + "
" + ], + "text/plain": [ + " drupal_full_name drupal_author_id duplicate_of_id given_name \\\n", + "0 A. N. Whitehead 1832 NaN A. N. \n", + "1 A. S. 1597 1108.0 NaN \n", + "2 A. Sidney Wright 1789 NaN A. Sidney \n", + "3 A. Stanley Thompson 2801 NaN A. Stanley \n", + "4 A.F. Anderson 2615 2351.0 Alfred F. \n", + ".. ... ... ... ... \n", + "95 Ann Birch 4951 NaN Ann \n", + "96 Ann Bishop 3191 NaN Ann \n", + "97 Ann Boone 219 NaN Ann \n", + "98 Ann C. Stever 2618 2523.0 Ann C. \n", + "99 Ann Denham 2658 NaN Ann \n", + "\n", + " family_name organization_name meeting_name Notes civicrm_id author_type \n", + "0 Whitehead NaN NaN NaN NaN person \n", + "1 Anonymous NaN NaN NaN NaN person \n", + "2 Wright NaN NaN NaN NaN person \n", + "3 Thompson NaN NaN NaN NaN person \n", + "4 Anderson NaN NaN NaN NaN person \n", + ".. ... ... ... ... ... ... \n", + "95 Birch NaN NaN NaN NaN person \n", + "96 Bishop NaN NaN NaN NaN person \n", + "97 Boone NaN NaN NaN NaN person \n", + "98 Stever NaN NaN NaN NaN person \n", + "99 Denham NaN NaN NaN NaN person \n", + "\n", + "[100 rows x 10 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "magazine_authors.head(n=100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "magazine_authors.to_csv(magazine_authors_export_file, index=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}