diff --git a/demos/demos_by_use_case/bio/BiogridDemo.ipynb b/demos/demos_by_use_case/bio/BiogridDemo.ipynb index 98e57ae8e9..afbab486b5 100644 --- a/demos/demos_by_use_case/bio/BiogridDemo.ipynb +++ b/demos/demos_by_use_case/bio/BiogridDemo.ipynb @@ -6,27 +6,29 @@ "source": [ "# PyGraphistry Tutorial: Visualize Protein Interactions From BioGrid\n", "\n", - "That is over 600.000 interactions across 50'000 proteins!\n", + "That is over 600 000 interactions across 50 000 proteins!\n", "\n", "##### Notes\n", "\n", - "This notebook automatically downloads about 200 MB of [BioGrid](http://thebiogrid.org) data. If you are going to run this notebook more than once, we recommend manually dowloading and saving the data to disk. To do so, unzip the two files and place their content in `pygraphistry/demos/data`.\n", - "- Protein Interactions: [BIOGRID-ALL-3.3.123.tab2.zip](http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-ALL-3.3.123.tab2.zip)\n", - "- Protein Identifiers: [BIOGRID-IDENTIFIERS-3.3.123.tab.zip](http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.3.123/BIOGRID-IDENTIFIERS-3.3.123.tab.zip)\n" + "This notebook automatically downloads about 200 MB of [BioGrid](http://thebiogrid.org) data and caches it under `./data`, so later runs skip the download.\n", + "- Protein Interactions: [BIOGRID-ALL-5.0.252.tab2.zip](https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-5.0.252/BIOGRID-ALL-5.0.252.tab2.zip)\n", + "- Protein Identifiers: [BIOGRID-IDENTIFIERS-5.0.252.tab.zip](https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-5.0.252/BIOGRID-IDENTIFIERS-5.0.252.tab.zip)\n" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "import os\n", + "import requests\n", "import pandas\n", "import graphistry\n", "\n", "# To specify Graphistry account & server, use:\n", "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", - "# For more
options: https://pygraphistry.readthedocs.io/en/latest/server/register.html\n" + "# For more options: https://pygraphistry.readthedocs.io/en/latest/server/register.html" ] }, { @@ -39,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 16, "metadata": { "scrolled": false }, @@ -48,26 +50,107 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" + "/tmp/ipykernel_887036/1971870635.py:6: DtypeWarning: Columns (9,10,19,20) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " rawdata = pandas.read_table(local_path, na_values=['-'], engine='c', compression='zip')\n" ] }, { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor A", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor B", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Official Symbol Interactor A", + "rawType": "object", + "type": "string" + }, + { + "name": "Official Symbol Interactor B", + "rawType": "object", + "type": "string" + }, + { + "name": "Pubmed ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Author", + "rawType": "object", + "type": "string" + }, + { + "name": "Throughput", + "rawType": "object", + "type": "string" + } + ], + "ref": "b2e86da8-9547-4b2a-8d62-e592b31af662", + "rows": [ + [ + "0", + "112315", + "108607", + "MAP2K4", + "FLNC", + "9006895", + "Marti A (1997)", + "Low Throughput" + ], + [ + "1", + "124185", + "106603", + "MYPN", + "ACTN2", + "11309420", + "Bang ML (2001)", + "Low Throughput" + ], + [ + "2", + "106605", + "108625", + "ACVR1", + "FNTA", + "8599089", + 
"Wang T (1996)", + "Low Throughput" + ] + ], + "shape": { + "columns": 7, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "\n", " \n", @@ -134,17 +217,18 @@ "2 Wang T (1996) Low Throughput " ] }, - "execution_count": 2, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "url1 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-ALL-3.3.123.tab2.txt.gz'\n", - "rawdata = pandas.read_table(url1, na_values=['-'], engine='c', compression='gzip')\n", + "local_path = './data/BIOGRID-ALL-5.0.252.tab2.zip'\n", + "if not os.path.exists(local_path):\n", + " os.makedirs(os.path.dirname(local_path), exist_ok=True)\n", + " open(local_path, 'wb').write(requests.get('https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-5.0.252/BIOGRID-ALL-5.0.252.tab2.zip').content)\n", "\n", - "# If using local data, comment the two lines above and uncomment the line below\n", - "# pandas.read_table('./data/BIOGRID-ALL-3.3.123.tab2.txt', na_values=['-'], engine='c')\n", + "rawdata = pandas.read_table(local_path, na_values=['-'], engine='c', compression='zip')\n", "\n", "cols = ['BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Official Symbol Interactor A', \n", " 'Official Symbol Interactor B', 'Pubmed ID', 'Author', 'Throughput']\n", @@ -162,23 +246,25 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", " \n", " \n", " " ], @@ -186,14 +272,15 @@ "" ] }, - "execution_count": 3, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "g = graphistry.bind(source=\"BioGRID ID Interactor A\", destination=\"BioGRID ID Interactor B\")\n", - "g.plot(interactions.sample(10000))" + "result = g.plot(interactions.sample(10000), render='ipython', name=\"Biogrid Min\")\n", + "result" ] }, { @@ -206,27 +293,68 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + 
"columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "ORGANISM", + "rawType": "object", + "type": "string" + } + ], + "ref": "f9491f39-29cf-4d7d-a0c7-867582159c64", + "rows": [ + [ + "0", + "1", + "Arabidopsis thaliana" + ], + [ + "6", + "2", + "Arabidopsis thaliana" + ], + [ + "20", + "3", + "Arabidopsis thaliana" + ] + ], + "shape": { + "columns": 2, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -243,12 +371,12 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -259,59 +387,92 @@ "text/plain": [ " BIOGRID_ID ORGANISM\n", "0 1 Arabidopsis thaliana\n", - "7 2 Arabidopsis thaliana\n", - "22 3 Arabidopsis thaliana" + "6 2 Arabidopsis thaliana\n", + "20 3 Arabidopsis thaliana" ] }, - "execution_count": 4, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# This downloads 170 MB, it might take some time.\n", - "url2 = 'https://s3-us-west-1.amazonaws.com/graphistry.demo.data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt.gz'\n", - "raw_proteins = pandas.read_table(url2, na_values=['-'], engine='c', compression='gzip')\n", - "\n", - "# If using local data, comment the two lines above and uncomment the line below\n", - "# raw_proteins = pandas.read_table('./data/BIOGRID-IDENTIFIERS-3.3.123.tab.txt', na_values=['-'], engine='c')\n", + "local_path2 = './data/BIOGRID-IDENTIFIERS-5.0.252.tab.zip'\n", + "if not os.path.exists(local_path2):\n", + " os.makedirs(os.path.dirname(local_path2), exist_ok=True)\n", + " open(local_path2, 'wb').write(requests.get('https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive/BIOGRID-5.0.252/BIOGRID-IDENTIFIERS-5.0.252.tab.zip').content)\n", "\n", + "raw_proteins = pandas.read_table(local_path2, na_values=['-'], engine='c', compression='zip', skiprows=28)\n", "\n", "protein_ids = raw_proteins[['BIOGRID_ID', 'ORGANISM_OFFICIAL_NAME']].drop_duplicates() \\\n", " .rename(columns={'ORGANISM_OFFICIAL_NAME': 'ORGANISM'})\n", "protein_ids[:3]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We extract the proteins referenced as either sources or targets of interactions." 
- ] - }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "metadata": { "scrolled": true }, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "SYMBOL", + "rawType": "object", + "type": "string" + } + ], + "ref": "80b3371f-aa92-44c9-af6b-1f2a68588df2", + "rows": [ + [ + "0", + "112315", + "MAP2K4" + ], + [ + "1", + "124185", + "MYPN" + ], + [ + "2", + "106605", + "ACVR1" + ] + ], + "shape": { + "columns": 2, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
Arabidopsis thaliana
762Arabidopsis thaliana
22203Arabidopsis thaliana
\n", " \n", @@ -348,7 +509,7 @@ "2 106605 ACVR1" ] }, - "execution_count": 5, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -366,34 +527,76 @@ "all_proteins[:3]" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We join on the indentification DB to get the organism in which each protein belongs." - ] - }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "SYMBOL", + "rawType": "object", + "type": "string" + }, + { + "name": "ORGANISM", + "rawType": "object", + "type": "string" + } + ], + "ref": "d7938a83-d8bc-4064-a0bc-14b07f744f7b", + "rows": [ + [ + "0", + "112315", + "MAP2K4", + "Homo sapiens" + ], + [ + "1", + "124185", + "MYPN", + "Homo sapiens" + ], + [ + "2", + "106605", + "ACVR1", + "Homo sapiens" + ] + ], + "shape": { + "columns": 3, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -434,7 +637,7 @@ "2 106605 ACVR1 Homo sapiens" ] }, - "execution_count": 6, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -453,42 +656,99 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "colors = protein_labels.ORGANISM.unique().tolist()\n", - "protein_labels['Color'] = protein_labels.ORGANISM.map(lambda x: colors.index(x))" + "protein_labels['Color'] = protein_labels.ORGANISM.map(lambda x: colors.index(x)).astype('int32')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For convenience, let's add links to PubMed and RCSB." + "For convenience, let's add links to PubMed and Biogrid." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BIOGRID_ID", + "rawType": "int64", + "type": "integer" + }, + { + "name": "SYMBOL", + "rawType": "object", + "type": "string" + }, + { + "name": "ORGANISM", + "rawType": "object", + "type": "string" + }, + { + "name": "Color", + "rawType": "int32", + "type": "integer" + } + ], + "ref": "b6bad508-716f-452f-832b-60e22d909a95", + "rows": [ + [ + "0", + "112315", + "MAP2K4", + "Homo sapiens", + "0" + ], + [ + "1", + "124185", + "MYPN", + "Homo sapiens", + "0" + ], + [ + "2", + "106605", + "ACVR1", + "Homo sapiens", + "0" + ] + ], + "shape": { + "columns": 4, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
\n", " \n", @@ -504,21 +764,21 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -528,9 +788,9 @@ ], "text/plain": [ " BIOGRID_ID SYMBOL \\\n", - "0 112315 %s' % (url, id.upper())\n", - " else:\n", - " return 'n/a'\n", + "def makeBiogridLink(row):\n", + " if pandas.notna(row.get('BIOGRID_ID')):\n", + " url = f'https://thebiogrid.org/{row[\"BIOGRID_ID\"]}'\n", + " return f'{row[\"SYMBOL\"]}'\n", + " return 'n/a'\n", " \n", - "protein_labels.SYMBOL = protein_labels.SYMBOL.map(makeRcsbLink)\n", + "protein_labels.SYMBOL = protein_labels.apply(makeBiogridLink, axis=1)\n", "protein_labels[:3]" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "metadata": { "scrolled": true }, "outputs": [ { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor A", + "rawType": "int64", + "type": "integer" + }, + { + "name": "BioGRID ID Interactor B", + "rawType": "int64", + "type": "integer" + }, + { + "name": "Official Symbol Interactor A", + "rawType": "object", + "type": "string" + }, + { + "name": "Official Symbol Interactor B", + "rawType": "object", + "type": "string" + }, + { + "name": "Pubmed ID", + "rawType": "object", + "type": "string" + }, + { + "name": "Author", + "rawType": "object", + "type": "string" + }, + { + "name": "Throughput", + "rawType": "object", + "type": "string" + } + ], + "ref": "15384706-11a1-4564-91be-8f3149642949", + "rows": [ + [ + "0", + "112315", + "108607", + "MAP2K4", + "FLNC", + "9006895", + "Marti A (1997)", + "Low Throughput" + ], + [ + "1", + "124185", + "106603", + "MYPN", + "ACTN2", + "11309420", + "Bang ML (2001)", + "Low Throughput" + ], + [ + "2", + "106605", + "108625", + "ACVR1", + "FNTA", + "8599089", + "Wang T (1996)", + "Low Throughput" + ] 
+ ], + "shape": { + "columns": 7, + "rows": 3 + } + }, "text/html": [ "
\n", - "\n", "
0112315<a target=\"_blank\" href=\"http://www.rcsb.org/p...<a target=\"_blank\" href=\"https://thebiogrid.or...Homo sapiens0
1124185<a target=\"_blank\" href=\"http://www.rcsb.org/p...<a target=\"_blank\" href=\"https://thebiogrid.or...Homo sapiens0
2106605<a target=\"_blank\" href=\"http://www.rcsb.org/p...<a target=\"_blank\" href=\"https://thebiogrid.or...Homo sapiens0
\n", " \n", @@ -649,7 +989,7 @@ "2 Low Throughput " ] }, - "execution_count": 9, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -673,32 +1013,27 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 24, "metadata": { "scrolled": false }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Uploading 7139 kB. This may take a while...\n" - ] - }, { "data": { "text/html": [ "\n", - " \n", " \n", " \n", " " ], @@ -706,30 +1041,22 @@ "" ] }, - "execution_count": 10, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# This will upload ~10MB of data, be patient!\n", - "g2 = g.bind(node='BIOGRID_ID', edge_title='Author', point_title='SYMBOL', point_color='Color')\n", - "g2.plot(interactions, protein_labels)" + "g2 = g.bind(node='BIOGRID_ID', source=\"BioGRID ID Interactor A\", destination=\"BioGRID ID Interactor B\", \n", + " edge_title='Author', point_title='SYMBOL', point_color='Color')\n", + "g2.plot(interactions.drop(columns=['Pubmed ID']), protein_labels, name=\"Biogrid Labeled\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "rapids-26.02", "language": "python", "name": "python3" }, @@ -743,7 +1070,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.11.14" } }, "nbformat": 4, diff --git a/demos/demos_by_use_case/fraud/icij_fincen_viz.ipynb b/demos/demos_by_use_case/fraud/icij_fincen_viz.ipynb index 87a77ee1fe..bcf0ac3835 100644 --- a/demos/demos_by_use_case/fraud/icij_fincen_viz.ipynb +++ b/demos/demos_by_use_case/fraud/icij_fincen_viz.ipynb @@ -8,7 +8,7 @@ "\n", "This notebook demonstrates GFQL (Graph Query Language) using the ICIJ FinCEN Files dataset.\n", "\n", - "This document will serve as an end-to-end 
demonstration of using Graphistry's GFQL to extract information out of a complex real-world dataset. For this purpose, we will use a dataset made available by the International Consortium of Investigative Journalists (ICIJ) as part of their reporting of leaked documents from the U.S. Department of Treasury’s Financial Crimes Enforcement Network, which include suspicious activity reports filed by U.S. banks acting as beneficiary or orginating banks in domestic transactions or as correspondent or intermediary banks in international transactions. More information about the dataset can be found at the ICIJ. In this tutorial, we will show how graph queries can be used to extract insights from large datasets and answer questions.\n", + "This document will serve as an end-to-end demonstration of using Graphistry's GFQL to extract information out of a complex real-world dataset. For this purpose, we will use a dataset made available by the International Consortium of Investigative Journalists (ICIJ) as part of their reporting of leaked documents from the U.S. Department of Treasury’s Financial Crimes Enforcement Network, which include suspicious activity reports filed by U.S. banks acting as beneficiary or originating banks in domestic transactions or as correspondent or intermediary banks in international transactions. More information about the dataset can be found at the ICIJ. 
In this tutorial, we will show how graph queries can be used to extract insights from large datasets and answer questions.\n", "\n", "Based on: https://hub.graphistry.com/docs/GFQL/gfql/" ] @@ -24,14 +24,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import graphistry\n", "\n", - "# graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\",\n", - "# username=\"...\", password=\"...\")" + "# To specify Graphistry account & server, use:\n", + "# graphistry.register(api=3, username='...', password='...', protocol='https', server='hub.graphistry.com')\n", + "# For more options: https://pygraphistry.readthedocs.io/en/latest/server/register.html" ] }, { @@ -45,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -53,12 +54,12 @@ "import zipfile\n", "import pandas as pd\n", "\n", - "# download data\n", - "resp = requests.get(\"https://media.icij.org/uploads/2020/09/download_data_fincen_files.zip\")\n", - "with open(\"download_data_fincen_files.zip\", \"wb\") as f:\n", - " f.write(resp.content)\n", - "with zipfile.ZipFile(\"download_data_fincen_files.zip\",\"r\") as zip_ref:\n", - " zip_ref.extract(\"download_transactions_map.csv\")\n", + "# uncomment to download data\n", + "# resp = requests.get(\"https://media.icij.org/uploads/2020/09/download_data_fincen_files.zip\")\n", + "# with open(\"download_data_fincen_files.zip\", \"wb\") as f:\n", + "# f.write(resp.content)\n", + "# with zipfile.ZipFile(\"download_data_fincen_files.zip\",\"r\") as zip_ref:\n", + "# zip_ref.extract(\"download_transactions_map.csv\")\n", "\n", "# read csv into pandas dataframe and change type of time columns in data to datetime\n", "df_e = pd.read_csv(\"download_transactions_map.csv\")\n", @@ -84,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ 
-102,7 +103,7 @@ "type": "string" } ], - "ref": "cfa51504-9fe6-47d0-98d5-a73e6f549641", + "ref": "32b90554-cd5a-4069-87a1-9de21bce8c4e", "rows": [ [ "0", @@ -123,11 +124,191 @@ [ "4", "latvian-trade-commercial-bank" + ], + [ + "5", + "vtb-north-west" + ], + [ + "6", + "dbs-bank-ltd" + ], + [ + "7", + "unicredit-bank-austria-ag" + ], + [ + "8", + "kazkommertsbank" + ], + [ + "9", + "transcredit-bank" + ], + [ + "10", + "rak-bank" + ], + [ + "11", + "goldman-sachs-and-co-new-york" + ], + [ + "12", + "national-bank-sal" + ], + [ + "13", + "hsbc-hong-kong-hkg" + ], + [ + "14", + "finance-bank-of-zambia" + ], + [ + "15", + "bawag-psk-bank" + ], + [ + "16", + "kabul-bank" + ], + [ + "17", + "public-bank-berhad" + ], + [ + "18", + "tbc-bank" + ], + [ + "19", + "berenberg-bank" + ], + [ + "20", + "entie-commercial-bank" + ], + [ + "21", + "credit-suisse-ag" + ], + [ + "22", + "commonwealth-bank-of-australia" + ], + [ + "23", + "mauritius-commercial-bank-ltd" + ], + [ + "24", + "sberbank" + ], + [ + "25", + "dbs-bank" + ], + [ + "26", + "cor-clearing-llc" + ], + [ + "27", + "bsi-sa" + ], + [ + "28", + "bank-of-tokyo-mitsubishi-ufj-ltd-chicago-branch" + ], + [ + "29", + "ojsc-bank-petrocommerce" + ], + [ + "30", + "credit-suisse-zurich-switzerland-che" + ], + [ + "31", + "jscb-national-standart" + ], + [ + "32", + "russian-finance-society-moscow-russia-rus" + ], + [ + "33", + "krb-ukhtabank-ukhta-russia-rus" + ], + [ + "34", + "bank-of-georgia" + ], + [ + "35", + "unicredit-bank-moscow-russia-rus" + ], + [ + "36", + "sberbank-of-russia" + ], + [ + "37", + "hua-nan-commercial-bank-ltd" + ], + [ + "38", + "ping-an-bank-co-ltd" + ], + [ + "39", + "chinatrust-commercial-bank" + ], + [ + "40", + "united-overseas-bank-limited" + ], + [ + "41", + "dbs-bank-ltd-singapore" + ], + [ + "42", + "ap-anlage-und-privatbank-ag" + ], + [ + "43", + "hsbc-bank" + ], + [ + "44", + "banque-de-commerce-et-de-placements-sa" + ], + [ + "45", + "abn-amro-bank-nv" + ], + [ + "46", + 
"bank-of-shanghai" + ], + [ + "47", + "abn-amro-bank-nv-amsterdam-netherlands-nld" + ], + [ + "48", + "anlage-und-privatbank-ag" + ], + [ + "49", + "bank-hapoalim" ] ], "shape": { "columns": 1, - "rows": 5 + "rows": 2277 } }, "text/html": [ @@ -173,26 +354,59 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
4latvian-trade-commercial-bank
......
2272hsbc-panama-pan
2273bank-of-america-na-london-gbr
2274vtb-bank-deutschland-ag-deu
2275ing-bank-netherland-n-v-netherlands-nld
2276rigenesis-bank-as-latvia-lva
\n", + "

2277 rows × 1 columns

\n", "
" ], "text/plain": [ - " nodeId\n", - "0 cimb-bank-berhad\n", - "1 barclays-bank-plc-ho-uk\n", - "2 natwest-offshore\n", - "3 evrofinance-mosnarbank\n", - "4 latvian-trade-commercial-bank" + " nodeId\n", + "0 cimb-bank-berhad\n", + "1 barclays-bank-plc-ho-uk\n", + "2 natwest-offshore\n", + "3 evrofinance-mosnarbank\n", + "4 latvian-trade-commercial-bank\n", + "... ...\n", + "2272 hsbc-panama-pan\n", + "2273 bank-of-america-na-london-gbr\n", + "2274 vtb-bank-deutschland-ag-deu\n", + "2275 ing-bank-netherland-n-v-netherlands-nld\n", + "2276 rigenesis-bank-as-latvia-lva\n", + "\n", + "[2277 rows x 1 columns]" ] }, - "execution_count": 108, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "g._nodes.head()" + "g._nodes" ] }, { @@ -201,12 +415,12 @@ "source": [ "## Edge-list sample\n", "\n", - "View a sample of the edges in the graph. Each edge represents an aggregegation of financial transactions between two financial institutions within a specific time period." + "View a sample of the edges in the graph. Each edge represents an aggregation of financial transactions between two financial institutions within a specific time period." 
] }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -299,7 +513,7 @@ "type": "float" } ], - "ref": "ad822317-09b0-47f3-859e-31f29a27845c", + "ref": "aa3abb10-9986-4810-bf61-a86f77f5dc8b", "rows": [ [ "0", @@ -583,7 +797,7 @@ "4 NaN 1.200000e+04 " ] }, - "execution_count": 109, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -603,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -629,33 +843,28 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 111, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -673,7 +882,7 @@ " \"edgeOpacity\": 0.3 if len(g_out._edges) > 1500 else 0.9,\n", " \"strongGravity\": True,\n", " \"play\": 2000})\n", - "g_out.plot()" + "g_out.plot(name=\"ICIJ FinCEN Full\")" ] }, { @@ -682,12 +891,12 @@ "source": [ "## Caribbean havens subgraph\n", "\n", - "The ICIJ mentions the importance of the world's top offshore financial havens in the data. Lets find transactions involving Caribbean tax havens using GFQL chain operations." + "The ICIJ mentions the importance of the world's top offshore financial havens in the data. Let's find transactions involving Caribbean tax havens using GFQL chain operations." 
] }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -714,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -727,7 +936,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -827,7 +1036,7 @@ "type": "float" } ], - "ref": "323b075d-0dcb-430e-98d6-01d1a502716d", + "ref": "98ab04a7-55ec-48da-9309-8960fe32bf8a", "rows": [ [ "0", @@ -1118,7 +1327,7 @@ "4 1.0 101000.0 " ] }, - "execution_count": 115, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1130,7 +1339,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1149,11 +1358,11 @@ }, { "name": "is_carib_bank", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "06682bbf-1f4c-46c2-bc58-1b075a11f6c2", + "ref": "157b20bf-81be-49a4-a35d-8025f71fc5c1", "rows": [ [ "0", @@ -1240,15 +1449,15 @@ "" ], "text/plain": [ - " nodeId is_carib_bank\n", - "0 hsbc-bank True\n", - "1 hsbc True\n", - "2 caledonian-bank-limited True\n", - "3 gonet-bank-and-trust-limited True\n", - "4 dms-bank-trust-ltd True" + " nodeId is_carib_bank\n", + "0 hsbc-bank True\n", + "1 hsbc True\n", + "2 caledonian-bank-limited True\n", + "3 gonet-bank-and-trust-limited True\n", + "4 dms-bank-trust-ltd True" ] }, - "execution_count": 116, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1266,7 +1475,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -1279,7 +1488,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1379,7 +1588,7 @@ "type": "float" } ], - "ref": "564a8112-a18c-4102-b227-430909a639ad", + "ref": 
"47fce7ec-b59f-4cf2-8ce1-510bee56b55b", "rows": [ [ "0", @@ -1663,7 +1872,7 @@ "4 2.0 200000.0 " ] }, - "execution_count": 118, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1675,7 +1884,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1694,11 +1903,11 @@ }, { "name": "is_carib_bank", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "da9991cc-04a3-469f-afcd-b2df1bee9b7c", + "ref": "83e79bc4-93a7-40e2-a489-ca39b316c75c", "rows": [ [ "0", @@ -1785,15 +1994,15 @@ "" ], "text/plain": [ - " nodeId is_carib_bank\n", - "0 hsbc-hong-kong-hkg False\n", - "1 credit-suisse-ag False\n", - "2 bsi-sa False\n", - "3 abn-amro-bank-nv False\n", - "4 deutsche-bank-ag False" + " nodeId is_carib_bank\n", + "0 hsbc-hong-kong-hkg False\n", + "1 credit-suisse-ag False\n", + "2 bsi-sa False\n", + "3 abn-amro-bank-nv False\n", + "4 deutsche-bank-ag False" ] }, - "execution_count": 119, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1813,33 +2022,28 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 120, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1868,7 +2072,7 @@ " \"play\": 2000})\n", ")\n", "\n", - "g_carib_styled.plot()" + "g_carib_styled.plot(name=\"ICIJ FinCEN Caribbean\")" ] }, { @@ -1877,38 +2081,33 @@ "source": [ "## Latvia-Russia subgraph\n", "\n", - "Find all data following a specific transaction pattern: Latvia to Russia transactions in a specific amount range." + "Find all data following a specific transaction pattern: Latvia to Russia transactions." 
] }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 121, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1918,7 +2117,7 @@ "\n", "chain_operations = [\n", " e_forward(hops=1, edge_match={\"originator_bank_country\": \"Latvia\", \"beneficiary_bank_country\": \"Russia\"}),\n", - " n({\"nodeId\": contains(pat=\"\")}, name=\"is_rus_beneficiary\"),\n", + " n(name=\"is_rus_beneficiary\"),\n", "]\n", "g_lva_rus = g.gfql(chain_operations)\n", "\n", @@ -1936,12 +2135,12 @@ " \"edgeOpacity\": 0.3 if len(g_lva_rus._edges) > 1500 else 0.9,\n", " \"strongGravity\": True,\n", " \"play\": 2000})\n", - "g_lva_rus.plot()" + "g_lva_rus.plot(name=\"ICIJ FinCEN Lva-Rus\")" ] }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1960,11 +2159,11 @@ }, { "name": "is_rus_beneficiary", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "91b5eebd-38eb-4840-9376-a30c770e7ff0", + "ref": "1f2c7c78-4197-4380-a45b-65083c8ab2a4", "rows": [ [ "0", @@ -2051,15 +2250,15 @@ "" ], "text/plain": [ - " nodeId is_rus_beneficiary\n", - "0 latvian-trade-commercial-bank False\n", - "1 ltb-bank-riga False\n", - "2 norvik-banka-jsc False\n", - "3 jsc-norvik-banka False\n", - "4 rietumu-banka-jsc False" + " nodeId is_rus_beneficiary\n", + "0 latvian-trade-commercial-bank False\n", + "1 ltb-bank-riga False\n", + "2 norvik-banka-jsc False\n", + "3 jsc-norvik-banka False\n", + "4 rietumu-banka-jsc False" ] }, - "execution_count": 122, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2070,7 +2269,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2163,7 +2362,7 @@ "type": "float" } ], - "ref": 
"58886227-2ab8-48c2-b567-51fa8269bc55", + "ref": "c62d5465-337c-4c4e-91c9-e250e9e5410c", "rows": [ [ "0", @@ -2447,7 +2646,7 @@ "4 1.0 790753.42 " ] }, - "execution_count": 123, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2472,7 +2671,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -2491,7 +2690,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -2510,11 +2709,11 @@ }, { "name": "is_soyuz", - "rawType": "bool", - "type": "boolean" + "rawType": "object", + "type": "unknown" } ], - "ref": "7846a1c3-19cc-4969-bfeb-6d49c88517d2", + "ref": "21e213e6-fb69-4c24-96df-d9d7fa87e121", "rows": [ [ "0", @@ -2571,12 +2770,12 @@ "" ], "text/plain": [ - " nodeId is_soyuz\n", - "0 as-expobank False\n", - "1 bank-soyuz-moscow-russia-rus True" + " nodeId is_soyuz\n", + "0 as-expobank False\n", + "1 bank-soyuz-moscow-russia-rus True" ] }, - "execution_count": 125, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -2587,7 +2786,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2680,7 +2879,7 @@ "type": "float" } ], - "ref": "3a269bb6-f311-4b19-bd36-5676983283ff", + "ref": "caf2dc46-ab93-4f7b-b416-1c2458aad8d4", "rows": [ [ "0", @@ -2785,7 +2984,7 @@ "0 RUS 1.0 15900000.0 " ] }, - "execution_count": 126, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2803,33 +3002,28 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - " \n", - " \n", - " \n", + " \n", " " ], "text/plain": [ "" ] }, - "execution_count": 128, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2850,7 +3044,7 @@ " \"play\": 2000})\n", ")\n", "\n", - "g_od_styled.plot()" + "g_od_styled.plot(name=\"ICIJ 
FinCEN OD\")" ] }, { @@ -2863,7 +3057,7 @@ "\n", "- **Node filtering**: `n()` with attribute matching and predicates\n", "- **Edge traversal**: `e_forward()` with hop counts and edge matching\n", - "- **Chaining operations**: `graphistry.Chain()` to combine multiple operations\n", + "- **Chaining operations**: `g.gfql()` to combine multiple operations\n", "- **Predicates**:\n", " - `is_in()` for matching multiple values\n", " - `contains()` for substring matching\n", diff --git a/demos/demos_by_use_case/social/brightkite_checkin.ipynb b/demos/demos_by_use_case/social/brightkite_checkin.ipynb new file mode 100644 index 0000000000..10440e77f5 --- /dev/null +++ b/demos/demos_by_use_case/social/brightkite_checkin.ipynb @@ -0,0 +1,1011 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Brightkite Location-Based Social Network Dataset\n", + "\n", + "This notebook analyzes the Brightkite dataset from SNAP:\n", + "- **Network**: 58,228 users with 214,078 friendships\n", + "- **Check-ins**: 4.4M location check-ins from April 2008 - October 2010\n", + "\n", + "Source: https://snap.stanford.edu/data/loc-brightkite.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import requests\n", + "import gzip\n", + "from io import BytesIO, StringIO\n", + "import graphistry\n", + "\n", + "# To specify Graphistry account & server, use:\n", + "# graphistry.register(api=3, protocol=\"https\", server=\"hub.graphistry.com\",\n", + "# username=\"...\", password=\"...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and Parse Friendship Network" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading friendship network...\n", + "Loaded 428,156 edges\n" + ] + }, + { + "data": { + 
"application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user1", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user2", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "1a98570c-5c5f-4e82-ada8-00a4f6b03c3f", + "rows": [ + [ + "0", + "0", + "1" + ], + [ + "1", + "0", + "2" + ], + [ + "2", + "0", + "3" + ], + [ + "3", + "0", + "4" + ], + [ + "4", + "0", + "5" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user1user2
001
102
203
304
405
\n", + "
" + ], + "text/plain": [ + " user1 user2\n", + "0 0 1\n", + "1 0 2\n", + "2 0 3\n", + "3 0 4\n", + "4 0 5" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download friendship network\n", + "edges_url = 'https://snap.stanford.edu/data/loc-brightkite_edges.txt.gz'\n", + "print('Downloading friendship network...')\n", + "edges_response = requests.get(edges_url)\n", + "\n", + "# Decompress and parse\n", + "with gzip.GzipFile(fileobj=BytesIO(edges_response.content)) as f:\n", + " edges_content = f.read().decode('utf-8')\n", + "\n", + "# Parse into DataFrame\n", + "edges_df = pd.read_csv(\n", + " StringIO(edges_content),\n", + " sep='\\t',\n", + " comment='#',\n", + " names=['user1', 'user2'],\n", + " dtype={'user1': int, 'user2': int}\n", + ")\n", + "\n", + "print(f'Loaded {len(edges_df):,} edges')\n", + "edges_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download and Parse Check-in Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading check-in data...\n", + "Loaded 4,491,144 check-ins\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user", + "rawType": "int64", + "type": "integer" + }, + { + "name": "check_in_time", + "rawType": "datetime64[ns, UTC]", + "type": "unknown" + }, + { + "name": "latitude", + "rawType": "float64", + "type": "float" + }, + { + "name": "longitude", + "rawType": "float64", + "type": "float" + }, + { + "name": "location_id", + "rawType": "object", + "type": "string" + } + ], + "ref": "b10f3f43-b7a0-44a1-9910-421dba902b50", + "rows": [ + [ + "0", + "0", + "2010-10-17 01:48:53+00:00", + "39.747652", + "-104.99251", + "88c46bf20db295831bd2d1718ad7e6f5" + ], + [ + "1", + "0", + 
"2010-10-16 06:02:04+00:00", + "39.891383", + "-105.070814", + "7a0f88982aa015062b95e3b4843f9ca2" + ], + [ + "2", + "0", + "2010-10-16 03:48:54+00:00", + "39.891077", + "-105.068532", + "dd7cd3d264c2d063832db506fba8bf79" + ], + [ + "3", + "0", + "2010-10-14 18:25:51+00:00", + "39.750469", + "-104.999073", + "9848afcc62e500a01cf6fbf24b797732f8963683" + ], + [ + "4", + "0", + "2010-10-14 00:21:47+00:00", + "39.752713", + "-104.996337", + "2ef143e12038c870038df53e0478cefc" + ] + ], + "shape": { + "columns": 5, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
usercheck_in_timelatitudelongitudelocation_id
002010-10-17 01:48:53+00:0039.747652-104.99251088c46bf20db295831bd2d1718ad7e6f5
102010-10-16 06:02:04+00:0039.891383-105.0708147a0f88982aa015062b95e3b4843f9ca2
202010-10-16 03:48:54+00:0039.891077-105.068532dd7cd3d264c2d063832db506fba8bf79
302010-10-14 18:25:51+00:0039.750469-104.9990739848afcc62e500a01cf6fbf24b797732f8963683
402010-10-14 00:21:47+00:0039.752713-104.9963372ef143e12038c870038df53e0478cefc
\n", + "
" + ], + "text/plain": [ + " user check_in_time latitude longitude \\\n", + "0 0 2010-10-17 01:48:53+00:00 39.747652 -104.992510 \n", + "1 0 2010-10-16 06:02:04+00:00 39.891383 -105.070814 \n", + "2 0 2010-10-16 03:48:54+00:00 39.891077 -105.068532 \n", + "3 0 2010-10-14 18:25:51+00:00 39.750469 -104.999073 \n", + "4 0 2010-10-14 00:21:47+00:00 39.752713 -104.996337 \n", + "\n", + " location_id \n", + "0 88c46bf20db295831bd2d1718ad7e6f5 \n", + "1 7a0f88982aa015062b95e3b4843f9ca2 \n", + "2 dd7cd3d264c2d063832db506fba8bf79 \n", + "3 9848afcc62e500a01cf6fbf24b797732f8963683 \n", + "4 2ef143e12038c870038df53e0478cefc " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download check-in data\n", + "checkins_url = 'https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz'\n", + "print('Downloading check-in data...')\n", + "checkins_response = requests.get(checkins_url)\n", + "\n", + "# Decompress and parse\n", + "with gzip.GzipFile(fileobj=BytesIO(checkins_response.content)) as f:\n", + " checkins_content = f.read().decode('utf-8')\n", + "\n", + "# Parse into DataFrame\n", + "checkins_df = pd.read_csv(\n", + " StringIO(checkins_content),\n", + " sep='\\t',\n", + " comment='#',\n", + " names=['user', 'check_in_time', 'latitude', 'longitude', 'location_id'],\n", + " dtype={'user': int},\n", + " parse_dates=['check_in_time']\n", + ")\n", + "\n", + "# Filter out likely invalid coordinates: (0, 0) or missing values\n", + "checkins_df = checkins_df[\n", + " checkins_df['latitude'].notna() & \n", + " checkins_df['longitude'].notna() & \n", + " ((checkins_df['latitude'] != 0) | (checkins_df['longitude'] != 0))\n", + "]\n", + "\n", + "print(f'Loaded {len(checkins_df):,} check-ins')\n", + "checkins_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filtered edges: 428,156 -> 388,180\n", + 
"Users in network: 58,228\n", + "Users with valid check-ins: 50,686\n", + "Users in filtered network: 50,111\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user1", + "rawType": "int64", + "type": "integer" + }, + { + "name": "user2", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "0f09b4e8-d1c2-42d3-92f8-0185023fc0c7", + "rows": [ + [ + "0", + "0", + "1" + ], + [ + "1", + "0", + "2" + ], + [ + "2", + "0", + "3" + ], + [ + "3", + "0", + "4" + ], + [ + "4", + "0", + "5" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user1user2
001
102
203
304
405
\n", + "
" + ], + "text/plain": [ + " user1 user2\n", + "0 0 1\n", + "1 0 2\n", + "2 0 3\n", + "3 0 4\n", + "4 0 5" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Filter edges to only include users with valid check-ins\n", + "valid_users = set(checkins_df['user'].unique())\n", + "edges_df_filtered = edges_df[\n", + " edges_df['user1'].isin(valid_users) & \n", + " edges_df['user2'].isin(valid_users)\n", + "]\n", + "\n", + "print(f'Filtered edges: {len(edges_df):,} -> {len(edges_df_filtered):,}')\n", + "print(f'Users in network: {pd.concat([edges_df[\"user1\"], edges_df[\"user2\"]]).nunique():,}')\n", + "print(f'Users with valid check-ins: {len(valid_users):,}')\n", + "print(f'Users in filtered network: {pd.concat([edges_df_filtered[\"user1\"], edges_df_filtered[\"user2\"]]).nunique():,}')\n", + "edges_df_filtered.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualize Friendship Network with Graphistry\n", + "\n", + "This visualization shows the social network of Brightkite users. Each node represents a user, positioned at their first check-in location. Edges represent friendships between users.\n", + "\n", + "**What to explore:**\n", + "- Community clusters: Groups of highly connected friends\n", + "- Geographic patterns: Whether friend groups cluster geographically\n", + "- Network hubs: Users with many connections (high degree)\n", + "- Network structure: Identify isolated groups vs. 
the main component" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Visualize friendship network (filtered to users with valid check-ins)\n", + "# Use only first check-in per user for node positioning\n", + "\n", + "g = graphistry.edges(edges_df_filtered, 'user1', 'user2').nodes(checkins_df.groupby('user').first().reset_index(), 'user') \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g.plot(name=\"Brightkite Basic\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Hypergraph: Users + Check-ins\n", + "\n", + "This hypergraph combines two types of nodes: **user nodes** (blue, at average location) and **check-in nodes** (red, at actual check-in locations). 
Two types of edges connect them: **friendships** (blue) between users, and **user-to-check-in** edges (red) linking users to their check-ins.\n", + "\n", + "**What to explore:**\n", + "- Mobility patterns: Check-in scatter around user's average location reveals travel behavior\n", + "- Social-spatial correlation: Do friends visit similar locations?\n", + "- Activity levels: Number of red edges from a user shows check-in frequency (capped at 10 per user by the sampling below)\n", + "- Geographic hotspots: Dense red node clusters indicate popular locations\n", + "- User movement range: Distance between user node and their check-ins shows mobility" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Max check-ins per user: 10\n", + "Original check-ins: 4,491,144\n", + "Sampled check-ins: 343,587\n", + "Users with check-ins: 50,686\n", + "User nodes: 50,686\n", + "Check-in nodes: 343,587\n", + "Friendship edges: 388,180\n", + "User->check-in edges: 343,587\n", + "Total nodes: 394,273\n", + "Total edges: 731,767\n" + ] + } + ], + "source": [ + "# Sample check-ins using per-user cap for fair representation\n", + "# Users with ≤10 check-ins: keep all\n", + "# Users with >10 check-ins: randomly sample 10\n", + "\n", + "max_per_user = 10 # Maximum check-ins per user\n", + "\n", + "checkins_sampled = checkins_df.groupby('user', group_keys=False)[checkins_df.columns].apply(\n", + " lambda x: x if len(x) <= max_per_user else x.sample(n=max_per_user, random_state=42)\n", + ")\n", + "\n", + "print(f'Max check-ins per user: {max_per_user}')\n", + "print(f'Original check-ins: {len(checkins_df):,}')\n", + "print(f'Sampled check-ins: {len(checkins_sampled):,}')\n", + "print(f'Users with check-ins: {checkins_sampled[\"user\"].nunique():,}')\n", + "\n", + "# Create aggregated user nodes with average coordinates (using SAMPLED check-ins for consistency)\n", + "user_nodes = checkins_sampled.groupby('user').agg({\n", + "
'latitude': 'mean',\n", + " 'longitude': 'mean',\n", + " 'check_in_time': 'count'\n", + "}).reset_index()\n", + "user_nodes.columns = ['user', 'avg_latitude', 'avg_longitude', 'checkin_count']\n", + "user_nodes['type'] = 'user'\n", + "user_nodes['node_id'] = 'user_' + user_nodes['user'].astype(str)\n", + "\n", + "# Create check-in nodes from SAMPLED data\n", + "checkin_nodes = checkins_sampled.copy()\n", + "checkin_nodes['type'] = 'checkin'\n", + "checkin_nodes['node_id'] = 'checkin_' + checkin_nodes.index.astype(str)\n", + "\n", + "# Create user->check-in edges\n", + "user_checkin_edges = pd.DataFrame({\n", + " 'source': 'user_' + checkin_nodes['user'].astype(str),\n", + " 'destination': checkin_nodes['node_id'],\n", + " 'type': 'user_to_checkin',\n", + " 'user': checkin_nodes['user'].astype(str)\n", + "})\n", + "\n", + "# Create friendship edges between user nodes\n", + "friendship_edges = pd.DataFrame({\n", + " 'source': 'user_' + edges_df_filtered['user1'].astype(str),\n", + " 'destination': 'user_' + edges_df_filtered['user2'].astype(str),\n", + " 'type': 'friendship'\n", + "})\n", + "\n", + "# Combine all edges\n", + "all_edges = pd.concat([friendship_edges, user_checkin_edges], ignore_index=True)\n", + "\n", + "# Combine all nodes\n", + "all_nodes = pd.concat([\n", + " user_nodes[['node_id', 'user', 'avg_latitude', 'avg_longitude', 'type', 'checkin_count']].rename(\n", + " columns={'avg_latitude': 'latitude', 'avg_longitude': 'longitude'}\n", + " ),\n", + " checkin_nodes[['node_id', 'user', 'latitude', 'longitude', 'type', 'check_in_time', 'location_id']]\n", + "], ignore_index=True)\n", + "\n", + "print(f'User nodes: {len(user_nodes):,}')\n", + "print(f'Check-in nodes: {len(checkin_nodes):,}')\n", + "print(f'Friendship edges: {len(friendship_edges):,}')\n", + "print(f'User->check-in edges: {len(user_checkin_edges):,}')\n", + "print(f'Total nodes: {len(all_nodes):,}')\n", + "print(f'Total edges: {len(all_edges):,}')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create hypergraph visualization\n", + "g_hyper = graphistry.edges(all_edges, 'source', 'destination').nodes(all_nodes, 'node_id') \\\n", + " .encode_point_color(\"type\", as_categorical=True, categorical_mapping={\"checkin\": \"red\", \"user\": \"blue\"}) \\\n", + " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g_hyper.plot(name=\"Brightkite Map\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add Choropleth Map Layer\n", + "\n", + "This visualization adds a geographic choropleth layer using Kepler.gl that color-codes countries by the total number of nodes (users + check-ins) within their borders. The choropleth overlays the hypergraph to provide geographic context for network activity.\n", + "\n", + "**What to explore:**\n", + "- Country-level aggregation: Total node count per country shown via color intensity\n", + "- Color gradient interpretation: Darker (black/dark green) = minimal activity, brighter (vibrant green) = high activity\n", + "- Logarithmic binning: Each color step represents order-of-magnitude increases (1, 10, 100, 1K, 5K, 10K, 15K+)\n", + "- Geographic patterns: Compare regional concentration vs. 
global distribution\n", + "- Cross-reference: Match choropleth colors to underlying point clusters on the map\n", + "- Network geography: Identify where users and check-ins are concentrated globally" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding country information to 394,273 nodes with coordinates...\n", + "Loading formatted geocoded file...\n", + "\n", + "Country distribution:\n", + "country\n", + "US 238806\n", + "JP 26953\n", + "GB 23758\n", + "AU 11320\n", + "CA 10310\n", + "DE 10213\n", + "SE 6749\n", + "NL 6291\n", + "IT 4759\n", + "NO 4329\n", + "FR 4309\n", + "ES 3890\n", + "FI 2651\n", + "CN 2477\n", + "BE 1900\n", + "CL 1855\n", + "IN 1732\n", + "BR 1685\n", + "CH 1682\n", + "PT 1588\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "# Add country information using reverse_geocoder (fast, offline)\n", + "import reverse_geocoder as rg\n", + "\n", + "# Filter nodes with valid coordinates\n", + "nodes_with_coords = all_nodes[all_nodes['latitude'].notna() & all_nodes['longitude'].notna()].copy()\n", + "\n", + "print(f'Adding country information to {len(nodes_with_coords):,} nodes with coordinates...')\n", + "\n", + "# Prepare coordinates for batch reverse geocoding\n", + "coords = list(zip(nodes_with_coords['latitude'], nodes_with_coords['longitude']))\n", + "\n", + "# Batch reverse geocode (much faster than individual requests)\n", + "results = rg.search(coords)\n", + "\n", + "# Extract country codes\n", + "nodes_with_coords['country'] = [result['cc'] for result in results]\n", + "\n", + "# Merge back to all_nodes\n", + "all_nodes = all_nodes.drop(columns=['country'], errors='ignore')\n", + "all_nodes = all_nodes.merge(\n", + " nodes_with_coords[['node_id', 'country']], \n", + " on='node_id', \n", + " how='left'\n", + ")\n", + "\n", + "print('\\nCountry distribution:')\n", + 
"print(all_nodes['country'].value_counts().head(20))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from graphistry.kepler import KeplerDataset, KeplerLayer, KeplerEncoding\n", + "\n", + "# Create visualization with countries colored by activity\n", + "kepler_ps_encoding = (\n", + " KeplerEncoding()\n", + "\n", + " # Nodes dataset\n", + " .with_dataset(\n", + " KeplerDataset(\n", + " id=\"nodes\",\n", + " type=\"nodes\",\n", + " label=\"Nodes\"\n", + " )\n", + " )\n", + "\n", + " # Edges dataset with mapped coordinates\n", + " .with_dataset(\n", + " KeplerDataset(\n", + " id=\"edges\",\n", + " type=\"edges\",\n", + " label=\"Edges\"\n", + " )\n", + " )\n", + "\n", + " # Countries dataset\n", + " .with_dataset(\n", + " KeplerDataset(\n", + " id=\"countries\",\n", + " type=\"countries\",\n", + " label=\"Nodes in Countries\",\n", + " resolution=110,\n", + " boundary_lakes=False,\n", + " computed_columns={\n", + " \"nodes_in_countries\": {\n", + " \"type\": \"aggregate\",\n", + " \"computeFromDataset\": \"nodes\",\n", + " \"sourceKey\": \"country\",\n", + " \"targetKey\": \"iso_a2_eh\",\n", + " \"aggregate\": \"count\",\n", + " \"aggregateCol\": \"node_id\",\n", + " \"bins\": [0, 1, 10, 100, 1000, 5000, 10000, 15000, 9999999],\n", + " \"right\": False,\n", + " \"includeLowest\": True\n", + " }\n", + " }\n", + " )\n", + " )\n", + "\n", + " # Countries geojson layer with color encoding\n", + " .with_layer(\n", + " KeplerLayer({\n", + " \"id\": \"countries-ps-layer\",\n", + " \"type\": \"geojson\",\n", + " \"config\": {\n", + " \"dataId\": \"countries\",\n", + " \"label\": \"Countries by Num Users\",\n", + " \"columns\": {\n", + " \"geojson\": \"_geometry\"\n", + " },\n", + " \"isVisible\": True,\n", + " \"visConfig\": {\n", + " 
\"opacity\": 0.7,\n", + " \"strokeOpacity\": 0.8,\n", + " \"thickness\": 0.5,\n", + " \"strokeColor\": [60, 60, 60],\n", + " \"colorRange\": {\n", + " \"name\": \"Custom Gradient\",\n", + " \"type\": \"sequential\",\n", + " \"category\": \"Custom\",\n", + " \"colors\": [\n", + " \"#000000\", # Black for the lowest bin (0-1 nodes)\n", + " \"#001a0a\", # Very dark green (1-10)\n", + " \"#003314\", # Dark green (10-100)\n", + " \"#004d1f\", # Green (100-1K)\n", + " \"#00802d\", # Dark lime green (1K-5K)\n", + " \"#00b340\", # Medium green (5K-10K)\n", + " \"#00e65c\", # Bright green (10K-15K)\n", + " \"#1aff8c\" # Vibrant green for the highest bin (15K+)\n", + " ]\n", + " },\n", + " \"filled\": True,\n", + " \"outline\": True,\n", + " \"extruded\": False,\n", + " \"wireframe\": False\n", + " }\n", + " },\n", + " \"visualChannels\": {\n", + " \"colorField\": {\n", + " \"name\": \"nodes_in_countries\",\n", + " \"type\": \"string\"\n", + " },\n", + " \"colorScale\": \"ordinal\",\n", + " \"sizeField\": None,\n", + " \"sizeScale\": \"linear\"\n", + " }\n", + " })\n", + " )\n", + ")\n", + "\n", + "# Create hypergraph visualization\n", + "g_hyper = graphistry.edges(all_edges, 'source', 'destination').nodes(all_nodes, 'node_id') \\\n", + " .encode_point_color(\"type\", as_categorical=True, categorical_mapping={\"checkin\": \"red\", \"user\": \"blue\"}) \\\n", + " .encode_edge_color(\"type\", as_categorical=True, categorical_mapping={\"user_to_checkin\": \"red\", \"friendship\": \"blue\"}) \\\n", + " .encode_kepler(kepler_ps_encoding) \\\n", + " .layout_settings(play=0) \\\n", + " .settings(height=800, url_params={\"pointOpacity\": 0.6, \"edgeOpacity\": 0.01})\n", + "g_hyper.plot(name=\"Brightkite Choropleth\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "rapids-24.08", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name":
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/source/api/kepler/dataset.rst b/docs/source/api/kepler/dataset.rst index 817b7d5eda..246078ef7b 100644 --- a/docs/source/api/kepler/dataset.rst +++ b/docs/source/api/kepler/dataset.rst @@ -82,6 +82,7 @@ Example See Also -------- +- :doc:`preloaded_datasets`: Complete column documentation for preloaded geographic datasets - :ref:`kepler-dataset-format`: Native Kepler.gl dataset format reference - :ref:`kepler-layer-api`: Layer configuration - :ref:`kepler-encoding-api`: Complete Kepler configuration diff --git a/docs/source/api/kepler/preloaded_datasets.rst b/docs/source/api/kepler/preloaded_datasets.rst new file mode 100644 index 0000000000..285de5fc7b --- /dev/null +++ b/docs/source/api/kepler/preloaded_datasets.rst @@ -0,0 +1,806 @@ +.. _kepler_preloaded_datasets: + +Kepler.gl Preloaded Datasets +================================= + +PyGraphistry provides preloaded Natural Earth geographic datasets for use with Kepler.gl visualizations. +These datasets include administrative boundaries at different levels with comprehensive attribute data. + +Admin Region Hierarchy +---------------------- + +The Natural Earth data is organized into administrative levels: + +* **0th Order (Countries)**: National boundaries - ``countries`` or ``zeroOrderAdminRegions`` +* **1st Order (States/Provinces)**: Sub-national divisions - ``states``, ``provinces``, or ``firstOrderAdminRegions`` + +Countries Dataset (0th Order Admin Regions) +-------------------------------------------- + +The countries dataset contains 168 columns of data for each country. All column names are lowercase. + +Example Usage +^^^^^^^^^^^^^ + +.. 
code-block:: python + + from graphistry import KeplerDataset + + # Create a countries dataset + countries_ds = KeplerDataset( + type="countries", + resolution=10, # 10=high, 50=medium, 110=low + include_countries=["United States of America", "Canada", "Mexico"] + ) + + # Get list of available columns + columns = KeplerDataset.get_available_columns('countries') + +Complete Column List with Example Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All 168 columns with example values from United States (displayed in groups for clarity): + +**Geographic and Administrative Columns** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``featurecla`` + - "Admin-0 country" + * - ``scalerank`` + - 1 + * - ``labelrank`` + - 2 + * - ``sovereignt`` + - "United States of America" + * - ``sov_a3`` + - "US1" + * - ``adm0_dif`` + - 1 + * - ``level`` + - 2 + * - ``type`` + - "Country" + * - ``tlc`` + - 1 + * - ``admin`` + - "United States of America" + * - ``adm0_a3`` + - "USA" + * - ``geou_dif`` + - 0 + * - ``geounit`` + - "United States of America" + * - ``gu_a3`` + - "USA" + * - ``su_dif`` + - 0 + * - ``subunit`` + - "United States" + * - ``su_a3`` + - "USA" + * - ``brk_diff`` + - 0 + +**Names and Identifiers** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name`` + - "United States of America" + * - ``name_long`` + - "United States" + * - ``brk_a3`` + - "USA" + * - ``brk_name`` + - "United States" + * - ``brk_group`` + - "" + * - ``abbrev`` + - "U.S.A." + * - ``postal`` + - "US" + * - ``formal_en`` + - "United States of America" + * - ``formal_fr`` + - "" + * - ``name_ciawf`` + - "United States" + * - ``note_adm0`` + - "" + * - ``note_brk`` + - "" + * - ``name_sort`` + - "United States of America" + * - ``name_alt`` + - "" + +**Map Display Properties** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``mapcolor7`` + - 4 + * - ``mapcolor8`` + - 5 + * - ``mapcolor9`` + - 1 + * - ``mapcolor13`` + - 1 + * - ``min_zoom`` + - 0.0 + * - ``min_label`` + - 1.7 + * - ``max_label`` + - 5.7 + * - ``label_x`` + - -97.482602 + * - ``label_y`` + - 39.538479 + * - ``latitude`` + - 42.31380089200132 + * - ``longitude`` + - -105.33907490650022 + +**Demographics and Economics** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``pop_est`` + - 328239523.0 + * - ``pop_rank`` + - 17 + * - ``pop_year`` + - 2019 + * - ``gdp_md`` + - 21433226 + * - ``gdp_year`` + - 2019 + * - ``economy`` + - "1. Developed region: G7" + * - ``income_grp`` + - "1. High income: OECD" + +**ISO and International Codes** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fips_10`` + - "US" + * - ``iso_a2`` + - "US" + * - ``iso_a2_eh`` + - "US" + * - ``iso_a3`` + - "USA" + * - ``iso_a3_eh`` + - "USA" + * - ``iso_n3`` + - "840" + * - ``iso_n3_eh`` + - "840" + * - ``un_a3`` + - "840" + * - ``wb_a2`` + - "US" + * - ``wb_a3`` + - "USA" + * - ``woe_id`` + - 23424977 + * - ``woe_id_eh`` + - 23424977 + * - ``woe_note`` + - "Exact WOE match as country" + * - ``adm0_iso`` + - "USA" + * - ``adm0_diff`` + - "" + * - ``adm0_tlc`` + - "USA" + +**Regional Classifications** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``continent`` + - "North America" + * - ``region_un`` + - "Americas" + * - ``subregion`` + - "Northern America" + * - ``region_wb`` + - "North America" + +**Country-Specific Admin Codes** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``adm0_a3_us`` + - "USA" + * - ``adm0_a3_fr`` + - "USA" + * - ``adm0_a3_ru`` + - "USA" + * - ``adm0_a3_es`` + - "USA" + * - ``adm0_a3_cn`` + - "USA" + * - ``adm0_a3_tw`` + - "USA" + * - ``adm0_a3_in`` + - "USA" + * - ``adm0_a3_np`` + - "USA" + * - ``adm0_a3_pk`` + - "USA" + * - ``adm0_a3_de`` + - "USA" + * - ``adm0_a3_gb`` + - "USA" + * - ``adm0_a3_br`` + - "USA" + * - ``adm0_a3_il`` + - "USA" + * - ``adm0_a3_ps`` + - "USA" + * - ``adm0_a3_sa`` + - "USA" + * - ``adm0_a3_eg`` + - "USA" + * - ``adm0_a3_ma`` + - "USA" + * - ``adm0_a3_pt`` + - "USA" + * - ``adm0_a3_ar`` + - "USA" + * - ``adm0_a3_jp`` + - "USA" + * - ``adm0_a3_ko`` + - "USA" + * - ``adm0_a3_vn`` + - "USA" + * - ``adm0_a3_tr`` + - "USA" + * - ``adm0_a3_id`` + - "USA" + * - ``adm0_a3_pl`` + - "USA" + * - ``adm0_a3_gr`` + - "USA" + * - ``adm0_a3_it`` + - "USA" + * - ``adm0_a3_nl`` + - "USA" + * - ``adm0_a3_se`` + - "USA" + * - ``adm0_a3_bd`` + - "USA" + * - ``adm0_a3_ua`` + - "USA" + * - ``adm0_a3_un`` + - -99 + * - ``adm0_a3_wb`` + - -99 + +**Metadata Fields** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name_len`` + - 24 + * - ``long_len`` + - 13 + * - ``abbrev_len`` + - 6 + * - ``tiny`` + - -99 + * - ``homepart`` + - 1 + * - ``ne_id`` + - 1159321369 + * - ``wikidataid`` + - "Q30" + +**Multilingual Names** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name_ar`` + - "الولايات المتحدة" + * - ``name_bn`` + - "মার্কিন যুক্তরাষ্ট্র" + * - ``name_de`` + - "Vereinigte Staaten" + * - ``name_en`` + - "United States of America" + * - ``name_es`` + - "Estados Unidos" + * - ``name_fa`` + - "ایالات متحده آمریکا" + * - ``name_fr`` + - "États-Unis" + * - ``name_el`` + - "Ηνωμένες Πολιτείες Αμερικής" + * - ``name_he`` + - "ארצות הברית" + * - ``name_hi`` + - "संयुक्त राज्य अमेरिका" + * - ``name_hu`` + - "Amerikai Egyesült Államok" + * - ``name_id`` + - "Amerika Serikat" + * - ``name_it`` + - "Stati Uniti d'America" + * - ``name_ja`` + - "アメリカ合衆国" + * - ``name_ko`` + - "미국" + * - ``name_nl`` + - "Verenigde Staten van Amerika" + * - ``name_pl`` + - "Stany Zjednoczone" + * - ``name_pt`` + - "Estados Unidos" + * - ``name_ru`` + - "США" + * - ``name_sv`` + - "USA" + * - ``name_tr`` + - "Amerika Birleşik Devletleri" + * - ``name_uk`` + - "Сполучені Штати Америки" + * - ``name_ur`` + - "ریاستہائے متحدہ امریکا" + * - ``name_vi`` + - "Hoa Kỳ" + * - ``name_zh`` + - "美国" + * - ``name_zht`` + - "美國" + +**Feature Classification Fields** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fclass_iso`` + - "Admin-0 country" + * - ``tlc_diff`` + - "" + * - ``fclass_tlc`` + - "Admin-0 country" + * - ``fclass_us`` + - "" + * - ``fclass_fr`` + - "" + * - ``fclass_ru`` + - "" + * - ``fclass_es`` + - "" + * - ``fclass_cn`` + - "" + * - ``fclass_tw`` + - "" + * - ``fclass_in`` + - "" + * - ``fclass_np`` + - "" + * - ``fclass_pk`` + - "" + * - ``fclass_de`` + - "" + * - ``fclass_gb`` + - "" + * - ``fclass_br`` + - "" + * - ``fclass_il`` + - "" + * - ``fclass_ps`` + - "" + * - ``fclass_sa`` + - "" + * - ``fclass_eg`` + - "" + * - ``fclass_ma`` + - "" + * - ``fclass_pt`` + - "" + * - ``fclass_ar`` + - "" + * - ``fclass_jp`` + - "" + * - ``fclass_ko`` + - "" + * - ``fclass_vn`` + - "" + * - ``fclass_tr`` + - "" + * - ``fclass_id`` + - "" + * - ``fclass_pl`` + - "" + * - ``fclass_gr`` + - "" + * - ``fclass_it`` + - "" + * - ``fclass_nl`` + - "" + * - ``fclass_se`` + - "" + * - ``fclass_bd`` + - "" + * - ``fclass_ua`` + - "" + +**Geometry** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``_geometry`` + - MULTIPOLYGON(...) + +States/Provinces Dataset (1st Order Admin Regions) +--------------------------------------------------- + +The states/provinces dataset contains administrative subdivisions for countries worldwide. + +Example Usage +^^^^^^^^^^^^^ + +.. code-block:: python + + from graphistry import KeplerDataset + + # Create a states dataset for US states + states_ds = KeplerDataset( + type="states", + include_countries=["United States of America"], + include_1st_order_regions=["California", "Texas", "New York"] + ) + +Complete Column List with Example Values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All 115 columns with example values from California: + +**Geographic and Administrative Columns** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``featurecla`` + - "Admin-1 states provinces" + * - ``scalerank`` + - 2 + * - ``adm1_code`` + - "USA-3521" + * - ``diss_me`` + - 3521 + * - ``iso_3166_2`` + - "US-CA" + * - ``wikipedia`` + - "http://en.wikipedia.org/wiki/California" + * - ``iso_a2`` + - "US" + * - ``adm0_sr`` + - 8 + +**Names and Identifiers** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name`` + - "California" + * - ``name_alt`` + - "CA|Calif." + * - ``name_local`` + - "" + * - ``type`` + - "State" + * - ``type_en`` + - "State" + * - ``code_local`` + - "US06" + * - ``code_hasc`` + - "US.CA" + * - ``note`` + - "" + * - ``hasc_maybe`` + - "" + +**Regional Information** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``region`` + - "West" + * - ``region_cod`` + - "" + * - ``region_sub`` + - "Pacific" + * - ``sub_code`` + - "" + * - ``provnum_ne`` + - 0.0 + * - ``gadm_level`` + - 1 + +**Administrative Details** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``check_me`` + - 20 + * - ``datarank`` + - 1 + * - ``abbrev`` + - "Calif." + * - ``postal`` + - "CA" + * - ``area_sqkm`` + - 0.0 + * - ``sameascity`` + - -99 + * - ``labelrank`` + - 0 + * - ``name_len`` + - 10 + +**Map Display Properties** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``mapcolor9`` + - 1 + * - ``mapcolor13`` + - 1 + * - ``min_label`` + - 3.5 + * - ``max_label`` + - 7.5 + * - ``min_zoom`` + - 2.0 + +**External References** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fips`` + - "US06" + * - ``fips_alt`` + - "" + * - ``woe_id`` + - 2347563.0 + * - ``woe_label`` + - "California, US, United States" + * - ``woe_name`` + - "California" + * - ``wikidataid`` + - "Q99" + * - ``ne_id`` + - 1159308415 + +**Geographic Coordinates** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``latitude`` + - 37.1259483770762 + * - ``longitude`` + - -119.44202946142391 + +**Parent Country Information** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``sov_a3`` + - "US1" + * - ``adm0_a3`` + - "USA" + * - ``adm0_label`` + - 2 + * - ``admin`` + - "United States of America" + * - ``geonunit`` + - "United States of America" + * - ``gu_a3`` + - "USA" + +**GeoNames Integration** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``gn_id`` + - 5332921.0 + * - ``gn_name`` + - "California" + * - ``gns_id`` + - -1.0 + * - ``gns_name`` + - "" + * - ``gn_level`` + - 1.0 + * - ``gn_region`` + - "" + * - ``gn_a1_code`` + - "US.CA" + * - ``gns_level`` + - -1.0 + * - ``gns_lang`` + - "" + * - ``gns_adm1`` + - "" + * - ``gns_region`` + - "" + +**Multilingual Names** + +.. 
list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``name_ar`` + - "كاليفورنيا" + * - ``name_bn`` + - "ক্যালিফোর্নিয়া" + * - ``name_de`` + - "Kalifornien" + * - ``name_en`` + - "California" + * - ``name_es`` + - "California" + * - ``name_fr`` + - "Californie" + * - ``name_el`` + - "Καλιφόρνια" + * - ``name_hi`` + - "कैलिफ़ोर्निया" + * - ``name_hu`` + - "Kalifornia" + * - ``name_id`` + - "California" + * - ``name_it`` + - "California" + * - ``name_ja`` + - "カリフォルニア州" + * - ``name_ko`` + - "캘리포니아" + * - ``name_nl`` + - "Californië" + * - ``name_pl`` + - "Kalifornia" + * - ``name_pt`` + - "Califórnia" + * - ``name_ru`` + - "Калифорния" + * - ``name_sv`` + - "Kalifornien" + * - ``name_tr`` + - "Kaliforniya" + * - ``name_vi`` + - "California" + * - ``name_zh`` + - "加利福尼亚州" + * - ``name_he`` + - "קליפורניה" + * - ``name_uk`` + - "Каліфорнія" + * - ``name_ur`` + - "کیلی فورنیا" + * - ``name_fa`` + - "کالیفرنیا" + * - ``name_zht`` + - "加利福尼亞州" + +**Feature Classification Fields** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``fclass_iso`` to ``fclass_tlc`` + - "" (empty for all) + +**Geometry** + +.. list-table:: + :header-rows: 1 + :widths: 30 70 + + * - Column + - Example Value + * - ``_geometry`` + - MULTIPOLYGON(...) + +Data Source +----------- + +These datasets are derived from Natural Earth (https://www.naturalearthdata.com/), a public domain map dataset available at 1:10m, 1:50m, and 1:110m scales. The data is updated periodically to reflect political and demographic changes. 
+ +See Also +-------- + +* :doc:`/api/plotter` - Main plotting interface with Kepler support +* Natural Earth documentation: https://www.naturalearthdata.com/ \ No newline at end of file diff --git a/docs/source/notebooks/gfql.rst b/docs/source/notebooks/gfql.rst index 99b18c033f..8e5f16378c 100644 --- a/docs/source/notebooks/gfql.rst +++ b/docs/source/notebooks/gfql.rst @@ -12,4 +12,4 @@ GFQL Graph queries GPU Benchmarking <../demos/gfql/benchmark_hops_cpu_gpu.ipynb> GFQL Remote mode <../demos/gfql/gfql_remote.ipynb> Python Remote mode <../demos/gfql/python_remote.ipynb> - # ICIJ FinCEN Files Visualization <../demos/demos_by_user_case/icij_fincen_viz.ipynb> + ICIJ FinCEN Files Visualization <../demos/demos_by_use_case/icij_fincen_viz.ipynb> diff --git a/docs/source/notebooks/visualization.rst b/docs/source/notebooks/visualization.rst index 44e9cc48f5..08081d0940 100644 --- a/docs/source/notebooks/visualization.rst +++ b/docs/source/notebooks/visualization.rst @@ -23,7 +23,8 @@ Geographic (Kepler.gl) :caption: Geographic visualization with Kepler.gl :titlesonly: - Geospatial Network Visualization <../demos/more_examples/graphistry_features/layout_map.ipynb> + Company Networks on a Map <../demos/more_examples/graphistry_features/layout_map.ipynb> + Brightkite Check-ins <../demos/demos_by_use_case/social/brightkite_checkin.ipynb> Layout -------