Health-Informatics-UoN
diff --git a/‎observablehq.config.js‎
Lines changed: 2 additions & 2 deletions b/‎observablehq.config.js‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/examples-in-five-safes-tes/aggregating-statistics.md‎
Lines changed: 153 additions & 0 deletions b/‎src/examples-in-five-safes-tes/aggregating-statistics.md‎
Lines changed: 153 additions & 0 deletions
diff --git a/‎src/examples-in-five-safes-tes/contingency-table-wizard.png‎
229 KB b/‎src/examples-in-five-safes-tes/contingency-table-wizard.png‎
229 KB
diff --git a/‎src/examples-in-five-safes-tes/contingency-tables.md‎
Lines changed: 155 additions & 0 deletions b/‎src/examples-in-five-safes-tes/contingency-tables.md‎
Lines changed: 155 additions & 0 deletions
diff --git a/‎src/examples-in-five-safes-tes/wizard-desc-stats.png‎
210 KB b/‎src/examples-in-five-safes-tes/wizard-desc-stats.png‎
210 KB
@@ -26,8 +26,8 @@ export default {
         {name: "Submission layer wizards", path: "/examples-in-five-safes-tes/submission-layer-wizards"},
         {name: "Collecting results", path: "/examples-in-five-safes-tes/collecting-results"},
         {name: "Visualising OMOP metadata", path: "/examples-in-five-safes-tes/Bunny visualisations"},
-        // {name: "Aggregating statistics", path: "/examples-in-five-safes-tes/aggregating-statistics"},
-        // {name: "Contingency tables", path: "/examples-in-five-safes-tes/contingency-tables"},
+        {name: "Aggregating statistics", path: "/examples-in-five-safes-tes/aggregating-statistics"},
+        {name: "Contingency tables", path: "/examples-in-five-safes-tes/contingency-tables"},
         {name: "Five Safes TES messages", path: "/examples-in-five-safes-tes/5s-tes-messages"},
       ]
     }
 
@@ -3,4 +3,157 @@ theme: air
 style: ../entrust-style.css
 title: Aggregating basic statistics
 ---
+# Aggregating basic statistics
 
+This tutorial can be run as a Jupyter notebook from the [5s-TES notebooks repository](https://github.com/Health-Informatics-UoN/5s-TES-notebooks/).
+
+The outputs from TES tasks can be easily used to calculate basic statistics.
+   
+This example will use summary statistics from a dataset in the OMOP common data model.
+There is a container which, given a SQL query that filters an OMOP table by your criteria, will calculate the necessary summary statistics for your final analysis.
+
+The example data were collected using the [Custom Image wizard](submission-layer-wizards#custom-image) in the submission layer with these settings changed from default:
+
+| Field   | value                                                                          |
+| ------- | ------------------------------------------------------------------------------ |
+| image   | ghcr.io/health-informatics-uon/five-safes-tes-analytics-dev:sha-57c3950 |
+| workdir | /app |
+| command | --user-query=SELECT value_as_number FROM public.measurement \\nWHERE measurement_concept_id = 3000905\\nAND value_as_number IS NOT NULL<br>--analysis=variance<br>--output-filename=/outputs/output<br>--output-format=json<br> |
+
+
+![Screenshot of the custom image wizard](wizard-desc-stats.png)
+
+<details>
+    <summary>Expand to view generated JSON</summary>
+
+
+```json
+    {
+         "id": "450",
+         "state": 0,
+         "name": "test variance",
+         "description": "Federated analysis task",
+         "inputs": null,
+         "outputs": [
+                  {
+                           "name": "Query Results",
+                           "description": "Results from the requested query execution",
+                           "url": "s3://",
+                           "path": "/outputs",
+                           "type": "DIRECTORY"
+                  }
+         ],
+         "resources": null,
+         "executors": [
+                  {
+                           "image": "ghcr.io/health-informatics-uon/five-safes-tes-analytics-dev:sha-57c3950",
+                           "command": [
+                                    "--user-query=SELECT value_as_number FROM public.measurement \nWHERE measurement_concept_id = 3000905\nAND value_as_number IS NOT NULL",
+                                    "--analysis=variance",
+                                    "--output-filename=/outputs/output",
+                                    "--output-format=json"
+                           ],
+                           "workdir": "/app",
+                           "stdin": null,
+                           "stdout": null,
+                           "stderr": null,
+                           "env": {}
+                  }
+         ],
+         "volumes": null,
+         "tags": {
+                  "Project": "NottinghamDemo",
+                  "tres": "Nottingham TRE 01|Nottingham TRE 02"
+         },
+         "logs": null,
+         "creation_time": null
+    }
+```
+</details>
+
+The `aggregate_utils` module provided with this notebook allows you to calculate statistics for the overall population by aggregating intermediate results.
+
+```python
+from pathlib import Path
+from aggregate_utils import VarianceIntermediate, TTestIntermediate, make_variance_intermediate_from_json, aggregate_variance_intermediates
+import numpy as np
+```
+The example data are held in `./data`
+
+```python
+paths = {
+    "tre1": "./data/variance-tre-1.json",
+    "tre2": "./data/variance-tre-2.json"
+}
+```
+
+The `make_variance_intermediate_from_json` function reads the data from the JSON file and provides methods for aggregation.
+The returned values hold the count (`n`), the sum (`total`), and the sum of squares (`sum_x2`) for the value read from the original table.
+These three pieces of information are sufficient to calculate several other summary statistics.
+
+```python
+variance_intermediates = {
+    k:make_variance_intermediate_from_json(Path(v))
+    for k,v in paths.items()
+}
+variance_intermediates
+```
+
+`'tre1': VarianceIntermediate(n=2140, total=10819.0, sum_x2=76707.0),`
+
+`'tre2': VarianceIntermediate(n=4571, total=228747.0, sum_x2=15257585.0)`
+
+For example, the `aggregate_variance_intermediates` function will aggregate these as if they came from a single sample.
+
+```python
+aggregated_intermediate = aggregate_variance_intermediates(variance_intermediates.values())
+aggregated_intermediate
+```
+
+`VarianceIntermediate(n=6711, total=239566.0, sum_x2=15334292.0)`
+
+The `mean` and `variance` properties are for the whole sample.
+```python
+aggregated_intermediate.mean
+```
+
+`35.69751154820444`
+
+```python
+aggregated_intermediate.variance
+```
+
+`1010.6365591480935`
+
+The values are from a very skewed distribution.
+To demonstrate how the same information can be used to conduct other common statistical analyses, random samples from a normal distribution can be generated with this code:
+
+```python
+mu, sigma = 50, 10
+rng = np.random.default_rng()
+s = rng.normal(mu, sigma, 10)
+```
+
+A `TTestIntermediate` uses the same three pieces of information as a `VarianceIntermediate`.
+
+```python
+gaussian_t_test_intermediate = TTestIntermediate(
+    n=len(s),
+    total=np.sum(s),
+    sum_x2=np.sum(s**2)
+)
+
+gaussian_t_test_intermediate
+```
+
+`TTestIntermediate(n=10, total=np.float64(476.9742997017046), sum_x2=np.float64(23459.825051161773))`
+
+You can also use this information to perform a one-sample t-test.
+
+```python
+gaussian_t_test_intermediate.one_sample_t_test(52)
+```
+
+`0.07033644170438572`
+
+There are many other analyses that can be performed with a few building blocks like this.
@@ -3,4 +3,159 @@ theme: air
 style: ../entrust-style.css
 title: Aggregating contingency tables
 ---
+# Aggregating data for contingency tables
 
+This tutorial can be run as a Jupyter notebook from the [5s-TES notebooks repository](https://github.com/Health-Informatics-UoN/5s-TES-notebooks/).
+
+Federated analysis on contingency tables is relatively simple.
+Counts are easy to federate: each TRE calculates their local count for some group, then these are aggregated by adding the counts together.
+Each cell of a contingency table is a count, so the table can be federated by requesting these counts, and then statistical analyses can be performed on the aggregate.
+
+```mermaid
+graph TD
+  subgraph 5s-TES
+    sub(Submission layer)
+    tre1(TRE 1)
+    tre2(TREs ..n)
+  end
+  user -- Request counts --> sub
+  sub -- Request counts --> tre1
+  sub -- Request counts --> tre2
+  tre1 -- counts --> agg(User)
+  tre2 -- counts --> agg
+  agg -- Sum counts --> Result
+```
+
+The example data were produced by running the [Custom Image wizard](submission-layer-wizards#custom-image) using the following parameters:
+
+| Field | value |
+| ----- | ----- |
+| Docker image| ghcr.io/health-informatics-uon/five-safes-tes-analytics-dev:sha-9ac04bc |
+| Workdir | /app |
+| Commands | --user-query=SELECT g.concept_name AS gender_name, r.concept_name AS race_name\\nFROM public.person p\\nJOIN public.concept g ON p.gender_concept_id = g.concept_id\\nJOIN public.concept r ON p.race_concept_id = r.concept_id\\nWHERE p.race_concept_id IN (8515, 8516, 8527)<br>--analysis=contingency_table<br>--output-filename=/outputs/output<br>--output-format=json |
+
+The UI should look like this:
+![A screenshot of the web application showing the custom image wizard](contingency-table-wizard.png)
+
+<details>
+    <summary>Expand for example JSON</summary>
+
+```json
+{
+         "id": "504",
+         "name": "test chi-sq",
+         "description": "Federated analysis task",
+         "inputs": null,
+         "outputs": [
+                  {
+                           "name": "Query Results",
+                           "description": "Results from the requested query execution",
+                           "url": "s3://",
+                           "path": "/outputs",
+                           "type": "DIRECTORY"
+                  }
+         ],
+         "resources": null,
+         "executors": [
+                  {
+                           "image": "ghcr.io/health-informatics-uon/five-safes-tes-analytics-dev:sha-9ac04bc",
+                           "command": [
+                                    "--user-query=SELECT g.concept_name AS gender_name, r.concept_name AS race_name\nFROM public.person p\nJOIN public.concept g ON p.gender_concept_id = g.concept_id\nJOIN public.concept r ON p.race_concept_id = r.concept_id\nWHERE p.race_concept_id IN (8515, 8516, 8527)",
+                                    "--analysis=contingency_table",
+                                    "--output-filename=/outputs/output",
+                                    "--output-format=json"
+                           ],
+                           "workdir": "/app",
+                           "stdin": null,
+                           "stdout": null,
+                           "stderr": null,
+                           "env": {}
+                  }
+         ],
+         "volumes": null,
+         "tags": {
+                  "Project": "NottinghamDemo",
+                  "tres": "Nottingham TRE 01|Nottingham TRE 02"
+         },
+         "logs": null,
+         "creation_time": null
+}
+```
+</details>
+
+```python
+import pandas as pd
+from contingency_table_utils import read_contingency_table_from_json, aggregate_tables
+
+from scipy.stats import chi2_contingency
+```
+
+The JSON produced by this analysis can be read into tables using the `contingency_table_utils` module supplied.
+
+```python
+tre1 = read_contingency_table_from_json("data/tre1.json")
+tre2 = read_contingency_table_from_json("data/tre2.json")
+tre1.data
+```
+
+| gender_name |                  race_name |    n |
+| ----------- | -------------------------- | ---- |
+|      FEMALE |                      Asian |   29 |
+|      FEMALE |  Black or African American |   44 |
+|      FEMALE |                      White |  411 |
+|        MALE |                      Asian |   41 |
+|        MALE |  Black or African American |   38 |
+|        MALE |                      White |  433 |
+
+The data aren't very interesting, as they simply report how many men and women there are of three ethnicities in the synthetic datasets, but they serve to show how contingency tables can be assembled.
+
+`aggregate_tables` checks that your tables have the same variables, and sums the counts if they do.
+
+```python
+aggregate = aggregate_tables([tre1, tre2])
+```
+
+The `contingency_table` property organises this data into the format for statistical analyses.
+
+```python
+aggregate.contingency_table
+```
+
+| | FEMALE |   MALE |
+| ------------------------- | ------ | ------ |
+| Asian                     |   1011 |    982 |
+| Black or African American |   1080 |   1022 |
+| White                     |   1426 |   1441 |
+
+This format can be used for `scipy.stats` contingency table functions.
+
+```python
+chisq = chi2_contingency(aggregate.contingency_table)
+print(f"The p-value for the chi-squared test is {chisq.pvalue:.3f}")
+```
+
+`The p-value for the chi-squared test is 0.508`
+
+Phew, the synthetic data haven't got any surprising imbalances.
+
+This tutorial should show how you can perform federated analyses based on contingency tables of count data.
+The key requirement for writing your own analyses is writing a SQL query that, like
+
+```{sql}
+SELECT g.concept_name AS gender_name, r.concept_name AS race_name
+  FROM public.person p
+  JOIN public.concept g ON p.gender_concept_id = g.concept_id
+  JOIN public.concept r ON p.race_concept_id = r.concept_id;
+```
+
+produces a table of two categorical columns, e.g.
+
+| gender_name | race_name                                 |
+| ----------- | ----------------------------------------- |
+| FEMALE      | Asian                                     |
+| FEMALE      | Black or African American                 |
+| FEMALE      | White                                     |
+| MALE        | White                                     |
+| FEMALE      | Native Hawaiian or Other Pacific Islander |
+| MALE        | Native Hawaiian or Other Pacific Islander |
+| ...         | ...                                       |
Original file line number	Diff line number	Diff line change
`@@ -26,8 +26,8 @@ export default {`
`26`	`26`	`{name: "Submission layer wizards", path: "/examples-in-five-safes-tes/submission-layer-wizards"},`
`27`	`27`	`{name: "Collecting results", path: "/examples-in-five-safes-tes/collecting-results"},`
`28`	`28`	`{name: "Visualising OMOP metadata", path: "/examples-in-five-safes-tes/Bunny visualisations"},`
`29`		`- // {name: "Aggregating statistics", path: "/examples-in-five-safes-tes/aggregating-statistics"},`
`30`		`- // {name: "Contingency tables", path: "/examples-in-five-safes-tes/contingency-tables"},`
	`29`	`+ {name: "Aggregating statistics", path: "/examples-in-five-safes-tes/aggregating-statistics"},`
	`30`	`+ {name: "Contingency tables", path: "/examples-in-five-safes-tes/contingency-tables"},`
`31`	`31`	`{name: "Five Safes TES messages", path: "/examples-in-five-safes-tes/5s-tes-messages"},`
`32`	`32`	`]`
`33`	`33`	`}`