creativecommons
diff --git a/‎Pipfile‎
Lines changed: 5 additions & 19 deletions b/‎Pipfile‎
Lines changed: 5 additions & 19 deletions
diff --git a/‎Pipfile.lock‎
Lines changed: 692 additions & 1833 deletions b/‎Pipfile.lock‎
Lines changed: 692 additions & 1833 deletions
diff --git a/‎README.md‎
Lines changed: 103 additions & 24 deletions b/‎README.md‎
Lines changed: 103 additions & 24 deletions
diff --git a/‎data/2025Q4/1-fetch/smithsonian_1_metrics.csv‎
Lines changed: 2 additions & 0 deletions b/‎data/2025Q4/1-fetch/smithsonian_1_metrics.csv‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎data/2025Q4/1-fetch/smithsonian_2_units.csv‎
Lines changed: 39 additions & 0 deletions b/‎data/2025Q4/1-fetch/smithsonian_2_units.csv‎
Lines changed: 39 additions & 0 deletions
@@ -4,39 +4,25 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
+cachetools = "*"  # Required by google-api-python-client
 feedparser = "*"
-flickrapi = "*"
 GitPython = "*"
 google-api-python-client = "*"
-h11 = ">=0.16.0"  # Ensure dependency is secure
-internetarchive = ">=5.5.1"
-jupyterlab = ">=3.6.7"
+lxml = "*"
 matplotlib = "*"
-numpy = "*"
 pandas = "*"
-plotly = "*"
-pillow = ">=11.3.0"  # Ensure dependency is secure
-Pyarrow = "*"
+protobuf = ">=6.33.5"  # Ensure dependency is secure
+pyasn1 = ">=0.6.2"  # Ensure dependency is secure
 Pygments = "*"
-python-dotenv = "*"
 PyYAML = "*"
 requests = ">=2.31.0"
-seaborn = "*"
-urllib3 = ">=2.5.0"
-wordcloud = "*"
+urllib3 = ">=2.6.3"  # Ensure dependency is secure
 
 [dev-packages]
 black = "*"
-"black[jupyter]" = "*"
 flake8 = "*"
 isort = "*"
 pre-commit = "*"
 
 [requires]
 python_version = "3.11"
-
-[scripts]
-gcs_fetched = "./scripts/1-fetch/gcs_fetched.py"
-flickr_fetched = "./scripts/1-fetch/flickr_fetched.py"
-gcs_processed = "./scripts/2-process/gcs_processed.py"
-gcs_reports = "./scripts/3-report/gcs_reports.py"
@@ -1,29 +1,16 @@
-# quantifying
+# Quantifying
 
-Quantifying the Commons
+Quantifying the Commons: measure the size and diversity of the commons--the
+collection of works that are openly licensed or in the public domain
 
 
 ## Overview
 
-This project seeks to quantify the size and diversity of the commons--the
-collection of works that are openly licensed or in the public domain.
-
-
-### Meaningful
-
-The reports generated by this project (and the data fetched and processed to
-support it) seeks to be meaningful. We hope this project will provide data and
-analysis that helps inform discussions about the commons--the collection of
-works that are openly licensed or in the public domain.
-
-The goal of this project is to help answer questions like:
-- How has the world's use of the commons changed over time?
-- How is the knowledge and culture of the commons distributed?
-  - Who has access (and how much) to the commons?
-- What significant trends can be observed in the commons?
-  - Which public domain dedication or licenses are the most popular?
-  - What are the correlations between public domain dedication or licenses and
-    region, language, domain/endeavor, etc.?
+This project seeks to quantify the size and diversity of the Creative Commons
+legal tools. We aim to track the collection of works (articles, images,
+publications, etc.) that are openly licensed or in the public domain. The
+project automates data collection from multiple data sources, processes the
+data, and generates meaningful reports.
 
 
 ## Code of conduct
@@ -47,6 +34,93 @@ See [`CONTRIBUTING.md`][org-contrib].
 [org-contrib]: https://github.com/creativecommons/.github/blob/main/CONTRIBUTING.md
 
 
+### The three phases of generating a report
+
+1. **Fetch**: This phase involves collecting data from a particular source
+   using its API. Before writing any code, we plan the analyses we want to
+   perform by asking meaningful questions about the data. We also consider API
+   limitations (such as query limits) and design a query strategy to work
+   within these limitations. Then we write a Python script that gets the data.
+   It is important to follow the format of the existing scripts in the project
+   and to use the modules and functions where applicable. This ensures
+   consistency across scripts and makes it easier to debug issues that arise.
+   - **Meaningful questions**
+     - The reports generated by this project (and the data fetched and
+       processed to support them) seek to be meaningful. We hope this project
+       will provide data and analysis that helps inform discussions about the
+       commons. The goal of this project is to help answer questions like:
+       - How has the world's use of the commons changed over time?
+       - How is the knowledge and culture of the commons distributed?
+       - Who has access (and how much) to the commons?
+       - What significant trends can be observed in the commons?
+       - Which public domain dedication or licenses are the most popular?
+       - What are the correlations between public domain dedication or licenses
+         and region, language, domain/endeavor, etc.?
+   - **Limitations of an API**
+     - Some data sources provide APIs with query limits (daily or hourly, as
+       specified in their documentation). This restricts how many requests
+       can be made in the specified period of time. It
+       is important to plan a query strategy and schedule fetch jobs to stay
+       within the allowed limits.
+   - **Headings of data in 1-fetch**
+     - [Tool identifier][tool-identifier]: A unique identifier used to
+       distinguish each Creative Commons legal tool within the dataset. This
+       helps ensure consistency when tracking tools across different data
+       sources.
+     - [SPDX identifier][spdx-identifier]: A standardized identifier maintained
+       by the Software Package Data Exchange (SPDX) project. It provides a
+       consistent way to reference licenses in applications.
+2. **Process**: In this phase, the fetched data is transformed into a
+   structured and standardized format for analysis. The data is then analyzed
+   and categorized based on defined criteria to extract insights that answer
+   the meaningful questions identified during the 1-fetch phase.
+3. **Report**: This phase focuses on presenting the results of the analysis.
+   We generate graphs and summaries that clearly show trends, patterns, and
+   distributions in the data. These reports help communicate key insights about
+   the size, diversity, and characteristics of openly licensed and public
+   domain works.
+
+[tool-identifier]: https://creativecommons.org/share-your-work/cclicenses/
+[spdx-identifier]: https://spdx.org/licenses/
+
+
+### Automation phases
+
+For automating these phases, the project uses Python scripts to fetch, process,
+and report data. GitHub Actions is used to automatically run these scripts on a
+defined schedule and on code updates. It handles script execution, manages
+dependencies, and ensures the workflow runs consistently.
+- **Script assumptions**
+  - Execution schedule for each quarter:
+    - 1-Fetch: first month, 1st half of second month
+    - 2-Process: 2nd half of second month
+    - 3-Report: third month
+- **Script requirements**
+  - *Must be safe*
+    - Scripts must not make any changes with default options
+    - Easiest way to run script should also be the safest
+    - Have options spelled out
+  - *Must be timely*
+    - Scripts should complete within a maximum of 45 minutes
+    - Scripts shouldn't take longer than 3 minutes with default options
+    - That way there is a quick way to see what happens when a script runs
+      (for example, to confirm that it executes without errors). Later, in
+      production, it can be run with longer-running options
+  - *Must be idempotent*
+    - [Idempotence - Wikipedia](https://en.wikipedia.org/wiki/Idempotence)
+    - This applies to both the data fetched and the data stored. If the data
+      changes randomly, we can't draw meaningful conclusions.
+  - *Balanced use of third-party libraries*
+    - Third-party libraries should be leveraged when they are:
+      - API specific (google-api-python-client, internetarchive, etc.)
+- **File formats**
+  - CSV: the format is well supported (rendered on GitHub, etc.), easy to use,
+    and the data used by the project is simple enough to avoid any
+    shortcomings.
+  - YAML: prioritizes human readability which addresses the primary costs and
+    risks associated with configuration files.
+
+
 ### Project structure
 
 Please note that in the directory tree below, all instances of `fetch`,
@@ -69,7 +143,6 @@ Quantifying/
 │   │   │   └── README.md  # All generated reports are displayed in the README
 │   └── ...
 ├── dev/
-├── pre-automation/  # All Quantifying work prior to adding automation system
 ├── scripts/  # Run scripts for all phases
 │   ├── 1-fetch/
 │   ├── 2-process/
@@ -91,8 +164,7 @@ Quantifying/
 ```
 
 
-## Development
-
+## How to set up
 
 ### Prerequisites
 
@@ -155,6 +227,13 @@ When run this way, the shared library (`scripts/shared.py`) provides easy access
 to all of the necessary paths and all of the modules managed by pipenv are
 available.
 
+In order for scripts to be run directly (as shown above), the script must be
+executable. For more information on making files executable, please see:
+[File Permissions - Foundational technologies — Creative Commons Open
+Source][file-perms].
+
+[file-perms]: https://opensource.creativecommons.org/contributing-code/foundational-tech/#file-permissions
+
 
 ### Static analysis
 
 
@@ -0,0 +1,2 @@
+"CC0_RECORDS","CC0_RECORDS_WITH_CC0_MEDIA","CC0_MEDIA","CC0_MEDIA_PERCENTAGE","TOTAL_OBJECTS"
+"14273329","5199915","4503016","36","15616799"
@@ -0,0 +1,39 @@
+"UNIT","CC0_RECORDS","CC0_RECORDS_WITH_CC0_MEDIA","TOTAL_OBJECTS"
+"AAA","0","0","29735"
+"AAG","0","0","344"
+"ACM","251","247","2977"
+"ACMA","0","0","57"
+"CFCHFOLKLIFE","17544","0","18517"
+"CHNDM","58158","54590","201545"
+"FBR","1517","37","11248"
+"FSG","4720","4720","45588"
+"HAC","430","430","1437"
+"HMSG","449","448","13898"
+"HSFA","0","0","299"
+"NASM","1010","989","32325"
+"NMAAHC","22224","4465","22577"
+"NMAH","1316502","10548","1317248"
+"NMAI","237637","180","239307"
+"NMAfA","111","111","12477"
+"NMNHANTHRO","497734","0","497734"
+"NMNHBIRDS","635217","559038","635217"
+"NMNHBOTANY","4562256","3572487","4562256"
+"NMNHEDUCATION","6473","4090","6473"
+"NMNHENTO","731838","197223","731838"
+"NMNHFISHES","502585","10806","502585"
+"NMNHHERPS","615308","2345","615308"
+"NMNHINV","2003972","70094","2003972"
+"NMNHMAMMALS","626133","542046","626133"
+"NMNHMINSCI","465275","11311","465275"
+"NMNHPALEO","743533","94487","743533"
+"NPG","15446","14540","123566"
+"NPM","10814","8005","83710"
+"NZP","1061","1061","2086"
+"OCIO_DPO3D","108","17","146"
+"OFEO-SG","5509","3665","7295"
+"SAAM","13626","12891","188157"
+"SIA","35498","5477","48169"
+"SIL","1035579","13567","1039087"
+"SILAF","63416","0","63416"
+"SILNMAHTL","34577","0","34577"
+"SLA_SRO","104811","0","104811"
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+"CC0_RECORDS","CC0_RECORDS_WITH_CC0_MEDIA","CC0_MEDIA","CC0_MEDIA_PERCENTAGE","TOTAL_OBJECTS"`
	`2`	`+"14273329","5199915","4503016","36","15616799"`