From f5e75ff4cd1872c0f21bf5c416c017df406215df Mon Sep 17 00:00:00 2001 From: ziad hany Date: Tue, 23 Sep 2025 13:21:39 +0300 Subject: [PATCH 1/4] Add cargo packageURL mining in CI with scancode-action Signed-off-by: ziad hany --- .github/workflows/mine-cargo-packageurls.yml | 16 ++++++++++++++++ cargo/checkpoints.json | 1 + 2 files changed, 17 insertions(+) create mode 100644 .github/workflows/mine-cargo-packageurls.yml create mode 100644 cargo/checkpoints.json diff --git a/.github/workflows/mine-cargo-packageurls.yml b/.github/workflows/mine-cargo-packageurls.yml new file mode 100644 index 00000000..e6702aaa --- /dev/null +++ b/.github/workflows/mine-cargo-packageurls.yml @@ -0,0 +1,16 @@ +on: [workflow_dispatch] + +jobs: + mine-cargo-purls: + runs-on: ubuntu-24.04 + name: Mine cargo PackageURLs + steps: + - uses: aboutcode-org/scancode-action@beta + with: + scancodeio-repo-branch: "collect-purl-metadata#egg=scancodeio[mining]" + pipelines: "mine_cargo" + env: + FEDERATEDCODE_GIT_ACCOUNT_URL: https://github.com/aboutcode-data/minecode-data-cargo-test + FEDERATEDCODE_GIT_SERVICE_TOKEN: ${{ secrets.MINING_GITHUB_TOKEN }} + FEDERATEDCODE_GIT_SERVICE_NAME: "the AboutCode bot" + FEDERATEDCODE_GIT_SERVICE_EMAIL: "automation@aboutcode.org" diff --git a/cargo/checkpoints.json b/cargo/checkpoints.json new file mode 100644 index 00000000..9e26dfee --- /dev/null +++ b/cargo/checkpoints.json @@ -0,0 +1 @@ +{} \ No newline at end of file From c09c5971e589d38f96d966a6f02ca8c928b27c1a Mon Sep 17 00:00:00 2001 From: ziad hany Date: Wed, 24 Sep 2025 11:30:25 +0300 Subject: [PATCH 2/4] Rename FederatedCode service name and fix docs title underline Signed-off-by: ziad hany --- .github/workflows/mine-cargo-packageurls.yml | 2 +- docs/source/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mine-cargo-packageurls.yml b/.github/workflows/mine-cargo-packageurls.yml index e6702aaa..22791805 100644 --- 
a/.github/workflows/mine-cargo-packageurls.yml +++ b/.github/workflows/mine-cargo-packageurls.yml @@ -12,5 +12,5 @@ jobs: env: FEDERATEDCODE_GIT_ACCOUNT_URL: https://github.com/aboutcode-data/minecode-data-cargo-test FEDERATEDCODE_GIT_SERVICE_TOKEN: ${{ secrets.MINING_GITHUB_TOKEN }} - FEDERATEDCODE_GIT_SERVICE_NAME: "the AboutCode bot" + FEDERATEDCODE_GIT_SERVICE_NAME: "AboutCode Automation" FEDERATEDCODE_GIT_SERVICE_EMAIL: "automation@aboutcode.org" diff --git a/docs/source/index.rst b/docs/source/index.rst index 9ff88976..1fb7649c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,5 @@ Welcome to miencode-pipelines documentation! -========================================= +============================================= This is released at pypi: https://pypi.org/project/minecode-pipelines/ From 93f58425c7c7482408869a3b3279aee6cb797750 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Wed, 24 Sep 2025 11:31:25 +0300 Subject: [PATCH 3/4] Rename FederatedCode service name for pypi Signed-off-by: ziad hany --- .github/workflows/mine-pypi-packageurls.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mine-pypi-packageurls.yml b/.github/workflows/mine-pypi-packageurls.yml index e06d945d..72f7a89d 100644 --- a/.github/workflows/mine-pypi-packageurls.yml +++ b/.github/workflows/mine-pypi-packageurls.yml @@ -12,5 +12,5 @@ jobs: env: FEDERATEDCODE_GIT_ACCOUNT_URL: https://github.com/aboutcode-data/minecode-data-pypi-test FEDERATEDCODE_GIT_SERVICE_TOKEN: ${{ secrets.MINING_GITHUB_TOKEN }} - FEDERATEDCODE_GIT_SERVICE_NAME: "the AboutCode bot" + FEDERATEDCODE_GIT_SERVICE_NAME: "AboutCode Automation" FEDERATEDCODE_GIT_SERVICE_EMAIL: "automation@aboutcode.org" From 42a450ab942eb9e26dc755294ea22b19e1d82480 Mon Sep 17 00:00:00 2001 From: ziad hany Date: Wed, 24 Sep 2025 12:02:11 +0300 Subject: [PATCH 4/4] Fix all checks, doc8 validation, and add .gitkeep for empty directory Signed-off-by: ziad hany --- README.rst | 22 
++++++++++++++-------- etc/.gitkeep | 0 2 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 etc/.gitkeep diff --git a/README.rst b/README.rst index 359eb266..d2d92414 100644 --- a/README.rst +++ b/README.rst @@ -21,16 +21,22 @@ Configuration format * last serial number processed (used in indexes at pypi, npm etc) * last processed commit (where the data is stored in git repos) - * directory to store las fetched index data (like the JSON fetched from pypi simple with package names and last updated info) + * directory to store last fetched index data + (like the JSON fetched from pypi simple with package names and last updated info) * state information in ``state``: * ``null``: mining has not started. - * ``initital-sync`` : at the start of mining we need to mine a huge amount of packages for packageURL to catch up. - This is typically very large and could take several hours to several days dependening on the ecosystem size. - We fetch and save an index state and mine all packageURLs till there. Once we reach a state where remaining - new packageURLs can be mined in a couple hours, we can move on to the next state where we mine new packageURLs - added in a periodic manner. - * ``periodic-sync`` : This is a periodic update of new packageURLs added in the index in a period, and typically this + * ``initital-sync`` : at the start of mining we need to mine a huge + amount of packages for packageURL to catch up. + This is typically very large and could take several hours to several days + depending on the ecosystem size. + We fetch and save an index state and mine all packageURLs till there. + Once we reach a state where remaining + new packageURLs can be mined in a couple hours, we can move on to + the next state where we mine new packageURLs + added in a periodic manner. + * ``periodic-sync`` : This is a periodic update of new packageURLs + added in the index in a period, and typically this should not take more than a couple hours. 
* optional elements to improve readability/debugging: @@ -38,5 +44,5 @@ Configuration format * ``last_updated``: date and time of last checkpoint update * ``packages_checkpoints.json``: stores checkpoint related to: - + * ``packages_mined``: which packages have been mined in the ``initital-sync`` state. diff --git a/etc/.gitkeep b/etc/.gitkeep new file mode 100644 index 00000000..e69de29b