diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b290e09..97c8c97 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,20 +1,20 @@ { "name": "nfcore", - "image": "nfcore/gitpod:latest", - "remoteUser": "gitpod", - "runArgs": ["--privileged"], + "image": "nfcore/devcontainer:latest", - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/opt/conda/bin/python" - }, + "remoteUser": "root", + "privileged": true, - // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } + "remoteEnv": { + // Workspace path on the host for mounting with docker-outside-of-docker + "LOCAL_WORKSPACE_FOLDER": "${localWorkspaceFolder}" + }, + + "onCreateCommand": "./.devcontainer/setup.sh", + + "hostRequirements": { + "cpus": 4, + "memory": "16gb", + "storage": "32gb" } } diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh new file mode 100755 index 0000000..c2e1cd4 --- /dev/null +++ b/.devcontainer/setup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# Customise the terminal command prompt +echo "export PROMPT_DIRTRIM=2" >> $HOME/.bashrc +echo "export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] '" >> $HOME/.bashrc +export PROMPT_DIRTRIM=2 +export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] ' + +# Update Nextflow +nextflow self-update + +# Update welcome message +echo "Welcome to the nf-core/proteinannotator devcontainer!" 
> /usr/local/etc/vscode-dev-containers/first-run-notice.txt diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 6d9b74c..0000000 --- a/.editorconfig +++ /dev/null @@ -1,37 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,html,css,scss,js}] -indent_size = 2 - -# These files are edited and tested upstream in nf-core/modules -[/modules/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -[/subworkflows/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset - -[/assets/email*] -indent_size = unset - -# ignore python and markdown -[*.{py,md}] -indent_style = unset - -# ignore ro-crate metadata files -[**/ro-crate-metadata.json] -insert_final_newline = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 9809077..4948d40 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -78,7 +78,7 @@ If you wish to contribute a new step, please use the following coding standards: 5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. 9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. 10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 49b1cc5..5d0e8d4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -18,8 +18,8 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/prot - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/proteinannotator/tree/master/.github/CONTRIBUTING.md) - [ ] If necessary, also make a PR on the nf-core/proteinannotator _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core pipelines lint`). -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). -- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). +- [ ] Ensure the test suite passes (e.g. `nf-test test */local --profile=~test,docker` for all new local tests). +- [ ] Check for unexpected warnings in debug mode (`nf-test test */local --profile=~test,docker,debug`). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. 
diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 0000000..3408527 --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" + echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." + else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? $number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) 
+ total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." + fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 0000000..efe8e86 --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,123 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Print Modules Folder Tree + uses: jaywcjlove/github-action-folder-tree@main + with: + exclude: "node_modules|dist|.git|.husky" + path: ./modules + depth: 10 + - name: Print Subworkflows Folder Tree + uses: jaywcjlove/github-action-folder-tree@main + with: + exclude: "node_modules|dist|.git|.husky" + path: ./subworkflows + depth: 10 + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + 
mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! -f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠️ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ 
inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 56b67a6..562018e 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -4,66 +4,42 @@ name: nf-core AWS full size tests # It runs the -profile 'test_full' on AWS batch on: - pull_request: - branches: - - main - - master workflow_dispatch: pull_request_review: types: [submitted] + release: + types: [published] jobs: run-platform: name: Run AWS full tests - # run only if the PR is approved by at least 2 reviewers and against the master branch or manually triggered - if: github.repository == 'nf-core/proteinannotator' && github.event.review.state == 'approved' && github.event.pull_request.base.ref == 'master' || github.event_name == 'workflow_dispatch' + # run only if an approving review is submitted on a PR against the master/main branch, or if manually triggered, or on a published release + if: github.repository == 'nf-core/proteinannotator' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release' runs-on: ubuntu-latest steps: - - name: Get PR reviews - uses: octokit/request-action@v2.x - if: github.event_name != 'workflow_dispatch' - id: check_approvals - continue-on-error: true - with: - route: GET /repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/reviews?per_page=100 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Check for approvals - if: ${{ failure() && github.event_name != 'workflow_dispatch' }} - run: | - echo "No review approvals found. At least 2 approvals are required to run this action automatically." 
- exit 1 - - - name: Check for enough approvals (>=2) - id: test_variables - if: github.event_name != 'workflow_dispatch' + - name: Set revision variable + id: revision run: | - JSON_RESPONSE='${{ steps.check_approvals.outputs.data }}' - CURRENT_APPROVALS_COUNT=$(echo $JSON_RESPONSE | jq -c '[.[] | select(.state | contains("APPROVED")) ] | length') - test $CURRENT_APPROVALS_COUNT -ge 2 || exit 1 # At least 2 approvals are required + echo "revision={%- raw -%}${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'release') && github.sha || 'dev' }}" >> "$GITHUB_OUTPUT" - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/proteinannotator/work-${{ github.sha }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} + revision: ${{ steps.revision.outputs.revision }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/proteinannotator/work-${{ steps.revision.outputs.revision }} parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/proteinannotator/results-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/proteinannotator/results-${{ steps.revision.outputs.revision }}" } profiles: test_full - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json 
diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 90608bc..50644cc 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -14,20 +14,20 @@ jobs: - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/proteinannotator/work-${{ github.sha }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/proteinannotator/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/proteinannotator/results-test-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/proteinannotator/results-test-${{ github.sha }}" } profiles: test - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index c3c74a6..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - workflow_dispatch: - -env: - NXF_ANSI_LOG: false - NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity - NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: "Run pipeline with test data (${{ 
matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})" - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/proteinannotator') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "24.04.2" - - "latest-everything" - profile: - - "conda" - - "docker" - - "singularity" - test_name: - - "test" - isMaster: - - ${{ github.base_ref == 'master' }} - # Exclude conda and singularity on dev - exclude: - - isMaster: false - profile: "conda" - - isMaster: false - profile: "singularity" - steps: - - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - with: - fetch-depth: 0 - - - name: Set up Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - - name: Set up Miniconda - if: matrix.profile == 'conda' - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3 - with: - miniconda-version: "latest" - auto-update-conda: true - conda-solver: libmamba - channels: conda-forge,bioconda - - - name: Set up Conda - if: matrix.profile == 'conda' - run: | - echo $(realpath $CONDA)/condabin >> $GITHUB_PATH - echo $(realpath python) >> $GITHUB_PATH - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" - run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},${{ matrix.profile }} --outdir ./results diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 
0b6b1f2..6adb0ff 100644 --- a/.github/workflows/clean-up.yml +++ b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index ab06316..45884ff 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -12,14 +12,6 @@ on: required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - master - pull_request_target: branches: - main - master @@ -52,9 +44,9 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: Setup Apptainer @@ -65,7 +57,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install git+https://github.com/nf-core/tools.git@dev + pip install git+https://github.com/nf-core/tools.git - name: Make a cache directory for the container images run: | @@ -120,6 +112,7 @@ jobs: echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT" - name: Compare container image counts + id: count_comparison run: | if [ "${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ 
steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }} @@ -132,3 +125,10 @@ jobs: else echo "The pipeline can be downloaded successfully!" fi + + - name: Upload Nextflow logfile for debugging purposes + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 + with: + name: nextflow_logfile.txt + path: .nextflow.log* + include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 80% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index 4e7f853..e18e27a 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -13,13 +13,13 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: token: ${{ secrets.nf_core_bot_auth_token }} # indication that the linting is being fixed - name: React on comment - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: eyes @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -47,7 +47,7 @@ jobs: # indication that the linting has finished - name: react if linting finished succesfully if: steps.pre-commit.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + 
uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: "+1" @@ -67,21 +67,21 @@ jobs: - name: react if linting errors were fixed id: react-if-fixed if: steps.commit-and-push.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: hooray - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: confused - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: issue-number: ${{ github.event.issue.number }} body: | diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index dbd52d5..7a527a3 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,9 +3,6 @@ name: nf-core linting # It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. 
on: - push: - branches: - - dev pull_request: release: types: [published] @@ -14,12 +11,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - - name: Set up Python 3.12 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - name: Set up Python 3.14 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -31,18 +28,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: read .nf-core.yml - uses: pietrobolcato/action-read-yaml@1.1.0 + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml @@ -74,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4 + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 95b6b6a..e6e9bc2 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # 
v8 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 0000000..fd742d1 --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,144 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test-changes + - runner=4cpu-linux-x64 + outputs: + shard: ${{ steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + 
NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 7 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] + if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test + - runner=4cpu-linux-x64 + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [conda, docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # Exclude conda and singularity on dev + exclude: + - isMain: false + profile: "conda" + - isMain: false + profile: "singularity" + NXF_VER: + - "25.10.0" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ matrix.NXF_VER }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. 
Please check if the error is expected or if it needs fixing." + fi + if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then + exit 1 + fi + fi + + confirm-pass: + needs: [nf-test] + if: always() + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-confirm-pass + - runner=2cpu-linux-x64 + steps: + - name: One or more tests failed (excluding latest-everything) + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: One or more tests cancelled + if: ${{ contains(needs.*.result, 'cancelled') }} + run: exit 1 + + - name: All tests ok + if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - name: debug-print + if: always() + run: | + echo "::group::DEBUG: `needs` Contents" + echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}" + echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" + echo "::endgroup::" diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index 76a9e67..431d3d4 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -14,6 +14,10 @@ jobs: run: | echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" | sed 's/-//g' >> $GITHUB_OUTPUT + - name: get description + id: get_description + run: | + echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description')" >> $GITHUB_OUTPUT - uses: rzr/fediverse-action@master with: access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} @@ -22,7 +26,7 @@ jobs: # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release message: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
- + ${{ steps.get_description.outputs.description }} Please see the changelog: ${{ github.event.release.html_url }} ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics @@ -30,7 +34,7 @@ jobs: bsky-post: runs-on: ubuntu-latest steps: - - uses: zentered/bluesky-post-action@80dbe0a7697de18c15ad22f4619919ceb5ccf597 # v0.1.0 + - uses: zentered/bluesky-post-action@6461056ea355ea43b977e149f7bf76aaa572e5e8 # v0.3.0 with: post: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! diff --git a/.github/workflows/template_version_comment.yml b/.github/workflows/template-version-comment.yml similarity index 91% rename from .github/workflows/template_version_comment.yml rename to .github/workflows/template-version-comment.yml index 537529b..e8560fc 100644 --- a/.github/workflows/template_version_comment.yml +++ b/.github/workflows/template-version-comment.yml @@ -9,12 +9,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 with: ref: ${{ github.event.pull_request.head.sha }} - name: Read template version from .nf-core.yml - uses: nichmor/minimal-read-yaml@v0.0.2 + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml diff --git a/.gitignore b/.gitignore index a42ce01..1989f4d 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,9 @@ testing/ testing* *.pyc null/ + +# Nextflow nf-tests output +.nf-test.log +.nf-test/tests +.nf-test-*.nf +.nf-test/* diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index 83599f6..0000000 --- a/.gitpod.yml +++ /dev/null @@ -1,10 +0,0 @@ -image: nfcore/gitpod:latest -tasks: - - name: Update Nextflow and setup pre-commit - command: | - pre-commit install --install-hooks - nextflow 
self-update - -vscode: - extensions: - - nf-core.nf-core-extensionpack # https://github.com/nf-core/vscode-extensionpack diff --git a/.nf-core.yml b/.nf-core.yml index 485ee2a..50507a8 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,16 +1,24 @@ +lint: + files_exist: + - .github/workflows/ci.yml + - conf/igenomes.config + - conf/igenomes_ignored.config + files_unchanged: + - assets/nf-core-proteinannotator_logo_light.png + - docs/images/nf-core-proteinannotator_logo_light.png + - docs/images/nf-core-proteinannotator_logo_dark.png + - .github/PULL_REQUEST_TEMPLATE.md +nf_core_version: 3.5.1 repository_type: pipeline - -nf_core_version: 3.2.0 - -lint: {} - template: - org: nf-core - name: proteinannotator - description: The best protein annotation pipeline in the world. Protein fasta -> - ??? -> Annotations! - author: Olga Botvinnik - version: 1.0.0dev + author: Olga Botvinnik, Evangelos Karatzas + description: Generation of sequence-level annotations for amino acid sequences + version: 1.0.0 force: true outdir: . 
+ skip_features: + - fastqc + - igenomes is_nfcore: true + name: proteinannotator + org: nf-core diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1dec865..d06777a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,10 +4,24 @@ repos: hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 - - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "3.1.2" + - prettier@3.6.2 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: editorconfig-checker - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ diff --git a/.prettierignore b/.prettierignore index edd29f0..8e669dc 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,4 +10,9 @@ testing/ testing* *.pyc bin/ +.nf-test/ ro-crate-metadata.json +modules/nf-core/ +subworkflows/nf-core/ +**/Makefile +Makefile diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a7..07dbd8b 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/CHANGELOG.md b/CHANGELOG.md index d497b4c..6c034ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0.0dev - [date] +## v1.0.0 - Yellow Saiga - [2026/02/09] Initial release of nf-core/proteinannotator, created with the [nf-core](https://nf-co.re/) template. 
-### `Added` - -### `Fixed` - -### `Dependencies` - -### `Deprecated` +- [#68](https://github.com/nf-core/proteinannotator/pull/68) - Using the `ARIA2` and `UNTAR` nf-core modules to download and decompress the InterProScan database. (by @vagkaratzas) +- [#67](https://github.com/nf-core/proteinannotator/pull/67) - Swapped to the updated, non-buggy, nf-core version of `INTERPROSCAN`. (by @vagkaratzas) +- [#65](https://github.com/nf-core/proteinannotator/pull/65) - Converted the pipeline schematic to nf-core metromap. (by @vagkaratzas) +- [#62](https://github.com/nf-core/proteinannotator/pull/62) - Added the option to download and use the latest FunFam HMM library (or use a path to an existing one) for domain annotation. (by @vagkaratzas) +- [#61](https://github.com/nf-core/proteinannotator/pull/61) - Added nf-core modules `ARIA2` and `HMMER_HMMSEARCH` to download the latest Pfam HMM library (or use a path to an existing one) and match domains to input sequences. (by @vagkaratzas) +- [#60](https://github.com/nf-core/proteinannotator/pull/60) - Added nf-core module `S4PRED_RUNMODEL` for secondary structure prediction (i.e., α-helix, β-strand or coil). (by @vagkaratzas) +- [#59](https://github.com/nf-core/proteinannotator/pull/59) - Added nf-core QC and pre-processing subworkflow for amino acid sequences `FAA_SEQFU_SEQKIT`. (by @vagkaratzas) +- [#57](https://github.com/nf-core/proteinannotator/pull/57) - nf-core tools template update to 3.5.1. (by @vagkaratzas) +- [#52](https://github.com/nf-core/proteinannotator/pull/52) - Add option to turn off InterProScan for testing. (by @edmundmiller, @olgabot) +- [#51](https://github.com/nf-core/proteinannotator/pull/51) - Update to nf-core/tools v3.3.1. (by @olgabot) +- [#47](https://github.com/nf-core/proteinannotator/pull/47) - Update metromap with more tools added from [May 2025 Hackathon](https://nf-co.re/events/2025/hackathon-boston). 
(by @olgabot) +- [#42](https://github.com/nf-core/proteinannotator/pull/42) - Updated to `nf-test` on GitHub Actions and in the `PULL_REQUEST_TEMPLATE.md`. (by @olgabot) +- [#13](https://github.com/nf-core/proteinannotator/pull/13) - Add nf-core seqkit/stats module. (by @olgabot, @heuermh) +- [#9](https://github.com/nf-core/proteinannotator/pull/9) - Added [InterProScan](https://interproscan-docs.readthedocs.io/) module - local version. (by @olgabot, @heuermh, @eweizy) diff --git a/CITATIONS.md b/CITATIONS.md index 843f5d3..3150061 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,9 +10,25 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [SeqFu](https://pubmed.ncbi.nlm.nih.gov/34066939/) -> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +> Telatin A, Fariselli P, Birolo G. SeqFu: a suite of utilities for the robust and reproducible manipulation of sequence files. Bioengineering. 2021 May 7;8(5):59. doi: 10.3390/bioengineering8050059. PubMed PMID: 34066939; PubMed Central PMCID: PMC8148589. + +- [SeqKit](https://pubmed.ncbi.nlm.nih.gov/38898985/) + +> Shen W, Sipos B, Zhao L. SeqKit2: A Swiss army knife for sequence and alignment processing. iMeta. 2024 Apr 5:e191. doi: 10.1002/imt2.191. PubMed PMID: 38898985; PubMed Central PMCID: PMC11183193. + +- [hmmer](https://pubmed.ncbi.nlm.nih.gov/22039361/) + +> Eddy SR. Accelerated profile HMM searches. PLoS computational biology. 2011 Oct 20;7(10):e1002195. doi: 10.1371/journal.pcbi.1002195. PubMed PMID: 22039361; PubMed Central PMCID: PMC3197634. + +- [InterProScan](https://pubmed.ncbi.nlm.nih.gov/24451626/) + +> Jones P, Binns D, Chang HY, Fraser M, Li W, McAnulla C, McWilliam H, Maslen J, Mitchell A, Nuka G, Pesseat S, Quinn AF, Sangrador-Vegas A, Scheremetjew M, Yong S-Y, Lopez R, Hunter S. InterProScan 5: genome-scale protein function classification. Bioinformatics. 2014 May 1;30(9):1236-40. 
doi: 10.1093/bioinformatics/btu031. PubMed PMID: 24451626; PubMed Central PMCID: PMC3998142. + +- [s4pred](https://pubmed.ncbi.nlm.nih.gov/34213528/) + +> Moffat L, Jones DT. Increasing the accuracy of single sequence prediction methods using a deep semi-supervised learning framework. Bioinformatics. 2021 Nov 1;37(21):3744-51. doi: 10.1093/bioinformatics/btab491. PubMed PMID: 34213528; PubMed Central PMCID: PMC8570780. - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) diff --git a/README.md b/README.md index 17e7044..fb552f8 100644 --- a/README.md +++ b/README.md @@ -5,57 +5,63 @@ -[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/ci.yml) +[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator) +[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml) [![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) 
+[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) [![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on 
Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction -**nf-core/proteinannotator** is a bioinformatics pipeline that ... +**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics for protein FASTA inputs and produces protein annotations based on predicted sequence features, including conserved domains, functions, and secondary structure. - +

+ + + nf-core/proteinannotator + +

+ +### Check quality and pre-process + +Generate input amino acid sequence statistics with [`SeqFu`](https://github.com/telatin/seqfu2/) and pre-process the sequences (i.e., remove gaps, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) with [`SeqKit`](https://github.com/shenwei356/seqkit/). + +### Annotate sequences - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1. Conserved domain annotation with [`hmmer`](https://github.com/EddyRivasLab/hmmer/) against databases + such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/) and [FunFam](https://download.cathdb.info/cath/releases/all-releases/) +2. Functional annotation: + - [`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/), a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics. +3. Predict secondary structure compositional features such as α-helices, β-strands and coils with [`s4pred`](https://github.com/psipred/s4pred) +4. Present QC stats for input sequences before and after initial pre-processing with [`MultiQC`](http://multiqc.info/) ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - +Each row represents a FASTA file of proteins from a single species. 
Now, you can run the pipeline using: - - ```bash nextflow run nf-core/proteinannotator \ -profile \ @@ -76,11 +82,14 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/proteinannotator was originally written by Olga Botvinnik. +nf-core/proteinannotator was originally written by Olga Botvinnik and Evangelos Karatzas. We thank the following people for their extensive assistance in the development of this pipeline: - +- [Michael L Heuer](https://github.com/heuermh) +- [Edmund Miller](https://github.com/edmundmiller) +- [Eric Wei](https://github.com/eweizy) +- [Martin Beracochea](https://github.com/mberacochea) ## Contributions and Support @@ -93,8 +102,6 @@ For further information or help, don't hesitate to get in touch on the [Slack `# - - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/assets/email_template.html b/assets/email_template.html index faeae5f..6918b57 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,7 +4,7 @@ - + nf-core/proteinannotator Pipeline Report diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 41b9841..dbb106f 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,7 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/proteinannotator Methods Description" section_href: "https://github.com/nf-core/proteinannotator" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index bf478c5..e9ddc17 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,5 @@ report_comment: > - This report has been generated by the nf-core/proteinannotator - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/proteinannotator analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: "nf-core-proteinannotator-methods-description": order: -1000 diff --git a/assets/nf-core-proteinannotator_logo_light.png b/assets/nf-core-proteinannotator_logo_light.png index 3195fc7..0d582c6 100644 Binary files a/assets/nf-core-proteinannotator_logo_light.png and b/assets/nf-core-proteinannotator_logo_light.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..e09391c 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +id,fasta +species1,species1_proteins.fasta +species2,species2_proteins.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index e508742..f627365 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,27 +7,20 @@ "items": { "type": "object", "properties": { - "sample": { + "id": { "type": "string", "pattern": "^\\S+$", "errorMessage": "Sample name must be provided and cannot contain spaces", "meta": ["id"] }, - "fastq_1": { + "fasta": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" - }, - "fastq_2": { - 
"type": "string", - "format": "file-path", - "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^([\\S\\s]*\\/)?[^\\s\\/]+\\.(fa|fasta|faa|fas)(\\.gz)?$", + "errorMessage": "Fasta file for each sample must be provided, cannot contain spaces and must have extension '.faa', '.fa', '.fas' or '.fasta', and optionally be compressed (.gz)" } }, - "required": ["sample", "fastq_1"] + "required": ["id", "fasta"] } } diff --git a/conf/base.config b/conf/base.config index 3b25db0..bbc7b33 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,7 +15,7 @@ process { memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' @@ -59,4 +59,8 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withLabel: process_gpu { + ext.use_gpu = { workflow.profile.contains('gpu') } + accelerator = { workflow.profile.contains('gpu') ? 1 : null } + } } diff --git a/conf/igenomes.config b/conf/igenomes.config deleted file mode 100644 index 3f11437..0000000 --- a/conf/igenomes.config +++ /dev/null @@ -1,440 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for iGenomes paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines reference genomes using iGenome paths. 
- Can be used by any config that customises the base path using: - $params.igenomes_base / --igenomes_base ----------------------------------------------------------------------------------------- -*/ - -params { - // illumina iGenomes reference file paths - genomes { - 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" - } - 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'CHM13' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" - bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" - gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" - mito_name = "chrM" - } - 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" - } - 'TAIR10' { - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" - } - 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" - } - 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" - } - 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" - } - 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" - } - 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'BDGP6' { - fasta = 
"${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" - } - 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" - } - 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" - } - 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" - } - 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" - } - 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" - } - 'Rnor_5.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" - } - 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.21e7" - } - 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = 
"${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" - } - 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" - } - 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'hg38' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'hg19' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" - } - 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" - } - 'bosTau8' { - fasta = 
"${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" - } - 'canFam3' { - fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" - } - 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.37e9" - } - 'dm6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" - } - 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" 
- star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" - } - 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" - } - 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" - } - 'rn6' { - 
fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" - } - 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" - } - } -} diff --git a/conf/igenomes_ignored.config b/conf/igenomes_ignored.config deleted file mode 100644 index b4034d8..0000000 --- a/conf/igenomes_ignored.config +++ /dev/null @@ -1,9 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for iGenomes paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Empty genomes dictionary to use when igenomes is ignored. ----------------------------------------------------------------------------------------- -*/ - -params.genomes = [:] diff --git a/conf/modules.config b/conf/modules.config index d203d2b..ec1428c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,8 +18,135 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: FASTQC { - ext.args = '--quiet' + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FAA_SEQFU_SEQKIT:SEQFU_STATS_BEFORE' { + ext.prefix = { "${meta.id}_before" } + publishDir = [ + path: { "${params.outdir}/qc/${meta.id}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FAA_SEQFU_SEQKIT:SEQFU_STATS_AFTER' { + ext.prefix = { "${meta.id}_after" } + publishDir = [ + path: { "${params.outdir}/qc/${meta.id}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FAA_SEQFU_SEQKIT:SEQKIT_SEQ' { + ext.args = [ + "--remove-gaps", + "--upper-case", + "--validate-seq", + "--min-len ${params.min_seq_length}", + "--max-len ${params.max_seq_length}" + ].join(' ').trim() + ext.prefix = "intermediate_seqkit_seq" + publishDir = [ + path: { "${params.outdir}/qc/${meta.id}/" }, + mode: params.publish_dir_mode, + enabled: false, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FAA_SEQFU_SEQKIT:SEQKIT_REPLACE' { + ext.args = '-p "/" -r "_"' + ext.suffix = "fasta" + ext.prefix = "intermediate_seqkit_replace" + publishDir = [ + path: { "${params.outdir}/qc/${meta.id}/" }, + mode: params.publish_dir_mode, + enabled: false, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FAA_SEQFU_SEQKIT:SEQKIT_RMDUP' { + ext.args = { params.remove_duplicates_on_sequence ? "--by-seq" : '' } + publishDir = [ + path: { "${params.outdir}/qc/${meta.id}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:ARIA2_PFAM' { + publishDir = [ + path: { "${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:ARIA2_FUNFAM' { + publishDir = [ + path: { "${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_PFAM' { + ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } + publishDir = [ + path: { "${params.outdir}/domain_annotation/pfam/" }, + mode: params.publish_dir_mode, + pattern: "*.domtbl.gz", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_FUNFAM' { + ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } + publishDir = [ + path: { "${params.outdir}/domain_annotation/funfam/" }, + mode: params.publish_dir_mode, + pattern: "*.domtbl.gz", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:ARIA2' { + publishDir = [ + path: { "${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:UNTAR' { + publishDir = [ + path: { "${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:INTERPROSCAN' { + publishDir = [ + path: { "${params.outdir}/functional_annotation/interproscan/${meta.id}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + ] + ext.args = [ + "--applications ${params.interproscan_applications}", + params.interproscan_enableprecalc ? 
'' : '--disable-precalc' + ].join(' ').trim() + cpus = 1 // bugs with more + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:S4PRED_RUNMODEL' { + ext.prefix = { params.s4pred_outfmt } + ext.args = { "--outfmt ${params.s4pred_outfmt}" } + publishDir = [ + path: { "${params.outdir}/s4pred/${meta.id}/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: 'MULTIQC' { diff --git a/conf/test.config b/conf/test.config index 45fd379..252ec87 100644 --- a/conf/test.config +++ b/conf/test.config @@ -14,17 +14,20 @@ process { resourceLimits = [ cpus: 4, memory: '15.GB', - time: '1.h' + time: '1.h', ] } params { - config_profile_name = 'Test profile' + config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'// Genome references - genome = 'R64-1-1' + input = params.pipelines_testdata_base_path + 'proteinannotator/samplesheet/samplesheet.csv' + // Domain annotation + pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' + funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' + // Functional annotation + interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan/interproscan_test.tar.gz' + interproscan_applications = 'Hamap,TIGRFAM,sfld' } diff --git a/conf/test_full.config b/conf/test_full.config index ae76d99..bfb05f7 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -11,14 +11,15 @@ */ params { - config_profile_name = 'Full test profile' + 
config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Genome references - genome = 'R64-1-1' + input = params.pipelines_testdata_base_path + 'proteinannotator/testdata/samplesheet.csv' + // Domain annotation + pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' + funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' + // Functional annotation + interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan_test.tar.gz' + interproscan_applications = 'Hamap,TIGRFAM,sfld' } diff --git a/docs/images/nf-core-proteinannotator_logo_dark.png b/docs/images/nf-core-proteinannotator_logo_dark.png index 58297a5..5bf26e4 100644 Binary files a/docs/images/nf-core-proteinannotator_logo_dark.png and b/docs/images/nf-core-proteinannotator_logo_dark.png differ diff --git a/docs/images/nf-core-proteinannotator_logo_light.png b/docs/images/nf-core-proteinannotator_logo_light.png index d7e77e4..1a55db1 100644 Binary files a/docs/images/nf-core-proteinannotator_logo_light.png and b/docs/images/nf-core-proteinannotator_logo_light.png differ diff --git a/docs/images/proteinannotator_metromap_dark.png b/docs/images/proteinannotator_metromap_dark.png new file mode 100644 index 0000000..43036af Binary files /dev/null and b/docs/images/proteinannotator_metromap_dark.png differ diff --git a/docs/images/proteinannotator_metromap_dark.svg b/docs/images/proteinannotator_metromap_dark.svg new file mode 
100644 index 0000000..8e92969 --- /dev/null +++ b/docs/images/proteinannotator_metromap_dark.svg @@ -0,0 +1,4 @@ + + + +
4
4
1
1
2
2
STAGE
1. Check quality and pre-process
2. Annotate domains
3. Annotate functions
4. Predict secondary structures
STAGE...
seqfu
stats
seqfu...
seqfu
stats
seqfu...
seqkit
seq
seqkit...
seqkit
replace
seqkit...
seqkit
rmdup
seqkit...
FAA
FAA
Amino acid sequence annotation
Database download
Amino acid sequence annotation...
METHOD

METHOD
s4pred
runmodel
s4pred...
HTML
HTML
MultiQC
MultiQC
aria2
aria2
PFAM
PFAM
FUNFAM
FUNFAM
hmmer
hmmsearch
hmmer...
DOMTBL
DOMTBL
SS2
SS2
3
3
interproscan
interproscan
GFF3
GFF3
TSV
TSV
JSON
JSON
XML
XML
diff --git a/docs/images/proteinannotator_metromap_light.png b/docs/images/proteinannotator_metromap_light.png new file mode 100644 index 0000000..11f5656 Binary files /dev/null and b/docs/images/proteinannotator_metromap_light.png differ diff --git a/docs/images/proteinannotator_metromap_light.svg b/docs/images/proteinannotator_metromap_light.svg new file mode 100644 index 0000000..4256470 --- /dev/null +++ b/docs/images/proteinannotator_metromap_light.svg @@ -0,0 +1,4 @@ + + + +
4
4
1
1
2
2
STAGE
1. Check quality and pre-process
2. Annotate domains
3. Annotate functions
4. Predict secondary structures
STAGE...
seqfu
stats
seqfu...
seqfu
stats
seqfu...
seqkit
seq
seqkit...
seqkit
replace
seqkit...
seqkit
rmdup
seqkit...
FAA
FAA
Amino acid sequence annotation
Database download
Amino acid sequence annotation...
METHOD

METHOD
s4pred
runmodel
s4pred...
HTML
HTML
MultiQC
MultiQC
aria2
aria2
PFAM
PFAM
FUNFAM
FUNFAM
hmmer
hmmsearch
hmmer...
DOMTBL
DOMTBL
SS2
SS2
3
3
interproscan
interproscan
GFF3
GFF3
TSV
TSV
JSON
JSON
XML
XML
diff --git a/docs/output.md b/docs/output.md index d0c6618..fcd3159 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,28 +6,366 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC +- [Quality control and preprocessing](#quality-control-and-preprocessing) + - [SeqFu](#seqfu) for input amino acid sequences quality control (QC) + - [SeqKit](#seqkit) for preprocessing input amino acid sequences (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) +- [Database download](#database-download) Optionally download selected databases for annotation. + - [aria2](#aria2) - To optionally download the Pfam, FunFam, and/or InterProScan databases through the pipeline. +- [Domain annotation](#domain-annotation) Annotate proteins with domains from established repositories. + - [hmmer](#hmmer) - To optionally match the input sequence to known Pfam and/or FunFam domains through `hmmer/hmmsearch` +- [Functional annotation](#functional-annotation) Annotate proteins with functional domains + - [InterProScan](#interproscan) - Search the InterProScan database for functional domains +- [s4pred](#s4pred) - Predict secondary structures of sequences, producing amino acid level probabilities of forming an α-helix, a β-strand or a coil. - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +### Quality control and preprocessing + +#### SeqFu + +
+Output files + +- `qc/` + - `/` + - `_before.tsv`: Statistics for the input amino acid sequences before preprocessing + - `_before_mqc.txt`: Statistics for the input amino acid sequences in MultiQC-ready format before preprocessing + - `_after.tsv`: (optional) Statistics for the input amino acid sequences after preprocessing + - `_after_mqc.txt`: (optional) Statistics for the input amino acid sequences in MultiQC-ready format after preprocessing + - `.log`: (optional) Output file with count of duplicate sequences that were found and removed + +
+ +The `seqfu` module is used for statistics generation of input amino acid sequences, both before and after preprocessing. + +[SeqFu](https://github.com/telatin/seqfu2) is a cross-platform compiled suite of tools to manipulate and inspect `FASTA` and `FASTQ` files. + +#### SeqKit + +
+Output files + +- `qc/` + - `/` + - `.`: Updated preprocessed input fasta file + +
+ +The `seqkit` module is used for initial preprocessing (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) of the input amino acid sequences. + +[SeqKit](https://github.com/shenwei356/seqkit) is a cross-platform and ultrafast toolkit for FASTA/Q file manipulation. + +### Database download + +#### aria2 + +
+Output files + +- `downloaded_dbs/` + - `interproscan_db/`: (optional) uncompressed archive data from the downloaded InterProScan database + - `*/`: (optional) one directory for each of the member databases of InterProScan + - `Pfam-A*.hmm.gz`: (optional) The latest full, or a minimal test, Pfam-A HMM database that can be downloaded through the pipeline. + - `interproscan_test.tar.gz`: (optional) the downloaded InterProScan archive of member databases according to the optional user-provided url + - `funfam-hmm3-v4_3_0*.lib.gz`: (optional) The latest (v4_3_0) full, or a minimal test, FunFam HMM database that can be downloaded through the pipeline. + +
+ +If the `skip_*` flag (e.g., `skip_pfam`, `skip_funfam`, `skip_interproscan`) for an annotation database is set to `true`, or the corresponding `*_db` parameter path (e.g., `pfam_db`, `funfam_db`, `interproscan_db`) is set (i.e., not `null`), or the run is resumed after a successful database download, then the respective database will not be (re)downloaded. The full database links can be found in the main `nextflow.config` file, while minimal test versions can be found in the `test` and `test_full` profiles (i.e., `conf/test.config`, `conf/test_full.config`). + +[aria2](https://github.com/aria2/aria2/) is a lightweight multi-protocol & multi-source, cross platform download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, BitTorrent and Metalink. + +### Domain annotation + +#### hmmer
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `domain_annotation/` + - `pfam/` + - `.domtbl.gz`: `hmmer/hmmsearch` results along with parameter information. + - `funfam/` + - `.domtbl.gz`: `hmmer/hmmsearch` results along with parameter information.
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +Each of the `domain_annotation/` subfolders (e.g., `pfam`, `funfam`) contains a `.domtbl.gz` annotation file per input sample, depending on which domain annotation databases were used in the pipeline execution. + +[hmmer](https://github.com/EddyRivasLab/hmmer) is a suite of tools for searching sequence databases for sequence homologs, and for making sequence alignments, using profile hidden Markov models (profile HMMs). + +### Functional annotation + +#### InterProScan + +
+Output files + +- `functional_annotation/` + - `interproscan/` + - `/` + - `.gff`: general feature format (GFF) file + - `.json`: javascript object notation (JSON) file + - `.tsv`: tab-separated variable (TSV) file + - `.xml`: eXtensible markup language (XML) file + +
+ +[InterProScan](https://interproscan-docs.readthedocs.io/en/v5/#) is a protein annotation tool that searches [InterPro](http://www.ebi.ac.uk/interpro/), a database which integrates predictive information about protein function from a number of member resources, giving an overview of the families that a protein belongs to and the domains and sites it contains. The default database applications that are used to functionally annotate sequences include +Hamap, PANTHER, PIRSF, TIGRFAM and sfld, and are set through the `--interproscan_applications` parameter. + +See also [InterProScan output documentation](https://interproscan-docs.readthedocs.io/en/v5/), where most of these examples are taken from. + +##### Generic Feature Format Version 3 (GFF3) Output + +The GFF3 format is a flat tab-delimited file, which is much richer than the TSV output. It allows you to trace back from matches to predicted proteins and to nucleic acid sequences. It also contains a FASTA format representation of the predicted protein sequences and their matches. You will find documentation of all the columns and attributes used at http://www.sequenceontology.org/gff3.shtml. + +
+Example InterProScan GFF output + +``` +##gff-version 3 +##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269 +##interproscan-version 5.26-65.0 +##sequence-region AACH01000027 1 1347 +##seqid|source|type|start|end|score|strand|phase|attributes +AACH01000027 provided_by_user nucleic_acid 1 1347 . + . Name=AACH01000027;md5=b2a7416cb92565c004becb7510f46840;ID=AACH01000027 +AACH01000027 getorf ORF 1 1347 . + . Name=AACH01000027.2_21;Target=pep_AACH01000027_1_1347 1 449;md5=b2a7416cb92565c004becb7510f46840;ID=orf_AACH01000027_1_1347 +AACH01000027 getorf polypeptide 1 449 . + . md5=fd0743a673ac69fb6e5c67a48f264dd5;ID=pep_AACH01000027_1_1347 +AACH01000027 Pfam protein_match 84 314 1.2E-45 + . Name=PF00696;signature_desc=Amino acid kinase family;Target=null 84 314;status=T;ID=match$8_84_314;Ontology_term="GO:0008652";date=15-04-2013;Dbxref="InterPro:IPR001048","Reactome:REACT_13" +##sequence-region 2 +... +>pep_AACH01000027_1_1347 +LVLLAAFDCIDDTKLVKQIIISEIINSLPNIVNDKYGRKVLLYLLSPRDPAHTVREIIEV +LQKGDGNAHSKKDTEIRRREMKYKRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEA +GHELILVSSGAIAAGFGALGFKKRPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQI +LLTQDDFVDKRRYKNAHQALSVLLNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQ +ADLLVFLTDVDGLYTGNPNSDPRAKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAA +TIATESGVPVYICSSLKSDSMIEAAEETEDGSYFVAQEKGLRTQKQWLAFYAQSQGSIWV +DKGAAEALSQYGKSLLLSGIVEAEGVFSYGDIVTVFDKESGKSLGKGRVQFGASALEDML +RSQKAKGVLIYRDDWISITPEIQLLFTEF +... +>match$8_84_314 +KRIVFKVGTSSLTNEDGSLSRSKVKDITQQLAMLHEAGHELILVSSGAIAAGFGALGFKK +RPTKIADKQASAAVGQGLLLEEYTTNLLLRQIVSAQILLTQDDFVDKRRYKNAHQALSVL +LNRGAIPIINENDSVVIDELKVGDNDTLSAQVAAMVQADLLVFLTDVDGLYTGNPNSDPR +AKRLERIETINREIIDMAGGAGSSNGTGGMLTKIKAATIATESGVPVYICS +``` + +
+ +##### JavaScript Object Notation (JSON) Output + +JSON representation of the matches - an alternative to XML format. As new releases are made public, the changes to the expected JSON format are documented in [Change log for InterProScan JSON output format](https://interproscan-docs.readthedocs.io/en/v5/JSONOutputFormatHistory.html#change-log-for-interproscan-json-output-format). + +
+Example InterProScan JSON output + +``` +{ + "interproscan-version": "5.26-65.0", +"results": [{ + "sequence" : "MSKIGKSIRLERIIDRKTRKTVIVPMDHGLTVGPIPGLIDLAAAVDKVAEGGANAVLGHMGLPLYGHRGYGKDVGLIIHLSASTSLGPDANHKVLVTRVEDAIRVGADGVSIHVNVGAEDEAEMLRDLGMVARRCDLWGMPLLAMMYPRGAKVRSEHSVEYVKHAARVGAELGVDIVKTNYTGSPETFREVVRGCPAPVVIAGGPKMDTEADLLQMVYDAMQAGAAGISIGRNIFQAENPTLLTRKLSKIVHEGYTPEEAARLKL", + "md5" : "88d47cc807fe8e977130b0cc93e0bd61", + "matches" : [ { + "signature" : { + "accession" : "PIRSF038992", + "name" : "Aldolase_Ia", + "description" : null, + "type" : null, + "signatureLibraryRelease" : { + "library" : "PIRSF", + "version" : "3.01" + }, + "models" : { + "PIRSF038992" : { + "accession" : "PIRSF038992", + "name" : "Aldolase_Ia", + "description" : null, + "key" : "PIRSF038992" + } + }, + "entry" : { + "accession" : "IPR002915", + "name" : "DeoC/FbaB/lacD_aldolase", + "description" : "DeoC/FbaB/ lacD aldolase", + "type" : "FAMILY", + "goXRefs" : [ { + "identifier" : "GO:0016829", + "name" : "lyase activity", + "databaseName" : "GO", + "category" : "MOLECULAR_FUNCTION" + } ], + "pathwayXRefs" : [ { + "identifier" : "R-HSA-71336", + "name" : "Pentose phosphate pathway (hexose monophosphate shunt)", + "databaseName" : "Reactome" + }, { + "identifier" : "R-HSA-6798695", + "name" : "Neutrophil degranulation", + "databaseName" : "Reactome" + } ] + } + }, + "locations" : [ { + "start" : 1, + "end" : 265, + "hmmStart" : 2, + "hmmEnd" : 262, + "hmmBounds" : "INCOMPLETE", + "evalue" : 3.3E-94, + "score" : 302.6, + "envelopeStart" : 1, + "envelopeEnd" : 265 + } ], + "evalue" : 3.0E-94, + "score" : 302.7 + }, { + ... +}] +} +``` + +
+ +##### Tab-separated values format (TSV) Output + +TSV: Basic tab delimited format. Outputs only those sequences with domain matches. + +
+Example InterProScan TSV output + +``` +P51587 14086411a2cdf1c4cba63020e1622579 3418 Pfam PF09103 BRCA2, oligonucleotide/oligosaccharide-binding, domain 1 2670 2799 7.9E-43 T 15-03-2013 +P51587 14086411a2cdf1c4cba63020e1622579 3418 ProSiteProfiles PS50138 BRCA2 repeat profile. 1002 1036 0.0 T 18-03-2013 IPR002093 BRCA2 repeat GO:0005515|GO:0006302 +P51587 14086411a2cdf1c4cba63020e1622579 3418 Gene3D G3DSA:2.40.50.140 2966 3051 3.1E-52 T 15-03-2013 +... +``` + +The TSV format presents the match data in columns as follows: + +1. Protein accession (e.g. P51587) +2. Sequence MD5 digest (e.g. 14086411a2cdf1c4cba63020e1622579) +3. Sequence length (e.g. 3418) +4. Analysis (e.g. Pfam / PRINTS / Gene3D) +5. Signature accession (e.g. PF09103 / G3DSA:2.40.50.140) +6. Signature description (e.g. BRCA2 repeat profile) +7. Start location +8. Stop location +9. Score - is the e-value (or score) of the match reported by member database method (e.g. 3.1E-52) +10. Status - is the status of the match (T: true) +11. Date - is the date of the run +12. InterPro annotations - accession (e.g. IPR002093) +13. InterPro annotations - description (e.g. BRCA2 repeat) +14. GO annotations with their source(s), e.g. GO:0005515(InterPro)|GO:0006302(PANTHER)|GO:0007195(InterPro,PANTHER). This is an optional column; only displayed if the `--goterms` option is switched on +15. Pathways annotations, e.g. REACT_71. This is an optional column; only displayed if the `--pathways` option is switched on + +If a value is missing in a column, for example, the match has no InterPro annotation, a ‘-‘ is displayed. + +
+ +##### Extensible Markup Language (XML) Output + +XML representation of the matches - this is the richest form of the data. + +The XML Schema Definition (XSD) files are available [here](http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/schemas/). + +
+Example InterProScan XML output + +``` + + + + MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEPAEESEHKNNNYEPNLFKTPQRKPSYNQLASTPIIFKEQGLTLPLYQSPVKELDKFKLDLGRNVPNSRHKSLRTVKTKMDQADDVSCPLLNSCLSESPVVLQCTHVTPQRDKSVVCGSLFHTPKFVKGRQTPKHISESLGAEVDPDMSWSSSLATPPTLSSTVLIVRNEEASETVFPHDTTANVKSYFSNHDESLKKNDRFIASVTDSENTNQREAASHGFGKTSGNSFKVNSCKDHIGKSMPNVLEDEVYETVVDTSEEDSFSLCFSKCRTKNLQKVRTSKTRKKIFHEANADECEKSKNQVKEKYSFVSEVEPNDTDPLDSNVAHQKPFESGSDKISKEVVPSLACEWSQLTLSGLNGAQMEKIPLLHISSCDQNISEKDLLDTENKRKKDFLTSENSLPRISSLPKSEKPLNEETVVNKRDEEQHLESHTDCILAVKQAISGTSPVASSFQGIKKSIFRIRESPKETFNASFSGHMTDPNFKKETEASESGLEIHTVCSQKEDSLCPNLIDNGSWPATTTQNSVALKNAGLISTLKKKTNKFIYAIHDETSYKGKKIPKDQKSELINCSAQFEANAFEAPLTFANADSGLLHSSVKRSCSQNDSEEPTLSLTSSFGTILRKCSRNETCSNNTVISQDLDYKEAKCNKEKLQLFITPEADSLSCLQEGQCENDPKSKKVSDIKEEVLAAACHPVQHSKVEYSDTDFQSQKSLLYDHENASTLILTPTSKDVLSNLVMISRGKESYKMSDKLKGNNYESDVELTKNIPMEKNQDVCALNENYKNVELLPPEKYMRVASPSRKVQFNQNTNLRVIQKNQEETTSISKITVNPDSEELFSDNENNFVFQVANERNNLALGNTKELHETDLTCVNEPIFKNSTMVLYGDTGDKQATQVSIKKDLVYVLAEENKNSVKQHIKMTLGQDLKSDISLNIDKIPEKNNDYMNKWAGLLGPISNHSFGGSFRTASNKEIKLSEHNIKKSKMFFKDIEEQYPTSLACVEIVNTLALDNQKKLSKPQSINTVSAHLQSSVVVSDCKNSHITPQMLFSKQDFNSNHNLTPSQKAEITELSTILEESGSQFEFTQFRKPSYILQKSTFEVPENQMTILKTTSEECRDADLHVIMNAPSIGQVDSSKQFEGTVEIKRKFAGLLKNDCNKSASGYLTDENEVGFRGFYSAHGTKLNVSTEALQKAVKLFSDIENISEETSAEVHPISLSSSKCHDSVVSMFKIENHNDKTVSEKNNKCQLILQNNIEMTTGTFVEEITENYKRNTENEDNKYTAASRNSHNLEFDGSDSSKNDTVCIHKDETDLLFTDQHNICLKLSGQFMKEGNTQIKEDLSDLTFLEVAKAQEACHGNTSNKEQLTATKTEQNIKDFETSDTFFQTASGKNISVAKESFNKIVNFFDQKPEELHNFSLNSELHSDIRKNKMDILSYEETDIVKHKILKESVPVGTGNQLVTFQGQPERDEKIKEPTLLGFHTASGKKVKIAKESLDKVKNLFDEKEQGTSEITSFSHQWAKTLKYREACKDLELACETIEITAAPKCKEMQNSLNNDKNLVSIETVVPPKLLSDNLCRQTENLKTSKSIFLKVKVHENVEKETAKSPATCYTNQSPYSVIENSALAFYTSCSRKTSVSQTSLLEAKKWLREGIFDGQPERINTADYVGNYLYENNSNSTIAENDKNHLSEKQDTYLSNSSMSNSYSYHSDEVYNDSGYLSKNKLDSGIEPVLKNVEDQKNTSFSKVISNVKDANAYPQTVNEDICVEELVTSSSPCKNKNAAIKLSISNSNNFEVGPPAFRIASGKIVCVSHETIKKVKDIFTDSFSKVIKENNENKSKICQTKIMAGCYEALDDSEDILHNSLDNDECSTHSHKVFADIQSEEILQHNQNMSGLEKVSKISPCDVSL
ETSDICKCSIGKLHKSVSSANTCGIFSTASGKSVQVSDASLQNARQVFSEIEDSTKQVFSKVLFKSNEHSDQLTREENTAIRTPEHLISQKGFSYNVVNSSAFSGFSTASGKQVSILESSLHKVKGVLEEFDLIRTEHSLHYSPTSRQNVSKILPRVDKRNPEHCVNSEMEKTCSKEFKLSNNLNVEGGSSENNHSIKVSPYLSQFQQDKQQLVLGTKVSLVENIHVLGKEQASPKNVKMEIGKTETFSDVPVKTNIEVCSTYSKDSENYFETEAVEIAKAFMEDDELTDSKLPSHATHSLFTCPENEEMVLSNSRIGKRRGEPLILVGEPSIKRNLLNEFDRIIENQEKSLKASKSTPDGTIKDRRLFMHHVSLEPITCVPFRTTKERQEIQNPNFTAPGQEFLSKSHLYEHLTLEKSSSNLAVSGHPFYQVSATRNEKMRHLITTGRPTKVFVPPFKTKSHFHRVEQCVRNINLEENRQKQNIDGHGSDDSKNKINDNEIHQFNKNNSNQAAAVTFTKCEEEPLDLITSLQNARDIQDMRIKKKQRQRVFPQPGSLYLAKTSTLPRISLKAAVGGQVPSACSHKQLYTYGVSKHCIKINSKNAESFQFHTEDYFGKESLWTGKGIQLADGGWLIPSNDGKAGKEEFYRALCDTPGVDPKLISRIWVYNHYRWIIWKLAAMECAFPKEFANRCLSPERVLLQLKYRYDTEIDRSRRSAIKKIMERDDTAAKTLVLCVSDIISLSANISETSSNKTSSADTQKVAIIELTDGWYAVKAQLDPPLLAVLKNGRLTVGQKIILHGAELVGSPDACTPLEAPESLMLKISANSTRPARWYTKLGFFPDPRPFPLPLSSLFSDGGNVGCVDVIIQRAYPIQWMEKTSSGLYIFRNEREEEKEAAKYVEAQQKRLEALFTKIQEEFEEHEENTTKPYLPSRALTRQQVRALQDGAELYEAVKNAADPAYLEGYFSEEQLRALNNHRQMLNDKKQAQIQLEIRKAMESAEQKEQGLSRDVTTVWKLRIVSYSKKEKDSVILSIWRPSSDLYSLLTEGKRYRIYHLATSKSKSKSERANIQLAATKKTQYQQLPVSDEILFQIYQPREPLHFSKFLDPDFQPSCSEVDLIGFVVSVVKKTGLAPFVYLSDECYNLLAIKFWIDLNEDIIKPHMLIAASNLQWRPESKSGLLTLFAGDFSVFSASPKEGHFQETFNKMKNTVENIDILCNEAENKLMHILHANDPKWSTPTKDCTSGPYTAQIIPGTGNKLLMSSPNCEIYYQSPLSLCMAKRKSVSTPVSAQMTSKSCKGEKEIDDQKNCKKRRALDFLSRLPLPPPVSPICTFVSPAAQKAFQPPRSCGTKYETPIKKKELNSPQMTPFKKFNEISLLESNSIADEELALINTQALLSGSTGEKQFISVSESTRTAPTSSEDYLRLKRRCTTSLIKEQESSQASTEECEKNKQDTITTKKYI + + +... + + + + + + + + + + + + + + + + +... + + + + + + + + + + + + + + + + + +... + + + + + + + + + + + + + + + + ... + + + ... + + + + + ... + + + +``` + +
+ +#### s4pred + +
+Output files + +- `s4pred/` + - `/` + - `/` + - `.`: The probability of each amino acid to be an α-helix, a β-strand or a coil, in the chosen output format (i.e., 'ss2', 'fas', or 'horiz'). + +
+ +The `s4pred` module is used to predict secondary structures of amino acid sequences. + +[s4pred](https://github.com/psipred/s4pred) is a tool for accurate prediction of a protein's secondary structure from only its amino acid sequence. ### MultiQC @@ -45,6 +383,18 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +### SeqKit stats + +
+Output files + +- `seqkit/` + - `{prefix}.tsv`: output of `seqkit stats` command on `{prefix}.fasta` input file, in tab-delimited text format. + +
+ +[SeqKit stats](https://bioinf.shenwei.me/seqkit/usage/#stats) generates simple statistics for protein FASTA files, such as number of residues, minimal sequence length, average sequence length, and maximal sequence length. + ### Pipeline information
diff --git a/docs/usage.md b/docs/usage.md index 694f0b4..3ac3aa4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -16,39 +16,21 @@ You will need to create a samplesheet with information about the samples you wou --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +A final samplesheet file consisting of multiple input fasta files from different species is below. 
```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +id,fasta +species1,species1_proteins.fasta +species1_v2,species1_v2_proteins.fasta +species2,species2_proteins.fasta ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `id` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fasta` | Full path to protein fasta file. File may be gzipped and have the extension ".fasta", ".fasta.gz", ".fa" or ".fa.gz". | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
@@ -57,7 +39,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/proteinannotator --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/proteinannotator --input ./samplesheet.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -89,13 +71,57 @@ with: ```yaml title="params.yaml" input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' <...> ``` You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Updating the pipeline +## Functional Annotation Options + +### InterProScan + +[InterProScan](https://github.com/ebi-pf-team/interproscan) is used to provide more information about the proteins annotated on the contigs. By default, turning on this subworkflow without `--skip_interproscan` will download and unzip the InterPro database. The database will then be saved in the output directory `/downloaded_dbs/interproscan_db/`. We recommend keeping a copy of this directory for future reuse in case the results folder is deleted. + +:::note +The large database download (5.5GB) can take up to 4 hours depending on the bandwidth. +::: + +A local version of the database can be supplied to the pipeline by passing the InterProScan database directory to `--interproscan_db `. The directory can be created by running (e.g. 
for database version 5.72-103.0): + +``` +curl -L https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/interproscan-5.72-103.0-64-bit.tar.gz -o interproscan_db/interproscan-5.72-103.0-64-bit.tar.gz +tar -xzf interproscan_db/interproscan-5.72-103.0-64-bit.tar.gz -C interproscan_db/ + +``` + +The contents of the database directory should include the directory `data` in the top level with a number of subdirectories: + +``` +interproscan_db/ + └── data/ + ├── antifam + ├── cdd + ├── funfam + ├── gene3d + ├── hamap + ├── ncbifam + ├── panther + | └── [18.0] + ├── pfam + | └── [36.0] + ├── phobius + ├── pirsf + ├── pirsr + ├── prints + ├── prosite + | └── [2023_05] + ├── sfld + ├── smart + ├── superfamily + └── tmhmm +``` + +## Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -103,7 +129,7 @@ When you run the above command, Nextflow automatically pulls the pipeline code f nextflow pull nf-core/proteinannotator ``` -### Reproducibility +## Reproducibility It is a good idea to specify the pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. 
@@ -149,7 +175,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `shifter` - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + - A generic configuration profile to be used with [Charliecloud](https://charliecloud.io/) - `apptainer` - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `wave` diff --git a/main.nf b/main.nf index 8bc9175..98d7d67 100644 --- a/main.nf +++ b/main.nf @@ -18,19 +18,6 @@ include { PROTEINANNOTATOR } from './workflows/proteinannotator' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_proteinannotator_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_proteinannotator_pipeline' -include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_proteinannotator_pipeline' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - GENOME PARAMETER VALUES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` -params.fasta = getGenomeAttribute('fasta') - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NAMED WORKFLOWS FOR PIPELINE @@ -51,7 +38,18 @@ workflow NFCORE_PROTEINANNOTATOR { // WORKFLOW: Run pipeline // PROTEINANNOTATOR ( - samplesheet + samplesheet, + params.skip_preprocessing, + params.skip_pfam, + params.pfam_db, + params.pfam_latest_link, + params.skip_funfam, + params.funfam_db, + params.funfam_latest_link, + params.skip_interproscan, + params.interproscan_db_url, + params.interproscan_db, + params.skip_s4pred 
) emit: multiqc_report = PROTEINANNOTATOR.out.multiqc_report // channel: /path/to/multiqc_report.html @@ -74,7 +72,10 @@ workflow { params.monochrome_logs, args, params.outdir, - params.input + params.input, + params.help, + params.help_full, + params.show_hidden ) // diff --git a/modules.json b/modules.json index a7609b9..37ba5b8 100644 --- a/modules.json +++ b/modules.json @@ -5,33 +5,83 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { - "fastqc": { + "aria2": { "branch": "master", - "git_sha": "08108058ea36a63f141c25c4e75f9f872a5b2296", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "hmmer/hmmsearch": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "interproscan": { + "branch": "master", + "git_sha": "50ab4e0a9616556589152fef559b50a6e86c326b", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", + "git_sha": "80cba9452fb1e9bb79884976fa1ca0e671949aa2", + "installed_by": ["modules"] + }, + "s4pred/runmodel": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "seqfu/stats": { + "branch": "master", + "git_sha": "e753770db613ce014b3c4bc94f6cba443427b726", + "installed_by": ["faa_seqfu_seqkit"] + }, + "seqkit/replace": { + "branch": "master", + "git_sha": "2eccc519c7d25e24c8ef2a5a94062630b8383daf", + "installed_by": ["faa_seqfu_seqkit"] + }, + "seqkit/rmdup": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["faa_seqfu_seqkit"] + }, + "seqkit/seq": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["faa_seqfu_seqkit"] + }, + "seqkit/stats": { + "branch": "master", + "git_sha": "28935b89b7e1f19e835f8c6e4c8322d4b505dded", + "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + 
"git_sha": "447f7bc0fa41dfc2400c8cad4c0291880dc060cf", "installed_by": ["modules"] } } }, "subworkflows": { "nf-core": { + "faa_seqfu_seqkit": { + "branch": "master", + "git_sha": "2eccc519c7d25e24c8ef2a5a94062630b8383daf", + "installed_by": ["subworkflows"] + }, "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", + "git_sha": "65f5e638d901a51534c68fd5c1c19e8112fb4df1", "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", + "git_sha": "fdc08b8b1ae74f56686ce21f7ea11ad11990ce57", "installed_by": ["subworkflows"] } } diff --git a/modules/nf-core/aria2/environment.yml b/modules/nf-core/aria2/environment.yml new file mode 100644 index 0000000..4536048 --- /dev/null +++ b/modules/nf-core/aria2/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::aria2=1.36.0 diff --git a/modules/nf-core/aria2/main.nf b/modules/nf-core/aria2/main.nf new file mode 100644 index 0000000..f3ec399 --- /dev/null +++ b/modules/nf-core/aria2/main.nf @@ -0,0 +1,47 @@ +process ARIA2 { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/aria2:1.36.0' : + 'biocontainers/aria2:1.36.0' }" + + input: + tuple val(meta), val(source_url) + + output: + tuple val(meta), path("$downloaded_file"), emit: downloaded_file + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + downloaded_file = source_url.split("/")[-1] + + """ + aria2c \\ + --check-certificate=false \\ + ${args} \\ + ${source_url} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ + + stub: + downloaded_file = source_url.split("/")[-1] + + """ + touch ${downloaded_file} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(echo \$(aria2c --version 2>&1) | grep 'aria2 version' | cut -f3 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/aria2/meta.yml b/modules/nf-core/aria2/meta.yml new file mode 100644 index 0000000..9dc7289 --- /dev/null +++ b/modules/nf-core/aria2/meta.yml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "aria2" +description: CLI Download utility +keywords: + - download + - utility + - http(s) +tools: + - "aria2": + description: "aria2 is a lightweight multi-protocol & multi-source, cross platform + download utility operated in command-line. It supports HTTP/HTTPS, FTP, SFTP, + BitTorrent and Metalink." + homepage: "https://aria2.github.io/" + documentation: "https://aria2.github.io/manual/en/html/index.html" + tool_dev_url: "https://github.com/aria2/aria2/" + licence: ["GPL v2"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]` + - source_url: + type: string + description: Source URL to be downloaded + pattern: "{http,https}*" + ontologies: + - edam: "http://edamontology.org/data_1052" # URL +output: + downloaded_file: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - $downloaded_file: + type: file + description: Downloaded file from source + pattern: "*.*" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: "http://edamontology.org/format_3750" # YAML +authors: + - "@JoseEspinosa" + - "@leoisl" +maintainers: + - "@JoseEspinosa" + - "@leoisl" diff --git a/modules/nf-core/aria2/tests/main.nf.test b/modules/nf-core/aria2/tests/main.nf.test new file mode 100644 index 0000000..ba03351 --- /dev/null +++ b/modules/nf-core/aria2/tests/main.nf.test @@ -0,0 +1,45 @@ +nextflow_process { + name "Test Process ARIA2" + script "../main.nf" + process "ARIA2" + tag "modules" + tag "modules_nfcore" + tag "aria2" + + test("sarscov2 Illumina single end [bam]") { + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + params.test_data['sarscov2']['illumina']['test_single_end_bam'] // https URL + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("sarscov2 Illumina single end [bam] - stub") { + options "-stub-run" + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + params.test_data['sarscov2']['illumina']['test_single_end_bam'] // https URL + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/aria2/tests/main.nf.test.snap b/modules/nf-core/aria2/tests/main.nf.test.snap new file mode 100644 index 0000000..96911f6 --- /dev/null +++ 
b/modules/nf-core/aria2/tests/main.nf.test.snap @@ -0,0 +1,60 @@ +{ + "sarscov2 Illumina single end [bam] - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.single_end.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,1d3d763f0ff390b632205a498112b076" + ], + "downloaded_file": [ + [ + { + "id": "test" + }, + "test.single_end.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,1d3d763f0ff390b632205a498112b076" + ] + } + ], + "timestamp": "2023-12-14T17:34:30.569759" + }, + "sarscov2 Illumina single end [bam]": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.single_end.bam:md5,21afed4c3e007de5e007cc5cbaebede7" + ] + ], + "1": [ + "versions.yml:md5,1d3d763f0ff390b632205a498112b076" + ], + "downloaded_file": [ + [ + { + "id": "test" + }, + "test.single_end.bam:md5,21afed4c3e007de5e007cc5cbaebede7" + ] + ], + "versions": [ + "versions.yml:md5,1d3d763f0ff390b632205a498112b076" + ] + } + ], + "timestamp": "2023-12-14T17:34:22.216677" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml deleted file mode 100644 index 691d4c7..0000000 --- a/modules/nf-core/fastqc/environment.yml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf deleted file mode 100644 index 033f415..0000000 --- a/modules/nf-core/fastqc/main.nf +++ /dev/null @@ -1,64 +0,0 @@ -process FASTQC { - tag "${meta.id}" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // Make list of old name and new name pairs to use for renaming in the bash while loop - def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') - def renamed_files = old_new_pairs.collect{ _old_name, new_name -> new_name }.join(' ') - - // The total amount of allocated RAM by FastQC is equal to the number of threads defined (--threads) time the amount of RAM defined (--memory) - // https://github.com/s-andrews/FastQC/blob/1faeea0412093224d7f6a07f777fad60a5650795/fastqc#L211-L222 - // Dividing the task.memory by task.cpu allows to stick to requested amount of RAM in the label - def memory_in_mb = task.memory ? task.memory.toUnit('MB').toFloat() / task.cpus : null - // FastQC memory value allowed range (100 - 10000) - def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 
100 : memory_in_mb) - - """ - printf "%s %s\\n" ${rename_to} | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done - - fastqc \\ - ${args} \\ - --threads ${task.cpus} \\ - --memory ${fastqc_memory} \\ - ${renamed_files} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml deleted file mode 100644 index 2b2e62b..0000000 --- a/modules/nf-core/fastqc/meta.yml +++ /dev/null @@ -1,67 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. - It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ - licence: ["GPL-2.0-only"] - identifier: biotools:fastqc -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - html: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - "*.html": - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - "*.zip": - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" -maintainers: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test deleted file mode 100644 index e9d79a0..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ /dev/null @@ -1,309 +0,0 @@ -nextflow_process { - - name "Test Process FASTQC" - script "../main.nf" - process "FASTQC" - - tag "modules" - tag "modules_nfcore" - tag "fastqc" - - test("sarscov2 single-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
Mon 2 Oct 2023
test.gz
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 paired-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 interleaved [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 paired-end [bam]") { 
- - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 multiple [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, - { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, - { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, 
- { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 custom_prefix") { - - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 single-end [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 paired-end [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 interleaved [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - 
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 paired-end [bam] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 multiple [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 custom_prefix - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } -} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap deleted file mode 100644 index 
d5db309..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ /dev/null @@ -1,392 +0,0 @@ -{ - "sarscov2 custom_prefix": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:16.374038" - }, - "sarscov2 single-end [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": true - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": true - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": true - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": true - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:24.993809" - }, - "sarscov2 custom_prefix - stub": { - "content": [ - { - "0": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:03:10.93942" - }, - "sarscov2 interleaved [fastq]": { - 
"content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:42.355718" - }, - "sarscov2 paired-end [bam]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:53.276274" - }, - "sarscov2 multiple [fastq]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:05.527626" - }, - "sarscov2 paired-end [fastq]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:31.188871" - }, - "sarscov2 paired-end [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:34.273566" - }, - "sarscov2 multiple [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - 
"versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:03:02.304411" - }, - "sarscov2 single-end [fastq]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:19.095607" - }, - "sarscov2 interleaved [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:44.640184" - }, - "sarscov2 paired-end [bam] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - 
"test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:53.550742" - } -} \ No newline at end of file diff --git a/modules/nf-core/hmmer/hmmsearch/environment.yml b/modules/nf-core/hmmer/hmmsearch/environment.yml new file mode 100644 index 0000000..1967d40 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::hmmer=3.4 diff --git a/modules/nf-core/hmmer/hmmsearch/main.nf b/modules/nf-core/hmmer/hmmsearch/main.nf new file mode 100644 index 0000000..603a865 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/main.nf @@ -0,0 +1,70 @@ +process HMMER_HMMSEARCH { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/hmmer:3.4--hdbdd923_1' : + 'biocontainers/hmmer:3.4--hdbdd923_1' }" + + input: + tuple val(meta), path(hmmfile), path(seqdb), val(write_align), val(write_target), val(write_domain) + + output: + tuple val(meta), path('*.txt.gz') , emit: output + tuple val(meta), path('*.sto.gz') , emit: alignments , optional: true + tuple val(meta), path('*.tbl.gz') , emit: target_summary, optional: true + tuple val(meta), path('*.domtbl.gz'), emit: domain_summary, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + output = "${prefix}.txt" + alignment = write_align ? "-A ${prefix}.sto" : '' + target_summary = write_target ? "--tblout ${prefix}.tbl" : '' + domain_summary = write_domain ? "--domtblout ${prefix}.domtbl" : '' + """ + hmmsearch \\ + $args \\ + --cpu $task.cpus \\ + -o $output \\ + $alignment \\ + $target_summary \\ + $domain_summary \\ + $hmmfile \\ + $seqdb + + gzip --no-name *.txt \\ + ${write_align ? '*.sto' : ''} \\ + ${write_target ? '*.tbl' : ''} \\ + ${write_domain ? '*.domtbl' : ''} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch "${prefix}.txt" + ${write_align ? "touch ${prefix}.sto" : ''} \\ + ${write_target ? "touch ${prefix}.tbl" : ''} \\ + ${write_domain ? "touch ${prefix}.domtbl" : ''} + + gzip --no-name *.txt \\ + ${write_align ? '*.sto' : ''} \\ + ${write_target ? '*.tbl' : ''} \\ + ${write_domain ? 
'*.domtbl' : ''} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/hmmer/hmmsearch/meta.yml b/modules/nf-core/hmmer/hmmsearch/meta.yml new file mode 100644 index 0000000..97daa54 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/meta.yml @@ -0,0 +1,100 @@ +name: hmmer_hmmsearch +description: search profile(s) against a sequence database +keywords: + - Hidden Markov Model + - HMM + - hmmer + - hmmsearch +tools: + - hmmer: + description: Biosequence analysis using profile hidden Markov models + homepage: http://hmmer.org/ + documentation: http://hmmer.org/documentation.html + tool_dev_url: https://github.com/EddyRivasLab/hmmer + doi: "10.1371/journal.pcbi.1002195" + licence: ["BSD"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - hmmfile: + type: file + description: One or more HMM profiles created with hmmbuild + pattern: "*.{hmm,hmm.gz}" + ontologies: [] + - seqdb: + type: file + description: Database of sequences in FASTA format + pattern: "*.{fasta,fna,faa,fa,fasta.gz,fna.gz,faa.gz,fa.gz}" + ontologies: [] + - write_align: + type: boolean + description: Flag to save optional alignment output. Specify with 'true' to + save. + - write_target: + type: boolean + description: Flag to save optional per target summary. Specify with 'true' to + save. + - write_domain: + type: boolean + description: Flag to save optional per domain summary. Specify with 'true' to + save. +output: + output: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.txt.gz": + type: file + description: Human readable output summarizing hmmsearch results + pattern: "*.{txt.gz}" + ontologies: [] + alignments: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.sto.gz": + type: file + description: Optional multiple sequence alignment (MSA) in Stockholm format + pattern: "*.{sto.gz}" + ontologies: [] + target_summary: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tbl.gz": + type: file + description: Optional tabular (space-delimited) summary of per-target output + pattern: "*.{tbl.gz}" + ontologies: [] + domain_summary: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.domtbl.gz": + type: file + description: Optional tabular (space-delimited) summary of per-domain output + pattern: "*.{domtbl.gz}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test new file mode 100644 index 0000000..f1b59e9 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test @@ -0,0 +1,126 @@ +nextflow_process { + + name "Test Process HMMER_HMMSEARCH" + script "../main.nf" + process "HMMER_HMMSEARCH" + + tag "modules" + tag "modules_nfcore" + tag "hmmer" + tag "hmmer/hmmsearch" + + test("hmmer/hmmsearch") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true), + 
file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true), + false, + false, + false + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.output[0][1]).linesGzip.toString().contains('[ok]') }, + { assert snapshot(process.out.versions).match() } + ) + } + + } + + test("hmmer/hmmsearch - optional") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true), + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true), + true, + true, + true + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.output.get(0).get(1)).linesGzip.toString().contains('[ok]') }, + { assert path(process.out.target_summary.get(0).get(1)).linesGzip.toString().contains('[ok]') }, + { assert snapshot( + process.out.alignments + + process.out.versions + ).match() } + ) + } + + } + + test("hmmer/hmmsearch - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', checkIfExists: true), + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true), + false, + false, + false + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("hmmer/hmmsearch - optional - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/bac.16S_rRNA.hmm.gz', 
checkIfExists: true), + file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/hmmer/e_coli_k12_16s.fna.gz', checkIfExists: true), + true, + true, + true + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test.snap b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test.snap new file mode 100644 index 0000000..e6b2277 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/tests/main.nf.test.snap @@ -0,0 +1,175 @@ +{ + "hmmer/hmmsearch": { + "content": [ + [ + "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-28T12:18:47.293093635" + }, + "hmmer/hmmsearch - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f" + ], + "alignments": [ + + ], + "domain_summary": [ + + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "target_summary": [ + + ], + "versions": [ + "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-28T12:18:57.862047944" + }, + "hmmer/hmmsearch - optional - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sto.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f" + ], + "alignments": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sto.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "domain_summary": [ + [ + { + "id": "test", + "single_end": false + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "output": [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "target_summary": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-28T12:19:03.49192788" + }, + "hmmer/hmmsearch - optional": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.sto.gz:md5,5c44c289b9e36aa1f7f3afae2005fbb7" + ], + "versions.yml:md5,37393b1da5a14113d3290ab8b3b4c40f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-03-28T12:18:52.725638562" + } +} \ No newline at end of file diff --git a/modules/nf-core/interproscan/environment.yml b/modules/nf-core/interproscan/environment.yml new file mode 100644 index 0000000..8e82f00 --- /dev/null +++ b/modules/nf-core/interproscan/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::interproscan=5.59_91.0 diff --git a/modules/nf-core/interproscan/main.nf b/modules/nf-core/interproscan/main.nf new file mode 100644 index 0000000..a1779f3 --- /dev/null +++ b/modules/nf-core/interproscan/main.nf @@ -0,0 +1,59 @@ +process INTERPROSCAN { + tag "$meta.id" + // will throw NullPointer exceptions and crush with more than 1 cpu + 
label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/interproscan:5.59_91.0--hec16e2b_1' : + 'biocontainers/interproscan:5.59_91.0--hec16e2b_1' }" + + input: + tuple val(meta), path(fasta) + path(interproscan_database, stageAs: 'data') + + output: + tuple val(meta), path('*.tsv') , optional: true, emit: tsv + tuple val(meta), path('*.xml') , optional: true, emit: xml + tuple val(meta), path('*.gff3'), optional: true, emit: gff3 + tuple val(meta), path('*.json'), optional: true, emit: json + tuple val("${task.process}"), val("interproscan"), eval('interproscan.sh --version | sed "1!d; s/.*version //"'), topic: versions, emit: versions_interproscan + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def is_compressed = fasta.getExtension() == "gz" + def fasta_name = is_compressed ? fasta.getBaseName() : fasta + def uncompress_input = is_compressed ? "gzip -c -d ${fasta} > ${fasta_name}" : '' + """ + $uncompress_input + + if [ -d 'data' ]; then + # Find interproscan.properties to link data/ from work directory + INTERPROSCAN_DIR="\$( dirname "\$( dirname "\$( which interproscan.sh )" )" )" + INTERPROSCAN_PROPERTIES="\$( find "\$INTERPROSCAN_DIR/share" -name "interproscan.properties" )" + cp "\$INTERPROSCAN_PROPERTIES" . + sed -i "/^bin\\.directory=/ s|.*|bin.directory=\$INTERPROSCAN_DIR/bin|" interproscan.properties + export INTERPROSCAN_CONF=interproscan.properties + fi # else use sample DB included with conda ( testing only! 
) + + interproscan.sh \\ + --cpu ${task.cpus} \\ + --input ${fasta_name} \\ + ${args} \\ + --output-file-base ${prefix} + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo ${args} + + touch ${prefix}.{tsv,xml,json,gff3} + """ +} diff --git a/modules/nf-core/interproscan/meta.yml b/modules/nf-core/interproscan/meta.yml new file mode 100644 index 0000000..cd58f2b --- /dev/null +++ b/modules/nf-core/interproscan/meta.yml @@ -0,0 +1,104 @@ +name: "interproscan" +description: Produces protein annotations and predictions from an amino acids FASTA + file +keywords: + - annotation + - fasta + - protein + - dna + - interproscan +tools: + - "interproscan": + description: "InterPro integrates together predictive information about proteins + function from a number of partner resources" + homepage: "https://www.ebi.ac.uk/interpro/search/sequence/" + documentation: "https://interproscan-docs.readthedocs.io" + tool_dev_url: "https://github.com/ebi-pf-team/interproscan" + doi: "10.1093/bioinformatics/btu031" + licence: ["GPL v3"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing the amino acid or dna query sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + ontologies: + - edam: http://edamontology.org/format_1929 # FASTA + - interproscan_database: + type: directory + description: Path to the interproscan database (untarred + http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/${version_major}-${version_minor}/interproscan-${version_major}-${version_minor}-64-bit.tar.gz) +output: + tsv: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.tsv": + type: file + description: Tab separated file containing with detailed hits + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + xml: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.xml": + type: file + description: XML file containing with detailed hits + pattern: "*.{xml}" + ontologies: + - edam: http://edamontology.org/format_2332 # XML + gff3: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.gff3": + type: file + description: GFF3 file containing with detailed hits + pattern: "*.{gff3}" + ontologies: + - edam: http://edamontology.org/format_1975 # GFF3 + json: + - - meta: + type: map + description: Groovy Map containing sample information + - "*.json": + type: file + description: JSON file containing with detailed hits + pattern: "*.{json}" + ontologies: + - edam: http://edamontology.org/format_3464 # JSON + versions_interproscan: + - - ${task.process}: + type: string + description: The name of the process + - interproscan: + type: string + description: The name of the tool + - interproscan.sh --version | sed "1!d; s/.*version //": + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - interproscan: + type: string + description: The 
name of the tool + - interproscan.sh --version | sed "1!d; s/.*version //": + type: eval + description: The expression to obtain the version of the tool +authors: + - "@toniher" + - "@mahesh-panchal" +maintainers: + - "@toniher" + - "@vagkaratzas" + - "@mahesh-panchal" diff --git a/modules/nf-core/interproscan/tests/main.nf.test b/modules/nf-core/interproscan/tests/main.nf.test new file mode 100644 index 0000000..c25edf2 --- /dev/null +++ b/modules/nf-core/interproscan/tests/main.nf.test @@ -0,0 +1,151 @@ +nextflow_process { + + name "Test Process INTERPROSCAN" + script "../main.nf" + process "INTERPROSCAN" + + tag "modules" + tag "modules_nfcore" + tag "interproscan" + tag "aria2" + tag "untar" + + test("l_asparaginase - faa - test database") { + config "./nextflow_database.config" + + setup { + run("ARIA2") { + script "../../aria2/main.nf" + process { + """ + input[0] = [ + [ id:'test' ], + params.modules_testdata_base_path + 'proteomics/interproscan/interproscan_test.tar.gz' // https URL + ] + """ + } + } + + run("UNTAR") { + script "../../untar/main.nf" + process { + """ + input[0] = ARIA2.out.downloaded_file + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'proteomics/interproscan/l_arginase.faa', checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ f -> f[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + path(process.out.tsv[0][1]).readLines()[0] + .contains("GI|225038609|EFDID|719595|FULL 079fff43a0270e432d339ea71b6f0acf 350 SFLD SFLDS00057 Glutaminase/Asparaginase 17 347 0.0 T"), + process.out.xml, + process.out.json, + path(process.out.gff3[0][1]).readLines()[0..4,6..-1], + process.out.findAll { key, val -> key.startsWith("versions")} + ).match()} + ) + } + + } + + test("sarscov2 - proteome_fasta") { + config "./nextflow.config" + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 
'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + path(process.out.tsv[0][1]).readLines()[0] + .contains("ENSSASP00005000004.1 4c35f09aac2f7be4f3cffd30c6aecac8 1273 Coils Coil Coil 1176 1203 - T"), + process.out.xml, + process.out.json, + path(process.out.gff3[0][1]).readLines()[0..4,6..-1], + process.out.findAll { key, val -> key.startsWith("versions")} + ).match()} + ) + } + + } + + test("sarscov2 - proteome_fasta_gz") { + config "./nextflow.config" + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + path(process.out.tsv[0][1]).readLines()[0] + .contains("ENSSASP00005000004.1 4c35f09aac2f7be4f3cffd30c6aecac8 1273 Coils Coil Coil 1176 1203 - T"), + process.out.xml, + process.out.json, + path(process.out.gff3[0][1]).readLines()[0..4,6..-1], + process.out.findAll { key, val -> key.startsWith("versions")} + ).match()} + ) + } + + } + + test("sarscov2 - proteome_fasta_gz - stub") { + config "./nextflow.config" + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/modules/nf-core/interproscan/tests/main.nf.test.snap b/modules/nf-core/interproscan/tests/main.nf.test.snap new file mode 100644 index 0000000..1ab6785 --- /dev/null +++ b/modules/nf-core/interproscan/tests/main.nf.test.snap @@ -0,0 +1,284 @@ +{ + "sarscov2 - proteome_fasta_gz - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + 
"test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.xml:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + [ + "INTERPROSCAN", + "interproscan", + "5.59-91.0" + ] + ], + "gff3": [ + [ + { + "id": "test" + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "json": [ + [ + { + "id": "test" + }, + "test.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tsv": [ + [ + { + "id": "test" + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_interproscan": [ + [ + "INTERPROSCAN", + "interproscan", + "5.59-91.0" + ] + ], + "xml": [ + [ + { + "id": "test" + }, + "test.xml:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T12:47:47.991445105" + }, + "l_asparaginase - faa - test database": { + "content": [ + true, + [ + [ + { + "id": "test" + }, + "test.xml:md5,7248992d9c1618cf7baa7515ae79ce32" + ] + ], + [ + [ + { + "id": "test" + }, + "test.json:md5,e0d127dd8a952cbd798999851d1338e6" + ] + ], + [ + "##gff-version 3", + "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269", + "##interproscan-version 5.59-91.0", + "##sequence-region GI|225038609|EFDID|719595|FULL 1 350", + "GI|225038609|EFDID|719595|FULL\t.\tpolypeptide\t1\t350\t.\t+\t.\tID=GI|225038609|EFDID|719595|FULL;md5=079fff43a0270e432d339ea71b6f0acf", + "##FASTA", + ">GI|225038609|EFDID|719595|FULL", + "MDKLLWNKKEISVSERISMKRIVMINTGGTFSSQRGENGLSPKLTGTQIRNFLGEFEEDL", + "ELSTEDYCALDSSNITPEDWVQLADKISQIIYSCDGVVIIHGTDTMAYTASMLSFMLQNL", + "PIPVVLTGSQLPIGVPMSDAVNNCRCAVQMAASGLGGVYVAFDHKLMLGCRTSKVRTVSF", + "NAFESINYPYVGEVNALGMQLYPTRLSKPTGEFQLQTAYSDKIAVLKLFPGMRPDLFSFL", + 
"QEKGYEGIYIEGFGLGGVPFVKNDITEEISKASKAGIPILVGSQCSYEGSNLGIYETGLR", + "VLESGGIPVHDMTQEAIVTKLMWCLGQTKDREKIHQLFHTNLIQEVTLPY", + ">match$1_17_347", + "ISMKRIVMINTGGTFSSQRGENGLSPKLTGTQIRNFLGEFEEDLELSTEDYCALDSSNIT", + "PEDWVQLADKISQIIYSCDGVVIIHGTDTMAYTASMLSFMLQNLPIPVVLTGSQLPIGVP", + "MSDAVNNCRCAVQMAASGLGGVYVAFDHKLMLGCRTSKVRTVSFNAFESINYPYVGEVNA", + "LGMQLYPTRLSKPTGEFQLQTAYSDKIAVLKLFPGMRPDLFSFLQEKGYEGIYIEGFGLG", + "GVPFVKNDITEEISKASKAGIPILVGSQCSYEGSNLGIYETGLRVLESGGIPVHDMTQEA", + "IVTKLMWCLGQTKDREKIHQLFHTNLIQEVT" + ], + { + "versions_interproscan": [ + [ + "INTERPROSCAN", + "interproscan", + "5.59-91.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-02T10:26:22.006759729" + }, + "sarscov2 - proteome_fasta_gz": { + "content": [ + true, + [ + [ + { + "id": "test" + }, + "test.xml:md5,7a211c1a4761e2b9b8700e6e9abbb15f" + ] + ], + [ + [ + { + "id": "test" + }, + "test.json:md5,b05cffc28b7bfeb3dabe43c2927b2024" + ] + ], + [ + "##gff-version 3", + "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269", + "##interproscan-version 5.59-91.0", + "##sequence-region ENSSASP00005000004.1 1 1273", + "ENSSASP00005000004.1\t.\tpolypeptide\t1\t1273\t.\t+\t.\tID=ENSSASP00005000004.1;md5=4c35f09aac2f7be4f3cffd30c6aecac8", + "##FASTA", + ">ENSSASP00005000004.1", + "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFS", + "NVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIV", + "NNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLE", + "GKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQT", + "LLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETK", + "CTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISN", + "CVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD", + "YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPC", + "NGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVN", + 
"FNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITP", + "GTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSY", + "ECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTI", + "SVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQE", + "VFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDC", + "LGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAM", + "QMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALN", + "TLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRA", + "SANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPA", + "ICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDP", + "LQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDL", + "QELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDD", + "SEPVLKGVKLHYT", + ">match$1_1176_1203", + "VVNIQKEIDRLNEVAKNLNESLIDLQEL" + ], + { + "versions_interproscan": [ + [ + "INTERPROSCAN", + "interproscan", + "5.59-91.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T12:47:40.77682478" + }, + "sarscov2 - proteome_fasta": { + "content": [ + true, + [ + [ + { + "id": "test" + }, + "test.xml:md5,7a211c1a4761e2b9b8700e6e9abbb15f" + ] + ], + [ + [ + { + "id": "test" + }, + "test.json:md5,b05cffc28b7bfeb3dabe43c2927b2024" + ] + ], + [ + "##gff-version 3", + "##feature-ontology http://song.cvs.sourceforge.net/viewvc/song/ontology/sofa.obo?revision=1.269", + "##interproscan-version 5.59-91.0", + "##sequence-region ENSSASP00005000004.1 1 1273", + "ENSSASP00005000004.1\t.\tpolypeptide\t1\t1273\t.\t+\t.\tID=ENSSASP00005000004.1;md5=4c35f09aac2f7be4f3cffd30c6aecac8", + "##FASTA", + ">ENSSASP00005000004.1", + "MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFS", + "NVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIV", + "NNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLE", + "GKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQT", + 
"LLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETK", + "CTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISN", + "CVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD", + "YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPC", + "NGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVN", + "FNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITP", + "GTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSY", + "ECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTI", + "SVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQE", + "VFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDC", + "LGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAM", + "QMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALN", + "TLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRA", + "SANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPA", + "ICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDP", + "LQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDL", + "QELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDD", + "SEPVLKGVKLHYT", + ">match$1_1176_1203", + "VVNIQKEIDRLNEVAKNLNESLIDLQEL" + ], + { + "versions_interproscan": [ + [ + "INTERPROSCAN", + "interproscan", + "5.59-91.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-01-30T12:47:11.858565893" + } +} \ No newline at end of file diff --git a/modules/nf-core/interproscan/tests/nextflow.config b/modules/nf-core/interproscan/tests/nextflow.config new file mode 100644 index 0000000..c03ecac --- /dev/null +++ b/modules/nf-core/interproscan/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: INTERPROSCAN { + ext.args = '-appl Coils' + memory = 12.GB + } +} diff --git a/modules/nf-core/interproscan/tests/nextflow_database.config b/modules/nf-core/interproscan/tests/nextflow_database.config new file mode 100644 index 0000000..bc8c773 --- /dev/null +++ 
b/modules/nf-core/interproscan/tests/nextflow_database.config @@ -0,0 +1,7 @@ +process { + withName: INTERPROSCAN { + ext.args = '--applications Hamap,TIGRFAM,sfld --disable-precalc' + memory = 12.GB + cpus = 1 + } +} diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index a27122c..009874d 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -1,5 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda dependencies: - - bioconda::multiqc=1.27 + - bioconda::multiqc=1.33 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 58d9313..3b0e975 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,11 +3,11 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.27--pyhdfd78af_0' : - 'biocontainers/multiqc:1.27--pyhdfd78af_0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/34/34e733a9ae16a27e80fe00f863ea1479c96416017f24a907996126283e7ecd4d/data' : + 'community.wave.seqera.io/library/multiqc:1.33--ee7739d47738383b' }" input: - path multiqc_files, stageAs: "?/*" + path multiqc_files, stageAs: "?/*" path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) @@ -15,10 +15,11 @@ process MULTIQC { path(sample_names) output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions + path "*.html" , emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + tuple val("${task.process}"), val('multiqc'), eval('multiqc --version | sed "s/.* //g"'), emit: versions + // MultiQC should not push its versions to the `versions` topic. Its input depends on the versions topic to be resolved thus outputting to the topic will let the pipeline hang forever when: task.ext.when == null || task.ext.when @@ -26,38 +27,29 @@ process MULTIQC { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def config = multiqc_config ? "--config ${multiqc_config}" : '' + def extra_config = extra_multiqc_config ? "--config ${extra_multiqc_config}" : '' def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' def replace = replace_names ? "--replace-names ${replace_names}" : '' def samples = sample_names ? 
"--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ - $args \\ - $config \\ - $prefix \\ - $extra_config \\ - $logo \\ - $replace \\ - $samples \\ + ${args} \\ + ${config} \\ + ${prefix} \\ + ${extra_config} \\ + ${logo} \\ + ${replace} \\ + ${samples} \\ . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS """ stub: """ mkdir multiqc_data + touch multiqc_data/.stub mkdir multiqc_plots touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS """ } diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index b16c187..861cd7f 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -15,57 +15,74 @@ tools: licence: ["GPL-3.0-or-later"] identifier: biotools:multiqc input: - - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections - in multiqc_config. - pattern: "*.{yml,yaml}" - - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" - - - replace_names: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + ontologies: [] + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections + in multiqc_config. 
+ pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + ontologies: [] + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV +output: + report: + - "*.html": type: file - description: | - Optional two-column sample renaming file. First column a set of - patterns, second column a set of corresponding replacements. Passed via - MultiQC's `--replace-names` option. - pattern: "*.{tsv}" - - - sample_names: + description: MultiQC report file + pattern: ".html" + ontologies: [] + data: + - "*_data": + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + plots: + - "*_plots": type: file - description: | - Optional TSV file with headers, passed to the MultiQC --sample_names - argument. 
- pattern: "*.{tsv}" -output: - - report: - - "*multiqc_report.html": - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - - "*_data": - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - - "*_plots": - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + description: Plots created by MultiQC + pattern: "*_data" + ontologies: [] + versions: + - - ${task.process}: + type: string + description: The process the versions were collected from + - multiqc: + type: string + description: The tool name + - multiqc --version | sed "s/.* //g": + type: eval + description: The expression to obtain the version of the tool authors: - "@abhi18av" - "@bunop" @@ -76,3 +93,27 @@ maintainers: - "@bunop" - "@drpatelh" - "@jfy133" +containers: + conda: + linux/amd64: + lock_file: https://wave.seqera.io/v1alpha1/builds/bd-ee7739d47738383b_1/condalock + linux/arm64: + lock_file: https://wave.seqera.io/v1alpha1/builds/bd-58d7dee710ab3aa8_1/condalock + docker: + linux/amd64: + build_id: bd-ee7739d47738383b_1 + name: community.wave.seqera.io/library/multiqc:1.33--ee7739d47738383b + scanId: sc-6ddec592dcadd583_4 + linux/arm64: + build_id: bd-58d7dee710ab3aa8_1 + name: community.wave.seqera.io/library/multiqc:1.33--58d7dee710ab3aa8 + scanId: sc-a04c42273e34c55c_2 + singularity: + linux/amd64: + build_id: bd-e3576ddf588fa00d_1 + https: https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/34/34e733a9ae16a27e80fe00f863ea1479c96416017f24a907996126283e7ecd4d/data + name: oras://community.wave.seqera.io/library/multiqc:1.33--e3576ddf588fa00d + linux/arm64: + build_id: bd-2537ca5f8445e3c2_1 + https: https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/78/78b89e91d89e9cc99ad5ade5be311f347838cb2acbfb4f13bc343b170be09ce4/data + name: 
oras://community.wave.seqera.io/library/multiqc:1.33--2537ca5f8445e3c2 diff --git a/modules/nf-core/multiqc/tests/custom_prefix.config b/modules/nf-core/multiqc/tests/custom_prefix.config new file mode 100644 index 0000000..b30b135 --- /dev/null +++ b/modules/nf-core/multiqc/tests/custom_prefix.config @@ -0,0 +1,5 @@ +process { + withName: 'MULTIQC' { + ext.prefix = "custom_prefix" + } +} diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index 33316a7..d1ae8b0 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -30,7 +30,33 @@ nextflow_process { { assert process.success }, { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_single") } + { assert snapshot(process.out.findAll { key, val -> key.startsWith("versions")}).match() } + ) + } + + } + + test("sarscov2 single-end [fastqc] - custom prefix") { + config "./custom_prefix.config" + + when { + process { + """ + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[1] = [] + input[2] = [] + input[3] = [] + input[4] = [] + input[5] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/custom_prefix.html" }, + { assert process.out.data[0] ==~ ".*/custom_prefix_data" } ) } @@ -56,7 +82,7 @@ nextflow_process { { assert process.success }, { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_config") } + { assert snapshot(process.out.findAll { key, val -> key.startsWith("versions")}).match() } ) } } @@ -84,7 +110,7 @@ nextflow_process { { assert snapshot(process.out.report.collect { file(it).getName() } + 
process.out.data.collect { file(it).getName() } + process.out.plots.collect { file(it).getName() } + - process.out.versions ).match("multiqc_stub") } + process.out.findAll { key, val -> key.startsWith("versions")} ).match() } ) } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index 7b7c132..d72d35b 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -1,41 +1,61 @@ { - "multiqc_versions_single": { + "sarscov2 single-end [fastqc]": { "content": [ - [ - "versions.yml:md5,8f3b8c1cec5388cf2708be948c9fa42f" - ] + { + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.4" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-01-27T09:29:57.631982377" + "timestamp": "2025-12-09T10:10:43.020315838" }, - "multiqc_stub": { + "sarscov2 single-end [fastqc] - stub": { "content": [ [ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,8f3b8c1cec5388cf2708be948c9fa42f" + { + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } ] ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.4" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-01-27T09:30:34.743726958" + "timestamp": "2025-12-09T10:11:14.131950776" }, - "multiqc_versions_config": { + "sarscov2 single-end [fastqc] [config]": { "content": [ - [ - "versions.yml:md5,8f3b8c1cec5388cf2708be948c9fa42f" - ] + { + "versions": [ + [ + "MULTIQC", + "multiqc", + "1.33" + ] + ] + } ], "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.4" + "nf-test": "0.9.3", + "nextflow": "25.10.2" }, - "timestamp": "2025-01-27T09:30:21.44383553" + "timestamp": "2025-12-09T10:11:07.15692209" } } \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d..0000000 --- 
a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/modules/nf-core/s4pred/runmodel/environment.yml b/modules/nf-core/s4pred/runmodel/environment.yml new file mode 100644 index 0000000..83e4fb2 --- /dev/null +++ b/modules/nf-core/s4pred/runmodel/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::s4pred=1.2.1" diff --git a/modules/nf-core/s4pred/runmodel/main.nf b/modules/nf-core/s4pred/runmodel/main.nf new file mode 100644 index 0000000..d96ea5b --- /dev/null +++ b/modules/nf-core/s4pred/runmodel/main.nf @@ -0,0 +1,52 @@ +process S4PRED_RUNMODEL { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/s4pred:1.2.1--pyhdfd78af_1': + 'biocontainers/s4pred:1.2.1--pyhdfd78af_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${prefix}"), emit: preds + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + mkdir ${prefix} + + run_model \\ + $args \\ + --threads $task.cpus \\ + --save-files \\ + --outdir ${prefix} \\ + ${fasta} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + s4pred: $VERSION + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. 
+ """ + mkdir -p ${prefix} + touch ${prefix}/test.ss2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + s4pred: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/s4pred/runmodel/meta.yml b/modules/nf-core/s4pred/runmodel/meta.yml new file mode 100644 index 0000000..638ce67 --- /dev/null +++ b/modules/nf-core/s4pred/runmodel/meta.yml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "s4pred_runmodel" +description: Prediction of a protein's secondary structure from its amino acid sequence +keywords: + - protein + - secondary structure + - prediction +tools: + - "s4pred": + description: "Accurate prediction of a protein's secondary structure from its + amino acid sequence" + homepage: "https://github.com/psipred/s4pred" + documentation: "https://github.com/psipred/s4pred" + tool_dev_url: "https://github.com/psipred/s4pred" + doi: "10.1093/bioinformatics/btab491" + licence: ["GPL v3-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - fasta: + type: file + description: protein FASTA file containing one or more amino acid sequences + to predict their respective secondary structures + pattern: "*.{fasta,fa,fas,fna,faa}" + ontologies: + - edam: http://edamontology.org/format_1929 # FASTA +output: + preds: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}: + type: directory + description: A folder with all the prediction outputs + pattern: "${prefix}" + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@vagkaratzas" +maintainers: + - "@vagkaratzas" diff --git a/modules/nf-core/s4pred/runmodel/tests/main.nf.test b/modules/nf-core/s4pred/runmodel/tests/main.nf.test new file mode 100644 index 0000000..0879036 --- /dev/null +++ b/modules/nf-core/s4pred/runmodel/tests/main.nf.test @@ -0,0 +1,58 @@ +nextflow_process { + + name "Test Process S4PRED_RUNMODEL" + script "../main.nf" + process "S4PRED_RUNMODEL" + + tag "modules" + tag "modules_nfcore" + tag "s4pred" + tag "s4pred/runmodel" + + test("sarscov2 - proteome - fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - proteome - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/s4pred/runmodel/tests/main.nf.test.snap b/modules/nf-core/s4pred/runmodel/tests/main.nf.test.snap new file mode 100644 index 0000000..8b3d57a --- /dev/null +++ b/modules/nf-core/s4pred/runmodel/tests/main.nf.test.snap @@ -0,0 +1,98 @@ +{ + "sarscov2 - proteome - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "ENSSASP00005000002.1.ss2:md5,ca1ecb796927219ab1bd2408e17d94d6", + 
"ENSSASP00005000003.1.ss2:md5,ad37a8ec4cef27c91c9958bcb652903f", + "ENSSASP00005000004.1.ss2:md5,dae6e305856d12c20c97b86492c937cd", + "ENSSASP00005000005.1.ss2:md5,f3e72765229ac89063633f6d4e40c4fe", + "ENSSASP00005000006.1.ss2:md5,731586e1e6176ada900c181edb17e3c9", + "ENSSASP00005000007.1.ss2:md5,2f7459999558834d339b781c96da1482", + "ENSSASP00005000008.1.ss2:md5,c620bbff5f073089df9554ca439933eb", + "ENSSASP00005000009.1.ss2:md5,19f317c1b3d468ec2e876ecce4d4b60a", + "ENSSASP00005000010.1.ss2:md5,020604f0e037a104983e7ba325821c0a", + "ENSSASP00005000011.1.ss2:md5,ee9dc69931be3ec7eeed88e42f7b90e7", + "ENSSASP00005000012.1.ss2:md5,f08bffd8ce8031fb7bb658f8e2408626", + "ENSSASP00005000013.1.ss2:md5,445e88597dc8fc0f40e21d95d148ced1" + ] + ] + ], + "1": [ + "versions.yml:md5,408e5584371b73bd5dd0a53096ed2d14" + ], + "preds": [ + [ + { + "id": "test" + }, + [ + "ENSSASP00005000002.1.ss2:md5,ca1ecb796927219ab1bd2408e17d94d6", + "ENSSASP00005000003.1.ss2:md5,ad37a8ec4cef27c91c9958bcb652903f", + "ENSSASP00005000004.1.ss2:md5,dae6e305856d12c20c97b86492c937cd", + "ENSSASP00005000005.1.ss2:md5,f3e72765229ac89063633f6d4e40c4fe", + "ENSSASP00005000006.1.ss2:md5,731586e1e6176ada900c181edb17e3c9", + "ENSSASP00005000007.1.ss2:md5,2f7459999558834d339b781c96da1482", + "ENSSASP00005000008.1.ss2:md5,c620bbff5f073089df9554ca439933eb", + "ENSSASP00005000009.1.ss2:md5,19f317c1b3d468ec2e876ecce4d4b60a", + "ENSSASP00005000010.1.ss2:md5,020604f0e037a104983e7ba325821c0a", + "ENSSASP00005000011.1.ss2:md5,ee9dc69931be3ec7eeed88e42f7b90e7", + "ENSSASP00005000012.1.ss2:md5,f08bffd8ce8031fb7bb658f8e2408626", + "ENSSASP00005000013.1.ss2:md5,445e88597dc8fc0f40e21d95d148ced1" + ] + ] + ], + "versions": [ + "versions.yml:md5,408e5584371b73bd5dd0a53096ed2d14" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.6" + }, + "timestamp": "2025-05-07T10:09:40.552710868" + }, + "sarscov2 - proteome - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + 
"test.ss2:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + "versions.yml:md5,408e5584371b73bd5dd0a53096ed2d14" + ], + "preds": [ + [ + { + "id": "test" + }, + [ + "test.ss2:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "versions": [ + "versions.yml:md5,408e5584371b73bd5dd0a53096ed2d14" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.6" + }, + "timestamp": "2025-05-07T10:09:46.65000795" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqfu/stats/environment.yml b/modules/nf-core/seqfu/stats/environment.yml new file mode 100644 index 0000000..210e292 --- /dev/null +++ b/modules/nf-core/seqfu/stats/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqfu=1.22.3 diff --git a/modules/nf-core/seqfu/stats/main.nf b/modules/nf-core/seqfu/stats/main.nf new file mode 100644 index 0000000..ced2451 --- /dev/null +++ b/modules/nf-core/seqfu/stats/main.nf @@ -0,0 +1,52 @@ +process SEQFU_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/seqfu:1.22.3--hfd12232_2': + 'biocontainers/seqfu:1.22.3--hfd12232_2' }" + + input: + // stats can get one or more fasta or fastq files + tuple val(meta), path(files) + + output: + tuple val(meta), path("*.tsv") , emit: stats + tuple val(meta), path("*_mqc.txt"), emit: multiqc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + seqfu \\ + stats \\ + $args \\ + --multiqc ${prefix}_mqc.txt \\ + $files > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqfu: \$(seqfu version) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo $args + + touch ${prefix}.tsv + touch ${prefix}_mqc.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqfu: \$(seqfu version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqfu/stats/meta.yml b/modules/nf-core/seqfu/stats/meta.yml new file mode 100644 index 0000000..573498c --- /dev/null +++ b/modules/nf-core/seqfu/stats/meta.yml @@ -0,0 +1,67 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "seqfu_stats" +description: Statistics for FASTA or FASTQ files +keywords: + - seqfu + - stats + - n50 +tools: + - "seqfu": + description: "Cross-platform compiled suite of tools to manipulate and inspect + FASTA and FASTQ files" + homepage: "https://telatin.github.io/seqfu2/" + documentation: "https://telatin.github.io/seqfu2/" + tool_dev_url: "https://github.com/telatin/seqfu2" + doi: "10.3390/bioengineering8050059" + licence: ["GPL v3"] + identifier: biotools:seqfu + +input: + # Only when we have meta + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - files: + type: file + description: One or more FASTA or FASTQ files + pattern: "*.{fasta,fastq,fasta.gz,fastq.gz,fq,fq.gz}" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ +output: + #Only when we have meta + stats: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.tsv": + type: file + description: Tab-separated output file with basic sequence statistics. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + multiqc: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*_mqc.txt": + type: file + description: MultiQC ready table + pattern: "*.{_mqc.txt}" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@telatin" +maintainers: + - "@telatin" diff --git a/modules/nf-core/seqfu/stats/tests/main.nf.test b/modules/nf-core/seqfu/stats/tests/main.nf.test new file mode 100644 index 0000000..3d13b05 --- /dev/null +++ b/modules/nf-core/seqfu/stats/tests/main.nf.test @@ -0,0 +1,88 @@ +nextflow_process { + + name "Test Process SEQFU_STATS" + script "../main.nf" + process "SEQFU_STATS" + + tag "modules" + tag "modules_nfcore" + tag "seqfu" + tag "seqfu/stats" + + test("seqfu stats - faa") { + // test with 1 FAA file (with multiple sequences of different length) + when { + process { + """ + input[0] = [ + [ id:'test_faa' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("seqfu stats - multiple files") { + // test feeding a mix of files including compressed + when { 
+ process { + """ + input[0] = [ + [ id:'test_multiple' ], + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("seqfu stats - faa - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test_stub' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success}, + { assert snapshot( + process.out, + process.out.versions.collect{ path(it).yaml } + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqfu/stats/tests/main.nf.test.snap b/modules/nf-core/seqfu/stats/tests/main.nf.test.snap new file mode 100644 index 0000000..30b1699 --- /dev/null +++ b/modules/nf-core/seqfu/stats/tests/main.nf.test.snap @@ -0,0 +1,156 @@ +{ + "seqfu stats - faa - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_stub" + }, + "test_stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test_stub" + }, + "test_stub_mqc.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,7e7581ee4a87fd1f9969628ae050e689" + ], + "multiqc": [ + [ + { + "id": "test_stub" + }, + "test_stub_mqc.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "stats": [ + [ + { + "id": "test_stub" + }, + "test_stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,7e7581ee4a87fd1f9969628ae050e689" + ] + }, + [ + { + "SEQFU_STATS": { + "seqfu": "1.22.3" + 
} + } + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-24T12:51:50.35812127" + }, + "seqfu stats - faa": { + "content": [ + { + "0": [ + [ + { + "id": "test_faa" + }, + "test_faa.tsv:md5,0d6bf2cc788f7828761440a1689cac04" + ] + ], + "1": [ + [ + { + "id": "test_faa" + }, + "test_faa_mqc.txt:md5,8f3c2edaf1ea5be912c9f99b21b2856c" + ] + ], + "2": [ + "versions.yml:md5,7e7581ee4a87fd1f9969628ae050e689" + ], + "multiqc": [ + [ + { + "id": "test_faa" + }, + "test_faa_mqc.txt:md5,8f3c2edaf1ea5be912c9f99b21b2856c" + ] + ], + "stats": [ + [ + { + "id": "test_faa" + }, + "test_faa.tsv:md5,0d6bf2cc788f7828761440a1689cac04" + ] + ], + "versions": [ + "versions.yml:md5,7e7581ee4a87fd1f9969628ae050e689" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-24T12:52:03.19664223" + }, + "seqfu stats - multiple files": { + "content": [ + { + "0": [ + [ + { + "id": "test_multiple" + }, + "test_multiple.tsv:md5,d016de3d84187a06c8e19b8dabccb3ae" + ] + ], + "1": [ + [ + { + "id": "test_multiple" + }, + "test_multiple_mqc.txt:md5,dbc6e762eebbf756cd0687807de60445" + ] + ], + "2": [ + "versions.yml:md5,7e7581ee4a87fd1f9969628ae050e689" + ], + "multiqc": [ + [ + { + "id": "test_multiple" + }, + "test_multiple_mqc.txt:md5,dbc6e762eebbf756cd0687807de60445" + ] + ], + "stats": [ + [ + { + "id": "test_multiple" + }, + "test_multiple.tsv:md5,d016de3d84187a06c8e19b8dabccb3ae" + ] + ], + "versions": [ + "versions.yml:md5,7e7581ee4a87fd1f9969628ae050e689" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.6" + }, + "timestamp": "2025-09-24T12:52:11.315174611" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/replace/environment.yml b/modules/nf-core/seqkit/replace/environment.yml new file mode 100644 index 0000000..b26fb1e --- /dev/null +++ b/modules/nf-core/seqkit/replace/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.9.0 diff --git a/modules/nf-core/seqkit/replace/main.nf b/modules/nf-core/seqkit/replace/main.nf new file mode 100644 index 0000000..dd03dac --- /dev/null +++ b/modules/nf-core/seqkit/replace/main.nf @@ -0,0 +1,52 @@ +process SEQKIT_REPLACE { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0' + : 'biocontainers/seqkit:2.9.0--h9ee0642_0'}" + + input: + tuple val(meta), path(fastx) + + output: + tuple val(meta), path("*.fast*"), emit: fastx + tuple val("${task.process}"), val('seqkit'), eval("seqkit version | sed 's/seqkit v//'"), emit: versions_seqkit, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = "fastq" + if ("${fastx}" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.faa|.+\.faa.gz/) { + extension = "fasta" + } + def isgz = "" + if ("${fastx}" ==~ /.+\.gz/) { + isgz = ".gz" + } + def endswith = task.ext.suffix ?: "${extension}${isgz}" + """ + seqkit \\ + replace \\ + ${args} \\ + --threads ${task.cpus} \\ + -i ${fastx} \\ + -o ${prefix}.${endswith} + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = "fastq" + if ("${fastx}" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz/) { + extension = "fasta" + } + def endswith = task.ext.suffix ?: "${extension}.gz" + + """ + echo "" | gzip > ${prefix}.${endswith} + """ +} diff --git a/modules/nf-core/seqkit/replace/meta.yml b/modules/nf-core/seqkit/replace/meta.yml new file mode 100644 index 0000000..1592787 --- 
/dev/null +++ b/modules/nf-core/seqkit/replace/meta.yml @@ -0,0 +1,69 @@ +name: seqkit_replace +description: Use seqkit to find/replace strings within sequences and sequence headers +keywords: + - seqkit + - replace + - sequence + - sequence headers + - fasta +tools: + - seqkit: + description: Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, + written by Wei Shen. + homepage: https://bioinf.shenwei.me/seqkit/usage/ + documentation: https://bioinf.shenwei.me/seqkit/usage/ + tool_dev_url: https://github.com/shenwei356/seqkit/ + doi: "10.1371/journal.pone.016396" + identifier: biotools:seqkit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastx: + type: file + description: fasta/q file + pattern: "*.{fasta,fastq,fa,fq,fas,fna,faa}*" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ +output: + fastx: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.fast*": + type: file + description: fasta/q file with replaced values + pattern: "*.{fasta,fastq,fa,fq,fas,fna,faa}*" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ + versions_seqkit: + - - ${task.process}: + type: string + description: The name of the process + - seqkit: + type: string + description: The name of the tool + - "seqkit version | sed 's/seqkit v//'": + type: eval + description: The expression to obtain the version of seqkit + +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - seqkit: + type: string + description: The name of the tool + - "seqkit version | sed 's/seqkit v//'": + type: eval + description: The expression to obtain the version of seqkit + +authors: + - "@mjcipriano" +maintainers: + - "@mjcipriano" diff --git a/modules/nf-core/seqkit/replace/tests/main.nf.test b/modules/nf-core/seqkit/replace/tests/main.nf.test new file mode 100644 index 0000000..15fb7d3 --- /dev/null +++ b/modules/nf-core/seqkit/replace/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process SEQKIT_REPLACE" + script "../main.nf" + process "SEQKIT_REPLACE" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/replace" + + test("sarscov2 - fasta - replace") { + + config "./replace.config" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta.gz', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + + test("sarscov2 - fasta - replace - uncompressed") { + + config "./replace.config" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fasta - uncomp - custom") { + + config "./uncomp.config" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fasta - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/replace/tests/main.nf.test.snap b/modules/nf-core/seqkit/replace/tests/main.nf.test.snap new file mode 100644 index 0000000..fdae3eb --- /dev/null +++ b/modules/nf-core/seqkit/replace/tests/main.nf.test.snap @@ -0,0 +1,166 @@ +{ + "sarscov2 - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T14:51:31.369966565" + }, + "sarscov2 - fasta - replace - uncompressed": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,b1518908253a4997fcad98270751112e" + ] + ], + "1": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,b1518908253a4997fcad98270751112e" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_REPLACE", 
+ "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T14:51:20.851601162" + }, + "sarscov2 - fasta - uncomp - custom": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test..fasta:md5,05d3294a62c72f5489f067c1da3c2f6c" + ] + ], + "1": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ], + "fastx": [ + [ + { + "id": "test" + }, + "test..fasta:md5,05d3294a62c72f5489f067c1da3c2f6c" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T14:51:26.121667583" + }, + "sarscov2 - fasta - replace": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,c40eaff961f6f2a48bb7e8fd156ed5d7" + ] + ], + "1": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,c40eaff961f6f2a48bb7e8fd156ed5d7" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_REPLACE", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T14:51:15.520168225" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/replace/tests/replace.config b/modules/nf-core/seqkit/replace/tests/replace.config new file mode 100644 index 0000000..8766447 --- /dev/null +++ b/modules/nf-core/seqkit/replace/tests/replace.config @@ -0,0 +1,5 @@ + process { + withName: 'SEQKIT_REPLACE' { + ext.args = "-s -p 'A' -r 'N'" + } + } diff --git a/modules/nf-core/seqkit/replace/tests/uncomp.config b/modules/nf-core/seqkit/replace/tests/uncomp.config new file mode 100644 index 0000000..dbd892b --- /dev/null +++ b/modules/nf-core/seqkit/replace/tests/uncomp.config @@ -0,0 +1,6 @@ + process { + withName: 'SEQKIT_REPLACE' { + ext.args = "-s -p 'T' -r 'N'" + ext.suffix = ".fasta" + } + } diff --git a/modules/nf-core/seqkit/rmdup/environment.yml 
b/modules/nf-core/seqkit/rmdup/environment.yml new file mode 100644 index 0000000..b26fb1e --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.9.0 diff --git a/modules/nf-core/seqkit/rmdup/main.nf b/modules/nf-core/seqkit/rmdup/main.nf new file mode 100644 index 0000000..4c83add --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/main.nf @@ -0,0 +1,66 @@ +process SEQKIT_RMDUP { + tag "$meta.id" + label 'process_low' + // File IO can be a bottleneck. See: https://bioinf.shenwei.me/seqkit/usage/#parallelization-of-cpu-intensive-jobs + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0': + 'biocontainers/seqkit:2.9.0--h9ee0642_0' }" + + input: + tuple val(meta), path(fastx) + + output: + tuple val(meta), path("${prefix}.${extension}") , emit: fastx + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + extension = "fastq" + if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/ ) { + extension = "fasta" + } + extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension + // SeqKit/rmdup takes care of compressing the output: https://bioinf.shenwei.me/seqkit/usage/#rmdup + if("${prefix}.${extension}" == "$fastx") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ """ + seqkit \\ + rmdup \\ + --threads $task.cpus \\ + $args \\ + $fastx \\ + -o ${prefix}.${extension} \\ + 2>| >(tee ${prefix}.log >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(seqkit version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + extension = "fastq" + if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/ ) { + extension = "fasta" + } + extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension + if("${prefix}.${extension}" == "$fastx") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.${extension} + echo \\ + '[INFO] 0 duplicated records removed' \\ + > ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(seqkit version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqkit/rmdup/meta.yml b/modules/nf-core/seqkit/rmdup/meta.yml new file mode 100644 index 0000000..62baa3a --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/meta.yml @@ -0,0 +1,66 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "seqkit_rmdup" +description: Transforms sequences (extract ID, filter by length, remove gaps, reverse + complement...) +keywords: + - genomics + - fasta + - fastq + - remove + - duplicates +tools: + - "seqkit": + description: "A cross-platform and ultrafast toolkit for FASTA/Q file manipulation" + homepage: "https://bioinf.shenwei.me/seqkit/" + documentation: "https://bioinf.shenwei.me/seqkit/usage/" + tool_dev_url: "https://github.com/shenwei356/seqkit" + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - fastx: + type: file + description: Input fasta/fastq file + pattern: "*.{fsa,fas,fa,fasta,fastq,fq,fsa.gz,fas.gz,fa.gz,fasta.gz,fastq.gz,fq.gz}" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ +output: + fastx: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}.${extension}: + type: file + description: Output fasta/fastq file + pattern: "*.{fasta,fasta.gz,fastq,fastq.gz}" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ + log: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - "*.log": + type: file + description: Log containing information regarding removed duplicates + pattern: "*.log" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/nf-core/seqkit/rmdup/tests/main.nf.test b/modules/nf-core/seqkit/rmdup/tests/main.nf.test new file mode 100644 index 0000000..beac3a4 --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/tests/main.nf.test @@ -0,0 +1,172 @@ +nextflow_process { + + name "Test Process SEQKIT_RMDUP" + script "../main.nf" + process "SEQKIT_RMDUP" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/rmdup" + + test("sarscov2-genome_fasta") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("repeated-fasta") { + when { + process { + """ + def repeated_fasta = file('repeated.fasta') 
+ repeated_fasta.text = '>A\\nAGCTAGCTAGCT\\n>B\\nAGCTAGCTAGCT\\n>A\\nAGCTAGCTAGCT' + + input[0] = [ + [ id:'test' ], // meta map + repeated_fasta + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('1 duplicated records removed') } + ) + } + + } + + test("sarscov2-genome_fasta_gz") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("sarscov2-test_1_fastq_gz") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("file_name_conflict-fail_with_error") { + when { + process { + """ + input[0] = [ + [ id:'test_1' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + + test("sarscov2-genome_fasta-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out).match() }, + { assert path(process.out.log[0][1]).text.contains('0 duplicated records removed') } + ) + } + + } + + test("file_name_conflict-fail_with_error-stub") { + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/rmdup/tests/main.nf.test.snap b/modules/nf-core/seqkit/rmdup/tests/main.nf.test.snap new file mode 100644 index 0000000..4b2858d --- /dev/null +++ b/modules/nf-core/seqkit/rmdup/tests/main.nf.test.snap @@ -0,0 +1,247 @@ +{ + "sarscov2-genome_fasta-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,cf833211befdf890bb6b2a3cd0b91853" + ] + ], + "2": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,cf833211befdf890bb6b2a3cd0b91853" + ] + ], + "versions": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:12:04.249165" + }, + "sarscov2-test_1_fastq_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "2": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "log": [ + [ + { + "id": "test" + }, + 
"test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "versions": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:11:57.07272" + }, + "sarscov2-genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "2": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "versions": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:11:44.13147" + }, + "repeated-fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,7510a742291241e7d7556bf720caf65c" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,314c0aaef0f832a217a3f6ce3f8bc117" + ] + ], + "2": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,7510a742291241e7d7556bf720caf65c" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,314c0aaef0f832a217a3f6ce3f8bc117" + ] + ], + "versions": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:11:48.203975" + }, + "sarscov2-genome_fasta_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "2": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ], + "fastx": [ + [ + { + "id": 
"test" + }, + "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "log": [ + [ + { + "id": "test" + }, + "test.log:md5,a41135cfe024baaf42f135583fe73f0d" + ] + ], + "versions": [ + "versions.yml:md5,9855ca606b68cb8c32718a1249488688" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:11:52.657459" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/seq/environment.yml b/modules/nf-core/seqkit/seq/environment.yml new file mode 100644 index 0000000..b26fb1e --- /dev/null +++ b/modules/nf-core/seqkit/seq/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.9.0 diff --git a/modules/nf-core/seqkit/seq/main.nf b/modules/nf-core/seqkit/seq/main.nf new file mode 100644 index 0000000..c3b6cb5 --- /dev/null +++ b/modules/nf-core/seqkit/seq/main.nf @@ -0,0 +1,67 @@ +process SEQKIT_SEQ { + tag "${meta.id}" + label 'process_low' + // File IO can be a bottleneck. See: https://bioinf.shenwei.me/seqkit/usage/#parallelization-of-cpu-intensive-jobs + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 
'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0' + : 'biocontainers/seqkit:2.9.0--h9ee0642_0'}" + + input: + tuple val(meta), path(fastx) + + output: + tuple val(meta), path("${prefix}.*"), emit: fastx + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def extension = "fastq" + if ("${fastx}" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/) { + extension = "fasta" + } + extension = fastx.toString().endsWith('.gz') ? "${extension}.gz" : extension + def call_gzip = extension.endsWith('.gz') ? "| gzip -c ${args2}" : '' + if ("${prefix}.${extension}" == "${fastx}") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + seqkit \\ + seq \\ + --threads ${task.cpus} \\ + ${args} \\ + ${fastx} \\ + ${call_gzip} \\ + > ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(seqkit version | cut -d' ' -f2) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + def extension = "fastq" + if ("${fastx}" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz|.+\.fsa|.+\.fsa.gz/) { + extension = "fasta" + } + extension = fastx.toString().endsWith('.gz') ? 
"${extension}.gz" : extension + if ("${prefix}.${extension}" == "${fastx}") { + error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!") + } + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$(seqkit version | cut -d' ' -f2) + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqkit/seq/meta.yml b/modules/nf-core/seqkit/seq/meta.yml new file mode 100644 index 0000000..b1bca11 --- /dev/null +++ b/modules/nf-core/seqkit/seq/meta.yml @@ -0,0 +1,57 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "seqkit_seq" +description: Transforms sequences (extract ID, filter by length, remove gaps, reverse + complement...) +keywords: + - genomics + - fasta + - fastq + - transform + - filter + - gaps + - complement +tools: + - "seqkit": + description: "A cross-platform and ultrafast toolkit for FASTA/Q file manipulation" + homepage: "https://bioinf.shenwei.me/seqkit/" + documentation: "https://bioinf.shenwei.me/seqkit/usage/" + tool_dev_url: "https://github.com/shenwei356/seqkit" + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - fastx: + type: file + description: Input fasta/fastq file + pattern: "*.{fsa,fas,fa,fasta,fastq,fq,fsa.gz,fas.gz,fa.gz,fasta.gz,fastq.gz,fq.gz}" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ +output: + fastx: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - ${prefix}.*: + type: file + description: Output fasta/fastq file + pattern: "*.{fasta,fasta.gz,fastq,fastq.gz}" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/nf-core/seqkit/seq/tests/main.nf.test b/modules/nf-core/seqkit/seq/tests/main.nf.test new file mode 100644 index 0000000..9fd1c08 --- /dev/null +++ b/modules/nf-core/seqkit/seq/tests/main.nf.test @@ -0,0 +1,145 @@ +nextflow_process { + + name "Test Process SEQKIT_SEQ" + script "../main.nf" + process "SEQKIT_SEQ" + config './nextflow.config' + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/seq" + + test("sarscov2-genome_fasta") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("sarscov2-genome_fasta_gz") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("sarscov2-test_1_fastq_gz") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("file_name_conflict-fail_with_error") { + when { + process { + 
""" + input[0] = [ + [ id:'test_1' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + + test("sarscov2-genome_fasta-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("file_name_conflict-fail_with_error-stub") { + + options '-stub' + + when { + process { + """ + input[0] = [ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("Input and output names are the same") } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/seq/tests/main.nf.test.snap b/modules/nf-core/seqkit/seq/tests/main.nf.test.snap new file mode 100644 index 0000000..6817193 --- /dev/null +++ b/modules/nf-core/seqkit/seq/tests/main.nf.test.snap @@ -0,0 +1,134 @@ +{ + "sarscov2-genome_fasta-stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:13:34.513457" + }, + "sarscov2-test_1_fastq_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, 
+ "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:13:27.316329" + }, + "sarscov2-genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "versions": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:13:18.463038" + }, + "sarscov2-genome_fasta_gz": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "1": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ], + "fastx": [ + [ + { + "id": "test" + }, + "test.fasta.gz:md5,483f4a5dfe60171c86ee9b7e6dff908b" + ] + ], + "versions": [ + "versions.yml:md5,eeb475e557ef671d4b58e11f82d2448e" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-15T15:13:22.960973" + } +} \ No newline at end of file diff --git a/modules/nf-core/seqkit/seq/tests/nextflow.config b/modules/nf-core/seqkit/seq/tests/nextflow.config new file mode 100644 index 0000000..d8e3c66 --- /dev/null +++ b/modules/nf-core/seqkit/seq/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args2 = '-n' +} diff --git a/modules/nf-core/seqkit/stats/environment.yml b/modules/nf-core/seqkit/stats/environment.yml new file mode 100644 index 0000000..b26fb1e --- /dev/null +++ b/modules/nf-core/seqkit/stats/environment.yml @@ -0,0 +1,7 @@ 
+--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::seqkit=2.9.0 diff --git a/modules/nf-core/seqkit/stats/main.nf b/modules/nf-core/seqkit/stats/main.nf new file mode 100644 index 0000000..540aa88 --- /dev/null +++ b/modules/nf-core/seqkit/stats/main.nf @@ -0,0 +1,36 @@ +process SEQKIT_STATS { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0' + : 'biocontainers/seqkit:2.9.0--h9ee0642_0'}" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.tsv"), emit: stats + tuple val("${task.process}"), val('seqkit'), eval("seqkit version | sed 's/seqkit v//'"), emit: versions_seqkit, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '--all' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + seqkit stats \\ + --tabular \\ + --threads ${task.cpus} \\ + ${args} \\ + ${reads} > '${prefix}.tsv' + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.tsv + """ +} diff --git a/modules/nf-core/seqkit/stats/meta.yml b/modules/nf-core/seqkit/stats/meta.yml new file mode 100644 index 0000000..1d51b05 --- /dev/null +++ b/modules/nf-core/seqkit/stats/meta.yml @@ -0,0 +1,72 @@ +name: "seqkit_stats" +description: simple statistics of FASTA/Q files +keywords: + - seqkit + - fasta + - stats +tools: + - "seqkit": + description: Cross-platform and ultrafast toolkit for FASTA/Q file manipulation, + written by Wei Shen. 
+ homepage: https://bioinf.shenwei.me/seqkit/usage/ + documentation: https://bioinf.shenwei.me/seqkit/usage/ + tool_dev_url: https://github.com/shenwei356/seqkit/ + doi: "10.1371/journal.pone.0163962" + licence: ["MIT"] + identifier: biotools:seqkit +input: + - - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - reads: + type: file + description: > + Either FASTA or FASTQ files. + pattern: "*.{fa,fna,faa,fasta,fq,fastq}[.gz]" + ontologies: + - edam: http://edamontology.org/format_1930 # FASTQ +output: + stats: + - - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false + ] + - "*.tsv": + type: file + description: > + Tab-separated output file with basic sequence statistics. + pattern: "*.tsv" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + versions_seqkit: + - - ${task.process}: + type: string + description: The name of the process + - seqkit: + type: string + description: The name of the tool + - "seqkit version | sed 's/seqkit v//'": + type: eval + description: The expression to obtain the version of seqkit + +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - seqkit: + type: string + description: The name of the tool + - "seqkit version | sed 's/seqkit v//'": + type: eval + description: The expression to obtain the version of seqkit + +authors: + - "@Midnighter" + - "@heuermh" +maintainers: + - "@Midnighter" + - "@heuermh" diff --git a/modules/nf-core/seqkit/stats/tests/main.nf.test b/modules/nf-core/seqkit/stats/tests/main.nf.test new file mode 100644 index 0000000..2cd4eb4 --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/main.nf.test @@ -0,0 +1,141 @@ +nextflow_process { + + name "Test Process SEQKIT_STATS" + script "../main.nf" + process "SEQKIT_STATS" + + tag "modules" + tag "modules_nfcore" + tag "seqkit" + tag "seqkit/stats" + + test("single_end") { + + 
when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("paired_end") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("nanopore") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/nanopore/fastq/test.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("genome_fasta") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("transcriptome_fasta") { + + when { + process { + """ + input[0] = [[ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/transcriptome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("single_end - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [[ id:'test', single_end:true ], // 
meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/seqkit/stats/tests/main.nf.test.snap b/modules/nf-core/seqkit/stats/tests/main.nf.test.snap new file mode 100644 index 0000000..b83e6a6 --- /dev/null +++ b/modules/nf-core/seqkit/stats/tests/main.nf.test.snap @@ -0,0 +1,260 @@ +{ + "nanopore": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,14f97a9e0414998854ead651e0e69449" + ] + ], + "1": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,14f97a9e0414998854ead651e0e69449" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T13:29:39.937621054" + }, + "genome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,82b33df8ec2515560b80c3d0bc64c898" + ] + ], + "1": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,82b33df8ec2515560b80c3d0bc64c898" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T13:29:45.268086738" + }, + "transcriptome_fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,eef67c2e2f225391836d59d5b0d6c3b7" + ] + ], + "1": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,eef67c2e2f225391836d59d5b0d6c3b7" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_STATS", + 
"seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T13:29:50.657274211" + }, + "single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,f172827a8608b646559cc39c6ca05085" + ] + ], + "1": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,f172827a8608b646559cc39c6ca05085" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T13:29:29.264287187" + }, + "paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,f172827a8608b646559cc39c6ca05085" + ] + ], + "1": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,f172827a8608b646559cc39c6ca05085" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T13:29:34.665600897" + }, + "single_end - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ], + "stats": [ + [ + { + "id": "test", + "single_end": true + }, + "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_seqkit": [ + [ + "SEQKIT_STATS", + "seqkit", + "2.9.0" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T13:29:55.952952629" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 0000000..9b926b1 --- /dev/null +++ b/modules/nf-core/untar/environment.yml 
@@ -0,0 +1,12 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::coreutils=9.5 + - conda-forge::grep=3.11 + - conda-forge::gzip=1.13 + - conda-forge::lbzip2=2.5 + - conda-forge::sed=4.8 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000..b9c324d --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,75 @@ +process UNTAR { + tag "${archive}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/52/52ccce28d2ab928ab862e25aae26314d69c8e38bd41ca9431c67ef05221348aa/data' + : 'community.wave.seqera.io/library/coreutils_grep_gzip_lbzip2_pruned:838ba80435a629f8'}" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${prefix}"), emit: untar + tuple val("${task.process}"), val('untar'), eval('tar --version 2>&1 | head -1 | sed "s/tar (GNU tar) //; s/ Copyright.*//"'), emit: versions_untar, topic: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: (meta.id ? 
"${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir ${prefix} + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C ${prefix} --strip-components 1 \\ + -xavf \\ + ${args} \\ + ${archive} \\ + ${args2} + else + tar \\ + -C ${prefix} \\ + -xavf \\ + ${args} \\ + ${archive} \\ + ${args2} + fi + + """ + + stub: + prefix = task.ext.prefix ?: (meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir ${prefix} + ## Dry-run untaring the archive to get the files and place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + for i in `tar -tf ${archive}`; + do + if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; + then + touch \${i} + else + mkdir -p \${i} + fi + done + else + for i in `tar -tf ${archive}`; + do + if [[ \$(echo "\${i}" | grep -E "/\$") == "" ]]; + then + touch ${prefix}/\${i} + else + mkdir -p ${prefix}/\${i} + fi + done + fi + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000..571d807 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,73 @@ +name: untar +description: Extract files from tar, tar.gz, tar.bz2, tar.xz archives +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar, tar.gz, tar.bz2, tar.xz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - archive: + type: file + description: File to be untarred + pattern: "*.{tar,tar.gz,tar.bz2,tar.xz}" + ontologies: + - edam: http://edamontology.org/format_3981 # TAR format + - edam: http://edamontology.org/format_3989 # GZIP format +output: + untar: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + pattern: "*/" + - ${prefix}: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + pattern: "*/" + versions_untar: + - - ${task.process}: + type: string + description: The name of the process + - untar: + type: string + description: The name of the tool + - tar --version 2>&1 | head -1 | sed "s/tar (GNU tar) //; s/ Copyright.*//": + type: eval + description: The expression to obtain the version of the tool + +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - untar: + type: string + description: The name of the tool + - tar --version 2>&1 | head -1 | sed "s/tar (GNU tar) //; s/ Copyright.*//": + type: eval + description: The expression to obtain the version of the tool + +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 0000000..fde8db1 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,97 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + 
process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } + + test("test_untar_onlyfiles") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } + + test("test_untar - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } + + test("test_untar_onlyfiles - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.untar, + process.out.findAll { key, val -> key.startsWith('versions') } + ).match() }, + ) + } + } +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 0000000..51a414d --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,118 @@ +{ + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:32.000491" + }, + "test_untar_onlyfiles - stub": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] 
+ ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:58.812479" + }, + "test_untar - stub": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "opts.k2d:md5,d41d8cd98f00b204e9800998ecf8427e", + "taxo.k2d:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:48.119456" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ], + { + "versions_untar": [ + [ + "UNTAR", + "untar", + "1.34" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2026-01-28T17:49:17.252494" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 1dcfdb8..e56f91f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,20 +9,40 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options - input = null + input = null - // References - genome = null - igenomes_base = 's3://ngi-igenomes/igenomes/' - igenomes_ignore = false + // QC + skip_preprocessing = false + min_seq_length = 30 + max_seq_length = 5000 + remove_duplicates_on_sequence = false + + // Domain annotation + skip_pfam = false + pfam_db = null + pfam_latest_link = "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz" + skip_funfam = false + funfam_db = null + funfam_latest_link = "https://download.cathdb.info/cath/releases/all-releases/v4_3_0/sequence-data/funfam-hmm3-v4_3_0.lib.gz" + hmmsearch_evalue_cutoff = 0.001 + + // Functional annotation + skip_interproscan = false + 
interproscan_db_url = "https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/interproscan-5.72-103.0-64-bit.tar.gz" + interproscan_db = null + interproscan_applications = 'Hamap,PANTHER,PIRSF,TIGRFAM,sfld' + interproscan_enableprecalc = false + + // Secondary structure prediction (s4pred) + skip_s4pred = false + s4pred_outfmt = 'ss2' // ["ss2", "fas", "horiz"] // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' + multiqc_config = null + multiqc_title = null + multiqc_logo = null + max_multiqc_email_size = '25.MB' multiqc_methods_description = null // Boilerplate options @@ -32,13 +52,15 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false - hook_url = null + hook_url = System.getenv('HOOK_URL') help = false help_full = false show_hidden = false version = false pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' - trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')// Config options + trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + + // Config options config_profile_name = null config_profile_description = null @@ -46,9 +68,10 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null + modules_testdata_base_path = null // Schema validation default options - validate_params = true + validate_params = true } // Load base.config by default for all pipelines @@ -56,132 +79,138 @@ includeConfig 'conf/base.config' profiles { debug { - dumpHashes = true - process.beforeScript = 'echo $HOSTNAME' - cleanup = false + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false nextflow.enable.configProcessNamesValidation = true } conda { - conda.enabled = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled 
= false - charliecloud.enabled = false - conda.channels = ['conda-forge', 'bioconda'] - apptainer.enabled = false + conda.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + conda.channels = ['conda-forge', 'bioconda'] + apptainer.enabled = false } mamba { - conda.enabled = true - conda.useMamba = true - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } docker { - docker.enabled = true - conda.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' + docker.enabled = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' + } + arm64 { + process.arch = 'arm64' + // TODO https://github.com/nf-core/modules/issues/6694 + // For now if you're using arm64 you have to use wave for the sake of the maintainers + // wave profile + apptainer.ociAutoPull = true + singularity.ociAutoPull = true + wave.enabled = true + wave.freeze = true + wave.strategy = 'conda,container' } - arm { + emulate_amd64 { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { - singularity.enabled = true - singularity.autoMounts = true - conda.enabled = false - docker.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = true + singularity.autoMounts = true + 
conda.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } podman { - podman.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + podman.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } shifter { - shifter.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + shifter.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } charliecloud { - charliecloud.enabled = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - apptainer.enabled = false + charliecloud.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + apptainer.enabled = false } apptainer { - apptainer.enabled = true - apptainer.autoMounts = true - conda.enabled = false - docker.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false + apptainer.enabled = true + apptainer.autoMounts = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false } wave { - apptainer.ociAutoPull = true + apptainer.ociAutoPull = true singularity.ociAutoPull = true - wave.enabled = true - wave.freeze = true - wave.strategy = 'conda,container' + wave.enabled = true + wave.freeze = true 
+ wave.strategy = 'conda,container' } - gitpod { - executor.name = 'local' - executor.cpus = 4 - executor.memory = 8.GB - process { - resourceLimits = [ - memory: 8.GB, - cpus : 4, - time : 1.h - ] - } + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } -// Load nf-core custom profiles from different Institutions -includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" +// Load nf-core custom profiles from different institutions + +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +// Load nf-core/proteinannotator custom profiles from different institutions. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" + // Load nf-core/proteinannotator custom profiles from different institutions. -// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs -// includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/proteinannotator.config" : "/dev/null" +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/pipeline/proteinannotator.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled // Set to your registry if you have a mirror of containers -apptainer.registry = 'quay.io' -docker.registry = 'quay.io' -podman.registry = 'quay.io' -singularity.registry = 'quay.io' +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' charliecloud.registry = 'quay.io' -// Load igenomes.config if required -includeConfig !params.igenomes_ignore ? 'conf/igenomes.config' : 'conf/igenomes_ignored.config' + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. @@ -189,19 +218,19 @@ includeConfig !params.igenomes_ignore ? 'conf/igenomes.config' : 'conf/igenomes_ env { PYTHONNOUSERSITE = 1 - R_PROFILE_USER = "/.Rprofile" - R_ENVIRON_USER = "/.Renviron" + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" } // Set bash options process.shell = [ "bash", - "-C", // No clobber - prevent output redirection from overwriting files. - "-e", // Exit if a tool returns a non-zero status/exit code - "-u", // Treat unset variables and parameters as an error - "-o", // Returns the status of the last command to exit.. - "pipefail" // ..with a non-zero status or zero if all successfully execute + "-C", + "-e", + "-u", + "-o", + "pipefail", ] // Disable process selector warnings by default. Use debug profile to enable warnings. 
@@ -209,79 +238,66 @@ nextflow.enable.configProcessNamesValidation = false timeline { enabled = true - file = "${params.outdir}/pipeline_info/execution_timeline_${params.trace_report_suffix}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${params.trace_report_suffix}.html" } report { enabled = true - file = "${params.outdir}/pipeline_info/execution_report_${params.trace_report_suffix}.html" + file = "${params.outdir}/pipeline_info/execution_report_${params.trace_report_suffix}.html" } trace { enabled = true - file = "${params.outdir}/pipeline_info/execution_trace_${params.trace_report_suffix}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${params.trace_report_suffix}.txt" } dag { enabled = true - file = "${params.outdir}/pipeline_info/pipeline_dag_${params.trace_report_suffix}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${params.trace_report_suffix}.html" } manifest { name = 'nf-core/proteinannotator' - author = """Olga Botvinnik""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ - // TODO nf-core: Update the field with the details of the contributors to your pipeline. 
New with Nextflow version 24.10.0 [ name: 'Olga Botvinnik', - affiliation: '', + affiliation: 'Seanome, San Francisco, CA, USA', + email: 'olga@seanome.org', + github: 'olgabot', + contribution: ['author'], // List of contribution types ('author', 'maintainer' or 'contributor') + orcid: 'https://orcid.org/0000-0003-4412-7970' + ], + [ + name: 'Michael L Heuer', + affiliation: """Network.bio, New York, NY, USA""", email: '', - github: '', - contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor') - orcid: '' + github: '@heuermh', + contribution: ['contributor'], + orcid: 'https://orcid.org/0000-0002-9052-6000' ], + [ + name: 'Evangelos Karatzas', + affiliation: 'EMBL-EBI', + email: 'vangelis@ebi.ac.uk', + github: 'https://github.com/vagkaratzas', + contribution: ['maintainer'], + orcid: '0000-0001-9132-8981' + ] ] homePage = 'https://github.com/nf-core/proteinannotator' - description = """The best protein annotation pipeline in the world. Protein fasta -> ??? 
-> Annotations!""" + description = """Generation of sequence-level annotations for amino acid sequences""" mainScript = 'main.nf' defaultBranch = 'master' - nextflowVersion = '!>=24.04.2' - version = '1.0.0dev' + nextflowVersion = '!>=25.10.0' + version = '1.0.0' doi = '' } // Nextflow plugins plugins { - id 'nf-schema@2.3.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-schema@2.6.1' // Validation of pipeline parameters and creation of an input channel from a sample sheet } validation { defaultIgnoreParams = ["genomes"] monochromeLogs = params.monochrome_logs - help { - enabled = true - command = "nextflow run nf-core/proteinannotator -profile --input samplesheet.csv --outdir " - fullParameter = "help_full" - showHiddenParameter = "show_hidden" - beforeText = """ --\033[2m----------------------------------------------------\033[0m- - \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m -\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m -\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m -\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m - \033[0;32m`._,._,\'\033[0m -\033[0;35m nf-core/proteinannotator ${manifest.version}\033[0m --\033[2m----------------------------------------------------\033[0m- -""" - afterText = """${manifest.doi ? "\n* The pipeline\n" : ""}${manifest.doi.tokenize(",").collect { " https://doi.org/${it.trim().replace('https://doi.org/','')}"}.join("\n")}${manifest.doi ? 
"\n" : ""} -* The nf-core framework - https://doi.org/10.1038/s41587-020-0439-x - -* Software dependencies - https://github.com/nf-core/proteinannotator/blob/master/CITATIONS.md -""" - } - summary { - beforeText = validation.help.beforeText - afterText = validation.help.afterText - } } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index c885023..b7ad6d8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/nf-core/proteinannotator/master/nextflow_schema.json", "title": "nf-core/proteinannotator pipeline parameters", - "description": "The best protein annotation pipeline in the world. Protein fasta -> ??? -> Annotations!", + "description": "Generation of sequence-level annotations for amino acid sequences", "type": "object", "$defs": { "input_output_options": { @@ -43,45 +43,6 @@ } } }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { - "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." - }, - "fasta": { - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. 
If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" - }, - "igenomes_ignore": { - "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." - }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "The base path to the igenomes reference files", - "fa_icon": "fas fa-ban", - "hidden": true, - "default": "s3://ngi-igenomes/igenomes/" - } - } - }, "institutional_config_options": { "title": "Institutional config options", "type": "object", @@ -219,11 +180,168 @@ "default": "https://raw.githubusercontent.com/nf-core/test-datasets/", "hidden": true }, + "modules_testdata_base_path": { + "type": "string", + "description": "Base URL or local path to location of modules test dataset files" + }, "trace_report_suffix": { "type": "string", "fa_icon": "far calendar", "description": "Suffix to add to the trace report filename. Default is the date and time in the format yyyy-MM-dd_HH-mm-ss.", "hidden": true + }, + "help": { + "type": ["boolean", "string"], + "description": "Display the help message." + }, + "help_full": { + "type": "boolean", + "description": "Display the full detailed help message." + }, + "show_hidden": { + "type": "boolean", + "description": "Display hidden parameters in the help message (only works when --help or --help_full are provided)." 
+ } + } + }, + "quality_check_params": { + "title": "Quality check parameters", + "type": "object", + "description": "Use these parameters to control the flow of the quality check subworkflow execution.", + "properties": { + "skip_preprocessing": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip all default QC steps for sequences (gap trimming, length filtering, validation, duplicate removal).", + "help": "Skips input amino acid sequence preprocessing transformations, length filtering, duplicate removal and validation checks." + }, + "min_seq_length": { + "type": "integer", + "default": 30, + "fa_icon": "fas fa-ruler-horizontal", + "description": "The minimum allowed sequence length", + "help_text": "Specify the minimum length of amino acid sequences that go into clustering. Modifies the --min-len parameter of seqkit seq." + }, + "max_seq_length": { + "type": "integer", + "default": 5000, + "fa_icon": "fas fa-ruler-horizontal", + "description": "The maximum allowed sequence length", + "help_text": "Specify the maximum length of amino acid sequences that go into clustering. Modifies the --max-len parameter of seqkit seq" + }, + "remove_duplicates_on_sequence": { + "type": "boolean", + "description": "Remove duplicate input amino acid sequences, based on the sequence.", + "help": "Instead of just removing by similar identifier, the tool also removes duplicate sequences." + } + } + }, + "domain_annotation_params": { + "title": "Domain annotation parameters", + "type": "object", + "description": "Use these parameters to control the flow of the domain annotation execution.", + "properties": { + "skip_pfam": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip the domain annotation with the Pfam database.", + "help": "Skips the domain annotation of input sequence against a Pfam database." 
+ }, + "pfam_db": { + "type": "string", + "format": "file-path", + "description": "Path to an already installed Pfam HMM database (.hmm.gz).", + "help_text": "If left null and skip_pfam is false, the pipeline will start downloading the latest Pfam HMM library." + }, + "pfam_latest_link": { + "type": "string", + "default": "https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz", + "description": "InterPro hosted link to the latest Pfam HMM database file.", + "help_text": "Latest version should be a bit more than 350MB." + }, + "skip_funfam": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip the domain annotation with the FunFam database.", + "help": "Skips the domain annotation of input sequence against a FunFam database." + }, + "funfam_db": { + "type": "string", + "format": "file-path", + "description": "Path to an already installed FunFam HMM database (.lib.gz).", + "help_text": "If left null and skip_funfam is false, the pipeline will start downloading the latest FunFam HMM library." + }, + "funfam_latest_link": { + "type": "string", + "default": "https://download.cathdb.info/cath/releases/all-releases/v4_3_0/sequence-data/funfam-hmm3-v4_3_0.lib.gz", + "description": "CATH hosted link to the latest available (v4_3_0) FunFam HMM database file." + }, + "hmmsearch_evalue_cutoff": { + "type": "number", + "default": 0.001, + "description": "hmmsearch e-value cutoff threshold for reported results. Modifies the -E parameter of hmmsearch." 
+ } + } + }, + "functional_annotation_options": { + "title": "Functional annotation parameters", + "type": "object", + "description": "Use these parameters to control the flow of the functional annotation execution.", + "default": "", + "properties": { + "skip_interproscan": { + "type": "boolean", + "description": "Skip the functional annotation with InterProScan.", + "default": false + }, + "interproscan_db_url": { + "type": "string", + "default": "https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/interproscan-5.72-103.0-64-bit.tar.gz", + "help_text": "This allows the user to change the InterProScan database version that the pipeline will download for you automatically. To instead use a pre-downloaded database, please supply its path to `--interproscan_db`. Changing this URL allows for the use of the latest database release. By default this is set to `https://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/5.72-103.0/interproscan-5.72-103.0-64-bit.tar.gz`. ", + "description": "Change the database version used for annotation.", + "fa_icon": "fas fa-database" + }, + "interproscan_db": { + "type": "string", + "help_text": "Use this to supply the path to a pre-downloaded InterProScan database. This can be any unzipped InterProScan version.\n\nFor more details on where to find different InterProScan databases see tool [documentation](https://interproscan-docs.readthedocs.io/en/latest/UserDocs.html#obtaining-a-copy-of-interproscan).\n", + "description": "Path to pre-downloaded InterProScan database.", + "fa_icon": "fas fa-database" + }, + "interproscan_applications": { + "type": "string", + "default": "Hamap,PANTHER,PIRSF,TIGRFAM,sfld", + "pattern": "^\\w+(,\\w+)*", + "help_text": "A comma-separated string specifying the database(s) to be used to annotate the coding regions annotated during the contig annotation workflow of the pipeline. 
By default these include `Hamap,PANTHER,PIRSF,TIGRFAM,sfld`.\n\nFor more information about all possible databases see the tool [documentation](https://interproscan-docs.readthedocs.io/en/v5/HowToRun.html).\n\n> Modifies tool parameter(s):\n> - InterProScan: `--applications`", + "description": "Assigns the database(s) to be used to annotate the coding regions.", + "fa_icon": "fas fa-database" + }, + "interproscan_enableprecalc": { + "type": "boolean", + "help_text": "This increases the speed of functional annotation with InterProScan by pre-calculating matches found in the UniProtKB, thereby identifying unique matches in the query sequences for faster annotation. By default this is turned off.\n\nFor more information about this flag see the tool [documentation](https://interproscan-docs.readthedocs.io/en/latest/HowToRun.html).\n\n> Modifies tool parameter(s):\n> - InterProScan: `--disable-precalc`", + "description": "Pre-calculates residue mutual matches.", + "fa_icon": "fas fa-clock" + } + }, + "help_text": "This subworkflow adds additional protein annotations to all input sequences. Currently, only annotation with InterProScan is integrated in the subworkflow.", + "fa_icon": "fas fa-file-signature" + }, + "prediction_params": { + "title": "Prediction parameters", + "type": "object", + "description": "Use these parameters to control the flow of the secondary structure prediction execution.", + "properties": { + "skip_s4pred": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip the secondary structure prediction.", + "help": "Skips the prediction of the secondary structures of sequences with the s4pred software." + }, + "s4pred_outfmt": { + "type": "string", + "default": "ss2", + "description": "Choose the output format (i.e., 'ss2', 'fas', 'horiz') for the s4pred per amino acid probability predictions (i.e., α-helix, β-strand, coil). 
Modifies the --outfmt parameter of s4pred run_model.", + "help_text": "ss2 is the default and it corresponds to the PSIPRED vertical format (PSIPRED VFORMAT). The fas output returns the sequence FASTA file with the predicted secondary structure concatenated on a second line. The horiz option outputs the results in the PSIPRED horizontal format (PSIPRED HFORMAT).", + "enum": ["ss2", "fas", "horiz"] } } } @@ -232,14 +350,24 @@ { "$ref": "#/$defs/input_output_options" }, - { - "$ref": "#/$defs/reference_genome_options" - }, { "$ref": "#/$defs/institutional_config_options" }, { "$ref": "#/$defs/generic_options" + }, + { + "$ref": "#/$defs/quality_check_params" + }, + { + "$ref": "#/$defs/domain_annotation_params" + }, + + { + "$ref": "#/$defs/functional_annotation_options" + }, + { + "$ref": "#/$defs/prediction_params" } ] } diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000..3a1fff5 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,24 @@ +config { + // location for all nf-test tests + testsDir "." 
+ + // nf-test directory including temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // location of an optional nextflow.config file specific for executing tests + configFile "tests/nextflow.config" + + // ignore tests coming from the nf-core/modules repo + ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' + + // run all tests with defined profile(s) from the main nextflow.config + profile "test" + + // list of filenames or patterns that should trigger a full test run + triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + + // load the necessary plugins + plugins { + load "nft-utils@0.0.3" + } +} diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index c116c14..51bb9a3 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -21,9 +21,9 @@ { "@id": "./", "@type": "Dataset", - "creativeWorkStatus": "InProgress", - "datePublished": "2025-03-12T16:58:57+00:00", - "description": "

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on 
YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinannotator** is a bioinformatics pipeline that ...\n\n\n\n\n1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\n\nNow, you can run the pipeline using:\n\n\n\n```bash\nnextflow run nf-core/proteinannotator \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinannotator/usage) and the [parameter documentation](https://nf-co.re/proteinannotator/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinannotator/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinannotator/output).\n\n## Credits\n\nnf-core/proteinannotator was originally written by Olga Botvinnik.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n\n\n## Contributions and Support\n\nIf you would like to 
contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinannotator` channel](https://nfcore.slack.com/channels/proteinannotator) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "creativeWorkStatus": "Stable", + "datePublished": "2026-02-09T10:42:29+00:00", + "description": "

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator)\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator)\n\n[![Get 
help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics for protein FASTA inputs and produces protein annotations based on predicted sequence features, including conserved domains, functions, and secondary structure.\n\n

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n### Check quality and pre-process\n\nGenerate input amino acid sequence statistics with ([`SeqFu`](https://github.com/telatin/seqfu2/)) and pre-process them (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) with ([`SeqKit`](https://github.com/shenwei356/seqkit/))\n\n### Annotate sequences\n\n1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases\n such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/) and [FunFam](https://download.cathdb.info/cath/releases/all-releases/)\n2. Functional annotation:\n - ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.\n3. Predict secondary structure compositional features such as \u03b1-helices, \u03b2-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred))\n4. Present QC stats for input sequences before and after initial pre-processing with ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nid,fasta\nspecies1,species1_proteins.fasta\nspecies2,species2_proteins.fasta\n```\n\nEach row represents a FASTA file of proteins from a single species.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinannotator \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinannotator/usage) and the [parameter documentation](https://nf-co.re/proteinannotator/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinannotator/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinannotator/output).\n\n## Credits\n\nnf-core/proteinannotator was originally written by Olga Botvinnik and Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Michael L Heuer](https://github.com/heuermh)\n- [Edmund Miller](https://github.com/edmundmiller)\n- [Eric Wei](https://github.com/eweizy)\n- [Martin Beracochea](https://github.com/mberacochea)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing 
guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinannotator` channel](https://nfcore.slack.com/channels/proteinannotator) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" @@ -99,7 +99,7 @@ }, "mentions": [ { - "@id": "#05d6075c-5759-440c-8287-0bb024b3c84c" + "@id": "#aff5d966-2a2a-4cbf-bf15-44cdd5058ceb" } ], "name": "nf-core/proteinannotator" @@ -126,16 +126,28 @@ "SoftwareSourceCode", "ComputationalWorkflow" ], + "creator": [ + { + "@id": "https://orcid.org/0000-0003-4412-7970" + } + ], "dateCreated": "", - "dateModified": "2025-03-12T09:58:57Z", + "dateModified": "2026-02-09T10:42:29Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nf-core", - "nextflow" + "nextflow", + "annotation", + "proteomics" ], "license": [ "MIT" ], + "maintainer": [ + { + "@id": "https://orcid.org/0000-0003-4412-7970" + } + ], "name": [ "nf-core/proteinannotator" ], @@ -147,10 +159,10 @@ }, "url": [ "https://github.com/nf-core/proteinannotator", - "https://nf-co.re/proteinannotator/dev/" + "https://nf-co.re/proteinannotator/1.0.0/" ], "version": [ - "1.0.0dev" + "1.0.0" ] }, { @@ -163,14 +175,14 @@ "url": { "@id": "https://www.nextflow.io/" }, - "version": "!>=24.04.2" + "version": "!>=25.10.0" }, { - "@id": "#05d6075c-5759-440c-8287-0bb024b3c84c", + "@id": 
"#aff5d966-2a2a-4cbf-bf15-44cdd5058ceb", "@type": "TestSuite", "instance": [ { - "@id": "#674d6a0b-fbe6-49d7-a592-36266d21a9b3" + "@id": "#5d20d507-f40f-4fcc-854d-5d27a47f2941" } ], "mainEntity": { @@ -179,10 +191,10 @@ "name": "Test suite for nf-core/proteinannotator" }, { - "@id": "#674d6a0b-fbe6-49d7-a592-36266d21a9b3", + "@id": "#5d20d507-f40f-4fcc-854d-5d27a47f2941", "@type": "TestInstance", "name": "GitHub Actions workflow for testing nf-core/proteinannotator", - "resource": "repos/nf-core/proteinannotator/actions/workflows/ci.yml", + "resource": "repos/nf-core/proteinannotator/actions/workflows/nf-test.yml", "runsOn": { "@id": "https://w3id.org/ro/terms/test#GithubService" }, @@ -306,6 +318,12 @@ "@type": "Organization", "name": "nf-core", "url": "https://nf-co.re/" + }, + { + "@id": "https://orcid.org/0000-0003-4412-7970", + "@type": "Person", + "email": "olga.botvinnik@gmail.com", + "name": "Olga Botvinnik" } ] } \ No newline at end of file diff --git a/subworkflows/local/domain_annotation/main.nf b/subworkflows/local/domain_annotation/main.nf new file mode 100644 index 0000000..1ec8289 --- /dev/null +++ b/subworkflows/local/domain_annotation/main.nf @@ -0,0 +1,66 @@ +include { ARIA2 as ARIA2_PFAM } from '../../../modules/nf-core/aria2/main' +include { ARIA2 as ARIA2_FUNFAM } from '../../../modules/nf-core/aria2/main' +include { HMMER_HMMSEARCH as HMMSEARCH_PFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMER_HMMSEARCH as HMMSEARCH_FUNFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' + +workflow DOMAIN_ANNOTATION { + take: + ch_fasta // channel: [ val(meta), [ fasta ] ] + skip_pfam // boolean + pfam_db // string, path to the pfam HMM database, if already exists + pfam_latest_link // string, path to the latest pfam HMM database, to download + skip_funfam // boolean + funfam_db // string, path to the funfam HMM database, if already exists + funfam_latest_link // string, path to the latest funfam HMM database, to download + + 
main: + + ch_versions = channel.empty() + ch_pfam_domains = channel.empty() + ch_funfam_domains = channel.empty() + + if (!skip_pfam) { + if (!pfam_db) { + ch_pfam_link = channel.of([ [ id: 'pfam' ], pfam_latest_link ]) + + ARIA2_PFAM( ch_pfam_link ) + ch_versions = ch_versions.mix( ARIA2_PFAM.out.versions ) + ch_pfam_db = ARIA2_PFAM.out.downloaded_file + } else { + ch_pfam_db = channel.of([ [ id: 'pfam' ], pfam_db ]) + } + + ch_input_for_hmmsearch_pfam = ch_fasta + .combine(ch_pfam_db) + .map{ meta, seqs, _meta2, models -> [meta, models, seqs, false, false, true] } + + HMMSEARCH_PFAM( ch_input_for_hmmsearch_pfam ) + ch_versions = ch_versions.mix( HMMSEARCH_PFAM.out.versions.first() ) + ch_pfam_domains = HMMSEARCH_PFAM.out.domain_summary + } + + if (!skip_funfam) { + if (!funfam_db) { + ch_funfam_link = channel.of([ [ id: 'funfam' ], funfam_latest_link ]) + + ARIA2_FUNFAM( ch_funfam_link ) + ch_versions = ch_versions.mix( ARIA2_FUNFAM.out.versions ) + ch_funfam_db = ARIA2_FUNFAM.out.downloaded_file + } else { + ch_funfam_db = channel.of([ [ id: 'funfam' ], funfam_db ]) + } + + ch_input_for_hmmsearch_funfam = ch_fasta + .combine(ch_funfam_db) + .map{ meta, seqs, _meta2, models -> [meta, models, seqs, false, false, true] } + + HMMSEARCH_FUNFAM( ch_input_for_hmmsearch_funfam ) + ch_versions = ch_versions.mix( HMMSEARCH_FUNFAM.out.versions.first() ) + ch_funfam_domains = HMMSEARCH_FUNFAM.out.domain_summary + } + + emit: + pfam_domains = ch_pfam_domains + funfam_domains = ch_funfam_domains + versions = ch_versions +} diff --git a/subworkflows/local/domain_annotation/meta.yml b/subworkflows/local/domain_annotation/meta.yml new file mode 100644 index 0000000..e04e241 --- /dev/null +++ b/subworkflows/local/domain_annotation/meta.yml @@ -0,0 +1,61 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "domain_annotation" +description: Annotate amino acid fasta files with selected HMM libraries such 
as Pfam and FunFam +keywords: + - fasta + - sequences + - domain + - annotation + - database + - download + - HMM +components: + - aria2 + - hmmer/hmmsearch +input: + - ch_fasta: + type: file + description: | + Amino acid fasta file containing amino acid sequences for annotation + Structure: [ val(meta), [ path(fasta) ] ] + - skip_pfam: + type: boolean + description: | + Skip domain annotation with Pfam + - pfam_db: + type: string + description: | + Path to an existing HMM Pfam library on the system. If provided, the ARIA2_PFAM db download will be skipped. + - pfam_latest_link: + type: string + description: | + Path to the latest Pfam HMM database, to download + - skip_funfam: + type: boolean + description: | + Skip domain annotation with FunFam + - funfam_db: + type: string + description: | + Path to an existing HMM FunFam library on the system. If provided, the ARIA2_FUNFAM db download will be skipped. + - funfam_latest_link: + type: string + description: | + Path to the latest FunFam HMM database, to download +output: + - pfam_domains: + type: file + description: | + domtbl.gz files with pfam domain annotation for input amino acid sequences + - funfam_domains: + type: file + description: | + domtbl.gz files with funfam domain annotation for input amino acid sequences + - versions: + type: file + description: | + Versions file containing the software versions used in the workflow +authors: + - "@vagkaratzas" +maintainers: + - "@vagkaratzas" diff --git a/subworkflows/local/domain_annotation/tests/main.nf.test b/subworkflows/local/domain_annotation/tests/main.nf.test new file mode 100644 index 0000000..c713051 --- /dev/null +++ b/subworkflows/local/domain_annotation/tests/main.nf.test @@ -0,0 +1,97 @@ +nextflow_workflow { + + name "Test Subworkflow DOMAIN_ANNOTATION" + script "../main.nf" + workflow "DOMAIN_ANNOTATION" + + test("faa - domain annotation") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id: 'test' ], + 
file(params.pipelines_testdata_base_path + '/testdata/sequences/test_proteins.faa', checkIfExists: true) + ]) + input[1] = false // skip_pfam + input[2] = null // pfam_db + input[3] = params.pipelines_testdata_base_path + '/testdata/pfam/Pfam-A_test.hmm.gz' // pfam_latest_link + input[4] = false // skip_funfam + input[5] = null // funfam_db + input[6] = params.pipelines_testdata_base_path + '/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' // funfam_latest_link + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.pfam_domains[0][1]).linesGzip[0..7], + path(workflow.out.funfam_domains[0][1]).linesGzip[0..7], + workflow.out.versions.collect { path(it).yaml }.unique() + ).match()} + ) + } + } + + test("faa - pfam_db - skip_funfam") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id: 'test' ], + file(params.pipelines_testdata_base_path + '/testdata/sequences/test_proteins.faa', checkIfExists: true) + ]) + input[1] = false // skip_pfam + input[2] = params.pipelines_testdata_base_path + '/testdata/pfam/Pfam-A_test.hmm.gz' // pfam_db + input[3] = params.pipelines_testdata_base_path + '/testdata/pfam/Pfam-A_test.hmm.gz' // pfam_latest_link + input[4] = true // skip_funfam + input[5] = null // funfam_db + input[6] = params.pipelines_testdata_base_path + '/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' // funfam_latest_link + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.pfam_domains[0][1]).linesGzip[0..7], + workflow.out.versions.collect { path(it).yaml }.unique() + ).match()} + ) + } + } + + test("faa - domain annotation - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = channel.of([ + [ id: 'test' ], + file(params.pipelines_testdata_base_path + '/testdata/sequences/test_proteins.faa', checkIfExists: true) + ]) + input[1] = false // skip_pfam + input[2] = null // pfam_db + input[3] = params.pipelines_testdata_base_path + 
'/testdata/pfam/Pfam-A_test.hmm.gz' // pfam_latest_link + input[4] = false // skip_funfam + input[5] = null // funfam_db + input[6] = params.pipelines_testdata_base_path + '/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' // funfam_latest_link + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match() } + ) + } + } + +} diff --git a/subworkflows/local/domain_annotation/tests/main.nf.test.snap b/subworkflows/local/domain_annotation/tests/main.nf.test.snap new file mode 100644 index 0000000..f1c925c --- /dev/null +++ b/subworkflows/local/domain_annotation/tests/main.nf.test.snap @@ -0,0 +1,134 @@ +{ + "faa - domain annotation": { + "content": [ + [ + "# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord", + "# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target", + "#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------", + "T1026 - 172 Nanovirus_coat PF04660.18 177 1.2e-54 172.1 0.5 1 1 6.7e-55 1.3e-54 172.0 0.5 3 177 4 172 2 172 0.94 FBNSV, , 172 residues|", + "T1026 - 172 PUA_NSUN2 PF25378.2 87 1.8e-05 12.2 0.0 1 1 1.3e-05 2.7e-05 11.6 0.0 40 81 68 111 40 116 0.89 FBNSV, , 172 residues|", + "T1024 - 408 MFS_1 PF07690.22 347 6.7e-35 107.6 58.0 1 2 1.4e-32 2.9e-32 98.9 38.6 4 346 17 365 15 366 0.78 LmrP, , 408 residues|", + "T1024 - 408 MFS_1 PF07690.22 347 6.7e-35 107.6 58.0 2 2 2.6e-09 5.1e-09 22.5 11.4 38 174 262 399 257 407 0.74 LmrP, , 408 residues|", + "#" + ], + [ + "# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord", + "# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of 
target", + "#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------", + "T1024 - 408 1.20.1250.20-FF-000637 - 179 1e-09 25.7 6.6 1 2 5.2e-10 1e-09 25.7 6.6 17 173 32 191 24 197 0.81 LmrP, , 408 residues|", + "T1024 - 408 1.20.1250.20-FF-000637 - 179 1e-09 25.7 6.6 2 2 0.046 0.093 -0.2 5.0 30 137 261 367 248 402 0.64 LmrP, , 408 residues|", + "T1026 - 172 1.10.238.10-FF-000755 - 78 1.2e-05 12.7 0.1 1 2 1.6e-05 3.2e-05 11.3 0.0 16 54 39 79 29 90 0.73 FBNSV, , 172 residues|", + "T1026 - 172 1.10.238.10-FF-000755 - 78 1.2e-05 12.7 0.1 2 2 0.3 0.6 -2.4 0.0 38 49 102 113 97 123 0.74 FBNSV, , 172 residues|", + "#" + ], + [ + { + "DOMAIN_ANNOTATION:HMMSEARCH_FUNFAM": { + "hmmer": 3.4 + } + }, + { + "DOMAIN_ANNOTATION:ARIA2_FUNFAM": { + "aria2": "1.36.0" + } + }, + { + "DOMAIN_ANNOTATION:ARIA2_PFAM": { + "aria2": "1.36.0" + } + }, + { + "DOMAIN_ANNOTATION:HMMSEARCH_PFAM": { + "hmmer": 3.4 + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-05T08:44:26.478981734" + }, + "faa - pfam_db - skip_funfam": { + "content": [ + [ + "# --- full sequence --- -------------- this domain ------------- hmm coord ali coord env coord", + "# target name accession tlen query name accession qlen E-value score bias # of c-Evalue i-Evalue score bias from to from to from to acc description of target", + "#------------------- ---------- ----- -------------------- ---------- ----- --------- ------ ----- --- --- --------- --------- ------ ----- ----- ----- ----- ----- ----- ----- ---- ---------------------", + "T1026 - 172 Nanovirus_coat PF04660.18 177 1.2e-54 172.1 0.5 1 1 6.7e-55 1.3e-54 172.0 0.5 3 177 4 172 2 172 0.94 FBNSV, , 172 residues|", + "T1026 - 172 PUA_NSUN2 PF25378.2 87 1.8e-05 12.2 0.0 1 1 1.3e-05 2.7e-05 11.6 0.0 40 81 68 111 40 116 0.89 FBNSV, , 172 residues|", + "T1024 - 408 MFS_1 
PF07690.22 347 6.7e-35 107.6 58.0 1 2 1.4e-32 2.9e-32 98.9 38.6 4 346 17 365 15 366 0.78 LmrP, , 408 residues|", + "T1024 - 408 MFS_1 PF07690.22 347 6.7e-35 107.6 58.0 2 2 2.6e-09 5.1e-09 22.5 11.4 38 174 262 399 257 407 0.74 LmrP, , 408 residues|", + "#" + ], + [ + { + "DOMAIN_ANNOTATION:HMMSEARCH_PFAM": { + "hmmer": 3.4 + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-05T10:10:22.057426358" + }, + "faa - domain annotation - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,160d4c5a5001cfb4ff57b94fc52b67d9", + "versions.yml:md5,35e41735706132967dd94bb636833a4a", + "versions.yml:md5,a74a0c8fcb741e59bc14424f612b8d09", + "versions.yml:md5,f1d8a406d3dcb97a7c15e9c810926de1" + ], + "funfam_domains": [ + [ + { + "id": "test" + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "pfam_domains": [ + [ + { + "id": "test" + }, + "test.domtbl.gz:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,160d4c5a5001cfb4ff57b94fc52b67d9", + "versions.yml:md5,35e41735706132967dd94bb636833a4a", + "versions.yml:md5,a74a0c8fcb741e59bc14424f612b8d09", + "versions.yml:md5,f1d8a406d3dcb97a7c15e9c810926de1" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.2" + }, + "timestamp": "2025-12-05T08:44:37.015452047" + } +} \ No newline at end of file diff --git a/subworkflows/local/functional_annotation/main.nf b/subworkflows/local/functional_annotation/main.nf new file mode 100644 index 0000000..240c504 --- /dev/null +++ b/subworkflows/local/functional_annotation/main.nf @@ -0,0 +1,35 @@ +include { ARIA2 } from '../../../modules/nf-core/aria2/main' +include { UNTAR } from '../../../modules/nf-core/untar/main' +include { INTERPROSCAN } from 
'../../../modules/nf-core/interproscan/main' + +workflow FUNCTIONAL_ANNOTATION { + take: + ch_fasta // channel: [ val(meta), [ fasta ] ] + skip_interproscan // boolean + interproscan_db_url // string, url to download db + interproscan_db // string, existing db + + main: + ch_interproscan_tsv = channel.empty() + ch_versions = channel.empty() + + if (!skip_interproscan) { + if (interproscan_db) { + ch_interproscan_db = channel.fromPath(interproscan_db).first() + } + else { + ARIA2( [ [ id:'interproscan_db' ], interproscan_db_url ] ) + ch_versions = ch_versions.mix(ARIA2.out.versions.first()) + + UNTAR( ARIA2.out.downloaded_file ) + ch_interproscan_db = UNTAR.out.untar.map{ f -> f[1] } + } + + INTERPROSCAN( ch_fasta, ch_interproscan_db ) + ch_interproscan_tsv = ch_interproscan_tsv.mix(INTERPROSCAN.out.tsv) + } + + emit: + interproscan_tsv = ch_interproscan_tsv + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf index d40f06b..1ba3ccc 100644 --- a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf @@ -11,6 +11,7 @@ include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' include { samplesheetToList } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' @@ -24,77 +25,88 @@ include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipelin */ workflow PIPELINE_INITIALISATION { - take: version // boolean: Display version and exit validate_params // boolean: Boolean whether to validate parameters against the schema at runtime 
- monochrome_logs // boolean: Do not use coloured log outputs + _monochrome_logs // boolean: Do not use coloured log outputs nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved input // string: Path to input samplesheet + help // boolean: Display help message and exit + help_full // boolean: Show the full help message + show_hidden // boolean: Show hidden parameters in the help message main: - ch_versions = Channel.empty() + ch_versions = channel.empty() // // Print version and exit if required and dump pipeline parameters to JSON file // - UTILS_NEXTFLOW_PIPELINE ( + UTILS_NEXTFLOW_PIPELINE( version, true, outdir, - workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1 + workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1, ) // // Validate parameters and generate parameter summary to stdout // + before_text = """ +-\033[2m----------------------------------------------------\033[0m- + \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m +\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m +\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m +\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m + \033[0;32m`._,._,\'\033[0m +\033[0;35m nf-core/proteinannotator ${workflow.manifest.version}\033[0m +-\033[2m----------------------------------------------------\033[0m- +""" + after_text = """${workflow.manifest.doi ? "\n* The pipeline\n" : ""}${workflow.manifest.doi.tokenize(",").collect { doi -> " https://doi.org/${doi.trim().replace('https://doi.org/','')}"}.join("\n")}${workflow.manifest.doi ? 
"\n" : ""} +* The nf-core framework + https://doi.org/10.1038/s41587-020-0439-x + +* Software dependencies + https://github.com/nf-core/proteinannotator/blob/master/CITATIONS.md +""" + command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " + UTILS_NFSCHEMA_PLUGIN ( workflow, validate_params, - null + null, + help, + help_full, + show_hidden, + before_text, + after_text, + command ) // // Check config provided to the pipeline // - UTILS_NFCORE_PIPELINE ( + UTILS_NFCORE_PIPELINE( nextflow_cli_args ) // - // Custom validation for pipeline parameters - // - validateInputParameters() - - // - // Create channel from input file provided through params.input + // Create channel from input file provided through input // - Channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } + channel.fromList(samplesheetToList(input, "${projectDir}/assets/schema_input.json")) + .map { meta, fasta -> + return [meta, fasta] } - .groupTuple() .map { samplesheet -> validateInputSamplesheet(samplesheet) } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] - } .set { ch_samplesheet } emit: samplesheet = ch_samplesheet - versions = ch_versions + versions = ch_versions } /* @@ -104,15 +116,14 @@ workflow PIPELINE_INITIALISATION { */ workflow PIPELINE_COMPLETION { - take: - email // string: email address - email_on_fail // string: email address sent on pipeline failure + email // string: email address + email_on_fail // string: email address sent on pipeline failure plaintext_email // boolean: Send plain-text email instead of HTML - outdir // path: Path to output directory where results will be published + outdir // path: Path to output directory where results will be published monochrome_logs // boolean: 
Disable ANSI colour codes in log output - hook_url // string: hook URL for notifications - multiqc_report // string: Path to MultiQC report + hook_url // string: hook URL for notifications + multiqc_report // string: Path to MultiQC report main: summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") @@ -141,7 +152,7 @@ workflow PIPELINE_COMPLETION { } workflow.onError { - log.error "Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting" + log.error("Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting") } } @@ -150,77 +161,59 @@ workflow PIPELINE_COMPLETION { FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// Check and validate pipeline parameters -// -def validateInputParameters() { - genomeExistsError() -} // // Validate channels from input samplesheet // def validateInputSamplesheet(input) { - def (metas, fastqs) = input[1..2] - - // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end - def endedness_ok = metas.collect{ meta -> meta.single_end }.unique().size == 1 - if (!endedness_ok) { - error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") - } - - return [ metas[0], fastqs ] -} -// -// Get attribute from genome config file e.g. 
fasta -// -def getGenomeAttribute(attribute) { - if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { - if (params.genomes[ params.genome ].containsKey(attribute)) { - return params.genomes[ params.genome ][ attribute ] - } - } - return null + // todo: implement samplesheet validation + return input } -// -// Exit pipeline if incorrect --genome key provided -// -def genomeExistsError() { - if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + - " Currently, the available genome keys are:\n" + - " ${params.genomes.keySet().join(", ")}\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - error(error_string) - } -} // // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report + + def quality_check_text = [ + "Amino acid sequence statistics were generated with SeqFu (Telatin et al. 2021).", + params.skip_preprocessing ? "" : "Input sequences were preprocessed with SeqKit (gap trimming, length filtering, validation, duplicate removal) (Shen et al. 2024)." + ].join(' ').trim() + + def domain_annotation_text = (params.skip_pfam && params.skip_funfam) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." + + def prediction_text = params.skip_s4pred ? "" : "Secondary structures were predicted via the s4pred software (Moffat et al. 2021)." + + def postprocessing_text = "Run statistics were reported using MultiQC (Ewels et al. 2016)." 
+ def citation_text = [ - "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", - "." - ].join(' ').trim() + quality_check_text, + domain_annotation_text, + prediction_text, + postprocessing_text + ].join(' ').trim() return citation_text } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report + def quality_check_text = [ + '
  • Telatin, A., Fariselli, P., & Birolo, G. (2021). SeqFu: a suite of utilities for the robust and reproducible manipulation of sequence files. Bioengineering, 8(5), 59. doi: 10.3390/bioengineering8050059
  • ', + params.skip_preprocessing ? '' : '
  • Shen, W., Sipos, B., & Zhao, L. (2024). SeqKit2: A Swiss army knife for sequence and alignment processing. Imeta, 3(3), e191. doi: 10.1002/imt2.191
  • ' + ].join(' ').trim() + + def domain_annotation_text = (params.skip_pfam && params.skip_funfam) ? '' : '
  • Eddy, S. R. (2011). Accelerated profile HMM searches. PLoS computational biology, 7(10), e1002195. doi: 10.1371/journal.pcbi.1002195
  • ' + + def prediction_text = params.skip_s4pred ? '' : '
  • Moffat, L., & Jones, D. T. (2021). Increasing the accuracy of single sequence prediction methods using a deep semi-supervised learning framework. Bioinformatics, 37(21), 3744-3751. doi: 10.1093/bioinformatics/btab491
  • ' + + def postprocessing_text = '
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047–3048. doi: 10.1093/bioinformatics/btw354
  • ' + def reference_text = [ - "
  • Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
  • ", - "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " - ].join(' ').trim() + quality_check_text, + domain_annotation_text, + prediction_text, + postprocessing_text + ].join(' ').trim() return reference_text } @@ -242,23 +235,21 @@ def methodsDescriptionText(mqc_methods_yaml) { temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " } meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2) - } else meta["doi_text"] = "" + } + else { + meta["doi_text"] = "" + } meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " // Tool references - meta["tool_citations"] = "" - meta["tool_bibliography"] = "" - - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - // meta["tool_bibliography"] = toolBibliographyText() + meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + meta["tool_bibliography"] = toolBibliographyText() def methods_text = mqc_methods_yaml.text - def engine = new groovy.text.SimpleTemplateEngine() + def engine = new groovy.text.SimpleTemplateEngine() def description_html = engine.createTemplate(methods_text).make(meta) return description_html.toString() } - diff --git a/subworkflows/nf-core/faa_seqfu_seqkit/main.nf b/subworkflows/nf-core/faa_seqfu_seqkit/main.nf new file mode 100644 index 0000000..068e658 --- /dev/null +++ b/subworkflows/nf-core/faa_seqfu_seqkit/main.nf @@ -0,0 +1,40 @@ +include { SEQFU_STATS as SEQFU_STATS_BEFORE } from '../../../modules/nf-core/seqfu/stats/main' +include { SEQKIT_SEQ } from '../../../modules/nf-core/seqkit/seq/main' +include { SEQKIT_RMDUP } from '../../../modules/nf-core/seqkit/rmdup/main' +include { SEQKIT_REPLACE } from '../../../modules/nf-core/seqkit/replace/main' +include { SEQFU_STATS as SEQFU_STATS_AFTER } from '../../../modules/nf-core/seqfu/stats/main' + +workflow FAA_SEQFU_SEQKIT { + + take: + ch_fasta // tuple val(meta), path(fasta) + skip_preprocessing // boolean + + main: + ch_multiqc_files = channel.empty() + ch_versions = channel.empty() + + SEQFU_STATS_BEFORE( ch_fasta ) + ch_multiqc_files = ch_multiqc_files.mix( SEQFU_STATS_BEFORE.out.multiqc ) + ch_versions = ch_versions.mix( SEQFU_STATS_BEFORE.out.versions ) + + if (!skip_preprocessing) { + SEQKIT_SEQ( ch_fasta ) + ch_versions = ch_versions.mix( SEQKIT_SEQ.out.versions ) + + SEQKIT_REPLACE( SEQKIT_SEQ.out.fastx ) + 
+ SEQKIT_RMDUP( SEQKIT_REPLACE.out.fastx ) + ch_fasta = SEQKIT_RMDUP.out.fastx + ch_versions = ch_versions.mix( SEQKIT_RMDUP.out.versions ) + + SEQFU_STATS_AFTER( SEQKIT_RMDUP.out.fastx ) + ch_multiqc_files = ch_multiqc_files.mix( SEQFU_STATS_AFTER.out.multiqc ) + ch_versions = ch_versions.mix( SEQFU_STATS_AFTER.out.versions ) + } + + emit: + fasta = ch_fasta + multiqc_files = ch_multiqc_files + versions = ch_versions +} diff --git a/subworkflows/nf-core/faa_seqfu_seqkit/meta.yml b/subworkflows/nf-core/faa_seqfu_seqkit/meta.yml new file mode 100644 index 0000000..5f43443 --- /dev/null +++ b/subworkflows/nf-core/faa_seqfu_seqkit/meta.yml @@ -0,0 +1,47 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "faa_seqfu_seqkit" +description: | + Subworkflow that optionally preprocesses amino acid FASTA sequences + (seqkit seq/replace/rmdup), computes sequence statistics before and + after preprocessing using seqfu stats, and exports MultiQC-compatible + statistics and software versions. +keywords: + - fasta + - protein + - preprocessing + - statistics + - quality check +components: + - seqfu/stats + - seqkit/seq + - seqkit/rmdup + - seqkit/replace +input: + - ch_fasta: + type: file + description: | + Amino acid sequences fasta file. + Structure: [ val(meta), [ path(fasta) ] ] + - skip_preprocessing: + type: boolean + description: | + If true, skip seqkit-based preprocessing steps and only compute + initial sequence statistics. +output: + - fasta: + type: file + description: | + Contains the final amino acid FASTA file + (either preprocessed or original if preprocessing is skipped). + - multiqc_files: + type: file + description: | + Statistics file for MultiQC. + - versions: + type: file + description: | + Versions file containing the software versions used in the workflow. 
+authors: + - "@vagkaratzas" +maintainers: + - "@vagkaratzas" diff --git a/subworkflows/nf-core/faa_seqfu_seqkit/tests/main.nf.test b/subworkflows/nf-core/faa_seqfu_seqkit/tests/main.nf.test new file mode 100644 index 0000000..1fb2d9a --- /dev/null +++ b/subworkflows/nf-core/faa_seqfu_seqkit/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_workflow { + + name "Test Subworkflow FAA_SEQFU_SEQKIT" + script "../main.nf" + workflow "FAA_SEQFU_SEQKIT" + config './nextflow.config' + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/faa_seqfu_seqkit" + tag "seqfu/stats" + tag "seqkit/seq" + tag "seqkit/rmdup" + tag "seqkit/replace" + + test("faa") { + + when { + workflow { + """ + input[0] = channel.of([ + [ id: 'test_faa' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome_test.faa', checkIfExists: true) + ]) + input[1] = false + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match() } + ) + } + } + + test("faa - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = channel.of([ + [ id: 'test_stub' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome_test.faa', checkIfExists: true) + ]) + input[1] = false + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out, + workflow.out.versions.collect{ path(it).yaml }.unique() + ).match() } + ) + } + } + +} diff --git a/subworkflows/nf-core/faa_seqfu_seqkit/tests/main.nf.test.snap b/subworkflows/nf-core/faa_seqfu_seqkit/tests/main.nf.test.snap new file mode 100644 index 0000000..b3559a3 --- /dev/null +++ b/subworkflows/nf-core/faa_seqfu_seqkit/tests/main.nf.test.snap @@ -0,0 +1,158 @@ +{ + "faa - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test_stub" + }, + "test_stub.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test_stub" + }, + "test_stub_after_mqc.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + 
[ + { + "id": "test_stub" + }, + "test_stub_before_mqc.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,4a72093d798cac55be5088015708d471", + "versions.yml:md5,8412ce0c46c747dbf5a92d26ecc091ad", + "versions.yml:md5,ae6438038b16e6e6e730325116ed2944", + "versions.yml:md5,bca759d787b0ca55b454a6b0aa55f9ee" + ], + "fasta": [ + [ + { + "id": "test_stub" + }, + "test_stub.fasta:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "multiqc_files": [ + [ + { + "id": "test_stub" + }, + "test_stub_after_mqc.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + { + "id": "test_stub" + }, + "test_stub_before_mqc.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,4a72093d798cac55be5088015708d471", + "versions.yml:md5,8412ce0c46c747dbf5a92d26ecc091ad", + "versions.yml:md5,ae6438038b16e6e6e730325116ed2944", + "versions.yml:md5,bca759d787b0ca55b454a6b0aa55f9ee" + ] + }, + [ + { + "FAA_SEQFU_SEQKIT:SEQFU_STATS_AFTER": { + "seqfu": "1.22.3" + } + }, + { + "FAA_SEQFU_SEQKIT:SEQKIT_SEQ": { + "seqkit": "v2.9.0" + } + }, + { + "FAA_SEQFU_SEQKIT:SEQFU_STATS_BEFORE": { + "seqfu": "1.22.3" + } + }, + { + "FAA_SEQFU_SEQKIT:SEQKIT_RMDUP": { + "seqkit": "v2.9.0" + } + } + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T14:51:50.251419053" + }, + "faa": { + "content": [ + { + "0": [ + [ + { + "id": "test_faa" + }, + "test_faa.fasta:md5,242b2a540b013689b51f8814fab92817" + ] + ], + "1": [ + [ + { + "id": "test_faa" + }, + "test_faa_after_mqc.txt:md5,e18db7ac9791039ca747e1bb45f73054" + ], + [ + { + "id": "test_faa" + }, + "test_faa_before_mqc.txt:md5,45fdb76a36d40a331779d7591f523ed9" + ] + ], + "2": [ + "versions.yml:md5,4a72093d798cac55be5088015708d471", + "versions.yml:md5,8412ce0c46c747dbf5a92d26ecc091ad", + "versions.yml:md5,ae6438038b16e6e6e730325116ed2944", + "versions.yml:md5,bca759d787b0ca55b454a6b0aa55f9ee" + ], + "fasta": [ + [ + { + "id": "test_faa" + }, + 
"test_faa.fasta:md5,242b2a540b013689b51f8814fab92817" + ] + ], + "multiqc_files": [ + [ + { + "id": "test_faa" + }, + "test_faa_after_mqc.txt:md5,e18db7ac9791039ca747e1bb45f73054" + ], + [ + { + "id": "test_faa" + }, + "test_faa_before_mqc.txt:md5,45fdb76a36d40a331779d7591f523ed9" + ] + ], + "versions": [ + "versions.yml:md5,4a72093d798cac55be5088015708d471", + "versions.yml:md5,8412ce0c46c747dbf5a92d26ecc091ad", + "versions.yml:md5,ae6438038b16e6e6e730325116ed2944", + "versions.yml:md5,bca759d787b0ca55b454a6b0aa55f9ee" + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-03T14:51:43.056425196" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/faa_seqfu_seqkit/tests/nextflow.config b/subworkflows/nf-core/faa_seqfu_seqkit/tests/nextflow.config new file mode 100644 index 0000000..38af0eb --- /dev/null +++ b/subworkflows/nf-core/faa_seqfu_seqkit/tests/nextflow.config @@ -0,0 +1,28 @@ +process { + + withName: SEQFU_STATS_BEFORE { + ext.prefix = { "${meta.id}_before" } + } + + withName: SEQFU_STATS_AFTER { + ext.prefix = { "${meta.id}_after" } + } + + withName: SEQKIT_SEQ { + ext.args = [ + "--remove-gaps", + "--upper-case", + "--validate-seq", + "--min-len 30", + "--max-len 5000" + ].join(' ').trim() + ext.prefix = "intermediate_seqkit_seq" + } + + withName: SEQKIT_REPLACE { + ext.args = '-p "/" -r "_"' + ext.suffix = "fasta" + ext.prefix = "intermediate_seqkit_replace" + } + +} diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f847611..0000000 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf index bfd2587..2f30e9a 100644 --- 
a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf +++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf @@ -98,7 +98,7 @@ def workflowVersionToYAML() { // Get channel of software versions used in pipeline in YAML format // def softwareVersionsToYAML(ch_versions) { - return ch_versions.unique().map { version -> processVersionsFromYAML(version) }.unique().mix(Channel.of(workflowVersionToYAML())) + return ch_versions.unique().map { version -> processVersionsFromYAML(version) }.unique().mix(channel.of(workflowVersionToYAML())) } // diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c..0000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 4994303..1df8b76 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -4,6 +4,7 @@ include { paramsSummaryLog } from 'plugin/nf-schema' include { validateParameters } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' workflow UTILS_NFSCHEMA_PLUGIN { @@ -15,32 +16,58 @@ workflow UTILS_NFSCHEMA_PLUGIN { // when this input is empty it will automatically use the configured schema or // "${projectDir}/nextflow_schema.json" as default. 
This input should not be empty // for meta pipelines + help // boolean: show help message + help_full // boolean: show full help message + show_hidden // boolean: show hidden parameters in help message + before_text // string: text to show before the help message and parameters summary + after_text // string: text to show after the help message and parameters summary + command // string: an example command of the pipeline main: + if(help || help_full) { + help_options = [ + beforeText: before_text, + afterText: after_text, + command: command, + showHidden: show_hidden, + fullHelp: help_full, + ] + if(parameters_schema) { + help_options << [parametersSchema: parameters_schema] + } + log.info paramsHelp( + help_options, + (params.help instanceof String && params.help != "true") ? params.help : "", + ) + exit 0 + } + // // Print parameter summary to stdout. This will display the parameters // that differ from the default given in the JSON schema // + + summary_options = [:] if(parameters_schema) { - log.info paramsSummaryLog(input_workflow, parameters_schema:parameters_schema) - } else { - log.info paramsSummaryLog(input_workflow) + summary_options << [parametersSchema: parameters_schema] } + log.info before_text + log.info paramsSummaryLog(summary_options, input_workflow) + log.info after_text // // Validate the parameters using nextflow_schema.json or the schema // given via the validation.parametersSchema configuration option // if(validate_params) { + validateOptions = [:] if(parameters_schema) { - validateParameters(parameters_schema:parameters_schema) - } else { - validateParameters() + validateOptions << [parametersSchema: parameters_schema] } + validateParameters(validateOptions) } emit: dummy_emit = true } - diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index 8fb3016..c977917 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ 
b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -25,6 +25,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -51,6 +57,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -77,6 +89,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -103,6 +121,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -114,4 +138,36 @@ nextflow_workflow { ) } } + + test("Should create a help message") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = true + input[4] = false + input[5] = false + input[6] = "Before" + input[7] = "After" + input[8] = "nextflow run test/test" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index 0907ac5..f6537cc 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,8 +1,8 @@ plugins { - id 
"nf-schema@2.1.0" + id "nf-schema@2.6.1" } validation { parametersSchema = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" monochromeLogs = true -} \ No newline at end of file +} diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 0000000..153205a --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,22 @@ +.DS_Store +multiqc/multiqc_data/multiqc.parquet +multiqc/multiqc_data/multiqc.log +multiqc/multiqc_data/multiqc_data.json +multiqc/multiqc_data/multiqc_sources.txt +multiqc/multiqc_data/multiqc_software_versions.txt +multiqc/multiqc_data/llms-full.txt +multiqc/multiqc_plots/{svg,pdf,png}/*.{svg,pdf,png} +multiqc/multiqc_report.html +pipeline_info/*.{html,json,txt,yml} +domain_annotation/pfam/T1024.domtbl.gz +domain_annotation/pfam/T1026.domtbl.gz +domain_annotation/pfam/l_arginase.domtbl.gz +domain_annotation/funfam/T1024.domtbl.gz +domain_annotation/funfam/T1026.domtbl.gz +domain_annotation/funfam/l_arginase.domtbl.gz +functional_annotation/interproscan/T1024/T1024.gff3 +functional_annotation/interproscan/T1024/T1024.tsv +functional_annotation/interproscan/T1026/T1026.gff3 +functional_annotation/interproscan/T1026/T1026.tsv +functional_annotation/interproscan/l_arginase/l_arginase.gff3 +functional_annotation/interproscan/l_arginase/l_arginase.tsv diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 0000000..b0d55a4 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,38 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + tag "pipeline_proteinannotator" + tag "cpu" + + + test("-profile test") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + 
def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_proteinannotator_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..ac75f97 --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,227 @@ +{ + "-profile test": { + "content": [ + 32, + { + "ARIA2": { + "aria2": "1.36.0" + }, + "ARIA2_FUNFAM": { + "aria2": "1.36.0" + }, + "ARIA2_PFAM": { + "aria2": "1.36.0" + }, + "HMMSEARCH_FUNFAM": { + "hmmer": 3.4 + }, + "HMMSEARCH_PFAM": { + "hmmer": 3.4 + }, + "INTERPROSCAN": { + "interproscan": "5.59-91.0" + }, + "S4PRED_RUNMODEL": { + "s4pred": "1.2.1" + }, + "SEQFU_STATS_AFTER": { + "seqfu": "1.22.3" + }, + "SEQFU_STATS_BEFORE": { + "seqfu": "1.22.3" + }, + "SEQKIT_REPLACE": { + "seqkit": "2.9.0" + }, + "SEQKIT_RMDUP": { + "seqkit": "v2.9.0" + }, + "SEQKIT_SEQ": { + "seqkit": "v2.9.0" + }, + "UNTAR": { + "untar": 1.34 + }, + "Workflow": { + "nf-core/proteinannotator": "v1.0.0" + } + }, + [ + "domain_annotation", + "domain_annotation/funfam", + "domain_annotation/funfam/T1024.domtbl.gz", + "domain_annotation/funfam/T1026.domtbl.gz", + "domain_annotation/funfam/l_arginase.domtbl.gz", + "domain_annotation/pfam", + "domain_annotation/pfam/T1024.domtbl.gz", + "domain_annotation/pfam/T1026.domtbl.gz", + "domain_annotation/pfam/l_arginase.domtbl.gz", + "downloaded_dbs", + "downloaded_dbs/Pfam-A_test.hmm.gz", + "downloaded_dbs/funfam-hmm3-v4_3_0_test.lib.gz", + "downloaded_dbs/interproscan_db", + 
"downloaded_dbs/interproscan_db/hamap", + "downloaded_dbs/interproscan_db/hamap/2023_05", + "downloaded_dbs/interproscan_db/hamap/2023_05/hamap.hmm.lib", + "downloaded_dbs/interproscan_db/hamap/2023_05/hamap.prf", + "downloaded_dbs/interproscan_db/hamap/2023_05/profiles", + "downloaded_dbs/interproscan_db/hamap/2023_05/profiles/MF_00457.prf", + "downloaded_dbs/interproscan_db/hamap/2023_05/profiles/MF_01458.prf", + "downloaded_dbs/interproscan_db/sfld", + "downloaded_dbs/interproscan_db/sfld/4", + "downloaded_dbs/interproscan_db/sfld/4/sfld.hmm", + "downloaded_dbs/interproscan_db/sfld/4/sfld.msa", + "downloaded_dbs/interproscan_db/sfld/4/sfld_hierarchy_flat.txt", + "downloaded_dbs/interproscan_db/sfld/4/sfld_sites.annot", + "downloaded_dbs/interproscan_db/tigrfam", + "downloaded_dbs/interproscan_db/tigrfam/15.0", + "downloaded_dbs/interproscan_db/tigrfam/15.0/TIGRFAMs_15.0_HMM.LIB", + "downloaded_dbs/interproscan_db/tigrfam/15.0/TIGRFAMs_HMM.LIB", + "downloaded_dbs/interproscan_test.tar.gz", + "functional_annotation", + "functional_annotation/interproscan", + "functional_annotation/interproscan/T1024", + "functional_annotation/interproscan/T1024/T1024.gff3", + "functional_annotation/interproscan/T1024/T1024.json", + "functional_annotation/interproscan/T1024/T1024.tsv", + "functional_annotation/interproscan/T1024/T1024.xml", + "functional_annotation/interproscan/T1026", + "functional_annotation/interproscan/T1026/T1026.gff3", + "functional_annotation/interproscan/T1026/T1026.json", + "functional_annotation/interproscan/T1026/T1026.tsv", + "functional_annotation/interproscan/T1026/T1026.xml", + "functional_annotation/interproscan/l_arginase", + "functional_annotation/interproscan/l_arginase/l_arginase.gff3", + "functional_annotation/interproscan/l_arginase/l_arginase.json", + "functional_annotation/interproscan/l_arginase/l_arginase.tsv", + "functional_annotation/interproscan/l_arginase/l_arginase.xml", + "multiqc", + "multiqc/multiqc_data", + 
"multiqc/multiqc_data/llms-full.txt", + "multiqc/multiqc_data/multiqc.log", + "multiqc/multiqc_data/multiqc.parquet", + "multiqc/multiqc_data/multiqc_T1024_after.txt", + "multiqc/multiqc_data/multiqc_T1024_before.txt", + "multiqc/multiqc_data/multiqc_T1026_after.txt", + "multiqc/multiqc_data/multiqc_T1026_before.txt", + "multiqc/multiqc_data/multiqc_citations.txt", + "multiqc/multiqc_data/multiqc_data.json", + "multiqc/multiqc_data/multiqc_l_arginase_after.txt", + "multiqc/multiqc_data/multiqc_l_arginase_before.txt", + "multiqc/multiqc_data/multiqc_software_versions.txt", + "multiqc/multiqc_data/multiqc_sources.txt", + "multiqc/multiqc_plots", + "multiqc/multiqc_plots/pdf", + "multiqc/multiqc_plots/pdf/T1024_after.pdf", + "multiqc/multiqc_plots/pdf/T1024_before.pdf", + "multiqc/multiqc_plots/pdf/T1026_after.pdf", + "multiqc/multiqc_plots/pdf/T1026_before.pdf", + "multiqc/multiqc_plots/pdf/l_arginase_after.pdf", + "multiqc/multiqc_plots/pdf/l_arginase_before.pdf", + "multiqc/multiqc_plots/png", + "multiqc/multiqc_plots/png/T1024_after.png", + "multiqc/multiqc_plots/png/T1024_before.png", + "multiqc/multiqc_plots/png/T1026_after.png", + "multiqc/multiqc_plots/png/T1026_before.png", + "multiqc/multiqc_plots/png/l_arginase_after.png", + "multiqc/multiqc_plots/png/l_arginase_before.png", + "multiqc/multiqc_plots/svg", + "multiqc/multiqc_plots/svg/T1024_after.svg", + "multiqc/multiqc_plots/svg/T1024_before.svg", + "multiqc/multiqc_plots/svg/T1026_after.svg", + "multiqc/multiqc_plots/svg/T1026_before.svg", + "multiqc/multiqc_plots/svg/l_arginase_after.svg", + "multiqc/multiqc_plots/svg/l_arginase_before.svg", + "multiqc/multiqc_report.html", + "pipeline_info", + "pipeline_info/nf_core_proteinannotator_software_mqc_versions.yml", + "qc", + "qc/T1024", + "qc/T1024/T1024.fasta", + "qc/T1024/T1024.log", + "qc/T1024/T1024_after.tsv", + "qc/T1024/T1024_after_mqc.txt", + "qc/T1024/T1024_before.tsv", + "qc/T1024/T1024_before_mqc.txt", + "qc/T1026", + "qc/T1026/T1026.fasta", + 
"qc/T1026/T1026.log", + "qc/T1026/T1026_after.tsv", + "qc/T1026/T1026_after_mqc.txt", + "qc/T1026/T1026_before.tsv", + "qc/T1026/T1026_before_mqc.txt", + "qc/l_arginase", + "qc/l_arginase/l_arginase.fasta", + "qc/l_arginase/l_arginase.log", + "qc/l_arginase/l_arginase_after.tsv", + "qc/l_arginase/l_arginase_after_mqc.txt", + "qc/l_arginase/l_arginase_before.tsv", + "qc/l_arginase/l_arginase_before_mqc.txt", + "s4pred", + "s4pred/T1024", + "s4pred/T1024/ss2", + "s4pred/T1024/ss2/T1024.ss2", + "s4pred/T1026", + "s4pred/T1026/ss2", + "s4pred/T1026/ss2/T1026.ss2", + "s4pred/l_arginase", + "s4pred/l_arginase/ss2", + "s4pred/l_arginase/ss2/GI|225038609|EFDID|719595|FULL.ss2" + ], + [ + "Pfam-A_test.hmm.gz:md5,a5ab72b2b7bc72c62756684707e2387c", + "funfam-hmm3-v4_3_0_test.lib.gz:md5,df8b324882e1ceb8f8196155a968ed77", + "hamap.hmm.lib:md5,8c589a7f459284080e674e79454434d3", + "hamap.prf:md5,1c9f7eef7704bf307cfaf9f0f2e22153", + "MF_00457.prf:md5,91ab950fb5c449fef9f0cd235b72e9d9", + "MF_01458.prf:md5,f43826d1c001200d5d60ca4d97235f96", + "sfld.hmm:md5,ad6c16bfdd1ec4dc1f13aadaf1b2f1bd", + "sfld.msa:md5,1ee957899d996426a442ff1ae7737c68", + "sfld_hierarchy_flat.txt:md5,8c606dcaa55f174547a42606a0f1cd1a", + "sfld_sites.annot:md5,aa216404da8721f56cae847600170784", + "TIGRFAMs_15.0_HMM.LIB:md5,64f2b2c9e834b47b17d91bb9a6a0067e", + "TIGRFAMs_HMM.LIB:md5,543da3f4b65eed9ec393986c6c6ff0ba", + "interproscan_test.tar.gz:md5,cde88c0cd841c84dc1203e64854c762b", + "T1024.json:md5,0288f7551a14faedc409dd374b3e073e", + "T1024.xml:md5,63a3db0eb0e1f76403411602c23b721e", + "T1026.json:md5,5c2a40474b1cfb50cd043fe0be5e5d52", + "T1026.xml:md5,335552ce1703548565212a1d54681d75", + "l_arginase.json:md5,e0d127dd8a952cbd798999851d1338e6", + "l_arginase.xml:md5,7248992d9c1618cf7baa7515ae79ce32", + "multiqc_T1024_after.txt:md5,f2a552d4750ff8360941b10cec141499", + "multiqc_T1024_before.txt:md5,f2a552d4750ff8360941b10cec141499", + "multiqc_T1026_after.txt:md5,aabd4e58ed67d366fd04592ca09dbc9b", + 
"multiqc_T1026_before.txt:md5,aabd4e58ed67d366fd04592ca09dbc9b", + "multiqc_citations.txt:md5,4c806e63a283ec1b7e78cdae3a923d4f", + "multiqc_l_arginase_after.txt:md5,5df939cbafd732aa5095dad4434a4a33", + "multiqc_l_arginase_before.txt:md5,dd8e50549823ef944d6e36d0e2af56f7", + "T1024.fasta:md5,aa546680bf58fcb8aeb10e1617e6d5bd", + "T1024.log:md5,a41135cfe024baaf42f135583fe73f0d", + "T1024_after.tsv:md5,2daf830aba484edbcf25e811f5769ad0", + "T1024_after_mqc.txt:md5,23c573ad96f7199ba251c7bacd1c5968", + "T1024_before.tsv:md5,2daf830aba484edbcf25e811f5769ad0", + "T1024_before_mqc.txt:md5,23c573ad96f7199ba251c7bacd1c5968", + "T1026.fasta:md5,ae21f6aa06d0a5cedc121db5dfc343f3", + "T1026.log:md5,a41135cfe024baaf42f135583fe73f0d", + "T1026_after.tsv:md5,052ba564eabda203298ffc26ef80b7ab", + "T1026_after_mqc.txt:md5,3534726223f5a24dedda4446fd202404", + "T1026_before.tsv:md5,052ba564eabda203298ffc26ef80b7ab", + "T1026_before_mqc.txt:md5,3534726223f5a24dedda4446fd202404", + "l_arginase.fasta:md5,38d388cfc6ee9013eba4f693d104d6f0", + "l_arginase.log:md5,a41135cfe024baaf42f135583fe73f0d", + "l_arginase_after.tsv:md5,6c4fe965aa0905f437f3731ed8eed98e", + "l_arginase_after_mqc.txt:md5,af14b7e79260ece0074d4ea82a3b3ce6", + "l_arginase_before.tsv:md5,31a34ec0257053f34a449270f23d98ff", + "l_arginase_before_mqc.txt:md5,7ab718f12ea5460f254b8c43cfd11040", + "T1024.ss2:md5,6f2467c4e6974e761333bef106809e88", + "T1026.ss2:md5,cc788acb2aefe43fea147d9dd2b6c2c9", + "GI|225038609|EFDID|719595|FULL.ss2:md5,e7d8eaa84d46a6a714ffe00d7f21cdfb" + ] + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.3" + }, + "timestamp": "2026-02-04T12:43:32.273407057" + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..db52597 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,13 @@ +/* +======================================================================================== + Nextflow config file for running nf-test tests 
+======================================================================================== +*/ + +// Base paths for test data (any resource requirements can also be set here) +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/proteinannotator' +} + +aws.client.anonymous = true diff --git a/workflows/proteinannotator.nf b/workflows/proteinannotator.nf index bc0b4db..fae1d7a 100644 --- a/workflows/proteinannotator.nf +++ b/workflows/proteinannotator.nf @@ -3,7 +3,10 @@ IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FASTQC } from '../modules/nf-core/fastqc/main' +include { FAA_SEQFU_SEQKIT } from '../subworkflows/nf-core/faa_seqfu_seqkit/main' +include { DOMAIN_ANNOTATION } from '../subworkflows/local/domain_annotation' +include { FUNCTIONAL_ANNOTATION } from '../subworkflows/local/functional_annotation' +include { S4PRED_RUNMODEL } from '../modules/nf-core/s4pred/runmodel/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { paramsSummaryMap } from 'plugin/nf-schema' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' @@ -17,81 +20,125 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_prot */ workflow PROTEINANNOTATOR { - take: - ch_samplesheet // channel: samplesheet read in from --input + ch_samplesheet // channel: samplesheet read in from --input + skip_preprocessing // boolean + skip_pfam // boolean + pfam_db // string, path to the pfam HMM database, if already exists + pfam_latest_link // string, path to the latest pfam HMM database, to download + skip_funfam // boolean + funfam_db // string, path to the funfam HMM database, if already exists + funfam_latest_link // string, path to the latest funfam HMM database, to download + skip_interproscan // boolean +
interproscan_db_url // string, url to download db + interproscan_db // string, existing db + skip_s4pred // boolean + main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - // - // MODULE: Run FastQC - // - FASTQC ( - ch_samplesheet + ch_versions = channel.empty() + ch_multiqc_files = channel.empty() + + FAA_SEQFU_SEQKIT( ch_samplesheet, skip_preprocessing ) + ch_versions = ch_versions.mix( FAA_SEQFU_SEQKIT.out.versions ) + + DOMAIN_ANNOTATION ( + FAA_SEQFU_SEQKIT.out.fasta, + skip_pfam, + pfam_db, + pfam_latest_link, + skip_funfam, + funfam_db, + funfam_latest_link ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_versions = ch_versions.mix( DOMAIN_ANNOTATION.out.versions ) + + FUNCTIONAL_ANNOTATION ( + FAA_SEQFU_SEQKIT.out.fasta, + skip_interproscan, + interproscan_db_url, + interproscan_db + ) + ch_versions = ch_versions.mix( FUNCTIONAL_ANNOTATION.out.versions ) + + if (!skip_s4pred) { + S4PRED_RUNMODEL( FAA_SEQFU_SEQKIT.out.fasta ) + ch_versions = ch_versions.mix( S4PRED_RUNMODEL.out.versions.first() ) + } // // Collate and save software versions // - softwareVersionsToYAML(ch_versions) + def topic_versions = channel.topic("versions") + .distinct() + .branch { entry -> + versions_file: entry instanceof Path + versions_tuple: true + } + + def topic_versions_string = topic_versions.versions_tuple + .map { process, tool, version -> + [ process[process.lastIndexOf(':')+1..-1], " ${tool}: ${version}" ] + } + .groupTuple(by:0) + .map { process, tool_versions -> + tool_versions.unique().sort() + "${process}:\n${tool_versions.join('\n')}" + } + + softwareVersionsToYAML(ch_versions.mix(topic_versions.versions_file)) + .mix(topic_versions_string) .collectFile( storeDir: "${params.outdir}/pipeline_info", - name: 'nf_core_' + 'proteinannotator_software_' + 'mqc_' + 'versions.yml', + name: 'nf_core_' + 'proteinannotator_software_' + 'mqc_' + 'versions.yml', sort: 
true, - newLine: true - ).set { ch_collated_versions } - + newLine: true, + ) + .set { ch_collated_versions } // // MODULE: MultiQC // - ch_multiqc_config = Channel.fromPath( + ch_multiqc_config = channel.fromPath( "$projectDir/assets/multiqc_config.yml", checkIfExists: true) ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() + channel.fromPath(params.multiqc_config, checkIfExists: true) : + channel.empty() ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() + channel.fromPath(params.multiqc_logo, checkIfExists: true) : + channel.empty() summary_params = paramsSummaryMap( workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) + ch_workflow_summary = channel.value(paramsSummaryMultiqc(summary_params)) ch_multiqc_files = ch_multiqc_files.mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( + ch_methods_description = channel.value( methodsDescriptionText(ch_multiqc_custom_methods_description)) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) ch_multiqc_files = ch_multiqc_files.mix( ch_methods_description.collectFile( name: 'methods_description_mqc.yaml', - sort: true + sort: true, ) ) - MULTIQC ( + ch_multiqc_files = ch_multiqc_files.mix(FAA_SEQFU_SEQKIT.out.multiqc_files.collect{ f -> f[1] }.ifEmpty([])) + + MULTIQC( ch_multiqc_files.collect(), ch_multiqc_config.toList(), ch_multiqc_custom_config.toList(), ch_multiqc_logo.toList(), [], - [] + [], ) - emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] - + emit: + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] } - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/