{CI} Fix merge-base diff in linter, style, and scan jobs

naga-nandyala · naga-nandyala · commit e7117cec6401 · 2026-03-26T15:18:55.000+11:00
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -190,19 +190,10 @@ jobs:
         #!/usr/bin/env bash
         set -ev
         source ./env/bin/activate
-        git fetch origin --depth=1 $(System.PullRequest.TargetBranch)
-        declare -A secret_files
-        for FILE in `git diff --name-only --diff-filter=AM origin/$(System.PullRequest.TargetBranch)` ; do
-          detected=$(azdev scan -f $FILE --continue-on-failure| python -c "import sys, json; print(json.load(sys.stdin)['secrets_detected'])")
-          if [ $detected == 'True' ]; then
-            printf "\033[0;31mSecrets detected from %s, Please remove or replace it. You can run 'azdev scan'/'azdev mask' locally to fix.\033[0m\n" "$FILE"
-            secret_files+=$FILE
-          fi
-        done
-        if [ "${#secret_files[@]}" -gt 0 ]; then
-          exit 1
-        fi
+        python scripts/ci/azdev_scan.py
       displayName: "azdev scan ( High Confidence ) on Modified Extensions"
+      env:
+        ADO_PULL_REQUEST_TARGET_BRANCH: $(System.PullRequest.TargetBranch)
 
 - job: AzdevScanProModifiedExtensionsMedium
   displayName: "azdev scan ( Medium Confidence ) on Modified Extensions"
@@ -221,19 +212,10 @@ jobs:
         #!/usr/bin/env bash
         set -ev
         source ./env/bin/activate
-        git fetch origin --depth=1 $(System.PullRequest.TargetBranch)
-        declare -A secret_files
-        for FILE in `git diff --name-only --diff-filter=AM origin/$(System.PullRequest.TargetBranch)` ; do
-          detected=$(azdev scan --confidence-level MEDIUM -f $FILE --continue-on-failure| python -c "import sys, json; print(json.load(sys.stdin)['secrets_detected'])")
-          if [ $detected == 'True' ]; then
-            printf "\033[0;31mSecrets detected from %s, Please remove or replace it. You can run 'azdev scan --confidence-level MEDIUM'/'azdev mask --confidence-level MEDIUM' locally to fix.\033[0m\n" "$FILE"
-            secret_files+=$FILE
-          fi
-        done
-        if [ "${#secret_files[@]}" -gt 0 ]; then
-          exit 1
-        fi
+        python scripts/ci/azdev_scan.py --confidence-level MEDIUM
       displayName: "azdev scan ( Medium Confidence ) on Modified Extensions"
+      env:
+        ADO_PULL_REQUEST_TARGET_BRANCH: $(System.PullRequest.TargetBranch)
 
 #- job: IndexRefDocVerify
 #  displayName: "Verify Ref Docs"
diff --git a/scripts/ci/azdev_linter_style.py b/scripts/ci/azdev_linter_style.py
@@ -7,7 +7,7 @@
 This script is used to run azdev linter and azdev style on extensions.
 
 It's only working on ADO by default. If want to run locally,
-please update the target branch/commit to find diff in function find_modified_files_against_master_branch()
+please update the target branch in find_modified_files_against_master_branch() in util.py.
 """
 import json
 import logging
@@ -18,7 +18,7 @@
 
 import service_name
 from packaging.version import Version
-from util import get_ext_metadata
+from util import get_ext_metadata, find_modified_files_against_master_branch
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
@@ -119,30 +119,6 @@ def check_extension_name(self):
                              f"Please fix the name in setup.py!")
 
 
-def find_modified_files_against_master_branch():
-    """
-    Find modified files from src/ only.
-    A: Added, C: Copied, M: Modified, R: Renamed, T: File type changed.
-    Deleted files don't count in diff.
-    """
-    ado_pr_target_branch = 'origin/' + os.environ.get('ADO_PULL_REQUEST_TARGET_BRANCH')
-
-    separator_line()
-    logger.info('pull request target branch: %s', ado_pr_target_branch)
-
-    cmd = 'git --no-pager diff --name-only --diff-filter=ACMRT {} -- src/'.format(ado_pr_target_branch)
-    files = check_output(cmd.split()).decode('utf-8').split('\n')
-    files = [f for f in files if len(f) > 0]
-
-    if files:
-        logger.info('modified files:')
-        separator_line()
-        for f in files:
-            logger.info(f)
-
-    return files
-
-
 def contain_index_json(files):
     return 'src/index.json' in files
 
diff --git a/scripts/ci/azdev_scan.py b/scripts/ci/azdev_scan.py
@@ -0,0 +1,91 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+"""
+This script is used to run azdev scan on modified extensions in PR pipelines.
+
+It reuses find_modified_files_against_master_branch() from util.py to get an
+accurate list of files changed in the PR (via merge-base), then runs
+azdev scan on each file.
+"""
+import json
+import logging
+import sys
+from subprocess import CalledProcessError, check_output
+
+from util import find_modified_files_against_master_branch
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)  # let every record through; the handler below filters
+ch = logging.StreamHandler()  # no stream given, so records go to sys.stderr
+ch.setLevel(logging.DEBUG)
+logger.addHandler(ch)
+
+
+def run_scan(modified_files, confidence_level=None):
+    """Run ``azdev scan`` on each modified file and exit non-zero on any finding.
+
+    :param modified_files: file paths; each is passed to ``azdev scan -f`` in turn.
+    :param confidence_level: optional ``--confidence-level`` value; omitted when falsy.
+    :raises SystemExit: with code 1 if any file has secrets or could not be scanned.
+    """
+    confidence_flag = ['--confidence-level', confidence_level] if confidence_level else []
+    confidence_msg = ' --confidence-level {}'.format(confidence_level) if confidence_level else ''
+
+    secret_files = []
+    failed_files = []
+    for f in modified_files:
+        cmd = ['azdev', 'scan', '-f', f, '--continue-on-failure'] + confidence_flag
+        logger.info('Scanning: %s', f)
+        try:
+            output = check_output(cmd).decode('utf-8', errors='replace')
+            # Index instead of .get(): a report without the key (or a non-object
+            # payload) must count as a failed scan, not as a clean file.
+            if json.loads(output)['secrets_detected'] is True:
+                logger.error(
+                    '\033[0;31mSecrets detected from %s, Please remove or replace it. '
+                    'You can run \'azdev scan%s\'/\'azdev mask%s\' locally to fix.\033[0m',
+                    f, confidence_msg, confidence_msg
+                )
+                secret_files.append(f)
+        except CalledProcessError as e:
+            logger.error('azdev scan failed for %s: %s', f, e)
+            failed_files.append(f)
+        except (json.JSONDecodeError, KeyError, TypeError) as e:
+            logger.error('Failed to parse azdev scan output for %s: %s', f, e)
+            failed_files.append(f)
+
+    # Summarise after the loop so one bad file does not hide findings in the rest.
+    if secret_files:
+        logger.error('Secrets detected in %d file(s): %s', len(secret_files), secret_files)
+    if failed_files:
+        logger.error('Scan failed for %d file(s): %s', len(failed_files), failed_files)
+    if secret_files or failed_files:
+        sys.exit(1)  # fail the pipeline step
+    logger.info('-' * 100)
+    logger.info('No secrets detected in any modified files.')
+    logger.info('-' * 100)
+
+
+def main():
+    """Parse the command line, collect the PR's modified files and scan them."""
+    # argparse is only needed by this entry point, so it is imported locally.
+    import argparse
+    arg_parser = argparse.ArgumentParser(description='azdev scan on modified extensions')
+    arg_parser.add_argument(
+        '--confidence-level', type=str, default=None,
+        help='Confidence level for azdev scan (e.g., MEDIUM). '
+             'Default: HIGH (azdev scan default).')
+    options = arg_parser.parse_args()
+
+    changed_files = find_modified_files_against_master_branch()
+    if changed_files:
+        run_scan(changed_files, confidence_level=options.confidence_level)
+    else:
+        logger.info('No modified files found, skipping scan.')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/ci/util.py b/scripts/ci/util.py
@@ -10,7 +10,7 @@
 import json
 import zipfile
 
-from subprocess import check_output
+from subprocess import check_call, check_output
 
 logger = logging.getLogger(__name__)
 
@@ -163,3 +163,67 @@ def diff_code(start, end):
                    f'end: {end}, '
                    f'diff_ref: {diff_ref}.')
     return diff_ref
+
+
+def find_modified_files_against_master_branch():
+    """
+    Return files under src/ changed between the PR merge-base and HEAD.
+    Diff filter ACMRT: Added, Copied, Modified, Renamed, Type changed; deleted files are excluded.
+    Returns [] when ADO_PULL_REQUEST_TARGET_BRANCH is unset or still an unexpanded pipeline macro.
+    """
+    ado_pr_target_branch = os.environ.get('ADO_PULL_REQUEST_TARGET_BRANCH')
+    if not ado_pr_target_branch or ado_pr_target_branch == '$(System.PullRequest.TargetBranch)':
+        logger.warning('ADO_PULL_REQUEST_TARGET_BRANCH is not available, skip diff.')
+        return []
+
+    normalized_branch = re.sub(  # strip leading ref prefixes, e.g. 'refs/heads/main' -> 'main'
+        r'^(?:refs/remotes/origin/|refs/heads/|origin/)+', '', ado_pr_target_branch
+    )
+
+    ado_pr_target_branch = 'origin/{}'.format(normalized_branch)
+
+    logger.info('-' * 100)
+    logger.info('pull request target branch: %s', ado_pr_target_branch)
+
+    # Ensure target ref exists and has enough history for merge-base.
+    # Only use --deepen when the repo is a shallow clone.
+    is_shallow = os.path.isfile(os.path.join('.git', 'shallow'))  # relative to the current directory
+    fetch_cmd = ['git', 'fetch', 'origin']
+    if is_shallow:
+        fetch_cmd.append('--deepen=50')
+    fetch_cmd.append('refs/heads/{}:refs/remotes/origin/{}'.format(normalized_branch, normalized_branch))
+    check_call(fetch_cmd)  # raises CalledProcessError if the fetch exits non-zero
+
+    try:
+        merge_base = check_output([
+            'git', 'merge-base', 'HEAD', ado_pr_target_branch
+        ]).decode('utf-8').strip()
+    except Exception:  # e.g. git exits non-zero when it finds no common ancestor
+        if is_shallow:
+            logger.warning('merge-base failed after --deepen=50, falling back to --unshallow')
+            check_call([
+                'git',
+                'fetch',
+                'origin',
+                '--unshallow',
+                'refs/heads/{}:refs/remotes/origin/{}'.format(normalized_branch, normalized_branch),
+            ])
+            merge_base = check_output([
+                'git', 'merge-base', 'HEAD', ado_pr_target_branch
+            ]).decode('utf-8').strip()
+        else:
+            raise  # not a shallow clone: no fallback, re-raise the original error
+
+    logger.info('merge base: %s', merge_base)
+
+    cmd = ['git', '--no-pager', 'diff', '--name-only', '--diff-filter=ACMRT', merge_base, 'HEAD', '--', 'src/']
+    files = check_output(cmd).decode('utf-8').split('\n')
+    files = [f for f in files if len(f) > 0]  # drop the empty entry left by the trailing newline
+
+    if files:
+        logger.info('modified files:')
+        logger.info('-' * 100)
+        for f in files:
+            logger.info(f)
+
+    return files