From ceebd2233705a0ba6a2aafb1516c6e69a2ce5872 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 12 Mar 2026 17:07:29 -0700 Subject: [PATCH 001/103] feat: bump windows image version for 2026-03B (#8074) Co-authored-by: Jane Jung Co-authored-by: janenotjung-hue <107402425+janenotjung-hue@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: aks-node-assistant[bot] <190555641+aks-node-assistant[bot]@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani From 57cd40675114782c7c276638d1bcf6838464ad92 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 15 Mar 2026 22:08:49 -0700 Subject: [PATCH 002/103] feat(rcv1p): unify cert bootstrap flow and add Windows CA refresh task https://eng.ms/docs/products/onecert-certificates-key-vault-and-dsms/onecert-customer-guide/autorotationandecr/overviewrcv https://eng.ms/docs/products/onecert-certificates-key-vault-and-dsms/onecert-customer-guide/autorotationandecr/rcv1ptsg cse_cmd.sh.gtpl: derive cert endpoint mode from target cloud and always run custom-cloud init script. cse_cmd.sh: same mode logic as template; remove LOCATION export. init-aks-custom-cloud.sh: merged legacy + operation-requests logic into one script with distro-aware cert install paths. parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh: removed (merged into unified script). parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh: removed (merged into unified script). parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests-mariner.sh: removed (merged into unified script). const.go: keep only unified custom-cloud init script constant. variables.go: simplify script selection to always use unified init script. kubernetesfunc.ps1: add location-aware CA retrieval (legacy/rcv1p) and scheduled refresh task registration helper. kuberneteswindowssetup.ps1: pass location to CA retrieval and register refresh task for custom cloud. 
Signed-off-by: Ramkumar Chinchani --- aks-node-controller/parser/helper.go | 7 +- .../parser/templates/cse_cmd.sh.gtpl | 1 + .../init-aks-custom-cloud-mariner.sh | 186 --------- ...custom-cloud-operation-requests-mariner.sh | 236 ------------ ...nit-aks-custom-cloud-operation-requests.sh | 346 ----------------- .../artifacts/init-aks-custom-cloud.sh | 358 ++++++++++++++++-- parts/windows/kuberneteswindowssetup.ps1 | 5 +- pkg/agent/const.go | 9 +- pkg/agent/variables.go | 19 +- staging/cse/windows/kubernetesfunc.ps1 | 132 +++++-- 10 files changed, 455 insertions(+), 844 deletions(-) delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests-mariner.sh delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 168cfc48a80..e01edec143e 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -64,6 +64,7 @@ func getFuncMap() template.FuncMap { return template.FuncMap{ "getInitAKSCustomCloudFilepath": getInitAKSCustomCloudFilepath, "getIsAksCustomCloud": getIsAksCustomCloud, + "getCloudLocation": getCloudLocation, } } @@ -538,11 +539,15 @@ func getIsAksCustomCloud(customCloudConfig *aksnodeconfigv1.CustomCloudConfig) b return strings.EqualFold(customCloudConfig.GetCustomCloudEnvName(), helpers.AksCustomCloudName) } +func getCloudLocation(v *aksnodeconfigv1.Configuration) string { + return strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) +} + /* GetCloudTargetEnv determines and returns whether the region is a sovereign cloud which have their own data compliance regulations (China/Germany/USGov) or standard. */ // Azure public cloud. 
func getCloudTargetEnv(v *aksnodeconfigv1.Configuration) string { - loc := strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) + loc := getCloudLocation(v) switch { case strings.HasPrefix(loc, "china"): return "AzureChinaCloud" diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index b1359b071d9..d685a3444da 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -3,4 +3,5 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" {{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION="{{getCloudLocation .}}" /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh deleted file mode 100644 index 587da9ba270..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_MARINER=0 -IS_AZURELINUX=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . 
/etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Mariner"* ]]; then - IS_MARINER=1 - elif [[ $NAME == *"Microsoft Azure Linux"* ]]; then - IS_AZURELINUX=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -certs=$(curl "http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json") -IFS_backup=$IFS -IFS=$'\r\n' -certNames=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) -certBodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) -for i in ${!certBodies[@]}; do - echo ${certBodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed 's/.cer/.crt/g')" -done -IFS=$IFS_backup - -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -cloud-init status --wait - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done - echo "Azure Linux repo setup complete." -} - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -# Set the chrony config to use the PHC /dev/ptp0 clock -cat > /etc/chrony.conf < "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -# Copy all certificate files to the Mariner/AzureLinux system certificate directory -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - -# Update the system certificate store using Mariner/AzureLinux command -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done -} - -cloud-init status --wait - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -#EOF diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh deleted file mode 100644 index 99ae86d0242..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh +++ /dev/null @@ -1,346 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_FLATCAR=0 -IS_UBUNTU=0 -IS_ACL=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . /etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Ubuntu"* ]]; then - IS_UBUNTU=1 - elif [[ $ID == *"flatcar"* ]]; then - IS_FLATCAR=1 - elif [[ $ID == "azurecontainerlinux" ]] || { [[ $ID == "azurelinux" ]] && [[ ${VARIANT_ID:-} == "azurecontainerlinux" ]]; }; then - IS_ACL=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -WIRESERVER_ENDPOINT="http://168.63.129.16" - -# Function to make HTTP request with retry logic for rate limiting -make_request_with_retry() { - local url="$1" - local max_retries=10 - local retry_delay=3 - local attempt=1 - - local response - while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") - local request_status=$? 
- - if echo "$response" | grep -q "RequestRateLimitExceeded"; then - sleep $retry_delay - retry_delay=$((retry_delay * 2)) - attempt=$((attempt + 1)) - elif [ $request_status -ne 0 ]; then - sleep $retry_delay - attempt=$((attempt + 1)) - else - echo "$response" - return 0 - fi - done - - echo "exhausted all retries, last response: $response" - return 1 -} - -# Function to process certificate operations from a given endpoint -process_cert_operations() { - local endpoint_type="$1" - local operation_response - - echo "Retrieving certificate operations for type: $endpoint_type" - operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") - local request_status=$? - if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" - return - fi - - # Extract ResourceFileName values from the JSON response - local cert_filenames - mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') - - if [ ${#cert_filenames[@]} -eq 0 ]; then - echo "No certificate filenames found in response for $endpoint_type" - return - fi - - # Process each certificate file - for cert_filename in "${cert_filenames[@]}"; do - echo "Processing certificate file: $cert_filename" - - # Extract filename and extension - local filename="${cert_filename%.*}" - local extension="${cert_filename##*.}" - - echo "Downloading certificate: filename=$filename, extension=$extension" - - # Retrieve the actual certificate content with retry logic - local cert_content - cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") - local request_status=$? 
- if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" - continue - fi - - if [ -n "$cert_content" ]; then - # Save the certificate to the appropriate location - echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "${IS_FLATCAR}" -eq 0 ]; then - # Copy all certificate files to the system certificate directory - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - - # Update the system certificate store - update-ca-certificates - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem -else - for cert in /root/AzureCACertificates/*.crt; do - destcert="${cert##*/}" - destcert="${destcert%.*}.pem" - cp "$cert" /etc/ssl/certs/"$destcert" - done - update-ca-certificates -fi - - - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -function init_ubuntu_main_repo_depot { - local repodepot_endpoint="$1" - # Initialize directory for keys - mkdir -p /etc/apt/keyrings - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - echo "Copying updated bundle to OpenSSL .pem file..." - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem - echo "Updated bundle copied." 
- - # Back up sources.list and sources.list.d contents - mkdir -p /etc/apt/backup/ - if [ -f "/etc/apt/sources.list" ]; then - mv /etc/apt/sources.list /etc/apt/backup/ - fi - for sources_file in /etc/apt/sources.list.d/*; do - if [ -f "$sources_file" ]; then - mv "$sources_file" /etc/apt/backup/ - fi - done - - # Set location of sources file - . /etc/os-release - aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" - - # Create main sources file - cat < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. 
Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -if [ "$IS_UBUNTU" -eq 1 ]; then - scriptPath=$0 - # Determine an absolute, canonical path to this script for use in cron. - if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" - fi - - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi - fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update -elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" - - cat >"$svc" <"$tmr" < "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed "s/.cer/.${ext}/g")" -done -IFS=$IFS_backup +WIRESERVER_ENDPOINT="http://168.63.129.16" + +function make_request_with_retry { + local url="$1" + local max_retries=10 + local retry_delay=3 + local attempt=1 + + local response + while [ $attempt -le $max_retries ]; do + response=$(curl -f --no-progress-meter "$url") + local request_status=$? 
+ + if echo "$response" | grep -q "RequestRateLimitExceeded"; then + sleep $retry_delay + retry_delay=$((retry_delay * 2)) + attempt=$((attempt + 1)) + elif [ $request_status -ne 0 ]; then + sleep $retry_delay + attempt=$((attempt + 1)) + else + echo "$response" + return 0 + fi + done -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "$IS_FLATCAR" -eq 1 ]; then - cp /root/AzureCACertificates/*.pem /etc/ssl/certs/ - update-ca-certificates -else - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - update-ca-certificates + echo "exhausted all retries, last response: $response" + return 1 +} - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem +function is_opted_in_for_root_certs { + local opt_in_response + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + local request_status=$? 
+ if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state" + return 1 + fi + + if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true" + return 0 + fi + + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return 1 +} + +function get_trust_store_dir { + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + echo "/etc/pki/ca-trust/source/anchors" + elif [ "$IS_FLATCAR" -eq 1 ]; then + echo "/etc/ssl/certs" + else + echo "/usr/local/share/ca-certificates" + fi +} + +function debug_print_trust_store { + local stage="$1" + local trust_store_dir + + trust_store_dir=$(get_trust_store_dir) + echo "Trust store contents ${stage} cert copy: ${trust_store_dir}" + ls -al "$trust_store_dir" || true +} + +function retrieve_legacy_certs { + local certs + local cert_names + local cert_bodies + local i + + # make_request_with_retry prints a diagnostic on stdout when it gives up, so a non-empty + # result does not imply success; check the exit status like the other wireserver callers do. + certs=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=cacertificates&ext=json") + local request_status=$? + if [ $request_status -ne 0 ] || [ -z "$certs" ]; then + echo "Warning: failed to retrieve legacy custom cloud certificates" + return 1 + fi + + IFS_backup=$IFS + IFS=$'\r\n' + cert_names=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) + cert_bodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) + for i in ${!cert_bodies[@]}; do + echo ${cert_bodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${cert_names[$i]} | sed 's/.cer/.crt/g')" + done + IFS=$IFS_backup +} + +function process_cert_operations { + local endpoint_type="$1" + local operation_response + + echo "Retrieving certificate operations for type: $endpoint_type" + operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") + local request_status=$? 
+ if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" + return 1 + fi + + local cert_filenames + mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') + + if [ ${#cert_filenames[@]} -eq 0 ]; then + echo "No certificate filenames found in response for $endpoint_type" + return 1 + fi + + for cert_filename in "${cert_filenames[@]}"; do + echo "Processing certificate file: $cert_filename" + + local filename="${cert_filename%.*}" + local extension="${cert_filename##*.}" + local cert_content + + cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") + local request_status=$? + if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" + continue + fi + + echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" + echo "Successfully saved certificate: $cert_filename" + done +} + +function retrieve_rcv1p_certs { + process_cert_operations "operationrequestsroot" || return 1 + process_cert_operations "operationrequestsintermediate" || return 1 +} + +function install_certs_to_trust_store { + mkdir -p /root/AzureCACertificates + + debug_print_trust_store "before" + + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ + update-ca-trust + elif [ "$IS_FLATCAR" -eq 1 ]; then + for cert in /root/AzureCACertificates/*.crt; do + destcert="${cert##*/}" + destcert="${destcert%.*}.pem" + cp "$cert" /etc/ssl/certs/"$destcert" + done + update-ca-certificates + else + cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ + update-ca-certificates + + # This 
copies the updated bundle to the location used by OpenSSL which is commonly used + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + fi + + debug_print_trust_store "after" +} + +# Certificate refresh behavior summary: +# - legacy mode is selected only for ussec*/usnat* locations and downloads certificates directly from wireserver. +# - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. +# - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. + +location_normalized="${LOCATION,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +fi + +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac +echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +rm -f /root/AzureCACertificates/* +if [ "$cert_endpoint_mode" = "legacy" ]; then + if retrieve_legacy_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve legacy certificates from wireserver; continuing without trust store updates" + fi +elif [ "$cert_endpoint_mode" = "rcv1p" ]; then + if is_opted_in_for_root_certs; then + if retrieve_rcv1p_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve rcv1p certificates from wireserver; continuing without trust store updates" + fi + fi fi # This section creates a cron job to poll for refreshed CA certs daily @@ -201,7 +371,80 @@ function init_ubuntu_pmc_repo_depot { add_ms_keys $repodepot_endpoint } -if [ "$IS_UBUNTU" -eq 1 ]; then +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" 
/etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." +} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." + done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! 
(dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. if command -v readlink >/dev/null 2>&1; then @@ -260,11 +503,72 @@ EOF systemctl enable --now azure-ca-refresh.timer fi +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z "$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + # Disable systemd-timesyncd and install chrony and uses local time source # ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, # so it uses only the local PTP clock and has no DHCP-injectable NTP sources. 
if [ "$IS_ACL" -eq 1 ]; then echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $certFilePath + } + + return $true } - Write-Log "Convert CA certificates rawdata" - $caCerts=($rawData.Content) | ConvertFrom-Json - if ([string]::IsNullOrEmpty($caCerts)) { - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_EMPTY_CA_CERTIFICATES -ErrorMessage "CA certificates rawdata is empty" + $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' + $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if (($optInResponse.Content -notmatch 'IsOptedInForRootCerts=true')) { + Write-Log "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return $false } - $certificates = $caCerts.Certificates - for ($index = 0; $index -lt $certificates.Length ; $index++) { - $name=$certificates[$index].Name - $certFilePath = Join-Path $caFolder $name - Write-Log "Write certificate $name to $certFilePath" - $certificates[$index].CertBody > $certFilePath + $operationRequestTypes = @("operationrequestsroot", "operationrequestsintermediate") + $downloadedAny = $false + + foreach ($requestType in $operationRequestTypes) { + $operationRequestUri = "http://168.63.129.16/machine?comp=acmspackage&type=$requestType&ext=json" + $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $operationJson = ($operationResponse.Content) | ConvertFrom-Json + + if ($null -eq $operationJson -or $null -eq $operationJson.OperationRequests) { + Write-Log "Warning: no operation requests found for $requestType" + continue + } + + foreach ($operation in $operationJson.OperationRequests) { + $resourceFileName = $operation.ResouceFileName + if 
([string]::IsNullOrEmpty($resourceFileName)) { + continue + } + + $resourceType = [IO.Path]::GetFileNameWithoutExtension($resourceFileName) + $resourceExt = [IO.Path]::GetExtension($resourceFileName).TrimStart('.') + $resourceUri = "http://168.63.129.16/machine?comp=acmspackage&type=$resourceType&ext=$resourceExt" + + $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if ([string]::IsNullOrEmpty($certContentResponse.Content)) { + Write-Log "Warning: empty certificate content for $resourceFileName" + continue + } + + $certFilePath = Join-Path $caFolder $resourceFileName + Write-Log "Write certificate $resourceFileName to $certFilePath" + $certContentResponse.Content > $certFilePath + $downloadedAny = $true + } + } + + if (-not $downloadedAny) { + Write-Log "Warning: no CA certificates were downloaded in rcv1p mode" } + + return $downloadedAny } catch { - # Catch all exceptions in this function. NOTE: exit cannot be caught. - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_GET_CA_CERTIFICATES -ErrorMessage $_ + Write-Log "Warning: failed to retrieve CA certificates. 
Error: $_" + return $false } } From 965f64f4882f9dbf236cbd6017cac8676370f93e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 18 Mar 2026 14:08:50 -0700 Subject: [PATCH 003/103] feat: enhance CA certificates refresh task with endpoint mode based on location Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 33 ++++++++++++------- parts/windows/kuberneteswindowssetup.ps1 | 4 ++- staging/cse/windows/kubernetesfunc.ps1 | 15 +++++---- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index fab9e105975..9f3b4fe479e 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -198,16 +198,28 @@ function install_certs_to_trust_store { # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. # - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. 
-location_normalized="${LOCATION,,}" -location_normalized="${location_normalized//[[:space:]]/}" -if [ -z "$location_normalized" ]; then - echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +# Action values: +# - init: normal provisioning path +# - ca-refresh: scheduled refresh path +action=${1:-init} +requested_cert_endpoint_mode="${2:-}" + +cert_endpoint_mode="" +if [ "$action" = "ca-refresh" ] && [ -n "$requested_cert_endpoint_mode" ]; then + cert_endpoint_mode="${requested_cert_endpoint_mode,,}" +else + location_normalized="${LOCATION,,}" + location_normalized="${location_normalized//[[:space:]]/}" + if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" + fi + + cert_endpoint_mode="rcv1p" + case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; + esac fi -cert_endpoint_mode="rcv1p" -case "$location_normalized" in - ussec*|usnat*) cert_endpoint_mode="legacy" ;; -esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then @@ -228,7 +240,6 @@ fi # This section creates a cron job to poll for refreshed CA certs daily # It can be removed if not needed or desired -action=${1:-init} if [ "$action" = "ca-refresh" ]; then exit fi @@ -454,7 +465,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$cert_endpoint_mode\"") | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi @@ -483,7 +494,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=$script_path ca-refresh +ExecStart=$script_path ca-refresh $cert_endpoint_mode EOF cat >"$tmr" < Date: Wed, 18 Mar 2026 17:14:10 -0700 Subject: [PATCH 004/103] feat: add tests for certificate endpoint mode handling in AKS custom cloud spec Signed-off-by: Ramkumar Chinchani --- .../artifacts/init_aks_custom_cloud_spec.sh | 39 +++++ staging/cse/windows/kubernetesfunc.tests.ps1 | 147 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh create mode 100644 staging/cse/windows/kubernetesfunc.tests.ps1 diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh new file mode 100644 index 00000000000..f00709306c2 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +Describe 'init-aks-custom-cloud.sh refresh mode wiring' + script_path='./parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh' + + It 'parses action and optional requested cert endpoint mode arguments' + When run grep -Eq '^action=\$\{1:-init\}$' "$script_path" + The status should eq 0 + + When run grep -Eq '^requested_cert_endpoint_mode="\$\{2:-\}"$' "$script_path" + The status should eq 0 + End + + It 'uses requested mode during ca-refresh when provided' + When run grep -Eq '^if \[ "\$action" = "ca-refresh" \] && \[ -n "\$requested_cert_endpoint_mode" \]; then$' "$script_path" + The status should eq 0 + + When run grep -Eq '^\s*cert_endpoint_mode="\$\{requested_cert_endpoint_mode,,\}"$' "$script_path" + The status should eq 0 + End + + It 'exits early in ca-refresh mode after certificate refresh 
logic' + When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" + The status should eq 0 + + When run grep -Eq '^\s*exit$' "$script_path" + The status should eq 0 + End + + It 'passes cert endpoint mode into cron refresh command' + When run grep -Eq 'ca-refresh "\$cert_endpoint_mode"' "$script_path" + The status should eq 0 + End + + It 'passes cert endpoint mode into systemd refresh command' + When run grep -Eq '^ExecStart=\$script_path ca-refresh \$cert_endpoint_mode$' "$script_path" + The status should eq 0 + End +End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 new file mode 100644 index 00000000000..ba14ebb48ef --- /dev/null +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -0,0 +1,147 @@ +if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null +} + +function Write-Log { + param($Message) + Write-Host "$Message" +} + +function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" +} + +function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" +} + +function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } +} + +function Get-ScheduledTask { + param($TaskName, $ErrorAction) +} + +function New-ScheduledTaskAction { + param($Execute, $Argument) +} + +function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) +} + +function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) +} + +function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) +} + +function Register-ScheduledTask { + param($TaskName, $InputObject) +} + +. $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1 +. 
$PSCommandPath.Replace('.tests.ps1', '.ps1') + +Describe 'Get-CustomCloudCertEndpointModeFromLocation' { + It 'returns legacy for ussec regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should Be 'legacy' + } + + It 'returns legacy for usnat regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should Be 'legacy' + } + + It 'returns rcv1p for public regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should Be 'rcv1p' + } + + It 'handles mixed-case input' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should Be 'legacy' + } +} + +Describe 'Register-CACertificatesRefreshTask' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event + Mock Write-Log + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'skips registration when the task already exists' { + Mock Get-ScheduledTask -MockWith { return @{ TaskName = 'aks-ca-certs-refresh-task' } } + + Register-CACertificatesRefreshTask -Location 'southcentralus' -CertEndpointMode 'rcv1p' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 0 + Assert-MockCalled -CommandName New-ScheduledTaskAction -Exactly -Times 0 + } + + It 'creates a scheduled task that passes the explicit cert endpoint mode' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask -Location 'southcentralus' -CertEndpointMode 'rcv1p' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should Match 
([regex]::Escape("Get-CACertificates -Location 'southcentralus' -CertEndpointMode 'rcv1p'")) + } +} + +Describe 'Get-CACertificates' { + BeforeEach { + Mock Write-Log + Mock Create-Directory -MockWith { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } + } + + if (Test-Path 'C:\ca') { + Remove-Item -Path 'C:\ca' -Recurse -Force + } + } + + It 'uses the legacy endpoint when CertEndpointMode is legacy regardless of location' { + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' + } + } + + $result = Get-CACertificates -Location 'southcentralus' -CertEndpointMode 'legacy' + + $result | Should Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + } + + It 'returns false when certificate retrieval throws' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + $result = Get-CACertificates -Location 'ussecwest' -CertEndpointMode 'rcv1p' + + $result | Should Be $false + } +} From 8b629dea9325554d0f9e3f37aa06fc7868b6e1a5 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 19 Mar 2026 12:44:29 -0700 Subject: [PATCH 005/103] feat: simplify certificate endpoint mode handling and refresh task registration Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 41 ++++++++----------- parts/windows/kuberneteswindowssetup.ps1 | 4 +- .../artifacts/init_aks_custom_cloud_spec.sh | 21 ++++++---- staging/cse/windows/kubernetesfunc.ps1 | 15 +++---- staging/cse/windows/kubernetesfunc.tests.ps1 | 14 +++---- 5 files 
changed, 44 insertions(+), 51 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 9f3b4fe479e..c7176be2393 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -198,28 +198,19 @@ function install_certs_to_trust_store { # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. # - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. -# Action values: -# - init: normal provisioning path -# - ca-refresh: scheduled refresh path -action=${1:-init} -requested_cert_endpoint_mode="${2:-}" - -cert_endpoint_mode="" -if [ "$action" = "ca-refresh" ] && [ -n "$requested_cert_endpoint_mode" ]; then - cert_endpoint_mode="${requested_cert_endpoint_mode,,}" -else - location_normalized="${LOCATION,,}" - location_normalized="${location_normalized//[[:space:]]/}" - if [ -z "$location_normalized" ]; then - echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" - fi +refresh_location="${2:-${LOCATION}}" - cert_endpoint_mode="rcv1p" - case "$location_normalized" in - ussec*|usnat*) cert_endpoint_mode="legacy" ;; - esac +location_normalized="${refresh_location,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" fi +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac + echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then @@ -238,8 +229,12 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then fi fi -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not 
needed or desired +# In ca-refresh mode (invoked by the scheduled cron/systemd task with the location as arg), +# only the cert refresh above is needed; exit before running the full init path. +# Action values: +# - init (default): full provisioning path +# - ca-refresh : periodic refresh path; location is passed as arg to avoid env dependency +action=${1:-init} if [ "$action" = "ca-refresh" ]; then exit fi @@ -465,7 +460,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$cert_endpoint_mode\"") | crontab -; then + if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi @@ -494,7 +489,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=$script_path ca-refresh $cert_endpoint_mode +ExecStart=$script_path ca-refresh $LOCATION EOF cat >"$tmr" < Date: Thu, 19 Mar 2026 13:04:03 -0700 Subject: [PATCH 006/103] feat: implement conditional CA certificates refresh task registration for legacy and opted-in rcv1p modes Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 29 +++++++++------ parts/windows/kuberneteswindowssetup.ps1 | 4 ++- .../artifacts/init_aks_custom_cloud_spec.sh | 11 ++++++ staging/cse/windows/kubernetesfunc.ps1 | 21 +++++++++++ staging/cse/windows/kubernetesfunc.tests.ps1 | 36 +++++++++++++++++++ 5 files changed, 89 insertions(+), 12 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index c7176be2393..eeb01c392fe 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ 
b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -212,8 +212,10 @@ case "$location_normalized" in esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +install_ca_refresh_schedule=0 rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then + install_ca_refresh_schedule=1 if retrieve_legacy_certs; then install_certs_to_trust_store else @@ -221,6 +223,7 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if is_opted_in_for_root_certs; then + install_ca_refresh_schedule=1 if retrieve_rcv1p_certs; then install_certs_to_trust_store else @@ -458,10 +461,12 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" fi - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then + # Quote the script path in the cron entry to avoid issues with spaces or special characters. + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then + echo "Failed to install ca-refresh cron job via crontab" >&2 + fi fi fi @@ -477,11 +482,12 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 echo "Running apt-get update" aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + script_path="$(readlink -f "$0")" + svc="/etc/systemd/system/azure-ca-refresh.service" + tmr="/etc/systemd/system/azure-ca-refresh.timer" - cat >"$svc" <"$svc" <"$tmr" <"$tmr" < Date: Thu, 19 Mar 2026 14:54:49 -0700 Subject: [PATCH 007/103] feat: enhance CA certificates refresh task registration for legacy CSE packages Signed-off-by: Ramkumar Chinchani --- parts/windows/kuberneteswindowssetup.ps1 | 9 ++++++++- .../cloud-init/artifacts/init_aks_custom_cloud_spec.sh | 10 +++++----- staging/cse/windows/kubernetesfunc.tests.ps1 | 3 --- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index f53ffc32dc9..a4e568de423 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -487,7 +487,14 @@ function BasePrep { Adjust-DynamicPortRange Register-LogsCleanupScriptTask Register-NodeResetScriptTask - if (Should-InstallCACertificatesRefreshTask -Location $Location) { + # Guard against older CSE packages that do not yet export Should-InstallCACertificatesRefreshTask. + # If the function is absent (old package), fall back to the previous unconditional behaviour so + # that legacy/ussec/usnat clusters continue to register the refresh task. 
+ if (Get-Command -Name Should-InstallCACertificatesRefreshTask -ErrorAction Ignore) { + if (Should-InstallCACertificatesRefreshTask -Location $Location) { + Register-CACertificatesRefreshTask -Location $Location + } + } elseif (Get-Command -Name Register-CACertificatesRefreshTask -ErrorAction Ignore) { Register-CACertificatesRefreshTask -Location $Location } diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index f85f580a8cc..8b54975d51b 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -18,7 +18,7 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^location_normalized="\$\{refresh_location,,\}"$' "$script_path" The status should eq 0 - When run grep -Eq 'ussec\*\|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" + When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End @@ -26,10 +26,10 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^install_ca_refresh_schedule=0$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*install_ca_refresh_schedule=1$' "$script_path" + When run grep -Eq '^[[:space:]]*install_ca_refresh_schedule=1$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" + When run grep -Eq '^[[:space:]]*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" The status should eq 0 End @@ -37,12 +37,12 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*exit$' "$script_path" + When run grep -Eq '^[[:space:]]*exit$' "$script_path" The status should eq 0 End It 'passes LOCATION directly into cron refresh 
command' - When run grep -Eq 'ca-refresh \\\\"\$LOCATION\\\\"' "$script_path" + When run grep -Eq 'ca-refresh \\"\$LOCATION\\"' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 948cd229dc0..8b062a273d0 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -74,7 +74,6 @@ Describe 'Register-CACertificatesRefreshTask' { $script:lastScheduledTaskArgument = $null Mock Logs-To-Event - Mock Write-Log Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } @@ -107,7 +106,6 @@ Describe 'Register-CACertificatesRefreshTask' { Describe 'Should-InstallCACertificatesRefreshTask' { BeforeEach { - Mock Write-Log } It 'returns true for legacy regions without calling the opt-in endpoint' { @@ -143,7 +141,6 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Describe 'Get-CACertificates' { BeforeEach { - Mock Write-Log Mock Create-Directory -MockWith { param($FullPath, $DirectoryUsage) if (-not (Test-Path $FullPath)) { From 53abecce0296712a833033f89805b0cf0d1d2601 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 19 Mar 2026 23:27:10 -0700 Subject: [PATCH 008/103] feat: update tests for certificate endpoint mode handling and refresh schedule installation Signed-off-by: Ramkumar Chinchani --- .../artifacts/init_aks_custom_cloud_spec.sh | 12 ++++++++++-- staging/cse/windows/kubernetesfunc.tests.ps1 | 7 +++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index 8b54975d51b..58812659856 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ 
b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -17,26 +17,34 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' It 'always derives cert endpoint mode from refresh_location' When run grep -Eq '^location_normalized="\$\{refresh_location,,\}"$' "$script_path" The status should eq 0 + End + It 'maps ussec/usnat locations to legacy cert endpoint mode' When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End - It 'installs refresh schedule only for legacy mode or opted-in rcv1p mode' + It 'initializes refresh schedule installation as disabled' When run grep -Eq '^install_ca_refresh_schedule=0$' "$script_path" The status should eq 0 + End + It 'enables refresh schedule installation for eligible certificate modes' When run grep -Eq '^[[:space:]]*install_ca_refresh_schedule=1$' "$script_path" The status should eq 0 + End + It 'gates refresh schedule installation on install_ca_refresh_schedule' When run grep -Eq '^[[:space:]]*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" The status should eq 0 End - It 'exits early in ca-refresh mode after certificate refresh logic' + It 'checks for ca-refresh mode after certificate refresh logic' When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" The status should eq 0 + End + It 'exits early in ca-refresh mode after certificate refresh logic' When run grep -Eq '^[[:space:]]*exit$' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8b062a273d0..42e15c4fc25 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -48,8 +48,11 @@ function Register-ScheduledTask { param($TaskName, $InputObject) } -. $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1 -. 
$PSCommandPath.Replace('.tests.ps1', '.ps1') +$helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' +$scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + +. $helperScriptPath +. $scriptUnderTestPath Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { From d0bb7e6d238895e3c8804054e57daed977c111b7 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 07:42:47 -0700 Subject: [PATCH 009/103] feat: refactor test setup functions for improved readability and consistency Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.tests.ps1 | 102 +++++++++---------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 42e15c4fc25..3f9f403666b 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -1,58 +1,64 @@ -if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { - New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null -} +BeforeAll { + if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null + } -function Write-Log { - param($Message) - Write-Host "$Message" -} + function Write-Log { + param($Message) + Write-Host "$Message" + } -function Logs-To-Event { - param($TaskName, $TaskMessage) - Write-Host "$TaskName $TaskMessage" -} + function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" + } -function Set-ExitCode { - param($ExitCode, $ErrorMessage) - throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" -} + function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" + } -function Create-Directory { - param($FullPath, $DirectoryUsage) - 
if (-not (Test-Path $FullPath)) { - New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } } -} -function Get-ScheduledTask { - param($TaskName, $ErrorAction) -} + function Get-ScheduledTask { + param($TaskName, $ErrorAction) + } -function New-ScheduledTaskAction { - param($Execute, $Argument) -} + function New-ScheduledTaskAction { + param($Execute, $Argument) + } -function New-ScheduledTaskPrincipal { - param($UserId, $LogonType, $RunLevel) -} + function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) + } -function New-JobTrigger { - param([switch]$Daily, $At, $DaysInterval) -} + function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) + } -function New-ScheduledTask { - param($Action, $Principal, $Trigger, $Description) -} + function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) + } -function Register-ScheduledTask { - param($TaskName, $InputObject) -} + function Register-ScheduledTask { + param($TaskName, $InputObject) + } + + function Retry-Command { + param($Command, $Args, $Retries, $RetryDelaySeconds) + } -$helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' -$scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + $helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' + $scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' -. $helperScriptPath -. $scriptUnderTestPath + . $helperScriptPath + . 
$scriptUnderTestPath +} Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { @@ -76,11 +82,11 @@ Describe 'Register-CACertificatesRefreshTask' { BeforeEach { $script:lastScheduledTaskArgument = $null - Mock Logs-To-Event + Mock Logs-To-Event -MockWith { } Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } - Mock Register-ScheduledTask + Mock Register-ScheduledTask -MockWith { } Mock New-ScheduledTaskAction -MockWith { param($Execute, $Argument) $script:lastScheduledTaskArgument = $Argument @@ -109,6 +115,7 @@ Describe 'Register-CACertificatesRefreshTask' { Describe 'Should-InstallCACertificatesRefreshTask' { BeforeEach { + Mock Retry-Command -MockWith { } } It 'returns true for legacy regions without calling the opt-in endpoint' { @@ -144,13 +151,6 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Describe 'Get-CACertificates' { BeforeEach { - Mock Create-Directory -MockWith { - param($FullPath, $DirectoryUsage) - if (-not (Test-Path $FullPath)) { - New-Item -Path $FullPath -ItemType Directory -Force | Out-Null - } - } - if (Test-Path 'C:\ca') { Remove-Item -Path 'C:\ca' -Recurse -Force } From be9ddef44bb44a801610ed052257910b5922c259 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 08:52:23 -0700 Subject: [PATCH 010/103] feat: update Get-CustomCloudCertEndpointModeFromLocation to clarify endpoint mode handling for legacy and rcv1p regions Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 2 ++ staging/cse/windows/kubernetesfunc.tests.ps1 | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 5ae9df1e217..023542b6f3c 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ 
b/staging/cse/windows/kubernetesfunc.ps1 @@ -275,11 +275,13 @@ function Get-CustomCloudCertEndpointModeFromLocation { $Location ) + # ussec/usnat regions still use the legacy certificate endpoint contract. $normalizedLocation = $Location.ToLowerInvariant() if ($normalizedLocation.StartsWith("ussec") -or $normalizedLocation.StartsWith("usnat")) { return "legacy" } + # All other regions use the rcv1p endpoint mode with opt-in gating. return "rcv1p" } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 3f9f403666b..2e95cef1338 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -62,19 +62,19 @@ BeforeAll { Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should -Be 'legacy' } It 'returns legacy for usnat regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should -Be 'legacy' } It 'returns rcv1p for public regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should Be 'rcv1p' + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should -Be 'rcv1p' } It 'handles mixed-case input' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should -Be 'legacy' } } @@ -109,7 +109,7 @@ Describe 'Register-CACertificatesRefreshTask' { Register-CACertificatesRefreshTask -Location 'southcentralus' Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 - $script:lastScheduledTaskArgument | Should Match ([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) 
+ $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) } } @@ -123,7 +123,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'ussecwest' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 } @@ -134,7 +134,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } } @@ -145,7 +145,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' - $result | Should Be $false + $result | Should -Be $false } } @@ -166,7 +166,7 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'ussecwest' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } } @@ -178,6 +178,6 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'southcentralus' - $result | Should Be $false + $result | Should -Be $false } } From 4678c4635b2c79e48834b99f61ef87af287aca74 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 09:52:25 -0700 Subject: [PATCH 011/103] feat: enhance tests for Should-InstallCACertificatesRefreshTask and Get-CACertificates to verify URI handling Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.tests.ps1 | 13 ++++++++++--- 1 file 
changed, 10 insertions(+), 3 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 2e95cef1338..8186bfabc4c 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -128,14 +128,18 @@ Describe 'Should-InstallCACertificatesRefreshTask' { } It 'returns true for rcv1p regions when opt-in is enabled' { + $script:lastRetryUri = $null Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:lastRetryUri = $PSBoundParameters['Args'].Uri return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=true' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' $result | Should -Be $true - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:lastRetryUri | Should -Be 'http://168.63.129.16/acms/isOptedInForRootCerts' } It 'returns false for rcv1p regions when opt-in is disabled' { @@ -157,8 +161,10 @@ Describe 'Get-CACertificates' { } It 'uses the legacy endpoint when location is a ussec/usnat region' { + $script:retryUris = @() Mock Retry-Command -MockWith { param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri return [PSCustomObject]@{ Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' } @@ -167,8 +173,9 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'ussecwest' $result | Should -Be $true - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + 
Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + $script:retryUris | Should -Not -Contain 'http://168.63.129.16/acms/isOptedInForRootCerts' } It 'returns false when certificate retrieval throws' { From c0bec67c044786ec85fbea353fce41d378096c4a Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 25 Mar 2026 16:58:14 -0700 Subject: [PATCH 012/103] feat: update cse_cmd.sh and cse_cmd.sh.gtpl to ensure consistent logging of custom cloud file paths Signed-off-by: Ramkumar Chinchani --- aks-node-controller/parser/templates/cse_cmd.sh.gtpl | 2 +- parts/linux/cloud-init/artifacts/cse_cmd.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index d685a3444da..42376814388 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -1,7 +1,7 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; {{if getIsAksCustomCloud .CustomCloudConfig}} REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" -{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} LOCATION="{{getCloudLocation .}}" +{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 52ffb72de76..1b366d38787 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -18,8 +18,9 @@ fi; {{end}} {{if IsAKSCustomCloud}} REPO_DEPOT_ENDPOINT="{{AKSCustomCloudRepoDepotEndpoint}}" -{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION={{GetVariable 
"location"}} +{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; ADMINUSER={{GetParameter "linuxAdminUsername"}} MOBY_VERSION={{GetParameter "mobyVersion"}} TENANT_ID={{GetVariable "tenantID"}} @@ -32,7 +33,6 @@ KUBEPROXY_URL={{GetParameter "kubeProxySpec"}} APISERVER_PUBLIC_KEY={{GetParameter "apiServerCertificate"}} SUBSCRIPTION_ID={{GetVariable "subscriptionId"}} RESOURCE_GROUP={{GetVariable "resourceGroup"}} -LOCATION={{GetVariable "location"}} VM_TYPE={{GetVariable "vmType"}} SUBNET={{GetVariable "subnetName"}} NETWORK_SECURITY_GROUP={{GetVariable "nsgName"}} From 65924a279e2f6ed4e9e16b6bff9c7f0f54888a88 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 26 Mar 2026 12:55:30 -0700 Subject: [PATCH 013/103] feat: update CA certificates functions for backward compatibility with optional Location parameter Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 35 +++++++++---- staging/cse/windows/kubernetesfunc.tests.ps1 | 52 ++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 023542b6f3c..d9852e4288d 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -71,8 +71,8 @@ function Register-NodeResetScriptTask { function Register-CACertificatesRefreshTask { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) Logs-To-Event -TaskName "AKS.WindowsCSE.RegisterCACertificatesRefreshTask" -TaskMessage "Start to register CA certificates refresh task" @@ -84,7 +84,13 @@ function Register-CACertificatesRefreshTask { return } - $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 
'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + # Include -Location only when it was provided, so older VHDs whose Get-CACertificates + # does not accept -Location can still execute the scheduled task successfully. + if ([string]::IsNullOrEmpty($Location)) { + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates | Out-Null }" + } else { + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + } $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -NonInteractive -ExecutionPolicy Bypass -Command `"$refreshCommand`"" $principal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest $trigger = New-JobTrigger -Daily -At "19:00" -DaysInterval 1 @@ -287,10 +293,14 @@ function Get-CustomCloudCertEndpointModeFromLocation { function Should-InstallCACertificatesRefreshTask { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) + # When Location is not supplied (older callers), default to legacy mode. + if ([string]::IsNullOrEmpty($Location)) { + return $true + } $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location if ($certEndpointMode -eq "legacy") { return $true @@ -308,15 +318,22 @@ function Should-InstallCACertificatesRefreshTask { function Get-CACertificates { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) $caFolder = "C:\ca" Create-Directory -FullPath $caFolder -DirectoryUsage "storing CA certificates" - $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location - Write-Log "Get CA certificates. Location: $Location. 
EndpointMode: $certEndpointMode" + # When Location is not supplied (older callers), fall back to the legacy endpoint + # which was the original behavior before the rcv1p changes. + if ([string]::IsNullOrEmpty($Location)) { + $certEndpointMode = "legacy" + Write-Log "Get CA certificates. Location not provided, defaulting to legacy endpoint mode" + } else { + $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location + Write-Log "Get CA certificates. Location: $Location. EndpointMode: $certEndpointMode" + } try { if ($certEndpointMode -eq "legacy") { diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8186bfabc4c..8ada13ee440 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -187,4 +187,56 @@ Describe 'Get-CACertificates' { $result | Should -Be $false } + + It 'falls back to legacy endpoint when called without -Location (backward compat)' { + $script:retryUris = @() + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"compat.crt","CertBody":"compat-body"}]}' + } + } + + $result = Get-CACertificates + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + } +} + +Describe 'Should-InstallCACertificatesRefreshTask - backward compat' { + It 'returns true when called without -Location (backward compat)' { + $result = Should-InstallCACertificatesRefreshTask + + $result | Should -Be $true + } +} + +Describe 'Register-CACertificatesRefreshTask - backward compat' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event -MockWith { } + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 
'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask -MockWith { } + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'creates a scheduled task without -Location when called without it (backward compat)' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates |")) + $script:lastScheduledTaskArgument | Should -Not -Match "Location" + } } From 133e6c6fa77658a8728d16cfeb9824a9aebb2166 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 27 Mar 2026 09:10:22 -0700 Subject: [PATCH 014/103] feat: remove deprecated Ubuntu repository initialization logic from init-aks-custom-cloud.sh Signed-off-by: Ramkumar Chinchani --- .../cloud-init/artifacts/init-aks-custom-cloud.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index eeb01c392fe..0c5487da414 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -469,18 +469,6 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 fi fi fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot 
${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then if [ "$install_ca_refresh_schedule" -eq 1 ]; then script_path="$(readlink -f "$0")" From c60e3c7611580f34fbcaf0801f72493da07e8c92 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 2 Apr 2026 14:18:47 -0700 Subject: [PATCH 015/103] Split init-aks-custom-cloud.sh to fix Flatcar/ACL customData size limit The unified init-aks-custom-cloud.sh script (~22KB) pushed Flatcar and ACL VM customData over Azure's 87,380 character limit, causing 16 E2E failures. Split the script into two files: - init-aks-custom-cloud.sh: cert refresh + scheduling (included for all clouds) - init-aks-custom-cloud-repos.sh: repo depot + chrony (custom cloud only) The main script sources the repos script at runtime if present. For non-custom-cloud VMs, only the smaller main script is embedded, reducing base64(gzip) size from 8,736 to 4,424 chars (-4,312 chars). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud-repos.sh | 358 ++++++++++++++++++ .../artifacts/init-aks-custom-cloud.sh | 350 +---------------- parts/linux/cloud-init/nodecustomdata.yml | 7 + pkg/agent/baker.go | 3 + pkg/agent/const.go | 5 +- pkg/agent/variables.go | 3 + 6 files changed, 381 insertions(+), 345 deletions(-) create mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh new file mode 100644 index 00000000000..0c68d513568 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -0,0 +1,358 @@ +#!/bin/bash +# This script handles repo depot initialization and chrony configuration for +# AKS custom cloud environments. 
It is sourced by init-aks-custom-cloud.sh and +# inherits all variables from it (IS_UBUNTU, IS_MARINER, IS_AZURELINUX, +# IS_FLATCAR, IS_ACL, REPO_DEPOT_ENDPOINT, etc.). +# +# This script is only included in custom cloud images to keep the base +# customData size small for non-custom-cloud scenarios. + +set -x + +function init_ubuntu_main_repo_depot { + local repodepot_endpoint="$1" + # Initialize directory for keys + mkdir -p /etc/apt/keyrings + + # This copies the updated bundle to the location used by OpenSSL which is commonly used + echo "Copying updated bundle to OpenSSL .pem file..." + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + echo "Updated bundle copied." + + # Back up sources.list and sources.list.d contents + mkdir -p /etc/apt/backup/ + if [ -f "/etc/apt/sources.list" ]; then + mv /etc/apt/sources.list /etc/apt/backup/ + fi + for sources_file in /etc/apt/sources.list.d/*; do + if [ -f "$sources_file" ]; then + mv "$sources_file" /etc/apt/backup/ + fi + done + + # Set location of sources file + . /etc/os-release + aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" + + # Create main sources file + cat < /etc/apt/sources.list.d/ubuntu.sources + +Types: deb +URIs: ${repodepot_endpoint}/ubuntu +Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security +Components: main universe restricted multiverse +Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg +EOF + + # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing + # all urls with the RepoDepot Ubuntu url + ubuntuUrl=${repodepot_endpoint}/ubuntu + echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." 
+ sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile + echo "apt source URLs converted, see new file below:" + echo "" + echo "-----" + cat $aptSourceFile + echo "-----" + echo "" +} + +function check_url { + local url=$1 + echo "Checking url: $url" + + # Use curl to check the URL and capture both stdout and stderr + curl_exit_code=$(curl -s --head --request GET $url) + # Check the exit status of curl + # shellcheck disable=SC3010 + if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then + echo "ERROR: $url is not available. Please manually check if the url is valid before re-running script" + exit 1 + fi +} + +function write_to_sources_file { + local sources_list_d_file=$1 + local source_uri=$2 + shift 2 + local key_paths=("$@") + + sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" + ubuntuDist=$(lsb_release -c | awk '{print $2}') + + tee -a $sources_file_path < /dev/null + echo "$key_name key added to keyring." +} + +function derive_key_paths { + local key_names=("$@") + local key_paths=() + + for key_name in "${key_names[@]}"; do + key_paths+=("/etc/apt/keyrings/${key_name}.gpg") + done + + echo "${key_paths[*]}" +} + +function add_ms_keys { + # Add the Microsoft package server keys to keyring. + echo "Adding Microsoft keys to keyring..." + + add_key_ubuntu microsoft.asc + add_key_ubuntu msopentech.asc +} + +function aptget_update { + echo "apt-get updating..." + echo "note: depending on how many sources have been added this may take a couple minutes..." + if apt-get update | grep -q "404 Not Found"; then + echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." + exit 1 + else + echo "apt-get update complete!" + fi +} + +function init_ubuntu_pmc_repo_depot { + local repodepot_endpoint="$1" + # Add Microsoft packages source to the azure specific sources.list. 
+ echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." + + microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" + check_url $microsoftPackageSource + write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + echo "Ubuntu ($ubuntuRel) repo added." + echo "Adding packages.microsoft.com keys" + add_ms_keys $repodepot_endpoint +} + +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." 
+} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." + done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z 
"$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + +# Disable systemd-timesyncd and install chrony and uses local time source +# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, +# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. +if [ "$IS_ACL" -eq 1 ]; then + echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $chrony_conf < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? 
-ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo; do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." -} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." 
- done - echo "Azure Linux repo setup complete." -} - -function dnf_makecache { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else - sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. @@ -504,139 +293,12 @@ EOF fi fi -if [ "$IS_UBUNTU" -eq 1 ]; then - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - if [ -n "$rootRepoDepotEndpoint" ]; then - cloud-init status --wait - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - echo "Running apt-get update" - aptget_update - else - echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" - fi -elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then - cloud-init status --wait - - marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" - else - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - fi - fi -fi - -# Disable systemd-timesyncd and install chrony and uses local time source -# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, -# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. -if [ "$IS_ACL" -eq 1 ]; then - echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" -elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then -cat > /etc/chrony.conf < $chrony_conf < Date: Mon, 13 Apr 2026 11:27:13 -0700 Subject: [PATCH 016/103] feat(e2e): add RCV1P cert mode end-to-end tests Add e2e test infrastructure and scenarios to validate RCV1P (Root Certificate V1P) certificate endpoint mode across all supported Linux distros and Windows versions. Infrastructure changes: - Introduce ClusterInfra struct to decouple cluster lifecycle functions from the default Azure subscription, enabling per-subscription clients - Refactor ~20 functions in cluster.go, kube.go, and aks_model.go to accept ClusterInfra instead of hardcoding config.Azure - Add NewAzureClientForSubscription() to construct ARM clients for any subscription, replacing the single-subscription NewAzureClient() - Add CreateVMManagedIdentityInRG() for identity-only creation without blob storage (RCV1P subscription doesn't need shared storage) - Add ClusterRCV1PKubenet cached cluster function and RCV1P-specific resource group/identity cache entries Config and pipeline: - Add RCV1P_SUBSCRIPTION_ID env var to config, with lazy-init of RCV1PAzure client and helper functions - Pass RCV1P_SUBSCRIPTION_ID through e2e-template.yaml and e2e_run.sh - Add dedicated e2e-rcv1p.yaml pipeline with daily schedule Test scenarios: - Linux: Ubuntu 22.04, Ubuntu 24.04, AzureLinux V3, Flatcar, ACL - Windows: Server 2022, 23H2, 2025 - All tests skip gracefully when RCV1P_SUBSCRIPTION_ID is unset Validators: - 
ValidateRCV1PCertMode (Linux): checks provisioning log for rcv1p mode, verifies certs in /root/AzureCACertificates, validates distro-specific trust store updates, confirms cron/systemd refresh schedule - ValidateRCV1PCertModeWindows: checks C:\AzureCACertificates directory and scheduled refresh task Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .pipelines/e2e-rcv1p.yaml | 19 ++ .pipelines/scripts/e2e_run.sh | 2 + .pipelines/templates/e2e-template.yaml | 1 + e2e/config/azure.go | 96 ++++---- e2e/config/config.go | 23 ++ e2e/scenario_rcv1p_test.go | 210 ++++++++++++++++++ e2e/scenario_rcv1p_win_test.go | 91 ++++++++ e2e/test_helpers.go | 35 ++- e2e/types.go | 68 ++++++ e2e/validators.go | 107 +++++++++ e2e/vmss.go | 37 +-- .../artifacts/init-aks-custom-cloud.sh | 3 +- .../artifacts/init_aks_custom_cloud_spec.sh | 2 +- staging/cse/windows/kubernetesfunc.ps1 | 3 +- 14 files changed, 625 insertions(+), 72 deletions(-) create mode 100644 .pipelines/e2e-rcv1p.yaml create mode 100644 e2e/scenario_rcv1p_test.go create mode 100644 e2e/scenario_rcv1p_win_test.go diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml new file mode 100644 index 00000000000..5fdf9d3a5ee --- /dev/null +++ b/.pipelines/e2e-rcv1p.yaml @@ -0,0 +1,19 @@ +name: $(Date:yyyyMMdd)$(Rev:.r) +variables: + TAGS_TO_RUN: "rcv1pcertmode=true" + SKIP_E2E_TESTS: false + E2E_GO_TEST_TIMEOUT: "75m" +schedules: + - cron: "0 11 * * *" + displayName: Daily 3am PST + branches: + include: + - main + always: true +trigger: none +pr: none +jobs: + - template: ./templates/e2e-template.yaml + parameters: + name: RCV1P Cert Mode Tests + IgnoreScenariosWithMissingVhd: false diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 1dcea264298..097fe250756 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -35,6 +35,7 @@ VHD_BUILD_ID="${VHD_BUILD_ID:-}" 
IGNORE_SCENARIOS_WITH_MISSING_VHD="${IGNORE_SCENARIOS_WITH_MISSING_VHD:-}" LOGGING_DIR="${LOGGING_DIR:-}" E2E_SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID:-}" +RCV1P_SUBSCRIPTION_ID="${RCV1P_SUBSCRIPTION_ID:-}" ENABLE_SECURE_TLS_BOOTSTRAPPING="${ENABLE_SECURE_TLS_BOOTSTRAPPING:-true}" TAGS_TO_SKIP="${TAGS_TO_SKIP:-}" TAGS_TO_RUN="${TAGS_TO_RUN:-}" @@ -47,6 +48,7 @@ echo "VHD_BUILD_ID: ${VHD_BUILD_ID}" echo "IGNORE_SCENARIOS_WITH_MISSING_VHD: ${IGNORE_SCENARIOS_WITH_MISSING_VHD}" echo "LOGGING_DIR: ${LOGGING_DIR}" echo "E2E_SUBSCRIPTION_ID: ${E2E_SUBSCRIPTION_ID}" +echo "RCV1P_SUBSCRIPTION_ID: ${RCV1P_SUBSCRIPTION_ID}" echo "ENABLE_SECURE_TLS_BOOTSTRAPPING: ${ENABLE_SECURE_TLS_BOOTSTRAPPING}" echo "TAGS_TO_SKIP: ${TAGS_TO_SKIP}" echo "TAGS_TO_RUN: ${TAGS_TO_RUN}" diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index fe53fe52a2e..09398db8d30 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -42,6 +42,7 @@ jobs: displayName: Run AgentBaker E2E env: E2E_SUBSCRIPTION_ID: ${{parameters.subscriptionId}} + RCV1P_SUBSCRIPTION_ID: $(RCV1P_SUBSCRIPTION_ID) SYS_SSH_PUBLIC_KEY: $(SYS_SSH_PUBLIC_KEY) SYS_SSH_PRIVATE_KEY_B64: $(SYS_SSH_PRIVATE_KEY_B64) BUILD_SRC_DIR: $(System.DefaultWorkingDirectory) diff --git a/e2e/config/azure.go b/e2e/config/azure.go index d0de6f04619..847db25a269 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -117,6 +117,10 @@ func NewHttpClient() *http.Client { } func NewAzureClient() (*AzureClient, error) { + return NewAzureClientForSubscription(Config.SubscriptionID) +} + +func NewAzureClientForSubscription(subscriptionID string) (*AzureClient, error) { httpClient := NewHttpClient() logger := runtime.NewLogPolicy(&policy.LogOptions{ IncludeBody: true, @@ -155,193 +159,183 @@ func NewAzureClient() (*AzureClient, error) { return nil, fmt.Errorf("create core client: %w", err) } - cloud.PublicIPAddresses, err = 
armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) + cloud.PublicIPAddresses, err = armnetwork.NewPublicIPAddressesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create public ip addresses client: %w", err) } - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create bastion hosts client: %w", err) - } - - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) + cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create bastion hosts client: %w", err) } - cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(Config.SubscriptionID, credential, opts) + cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create registry client: %w", err) } - cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(Config.SubscriptionID, credential, opts) + cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create cache rules client: %w", err) } - cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private endpoint client: %w", err) } - cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(Config.SubscriptionID, credential, opts) + cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zones client: %w", err) } - 
cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(Config.SubscriptionID, credential, opts) + cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create virtual network links client: %w", err) } - cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(Config.SubscriptionID, credential, opts) + cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create record set client: %w", err) } - cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zone group client: %w", err) } - cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(Config.SubscriptionID, credential, opts) + cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create security group client: %w", err) } - cloud.Subnet, err = armnetwork.NewSubnetsClient(Config.SubscriptionID, credential, opts) + cloud.Subnet, err = armnetwork.NewSubnetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create subnet client: %w", err) } - cloud.RouteTables, err = armnetwork.NewRouteTablesClient(Config.SubscriptionID, credential, opts) + cloud.RouteTables, err = armnetwork.NewRouteTablesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create route tables client: %w", err) } - cloud.Routes, err = armnetwork.NewRoutesClient(Config.SubscriptionID, credential, opts) + cloud.Routes, err = armnetwork.NewRoutesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create routes 
client: %w", err) } - cloud.AKS, err = armcontainerservice.NewManagedClustersClient(Config.SubscriptionID, credential, opts) + cloud.AKS, err = armcontainerservice.NewManagedClustersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create aks client: %w", err) } - cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(Config.SubscriptionID, credential, opts) + cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create maintenance client: %w", err) } - cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(Config.SubscriptionID, credential, opts) + cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create network interfaces client: %w", err) } - cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(Config.SubscriptionID, credential, opts) + cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss client: %w", err) } - cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm client: %w", err) } - cloud.VMs, err = armcompute.NewVirtualMachinesClient(Config.SubscriptionID, credential, opts) + cloud.VMs, err = armcompute.NewVirtualMachinesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vms client: %w", err) } - cloud.Images, err = armcompute.NewImagesClient(Config.SubscriptionID, credential, opts) + cloud.Images, err = armcompute.NewImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create images client: %w", err) } - cloud.Snapshots, err = 
armcompute.NewSnapshotsClient(Config.SubscriptionID, credential, opts) + cloud.Snapshots, err = armcompute.NewSnapshotsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create snapshots client: %w", err) } - cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery images client: %w", err) } - cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery image versions client: %w", err) } - cloud.Resource, err = armresources.NewClient(Config.SubscriptionID, credential, opts) + cloud.Resource, err = armresources.NewClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource client: %w", err) } - cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource group client: %w", err) } - cloud.VNet, err = armnetwork.NewVirtualNetworksClient(Config.SubscriptionID, credential, opts) + cloud.VNet, err = armnetwork.NewVirtualNetworksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vnet client: %w", err) } - cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(Config.SubscriptionID, credential, opts) + cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create firewall client: %w", err) } - cloud.PublicIPAddresses, err = 
armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create public ip addresses client: %w", err) - } - cloud.Blob, err = azblob.NewClient(Config.BlobStorageAccountURL(), credential, nil) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.StorageContainers, err = armstorage.NewBlobContainersClient(Config.SubscriptionID, credential, opts) + cloud.StorageContainers, err = armstorage.NewBlobContainersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(Config.SubscriptionID, credential, opts) + cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create role assignment client: %w", err) } - cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(Config.SubscriptionID, credential, nil) + cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create user assigned identities client: %w", err) } - cloud.StorageAccounts, err = armstorage.NewAccountsClient(Config.SubscriptionID, credential, nil) + cloud.StorageAccounts, err = armstorage.NewAccountsClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create storage accounts client: %w", err) } - cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm run command client: %w", err) } - cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(Config.SubscriptionID, credential, opts) + 
cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vm extension images client: %w", err) } - cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource skus client: %w", err) } // Ensure the gallery exists - cloud.Galleries, err = armcompute.NewGalleriesClient(Config.SubscriptionID, credential, opts) + cloud.Galleries, err = armcompute.NewGalleriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create galleries client: %w", err) } @@ -419,6 +413,18 @@ func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context, identityLocat return *identity.Properties.ClientID, nil } +// CreateVMManagedIdentityInRG creates a VM managed identity in the specified resource group +// without creating blob storage infrastructure (which belongs to the default subscription). 
+func (a *AzureClient) CreateVMManagedIdentityInRG(ctx context.Context, resourceGroupName, location string) (string, error) { + identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, resourceGroupName, VMIdentityName, armmsi.Identity{ + Location: to.Ptr(location), + }, nil) + if err != nil { + return "", fmt.Errorf("create managed identity in RG %s: %w", resourceGroupName, err) + } + return *identity.Properties.ClientID, nil +} + func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { poller, err := a.StorageAccounts.BeginCreate(ctx, ResourceGroupName(Config.DefaultLocation), Config.BlobStorageAccount(), armstorage.AccountCreateParameters{ Kind: to.Ptr(armstorage.KindStorageV2), diff --git a/e2e/config/config.go b/e2e/config/config.go index 2c6e7eb012e..882d050da3e 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -29,6 +29,10 @@ var ( Azure = mustNewAzureClient() VMIdentityName = "abe2e-vm-identity" + // RCV1PAzure is lazily initialized when RCV1PSubscriptionID is set. + // It provides Azure clients bound to the PlatformSettingsOverride-registered subscription. 
+ RCV1PAzure *AzureClient + DefaultPollUntilDoneOptions = &runtime.PollUntilDoneOptions{ Frequency: time.Second, } @@ -40,6 +44,14 @@ func ResourceGroupName(location string) string { return "abe2e-" + location } +func RCV1PResourceGroupName(location string) string { + return "abe2e-rcv1p-" + location +} + +func (c *Configuration) RCV1PVMIdentityResourceID(location string) string { + return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.RCV1PSubscriptionID, RCV1PResourceGroupName(location), VMIdentityName) +} + func PrivateACRNameNotAnon(location string) string { return "abe2eprivatenonanon" + location // will have anonymous pull enabled } @@ -90,6 +102,7 @@ type Configuration struct { TestTimeoutCluster time.Duration `env:"TEST_TIMEOUT_CLUSTER" envDefault:"30m"` TestTimeoutVMSS time.Duration `env:"TEST_TIMEOUT_VMSS" envDefault:"17m"` WindowsAdminPassword string `env:"WINDOWS_ADMIN_PASSWORD"` + RCV1PSubscriptionID string `env:"RCV1P_SUBSCRIPTION_ID"` } func (c *Configuration) BlobStorageAccount() string { @@ -171,6 +184,16 @@ func mustLoadConfig() *Configuration { return cfg } +func init() { + if Config.RCV1PSubscriptionID != "" && !strings.HasPrefix(Config.RCV1PSubscriptionID, "$(") { + client, err := NewAzureClientForSubscription(Config.RCV1PSubscriptionID) + if err != nil { + panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) + } + RCV1PAzure = client + } +} + // Returns a newly generated RSA public/private key pair with the private key in PEM format. func mustGetNewRSAKeyPair() ([]byte, []byte, string) { // Generate new key pair diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go new file mode 100644 index 00000000000..817b63cba8e --- /dev/null +++ b/e2e/scenario_rcv1p_test.go @@ -0,0 +1,210 @@ +// scenario_rcv1p_test.go contains end-to-end tests for the RCV1P (Root Certificate V1P) cert mode +// on Linux distros. 
RCV1P is the next-generation mechanism for distributing Azure root CA certificates +// to AKS nodes. Instead of relying on hardcoded certificate bundles, RCV1P queries the Azure wireserver +// at provisioning time to download the latest root certificates and installs them into the OS trust store. +// +// These tests require: +// - A dedicated subscription (RCV1P_SUBSCRIPTION_ID) with the Microsoft.Compute/PlatformSettingsOverride +// feature flag registered, which enables the wireserver certificate endpoint. +// - The VM opt-in tag "platformsettings.host_environment.service.platform_optedin_for_rootcerts=true" +// on each VMSS, which tells wireserver to serve certificates to this specific VM. +// +// Both conditions must be met: the subscription feature enables the endpoint, and the VM tag grants +// per-VM access. Without the tag, wireserver returns IsOptedInForRootCerts=false. +// +// The positive tests (Test_RCV1P_) verify that certificates are downloaded, installed into +// the distro-specific trust store, and a refresh schedule is created. The negative test +// (Test_RCV1P_NotOptedIn) verifies that omitting the VM tag correctly prevents cert installation. +package e2e + +import ( + "context" + "strings" + "testing" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// rcv1pOptInTag is the ARM tag that must be set on the VM resource for wireserver to serve +// root certificates. Without this tag, wireserver returns IsOptedInForRootCerts=false even +// if the subscription has the PlatformSettingsOverride feature registered. +const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" + +// skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. 
+// This happens in regular CI runs where the RCV1P variable group is not linked, causing +// Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +func skipIfRCV1PNotConfigured(t *testing.T) { + t.Helper() + subID := config.Config.RCV1PSubscriptionID + if subID == "" || strings.HasPrefix(subID, "$(") { + t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") + } +} + +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS so that wireserver +// will serve root certificates to this VM during provisioning. +func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { + if vmss.Tags == nil { + vmss.Tags = map[string]*string{} + } + vmss.Tags[rcv1pOptInTag] = to.Ptr("true") +} + +// Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. +// Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates +// to rebuild the trust bundle. +func Test_RCV1P_Ubuntu2204(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 22.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Ubuntu2404 validates RCV1P cert download and trust store installation on Ubuntu 24.04. +// Covers the newer Ubuntu LTS release to ensure the cert endpoint and trust store integration +// work correctly across Ubuntu versions. 
+func Test_RCV1P_Ubuntu2404(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 24.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_AzureLinuxV3 validates RCV1P on Azure Linux V3, which uses a different trust store +// layout (/etc/pki/ca-trust/source/anchors/) and update command (update-ca-trust) than Ubuntu. +// This ensures the provisioning script correctly detects the distro and uses the right paths. +func Test_RCV1P_AzureLinuxV3(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Azure Linux V3 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Flatcar validates RCV1P on Flatcar Container Linux, which has a read-only root +// filesystem and requires certificates to be placed in /etc/ssl/certs/ as .pem files. +// This is the most constrained environment for cert installation. 
+func Test_RCV1P_Flatcar(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Flatcar with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDFlatcarGen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_ACL validates RCV1P on Azure Container Linux (ACL), which shares the same +// trust store layout as Azure Linux (/etc/pki/ca-trust/). ACL requires Trusted Launch, +// so the VMConfigMutator combines both the TrustedLaunch and opt-in tag settings. +func Test_RCV1P_ACL(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on ACL with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDACLGen2TL, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_NotOptedIn is a negative test that validates the VM opt-in tag is required +// for cert installation. The VM is created in the RCV1P subscription (which has +// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS. 
+// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning +// script correctly skips certificate download and trust store installation. +// This test is critical because it proves the two-layer access control works: +// subscription feature alone is not sufficient — the VM must also be explicitly tagged. +func Test_RCV1P_NotOptedIn(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PNotOptedIn(ctx, s) + }, + }, + }) +} diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go new file mode 100644 index 00000000000..55a35b584bb --- /dev/null +++ b/e2e/scenario_rcv1p_win_test.go @@ -0,0 +1,91 @@ +// scenario_rcv1p_win_test.go contains end-to-end tests for the RCV1P cert mode on Windows. +// Windows uses a different cert installation path than Linux: certificates are downloaded to +// C:\ca and imported into the Windows certificate store (Cert:\LocalMachine\Root) via +// Import-Certificate. A scheduled task (aks-ca-certs-refresh-task) is registered to +// periodically refresh the certificates. +// +// These tests run against the same RCV1P subscription and require the same VM opt-in tag +// as the Linux tests (see scenario_rcv1p_test.go for details on the two-layer access control). 
+package e2e + +import ( + "context" + "testing" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store +// installation on Windows Server 2022. +func Test_RCV1P_Windows2022(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows23H2 validates RCV1P on Windows Server 23H2, the annual channel release. +func Test_RCV1P_Windows23H2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows23H2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025. This SKU requires +// Trusted Launch, so the VMConfigMutator combines both TrustedLaunch and opt-in tag settings. 
+func Test_RCV1P_Windows2025(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2025, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + Windows2025BootstrapConfigMutator(t, nbc) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index ba3663d5f8e..19f9be7eae1 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -212,10 +212,24 @@ func runScenario(t testing.TB, s *Scenario) error { ctx := newTestCtx(t) maybeSkipScenario(ctx, t, s) - _, err := CachedEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) + if s.AzureClient != nil { + // RCV1P scenario: ensure RG and identity in the RCV1P subscription + _, err := CachedRCV1PEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedRCV1PCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + // Also ensure default subscription infra (RG + identity + blob storage) is provisioned, + // since Windows log extraction on failure uploads to the default subscription's blob storage. 
+ _, err = CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } else { + _, err := CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } s.T = t ctrruntimelog.SetLogger(zap.New()) @@ -275,6 +289,11 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { nbc, err := getBaseNBC(ctx, s.T, s.Runtime.Cluster, s.VHD) require.NoError(s.T, err) + // Override subscription ID for RCV1P scenarios + if s.SubscriptionID != "" { + nbc.SubscriptionID = s.SubscriptionID + } + if !config.Config.DisableScriptless { nbc.EnableScriptlessCSECmd = true } @@ -805,11 +824,11 @@ func CreateImage(ctx context.Context, s *Scenario) *config.Image { require.NoErrorf(s.T, err, "failed to run sysprep on Windows VM for image creation") } - vm, err := config.Azure.VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) + vm, err := s.GetAzure().VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) require.NoError(s.T, err, "Failed to get VMSS VM for image creation") s.T.Log("Deallocating VMSS VM...") - poll, err := config.Azure.VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) + poll, err := s.GetAzure().VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) require.NoError(s.T, err, "Failed to begin deallocate") _, err = poll.PollUntilDone(ctx, nil) require.NoError(s.T, err, "Failed to deallocate") @@ -856,7 +875,7 @@ func CreateSIGImageVersionFromDisk(ctx 
context.Context, s *Scenario, version str // Create the image version directly from the disk s.T.Logf("Creating gallery image version: %s in %s", version, *image.ID) - createVersionOp, err := config.Azure.GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ + createVersionOp, err := s.GetAzure().GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ Location: to.Ptr(s.Location), Properties: &armcompute.GalleryImageVersionProperties{ StorageProfile: &armcompute.GalleryImageVersionStorageProfile{ @@ -892,7 +911,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str customVHD := *s.Config.VHD customVHD.Name = *image.Name // Use the architecture-specific image name customVHD.Gallery = &config.Gallery{ - SubscriptionID: config.Config.SubscriptionID, + SubscriptionID: s.GetSubscriptionID(), ResourceGroupName: rg, Name: *gallery.Name, } diff --git a/e2e/types.go b/e2e/types.go index 82ae27e1c9d..bd3887d6f39 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -21,6 +21,33 @@ import ( "golang.org/x/crypto/ssh" ) +// ClusterInfra captures the Azure infrastructure scope for cluster operations. +// It allows cluster creation and management to target different subscriptions. +type ClusterInfra struct { + Azure *config.AzureClient + SubscriptionID string + ResourceGroupName func(location string) string +} + +// DefaultClusterInfra uses the default subscription and resource group naming. +var DefaultClusterInfra = &ClusterInfra{ + Azure: config.Azure, + SubscriptionID: config.Config.SubscriptionID, + ResourceGroupName: config.ResourceGroupName, +} + +// RCV1PClusterInfra returns the ClusterInfra for the RCV1P subscription, or nil if not configured. 
+func RCV1PClusterInfra() *ClusterInfra { + if config.RCV1PAzure == nil { + return nil + } + return &ClusterInfra{ + Azure: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + ResourceGroupName: config.RCV1PResourceGroupName, + } +} + type Tags struct { Name string ImageName string @@ -35,6 +62,7 @@ type Tags struct { Scriptless bool VHDCaching bool MockAzureChinaCloud bool + RCV1PCertMode bool VMSeriesCoverageTest bool } @@ -147,6 +175,14 @@ type Scenario struct { // a default size will be used. K8sSystemPoolSKU string + // AzureClient overrides the default config.Azure client for this scenario. + // When nil, config.Azure is used. + AzureClient *config.AzureClient + + // SubscriptionID overrides the default config.Config.SubscriptionID for this scenario. + // When empty, config.Config.SubscriptionID is used. + SubscriptionID string + // Runtime contains the runtime state of the scenario. It's populated in the beginning of the test run Runtime *ScenarioRuntime T testing.TB @@ -467,3 +503,35 @@ func (s *Scenario) GetContainerRegistryFQDN() string { // Default to public cloud container registry (also used by Fairfax/US Gov) return "mcr.microsoft.com" } + +// GetAzure returns the AzureClient for this scenario, falling back to the default config.Azure. +func (s *Scenario) GetAzure() *config.AzureClient { + if s.AzureClient != nil { + return s.AzureClient + } + return config.Azure +} + +// GetSubscriptionID returns the subscription ID for this scenario, falling back to config.Config.SubscriptionID. +func (s *Scenario) GetSubscriptionID() string { + if s.SubscriptionID != "" { + return s.SubscriptionID + } + return config.Config.SubscriptionID +} + +// GetResourceGroupName returns the resource group name for this scenario's location. 
+func (s *Scenario) GetResourceGroupName() string { + if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { + return config.RCV1PResourceGroupName(s.Location) + } + return config.ResourceGroupName(s.Location) +} + +// GetVMIdentityResourceID returns the VM identity resource ID for this scenario. +func (s *Scenario) GetVMIdentityResourceID() string { + if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { + return config.Config.RCV1PVMIdentityResourceID(s.Location) + } + return config.Config.VMIdentityResourceID(s.Location) +} diff --git a/e2e/validators.go b/e2e/validators.go index 83683053887..80e36bd19f7 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -426,6 +426,13 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "either could not find expected file, or something went wrong") } +func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { + s.T.Helper() + command := fmt.Sprintf("[ -d %s ] && [ -z \"$(ls -A %s)\" ]", dirName, dirName) + execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, + fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) +} + func ValidateInspektorGadget(ctx context.Context, s *Scenario) { s.T.Helper() @@ -3111,3 +3118,103 @@ func ValidateSecondaryNICDualStack(ctx context.Context, s *Scenario, ifaceName s require.Contains(s.T, result.stdout, "scope global", "expected interface %s to have a global IPv6 address (not just link-local), got:\n%s", ifaceName, result.stdout) } + +// ValidateRCV1PCertMode validates that the rcv1p certificate endpoint mode was used during +// Linux node provisioning, certificates were downloaded and installed, and a refresh task was scheduled. 
+func ValidateRCV1PCertMode(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate the provisioning log shows rcv1p mode was selected + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Using custom cloud certificate endpoint mode: rcv1p") + + // Validate the subscription is opted in for root certs + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "IsOptedInForRootCerts=true") + + // Validate certificates were downloaded + ValidateNonEmptyDirectory(ctx, s, "/root/AzureCACertificates") + + // Validate trust store was updated (distro-specific path) + trustStoreDir := rcv1pTrustStoreDir(s) + execScriptOnVMForScenarioValidateExitCode(ctx, s, + fmt.Sprintf("sudo ls -1 %s/*.crt 2>/dev/null || sudo ls -1 %s/*.pem 2>/dev/null", trustStoreDir, trustStoreDir), + 0, fmt.Sprintf("expected certificates in trust store directory %s", trustStoreDir)) + + // Validate refresh schedule was created (cron or systemd timer depending on distro) + if s.VHD.Flatcar || s.VHD.OS == config.OSACL { + // Flatcar and ACL use systemd timer + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "systemctl is-enabled azure-ca-refresh.timer", + 0, "expected azure-ca-refresh.timer to be enabled") + } else { + // Ubuntu, Mariner, AzureLinux use cron + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo crontab -l 2>/dev/null | grep -q ca-refresh", + 0, "expected ca-refresh cron entry") + } +} + +// rcv1pTrustStoreDir returns the OS trust store directory for the given scenario's distro. 
+func rcv1pTrustStoreDir(s *Scenario) string { + switch s.VHD.OS { + case config.OSMariner, config.OSAzureLinux, config.OSACL: + return "/etc/pki/ca-trust/source/anchors" + case config.OSFlatcar: + return "/etc/ssl/certs" + default: + // Ubuntu and anything else + return "/usr/local/share/ca-certificates" + } +} + +// ValidateRCV1PCertModeWindows validates that the rcv1p certificate endpoint mode was used during +// Windows node provisioning, certificates were downloaded and installed, and a refresh task was scheduled. +func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate CA certificates were installed to the Windows certificate store + command := []string{ + "$ErrorActionPreference = 'Stop'", + "$caFolder = 'C:\\ca'", + "if (-not (Test-Path $caFolder)) { throw 'CA certificates folder C:\\ca does not exist' }", + "$certs = Get-ChildItem -Path $caFolder -File", + "if ($certs.Count -eq 0) { throw 'No certificates found in C:\\ca folder' }", + "Write-Host \"Found $($certs.Count) certificate(s) in $caFolder\"", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected certificates in C:\\ca") + + // Validate the refresh scheduled task exists + command = []string{ + "$ErrorActionPreference = 'Stop'", + "$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue", + "if (-not $task) { throw 'aks-ca-certs-refresh-task scheduled task not found' }", + "Write-Host \"Scheduled task found: $($task.TaskName) (State: $($task.State))\"", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected aks-ca-certs-refresh-task scheduled task") +} + +// ValidateRCV1PNotOptedIn validates that when the VM does NOT have the opt-in tag, +// wireserver returns IsOptedInForRootCerts=false and no certificates are installed, +// even in the RCV1P subscription with PlatformSettingsOverride registered. 
+func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate the provisioning log shows rcv1p mode was selected + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Using custom cloud certificate endpoint mode: rcv1p") + + // Validate wireserver reported not opted in + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true") + + // Validate no certificates were downloaded + ValidateEmptyDirectory(ctx, s, "/root/AzureCACertificates") + + // Validate no refresh schedule was created + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo crontab -l 2>/dev/null | grep -q ca-refresh", + 1, "expected no ca-refresh cron entry when not opted in") +} diff --git a/e2e/vmss.go b/e2e/vmss.go index d8204481c3d..51a3350d97e 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -434,13 +434,13 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine if config.Config.IsLocalBuild() { s.T.Logf( "VMSS portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, ) s.T.Logf( "Managed cluster portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, *cluster.Model.Name, ) @@ -452,8 +452,8 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine model.Identity = &armcompute.VirtualMachineScaleSetIdentity{ Type: to.Ptr(armcompute.ResourceIdentityTypeSystemAssignedUserAssigned), UserAssignedIdentities: 
map[string]*armcompute.UserAssignedIdentitiesValue{ - *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, - config.Config.VMIdentityResourceID(s.Location): {}, + *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, + s.GetVMIdentityResourceID(): {}, }, } @@ -530,7 +530,7 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - operation, err := config.Azure.VMSS.BeginCreateOrUpdate( + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, @@ -547,7 +547,7 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) } @@ -604,7 +604,7 @@ func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute. 
var lastErr error for { // Get the updated VM with instance view to check power state - vm, err := config.Azure.VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ + vm, err := s.GetAzure().VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), }) @@ -647,7 +647,7 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine var lastErr error for { - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ Expand: to.Ptr("instanceView"), }) @@ -677,9 +677,14 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine } // getPrivateIPFromVMSSVM extracts the private IP address from a VMSS VM by querying its network interfaces. -func getPrivateIPFromVMSSVM(ctx context.Context, resourceGroup, vmssName, instanceID string) (string, error) { +func getPrivateIPFromVMSSVM(ctx context.Context, s *Scenario, resourceGroup, vmssName, instanceID string) (string, error) { + return getPrivateIPFromVMSSVMWithClient(ctx, s.GetAzure(), resourceGroup, vmssName, instanceID) +} + +// getPrivateIPFromVMSSVMWithClient extracts the private IP using the given Azure client. 
+func getPrivateIPFromVMSSVMWithClient(ctx context.Context, azure *config.AzureClient, resourceGroup, vmssName, instanceID string) (string, error) { // Query the network interface to get the IP configuration - pager := config.Azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( + pager := azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( resourceGroup, vmssName, instanceID, @@ -763,7 +768,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { return nil } - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -772,7 +777,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { for _, vmInstance := range page.Value { // Get boot diagnostics data - bootDiagResp, err := config.Azure.VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) + bootDiagResp, err := s.GetAzure().VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) if err != nil { return fmt.Errorf("failed to get boot diagnostics for VM %s: %v", *vmInstance.InstanceID, err) } @@ -912,7 +917,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) page, err := pager.NextPage(ctx) if err != nil { s.T.Logf("failed to list VMSS instances: %s", err) @@ -927,7 
+932,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { blobPrefix := s.Runtime.VMSSName blobUrl := config.Config.BlobStorageAccountURL() + "/" + config.Config.BlobContainer + "/" + blobPrefix - client := config.Azure.VMSSVMRunCommands + client := s.GetAzure().VMSSVMRunCommands // Invoke the RunCommand on the VMSS instance s.T.Logf("uploading windows logs to blob storage at %s, may take a few minutes", blobUrl) @@ -1032,7 +1037,7 @@ func deleteVMSS(ctx context.Context, s *Scenario) { } return } - _, err := config.Azure.VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := s.GetAzure().VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -1402,7 +1407,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual ID: to.Ptr( fmt.Sprintf( loadBalancerBackendAddressPoolIDTemplate, - config.Config.SubscriptionID, + s.GetSubscriptionID(), *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, ), ), diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 2fd36c81434..862c2f09b6c 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -43,7 +43,7 @@ function make_request_with_retry { local response while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") + response=$(curl -f --no-progress-meter --connect-timeout 10 --max-time 30 "$url") local request_status=$? 
if echo "$response" | grep -q "RequestRateLimitExceeded"; then @@ -213,6 +213,7 @@ esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" install_ca_refresh_schedule=0 +mkdir -p /root/AzureCACertificates rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then install_ca_refresh_schedule=1 diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index 58812659856..13e0f33e188 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -20,7 +20,7 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' End It 'maps ussec/usnat locations to legacy cert endpoint mode' - When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" + When run grep -Eq 'ussec\*\|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index d9852e4288d..56df5977e87 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -89,7 +89,8 @@ function Register-CACertificatesRefreshTask { if ([string]::IsNullOrEmpty($Location)) { $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates | Out-Null }" } else { - $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + $escapedLocation = $Location -replace "'", "''" + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 
'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$escapedLocation' | Out-Null }" } $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -NonInteractive -ExecutionPolicy Bypass -Command `"$refreshCommand`"" $principal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest From 97c4f5cfa019a4c88633b8e5b4884cfb92c55ff4 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 14 Apr 2026 14:55:57 -0700 Subject: [PATCH 017/103] Address PR review feedback: fix multi-subscription, validation, and error handling - e2e/cluster.go: Pass ClusterInfra to ensureMaintenanceConfiguration and createNewMaintenanceConfiguration so RCV1P clusters use the correct subscription and resource group instead of the global default. - e2e/validators.go: Fix ValidateEmptyDirectory shell predicate to succeed when the directory is missing (not just when empty), matching the error message. Also quote dirName in the shell command. - staging/cse/windows/kubernetesfunc.ps1: Add -FailOnError switch to Get-CACertificates so initial provisioning fails fast on cert retrieval errors while the scheduled refresh task remains non-fatal. - parts/windows/kuberneteswindowssetup.ps1: Call Get-CACertificates with -FailOnError during initial provisioning. - staging/cse/windows/kubernetesfunc.tests.ps1: Add tests for -FailOnError behavior (exception and empty data paths). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 2 +- parts/windows/kuberneteswindowssetup.ps1 | 2 +- staging/cse/windows/kubernetesfunc.ps1 | 10 +++++++++- staging/cse/windows/kubernetesfunc.tests.ps1 | 18 ++++++++++++++++++ 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 80e36bd19f7..b767fb85205 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -428,7 +428,7 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { s.T.Helper() - command := fmt.Sprintf("[ -d %s ] && [ -z \"$(ls -A %s)\" ]", dirName, dirName) + command := fmt.Sprintf("! [ -d '%s' ] || [ -z \"$(ls -A '%s')\" ]", dirName, dirName) execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) } diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index a4e568de423..6594066c558 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -445,7 +445,7 @@ function BasePrep { {{end}} - Get-CACertificates -Location $Location + Get-CACertificates -Location $Location -FailOnError Write-CACert -CACertificate $global:CACertificate ` -KubeDir $global:KubeDir diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 56df5977e87..159161153f0 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -320,7 +320,9 @@ function Should-InstallCACertificatesRefreshTask { function Get-CACertificates { Param( [Parameter(Mandatory = $false)][string] - $Location = "" + $Location = "", + [Parameter(Mandatory = $false)][switch] + $FailOnError ) $caFolder = "C:\ca" @@ -342,6 +344,9 @@ function Get-CACertificates { $rawData = Retry-Command 
-Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 $caCerts = ($rawData.Content) | ConvertFrom-Json if ($null -eq $caCerts -or $null -eq $caCerts.Certificates -or $caCerts.Certificates.Length -eq 0) { + if ($FailOnError) { + throw "CA certificates rawdata is empty for legacy endpoint" + } Write-Log "Warning: CA certificates rawdata is empty for legacy endpoint" return $false } @@ -406,6 +411,9 @@ function Get-CACertificates { return $downloadedAny } catch { + if ($FailOnError) { + throw "Failed to retrieve CA certificates. Error: $_" + } Write-Log "Warning: failed to retrieve CA certificates. Error: $_" return $false } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8ada13ee440..42accc39c51 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -188,6 +188,24 @@ Describe 'Get-CACertificates' { $result | Should -Be $false } + It 'throws when certificate retrieval fails with -FailOnError' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + { Get-CACertificates -Location 'southcentralus' -FailOnError } | Should -Throw '*Failed to retrieve CA certificates*' + } + + It 'throws when legacy endpoint returns empty data with -FailOnError' { + Mock Retry-Command -MockWith { + return [PSCustomObject]@{ + Content = '{"Certificates":[]}' + } + } + + { Get-CACertificates -Location 'ussecwest' -FailOnError } | Should -Throw '*CA certificates rawdata is empty*' + } + It 'falls back to legacy endpoint when called without -Location (backward compat)' { $script:retryUris = @() Mock Retry-Command -MockWith { From d8dd1fb1d18f32f96e03f05c64dbf09f366b627e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 14 Apr 2026 15:09:03 -0700 Subject: [PATCH 018/103] Add Windows not-opted-in negative test for RCV1P cert mode Add Test_RCV1P_Windows_NotOptedIn which creates a Windows VM in 
the RCV1P subscription without the opt-in tag and validates that: - C:\ca is empty or does not exist (no certificates downloaded) - aks-ca-certs-refresh-task scheduled task is not registered This mirrors the existing Linux Test_RCV1P_NotOptedIn test to ensure the two-layer access control (subscription feature + VM tag) works on Windows. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 25 +++++++++++++++++++++++++ e2e/validators.go | 27 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 55a35b584bb..73c3851671d 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -89,3 +89,28 @@ func Test_RCV1P_Windows2025(t *testing.T) { }, }) } + +// Test_RCV1P_Windows_NotOptedIn is a negative test that validates the VM opt-in tag is required +// for cert installation on Windows. The VM is created in the RCV1P subscription (which has +// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS. +// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning +// script correctly skips certificate download and refresh task registration. 
+func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022Containerd, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PNotOptedInWindows(ctx, s) + }, + }, + }) +} diff --git a/e2e/validators.go b/e2e/validators.go index b767fb85205..3fd69da5e55 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3218,3 +3218,30 @@ func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { "sudo crontab -l 2>/dev/null | grep -q ca-refresh", 1, "expected no ca-refresh cron entry when not opted in") } + +// ValidateRCV1PNotOptedInWindows validates that when the Windows VM does NOT have the opt-in tag, +// no certificates are installed to C:\ca and no refresh scheduled task is registered, +// even in the RCV1P subscription with PlatformSettingsOverride registered. 
+func ValidateRCV1PNotOptedInWindows(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate C:\ca is empty or does not exist + command := []string{ + "$ErrorActionPreference = 'Stop'", + "$caFolder = 'C:\\ca'", + "if ((Test-Path $caFolder) -and @(Get-ChildItem -Path $caFolder -File).Count -gt 0) { throw 'Expected C:\\ca to be empty or not exist, but found certificates' }", + "Write-Host 'C:\\ca is empty or does not exist as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected C:\\ca to be empty or not exist when not opted in") + + // Validate no refresh scheduled task was registered + command = []string{ + "$ErrorActionPreference = 'Stop'", + "$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue", + "if ($task) { throw 'Expected no aks-ca-certs-refresh-task but found one' }", + "Write-Host 'No aks-ca-certs-refresh-task found as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected no aks-ca-certs-refresh-task scheduled task when not opted in") +} From e346cf966d70c38ac9d21d7d78f8927d1f767253 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 15 Apr 2026 17:32:31 -0700 Subject: [PATCH 019/103] e2e: add VM instance-level tag update for RCV1P wireserver opt-in Wireserver checks tags on the individual VMSS VM instance, not the VMSS resource-level tags. Add VMInstanceTags field to Config and update the VM instance after it appears in the API but before CSE completes. This ensures wireserver sees the opt-in tag when init-aks-custom-cloud.sh queries IsOptedInForRootCerts during provisioning. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 30 ++++++++++++++++++++++-------- e2e/scenario_rcv1p_win_test.go | 3 +++ e2e/types.go | 6 ++++++ e2e/vmss.go | 29 +++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 817b63cba8e..0bb927798ae 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -44,8 +44,9 @@ func skipIfRCV1PNotConfigured(t *testing.T) { } } -// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS so that wireserver -// will serve root certificates to this VM during provisioning. +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. +// Note: For wireserver to recognize the tag, it must also be set on the individual VM instance. +// Use VMInstanceTags in the Config to set instance-level tags (applied after VM creation). func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { if vmss.Tags == nil { vmss.Tags = map[string]*string{} @@ -53,6 +54,14 @@ func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { vmss.Tags[rcv1pOptInTag] = to.Ptr("true") } +// rcv1pVMInstanceTags returns the tags that must be set on individual VM instances +// for wireserver to serve root certificates. +func rcv1pVMInstanceTags() map[string]*string { + return map[string]*string{ + rcv1pOptInTag: to.Ptr("true"), + } +} + // Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. // Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates // to rebuild the trust bundle. 
@@ -66,9 +75,10 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDUbuntu2204Gen2Containerd, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -91,9 +101,10 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDUbuntu2404Gen2Containerd, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -116,9 +127,10 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDAzureLinuxV3Gen2, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -144,6 +156,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -172,6 +185,7 @@ func Test_RCV1P_ACL(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc 
*datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 73c3851671d..0932ae5f97b 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -32,6 +32,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) @@ -54,6 +55,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) @@ -80,6 +82,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) }, diff --git a/e2e/types.go b/e2e/types.go index bd3887d6f39..5f20af3148a 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -273,6 +273,12 @@ type Config struct { // This prevents the Guest Agent from sweeping events before they can be read. // Only set this on CSE performance test scenarios. EagerCSETimingExtraction bool + + // VMInstanceTags are tags applied directly to VMSS VM instances after creation via BeginUpdate. + // This is needed for features like RCV1P where wireserver checks tags on the individual VM instance, + // not the VMSS resource-level tags. 
These tags are applied after the VM appears in the API but + // before CSE completes, giving wireserver time to see them before the provisioning scripts query it. + VMInstanceTags map[string]*string } func (s *Scenario) PrepareAKSNodeConfig() { diff --git a/e2e/vmss.go b/e2e/vmss.go index 51a3350d97e..3c9a05243de 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -547,6 +547,12 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } + if len(s.Config.VMInstanceTags) > 0 { + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + } + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -593,6 +599,29 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } +// updateVMInstanceTags updates tags on an individual VMSS VM instance. This is used for features +// like RCV1P where wireserver checks tags on the VM instance level, not the VMSS resource level. +// The update is done after the VM appears in the API but before CSE completes, ensuring the tags +// are visible to wireserver before provisioning scripts query it. 
+func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s tags", vmssName, instanceID)() + + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, + armcompute.VirtualMachineScaleSetVM{ + Tags: tags, + }, nil) + if err != nil { + return fmt.Errorf("failed to begin VM instance tag update: %w", err) + } + + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) + } + + return nil +} + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) From ef80be3c8c696efe9224434aefad301192991acc Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 15 Apr 2026 21:21:17 -0700 Subject: [PATCH 020/103] e2e: use JSON injection for VM profile tags at VMSS creation time The previous approach of updating VM instance tags after creation had a race condition: the BeginUpdate took ~108s, but CSE ran init-aks-custom-cloud.sh and queried wireserver before the tag update completed. Now we marshal the VMSS model to JSON, inject tags into virtualMachineProfile, and send a raw ARM PUT request via the SDK pipeline. This ensures the tags are present at VMSS creation time and propagate to VM instances before CSE boots. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 147 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 22 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 3c9a05243de..2309a403eb0 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -24,6 +24,8 @@ import ( "github.com/Azure/agentbaker/pkg/agent" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" @@ -530,11 +532,21 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + + vmssModel := createVMSSModel(ctx, s) + + // When VMInstanceTags are configured, we need to inject tags into + // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. + // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. 
+ if len(s.Config.VMInstanceTags) > 0 { + return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + vmssModel, nil, ) if err != nil { @@ -547,12 +559,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - if len(s.Config.VMInstanceTags) > 0 { - if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to update VM instance tags: %w", err) - } - } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -599,27 +605,124 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// updateVMInstanceTags updates tags on an individual VMSS VM instance. This is used for features -// like RCV1P where wireserver checks tags on the VM instance level, not the VMSS resource level. -// The update is done after the VM appears in the API but before CSE completes, ensuring the tags -// are visible to wireserver before provisioning scripts query it. -func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s tags", vmssName, instanceID)() - - poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, - armcompute.VirtualMachineScaleSetVM{ - Tags: tags, - }, nil) +// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into +// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. 
This is needed +// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be +// present at VMSS creation time so they propagate to VM instances before CSE runs. +func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { + defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() + + // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags + vmssJSON, err := json.Marshal(vmssModel) if err != nil { - return fmt.Errorf("failed to begin VM instance tag update: %w", err) + return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) } - _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + var vmssMap map[string]interface{} + if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { + return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) + } + + // Inject tags into properties.virtualMachineProfile + props, ok := vmssMap["properties"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties' field") + } + vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") + } + vmProfile["tags"] = s.Config.VMInstanceTags + s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) + + // Re-marshal the modified model + modifiedBody, err := json.Marshal(vmssMap) if err != nil { - return fmt.Errorf("failed to complete VM instance tag update: %w", err) + return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) } - return nil + // Build the ARM resource URL + subscriptionID := s.SubscriptionID + if subscriptionID == "" { + subscriptionID = config.Config.SubscriptionID + } + resourceURL := 
fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, s.Runtime.VMSSName) + + // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) + req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) + if err != nil { + return vm, fmt.Errorf("failed to create ARM request: %w", err) + } + req.Raw().Header.Set("Content-Type", "application/json") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { + return vm, fmt.Errorf("failed to set request body: %w", err) + } + + resp, err := s.GetAzure().Core.Pipeline().Do(req) + if err != nil { + return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) + } + if resp.StatusCode != 200 && resp.StatusCode != 201 { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Create a poller for the async operation + poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + } + + // Wait for VMSS VM to appear before extracting the private IP + vm.VM, err = waitForVMSSVM(ctx, s) + if err != nil { + return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + + s.T.Cleanup(func() { + defer cleanupBastionTunnel(vm.SSHClient) + cleanupVMSS(ctx, s, vm) + }) + + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" + if config.Config.KeepVMSS { + s.T.Logf("VM will be 
preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") + } else { + s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + } + result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" + s.T.Log(result) + + vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if !s.Config.SkipSSHConnectivityValidation { + var bastErr error + vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) + if bastErr != nil { + return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) + } + } + if err != nil { + return vm, err + } + + err = waitForVMRunningState(ctx, s, vm.VM) + if err != nil { + return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + } + + return &ScenarioVM{ + VMSS: &vmssResp.VirtualMachineScaleSet, + PrivateIP: vm.PrivateIP, + VM: vm.VM, + SSHClient: vm.SSHClient, + }, nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 084a5a52d5b3c8b08c3c29ed3d65671a63429beb Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 00:18:21 -0700 Subject: [PATCH 021/103] e2e: use lightweight PATCH for VM instance tags instead of JSON injection The ARM API does not support virtualMachineProfile.tags for Uniform mode VMSS (400 BadRequest). Instead, use a lightweight PATCH request to update tags on the VM instance after it appears. 
PATCH only modifies the tags property and should complete in seconds, unlike BeginUpdate which triggers a full model update (~108s). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 149 +++++++++++++++------------------------------------- 1 file changed, 43 insertions(+), 106 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 2309a403eb0..d0ede5095cd 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -533,20 +533,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - vmssModel := createVMSSModel(ctx, s) - - // When VMInstanceTags are configured, we need to inject tags into - // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. - // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. - if len(s.Config.VMInstanceTags) > 0 { - return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) - } - operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - vmssModel, + createVMSSModel(ctx, s), nil, ) if err != nil { @@ -559,6 +550,15 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } + // Apply VM instance tags via lightweight PATCH before CSE queries wireserver. + // This is needed for features like RCV1P where wireserver checks tags on the + // individual VM instance, not the VMSS resource-level tags. 
+ if len(s.Config.VMInstanceTags) > 0 { + if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + } + } + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -605,124 +605,61 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into -// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed -// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be -// present at VMSS creation time so they propagate to VM instances before CSE runs. -func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { - defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() - - // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags - vmssJSON, err := json.Marshal(vmssModel) - if err != nil { - return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) - } +// patchVMInstanceTags sends a lightweight PATCH request to update tags on a VMSS VM instance. +// This is much faster than BeginUpdate (which triggers a full model update) because it only +// modifies the tags property. The PATCH typically completes in seconds rather than minutes. 
+func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags", vmssName, instanceID)() - var vmssMap map[string]interface{} - if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { - return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) - } - - // Inject tags into properties.virtualMachineProfile - props, ok := vmssMap["properties"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties' field") - } - vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") - } - vmProfile["tags"] = s.Config.VMInstanceTags - s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) - - // Re-marshal the modified model - modifiedBody, err := json.Marshal(vmssMap) - if err != nil { - return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) - } - - // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, s.Runtime.VMSSName) - // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) - req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) - if err != nil { - return vm, fmt.Errorf("failed to create ARM request: %w", err) - } - req.Raw().Header.Set("Content-Type", "application/json") - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { - return vm, fmt.Errorf("failed to set request body: %w", err) - } + resourceURL := 
fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, vmssName, instanceID) - resp, err := s.GetAzure().Core.Pipeline().Do(req) - if err != nil { - return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) - } - if resp.StatusCode != 200 && resp.StatusCode != 201 { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) - } + body := struct { + Tags map[string]*string `json:"tags"` + }{Tags: tags} - // Create a poller for the async operation - poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + bodyJSON, err := json.Marshal(body) if err != nil { - return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + return fmt.Errorf("failed to marshal tag patch body: %w", err) } - // Wait for VMSS VM to appear before extracting the private IP - vm.VM, err = waitForVMSSVM(ctx, s) + req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) if err != nil { - return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + return fmt.Errorf("failed to create PATCH request: %w", err) + } + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { + return fmt.Errorf("failed to set request body: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + return fmt.Errorf("failed to send PATCH request: %w", err) } - s.T.Cleanup(func() { - defer cleanupBastionTunnel(vm.SSHClient) - cleanupVMSS(ctx, s, vm) - }) - - result := "SSH Instructions: (may take a few minutes 
for the VM to be ready for SSH)\n========================\n" - if config.Config.KeepVMSS { - s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") - } else { - s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + if resp.StatusCode != 200 && resp.StatusCode != 202 { + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("PATCH VM instance tags failed with status %d: %s", resp.StatusCode, string(respBody)) } - result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" - s.T.Log(result) - vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if !s.Config.SkipSSHConnectivityValidation { - var bastErr error - vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) - if bastErr != nil { - return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) + // If 202 Accepted, poll until complete + if resp.StatusCode == 202 { + poller, err := azruntime.NewPoller[struct{}](resp, s.GetAzure().Core.Pipeline(), nil) + if err != nil { + return fmt.Errorf("failed to create poller for tag PATCH: %w", err) + } + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete tag PATCH: %w", err) } - } - if err != nil { - return vm, err - } - - err = waitForVMRunningState(ctx, s, vm.VM) - if err != nil { - return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) } - return &ScenarioVM{ - 
VMSS: &vmssResp.VirtualMachineScaleSet, - PrivateIP: vm.PrivateIP, - VM: vm.VM, - SSHClient: vm.SSHClient, - }, nil + return nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From ef89698d1bb8a41dec055a52cea524faf0b41703 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 10:40:31 -0700 Subject: [PATCH 022/103] Revert "e2e: use lightweight PATCH for VM instance tags instead of JSON injection" This reverts commit 03efe783c5dad08baa425e4fa43eaed022eb3dd2. Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 149 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 106 insertions(+), 43 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index d0ede5095cd..2309a403eb0 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -533,11 +533,20 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + vmssModel := createVMSSModel(ctx, s) + + // When VMInstanceTags are configured, we need to inject tags into + // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. + // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. + if len(s.Config.VMInstanceTags) > 0 { + return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + vmssModel, nil, ) if err != nil { @@ -550,15 +559,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - // Apply VM instance tags via lightweight PATCH before CSE queries wireserver. - // This is needed for features like RCV1P where wireserver checks tags on the - // individual VM instance, not the VMSS resource-level tags. 
- if len(s.Config.VMInstanceTags) > 0 { - if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) - } - } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -605,61 +605,124 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// patchVMInstanceTags sends a lightweight PATCH request to update tags on a VMSS VM instance. -// This is much faster than BeginUpdate (which triggers a full model update) because it only -// modifies the tags property. The PATCH typically completes in seconds rather than minutes. -func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags", vmssName, instanceID)() +// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into +// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed +// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be +// present at VMSS creation time so they propagate to VM instances before CSE runs. 
+func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { + defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() + + // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags + vmssJSON, err := json.Marshal(vmssModel) + if err != nil { + return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) + } + var vmssMap map[string]interface{} + if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { + return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) + } + + // Inject tags into properties.virtualMachineProfile + props, ok := vmssMap["properties"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties' field") + } + vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") + } + vmProfile["tags"] = s.Config.VMInstanceTags + s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) + + // Re-marshal the modified model + modifiedBody, err := json.Marshal(vmssMap) + if err != nil { + return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) + } + + // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } + resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, s.Runtime.VMSSName) - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, vmssName, instanceID) - - body := struct { - Tags 
map[string]*string `json:"tags"` - }{Tags: tags} + // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) + req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) + if err != nil { + return vm, fmt.Errorf("failed to create ARM request: %w", err) + } + req.Raw().Header.Set("Content-Type", "application/json") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { + return vm, fmt.Errorf("failed to set request body: %w", err) + } - bodyJSON, err := json.Marshal(body) + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return fmt.Errorf("failed to marshal tag patch body: %w", err) + return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) + } + if resp.StatusCode != 200 && resp.StatusCode != 201 { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) } - req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) + // Create a poller for the async operation + poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) if err != nil { - return fmt.Errorf("failed to create PATCH request: %w", err) + return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) } - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { - return fmt.Errorf("failed to set request body: %w", err) + + // Wait for VMSS VM to appear before extracting the private IP + vm.VM, err = waitForVMSSVM(ctx, s) + if err != nil { + return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - resp, err := s.GetAzure().Core.Pipeline().Do(req) + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { - return fmt.Errorf("failed to send PATCH request: %w", err) + return vm, fmt.Errorf("failed 
to get VM private IP address: %w", err) } - if resp.StatusCode != 200 && resp.StatusCode != 202 { - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return fmt.Errorf("PATCH VM instance tags failed with status %d: %s", resp.StatusCode, string(respBody)) + s.T.Cleanup(func() { + defer cleanupBastionTunnel(vm.SSHClient) + cleanupVMSS(ctx, s, vm) + }) + + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" + if config.Config.KeepVMSS { + s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") + } else { + s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") } + result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" + s.T.Log(result) - // If 202 Accepted, poll until complete - if resp.StatusCode == 202 { - poller, err := azruntime.NewPoller[struct{}](resp, s.GetAzure().Core.Pipeline(), nil) - if err != nil { - return fmt.Errorf("failed to create poller for tag PATCH: %w", err) - } - _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if err != nil { - return fmt.Errorf("failed to complete tag PATCH: %w", err) + vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if !s.Config.SkipSSHConnectivityValidation { + var bastErr error + vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) + if bastErr != nil { + return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) } } + if err != nil { + 
return vm, err + } - return nil + err = waitForVMRunningState(ctx, s, vm.VM) + if err != nil { + return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + } + + return &ScenarioVM{ + VMSS: &vmssResp.VirtualMachineScaleSet, + PrivateIP: vm.PrivateIP, + VM: vm.VM, + SSHClient: vm.SSHClient, + }, nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 6da97f831d8892b68727d50813d02e703d7355dd Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 11:29:58 -0700 Subject: [PATCH 023/103] e2e: use Microsoft.Resources/tags API for VM instance tag patching For Uniform mode VMSS, VM instance tags cannot be set at creation time: - The Go SDK (armcompute v7.x) does not expose a Tags field on VirtualMachineScaleSetVMProfile. - The ARM API rejects virtualMachineProfile.tags for Uniform mode VMSS with: 'Could not find member tags on object of type VirtualMachineProfile'. - PATCH on the Compute VM instance endpoint returns 405 Method Not Allowed. - BeginUpdate (PUT) works but takes ~108s for a full VM model reconciliation, causing a race condition: CSE runs init-aks-custom-cloud.sh and queries wireserver before the tag update completes. Use the Microsoft.Resources/tags API instead, which provides a lightweight PATCH endpoint (/{resourceId}/providers/Microsoft.Resources/tags/default) that updates only tags without triggering a full VM update. The Merge operation adds tags without replacing existing ones. Also moves s.T.Cleanup() registration to immediately after waitForVMSSVM() so the VMSS is always cleaned up even if tag patching or subsequent steps fail, preventing orphaned VMSS resources. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 162 ++++++++++++++++------------------------------------ 1 file changed, 50 insertions(+), 112 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 2309a403eb0..8e547550ba3 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -533,20 +533,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - vmssModel := createVMSSModel(ctx, s) - - // When VMInstanceTags are configured, we need to inject tags into - // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. - // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. - if len(s.Config.VMInstanceTags) > 0 { - return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) - } - operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - vmssModel, + createVMSSModel(ctx, s), nil, ) if err != nil { @@ -559,16 +550,27 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) - if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) - } - + // Register cleanup early so the VMSS is always deleted even if subsequent steps + // (tag patching, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) + // Apply VM instance tags via the Microsoft.Resources/tags API before CSE queries + // wireserver. This is needed for features like RCV1P where wireserver checks tags + // on the individual VM instance, not the VMSS resource-level tags. 
+ if len(s.Config.VMInstanceTags) > 0 { + if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + } + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" if config.Config.KeepVMSS { s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") @@ -605,126 +607,62 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into -// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed -// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be -// present at VMSS creation time so they propagate to VM instances before CSE runs. -func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { - defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() +// patchVMInstanceTags uses the Microsoft.Resources/tags API to merge tags onto a VMSS VM +// instance. This is a lightweight PATCH that only modifies tags without triggering a full +// VM model update, completing in seconds rather than the ~108s that BeginUpdate takes. 
+func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags via Resources API", vmssName, instanceID)() - // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags - vmssJSON, err := json.Marshal(vmssModel) - if err != nil { - return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) - } - - var vmssMap map[string]interface{} - if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { - return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) - } - - // Inject tags into properties.virtualMachineProfile - props, ok := vmssMap["properties"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties' field") - } - vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") - } - vmProfile["tags"] = s.Config.VMInstanceTags - s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) - - // Re-marshal the modified model - modifiedBody, err := json.Marshal(vmssMap) - if err != nil { - return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) - } - - // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, s.Runtime.VMSSName) - // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) - req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) - if err != nil { - return vm, fmt.Errorf("failed to create ARM request: %w", err) - } - req.Raw().Header.Set("Content-Type", "application/json") - if err := 
req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { - return vm, fmt.Errorf("failed to set request body: %w", err) - } - - resp, err := s.GetAzure().Core.Pipeline().Do(req) - if err != nil { - return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) - } - if resp.StatusCode != 200 && resp.StatusCode != 201 { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) - } + // The Microsoft.Resources/tags API allows lightweight tag updates on any Azure resource. + // Using "Merge" operation to add/update tags without replacing existing ones. + resourceURL := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s/providers/Microsoft.Resources/tags/default?api-version=2021-04-01", + subscriptionID, resourceGroupName, vmssName, instanceID, + ) - // Create a poller for the async operation - poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) - if err != nil { - return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + body := struct { + Operation string `json:"operation"` + Properties struct { + Tags map[string]*string `json:"tags"` + } `json:"properties"` + }{ + Operation: "Merge", } + body.Properties.Tags = tags - // Wait for VMSS VM to appear before extracting the private IP - vm.VM, err = waitForVMSSVM(ctx, s) + bodyJSON, err := json.Marshal(body) if err != nil { - return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + return fmt.Errorf("failed to marshal tag patch body: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) if err != nil { - return vm, fmt.Errorf("failed to get VM 
private IP address: %w", err) + return fmt.Errorf("failed to create PATCH request: %w", err) } - - s.T.Cleanup(func() { - defer cleanupBastionTunnel(vm.SSHClient) - cleanupVMSS(ctx, s, vm) - }) - - result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" - if config.Config.KeepVMSS { - s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") - } else { - s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { + return fmt.Errorf("failed to set request body: %w", err) } - result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" - s.T.Log(result) - vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if !s.Config.SkipSSHConnectivityValidation { - var bastErr error - vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) - if bastErr != nil { - return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) - } - } + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return vm, err + return fmt.Errorf("failed to send tag PATCH request: %w", err) } - err = waitForVMRunningState(ctx, s, vm.VM) - if err != nil { - return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + if resp.StatusCode != 200 { + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("tag PATCH failed 
with status %d: %s", resp.StatusCode, string(respBody)) } - return &ScenarioVM{ - VMSS: &vmssResp.VirtualMachineScaleSet, - PrivateIP: vm.PrivateIP, - VM: vm.VM, - SSHClient: vm.SSHClient, - }, nil + return nil } + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) From 12bc156b409f64f4b2bcc0dac4fb49ca6f0c33ee Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 15:02:02 -0700 Subject: [PATCH 024/103] e2e: use BeginUpdate + deferred CSE for VM instance tagging Replace the Microsoft.Resources/tags API approach (which returns 405 on Uniform VMSS VM instances) with BeginUpdate (full PUT) + deferred CSE. For scenarios requiring VM instance tags (e.g., RCV1P): 1. Create VMSS without CSE extension profile 2. Wait for VMSS creation to complete 3. Apply tags via VMSSVM.BeginUpdate (~108s full PUT) 4. Re-add CSE extension via a second BeginCreateOrUpdate This ensures wireserver sees the per-VM-instance tags before CSE queries it. The delay is acceptable for E2E validation; production would use a different approach (e.g., AKS RP sets tags pre-boot). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 120 +++++++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 53 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 8e547550ba3..40bb9d44c72 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -24,8 +24,6 @@ import ( "github.com/Azure/agentbaker/pkg/agent" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore" - azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" - "github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" @@ -533,11 +531,25 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + model := createVMSSModel(ctx, s) + + // For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags + // before CSE runs because wireserver checks per-VM-instance tags. The only + // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. + // To avoid the race, we strip the CSE extension before creation, apply tags + // via BeginUpdate, then re-add the extension in a second update. 
+ var deferredExtensionProfile *armcompute.VirtualMachineScaleSetExtensionProfile + if len(s.Config.VMInstanceTags) > 0 && model.Properties.VirtualMachineProfile.ExtensionProfile != nil { + deferredExtensionProfile = model.Properties.VirtualMachineProfile.ExtensionProfile + model.Properties.VirtualMachineProfile.ExtensionProfile = nil + toolkit.Logf(ctx, "deferring CSE extension until VM instance tags are applied") + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + model, nil, ) if err != nil { @@ -551,18 +563,45 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc } // Register cleanup early so the VMSS is always deleted even if subsequent steps - // (tag patching, IP lookup, etc.) fail — preventing orphaned VMSS resources. + // (tag update, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) - // Apply VM instance tags via the Microsoft.Resources/tags API before CSE queries - // wireserver. This is needed for features like RCV1P where wireserver checks tags - // on the individual VM instance, not the VMSS resource-level tags. + // Wait for initial VMSS creation to fully complete before applying tags. + vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS: %w", err) + } + + // Apply VM instance tags via BeginUpdate (full PUT) and then re-add CSE. + // This is needed for features like RCV1P where wireserver checks tags on + // the individual VM instance, not the VMSS resource-level tags. 
if len(s.Config.VMInstanceTags) > 0 { - if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + + // Re-add CSE extension now that tags are in place. + if deferredExtensionProfile != nil { + toolkit.Logf(ctx, "re-adding CSE extension after tags are applied") + vmssResp.VirtualMachineScaleSet.Properties.VirtualMachineProfile.ExtensionProfile = deferredExtensionProfile + cseOp, err := s.GetAzure().VMSS.BeginCreateOrUpdate( + ctx, + resourceGroupName, + s.Runtime.VMSSName, + vmssResp.VirtualMachineScaleSet, + nil, + ) + if err != nil { + return vm, fmt.Errorf("failed to begin adding CSE extension: %w", err) + } + vmssResp2, err := cseOp.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to add CSE extension: %w", err) + } + vmssResp = vmssResp2 } } @@ -581,7 +620,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, SharedBastionName, config.ResourceGroupName(*s.Runtime.Cluster.Model.Location), config.VMSSHPrivateKeyFileName) + "\n" s.T.Log(result) - vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) if !s.Config.SkipSSHConnectivityValidation { var bastErr error vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) @@ -589,9 +627,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to start bastion tunnel: %w", 
bastErr) } } - if err != nil { - return vm, err - } // Wait for VM to be in "Running" power state before proceeding err = waitForVMRunningState(ctx, s, vm.VM) @@ -607,56 +642,35 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// patchVMInstanceTags uses the Microsoft.Resources/tags API to merge tags onto a VMSS VM -// instance. This is a lightweight PATCH that only modifies tags without triggering a full -// VM model update, completing in seconds rather than the ~108s that BeginUpdate takes. -func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags via Resources API", vmssName, instanceID)() - - subscriptionID := s.SubscriptionID - if subscriptionID == "" { - subscriptionID = config.Config.SubscriptionID - } - - // The Microsoft.Resources/tags API allows lightweight tag updates on any Azure resource. - // Using "Merge" operation to add/update tags without replacing existing ones. - resourceURL := fmt.Sprintf( - "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s/providers/Microsoft.Resources/tags/default?api-version=2021-04-01", - subscriptionID, resourceGroupName, vmssName, instanceID, - ) - - body := struct { - Operation string `json:"operation"` - Properties struct { - Tags map[string]*string `json:"tags"` - } `json:"properties"` - }{ - Operation: "Merge", - } - body.Properties.Tags = tags +// updateVMInstanceTags uses BeginUpdate (full PUT) to set tags on a VMSS VM instance. +// This is the only method that works for Uniform mode VMSS — PATCH and Microsoft.Resources/tags +// API both return 405 at this scope. The operation takes ~108s as it triggers full VM model +// reconciliation. This is acceptable for E2E tests where we defer CSE until tags are in place. 
+func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s/%s tags via BeginUpdate", resourceGroupName, vmssName, instanceID)() - bodyJSON, err := json.Marshal(body) + // Get current VM instance to preserve existing state + currentVM, err := s.GetAzure().VMSSVM.Get(ctx, resourceGroupName, vmssName, instanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) if err != nil { - return fmt.Errorf("failed to marshal tag patch body: %w", err) + return fmt.Errorf("failed to get current VM instance: %w", err) } - req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) - if err != nil { - return fmt.Errorf("failed to create PATCH request: %w", err) + // Merge new tags with any existing tags + if currentVM.Tags == nil { + currentVM.Tags = make(map[string]*string) } - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { - return fmt.Errorf("failed to set request body: %w", err) + for k, v := range tags { + currentVM.Tags[k] = v } - resp, err := s.GetAzure().Core.Pipeline().Do(req) + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, currentVM.VirtualMachineScaleSetVM, nil) if err != nil { - return fmt.Errorf("failed to send tag PATCH request: %w", err) + return fmt.Errorf("failed to begin VM instance tag update: %w", err) } - if resp.StatusCode != 200 { - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return fmt.Errorf("tag PATCH failed with status %d: %s", resp.StatusCode, string(respBody)) + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) } return nil From 180254e294677d19289b5ace1f93d27aa45af6b9 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 19:42:21 -0700 Subject: [PATCH 
025/103] e2e: add feature flag check for RCV1P subscription Verify Microsoft.Compute/PlatformSettingsOverride is registered on the RCV1P subscription before running tests. This fails fast with a clear error if the feature flag is missing, rather than letting tests run and fail with opaque wireserver responses. The check runs once per test run via sync.Once. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 56 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 0bb927798ae..eed7cf43ffd 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -19,11 +19,15 @@ package e2e import ( "context" + "fmt" + "io" "strings" + "sync" "testing" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) @@ -36,12 +40,64 @@ const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedi // skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. // This happens in regular CI runs where the RCV1P variable group is not linked, causing // Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +// It also verifies the Microsoft.Compute/PlatformSettingsOverride feature flag is registered. 
func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() subID := config.Config.RCV1PSubscriptionID if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } + checkPlatformSettingsOverrideFeatureFlag(t, subID) +} + +var ( + featureFlagCheckOnce sync.Once + featureFlagCheckResult error +) + +// checkPlatformSettingsOverrideFeatureFlag verifies the Microsoft.Compute/PlatformSettingsOverride +// feature flag is registered on the given subscription. This is a prerequisite for wireserver to +// serve root certificates. The check runs only once per test run. +func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string) { + t.Helper() + featureFlagCheckOnce.Do(func() { + featureFlagCheckResult = verifyFeatureFlag(t.Context(), subscriptionID) + }) + if featureFlagCheckResult != nil { + t.Fatalf("RCV1P feature flag check failed: %v", featureFlagCheckResult) + } +} + +func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { + url := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", + subscriptionID, + ) + + req, err := azruntime.NewRequest(ctx, "GET", url) + if err != nil { + return fmt.Errorf("failed to create feature flag request: %w", err) + } + + resp, err := config.RCV1PAzure.Core.Pipeline().Do(req) + if err != nil { + return fmt.Errorf("failed to query feature flag: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + bodyStr := string(body) + + if resp.StatusCode != 200 { + return fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) + } + + if !strings.Contains(bodyStr, `"Registered"`) { + return fmt.Errorf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s (response: %s); "+ + "wireserver will not serve root certificates without this 
feature flag", subscriptionID, bodyStr) + } + + return nil } // rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. From 2328667daa085ae8a136e97e4a289641c88a7a97 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 19:46:34 -0700 Subject: [PATCH 026/103] REVERT ME: poll wireserver IsOptedInForRootCerts with retry loop Experimental commit to validate whether wireserver detects VM instance tags applied via BeginUpdate after VM creation. Polls for up to ~5 minutes (30x10s). Wireserver reads IsOptedInForRootCerts from the Fabric Controller goal state (CCF/ContainerConfig), NOT directly from ARM tags. The flow is: BeginUpdate -> ARM model update -> FC generates new CCF with platformsettings.host_environment.service.platform_optedin_for_rootcerts -> FC pushes CCF to host agent -> wireserver reflects new state. FC goal state propagation can take several minutes, so the polling window is set to ~5 minutes to give adequate time for detection. Logs the full wireserver response on each attempt for diagnostics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 42 ++++++++++++++----- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 862c2f09b6c..c63e0bc5df9 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -65,20 +65,40 @@ function make_request_with_retry { function is_opted_in_for_root_certs { local opt_in_response + local request_status + local poll_attempt=1 + local max_poll_attempts=30 + local poll_interval=10 + + # Poll wireserver for up to ~5 minutes to allow platform metadata to sync. 
+ # The VM instance tag triggers a Fabric Controller goal state (CCF) update, + # which must propagate to the host agent before wireserver can reflect it. + # FC goal state propagation can take several minutes in practice. + while [ $poll_attempt -le $max_poll_attempts ]; do + echo "is_opted_in_for_root_certs: poll attempt ${poll_attempt}/${max_poll_attempts}" + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + request_status=$? + + echo "is_opted_in_for_root_certs: wireserver response (status=${request_status}): '${opt_in_response}'" + + if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state on attempt ${poll_attempt}" + elif echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true (found on attempt ${poll_attempt})" + return 0 + fi - opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") - local request_status=$? - if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state" - return 1 - fi + if [ $poll_attempt -lt $max_poll_attempts ]; then + echo "is_opted_in_for_root_certs: not opted in yet, waiting ${poll_interval}s before retry..." 
+ sleep $poll_interval + fi - if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then - echo "IsOptedInForRootCerts=true" - return 0 - fi + poll_attempt=$((poll_attempt + 1)) + done - echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true after ${max_poll_attempts} attempts" + echo "Last wireserver response: '${opt_in_response}'" return 1 } From e76bc3ff3f11ba9327b9dea20515027d19790467 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 17 Apr 2026 14:15:41 -0700 Subject: [PATCH 027/103] e2e: always log PlatformSettingsOverride feature flag status Log the feature flag status on the default E2E subscription for diagnostics in every RCV1P test, even when RCV1P_SUBSCRIPTION_ID is not set. This helps diagnose wireserver IsOptedInForRootCerts behavior across subscriptions. The feature flag check is now per-subscription (cached via sync.Map) and accepts a failIfMissing parameter: true for RCV1P tests (fail if not registered), false for diagnostics (log only). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 76 ++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index eed7cf43ffd..72a010cfe4b 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -40,35 +40,70 @@ const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedi // skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. // This happens in regular CI runs where the RCV1P variable group is not linked, causing // Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". -// It also verifies the Microsoft.Compute/PlatformSettingsOverride feature flag is registered. 
+// It always logs the feature flag status on the E2E subscription for diagnostics, +// and verifies the flag is registered on the RCV1P subscription when available. func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() + // Always log feature flag status on the default E2E subscription for diagnostics + logE2ESubscriptionFeatureFlag(t) + subID := config.Config.RCV1PSubscriptionID if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } - checkPlatformSettingsOverrideFeatureFlag(t, subID) + checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) } var ( - featureFlagCheckOnce sync.Once - featureFlagCheckResult error + featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult ) -// checkPlatformSettingsOverrideFeatureFlag verifies the Microsoft.Compute/PlatformSettingsOverride -// feature flag is registered on the given subscription. This is a prerequisite for wireserver to -// serve root certificates. The check runs only once per test run. -func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string) { +type featureFlagResult struct { + once sync.Once + registered bool + err error +} + +// checkPlatformSettingsOverrideFeatureFlag checks the Microsoft.Compute/PlatformSettingsOverride +// feature flag on the given subscription. When failIfMissing is true (RCV1P tests), the test +// fails if the flag is not registered. When false (diagnostics), it only logs the result. 
+func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string, client *config.AzureClient, failIfMissing bool) { t.Helper() - featureFlagCheckOnce.Do(func() { - featureFlagCheckResult = verifyFeatureFlag(t.Context(), subscriptionID) + val, _ := featureFlagChecks.LoadOrStore(subscriptionID, &featureFlagResult{}) + result := val.(*featureFlagResult) + result.once.Do(func() { + result.registered, result.err = queryFeatureFlag(t.Context(), subscriptionID, client) }) - if featureFlagCheckResult != nil { - t.Fatalf("RCV1P feature flag check failed: %v", featureFlagCheckResult) + + if result.err != nil { + t.Logf("PlatformSettingsOverride feature flag check on subscription %s: error: %v", subscriptionID, result.err) + if failIfMissing { + t.Fatalf("RCV1P feature flag check failed: %v", result.err) + } + return + } + + t.Logf("PlatformSettingsOverride feature flag on subscription %s: registered=%v", subscriptionID, result.registered) + if failIfMissing && !result.registered { + t.Fatalf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s; "+ + "wireserver will not serve root certificates without this feature flag", subscriptionID) } } -func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { +// logE2ESubscriptionFeatureFlag logs the PlatformSettingsOverride feature flag status on the +// default E2E subscription for diagnostic purposes. This helps understand wireserver behavior +// (e.g., IsOptedInForRootCerts responses) even in non-RCV1P test runs. 
+func logE2ESubscriptionFeatureFlag(t *testing.T) { + t.Helper() + e2eAzure, err := config.NewAzureClient() + if err != nil { + t.Logf("WARNING: failed to create E2E Azure client for feature flag check: %v", err) + return + } + checkPlatformSettingsOverrideFeatureFlag(t, config.Config.SubscriptionID, e2eAzure, false) +} + +func queryFeatureFlag(ctx context.Context, subscriptionID string, client *config.AzureClient) (bool, error) { url := fmt.Sprintf( "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", subscriptionID, @@ -76,12 +111,12 @@ func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { req, err := azruntime.NewRequest(ctx, "GET", url) if err != nil { - return fmt.Errorf("failed to create feature flag request: %w", err) + return false, fmt.Errorf("failed to create feature flag request: %w", err) } - resp, err := config.RCV1PAzure.Core.Pipeline().Do(req) + resp, err := client.Core.Pipeline().Do(req) if err != nil { - return fmt.Errorf("failed to query feature flag: %w", err) + return false, fmt.Errorf("failed to query feature flag: %w", err) } defer resp.Body.Close() @@ -89,15 +124,10 @@ func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { bodyStr := string(body) if resp.StatusCode != 200 { - return fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) - } - - if !strings.Contains(bodyStr, `"Registered"`) { - return fmt.Errorf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s (response: %s); "+ - "wireserver will not serve root certificates without this feature flag", subscriptionID, bodyStr) + return false, fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) } - return nil + return strings.Contains(bodyStr, `"Registered"`), nil } // rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. 
From 8780afec6f89d1e902d9ff3bb4af9b4a847a258a Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 17 Apr 2026 14:29:39 -0700 Subject: [PATCH 028/103] fix(windows): parse wireserver IsOptedInForRootCerts JSON with ConvertFrom-Json Same bug as Linux: wireserver returns JSON {"IsOptedInForRootCerts":true} but the script used -match "IsOptedInForRootCerts=true" (equals sign). Parse with ConvertFrom-Json and check the boolean property directly. Also add Write-Log for the wireserver response for diagnostics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 8 ++++++-- staging/cse/windows/kubernetesfunc.tests.ps1 | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 159161153f0..0efc6e557c2 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -310,7 +310,9 @@ function Should-InstallCACertificatesRefreshTask { try { $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 - return ($optInResponse.Content -match 'IsOptedInForRootCerts=true') + Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" + $optInJson = $optInResponse.Content | ConvertFrom-Json + return ($optInJson.IsOptedInForRootCerts -eq $true) } catch { Write-Log "Skipping CA refresh task registration because IsOptedInForRootCerts could not be determined: $_" return $false @@ -363,7 +365,9 @@ function Get-CACertificates { $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 - if (($optInResponse.Content -notmatch 'IsOptedInForRootCerts=true')) { + 
Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" + $optInJson = $optInResponse.Content | ConvertFrom-Json + if ($optInJson.IsOptedInForRootCerts -ne $true) { Write-Log "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" return $false } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 42accc39c51..924ccf13fc5 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -132,7 +132,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Mock Retry-Command -MockWith { param($Command, $Args, $Retries, $RetryDelaySeconds) $script:lastRetryUri = $PSBoundParameters['Args'].Uri - return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=true' } + return [PSCustomObject]@{ Content = '{"IsOptedInForRootCerts":true}' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' @@ -144,7 +144,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { It 'returns false for rcv1p regions when opt-in is disabled' { Mock Retry-Command -MockWith { - return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=false' } + return [PSCustomObject]@{ Content = '{"IsOptedInForRootCerts":false}' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' From 9d1a2f1ac2496734f889ca27936c044000693000 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 17 Apr 2026 21:47:42 -0700 Subject: [PATCH 029/103] e2e: make RCV1P_SUBSCRIPTION_ID optional with feature flag auto-detection When RCV1P_SUBSCRIPTION_ID is not set, tests now check the default E2E subscription for PlatformSettingsOverride feature flag registration and use it for positive RCV1P tests if available (platform auto-injects the opt-in tag on those subscriptions). Negative tests (NotOptedIn) require RCV1P_SUBSCRIPTION_ID explicitly, since the platform may auto-inject the opt-in tag on the default sub. 
Helpers rcv1pAzureClient(), rcv1pSubscriptionID(), and rcv1pCluster() centralize the subscription/client/cluster selection logic. All Linux and Windows positive tests use these helpers for consistent behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/e2e/validators.go b/e2e/validators.go index 3fd69da5e55..ffb70a6d332 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3173,6 +3173,10 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() + // Validate the provisioning log shows wireserver was queried and returned opted-in + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "IsOptedInForRootCerts wireserver response:") + // Validate CA certificates were installed to the Windows certificate store command := []string{ "$ErrorActionPreference = 'Stop'", @@ -3225,6 +3229,14 @@ func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { func ValidateRCV1PNotOptedInWindows(ctx context.Context, s *Scenario) { s.T.Helper() + // Validate the provisioning log shows wireserver was queried + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "IsOptedInForRootCerts wireserver response:") + + // Validate wireserver reported not opted in + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true") + // Validate C:\ca is empty or does not exist command := []string{ "$ErrorActionPreference = 'Stop'", From fe84342df0cb38b70c756985e906367d746a6c00 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sat, 18 Apr 2026 01:56:44 -0700 Subject: [PATCH 030/103] e2e: always collect Windows CSE logs (not just on failure) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Removes the s.T.Failed() guard on extractLogsFromVMWindows so full CSE logs (CustomDataSetupScript.log, kubelet, containerd, network config) are always uploaded to blob storage, even on success. This is a temporary debug commit — revert after investigation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 6 +--- .../artifacts/init-aks-custom-cloud.sh | 29 +++++++++++++------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 40bb9d44c72..ab9a9101bf8 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -994,11 +994,7 @@ hnsdiag list endpoints >> network_config.txt // extractLogsFromVMWindows runs a script on windows VM to collect logs and upload them to a blob storage // it then lists the blobs in the container and prints the content of each blob func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { - if !s.T.Failed() { - s.T.Logf("skipping logs extraction from windows VM, as the test didn't fail") - return - } - + // Always collect Windows logs for debugging (revert this to restore failure-only collection) ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index c63e0bc5df9..f189c433457 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -42,24 +42,35 @@ function make_request_with_retry { local attempt=1 local response + local http_code + local curl_output while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter --connect-timeout 10 --max-time 30 "$url") - local request_status=$? 
- - if echo "$response" | grep -q "RequestRateLimitExceeded"; then + # capture response body + HTTP status code; -w appends the code after the body. + # curl stderr (connection errors) flows to the script's log naturally. + # http_code is 000 when wireserver is unreachable (connection refused/timeout). + curl_output=$(curl --no-progress-meter --connect-timeout 10 --max-time 30 -w '\n%{http_code}' "$url") || true + http_code=$(echo "$curl_output" | tail -1) + response=$(echo "$curl_output" | sed '$d') + + if echo "$response" | grep -q "RequestRateLimitExceeded" && [ "$http_code" = "403" ]; then + echo "wireserver rate limited (HTTP ${http_code}) on attempt ${attempt}/${max_retries}: ${url}" >&2 sleep $retry_delay retry_delay=$((retry_delay * 2)) attempt=$((attempt + 1)) - elif [ $request_status -ne 0 ]; then - sleep $retry_delay - attempt=$((attempt + 1)) - else + elif [ "$http_code" -ge 200 ] 2>/dev/null && [ "$http_code" -lt 300 ] 2>/dev/null; then echo "$response" return 0 + else + echo "wireserver request failed (HTTP ${http_code}) on attempt ${attempt}/${max_retries}: ${url}" >&2 + if [ -n "$response" ]; then + echo "wireserver error response: ${response}" >&2 + fi + sleep $retry_delay + attempt=$((attempt + 1)) fi done - echo "exhausted all retries, last response: $response" + echo "exhausted all retries for ${url} (last HTTP ${http_code}), last response: $response" >&2 return 1 } From b6651eca53fe4b4abf9606361c7a395bfeada14d Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 19 Apr 2026 12:45:40 -0700 Subject: [PATCH 031/103] fix: add wireserver HTTP error diagnostic logging for cert endpoints Log HTTP status codes and error response bodies when wireserver requests fail during RCV1P cert installation. This enables diagnosing specific failure modes (403 rate limit, 404 not delivered, 503 busy, 504 gateway timeout, 000 unreachable) without guessing from generic error messages. 
Linux (init-aks-custom-cloud.sh): - Replace curl -f with -w to capture HTTP status codes - Log attempt number, HTTP code, URL, and error body on each retry - Rate limit detection now requires both HTTP 403 and body match - All diagnostic output goes to stderr (not stdout) to avoid contaminating function return values Windows (kubernetesfunc.ps1): - Extract HTTP status code from exception Response object - Read and log wireserver error response body when available - Applied to both Should-InstallCACertificatesRefreshTask and Get-CACertificates catch blocks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 32 +++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 0efc6e557c2..5ddb667dd99 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -314,7 +314,20 @@ function Should-InstallCACertificatesRefreshTask { $optInJson = $optInResponse.Content | ConvertFrom-Json return ($optInJson.IsOptedInForRootCerts -eq $true) } catch { - Write-Log "Skipping CA refresh task registration because IsOptedInForRootCerts could not be determined: $_" + $statusCode = "N/A" + $responseBody = "" + if ($_.Exception -and $_.Exception.Response) { + $statusCode = [int]$_.Exception.Response.StatusCode + try { + $stream = $_.Exception.Response.GetResponseStream() + $reader = New-Object System.IO.StreamReader($stream) + $responseBody = $reader.ReadToEnd() + } catch { } + } + Write-Log "Skipping CA refresh task registration because IsOptedInForRootCerts could not be determined (HTTP $statusCode): $_" + if ($responseBody) { + Write-Log "Wireserver error response body: $responseBody" + } return $false } } @@ -415,10 +428,23 @@ function Get-CACertificates { return $downloadedAny } catch { + $statusCode = "N/A" + $responseBody = "" + if 
($_.Exception -and $_.Exception.Response) { + $statusCode = [int]$_.Exception.Response.StatusCode + try { + $stream = $_.Exception.Response.GetResponseStream() + $reader = New-Object System.IO.StreamReader($stream) + $responseBody = $reader.ReadToEnd() + } catch { } + } + if ($responseBody) { + Write-Log "Wireserver error response body: $responseBody" + } if ($FailOnError) { - throw "Failed to retrieve CA certificates. Error: $_" + throw "Failed to retrieve CA certificates (HTTP $statusCode). Error: $_" } - Write-Log "Warning: failed to retrieve CA certificates. Error: $_" + Write-Log "Warning: failed to retrieve CA certificates (HTTP $statusCode). Error: $_" return $false } } From d9a4539925320b006b7dc5016418a6714200c2c3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 19 Apr 2026 19:07:45 -0700 Subject: [PATCH 032/103] e2e: use testDir() for Windows CSE output log path consistency Replace filepath.Join("scenario-logs", s.T.Name()) with testDir(s.T) in getCustomScriptExtensionStatus to match the pattern used everywhere else in the e2e suite. Ensures Windows CSE output logs are written to the same directory as other scenario logs. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 19f9be7eae1..894ed2d12a2 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -491,7 +491,7 @@ func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachi if s.IsWindows() { // Save the CSE output for Windows VMs for better troubleshooting if status.Message != nil { - logDir := filepath.Join("scenario-logs", s.T.Name()) + logDir := testDir(s.T) if err := os.MkdirAll(logDir, 0755); err == nil { logFile := filepath.Join(logDir, "windows-cse-output.log") err = os.WriteFile(logFile, []byte(*status.Message), 0644) From 421fbe965c27cbdb32b8563b33b5d0f0752d5f62 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 21 Apr 2026 10:43:43 -0700 Subject: [PATCH 033/103] fix(e2e): filter CSE extension to fix empty Windows CSE log files The getCustomScriptExtensionStatus function iterated over all VM extensions without filtering by name. On Windows VMs with multiple extensions (e.g., ManagedIdentity + CustomScriptExtension), it could process a non-CSE extension first, write its empty status.Message to windows-cse-output.log, and return before reaching the actual CSE. 
Fix: - Filter extensions by name (vmssCSE, customscript, aksnode) - Skip empty messages to avoid overwriting with zero-byte files - Log byte count for diagnostics Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 894ed2d12a2..6ac95e8c2e8 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -487,10 +487,26 @@ func validateVM(ctx context.Context, s *Scenario) { func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { for _, extension := range vmssVM.Properties.InstanceView.Extensions { + // Only process the CSE extension, skip other extensions (e.g., ManagedIdentity) + // whose empty status messages would overwrite the actual CSE output file. + // The extension name in InstanceView is typically "vmssCSE" (matching the resource name) + // but may also appear as the handler type. Match on known CSE identifiers. + if extension.Name == nil { + continue + } + name := strings.ToLower(*extension.Name) + isCSE := name == "vmsscse" || + strings.Contains(name, "customscript") || + strings.Contains(name, "aksnode") + if !isCSE { + continue + } for _, status := range extension.Statuses { if s.IsWindows() { - // Save the CSE output for Windows VMs for better troubleshooting - if status.Message != nil { + // Save the CSE output for Windows VMs for better troubleshooting. + // Only write when the message has actual content to avoid overwriting + // with an empty file from a status entry that has no output. 
+ if status.Message != nil && *status.Message != "" { logDir := testDir(s.T) if err := os.MkdirAll(logDir, 0755); err == nil { logFile := filepath.Join(logDir, "windows-cse-output.log") @@ -498,7 +514,7 @@ func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachi if err != nil { s.T.Logf("failed to save Windows CSE output to %s: %v", logFile, err) } else { - s.T.Logf("saved Windows CSE output to %s", logFile) + s.T.Logf("saved Windows CSE output to %s (%d bytes)", logFile, len(*status.Message)) } } } From 2f3373939b99bc26b0f7c6e2f60a960d0e594892 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 21 Apr 2026 15:01:38 -0700 Subject: [PATCH 034/103] fix(e2e): re-fetch VM instance view for fresh CSE extension status The VM object passed to getCustomScriptExtensionStatus may have been fetched by waitForVMRunningState before the CSE extension finished executing, resulting in empty extension status messages. This caused windows-cse-output.log to not be written even though the CSE succeeded. Fix by re-fetching the VM with instance view expand directly in getCustomScriptExtensionStatus to ensure we get the latest extension status data including the CSE output message. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 6ac95e8c2e8..35f5b51cb4f 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -486,6 +486,25 @@ func validateVM(ctx context.Context, s *Scenario) { } func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { + // Re-fetch the VM with instance view to ensure we have fresh extension status data. + // The VM object passed in may have been fetched before the CSE finished executing, + // so the extension status message could be empty or stale. 
+ if vmssVM.InstanceID != nil { + ctx := context.Background() + freshVM, err := s.GetAzure().VMSSVM.Get(ctx, + *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, + s.Runtime.VMSSName, + *vmssVM.InstanceID, + &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ + Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), + }) + if err == nil && freshVM.Properties != nil && freshVM.Properties.InstanceView != nil { + vmssVM.Properties.InstanceView = freshVM.Properties.InstanceView + } else if err != nil { + s.T.Logf("warning: failed to re-fetch VM instance view for CSE status: %v", err) + } + } + for _, extension := range vmssVM.Properties.InstanceView.Extensions { // Only process the CSE extension, skip other extensions (e.g., ManagedIdentity) // whose empty status messages would overwrite the actual CSE output file. From bcfed44b5e71b1c7aa77bf05243f06ce154c79ef Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 21 Apr 2026 15:13:11 -0700 Subject: [PATCH 035/103] e2e: trim whitespace from RCV1P_SUBSCRIPTION_ID to fix gating When RCV1P_SUBSCRIPTION_ID resolves to whitespace (e.g. ' ') from an unconfigured ADO pipeline variable, hasExplicitRCV1PSubscription() incorrectly returns true because ' ' != ''. This causes the feature flag API call with an empty/whitespace subscription ID, returning 404 and t.Fatalf instead of gracefully skipping via t.Skip. Fix by applying strings.TrimSpace() in hasExplicitRCV1PSubscription(), rcv1pSubscriptionID(), and the config init() guard. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/config/config.go | 5 +++-- e2e/scenario_rcv1p_test.go | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/e2e/config/config.go b/e2e/config/config.go index 882d050da3e..f88f4d51ad5 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -185,8 +185,9 @@ func mustLoadConfig() *Configuration { } func init() { - if Config.RCV1PSubscriptionID != "" && !strings.HasPrefix(Config.RCV1PSubscriptionID, "$(") { - client, err := NewAzureClientForSubscription(Config.RCV1PSubscriptionID) + rcv1pSubID := strings.TrimSpace(Config.RCV1PSubscriptionID) + if rcv1pSubID != "" && !strings.HasPrefix(rcv1pSubID, "$(") { + client, err := NewAzureClientForSubscription(rcv1pSubID) if err != nil { panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) } diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 72a010cfe4b..4504ad976b6 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -47,7 +47,7 @@ func skipIfRCV1PNotConfigured(t *testing.T) { // Always log feature flag status on the default E2E subscription for diagnostics logE2ESubscriptionFeatureFlag(t) - subID := config.Config.RCV1PSubscriptionID + subID := strings.TrimSpace(config.Config.RCV1PSubscriptionID) if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } From 82f29838dda6a4e08281df1923729afee66058df Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 11:29:03 -0700 Subject: [PATCH 036/103] e2e: add gen2 Windows RCV1P tests, fix Windows2025 TrustedLaunch Add gen2 variants for all 3 Windows RCV1P tests so they run in gen2 pipeline jobs (windows-2022-containerd-gen2, windows-23H2-gen2, windows-2025-gen2) which previously skipped all RCV1P tests. 
Fix Test_RCV1P_Windows2025 which incorrectly added TrustedLaunch to a non-gen2 VHD (VHDWindows2025 has UnsupportedGen2: true), causing BadRequest errors. Removed TrustedLaunch from non-gen2 test; the new gen2 variant (Test_RCV1P_Windows2025Gen2) uses VHDWindows2025Gen2 which supports TrustedLaunch natively. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 79 ++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 0932ae5f97b..c05b0607358 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -14,7 +14,6 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" - "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) // Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store @@ -64,8 +63,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { }) } -// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025. This SKU requires -// Trusted Launch, so the VMConfigMutator combines both TrustedLaunch and opt-in tag settings. +// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025 (non-gen2). 
func Test_RCV1P_Windows2025(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ @@ -82,6 +80,81 @@ func Test_RCV1P_Windows2025(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + Windows2025BootstrapConfigMutator(t, nbc) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows2022Gen2 validates RCV1P cert download and Windows certificate store +// installation on Windows Server 2022 Gen2. Covers the gen2 pipeline job. +func Test_RCV1P_Windows2022Gen2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2022 Gen2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022ContainerdGen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows23H2Gen2 validates RCV1P on Windows Server 23H2 Gen2. Covers the gen2 pipeline job. 
+func Test_RCV1P_Windows23H2Gen2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 23H2 Gen2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows23H2Gen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows2025Gen2 validates RCV1P on Windows Server 2025 Gen2. Covers the gen2 pipeline job. +func Test_RCV1P_Windows2025Gen2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2025 Gen2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2025Gen2, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) From 829218de7d89b9578901a67aae6fdcbc621d7911 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 13:12:09 -0700 Subject: [PATCH 037/103] e2e: switch RCV1P tests to Azure CNI Overlay to fix IP exhaustion Windows nodes use azure-vnet plugin even in kubenet clusters, reserving multiple subnet IPs per node. Many parallel RCV1P tests sharing the same subnet causes 'No available addresses' failures at pod scheduling. 
Switch from kubenet to Azure CNI Overlay which uses a separate virtual pod CIDR (10.244.0.0/16), eliminating subnet IP exhaustion. This is easily revertable: change ClusterRCV1POverlay -> ClusterRCV1PKubenet and ClusterAzureOverlayNetwork -> ClusterKubenet in rcv1pCluster(). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 12 ++++++++++++ e2e/scenario_rcv1p_test.go | 12 ++++++------ e2e/scenario_rcv1p_win_test.go | 14 +++++++------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/e2e/cache.go b/e2e/cache.go index 76514bbb5be..c8b176aa6f5 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -210,6 +210,18 @@ func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster return prepareCluster(ctx, model, false, false) } +var ClusterRCV1POverlay = cachedFunc(clusterRCV1POverlay) + +// clusterRCV1POverlay creates an Azure CNI Overlay cluster in the RCV1P subscription. +// Overlay avoids subnet IP exhaustion for Windows tests by using a virtual pod CIDR. 
+func clusterRCV1POverlay(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P overlay cluster") + } + return prepareCluster(ctx, infra, getAzureOverlayNetworkClusterModel("abe2e-rcv1p-overlay-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 4504ad976b6..fca1c23fc1a 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -161,7 +161,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -187,7 +187,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -213,7 +213,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -239,7 +239,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -265,7 +265,7 @@ func Test_RCV1P_ACL(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: 
ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDACLGen2TL, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -298,7 +298,7 @@ func Test_RCV1P_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDUbuntu2204Gen2Containerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index c05b0607358..08f7da43d9c 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -28,7 +28,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -51,7 +51,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -74,7 +74,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2025, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -103,7 +103,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -126,7 +126,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + 
Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -149,7 +149,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2025Gen2, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -181,7 +181,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2022Containerd, BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { From 0ce609f3ccecae9f50484530adfc62816eda8b46 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 16:00:59 -0700 Subject: [PATCH 038/103] e2e: revert RCV1P from overlay back to kubenet Overlay doesn't work for E2E Windows nodes because the E2E framework adds nodes via VMSS outside AKS's node pool flow, so overlay pod CIDR assignments from the control plane don't reach the manually-added nodes. The azure-vnet plugin reports 'no available address pools'. Kubenet IP exhaustion is intermittent and sometimes succeeds; overlay fails consistently in this E2E setup. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 12 ------------ e2e/scenario_rcv1p_test.go | 12 ++++++------ e2e/scenario_rcv1p_win_test.go | 14 +++++++------- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/e2e/cache.go b/e2e/cache.go index c8b176aa6f5..76514bbb5be 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -210,18 +210,6 @@ func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster return prepareCluster(ctx, model, false, false) } -var ClusterRCV1POverlay = cachedFunc(clusterRCV1POverlay) - -// clusterRCV1POverlay creates an Azure CNI Overlay cluster in the RCV1P subscription. -// Overlay avoids subnet IP exhaustion for Windows tests by using a virtual pod CIDR. -func clusterRCV1POverlay(ctx context.Context, request ClusterRequest) (*Cluster, error) { - infra := RCV1PClusterInfra() - if infra == nil { - return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P overlay cluster") - } - return prepareCluster(ctx, infra, getAzureOverlayNetworkClusterModel("abe2e-rcv1p-overlay-v1", request.Location, request.K8sSystemPoolSKU), false, false) -} - // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index fca1c23fc1a..4504ad976b6 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -161,7 +161,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -187,7 +187,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: 
config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -213,7 +213,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -239,7 +239,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -265,7 +265,7 @@ func Test_RCV1P_ACL(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDACLGen2TL, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -298,7 +298,7 @@ func Test_RCV1P_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 08f7da43d9c..c05b0607358 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -28,7 +28,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -51,7 +51,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, 
VMInstanceTags: rcv1pVMInstanceTags(), @@ -74,7 +74,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2025, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -103,7 +103,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -126,7 +126,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -149,7 +149,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2025Gen2, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -181,7 +181,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { From f1e42b670fca5ee6e27c33c822a42604fc6d4c5f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 18:02:53 -0700 Subject: [PATCH 039/103] REVERT ME: use dedicated kubenet cluster for RCV1P tests Give RCV1P tests their own kubenet cluster (abe2e-rcv1p-default-kubenet-v1) instead of sharing ClusterKubenet with all other Windows tests. 
This avoids subnet IP exhaustion caused by many parallel test nodes competing for the same /24 subnet. To revert: change ClusterRCV1PDefaultKubenet back to ClusterKubenet in rcv1pCluster() and remove ClusterRCV1PDefaultKubenet from cache.go. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/e2e/cache.go b/e2e/cache.go index 76514bbb5be..5d43b9375b1 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -210,6 +210,15 @@ func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster return prepareCluster(ctx, model, false, false) } +var ClusterRCV1PDefaultKubenet = cachedFunc(clusterRCV1PDefaultKubenet) + +// clusterRCV1PDefaultKubenet creates a dedicated kubenet cluster for RCV1P tests on the default +// E2E subscription. This avoids sharing the main kubenet cluster's subnet with non-RCV1P tests, +// preventing IP exhaustion when many Windows tests run in parallel. +func clusterRCV1PDefaultKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { + return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-rcv1p-default-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError From c20ef57ab915f6c1b0ec70e61e67e9c794cd2da3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 20:33:54 -0700 Subject: [PATCH 040/103] REVERT ME: use Azure CNI cluster for Windows RCV1P tests Windows RCV1P tests were failing with IP exhaustion because they used kubenet clusters while baseTemplateWindows() configures the NBC for Azure CNI overlay mode. The azure-vnet plugin on the node then tries overlay IPAM which fails for standalone VMSS nodes. 
Fix: use ClusterAzureNetwork (matching all other Windows tests) instead of kubenet for Windows RCV1P tests. Linux RCV1P tests stay on kubenet. Removes the unused ClusterRCV1PDefaultKubenet (dedicated kubenet cluster that didn't solve the issue since the root cause was NBC/cluster mismatch). To revert: change rcv1pWindowsCluster() back to rcv1pCluster() in scenario_rcv1p_win_test.go and remove rcv1pWindowsCluster() from scenario_rcv1p_test.go. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/e2e/cache.go b/e2e/cache.go index 5d43b9375b1..76514bbb5be 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -210,15 +210,6 @@ func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster return prepareCluster(ctx, model, false, false) } -var ClusterRCV1PDefaultKubenet = cachedFunc(clusterRCV1PDefaultKubenet) - -// clusterRCV1PDefaultKubenet creates a dedicated kubenet cluster for RCV1P tests on the default -// E2E subscription. This avoids sharing the main kubenet cluster's subnet with non-RCV1P tests, -// preventing IP exhaustion when many Windows tests run in parallel. 
-func clusterRCV1PDefaultKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-rcv1p-default-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) -} - // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError From 0b1ed4811d858ab336d76ab6b47b78939439c7cc Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 22:45:47 -0700 Subject: [PATCH 041/103] REVERT ME: add wireserver endpoint diagnostics to Windows RCV1P validator Probes all wireserver cert endpoints (isOptedInForRootCerts, operationrequestsroot, operationrequestsintermediate, legacy cacertificates) during validation and dumps CSE log lines related to certificate operations. Uses execScriptOnVMForScenario with explicit t.Logf to ensure output is always visible in test logs, not swallowed by execScriptOnVMForScenarioValidateExitCode. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/e2e/validators.go b/e2e/validators.go index ffb70a6d332..9c26e7b7c00 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3173,6 +3173,45 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() + // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM + // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. 
+ diagCommand := []string{ + "$ErrorActionPreference = 'Continue'", + "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", + "try {", + " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", + "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", + "try {", + " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", + " Write-Host \"operationrequestsroot content: $($root.Content)\"", + "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", + "try {", + " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", + " Write-Host \"operationrequestsintermediate content: $($intermediate.Content)\"", + "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", + "try {", + " $legacy = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", + " $legacyJson = $legacy.Content | ConvertFrom-Json", + " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", + "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", + "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", + "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } 
else { Write-Host 'C:\\ca does not exist' }", + "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", + "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", + " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", + "} else { Write-Host 'CSE log not found' }", + "Write-Host '=== END DIAGNOSTIC ==='", + } + diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) + s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) + if diagResult.stderr != "" { + s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) + } + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From 8d71344b375e4a5b11a1594ecf8ad26f605e659a Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 07:31:15 -0700 Subject: [PATCH 042/103] fix: use correct wireserver JSON field name for rcv1p cert download The wireserver operationrequestsroot and operationrequestsintermediate endpoints return certificates under the 'OperationsInfo' field, but the Windows PowerShell code was looking for 'OperationRequests' which doesn't exist in the response. This caused the null check to skip the entire cert download loop, leaving C:\ca empty despite wireserver returning valid certificate data. The Linux implementation avoids this by using grep to extract ResouceFileName values directly from the raw JSON, bypassing the parent field name entirely. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 5ddb667dd99..b5d83453ffa 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -393,12 +393,12 @@ function Get-CACertificates { $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 $operationJson = ($operationResponse.Content) | ConvertFrom-Json - if ($null -eq $operationJson -or $null -eq $operationJson.OperationRequests) { + if ($null -eq $operationJson -or $null -eq $operationJson.OperationsInfo) { Write-Log "Warning: no operation requests found for $requestType" continue } - foreach ($operation in $operationJson.OperationRequests) { + foreach ($operation in $operationJson.OperationsInfo) { $resourceFileName = $operation.ResouceFileName if ([string]::IsNullOrEmpty($resourceFileName)) { continue From 374b84ba4cc8f7a9b8a6b7d97f8ff41c9fcae5b6 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 07:40:00 -0700 Subject: [PATCH 043/103] REVERT ME: add azcopy error logging to Windows log collection Wraps each azcopy copy call with error checking and logging to diagnose why Windows CSE log uploads consistently return BlobNotFound. Also captures RunCommand stdout/stderr (InstanceView) which was previously not logged, so we can see azcopy output and any MSI auth failures. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 67 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index ab9a9101bf8..1dfd04a8849 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -956,6 +956,7 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario, vm *ScenarioVM) er return nil } +// REVERT ME: added error logging around azcopy to diagnose why blob uploads fail (BlobNotFound) const uploadLogsPowershellScript = ` param( [string]$arg1, @@ -963,18 +964,47 @@ param( [string]$arg3 ) -Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip -Expand-Archive azcopy.zip -cd .\azcopy\* -$env:AZCOPY_AUTO_LOGIN_TYPE="MSI" -$env:AZCOPY_MSI_RESOURCE_STRING=$arg3 -C:\k\debug\collect-windows-logs.ps1 -$CollectedLogs=(Get-ChildItem . -Filter "*_logs.zip" -File)[0].Name -.\azcopy.exe copy $CollectedLogs "$arg1/collected-node-logs.zip" -.\azcopy.exe copy "C:\azuredata\CustomDataSetupScript.log" "$arg1/cse.log" -.\azcopy.exe copy "C:\AzureData\provision.complete" "$arg1/provision.complete" -.\azcopy.exe copy "C:\k\kubelet.err.log" "$arg1/kubelet.err.log" -.\azcopy.exe copy "C:\k\containerd.err.log" "$arg1/containerd.err.log" +# REVERT ME: verbose error logging for azcopy upload diagnostics +function Run-AzCopy { + param([string]$Source, [string]$Dest) + if (-not (Test-Path $Source)) { + Write-Host "AZCOPY SKIP: source not found: $Source" + return + } + Write-Host "AZCOPY: copying $Source -> $Dest" + $output = & .\azcopy.exe copy $Source $Dest 2>&1 + $exitCode = $LASTEXITCODE + Write-Host "AZCOPY: exit=$exitCode output=$output" + if ($exitCode -ne 0) { + Write-Host "AZCOPY ERROR: failed to copy $Source (exit=$exitCode)" + } +} + +try { + Write-Host "Downloading azcopy..." 
+ Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip + Expand-Archive azcopy.zip + cd .\azcopy\* + $env:AZCOPY_AUTO_LOGIN_TYPE="MSI" + $env:AZCOPY_MSI_RESOURCE_STRING=$arg3 + Write-Host "MSI resource: $arg3" + Write-Host "Blob destination: $arg1" +} catch { + Write-Host "AZCOPY SETUP ERROR: $_" +} + +try { + C:\k\debug\collect-windows-logs.ps1 + $CollectedLogs=(Get-ChildItem . -Filter "*_logs.zip" -File)[0].Name + Run-AzCopy -Source $CollectedLogs -Dest "$arg1/collected-node-logs.zip" +} catch { + Write-Host "COLLECT-LOGS ERROR: $_" +} + +Run-AzCopy -Source "C:\azuredata\CustomDataSetupScript.log" -Dest "$arg1/cse.log" +Run-AzCopy -Source "C:\AzureData\provision.complete" -Dest "$arg1/provision.complete" +Run-AzCopy -Source "C:\k\kubelet.err.log" -Dest "$arg1/kubelet.err.log" +Run-AzCopy -Source "C:\k\containerd.err.log" -Dest "$arg1/containerd.err.log" # Collect network configuration information ipconfig /all > network_config.txt @@ -988,7 +1018,7 @@ Get-NetNeighbor >> network_config.txt Get-NetConnectionProfile >> network_config.txt hnsdiag list networks >> network_config.txt hnsdiag list endpoints >> network_config.txt -.\azcopy.exe copy "network_config.txt" "$arg1/network_config.txt" +Run-AzCopy -Source "network_config.txt" -Dest "$arg1/network_config.txt" ` // extractLogsFromVMWindows runs a script on windows VM to collect logs and upload them to a blob storage @@ -1074,6 +1104,17 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { respJSON, _ := json.MarshalIndent(runCommandResp, "", " ") s.T.Logf("run command executed successfully:\n%s", respJSON) + // REVERT ME: log RunCommand stdout/stderr to diagnose azcopy upload failures + if runCommandResp.Properties != nil && runCommandResp.Properties.InstanceView != nil { + iv := runCommandResp.Properties.InstanceView + if iv.Output != nil && *iv.Output != "" { + s.T.Logf("RunCommand stdout:\n%s", *iv.Output) + } + if iv.Error != nil && *iv.Error != "" { 
+ s.T.Logf("RunCommand stderr:\n%s", *iv.Error) + } + } + s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { From 98eca0a12f0a914eb5f1f7198a18c3af902c8dbe Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 10:59:05 -0700 Subject: [PATCH 044/103] REVERT ME: enable verbose test output for azcopy/wireserver diagnostics Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .pipelines/scripts/e2e_run.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 097fe250756..77ae62dbec7 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -97,10 +97,11 @@ tar -xzf "$temp_file" -C bin chmod +x bin/gotestsum rm -f "$temp_file" +# REVERT ME: added -v to see t.Logf output from passing tests (azcopy/wireserver diagnostics) # gotestsum configure to only show logs for failed tests, json file for detailed logs # Run the tests! Yey! test_exit_code=0 -./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? +./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -v -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? # Upload test results as Azure DevOps artifacts echo "##vso[artifact.upload containerfolder=test-results;artifactname=e2e-test-log]${BUILD_SRC_DIR}/e2e/test-log.json" From 71a344b128ea9f47d6c1ef0d60db9206f03ae826 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 20:44:17 -0700 Subject: [PATCH 045/103] REVERT ME: canary check to prove whether SSH validators are broken Adds a ValidateFileHasContent check for a nonsense string that will never exist in the CSE log. 
If this test PASSES, it proves the ExitMissingError handler in exec.go:130 is silently swallowing SSH exit codes and all Windows validators are no-ops. If this test FAILS (expected), validators are working correctly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/e2e/validators.go b/e2e/validators.go index 9c26e7b7c00..ebeb14e343d 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3212,6 +3212,13 @@ func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) } + // REVERT ME: Canary check — this string should NEVER exist in the CSE log. + // If this test PASSES, it proves the SSH ExitMissingError bug (exec.go:130) is + // silently swallowing failures and all validators are broken. + // If this test FAILS (as expected), validators are working correctly. + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "CANARY_STRING_THAT_SHOULD_NEVER_EXIST_IN_ANY_LOG_FILE_EVER_12345") + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From 016e8f9ee0ebc4614f3476732871144c8294b156 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 22:34:11 -0700 Subject: [PATCH 046/103] Remove canary check - validators confirmed working The canary test proved validators are functional and our branch CSE zip is correctly delivered to VMs. Wireserver returns IsOptedInForRootCerts=true and the CSE log contains the expected RCV1P log lines. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index ebeb14e343d..9c26e7b7c00 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3212,13 +3212,6 @@ func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) } - // REVERT ME: Canary check — this string should NEVER exist in the CSE log. - // If this test PASSES, it proves the SSH ExitMissingError bug (exec.go:130) is - // silently swallowing failures and all validators are broken. - // If this test FAILS (as expected), validators are working correctly. - ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", - "CANARY_STRING_THAT_SHOULD_NEVER_EXIST_IN_ANY_LOG_FILE_EVER_12345") - // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From 5721d10c99c195a3a68215818f2d6a57b5a637e9 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 24 Apr 2026 08:34:05 -0700 Subject: [PATCH 047/103] fix: make wireserver cert retrieval failures fatal on Linux Cert installation must succeed for the selected mode (legacy or rcv1p). Previously, failures after exhausting retries were silently swallowed with a warning, leaving the node without certificates. Now failures exit 1, matching the Windows -FailOnError behavior. Retries with backoff in make_request_with_retry still handle transient wireserver issues (rate limiting, temporary unavailability). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index f189c433457..caba6e99335 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -227,7 +227,7 @@ function install_certs_to_trust_store { # Certificate refresh behavior summary: # - legacy mode directly attempts certificate download from wireserver and only in ussec and usnat regions. # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. -# - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. +# - Wireserver failures are fatal — cert installation must succeed for the selected mode. 
refresh_location="${2:-${LOCATION}}" @@ -251,7 +251,8 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then if retrieve_legacy_certs; then install_certs_to_trust_store else - echo "Warning: failed to retrieve legacy certificates from wireserver; continuing without trust store updates" + echo "ERROR: failed to retrieve legacy certificates from wireserver after retries" + exit 1 fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if is_opted_in_for_root_certs; then @@ -259,7 +260,8 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if retrieve_rcv1p_certs; then install_certs_to_trust_store else - echo "Warning: failed to retrieve rcv1p certificates from wireserver; continuing without trust store updates" + echo "ERROR: failed to retrieve rcv1p certificates from wireserver after retries" + exit 1 fi fi fi From 13f98331d0ae04e3a4b2e1e53f97d398bf0179e2 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sat, 25 Apr 2026 15:22:07 -0700 Subject: [PATCH 048/103] revert: remove diagnostic commits used during RCV1P development Reverts the following temporary diagnostic commits that served their purpose during RCV1P cert mode debugging and are no longer needed: - 807b5a46a8 (wireserver endpoint diagnostics in validator) Why: Added to debug cert download failures. The root cause was a JSON field name mismatch (OperationRequests vs OperationsInfo), now fixed. Diagnostic probing adds noise to validator output. - 9f6a9023fb (azcopy error logging in Windows log collection) Why: Added to debug empty CSE log uploads (BlobNotFound). Root cause was ADO job timeout (90m) racing with go test timeout (90m), fixed on main by 54aa84a (reduced go test timeout to 80m). - d083fbef53 (verbose test output with -v flag) Why: Added so t.Logf output would appear in pipeline logs for diagnostics. No longer needed; increases log noise for all tests. - 45041cbe32 (always collect Windows CSE logs) Why: Removed s.T.Failed() guard to collect logs on success too. 
Root cause of missing logs was the ADO/go-test timeout race, not the collection logic. Restored failure-only collection. - fdc6962bd2 + 11967731f7 (canary check, already net-zero) Why: Canary proved validators work correctly. Already removed by the follow-up commit; these two commits cancel each other. - 0bc8f2e48d (poll wireserver IsOptedInForRootCerts retry loop) Why: Experimental polling for FC goal-state propagation. Tags are now set at VMSS creation time, making polling unnecessary. Already reverted by later commits during development. Kept (not reverted): - 76edb18ed9: Azure CNI cluster for Windows RCV1P tests (real fix for NBC/cluster type mismatch causing IP exhaustion) - a891055eb2: Branch-built CSE zip override (required until RCV1P code ships in a published CSE package) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .pipelines/scripts/e2e_run.sh | 3 +- e2e/validators.go | 41 +------------------- e2e/vmss.go | 72 +++++++++-------------------------- 3 files changed, 19 insertions(+), 97 deletions(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 77ae62dbec7..097fe250756 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -97,11 +97,10 @@ tar -xzf "$temp_file" -C bin chmod +x bin/gotestsum rm -f "$temp_file" -# REVERT ME: added -v to see t.Logf output from passing tests (azcopy/wireserver diagnostics) # gotestsum configure to only show logs for failed tests, json file for detailed logs # Run the tests! Yey! test_exit_code=0 -./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -v -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? +./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? 
# Upload test results as Azure DevOps artifacts echo "##vso[artifact.upload containerfolder=test-results;artifactname=e2e-test-log]${BUILD_SRC_DIR}/e2e/test-log.json" diff --git a/e2e/validators.go b/e2e/validators.go index 9c26e7b7c00..23e906f42c9 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3173,46 +3173,7 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() - // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM - // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. - diagCommand := []string{ - "$ErrorActionPreference = 'Continue'", - "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", - "try {", - " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", - "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", - "try {", - " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", - " Write-Host \"operationrequestsroot content: $($root.Content)\"", - "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", - "try {", - " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", - " Write-Host \"operationrequestsintermediate content: $($intermediate.Content)\"", - "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", - "try {", - " $legacy = Invoke-WebRequest -Uri 
'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", - " $legacyJson = $legacy.Content | ConvertFrom-Json", - " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", - "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", - "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", - "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } else { Write-Host 'C:\\ca does not exist' }", - "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", - "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", - " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", - "} else { Write-Host 'CSE log not found' }", - "Write-Host '=== END DIAGNOSTIC ==='", - } - diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) - s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) - if diagResult.stderr != "" { - s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) - } - - // Validate the provisioning log shows wireserver was queried and returned opted-in + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") diff --git a/e2e/vmss.go b/e2e/vmss.go index 1dfd04a8849..07d087597c7 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -956,7 +956,6 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario, vm *ScenarioVM) er 
return nil } -// REVERT ME: added error logging around azcopy to diagnose why blob uploads fail (BlobNotFound) const uploadLogsPowershellScript = ` param( [string]$arg1, @@ -964,47 +963,18 @@ param( [string]$arg3 ) -# REVERT ME: verbose error logging for azcopy upload diagnostics -function Run-AzCopy { - param([string]$Source, [string]$Dest) - if (-not (Test-Path $Source)) { - Write-Host "AZCOPY SKIP: source not found: $Source" - return - } - Write-Host "AZCOPY: copying $Source -> $Dest" - $output = & .\azcopy.exe copy $Source $Dest 2>&1 - $exitCode = $LASTEXITCODE - Write-Host "AZCOPY: exit=$exitCode output=$output" - if ($exitCode -ne 0) { - Write-Host "AZCOPY ERROR: failed to copy $Source (exit=$exitCode)" - } -} - -try { - Write-Host "Downloading azcopy..." - Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip - Expand-Archive azcopy.zip - cd .\azcopy\* - $env:AZCOPY_AUTO_LOGIN_TYPE="MSI" - $env:AZCOPY_MSI_RESOURCE_STRING=$arg3 - Write-Host "MSI resource: $arg3" - Write-Host "Blob destination: $arg1" -} catch { - Write-Host "AZCOPY SETUP ERROR: $_" -} - -try { - C:\k\debug\collect-windows-logs.ps1 - $CollectedLogs=(Get-ChildItem . -Filter "*_logs.zip" -File)[0].Name - Run-AzCopy -Source $CollectedLogs -Dest "$arg1/collected-node-logs.zip" -} catch { - Write-Host "COLLECT-LOGS ERROR: $_" -} - -Run-AzCopy -Source "C:\azuredata\CustomDataSetupScript.log" -Dest "$arg1/cse.log" -Run-AzCopy -Source "C:\AzureData\provision.complete" -Dest "$arg1/provision.complete" -Run-AzCopy -Source "C:\k\kubelet.err.log" -Dest "$arg1/kubelet.err.log" -Run-AzCopy -Source "C:\k\containerd.err.log" -Dest "$arg1/containerd.err.log" +Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip +Expand-Archive azcopy.zip +cd .\azcopy\* +$env:AZCOPY_AUTO_LOGIN_TYPE="MSI" +$env:AZCOPY_MSI_RESOURCE_STRING=$arg3 +C:\k\debug\collect-windows-logs.ps1 +$CollectedLogs=(Get-ChildItem . 
-Filter "*_logs.zip" -File)[0].Name +.\azcopy.exe copy $CollectedLogs "$arg1/collected-node-logs.zip" +.\azcopy.exe copy "C:\azuredata\CustomDataSetupScript.log" "$arg1/cse.log" +.\azcopy.exe copy "C:\AzureData\provision.complete" "$arg1/provision.complete" +.\azcopy.exe copy "C:\k\kubelet.err.log" "$arg1/kubelet.err.log" +.\azcopy.exe copy "C:\k\containerd.err.log" "$arg1/containerd.err.log" # Collect network configuration information ipconfig /all > network_config.txt @@ -1018,13 +988,16 @@ Get-NetNeighbor >> network_config.txt Get-NetConnectionProfile >> network_config.txt hnsdiag list networks >> network_config.txt hnsdiag list endpoints >> network_config.txt -Run-AzCopy -Source "network_config.txt" -Dest "$arg1/network_config.txt" +.\azcopy.exe copy "network_config.txt" "$arg1/network_config.txt" ` // extractLogsFromVMWindows runs a script on windows VM to collect logs and upload them to a blob storage // it then lists the blobs in the container and prints the content of each blob func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { - // Always collect Windows logs for debugging (revert this to restore failure-only collection) + if !s.T.Failed() { + return + } + ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) @@ -1104,17 +1077,6 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { respJSON, _ := json.MarshalIndent(runCommandResp, "", " ") s.T.Logf("run command executed successfully:\n%s", respJSON) - // REVERT ME: log RunCommand stdout/stderr to diagnose azcopy upload failures - if runCommandResp.Properties != nil && runCommandResp.Properties.InstanceView != nil { - iv := runCommandResp.Properties.InstanceView - if iv.Output != nil && *iv.Output != "" { - s.T.Logf("RunCommand stdout:\n%s", *iv.Output) - } - if iv.Error != nil && *iv.Error != "" { - s.T.Logf("RunCommand stderr:\n%s", 
*iv.Error) - } - } - s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { From 38e4e6d346eaa2034c8b833336998873a3c9bc34 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 26 Apr 2026 16:32:19 -0700 Subject: [PATCH 049/103] fix: make wireserver unreachable fatal for RCV1P opt-in check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wireserver unreachable after retries is now fatal (return 2 + exit 1) instead of silently skipping cert installation. If the subscription is opted in for hardened root certs but we silently fall back to the distro's default trust store, we leave a security hole — the node would trust CAs the customer explicitly intended to replace. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index caba6e99335..8bc6a52e112 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -74,6 +74,16 @@ function make_request_with_retry { return 1 } +# Returns: +# 0 - opted in (wireserver confirmed IsOptedInForRootCerts=true) +# 1 - not opted in (wireserver responded with false; valid, skip certs) +# 2 - wireserver unreachable after retries (caller must treat as fatal) +# +# Wireserver unreachable must be fatal (return 2) rather than silently skipping certs. +# If the subscription is opted in for hardened root certs but we silently fall back to +# the distro's default trust store, we leave a security hole — the node would trust CAs +# that the customer explicitly intended to replace. Failing hard surfaces the problem +# immediately instead of letting the node run with an insecure certificate configuration. 
function is_opted_in_for_root_certs { local opt_in_response local request_status @@ -255,7 +265,16 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then exit 1 fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then - if is_opted_in_for_root_certs; then + is_opted_in_for_root_certs + opt_in_result=$? + if [ $opt_in_result -eq 2 ]; then + # Fatal: wireserver was unreachable after retries. We cannot determine whether + # the node should use hardened certs or the default trust store. Silently + # falling back to the distro trust store would be a security hole if the + # customer intended hardened certs, so we fail hard here. + echo "ERROR: cannot provision node — wireserver unreachable for cert opt-in check" + exit 1 + elif [ $opt_in_result -eq 0 ]; then install_ca_refresh_schedule=1 if retrieve_rcv1p_certs; then install_certs_to_trust_store From eb412d2cd4c2a6d1f84f7aabea2a77baa0c4d5e1 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 27 Apr 2026 10:31:04 -0700 Subject: [PATCH 050/103] fix: use RCV1P Azure CNI cluster for Windows tests when explicit subscription set When RCV1P_SUBSCRIPTION_ID is set, Windows RCV1P positive tests set Scenario.AzureClient/SubscriptionID to the RCV1P subscription, but rcv1pWindowsCluster() always returned ClusterAzureNetwork (default subscription). This subscription mismatch would cause VMSS creation to 404 in the RCV1P subscription's node resource group. 
Fix: - Add ClusterRCV1PAzureNetwork in cache.go (Azure CNI cluster using RCV1PClusterInfra) - Branch rcv1pWindowsCluster() on hasExplicitRCV1PSubscription(), matching the pattern used by rcv1pCluster() for Linux - Fix Test_RCV1P_Windows_NotOptedIn to use ClusterRCV1PAzureNetwork instead of ClusterRCV1PKubenet (Windows needs Azure CNI) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 13 +++++++++++++ e2e/scenario_rcv1p_win_test.go | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/e2e/cache.go b/e2e/cache.go index 76514bbb5be..174b20335ba 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -210,6 +210,19 @@ func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster return prepareCluster(ctx, model, false, false) } +var ClusterRCV1PAzureNetwork = cachedFunc(clusterRCV1PAzureNetwork) + +// clusterRCV1PAzureNetwork creates an Azure CNI cluster in the RCV1P subscription for Windows cert mode testing. +// Windows tests require Azure CNI (not kubenet) because baseTemplateWindows() configures the NBC for +// Azure CNI overlay mode. 
+func clusterRCV1PAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P Azure CNI cluster") + } + return prepareCluster(ctx, infra, getAzureNetworkClusterModel("abe2e-rcv1p-azure-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index c05b0607358..d089f8572ff 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -181,7 +181,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022Containerd, BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { From 057a92b2d196e136d7507834ebadec3c90ed4f3f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 27 Apr 2026 10:41:26 -0700 Subject: [PATCH 051/103] fix: replace legacy ca-refresh cron entry with location-aware version On custom clouds (AGC, Delos) where an older version of this script already installed a ca-refresh cron entry without the location argument, the idempotency grep would match the old entry and skip adding the new one. The old cron entry runs ca-refresh with an empty location, causing get_cert_endpoint_mode to default to rcv1p instead of legacy for ussec/usnat environments. Fix: always remove any existing ca-refresh entry for this script and re-add it with the explicit location argument, ensuring upgraded nodes get the correct endpoint mode on periodic refresh. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 8bc6a52e112..3b06a012089 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -304,11 +304,17 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 fi if [ "$install_ca_refresh_schedule" -eq 1 ]; then - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi + # Remove any existing ca-refresh entry for this script (may lack the location argument + # from older VHDs on custom clouds like AGC/Delos) and re-add with the explicit location. + # Without the location argument, ca-refresh defaults endpoint mode to rcv1p which is + # wrong for ussec/usnat legacy environments. + local new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" + local existing + existing=$(crontab -l 2>/dev/null || true) + local filtered + filtered=$(printf '%s\n' "$existing" | grep -v "\"$scriptPath\" ca-refresh" || true) + if ! 
(printf '%s\n' "$filtered"; printf '%s\n' "$new_entry") | sed '/^$/d' | crontab -; then + echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then From fd6df999947f8b324888f1e35ad6a1bf116a02ce Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 27 Apr 2026 10:49:05 -0700 Subject: [PATCH 052/103] fix: align Windows wireserver retries to 10 to match Linux parity All wireserver Retry-Command calls in kubernetesfunc.ps1 increased from 5 to 10 retries, matching Linux make_request_with_retry which uses 10 retries with exponential backoff. Under rate-limiting or transient wireserver unavailability, 5 retries (50s) could exhaust before the endpoint recovers. Added comments explaining: - Retry count parity with Linux - Security rationale: wireserver unreachable with -FailOnError is fatal because silently falling back to the OS default trust store would be a security hole if the customer intended hardened certs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 4 +--- staging/cse/windows/kubernetesfunc.ps1 | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 3b06a012089..fd6a9942b19 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -308,10 +308,8 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 # from older VHDs on custom clouds like AGC/Delos) and re-add with the explicit location. # Without the location argument, ca-refresh defaults endpoint mode to rcv1p which is # wrong for ussec/usnat legacy environments. 
- local new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" - local existing + new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" existing=$(crontab -l 2>/dev/null || true) - local filtered filtered=$(printf '%s\n' "$existing" | grep -v "\"$scriptPath\" ca-refresh" || true) if ! (printf '%s\n' "$filtered"; printf '%s\n' "$new_entry") | sed '/^$/d' | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index b5d83453ffa..7b8110a4ac7 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -309,7 +309,9 @@ function Should-InstallCACertificatesRefreshTask { try { $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' - $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + # Use 10 retries to match Linux make_request_with_retry resilience against + # transient wireserver unavailability and rate limiting. + $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" $optInJson = $optInResponse.Content | ConvertFrom-Json return ($optInJson.IsOptedInForRootCerts -eq $true) @@ -353,10 +355,15 @@ function Get-CACertificates { Write-Log "Get CA certificates. Location: $Location. EndpointMode: $certEndpointMode" } + # Get-CACertificates downloads Azure root CA certificates from wireserver and writes them + # to the local certificate folder. When called with -FailOnError, wireserver unreachable + # after retries is fatal — silently falling back to the OS default trust store would be a + # security hole if the customer intended hardened root certs. 
This matches the Linux + # behavior in init-aks-custom-cloud.sh (is_opted_in_for_root_certs return code 2 = fatal). try { if ($certEndpointMode -eq "legacy") { $uri = 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' - $rawData = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $rawData = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 $caCerts = ($rawData.Content) | ConvertFrom-Json if ($null -eq $caCerts -or $null -eq $caCerts.Certificates -or $caCerts.Certificates.Length -eq 0) { if ($FailOnError) { @@ -377,7 +384,8 @@ function Get-CACertificates { } $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' - $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + # Wireserver opt-in check: 10 retries to match Linux make_request_with_retry. 
+ $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" $optInJson = $optInResponse.Content | ConvertFrom-Json if ($optInJson.IsOptedInForRootCerts -ne $true) { @@ -390,7 +398,7 @@ function Get-CACertificates { foreach ($requestType in $operationRequestTypes) { $operationRequestUri = "http://168.63.129.16/machine?comp=acmspackage&type=$requestType&ext=json" - $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 $operationJson = ($operationResponse.Content) | ConvertFrom-Json if ($null -eq $operationJson -or $null -eq $operationJson.OperationsInfo) { @@ -408,7 +416,7 @@ function Get-CACertificates { $resourceExt = [IO.Path]::GetExtension($resourceFileName).TrimStart('.') $resourceUri = "http://168.63.129.16/machine?comp=acmspackage&type=$resourceType&ext=$resourceExt" - $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 if ([string]::IsNullOrEmpty($certContentResponse.Content)) { Write-Log "Warning: empty certificate content for $resourceFileName" continue From 9669db8ab9a346b80346f27b3097d186680abead Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 29 Apr 2026 16:13:17 -0700 Subject: [PATCH 053/103] fix: enhance RCV1P opt-in tag handling in VMSS creation process Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 1 + e2e/vmss.go | 78 ++++++++++++++++++++++++++++++++++ 2 
files changed, 79 insertions(+) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index d089f8572ff..ca8ca02ba17 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -14,6 +14,7 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) // Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store diff --git a/e2e/vmss.go b/e2e/vmss.go index 07d087597c7..2d28d4aacf2 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -533,6 +533,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc model := createVMSSModel(ctx, s) + // Record whether the outgoing VMSS model includes the RCV1P opt-in tag. + // This is used after creation to detect platform auto-injection vs tags we set. + rcv1pTagKey := "platformsettings.host_environment.service.platform_optedin_for_rootcerts" + _, requestedRCV1PTag := model.Tags[rcv1pTagKey] + // For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags // before CSE runs because wireserver checks per-VM-instance tags. The only // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. @@ -620,6 +625,50 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, SharedBastionName, config.ResourceGroupName(*s.Runtime.Cluster.Model.Location), config.VMSSHPrivateKeyFileName) + "\n" s.T.Log(result) + // Log VMSS tags for diagnostics (visible in test-log.json via gotestsum --jsonfile). + // For RCV1P tests, compares request tags vs response tags to detect platform auto-injection. 
+ vmssID := "" + if vmssResp.ID != nil { + vmssID = *vmssResp.ID + } + if vmssResp.Tags != nil { + s.T.Logf("VMSS %s (id: %s) tags (%d):", s.Runtime.VMSSName, vmssID, len(vmssResp.Tags)) + for k, v := range vmssResp.Tags { + val := "" + if v != nil { + val = *v + } + if k == rcv1pTagKey { + if requestedRCV1PTag { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — set by us in VMSS request]", k, val) + } else { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — AUTO-INJECTED by platform, NOT in our VMSS request]", k, val) + } + } else { + s.T.Logf(" tag: %s = %s", k, val) + } + } + // Detect platform auto-injection: tag appeared in response but was NOT in our request. + if respVal, hasTag := vmssResp.Tags[rcv1pTagKey]; hasTag && !requestedRCV1PTag { + val := "" + if respVal != nil { + val = *respVal + } + s.T.Logf("WARNING: platform auto-injected RCV1P opt-in tag %q=%s on VMSS — "+ + "PlatformSettingsOverride feature flag may be causing auto-injection on subscription %s", + rcv1pTagKey, val, s.GetSubscriptionID()) + if s.Tags.RCV1PCertMode && strings.EqualFold(val, "true") { + s.T.Logf("WARNING: auto-injected tag value is 'true' — negative (NotOptedIn) tests will be "+ + "INVALID on this subscription because wireserver will serve certificates regardless of our intent") + } + } + if _, hasTag := vmssResp.Tags[rcv1pTagKey]; !hasTag && s.Tags.RCV1PCertMode { + s.T.Logf(" RCV1P opt-in tag %q NOT present on VMSS (not in request, not auto-injected) — "+ + "wireserver should report IsOptedInForRootCerts=false", rcv1pTagKey) + } + } else { + s.T.Logf("VMSS %s (id: %s) has no tags", s.Runtime.VMSSName, vmssID) + } if !s.Config.SkipSSHConnectivityValidation { var bastErr error vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) @@ -634,6 +683,35 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) } + // Log VM 
instance tags for diagnostics (visible in test-log.json via gotestsum --jsonfile) + vmInstanceID := "" + if vm.VM.ID != nil { + vmInstanceID = *vm.VM.ID + } + if vm.VM.Tags != nil { + s.T.Logf("VM instance %s (id: %s) tags (%d):", *vm.VM.InstanceID, vmInstanceID, len(vm.VM.Tags)) + for k, v := range vm.VM.Tags { + val := "" + if v != nil { + val = *v + } + if k == rcv1pTagKey { + if requestedRCV1PTag { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — inherited from VMSS, set by us]", k, val) + } else { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — inherited from VMSS, AUTO-INJECTED by platform]", k, val) + } + } else { + s.T.Logf(" tag: %s = %s", k, val) + } + } + if _, hasTag := vm.VM.Tags[rcv1pTagKey]; !hasTag && s.Tags.RCV1PCertMode { + s.T.Logf(" [RCV1P opt-in tag %q NOT present on VM instance — this is expected for negative tests]", rcv1pTagKey) + } + } else { + s.T.Logf("VM instance %s (id: %s) has no tags", *vm.VM.InstanceID, vmInstanceID) + } + return &ScenarioVM{ VMSS: &vmssResp.VirtualMachineScaleSet, PrivateIP: vm.PrivateIP, From bf76e250305fc65e1dd1e7c5d53c7e189bf66fd6 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 14:41:51 -0700 Subject: [PATCH 054/103] fix: use Azure CNI cluster for Windows RCV1P tests Windows baseTemplateWindows() configures NBC with NetworkPlugin=azure and NetworkPluginMode=overlay. Using a kubenet cluster causes azure-vnet plugin IPAM failures on the node. Switch all Windows RCV1P tests to use ClusterRCV1PAzureNetwork which creates an Azure CNI overlay cluster in the RCV1P subscription. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index ca8ca02ba17..b2188bd2dd5 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -29,7 +29,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -52,7 +52,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -75,7 +75,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2025, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -104,7 +104,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -127,7 +127,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -150,7 +150,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - 
Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2025Gen2, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) From cdba52f475d5ea3e574cbf689c373a9ed2154f81 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 14:43:24 -0700 Subject: [PATCH 055/103] revert: drop 'REVERT ME' cluster switching commits (now superseded) The following commits are superseded by the permanent fix in c71b1eb24e which correctly assigns ClusterRCV1PAzureNetwork to Windows RCV1P tests and keeps ClusterRCV1PKubenet for Linux RCV1P tests: - 286c711c9d REVERT ME: use dedicated kubenet cluster for RCV1P tests - 4de7fe5022 REVERT ME: use Azure CNI cluster for Windows RCV1P tests Both are no-ops against the current state and can be safely squashed out during final interactive rebase before merge. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani From 1bd253b7c67db754b95f774534ac748c3bc4713c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 15:22:54 -0700 Subject: [PATCH 056/103] revert: drop canary validator and wireserver polling debug commits Reverts: - 5c2ed65603 (canary check that guarantees test failure) - 07d1c4402a (5-minute wireserver polling loop - provisioning regression) The canary ValidateFileHasContent for a nonexistent string causes guaranteed test failures. The wireserver polling adds up to 5 minutes of sleep to every Linux RCV1P node provisioning. Remaining diagnostic commits (wireserver endpoint probing, azcopy logging, verbose output) are kept for initial rollout observability. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 41 +++++++++++++++++- .../artifacts/init-aks-custom-cloud.sh | 42 +++++-------------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 23e906f42c9..9c26e7b7c00 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3173,7 +3173,46 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() - // Validate the provisioning logshows wireserver was queried and returned opted-in + // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM + // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. + diagCommand := []string{ + "$ErrorActionPreference = 'Continue'", + "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", + "try {", + " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", + "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", + "try {", + " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", + " Write-Host \"operationrequestsroot content: $($root.Content)\"", + "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", + "try {", + " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", + " Write-Host \"operationrequestsintermediate content: 
$($intermediate.Content)\"", + "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", + "try {", + " $legacy = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", + " $legacyJson = $legacy.Content | ConvertFrom-Json", + " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", + "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", + "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", + "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } else { Write-Host 'C:\\ca does not exist' }", + "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", + "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", + " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", + "} else { Write-Host 'CSE log not found' }", + "Write-Host '=== END DIAGNOSTIC ==='", + } + diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) + s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) + if diagResult.stderr != "" { + s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) + } + + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 
fd6a9942b19..f8101ac7630 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -86,40 +86,20 @@ function make_request_with_retry { # immediately instead of letting the node run with an insecure certificate configuration. function is_opted_in_for_root_certs { local opt_in_response - local request_status - local poll_attempt=1 - local max_poll_attempts=30 - local poll_interval=10 - - # Poll wireserver for up to ~5 minutes to allow platform metadata to sync. - # The VM instance tag triggers a Fabric Controller goal state (CCF) update, - # which must propagate to the host agent before wireserver can reflect it. - # FC goal state propagation can take several minutes in practice. - while [ $poll_attempt -le $max_poll_attempts ]; do - echo "is_opted_in_for_root_certs: poll attempt ${poll_attempt}/${max_poll_attempts}" - - opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") - request_status=$? - - echo "is_opted_in_for_root_certs: wireserver response (status=${request_status}): '${opt_in_response}'" - - if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state on attempt ${poll_attempt}" - elif echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then - echo "IsOptedInForRootCerts=true (found on attempt ${poll_attempt})" - return 0 - fi - if [ $poll_attempt -lt $max_poll_attempts ]; then - echo "is_opted_in_for_root_certs: not opted in yet, waiting ${poll_interval}s before retry..." - sleep $poll_interval - fi + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + local request_status=$? 
+ if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state" + return 1 + fi - poll_attempt=$((poll_attempt + 1)) - done + if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true" + return 0 + fi - echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true after ${max_poll_attempts} attempts" - echo "Last wireserver response: '${opt_in_response}'" + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" return 1 } From 9c24851e9111056ad376183bb9678639428f96f6 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 21:57:14 -0700 Subject: [PATCH 057/103] feat(e2e): auto-detect RCV1P feature flag on E2E subscription When RCV1P_SUBSCRIPTION_ID is not explicitly set, the skip logic now checks whether the E2E subscription (E2E_SUBSCRIPTION_ID) has the PlatformSettingsOverride feature flag registered. If it does, the RCV1P tests run automatically using the E2E subscription. This enables MSFT tenant pipelines (where the E2E subscription is already enrolled) to run RCV1P tests without a separate variable. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 63 ++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 4504ad976b6..27f9e92e0ec 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -37,23 +37,53 @@ import ( // if the subscription has the PlatformSettingsOverride feature registered. const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" -// skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. 
-// This happens in regular CI runs where the RCV1P variable group is not linked, causing -// Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". -// It always logs the feature flag status on the E2E subscription for diagnostics, -// and verifies the flag is registered on the RCV1P subscription when available. +// skipIfRCV1PNotConfigured skips the test when no subscription with the RCV1P feature flag +// is available. It checks in order: +// 1. Explicit RCV1P_SUBSCRIPTION_ID (dedicated RCV1P subscription) +// 2. E2E_SUBSCRIPTION_ID auto-detection (checks if the feature flag is registered) +// +// When E2E_SUBSCRIPTION_ID has the feature flag registered (e.g., MSFT tenant pipelines), +// the RCV1P tests run automatically without needing a separate variable. func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() - // Always log feature flag status on the default E2E subscription for diagnostics - logE2ESubscriptionFeatureFlag(t) subID := strings.TrimSpace(config.Config.RCV1PSubscriptionID) - if subID == "" || strings.HasPrefix(subID, "$(") { - t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") + if subID != "" && !strings.HasPrefix(subID, "$(") { + // Explicit RCV1P subscription configured — verify it has the feature flag + checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) + return + } + + // No explicit RCV1P subscription — try auto-detecting from the E2E subscription + t.Log("RCV1P_SUBSCRIPTION_ID not set, checking if E2E subscription has PlatformSettingsOverride feature flag...") + e2eSubID := strings.TrimSpace(config.Config.SubscriptionID) + if e2eSubID == "" { + t.Skip("neither RCV1P_SUBSCRIPTION_ID nor E2E_SUBSCRIPTION_ID is set, skipping RCV1P test") + } + + e2eAzure, err := config.NewAzureClient() + if err != nil { + t.Skipf("failed to create E2E Azure client for feature flag auto-detection: %v", err) + } + + registered, err := queryFeatureFlag(t.Context(), e2eSubID, 
e2eAzure) + if err != nil { + t.Skipf("failed to query feature flag on E2E subscription %s: %v", e2eSubID, err) } - checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) + if !registered { + t.Skipf("E2E subscription %s does not have PlatformSettingsOverride registered, skipping RCV1P test", e2eSubID) + } + + // E2E subscription is enrolled — configure RCV1P globals so the rest of the test infra works + t.Logf("auto-detected PlatformSettingsOverride on E2E subscription %s, using it for RCV1P tests", e2eSubID) + rcv1pAutoDetectOnce.Do(func() { + config.Config.RCV1PSubscriptionID = e2eSubID + config.RCV1PAzure = e2eAzure + }) } +var rcv1pAutoDetectOnce sync.Once + var ( featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult ) @@ -90,19 +120,6 @@ func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID strin } } -// logE2ESubscriptionFeatureFlag logs the PlatformSettingsOverride feature flag status on the -// default E2E subscription for diagnostic purposes. This helps understand wireserver behavior -// (e.g., IsOptedInForRootCerts responses) even in non-RCV1P test runs. 
-func logE2ESubscriptionFeatureFlag(t *testing.T) { - t.Helper() - e2eAzure, err := config.NewAzureClient() - if err != nil { - t.Logf("WARNING: failed to create E2E Azure client for feature flag check: %v", err) - return - } - checkPlatformSettingsOverrideFeatureFlag(t, config.Config.SubscriptionID, e2eAzure, false) -} - func queryFeatureFlag(ctx context.Context, subscriptionID string, client *config.AzureClient) (bool, error) { url := fmt.Sprintf( "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", From 3732a24eec01861da6faf824ee6ee416d84d0e17 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 09:33:56 -0700 Subject: [PATCH 058/103] fix(e2e): skip NotOptedIn tests on auto-detected enrolled subscriptions On subscriptions with PlatformSettingsOverride registered, the platform auto-injects the opt-in tag on ALL VMSSes, making the 'not opted in' negative test scenario impossible. Skip these tests when the RCV1P subscription was auto-detected from the E2E subscription. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 22 +++++++++++++++++++++- e2e/scenario_rcv1p_win_test.go | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 27f9e92e0ec..eafe8d52524 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -79,10 +79,29 @@ func skipIfRCV1PNotConfigured(t *testing.T) { rcv1pAutoDetectOnce.Do(func() { config.Config.RCV1PSubscriptionID = e2eSubID config.RCV1PAzure = e2eAzure + rcv1pAutoDetected = true }) } -var rcv1pAutoDetectOnce sync.Once +var ( + rcv1pAutoDetectOnce sync.Once + // rcv1pAutoDetected is true when the RCV1P subscription was auto-detected from the + // E2E subscription rather than explicitly set via RCV1P_SUBSCRIPTION_ID. 
On auto-detected + // (enrolled) subscriptions, the platform auto-injects the opt-in tag on ALL VMSSes, + // making "not opted in" negative tests impossible. + rcv1pAutoDetected bool +) + +// skipNotOptedInOnAutoDetect skips NotOptedIn negative tests when the RCV1P subscription was +// auto-detected. On enrolled subscriptions, the platform auto-injects the opt-in tag on ALL +// VMSSes, making it impossible to test the "not opted in" scenario. +func skipNotOptedInOnAutoDetect(t *testing.T) { + t.Helper() + if rcv1pAutoDetected { + t.Skip("skipping NotOptedIn test: RCV1P subscription was auto-detected from E2E subscription — " + + "platform auto-injects opt-in tag on all VMSSes in enrolled subscriptions") + } +} var ( featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult @@ -307,6 +326,7 @@ func Test_RCV1P_ACL(t *testing.T) { // subscription feature alone is not sufficient — the VM must also be explicitly tagged. func Test_RCV1P_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) + skipNotOptedInOnAutoDetect(t) RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", AzureClient: config.RCV1PAzure, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index b2188bd2dd5..948f5b3d263 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -174,6 +174,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { // script correctly skips certificate download and refresh task registration. 
func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) + skipNotOptedInOnAutoDetect(t) RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", AzureClient: config.RCV1PAzure, From 2af3e4cdf75baa7203e2958443d6178a07e45962 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 10:36:31 -0700 Subject: [PATCH 059/103] fix(e2e): use caller context in getCustomScriptExtensionStatus Replace context.Background() with the caller's context so the VM instance view fetch respects test/scenario timeouts instead of potentially hanging indefinitely. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 35f5b51cb4f..d7ad6e70720 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -374,7 +374,7 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { require.NoError(s.T, err, "create vmss %q, check %s for vm logs", s.Runtime.VMSSName, testDir(s.T)) } - err = getCustomScriptExtensionStatus(s, scenarioVM.VM) + err = getCustomScriptExtensionStatus(ctx, s, scenarioVM.VM) require.NoError(s.T, err) if !s.Config.SkipDefaultValidation { @@ -485,12 +485,11 @@ func validateVM(ctx context.Context, s *Scenario) { } } -func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { +func getCustomScriptExtensionStatus(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { // Re-fetch the VM with instance view to ensure we have fresh extension status data. // The VM object passed in may have been fetched before the CSE finished executing, // so the extension status message could be empty or stale. 
if vmssVM.InstanceID != nil { - ctx := context.Background() freshVM, err := s.GetAzure().VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, From 9466de4c38b6460a53552fa38f3d61f165a930c2 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 13:15:34 -0700 Subject: [PATCH 060/103] fix(e2e): remove TrustedLaunch from non-Gen2 Windows 2025 RCV1P test The windows-2025 image does not support TrustedLaunch, only the Gen2 variant does. This matches the pattern on main where Test_Windows2025 uses EmptyVMConfigMutator without TrustedLaunch. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 948f5b3d263..3621446896b 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -75,12 +75,9 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PAzureNetwork, - VHD: config.VHDWindows2025, - VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { - vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) - rcv1pOptInVMConfigMutator(vmss) - }, + Cluster: ClusterRCV1PAzureNetwork, + VHD: config.VHDWindows2025, + VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) From 54ffb329f0948fed7ebaf20943ca48ee79fcf345 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 13:16:27 -0700 Subject: [PATCH 061/103] fix: return code 2 when wireserver is unreachable in is_opted_in_for_root_certs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function documented return code 2 for 'wireserver unreachable' and the 
caller correctly checked for it, but the implementation returned 1 (not opted in) on request failure. This silently skipped cert installation on wireserver outages — a security hole if the subscription is enrolled for hardened certs. Now returns 2 on failure so the caller treats it as fatal, matching the documented contract. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index f8101ac7630..f22a78fd34f 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -90,8 +90,8 @@ function is_opted_in_for_root_certs { opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") local request_status=$? if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state" - return 1 + echo "ERROR: wireserver unreachable or returned empty response for IsOptedInForRootCerts" + return 2 fi if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then From f0a139637eb9a7463ab8c570b0ad9850cebe4f51 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 13:17:56 -0700 Subject: [PATCH 062/103] fix: throw when opted-in but no certs downloaded with -FailOnError When IsOptedInForRootCerts is true but no certificates are downloaded, Get-CACertificates only logged a warning and returned $False. Because the caller (BasePrep) doesn't check the return value, provisioning continued without the required CA set. Now throws when -FailOnError is set and no certs were downloaded, matching the fail-closed contract.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 3 +++ staging/cse/windows/kubernetesfunc.tests.ps1 | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 7b8110a4ac7..36000558b91 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -430,6 +430,9 @@ function Get-CACertificates { } if (-not $downloadedAny) { + if ($FailOnError) { + throw "No CA certificates were downloaded in rcv1p mode despite IsOptedInForRootCerts=true" + } Write-Log "Warning: no CA certificates were downloaded in rcv1p mode" } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 924ccf13fc5..8cf053a5d1d 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -206,6 +206,22 @@ Describe 'Get-CACertificates' { { Get-CACertificates -Location 'ussecwest' -FailOnError } | Should -Throw '*CA certificates rawdata is empty*' } + It 'throws when opted in but no certs downloaded with -FailOnError' { + $script:callCount = 0 + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:callCount++ + $uri = $PSBoundParameters['Args'].Uri + if ($uri -match 'isOptedInForRootCerts') { + return [PSCustomObject]@{ Content = '{"IsOptedInForRootCerts":true}' } + } + # Return empty operation info for cert endpoints + return [PSCustomObject]@{ Content = '{"OperationsInfo":[]}' } + } + + { Get-CACertificates -Location 'southcentralus' -FailOnError } | Should -Throw '*No CA certificates were downloaded*' + } + It 'falls back to legacy endpoint when called without -Location (backward compat)' { $script:retryUris = @() Mock Retry-Command -MockWith { From 56bf65d8f0e021c34239371cb2042d29a7273d20 Mon Sep 17 00:00:00 2001 From: Ramkumar 
Chinchani Date: Thu, 7 May 2026 13:57:11 -0700 Subject: [PATCH 063/103] e2e: use branch-built CSE zip for Windows RCV1P tests The published CSE package (aks-windows-cse-scripts-current.zip) does not contain the RCV1P code (Get-CACertificates -Location, -FailOnError, IsOptedInForRootCerts, Register-CACertificatesRefreshTask). Without this override, Windows RCV1P E2E tests pass vacuously using the old code path. This builds a CSE zip from staging/cse/windows/ at test time, uploads it to blob storage with a SAS URL, and overrides CseScriptsPackageURL so the VMs download the branch's CSE scripts. TODO(rcv1p): remove the branch CSE zip override and rcv1pWindowsCSEMutator once the RCV1P code ships in a published CSE package. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 135 +++++++++++++++++++++++++++++++++ e2e/scenario_rcv1p_win_test.go | 19 +++-- 2 files changed, 149 insertions(+), 5 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index eafe8d52524..ca94f2ca67f 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -18,12 +18,16 @@ package e2e import ( + "archive/zip" "context" "fmt" "io" + "os" + "path/filepath" "strings" "sync" "testing" + "time" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" @@ -184,6 +188,137 @@ func rcv1pVMInstanceTags() map[string]*string { } } +// TODO(rcv1p): remove the branch CSE zip override once the RCV1P code ships in a published +// CSE package on packages.aks.azure.com. Until then, Windows E2E tests would exercise the +// old Get-CACertificates (without -Location, -FailOnError, or IsOptedInForRootCerts) from +// the released aks-windows-cse-scripts-current.zip instead of the PR's version. 
+var ( + branchCSEZipURL string + branchCSEZipErr error + branchCSEZipOnce sync.Once +) + +// getOrBuildBranchCSEPackageURL builds a CSE zip from staging/cse/windows/ (matching the +// pipeline packaging in .pipelines/scripts/windows_package_cse.sh) and uploads it to the +// E2E blob storage. Returns a SAS-signed URL. Uses sync.Once so the zip is built and +// uploaded exactly once across all parallel tests. +func getOrBuildBranchCSEPackageURL(t *testing.T) string { + t.Helper() + branchCSEZipOnce.Do(func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + branchCSEZipURL, branchCSEZipErr = buildAndUploadCSEZip(ctx) + }) + if branchCSEZipErr != nil { + t.Fatalf("failed to build/upload branch CSE zip: %v", branchCSEZipErr) + } + t.Logf("using branch CSE package URL: %s", branchCSEZipURL) + return branchCSEZipURL +} + +func buildAndUploadCSEZip(ctx context.Context) (string, error) { + repoRoot, err := findRepoRoot() + if err != nil { + return "", fmt.Errorf("find repo root: %w", err) + } + cseDir := filepath.Join(repoRoot, "staging", "cse", "windows") + + tmpFile, err := os.CreateTemp("", "aks-windows-cse-scripts-branch-*.zip") + if err != nil { + return "", fmt.Errorf("create temp file: %w", err) + } + defer os.Remove(tmpFile.Name()) + defer tmpFile.Close() + + zw := zip.NewWriter(tmpFile) + err = filepath.Walk(cseDir, func(path string, info os.FileInfo, walkErr error) error { + if walkErr != nil { + return walkErr + } + rel, err := filepath.Rel(cseDir, path) + if err != nil { + return err + } + rel = filepath.ToSlash(rel) + if rel == "." 
{ + return nil + } + // skip test files and debug helper (matches windows_package_cse.sh) + if strings.HasSuffix(rel, ".tests.ps1") || strings.Contains(rel, ".tests.suites") { + if info.IsDir() { + return filepath.SkipDir + } + return nil + } + if rel == "README" || rel == "debug/update-scripts.ps1" { + return nil + } + if info.IsDir() { + return nil + } + w, err := zw.Create(rel) + if err != nil { + return fmt.Errorf("create zip entry %s: %w", rel, err) + } + f, err := os.Open(path) + if err != nil { + return fmt.Errorf("open %s: %w", path, err) + } + defer f.Close() + _, err = io.Copy(w, f) + return err + }) + if err != nil { + return "", fmt.Errorf("build zip: %w", err) + } + if err := zw.Close(); err != nil { + return "", fmt.Errorf("close zip writer: %w", err) + } + + if _, err := tmpFile.Seek(0, io.SeekStart); err != nil { + return "", fmt.Errorf("seek temp file: %w", err) + } + + blobName := fmt.Sprintf("cse-packages/aks-windows-cse-scripts-branch-%s.zip", + time.Now().UTC().Format("20060102-150405")) + url, err := config.Azure.UploadAndGetSignedLink(ctx, blobName, tmpFile) + if err != nil { + return "", fmt.Errorf("upload CSE zip: %w", err) + } + return url, nil +} + +func findRepoRoot() (string, error) { + dir, err := os.Getwd() + if err != nil { + return "", err + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + if filepath.Base(dir) == "e2e" { + dir = filepath.Dir(dir) + continue + } + return dir, nil + } + parent := filepath.Dir(dir) + if parent == dir { + return "", fmt.Errorf("could not find repo root (go.mod) from %s", dir) + } + dir = parent + } +} + +// rcv1pWindowsCSEMutator returns a BootstrapConfigMutator that overrides CseScriptsPackageURL +// to use the branch-built CSE zip containing the RCV1P code. +// TODO(rcv1p): remove this once the RCV1P code ships in a published CSE package. 
+func rcv1pWindowsCSEMutator(t *testing.T) func(*datamodel.NodeBootstrappingConfiguration) { + cseURL := getOrBuildBranchCSEPackageURL(t) + return func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.WindowsProfile.CseScriptsPackageURL = cseURL + } +} + // Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. // Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates // to rebuild the trust bundle. diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 3621446896b..048deaf5cd1 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -21,6 +21,7 @@ import ( // installation on Windows Server 2022. func Test_RCV1P_Windows2022(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -33,7 +34,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -44,6 +45,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { // Test_RCV1P_Windows23H2 validates RCV1P on Windows Server 23H2, the annual channel release. 
func Test_RCV1P_Windows23H2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -56,7 +58,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -67,6 +69,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { // Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025 (non-gen2). func Test_RCV1P_Windows2025(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -80,6 +83,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { @@ -93,6 +97,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { // installation on Windows Server 2022 Gen2. Covers the gen2 pipeline job. 
func Test_RCV1P_Windows2022Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2022 Gen2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -105,7 +110,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -116,6 +121,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { // Test_RCV1P_Windows23H2Gen2 validates RCV1P on Windows Server 23H2 Gen2. Covers the gen2 pipeline job. func Test_RCV1P_Windows23H2Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 23H2 Gen2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -128,7 +134,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -139,6 +145,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { // Test_RCV1P_Windows2025Gen2 validates RCV1P on Windows Server 2025 Gen2. Covers the gen2 pipeline job. 
func Test_RCV1P_Windows2025Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2025 Gen2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -155,6 +162,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { }, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { @@ -172,6 +180,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) skipNotOptedInOnAutoDetect(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", AzureClient: config.RCV1PAzure, @@ -182,7 +191,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { Config: Config{ Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022Containerd, - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PNotOptedInWindows(ctx, s) }, From d5c4f5caf014da1f907c0a4d4f02ce30031d0e22 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 21:59:02 -0700 Subject: [PATCH 064/103] fix: parse wireserver IsOptedInForRootCerts JSON response with jq The wireserver returns JSON like {"IsOptedInForRootCerts":true} but the script was using grep for IsOptedInForRootCerts=true (equals sign), which never matches the JSON colon format. Use jq for proper JSON parsing instead. This fix was previously applied but accidentally dropped during a rebase squash/reorder. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index f22a78fd34f..5e914878426 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -94,7 +94,9 @@ function is_opted_in_for_root_certs { return 2 fi - if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + # Wireserver returns JSON ({"IsOptedInForRootCerts":true}), not key=value + # (IsOptedInForRootCerts=true); only the JSON form is matched. Use jq for proper JSON parsing. + if echo "$opt_in_response" | jq -e '.IsOptedInForRootCerts == true' > /dev/null 2>&1; then echo "IsOptedInForRootCerts=true" return 0 fi From e335c3d0157f621320e44c0241a07eedf4972f00 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:20:07 -0700 Subject: [PATCH 065/103] fix(e2e): update BootstrapConfigMutator signatures after rebase Adapt to upstream signature change: BootstrapConfigMutator now takes (*Cluster, *NodeBootstrappingConfiguration) instead of just (*NodeBootstrappingConfiguration). Also thread infra parameter through setupPrivateDNSForAPIServer to match getClusterVNet signature. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 16 ++++++++-------- e2e/scenario_rcv1p_win_test.go | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index ca94f2ca67f..dc90654376d 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -312,9 +312,9 @@ func findRepoRoot() (string, error) { // rcv1pWindowsCSEMutator returns a BootstrapConfigMutator that overrides CseScriptsPackageURL // to use the branch-built CSE zip containing the RCV1P code. // TODO(rcv1p): remove this once the RCV1P code ships in a published CSE package. -func rcv1pWindowsCSEMutator(t *testing.T) func(*datamodel.NodeBootstrappingConfiguration) { +func rcv1pWindowsCSEMutator(t *testing.T) func(*Cluster, *datamodel.NodeBootstrappingConfiguration) { cseURL := getOrBuildBranchCSEPackageURL(t) - return func(nbc *datamodel.NodeBootstrappingConfiguration) { + return func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.ContainerService.Properties.WindowsProfile.CseScriptsPackageURL = cseURL } } @@ -336,7 +336,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -362,7 +362,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: 
func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -388,7 +388,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -414,7 +414,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -443,7 +443,7 @@ func Test_RCV1P_ACL(t *testing.T) { rcv1pOptInVMConfigMutator(vmss) }, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -472,7 +472,7 @@ func Test_RCV1P_NotOptedIn(t *testing.T) { Config: Config{ Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PNotOptedIn(ctx, s) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 048deaf5cd1..fe196a1e9ae 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -81,9 +81,9 @@ func Test_RCV1P_Windows2025(t *testing.T) { Cluster: 
ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2025, VMConfigMutator: rcv1pOptInVMConfigMutator, - VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { - cseMutator(nbc) + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nil, nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { @@ -161,8 +161,8 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { rcv1pOptInVMConfigMutator(vmss) }, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { - cseMutator(nbc) + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nil, nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { From 2b4d4290294c54975ad833037725c36bd80a1526 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:24:45 -0700 Subject: [PATCH 066/103] fix: fail process_cert_operations when no cert bodies are saved Track the number of successfully saved certificates and return non-zero if all individual cert content fetches failed despite the operation endpoint returning filenames. This closes a gap where retrieve_rcv1p_certs could report success with zero certs actually downloaded. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 5e914878426..fc1d2793635 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -166,6 +166,7 @@ function process_cert_operations { return 1 fi + local saved_count=0 for cert_filename in "${cert_filenames[@]}"; do echo "Processing certificate file: $cert_filename" @@ -182,7 +183,14 @@ function process_cert_operations { echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" echo "Successfully saved certificate: $cert_filename" + saved_count=$((saved_count + 1)) done + + if [ $saved_count -eq 0 ]; then + echo "Error: all certificate content fetches failed for $endpoint_type (${#cert_filenames[@]} filenames found but 0 saved)" + return 1 + fi + echo "Saved $saved_count/${#cert_filenames[@]} certificates for $endpoint_type" } function retrieve_rcv1p_certs { From 83ac070b744cf1b6f5e29156d73934dca26a8220 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:25:57 -0700 Subject: [PATCH 067/103] fix: pass repodepot_endpoint explicitly to add_key_ubuntu and add_ms_keys These functions relied on bash dynamic scoping to access the caller's local repodepot_endpoint variable. Pass it as an explicit parameter to follow the repo's shell script guidelines and avoid fragile implicit variable dependencies. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud-repos.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh index 0c68d513568..9f5eae0119a 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -92,13 +92,14 @@ EOF } function add_key_ubuntu { - local key_name=$1 + local repodepot_endpoint="$1" + local key_name="$2" - key_url="${repodepot_endpoint}/keys/${key_name}" + local key_url="${repodepot_endpoint}/keys/${key_name}" check_url $key_url echo "Adding $key_name key to keyring..." - key_data=$(wget -O - $key_url) - key_path=$(derive_key_paths $key_name) + local key_data=$(wget -O - $key_url) + local key_path=$(derive_key_paths $key_name) echo "$key_data" | gpg --dearmor | tee $key_path > /dev/null echo "$key_name key added to keyring." } @@ -115,11 +116,12 @@ function derive_key_paths { } function add_ms_keys { + local repodepot_endpoint="$1" # Add the Microsoft package server keys to keyring. echo "Adding Microsoft keys to keyring..." - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc + add_key_ubuntu "$repodepot_endpoint" microsoft.asc + add_key_ubuntu "$repodepot_endpoint" msopentech.asc } function aptget_update { From 63383eb40686f1bf9f17940a8eaccf18e76bcf66 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:27:07 -0700 Subject: [PATCH 068/103] chore(e2e): remove REVERT ME wireserver diagnostic block from Windows validator Remove the always-on diagnostic block that probed wireserver endpoints and dumped CSE logs on every Windows RCV1P test run. This bloated test logs, added latency, and could leak wireserver response content into CI. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 4 ++-- e2e/validators.go | 39 -------------------------------------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index dc90654376d..9c7f889358a 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -410,8 +410,8 @@ func Test_RCV1P_Flatcar(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDFlatcarGen2, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { diff --git a/e2e/validators.go b/e2e/validators.go index 9c26e7b7c00..ffb70a6d332 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3173,45 +3173,6 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() - // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM - // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. 
- diagCommand := []string{ - "$ErrorActionPreference = 'Continue'", - "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", - "try {", - " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", - "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", - "try {", - " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", - " Write-Host \"operationrequestsroot content: $($root.Content)\"", - "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", - "try {", - " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", - " Write-Host \"operationrequestsintermediate content: $($intermediate.Content)\"", - "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", - "try {", - " $legacy = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", - " $legacyJson = $legacy.Content | ConvertFrom-Json", - " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", - "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", - "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", - "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } 
else { Write-Host 'C:\\ca does not exist' }", - "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", - "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", - " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", - "} else { Write-Host 'CSE log not found' }", - "Write-Host '=== END DIAGNOSTIC ==='", - } - diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) - s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) - if diagResult.stderr != "" { - s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) - } - // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From 27d5086117ee21ad28fa64efd642bbc8882fbb55 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 18 May 2026 14:31:59 -0700 Subject: [PATCH 069/103] fix: guard against unresolved ADO pipeline variable expressions in RCV1PSubscriptionID Signed-off-by: Ramkumar Chinchani --- e2e/config/config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/e2e/config/config.go b/e2e/config/config.go index f88f4d51ad5..6de8bc25e15 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -186,6 +186,8 @@ func mustLoadConfig() *Configuration { func init() { rcv1pSubID := strings.TrimSpace(Config.RCV1PSubscriptionID) + // Guard against ADO pipeline variable expressions that weren't resolved (e.g. "$(RCV1P_SUBSCRIPTION_ID)"). + // If the value is still a literal $(...) token, treat it as unset. 
if rcv1pSubID != "" && !strings.HasPrefix(rcv1pSubID, "$(") { client, err := NewAzureClientForSubscription(rcv1pSubID) if err != nil { From 9d0da888cb2586151ba2b7668665fd484f116d3b Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 1 Jun 2026 13:57:10 -0700 Subject: [PATCH 070/103] fix: update for main branch API changes (getClusterVNet, remove Windows 23H2) Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 48 ---------------------------------- 1 file changed, 48 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index fe196a1e9ae..3892821e2d3 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -42,30 +42,6 @@ func Test_RCV1P_Windows2022(t *testing.T) { }) } -// Test_RCV1P_Windows23H2 validates RCV1P on Windows Server 23H2, the annual channel release. -func Test_RCV1P_Windows23H2(t *testing.T) { - skipIfRCV1PNotConfigured(t) - cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package - RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, - Tags: Tags{ - RCV1PCertMode: true, - }, - Config: Config{ - Cluster: ClusterRCV1PAzureNetwork, - VHD: config.VHDWindows23H2, - VMConfigMutator: rcv1pOptInVMConfigMutator, - VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: cseMutator, - Validator: func(ctx context.Context, s *Scenario) { - ValidateRCV1PCertModeWindows(ctx, s) - }, - }, - }) -} - // Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025 (non-gen2). func Test_RCV1P_Windows2025(t *testing.T) { skipIfRCV1PNotConfigured(t) @@ -118,30 +94,6 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { }) } -// Test_RCV1P_Windows23H2Gen2 validates RCV1P on Windows Server 23H2 Gen2. Covers the gen2 pipeline job. 
-func Test_RCV1P_Windows23H2Gen2(t *testing.T) { - skipIfRCV1PNotConfigured(t) - cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package - RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows Server 23H2 Gen2 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, - Tags: Tags{ - RCV1PCertMode: true, - }, - Config: Config{ - Cluster: ClusterRCV1PAzureNetwork, - VHD: config.VHDWindows23H2Gen2, - VMConfigMutator: rcv1pOptInVMConfigMutator, - VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: cseMutator, - Validator: func(ctx context.Context, s *Scenario) { - ValidateRCV1PCertModeWindows(ctx, s) - }, - }, - }) -} - // Test_RCV1P_Windows2025Gen2 validates RCV1P on Windows Server 2025 Gen2. Covers the gen2 pipeline job. func Test_RCV1P_Windows2025Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) From 4eb13f5b22872caf39876dd77a65f4895da230ef Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 1 Jun 2026 17:31:06 -0700 Subject: [PATCH 071/103] fix: fail fast if LOCATION is empty when installing ca-refresh schedule Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index fc1d2793635..a492f4a3fad 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -285,6 +285,11 @@ if [ "$action" = "ca-refresh" ]; then exit fi +if [ "$install_ca_refresh_schedule" -eq 1 ] && [ -z "$LOCATION" ]; then + echo "ERROR: LOCATION is required to install ca-refresh schedule but is empty" + exit 1 +fi + if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script 
for use in cron. From 2d414ea6f64824583bd920871ebd4b2ba0193ad9 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 2 Jun 2026 12:48:12 -0700 Subject: [PATCH 072/103] e2e: filter transient waagent ProtocolError in ValidateWaagentLog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix pre-existing flake on main branch where ValidateWaagentLog fails tests due to a benign single-occurrence 'Error fetching the goal state: [ProtocolError]' in waagent.log during early VM boot. This transient error occurs while the wireserver connection is being established and does not indicate a real problem — the node provisions successfully. The exclusion is applied unconditionally (like the existing FIPS PFX exclusion) since this error is normal boot behavior across all distros. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/validators.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index ffb70a6d332..94a41c1cc94 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2941,6 +2941,10 @@ func ValidateWaagentLog(ctx context.Context, s *Scenario) { "waagent.log should confirm WALinuxAgent-%s is running as ExtHandler", expectedVersion) // 3. Check for ExtHandler errors + // Always exclude transient ProtocolError goal-state fetch failures — these are + // benign single-occurrence errors during early boot when the wireserver connection + // is still being established. They do not indicate a real problem. + // // On Ubuntu 22.04 FIPS VHDs, waagent logs "Cannot convert PFX to PEM" because // of a known bug with VMSS that fails to propagate the FIPS additionalCapabilities. // Until the VMSS bug is fixed, skip the "Cannot convert PFX to PEM" errors. 
@@ -2948,10 +2952,11 @@ func ValidateWaagentLog(ctx context.Context, s *Scenario) { isUbuntu2204FIPS := s.VHD == config.VHDUbuntu2204FIPSContainerd || s.VHD == config.VHDUbuntu2204Gen2FIPSContainerd || s.VHD == config.VHDUbuntu2204Gen2FIPSTLContainerd - grepCmd := fmt.Sprintf("sudo grep 'ERROR ExtHandler' %s || true", waagentLogFile) + excludePatterns := "'Error fetching the goal state: \\[ProtocolError\\]'" if isUbuntu2204FIPS { - grepCmd = fmt.Sprintf("sudo grep 'ERROR ExtHandler' %s | grep -v 'Cannot convert PFX to PEM' || true", waagentLogFile) + excludePatterns += " -e 'Cannot convert PFX to PEM'" } + grepCmd := fmt.Sprintf("sudo grep 'ERROR ExtHandler' %s | grep -v -e %s || true", waagentLogFile, excludePatterns) extHandlerErrors := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join([]string{ "set -e", From 4353cd4d5d58f1564550ba9b06c24bea83646bc0 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 2 Jun 2026 15:46:34 -0700 Subject: [PATCH 073/103] e2e: simplify RCV1P to single-subscription-per-job model Remove the separate RCV1P_SUBSCRIPTION_ID/RCV1PAzure client pattern. Instead, use a pipeline subscriptionOverride parameter so the RCV1P pipeline passes its subscription as E2E_SUBSCRIPTION_ID, running all tests against one subscription. 
- Add subscriptionOverride parameter to e2e-template.yaml (coalesce override) - Map SUBSCRIPTION_ID from E2E_SUBSCRIPTION_ID in e2e_run.sh (Go config reads SUBSCRIPTION_ID env var, not E2E_SUBSCRIPTION_ID) - Remove RCV1PAzure global, RCV1PSubscriptionID config, RCV1P-specific init(), resource group helpers, and cluster infra functions - Remove AzureClient/SubscriptionID overrides from all RCV1P test scenarios - Replace auto-detect logic with simple SkipRCV1PNotOptedIn config flag (RCV1P_SKIP_NOT_OPTED_IN env var, defaults true for MSFT tenant safety) - Simplify test_helpers.go: remove RCV1P-specific infra provisioning branch Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-rcv1p.yaml | 1 + .pipelines/scripts/e2e_run.sh | 5 +- .pipelines/templates/e2e-template.yaml | 1 - e2e/cache.go | 15 ++-- e2e/config/config.go | 26 ------- e2e/scenario_rcv1p_test.go | 95 ++++---------------------- e2e/scenario_rcv1p_win_test.go | 25 ++----- e2e/test_helpers.go | 27 ++------ e2e/types.go | 18 ----- 9 files changed, 39 insertions(+), 174 deletions(-) diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml index 5fdf9d3a5ee..927365ea9c8 100644 --- a/.pipelines/e2e-rcv1p.yaml +++ b/.pipelines/e2e-rcv1p.yaml @@ -17,3 +17,4 @@ jobs: parameters: name: RCV1P Cert Mode Tests IgnoreScenariosWithMissingVhd: false + subscriptionOverride: $(RCV1P_SUBSCRIPTION_ID) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 097fe250756..b6f83df5fab 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -17,6 +17,9 @@ set -euo pipefail az account set -s "${E2E_SUBSCRIPTION_ID}" echo "Using subscription ${E2E_SUBSCRIPTION_ID} for e2e tests" +# Map E2E_SUBSCRIPTION_ID to SUBSCRIPTION_ID which the Go test framework reads +export SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID}" + # Setup go export GOPATH="$(go env GOPATH)" go version @@ -35,7 +38,6 @@ VHD_BUILD_ID="${VHD_BUILD_ID:-}" 
IGNORE_SCENARIOS_WITH_MISSING_VHD="${IGNORE_SCENARIOS_WITH_MISSING_VHD:-}" LOGGING_DIR="${LOGGING_DIR:-}" E2E_SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID:-}" -RCV1P_SUBSCRIPTION_ID="${RCV1P_SUBSCRIPTION_ID:-}" ENABLE_SECURE_TLS_BOOTSTRAPPING="${ENABLE_SECURE_TLS_BOOTSTRAPPING:-true}" TAGS_TO_SKIP="${TAGS_TO_SKIP:-}" TAGS_TO_RUN="${TAGS_TO_RUN:-}" @@ -48,7 +50,6 @@ echo "VHD_BUILD_ID: ${VHD_BUILD_ID}" echo "IGNORE_SCENARIOS_WITH_MISSING_VHD: ${IGNORE_SCENARIOS_WITH_MISSING_VHD}" echo "LOGGING_DIR: ${LOGGING_DIR}" echo "E2E_SUBSCRIPTION_ID: ${E2E_SUBSCRIPTION_ID}" -echo "RCV1P_SUBSCRIPTION_ID: ${RCV1P_SUBSCRIPTION_ID}" echo "ENABLE_SECURE_TLS_BOOTSTRAPPING: ${ENABLE_SECURE_TLS_BOOTSTRAPPING}" echo "TAGS_TO_SKIP: ${TAGS_TO_SKIP}" echo "TAGS_TO_RUN: ${TAGS_TO_RUN}" diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index 09398db8d30..fe53fe52a2e 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -42,7 +42,6 @@ jobs: displayName: Run AgentBaker E2E env: E2E_SUBSCRIPTION_ID: ${{parameters.subscriptionId}} - RCV1P_SUBSCRIPTION_ID: $(RCV1P_SUBSCRIPTION_ID) SYS_SSH_PUBLIC_KEY: $(SYS_SSH_PUBLIC_KEY) SYS_SSH_PRIVATE_KEY_B64: $(SYS_SSH_PRIVATE_KEY_B64) BUILD_SRC_DIR: $(System.DefaultWorkingDirectory) diff --git a/e2e/cache.go b/e2e/cache.go index 174b20335ba..6147954137a 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -210,17 +210,20 @@ func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster return prepareCluster(ctx, model, false, false) } +var ClusterRCV1PKubenet = cachedFunc(clusterRCV1PKubenet) + +// clusterRCV1PKubenet creates a kubenet cluster for RCV1P cert mode testing. 
+func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { + return prepareCluster(ctx, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + var ClusterRCV1PAzureNetwork = cachedFunc(clusterRCV1PAzureNetwork) -// clusterRCV1PAzureNetwork creates an Azure CNI cluster in the RCV1P subscription for Windows cert mode testing. +// clusterRCV1PAzureNetwork creates an Azure CNI cluster for Windows RCV1P cert mode testing. // Windows tests require Azure CNI (not kubenet) because baseTemplateWindows() configures the NBC for // Azure CNI overlay mode. func clusterRCV1PAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - infra := RCV1PClusterInfra() - if infra == nil { - return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P Azure CNI cluster") - } - return prepareCluster(ctx, infra, getAzureNetworkClusterModel("abe2e-rcv1p-azure-v1", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-rcv1p-azure-v1", request.Location, request.K8sSystemPoolSKU), false, false) } // isNotFoundErr checks if an error represents a "not found" response from Azure API diff --git a/e2e/config/config.go b/e2e/config/config.go index 6de8bc25e15..2c6e7eb012e 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -29,10 +29,6 @@ var ( Azure = mustNewAzureClient() VMIdentityName = "abe2e-vm-identity" - // RCV1PAzure is lazily initialized when RCV1PSubscriptionID is set. - // It provides Azure clients bound to the PlatformSettingsOverride-registered subscription. 
- RCV1PAzure *AzureClient - DefaultPollUntilDoneOptions = &runtime.PollUntilDoneOptions{ Frequency: time.Second, } @@ -44,14 +40,6 @@ func ResourceGroupName(location string) string { return "abe2e-" + location } -func RCV1PResourceGroupName(location string) string { - return "abe2e-rcv1p-" + location -} - -func (c *Configuration) RCV1PVMIdentityResourceID(location string) string { - return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.RCV1PSubscriptionID, RCV1PResourceGroupName(location), VMIdentityName) -} - func PrivateACRNameNotAnon(location string) string { return "abe2eprivatenonanon" + location // will have anonymous pull enabled } @@ -102,7 +90,6 @@ type Configuration struct { TestTimeoutCluster time.Duration `env:"TEST_TIMEOUT_CLUSTER" envDefault:"30m"` TestTimeoutVMSS time.Duration `env:"TEST_TIMEOUT_VMSS" envDefault:"17m"` WindowsAdminPassword string `env:"WINDOWS_ADMIN_PASSWORD"` - RCV1PSubscriptionID string `env:"RCV1P_SUBSCRIPTION_ID"` } func (c *Configuration) BlobStorageAccount() string { @@ -184,19 +171,6 @@ func mustLoadConfig() *Configuration { return cfg } -func init() { - rcv1pSubID := strings.TrimSpace(Config.RCV1PSubscriptionID) - // Guard against ADO pipeline variable expressions that weren't resolved (e.g. "$(RCV1P_SUBSCRIPTION_ID)"). - // If the value is still a literal $(...) token, treat it as unset. - if rcv1pSubID != "" && !strings.HasPrefix(rcv1pSubID, "$(") { - client, err := NewAzureClientForSubscription(rcv1pSubID) - if err != nil { - panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) - } - RCV1PAzure = client - } -} - // Returns a newly generated RSA public/private key pair with the private key in PEM format. 
func mustGetNewRSAKeyPair() ([]byte, []byte, string) { // Generate new key pair diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 9c7f889358a..16516990bf9 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -4,8 +4,9 @@ // at provisioning time to download the latest root certificates and installs them into the OS trust store. // // These tests require: -// - A dedicated subscription (RCV1P_SUBSCRIPTION_ID) with the Microsoft.Compute/PlatformSettingsOverride -// feature flag registered, which enables the wireserver certificate endpoint. +// - The E2E subscription (E2E_SUBSCRIPTION_ID / SUBSCRIPTION_ID) must have the +// Microsoft.Compute/PlatformSettingsOverride feature flag registered, which enables the +// wireserver certificate endpoint. // - The VM opt-in tag "platformsettings.host_environment.service.platform_optedin_for_rootcerts=true" // on each VMSS, which tells wireserver to serve certificates to this specific VM. // @@ -41,70 +42,17 @@ import ( // if the subscription has the PlatformSettingsOverride feature registered. const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" -// skipIfRCV1PNotConfigured skips the test when no subscription with the RCV1P feature flag -// is available. It checks in order: -// 1. Explicit RCV1P_SUBSCRIPTION_ID (dedicated RCV1P subscription) -// 2. E2E_SUBSCRIPTION_ID auto-detection (checks if the feature flag is registered) -// -// When E2E_SUBSCRIPTION_ID has the feature flag registered (e.g., MSFT tenant pipelines), -// the RCV1P tests run automatically without needing a separate variable. +// skipIfRCV1PNotConfigured skips the test when the E2E subscription does not have the +// Microsoft.Compute/PlatformSettingsOverride feature flag registered. 
func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() - subID := strings.TrimSpace(config.Config.RCV1PSubscriptionID) - if subID != "" && !strings.HasPrefix(subID, "$(") { - // Explicit RCV1P subscription configured — verify it has the feature flag - checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) - return - } - - // No explicit RCV1P subscription — try auto-detecting from the E2E subscription - t.Log("RCV1P_SUBSCRIPTION_ID not set, checking if E2E subscription has PlatformSettingsOverride feature flag...") - e2eSubID := strings.TrimSpace(config.Config.SubscriptionID) - if e2eSubID == "" { - t.Skip("neither RCV1P_SUBSCRIPTION_ID nor E2E_SUBSCRIPTION_ID is set, skipping RCV1P test") - } - - e2eAzure, err := config.NewAzureClient() - if err != nil { - t.Skipf("failed to create E2E Azure client for feature flag auto-detection: %v", err) + subID := strings.TrimSpace(config.Config.SubscriptionID) + if subID == "" { + t.Skip("E2E_SUBSCRIPTION_ID / SUBSCRIPTION_ID is not set, skipping RCV1P test") } - registered, err := queryFeatureFlag(t.Context(), e2eSubID, e2eAzure) - if err != nil { - t.Skipf("failed to query feature flag on E2E subscription %s: %v", e2eSubID, err) - } - if !registered { - t.Skipf("E2E subscription %s does not have PlatformSettingsOverride registered, skipping RCV1P test", e2eSubID) - } - - // E2E subscription is enrolled — configure RCV1P globals so the rest of the test infra works - t.Logf("auto-detected PlatformSettingsOverride on E2E subscription %s, using it for RCV1P tests", e2eSubID) - rcv1pAutoDetectOnce.Do(func() { - config.Config.RCV1PSubscriptionID = e2eSubID - config.RCV1PAzure = e2eAzure - rcv1pAutoDetected = true - }) -} - -var ( - rcv1pAutoDetectOnce sync.Once - // rcv1pAutoDetected is true when the RCV1P subscription was auto-detected from the - // E2E subscription rather than explicitly set via RCV1P_SUBSCRIPTION_ID. 
On auto-detected - // (enrolled) subscriptions, the platform auto-injects the opt-in tag on ALL VMSSes, - // making "not opted in" negative tests impossible. - rcv1pAutoDetected bool -) - -// skipNotOptedInOnAutoDetect skips NotOptedIn negative tests when the RCV1P subscription was -// auto-detected. On enrolled subscriptions, the platform auto-injects the opt-in tag on ALL -// VMSSes, making it impossible to test the "not opted in" scenario. -func skipNotOptedInOnAutoDetect(t *testing.T) { - t.Helper() - if rcv1pAutoDetected { - t.Skip("skipping NotOptedIn test: RCV1P subscription was auto-detected from E2E subscription — " + - "platform auto-injects opt-in tag on all VMSSes in enrolled subscriptions") - } + checkPlatformSettingsOverrideFeatureFlag(t, subID, config.Azure, true) } var ( @@ -325,9 +273,7 @@ func rcv1pWindowsCSEMutator(t *testing.T) func(*Cluster, *datamodel.NodeBootstra func Test_RCV1P_Ubuntu2204(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Ubuntu 22.04 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Ubuntu 22.04 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -351,9 +297,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { func Test_RCV1P_Ubuntu2404(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Ubuntu 24.04 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Ubuntu 24.04 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -377,9 +321,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { func Test_RCV1P_AzureLinuxV3(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Azure Linux V3 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: 
config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Azure Linux V3 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -403,9 +345,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { func Test_RCV1P_Flatcar(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Flatcar with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Flatcar with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -429,9 +369,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { func Test_RCV1P_ACL(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on ACL with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on ACL with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -461,11 +399,8 @@ func Test_RCV1P_ACL(t *testing.T) { // subscription feature alone is not sufficient — the VM must also be explicitly tagged. func Test_RCV1P_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) - skipNotOptedInOnAutoDetect(t) RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", Tags: Tags{ RCV1PCertMode: true, }, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 3892821e2d3..a996d5625f7 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -4,7 +4,7 @@ // Import-Certificate. A scheduled task (aks-ca-certs-refresh-task) is registered to // periodically refresh the certificates. 
// -// These tests run against the same RCV1P subscription and require the same VM opt-in tag +// These tests run against the E2E subscription and require the same VM opt-in tag // as the Linux tests (see scenario_rcv1p_test.go for details on the two-layer access control). package e2e @@ -23,9 +23,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { skipIfRCV1PNotConfigured(t) cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -47,9 +45,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { skipIfRCV1PNotConfigured(t) cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -75,9 +71,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows Server 2022 Gen2 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Windows Server 2022 Gen2 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -99,9 +93,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships 
in published CSE package RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows Server 2025 Gen2 with VM opt-in tag", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Windows Server 2025 Gen2 with VM opt-in tag", Tags: Tags{ RCV1PCertMode: true, }, @@ -131,12 +123,9 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { // script correctly skips certificate download and refresh task registration. func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) - skipNotOptedInOnAutoDetect(t) - cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package + cseMutator := rcv1pWindowsCSEMutator(t)// TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ - Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", - AzureClient: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, + Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", Tags: Tags{ RCV1PCertMode: true, }, diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index d7ad6e70720..df6d5ad2d4d 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -212,24 +212,10 @@ func runScenario(t testing.TB, s *Scenario) error { ctx := newTestCtx(t) maybeSkipScenario(ctx, t, s) - if s.AzureClient != nil { - // RCV1P scenario: ensure RG and identity in the RCV1P subscription - _, err := CachedRCV1PEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedRCV1PCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) - // Also ensure default subscription infra (RG + identity + blob storage) is provisioned, - // since Windows log extraction on failure uploads to the default subscription's blob storage. 
- _, err = CachedEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) - } else { - _, err := CachedEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) - } + _, err := CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) s.T = t ctrruntimelog.SetLogger(zap.New()) @@ -289,11 +275,6 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { nbc, err := getBaseNBC(ctx, s.T, s.Runtime.Cluster, s.VHD) require.NoError(s.T, err) - // Override subscription ID for RCV1P scenarios - if s.SubscriptionID != "" { - nbc.SubscriptionID = s.SubscriptionID - } - if !config.Config.DisableScriptless { nbc.EnableScriptlessCSECmd = true } diff --git a/e2e/types.go b/e2e/types.go index 5f20af3148a..96c0e2faac2 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -36,18 +36,6 @@ var DefaultClusterInfra = &ClusterInfra{ ResourceGroupName: config.ResourceGroupName, } -// RCV1PClusterInfra returns the ClusterInfra for the RCV1P subscription, or nil if not configured. -func RCV1PClusterInfra() *ClusterInfra { - if config.RCV1PAzure == nil { - return nil - } - return &ClusterInfra{ - Azure: config.RCV1PAzure, - SubscriptionID: config.Config.RCV1PSubscriptionID, - ResourceGroupName: config.RCV1PResourceGroupName, - } -} - type Tags struct { Name string ImageName string @@ -528,16 +516,10 @@ func (s *Scenario) GetSubscriptionID() string { // GetResourceGroupName returns the resource group name for this scenario's location. 
func (s *Scenario) GetResourceGroupName() string { - if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { - return config.RCV1PResourceGroupName(s.Location) - } return config.ResourceGroupName(s.Location) } // GetVMIdentityResourceID returns the VM identity resource ID for this scenario. func (s *Scenario) GetVMIdentityResourceID() string { - if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { - return config.Config.RCV1PVMIdentityResourceID(s.Location) - } return config.Config.VMIdentityResourceID(s.Location) } From cadcff869fc59524c6c8e5c26f098a46b3f042b5 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 3 Jun 2026 23:45:20 -0700 Subject: [PATCH 074/103] init-aks-custom-cloud: add telemetry events for cert provisioning Add logs_to_events() and emit_event() functions to init-aks-custom-cloud.sh for off-node observability of RCV1P cert provisioning events. The Geneva agent picks up JSON events from the CustomScript events directory and ships them to Kusto for querying. Events emitted: certEndpointMode, optedIn/notOptedIn status, certCount, and wrapped operations for opt-in check, cert retrieval, and trust store installation. Existing log lines are preserved unchanged for E2E pipeline validators. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../artifacts/init-aks-custom-cloud.sh | 85 +++++++++++++++++-- 1 file changed, 80 insertions(+), 5 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index a492f4a3fad..68fcd66dd1a 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -1,6 +1,73 @@ #!/bin/bash set -x +# GA events directory — Azure Guest Agent monitors this directory and forwards +# JSON event files to Geneva/Kusto for off-node telemetry. 
+EVENTS_LOGGING_DIR="/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/" + +# Lightweight logs_to_events for telemetry — wraps a command, records timing, +# and writes a JSON event file that GA picks up and ships to Kusto. +# Does NOT suppress stdout/stderr — existing log lines are preserved. +logs_to_events() { + local task=$1; shift + local eventsFileName + eventsFileName=$(date +%s%3N) + + local startTime + startTime=$(date +"%F %T.%3N") + "${@}" + local ret=$? + local endTime + endTime=$(date +"%F %T.%3N") + + local json_string + json_string=$(jq -n \ + --arg Timestamp "${startTime}" \ + --arg OperationId "${endTime}" \ + --arg Version "1.23" \ + --arg TaskName "${task}" \ + --arg EventLevel "Informational" \ + --arg Message "Completed: $*" \ + --arg EventPid "0" \ + --arg EventTid "0" \ + '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}' + ) + + mkdir -p "${EVENTS_LOGGING_DIR}" + echo "${json_string}" > "${EVENTS_LOGGING_DIR}${eventsFileName}.json" + + if [ "$ret" -ne 0 ]; then + return $ret + fi +} + +# Emit a custom telemetry event with a specific message (not wrapping a command). 
+emit_event() { + local task=$1 + local message=$2 + local level=${3:-Informational} + local eventsFileName + eventsFileName=$(date +%s%3N) + local timestamp + timestamp=$(date +"%F %T.%3N") + + local json_string + json_string=$(jq -n \ + --arg Timestamp "${timestamp}" \ + --arg OperationId "${timestamp}" \ + --arg Version "1.23" \ + --arg TaskName "${task}" \ + --arg EventLevel "${level}" \ + --arg Message "${message}" \ + --arg EventPid "0" \ + --arg EventTid "0" \ + '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}' + ) + + mkdir -p "${EVENTS_LOGGING_DIR}" + echo "${json_string}" > "${EVENTS_LOGGING_DIR}${eventsFileName}.json" +} + IS_FLATCAR=0 IS_UBUNTU=0 IS_ACL=0 @@ -243,19 +310,20 @@ case "$location_normalized" in esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +emit_event "AKS.CSE.rcv1p.certEndpointMode" "mode=${cert_endpoint_mode}, location=${location_normalized}" install_ca_refresh_schedule=0 mkdir -p /root/AzureCACertificates rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then install_ca_refresh_schedule=1 - if retrieve_legacy_certs; then - install_certs_to_trust_store + if logs_to_events "AKS.CSE.rcv1p.retrieveLegacyCerts" retrieve_legacy_certs; then + logs_to_events "AKS.CSE.rcv1p.installCertsToTrustStore" install_certs_to_trust_store else echo "ERROR: failed to retrieve legacy certificates from wireserver after retries" exit 1 fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then - is_opted_in_for_root_certs + logs_to_events "AKS.CSE.rcv1p.isOptedIn" is_opted_in_for_root_certs opt_in_result=$? if [ $opt_in_result -eq 2 ]; then # Fatal: wireserver was unreachable after retries. 
We cannot determine whether @@ -263,15 +331,22 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then # falling back to the distro trust store would be a security hole if the # customer intended hardened certs, so we fail hard here. echo "ERROR: cannot provision node — wireserver unreachable for cert opt-in check" + emit_event "AKS.CSE.rcv1p.optInCheckFailed" "wireserver unreachable after retries" "Error" exit 1 elif [ $opt_in_result -eq 0 ]; then install_ca_refresh_schedule=1 - if retrieve_rcv1p_certs; then - install_certs_to_trust_store + emit_event "AKS.CSE.rcv1p.optedIn" "IsOptedInForRootCerts=true" + if logs_to_events "AKS.CSE.rcv1p.retrieveCerts" retrieve_rcv1p_certs; then + cert_count=$(find /root/AzureCACertificates -name '*.crt' 2>/dev/null | wc -l) + emit_event "AKS.CSE.rcv1p.certCount" "downloaded ${cert_count} certificates" + logs_to_events "AKS.CSE.rcv1p.installCertsToTrustStore" install_certs_to_trust_store else echo "ERROR: failed to retrieve rcv1p certificates from wireserver after retries" + emit_event "AKS.CSE.rcv1p.retrieveCertsFailed" "failed to retrieve rcv1p certificates" "Error" exit 1 fi + else + emit_event "AKS.CSE.rcv1p.notOptedIn" "IsOptedInForRootCerts=false, skipping cert installation" fi fi From 6b31a4299320f265bc1be1b11bc521e4ae678dfe Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 10 Jun 2026 17:18:35 -0700 Subject: [PATCH 075/103] e2e: skip NotOptedIn tests when tags are auto-injected On MSFT tenant, RCV1P opt-in tags are auto-injected on all VMSSes, so NotOptedIn tests (which expect wireserver to return IsOptedInForRootCerts=false) always fail. Add RCV1P_TAGS_AUTO_INJECTED config flag (default true in e2e-template.yaml) to skip these tests on tenants with auto-injection. The RCV1P pipeline (e2e-rcv1p.yaml, TME tenant) sets this to false so NotOptedIn tests run there where tags are not auto-injected. 
Also changes skipIfRCV1PNotConfigured to use t.Skip instead of t.Fatal when the feature flag is not registered, so pipelines on non-RCV1P subscriptions pass cleanly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-rcv1p.yaml | 1 + .pipelines/templates/e2e-template.yaml | 5 +++++ e2e/config/config.go | 1 + e2e/scenario_rcv1p_test.go | 25 ++++++++++++++++++------- e2e/scenario_rcv1p_win_test.go | 1 + 5 files changed, 26 insertions(+), 7 deletions(-) diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml index 927365ea9c8..73e8f14d387 100644 --- a/.pipelines/e2e-rcv1p.yaml +++ b/.pipelines/e2e-rcv1p.yaml @@ -18,3 +18,4 @@ jobs: name: RCV1P Cert Mode Tests IgnoreScenariosWithMissingVhd: false subscriptionOverride: $(RCV1P_SUBSCRIPTION_ID) + rcv1pTagsAutoInjected: "false" diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index fe53fe52a2e..8131c085659 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -14,6 +14,10 @@ parameters: type: string displayName: Subscription ID to use for E2E tests default: $(E2E_SUBSCRIPTION_ID) + - name: rcv1pTagsAutoInjected + type: string + displayName: Whether the platform auto-injects RCV1P opt-in tags on all VMSSes (true on MSFT tenant) + default: "true" jobs: - job: e2e @@ -48,6 +52,7 @@ jobs: DefaultWorkingDirectory: $(Build.SourcesDirectory) VHD_BUILD_ID: $(VHD_BUILD_ID) IGNORE_SCENARIOS_WITH_MISSING_VHD: ${{parameters.IgnoreScenariosWithMissingVhd}} + RCV1P_TAGS_AUTO_INJECTED: ${{parameters.rcv1pTagsAutoInjected}} - task: PublishTestResults@2 displayName: Upload test results diff --git a/e2e/config/config.go b/e2e/config/config.go index 2c6e7eb012e..de200dc66df 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -73,6 +73,7 @@ type Configuration struct { GallerySubscriptionIDWindows string `env:"GALLERY_SUBSCRIPTION_ID" envDefault:"c4c3550e-a965-4993-a50c-628fd38cd3e1"` 
IgnoreScenariosWithMissingVHD bool `env:"IGNORE_SCENARIOS_WITH_MISSING_VHD"` KeepVMSS bool `env:"KEEP_VMSS"` + RCV1PTagsAutoInjected bool `env:"RCV1P_TAGS_AUTO_INJECTED"` NetworkIsolatedNSGName string `env:"NETWORK_ISOLATED_NSG_NAME" envDefault:"abe2e-networkisolated-securityGroup"` SIGVersionTagName string `env:"SIG_VERSION_TAG_NAME" envDefault:"branch"` SIGVersionTagValue string `env:"SIG_VERSION_TAG_VALUE" envDefault:"refs/heads/main"` diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 16516990bf9..4c2965148a8 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -55,6 +55,16 @@ func skipIfRCV1PNotConfigured(t *testing.T) { checkPlatformSettingsOverrideFeatureFlag(t, subID, config.Azure, true) } +// skipIfRCV1PTagsAutoInjected skips the test when the platform auto-injects +// opt-in tags on all VMSSes (e.g. MSFT tenant). NotOptedIn tests can only +// produce meaningful results when tags are NOT auto-injected (e.g. TME tenant). +func skipIfRCV1PTagsAutoInjected(t *testing.T) { + t.Helper() + if config.Config.RCV1PTagsAutoInjected { + t.Skip("RCV1P_TAGS_AUTO_INJECTED is true; NotOptedIn tests require a tenant that does not auto-inject tags") + } +} + var ( featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult ) @@ -66,9 +76,9 @@ type featureFlagResult struct { } // checkPlatformSettingsOverrideFeatureFlag checks the Microsoft.Compute/PlatformSettingsOverride -// feature flag on the given subscription. When failIfMissing is true (RCV1P tests), the test -// fails if the flag is not registered. When false (diagnostics), it only logs the result. -func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string, client *config.AzureClient, failIfMissing bool) { +// feature flag on the given subscription. When skipIfMissing is true (RCV1P tests), the test +// is skipped if the flag is not registered. When false (diagnostics), it only logs the result. 
+func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string, client *config.AzureClient, skipIfMissing bool) { t.Helper() val, _ := featureFlagChecks.LoadOrStore(subscriptionID, &featureFlagResult{}) result := val.(*featureFlagResult) @@ -78,15 +88,15 @@ func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID strin if result.err != nil { t.Logf("PlatformSettingsOverride feature flag check on subscription %s: error: %v", subscriptionID, result.err) - if failIfMissing { - t.Fatalf("RCV1P feature flag check failed: %v", result.err) + if skipIfMissing { + t.Skipf("RCV1P feature flag check failed: %v", result.err) } return } t.Logf("PlatformSettingsOverride feature flag on subscription %s: registered=%v", subscriptionID, result.registered) - if failIfMissing && !result.registered { - t.Fatalf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s; "+ + if skipIfMissing && !result.registered { + t.Skipf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s; "+ "wireserver will not serve root certificates without this feature flag", subscriptionID) } } @@ -399,6 +409,7 @@ func Test_RCV1P_ACL(t *testing.T) { // subscription feature alone is not sufficient — the VM must also be explicitly tagged. func Test_RCV1P_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) + skipIfRCV1PTagsAutoInjected(t) RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", Tags: Tags{ diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index a996d5625f7..cd8989baabb 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -123,6 +123,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { // script correctly skips certificate download and refresh task registration. 
func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) + skipIfRCV1PTagsAutoInjected(t) cseMutator := rcv1pWindowsCSEMutator(t)// TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", From a5139fa02520cd3c45c24c2d62eea246cba92a5c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 15 Jun 2026 09:17:17 -0700 Subject: [PATCH 076/103] e2e: use ab-e2e-tme-rcv1p variable group for RCV1P pipeline Replace subscriptionOverride with a dedicated variable group that has E2E_SUBSCRIPTION_ID set to the RCV1P-enabled TME subscription. This removes the dependency on the now-removed RCV1P_SUBSCRIPTION_ID variable. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-rcv1p.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml index 73e8f14d387..6d8fb88e7cc 100644 --- a/.pipelines/e2e-rcv1p.yaml +++ b/.pipelines/e2e-rcv1p.yaml @@ -17,5 +17,5 @@ jobs: parameters: name: RCV1P Cert Mode Tests IgnoreScenariosWithMissingVhd: false - subscriptionOverride: $(RCV1P_SUBSCRIPTION_ID) + variableGroup: ab-e2e-tme-rcv1p rcv1pTagsAutoInjected: "false" From 308eff91382a6c8e718b3c1cf4a24bc6a713128c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 10:00:29 -0700 Subject: [PATCH 077/103] =?UTF-8?q?fix:=20correct=20typo=20'usuable'=20?= =?UTF-8?q?=E2=86=92=20'usable'=20in=20chrony=20comment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh 
index 9f5eae0119a..568df51f92b 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -304,7 +304,7 @@ fi cat > $chrony_conf < Date: Wed, 17 Jun 2026 12:40:31 -0700 Subject: [PATCH 078/103] fix: remove duplicate Register-NodeResetScriptTask call in BasePrep The task must only be registered after temp kubeconfig removal (line 595) to avoid a TLS bootstrap race. The earlier call in BasePrep was redundant and reintroduced that race condition. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/windows/kuberneteswindowssetup.ps1 | 1 - 1 file changed, 1 deletion(-) diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index 6594066c558..899630d5039 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -486,7 +486,6 @@ function BasePrep { PREPROVISION_EXTENSION Adjust-DynamicPortRange Register-LogsCleanupScriptTask - Register-NodeResetScriptTask # Guard against older CSE packages that do not yet export Should-InstallCACertificatesRefreshTask. # If the function is absent (old package), fall back to the previous unconditional behaviour so # that legacy/ussec/usnat clusters continue to register the refresh task. 
From 46111f55ece1c60151a32fc7a53d2d7437115394 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 12:46:25 -0700 Subject: [PATCH 079/103] style: add missing space before inline comment in rcv1p win test Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_win_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index cd8989baabb..e8960fa731f 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -124,7 +124,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) skipIfRCV1PTagsAutoInjected(t) - cseMutator := rcv1pWindowsCSEMutator(t)// TODO(rcv1p): remove once RCV1P ships in published CSE package + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", Tags: Tags{ From 7b8b1743252136d48c4dc0604a1a198265c8f60c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 15:09:19 -0700 Subject: [PATCH 080/103] fix: fail hard if legacy CA cert trust store install fails Previously install_certs_to_trust_store failure was silently ignored, allowing provisioning to continue without proper CA configuration. Now exits with error to match the fatal-after-retries intent. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 68fcd66dd1a..811a8075a52 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -317,7 +317,10 @@ rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then install_ca_refresh_schedule=1 if logs_to_events "AKS.CSE.rcv1p.retrieveLegacyCerts" retrieve_legacy_certs; then - logs_to_events "AKS.CSE.rcv1p.installCertsToTrustStore" install_certs_to_trust_store + logs_to_events "AKS.CSE.rcv1p.installCertsToTrustStore" install_certs_to_trust_store || { + echo "ERROR: failed to install legacy CA certificates into trust store" >&2 + exit 1 + } else echo "ERROR: failed to retrieve legacy certificates from wireserver after retries" exit 1 From a12a0d86f6aba90428ef0ef125e013fcbbffeea2 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 19:40:05 -0700 Subject: [PATCH 081/103] fix: remove confusing '(true on MSFT tenant)' from displayName The auto-injection is determined by subscription enrollment, not tenant. Removing the parenthetical avoids confusion. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/templates/e2e-template.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index 8131c085659..bfeebd2f88e 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -16,7 +16,7 @@ parameters: default: $(E2E_SUBSCRIPTION_ID) - name: rcv1pTagsAutoInjected type: string - displayName: Whether the platform auto-injects RCV1P opt-in tags on all VMSSes (true on MSFT tenant) + displayName: Whether the platform auto-injects RCV1P opt-in tags on all VMSSes default: "true" jobs: From 8e1167c4b5b6fe41b9064427371fe49a0ce0eaa7 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:14:03 -0700 Subject: [PATCH 082/103] docs: add comment explaining rcv1pTagsAutoInjected=false Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-rcv1p.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml index 6d8fb88e7cc..8cd8759f067 100644 --- a/.pipelines/e2e-rcv1p.yaml +++ b/.pipelines/e2e-rcv1p.yaml @@ -18,4 +18,6 @@ jobs: name: RCV1P Cert Mode Tests IgnoreScenariosWithMissingVhd: false variableGroup: ab-e2e-tme-rcv1p + # The RCV1P testing subscription does not have platform auto-injection enabled, + # so the E2E framework explicitly injects opt-in tags on each VMSS. rcv1pTagsAutoInjected: "false" From f4965bb35370f8f9b245066023f7ea716feb5a9f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:15:23 -0700 Subject: [PATCH 083/103] fix: remove redundant cron schedule from e2e-rcv1p pipeline Daily orchestration in aks-rp already runs these tests. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-rcv1p.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml index 8cd8759f067..9d77c2ff2ae 100644 --- a/.pipelines/e2e-rcv1p.yaml +++ b/.pipelines/e2e-rcv1p.yaml @@ -3,13 +3,6 @@ variables: TAGS_TO_RUN: "rcv1pcertmode=true" SKIP_E2E_TESTS: false E2E_GO_TEST_TIMEOUT: "75m" -schedules: - - cron: "0 11 * * *" - displayName: Daily 3am PST - branches: - include: - - main - always: true trigger: none pr: none jobs: From 708ca07478f71f95212e03542565d74e547dfd52 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:19:57 -0700 Subject: [PATCH 084/103] fix: reword skip message to reference environment, not tenant The auto-inject behavior is controlled by the E2E config flag, not the tenant itself. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 4c2965148a8..ba68b69ff64 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -61,7 +61,7 @@ func skipIfRCV1PNotConfigured(t *testing.T) { func skipIfRCV1PTagsAutoInjected(t *testing.T) { t.Helper() if config.Config.RCV1PTagsAutoInjected { - t.Skip("RCV1P_TAGS_AUTO_INJECTED is true; NotOptedIn tests require a tenant that does not auto-inject tags") + t.Skip("RCV1P_TAGS_AUTO_INJECTED is true; NotOptedIn tests require an environment where the platform does not auto-inject tags") } } From 1fd6fefd874211de3b80999fed42fe167cf0d910 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:28:54 -0700 Subject: [PATCH 085/103] fix: parse AFEC feature flag response as JSON instead of string contains Properly unmarshal the ARM response and check properties.state with case-insensitive comparison instead of brittle string matching. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_test.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index ba68b69ff64..fda1e6ec805 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -21,6 +21,7 @@ package e2e import ( "archive/zip" "context" + "encoding/json" "fmt" "io" "os" @@ -119,13 +120,21 @@ func queryFeatureFlag(ctx context.Context, subscriptionID string, client *config defer resp.Body.Close() body, _ := io.ReadAll(resp.Body) - bodyStr := string(body) if resp.StatusCode != 200 { - return false, fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) + return false, fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, string(body)) } - return strings.Contains(bodyStr, `"Registered"`), nil + var result struct { + Properties struct { + State string `json:"state"` + } `json:"properties"` + } + if err := json.Unmarshal(body, &result); err != nil { + return false, fmt.Errorf("failed to parse feature flag response: %w", err) + } + + return strings.EqualFold(result.Properties.State, "Registered"), nil } // rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. From 54b48f30e25d09defa7d84b5ea8707e956bd37a0 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:36:20 -0700 Subject: [PATCH 086/103] docs: clarify deferred extension pattern is E2E-specific In production, AKS RP handles tag application and CSE sequencing through CRP. The deferred extension is only needed in E2E because Uniform VMSS requires BeginUpdate for instance-level tags. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/vmss.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 2d28d4aacf2..abf8d017fa2 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -538,11 +538,12 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc rcv1pTagKey := "platformsettings.host_environment.service.platform_optedin_for_rootcerts" _, requestedRCV1PTag := model.Tags[rcv1pTagKey] - // For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags + // E2E-specific: For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags // before CSE runs because wireserver checks per-VM-instance tags. The only // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. // To avoid the race, we strip the CSE extension before creation, apply tags // via BeginUpdate, then re-add the extension in a second update. + // In production, AKS RP handles tag application and CSE sequencing through CRP. 
var deferredExtensionProfile *armcompute.VirtualMachineScaleSetExtensionProfile if len(s.Config.VMInstanceTags) > 0 && model.Properties.VirtualMachineProfile.ExtensionProfile != nil { deferredExtensionProfile = model.Properties.VirtualMachineProfile.ExtensionProfile From dc3455a2115ac0d50c591c8133e16df593ef8a90 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:40:32 -0700 Subject: [PATCH 087/103] fix: remove vmssResp2 code smell, assign directly to vmssResp Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/vmss.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index abf8d017fa2..82e9ff183ec 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -603,11 +603,10 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc if err != nil { return vm, fmt.Errorf("failed to begin adding CSE extension: %w", err) } - vmssResp2, err := cseOp.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + vmssResp, err = cseOp.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) if err != nil { return vm, fmt.Errorf("failed to add CSE extension: %w", err) } - vmssResp = vmssResp2 } } From 01a4590474ff78318c152f2ff4ce45936b3cdf3e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 17 Jun 2026 21:54:19 -0700 Subject: [PATCH 088/103] fix: add logs_to_events telemetry for chrony restart Emit guest agent telemetry when restarting chrony in custom cloud environments for better observability. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh index 568df51f92b..6339c8fdd8c 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -351,9 +351,9 @@ makestep 1.0 -1 EOF if [ "$IS_UBUNTU" -eq 1 ]; then - systemctl restart chrony + logs_to_events "AKS.CSE.customCloud.restartChrony" systemctl restart chrony elif [ "$IS_FLATCAR" -eq 1 ]; then - systemctl restart chronyd + logs_to_events "AKS.CSE.customCloud.restartChrony" systemctl restart chronyd fi fi From d36c6a4214d7e3e822a00a4503a9e0d55cd8f7de Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 18 Jun 2026 13:17:10 -0700 Subject: [PATCH 089/103] fix: guard initAKSCustomCloudRepos with IsAKSCustomCloud in nodecustomdata The initAKSCustomCloudRepos write_files entry was placed unconditionally in the non-scriptless branch, but variables.go only populates it when IsAKSCustomCloud() is true. With missingkey=zero, non-custom-cloud nodes would get a 0-byte file written unnecessarily. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/linux/cloud-init/nodecustomdata.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parts/linux/cloud-init/nodecustomdata.yml b/parts/linux/cloud-init/nodecustomdata.yml index 31f20a0c544..0cd73a6248e 100644 --- a/parts/linux/cloud-init/nodecustomdata.yml +++ b/parts/linux/cloud-init/nodecustomdata.yml @@ -182,6 +182,7 @@ write_files: content: !!binary | {{GetVariableProperty "cloudInitData" "azureNetworkUdevRule"}} +{{if IsAKSCustomCloud}} - path: {{GetInitAKSCustomCloudReposFilepath}} permissions: "0744" encoding: gzip @@ -189,3 +190,4 @@ write_files: content: !!binary | {{GetVariableProperty "cloudInitData" "initAKSCustomCloudRepos"}} {{- end }} +{{- end }} From 9e34e6fbce0cc304698077c95f5d1be12d972efc Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 24 Jun 2026 21:51:42 -0700 Subject: [PATCH 090/103] fix(e2e): expose subscriptionId parameter on e2e-tme.yaml The aks-rp orchestrator queues e2e-tme.yaml with --subscription-id, but the top-level pipeline did not declare a subscriptionId parameter, causing ADO to reject the queue request with a validation error. PR #8747 added the parameter to the inner template (e2e-template.yaml) but ADO validates parameters against the top-level YAML before expanding templates. Forward the parameter through to the template so orchestrated RCV1P runs can pass through the RCV1P subscription ID. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-tme.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.pipelines/e2e-tme.yaml b/.pipelines/e2e-tme.yaml index 16bea2e07bb..97a6d742e69 100644 --- a/.pipelines/e2e-tme.yaml +++ b/.pipelines/e2e-tme.yaml @@ -1,4 +1,9 @@ name: $(Date:yyyyMMdd)$(Rev:.r) +parameters: + - name: subscriptionId + type: string + displayName: Subscription ID to use for E2E tests + default: $(E2E_SUBSCRIPTION_ID) variables: SKIP_E2E_TESTS: false @@ -8,4 +13,5 @@ jobs: name: Linux Tests IgnoreScenariosWithMissingVhd: false variableGroup: ab-e2e-tme + subscriptionId: ${{ parameters.subscriptionId }} From 260f3c3514c9f2e06779b5cf9da3f3df790275ec Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 24 Jun 2026 22:05:39 -0700 Subject: [PATCH 091/103] fix(init-aks-custom-cloud): use fixed-string grep for crontab cleanup The crontab filter used 'grep -v "$scriptPath" ca-refresh' which treats $scriptPath as a BRE regex. Path metacharacters like '.' would match any character, risking removal of unrelated crontab entries. Switch to 'grep -F -v' for literal-string matching. Eliminates the regex-injection surface even if the script path ever changes to include shell/regex metacharacters. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 811a8075a52..34dccc83858 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -383,7 +383,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 # wrong for ussec/usnat legacy environments. 
new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" existing=$(crontab -l 2>/dev/null || true) - filtered=$(printf '%s\n' "$existing" | grep -v "\"$scriptPath\" ca-refresh" || true) + filtered=$(printf '%s\n' "$existing" | grep -F -v "\"$scriptPath\" ca-refresh" || true) if ! (printf '%s\n' "$filtered"; printf '%s\n' "$new_entry") | sed '/^$/d' | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi From 0c16a4d9dff7ca955ed52078b637375234da137c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 24 Jun 2026 22:46:43 -0700 Subject: [PATCH 092/103] refactor(e2e): remove dead per-scenario SubscriptionID override Commit 4353cd4d5d ("e2e: simplify RCV1P to single-subscription-per-job model") gutted the dual-subscription override branches in GetResourceGroupName and GetVMIdentityResourceID, but left Scenario.SubscriptionID and GetSubscriptionID() as vestigial code. The field is set nowhere; every call to GetSubscriptionID() already resolves to config.Config.SubscriptionID. Worse, GetVMIdentityResourceID never honored the override even when the field was active, creating a latent footgun if anyone repopulated the field: VMSS identity would resolve in the wrong subscription and ARM creation would fail. Remove the half-honored surface entirely, aligning the code with the single-subscription-per-job architecture: - Drop Scenario.SubscriptionID field and GetSubscriptionID() method - Drop GetVMIdentityResourceID() (one-line wrapper, used once) - Replace s.GetSubscriptionID() with config.Config.SubscriptionID at all 4 call sites in vmss.go and 1 in test_helpers.go - Replace s.GetVMIdentityResourceID() with config.Config.VMIdentityResourceID(s.Location) in vmss.go Behavior is identical (all call sites already resolved to the same value). go vet and go build pass clean. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/test_helpers.go | 2 +- e2e/types.go | 17 ----------------- e2e/vmss.go | 12 ++++++------ 3 files changed, 7 insertions(+), 24 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index df6d5ad2d4d..9ee8851b706 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -926,7 +926,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str customVHD := *s.Config.VHD customVHD.Name = *image.Name // Use the architecture-specific image name customVHD.Gallery = &config.Gallery{ - SubscriptionID: s.GetSubscriptionID(), + SubscriptionID: config.Config.SubscriptionID, ResourceGroupName: rg, Name: *gallery.Name, } diff --git a/e2e/types.go b/e2e/types.go index 96c0e2faac2..dcbf1b5922f 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -167,10 +167,6 @@ type Scenario struct { // When nil, config.Azure is used. AzureClient *config.AzureClient - // SubscriptionID overrides the default config.Config.SubscriptionID for this scenario. - // When empty, config.Config.SubscriptionID is used. - SubscriptionID string - // Runtime contains the runtime state of the scenario. It's populated in the beginning of the test run Runtime *ScenarioRuntime T testing.TB @@ -506,20 +502,7 @@ func (s *Scenario) GetAzure() *config.AzureClient { return config.Azure } -// GetSubscriptionID returns the subscription ID for this scenario, falling back to config.Config.SubscriptionID. -func (s *Scenario) GetSubscriptionID() string { - if s.SubscriptionID != "" { - return s.SubscriptionID - } - return config.Config.SubscriptionID -} - // GetResourceGroupName returns the resource group name for this scenario's location. func (s *Scenario) GetResourceGroupName() string { return config.ResourceGroupName(s.Location) } - -// GetVMIdentityResourceID returns the VM identity resource ID for this scenario. 
-func (s *Scenario) GetVMIdentityResourceID() string { - return config.Config.VMIdentityResourceID(s.Location) -} diff --git a/e2e/vmss.go b/e2e/vmss.go index 82e9ff183ec..7adbb766fbf 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -434,13 +434,13 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine if config.Config.IsLocalBuild() { s.T.Logf( "VMSS portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/overview", - s.GetSubscriptionID(), + config.Config.SubscriptionID, *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, ) s.T.Logf( "Managed cluster portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s/overview", - s.GetSubscriptionID(), + config.Config.SubscriptionID, *cluster.Model.Properties.NodeResourceGroup, *cluster.Model.Name, ) @@ -452,8 +452,8 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine model.Identity = &armcompute.VirtualMachineScaleSetIdentity{ Type: to.Ptr(armcompute.ResourceIdentityTypeSystemAssignedUserAssigned), UserAssignedIdentities: map[string]*armcompute.UserAssignedIdentitiesValue{ - *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, - s.GetVMIdentityResourceID(): {}, + *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, + config.Config.VMIdentityResourceID(s.Location): {}, }, } @@ -656,7 +656,7 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc } s.T.Logf("WARNING: platform auto-injected RCV1P opt-in tag %q=%s on VMSS — "+ "PlatformSettingsOverride feature flag may be causing auto-injection on subscription %s", - rcv1pTagKey, val, s.GetSubscriptionID()) + rcv1pTagKey, val, config.Config.SubscriptionID) if s.Tags.RCV1PCertMode && strings.EqualFold(val, "true") { s.T.Logf("WARNING: auto-injected tag value is 
'true' — negative (NotOptedIn) tests will be "+ "INVALID on this subscription because wireserver will serve certificates regardless of our intent") @@ -1568,7 +1568,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual ID: to.Ptr( fmt.Sprintf( loadBalancerBackendAddressPoolIDTemplate, - s.GetSubscriptionID(), + config.Config.SubscriptionID, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, ), ), From 9e172c83b1a369a3f635d038cfd2b71fe37e0d8d Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 24 Jun 2026 23:10:22 -0700 Subject: [PATCH 093/103] test(parts): add ShellSpec coverage for init-aks-custom-cloud-repos.sh Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../init_aks_custom_cloud_repos_spec.sh | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_repos_spec.sh diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_repos_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_repos_spec.sh new file mode 100644 index 00000000000..af785013205 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_repos_spec.sh @@ -0,0 +1,206 @@ +#!/bin/bash + +Describe 'init-aks-custom-cloud-repos.sh repo depot + chrony wiring' + script_path='./parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh' + + Describe 'function definitions' + Parameters + init_ubuntu_main_repo_depot + init_ubuntu_pmc_repo_depot + init_mariner_repo_depot + init_azurelinux_repo_depot + check_url + write_to_sources_file + add_key_ubuntu + add_ms_keys + derive_key_paths + aptget_update + dnf_makecache + End + + It "defines function $1" + When run grep -Eq "^function $1 \\{$" "$script_path" + The status should eq 0 + End + End + + Describe 'distro branching ladder' + It 'routes Ubuntu to Ubuntu repo init branch' + When run grep -Eq '^if \[ "\$IS_UBUNTU" -eq 1 \]; then$' "$script_path" + The 
status should eq 0 + End + + It 'routes Mariner or AzureLinux to the rpm-based repo init branch' + When run grep -Eq '^elif \[ "\$IS_MARINER" -eq 1 \] \|\| \[ "\$IS_AZURELINUX" -eq 1 \]; then$' "$script_path" + The status should eq 0 + End + + It 'invokes Mariner-specific init only when IS_MARINER=1' + When run grep -Eq '^[[:space:]]*if \[ "\$IS_MARINER" -eq 1 \]; then$' "$script_path" + The status should eq 0 + End + + It 'falls back to AzureLinux init in the else branch' + When run grep -Eq 'init_azurelinux_repo_depot \$\{marinerRepoDepotEndpoint\}' "$script_path" + The status should eq 0 + End + End + + Describe 'Ubuntu apt sources rewrite' + It 'syncs OpenSSL bundle from system trust store' + When run grep -Fq 'cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem' "$script_path" + The status should eq 0 + End + + It 'backs up existing /etc/apt/sources.list before rewrite' + When run grep -Fq 'mv /etc/apt/sources.list /etc/apt/backup/' "$script_path" + The status should eq 0 + End + + It 'writes the new ubuntu.sources file under sources.list.d' + When run grep -Fq '/etc/apt/sources.list.d/ubuntu.sources' "$script_path" + The status should eq 0 + End + + It 'rewrites all http(s) URLs in apt sources to the RepoDepot URL' + When run grep -Fq 'sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile' "$script_path" + The status should eq 0 + End + End + + Describe 'parameter passing — add_key_ubuntu / add_ms_keys take repodepot_endpoint explicitly' + # Guards against regression of fix(ef6...) where these were sourced from + # an outer-scope variable instead of being passed as an argument. 
+ It 'add_key_ubuntu declares repodepot_endpoint as first positional arg' + When run grep -Eq '^ local repodepot_endpoint="\$1"$' "$script_path" + The status should eq 0 + End + + It 'add_ms_keys forwards repodepot_endpoint to add_key_ubuntu' + When run grep -Fq 'add_key_ubuntu "$repodepot_endpoint" microsoft.asc' "$script_path" + The status should eq 0 + End + End + + Describe 'Mariner rpm repo creation' + It 'creates mariner-extended.repo from mariner-extras.repo' + When run grep -Fq 'cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo' "$script_path" + The status should eq 0 + End + + It 'creates mariner-nvidia.repo from mariner-extras.repo' + When run grep -Fq 'cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo' "$script_path" + The status should eq 0 + End + + It 'creates mariner-cloud-native.repo from mariner-extras.repo' + When run grep -Fq 'cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo' "$script_path" + The status should eq 0 + End + + It 'redirects packages.microsoft.com URLs to RepoDepot for all .repo files' + When run grep -Eq 'sed -i -e "s\|https://packages\.microsoft\.com\|\$\{repodepot_endpoint\}/mariner/packages\.microsoft\.com\|"' "$script_path" + The status should eq 0 + End + End + + Describe 'AzureLinux tdnf repo creation' + It 'removes pre-existing azurelinux*.repo before writing fresh repos' + When run grep -Fq 'rm -f /etc/yum.repos.d/azurelinux*' "$script_path" + The status should eq 0 + End + + It 'enumerates the full set of AzureLinux repos to create' + When run grep -Fq 'local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia")' "$script_path" + The status should eq 0 + End + + It 'enables gpgcheck on generated repos' + When run grep -Fq '"gpgcheck=1"' "$script_path" + The status should eq 0 + End + + It 'enables repo_gpgcheck on generated repos' + When run grep -Fq '"repo_gpgcheck=1"' "$script_path" + The status should eq 0 + 
End + + It 'pins gpgkey to MICROSOFT-RPM-GPG-KEY' + When run grep -Fq '"gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY"' "$script_path" + The status should eq 0 + End + End + + Describe 'chrony configuration branching' + It 'skips chrony configuration entirely on ACL' + When run grep -Eq '^if \[ "\$IS_ACL" -eq 1 \]; then$' "$script_path" + The status should eq 0 + End + + It 'documents the ACL skip with a logged reason' + When run grep -Fq 'Skipping chrony configuration for ACL' "$script_path" + The status should eq 0 + End + + It 'writes /etc/chrony.conf for Mariner/AzureLinux' + When run grep -Eq '^elif \[ "\$IS_MARINER" -eq 1 \] \|\| \[ "\$IS_AZURELINUX" -eq 1 \]; then$' "$script_path" + The status should eq 0 + End + + It 'configures PTP refclock for Mariner/AzureLinux chrony' + When run grep -Fq 'refclock PHC /dev/ptp0 poll 3 dpoll -2 offset 0' "$script_path" + The status should eq 0 + End + + It 'restarts chronyd on Mariner/AzureLinux branch' + When run grep -Eq '^systemctl restart chronyd$' "$script_path" + The status should eq 0 + End + + It 'targets /etc/chrony/chrony.conf for Ubuntu/Flatcar' + When run grep -Eq '^chrony_conf="/etc/chrony/chrony\.conf"$' "$script_path" + The status should eq 0 + End + + It 'disables systemd-timesyncd on Ubuntu before installing chrony' + When run grep -Eq '^[[:space:]]*systemctl stop systemd-timesyncd$' "$script_path" + The status should eq 0 + End + + It 'installs chrony on Ubuntu when not already present' + When run grep -Eq '^[[:space:]]*apt-get install chrony -y$' "$script_path" + The status should eq 0 + End + + It 'removes the default chrony config on Flatcar to force regeneration' + When run grep -Eq '^[[:space:]]*rm -f \$\{chrony_conf\}$' "$script_path" + The status should eq 0 + End + End + + Describe 'chrony restart telemetry (regression guard for commit f1233050ba)' + It 'wraps Ubuntu chrony restart in logs_to_events' + When run grep -Fq 'logs_to_events "AKS.CSE.customCloud.restartChrony" systemctl restart 
chrony' "$script_path" + The status should eq 0 + End + + It 'wraps Flatcar chronyd restart in logs_to_events' + When run grep -Fq 'logs_to_events "AKS.CSE.customCloud.restartChrony" systemctl restart chronyd' "$script_path" + The status should eq 0 + End + End + + Describe 'sourcing contract — script must be safe to source' + It 'does not call exit at top level (would terminate the sourcing parent)' + # Exits only allowed inside functions (check_url, aptget_update, dnf_makecache) + # or guarded by branch-internal error handling. No bare top-level exit. + When run grep -En '^exit( |$)' "$script_path" + The status should eq 1 + End + + It 'enables shell tracing for diagnostic logs' + When run grep -Eq '^set -x$' "$script_path" + The status should eq 0 + End + End +End From d5108b9aca917d88fd5fcbe298cda6a243716ff7 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 24 Jun 2026 23:23:33 -0700 Subject: [PATCH 094/103] fix(e2e): close file immediately in CSE zip walk to avoid FD accumulation Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_test.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index fda1e6ec805..72f4b136769 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -231,9 +231,15 @@ func buildAndUploadCSEZip(ctx context.Context) (string, error) { if err != nil { return fmt.Errorf("open %s: %w", path, err) } - defer f.Close() - _, err = io.Copy(w, f) - return err + _, copyErr := io.Copy(w, f) + closeErr := f.Close() + if copyErr != nil { + return fmt.Errorf("copy %s: %w", path, copyErr) + } + if closeErr != nil { + return fmt.Errorf("close %s: %w", path, closeErr) + } + return nil }) if err != nil { return "", fmt.Errorf("build zip: %w", err) From db670a2ffdfe9a2e7cc335d378e1da4a3487c592 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 24 Jun 2026 23:24:24 -0700 Subject: [PATCH 095/103] 
test(cse-windows): re-stub Set-ExitCode after dot-sourcing to prevent test runner exit Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- staging/cse/windows/kubernetesfunc.tests.ps1 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8cf053a5d1d..32f5a218a7e 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -58,6 +58,11 @@ BeforeAll { . $helperScriptPath . $scriptUnderTestPath + + function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" + } } Describe 'Get-CustomCloudCertEndpointModeFromLocation' { From 67e564582961fb80824bc968f850ba45669c9dd0 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 09:05:44 -0700 Subject: [PATCH 096/103] fix(e2e): override E2E_SUBSCRIPTION_ID pipeline variable with subscriptionId param Variable group ab-e2e-tme defines E2E_SUBSCRIPTION_ID, which ADO auto-exposes as an env var to all tasks. The previous task-level env: override did not reliably win against the auto-exposed group variable in AzureCLI@2, so the orchestrator's --subscription-id (e.g. RCV1P sub) was ignored and tests ran/skipped against the default TME sub. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/templates/e2e-template.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index bfeebd2f88e..4ebad74981d 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -24,6 +24,11 @@ jobs: condition: and(succeeded(), ne(variables.SKIP_E2E_TESTS, 'true')) variables: - group: ${{parameters.variableGroup}} # all variables prefixed with E2E_* come from this variable group + # Override E2E_SUBSCRIPTION_ID from the variable group with the per-run parameter + # so callers (e.g. aks-rp orchestrator passing --subscription-id) can target a + # different subscription (e.g. RCV1P) than the variable group's default. + - name: E2E_SUBSCRIPTION_ID + value: ${{parameters.subscriptionId}} pool: name: $(E2E_POOL_NAME) timeoutInMinutes: 90 @@ -45,7 +50,6 @@ jobs: bash .pipelines/scripts/e2e_run.sh displayName: Run AgentBaker E2E env: - E2E_SUBSCRIPTION_ID: ${{parameters.subscriptionId}} SYS_SSH_PUBLIC_KEY: $(SYS_SSH_PUBLIC_KEY) SYS_SSH_PRIVATE_KEY_B64: $(SYS_SSH_PRIVATE_KEY_B64) BUILD_SRC_DIR: $(System.DefaultWorkingDirectory) From 2e68f4b23ea4f3d8c6300ca1966b0ad2e2aacbc7 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 09:36:21 -0700 Subject: [PATCH 097/103] fix(rcv1p): fail hard when installing rcv1p CA certs to trust store fails The rcv1p opted-in path previously ignored the exit status of install_certs_to_trust_store, allowing provisioning to continue with an incomplete trust store. This now matches the legacy path's behavior of exiting non-zero on failure. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 34dccc83858..a23b87b6dec 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -342,7 +342,11 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if logs_to_events "AKS.CSE.rcv1p.retrieveCerts" retrieve_rcv1p_certs; then cert_count=$(find /root/AzureCACertificates -name '*.crt' 2>/dev/null | wc -l) emit_event "AKS.CSE.rcv1p.certCount" "downloaded ${cert_count} certificates" - logs_to_events "AKS.CSE.rcv1p.installCertsToTrustStore" install_certs_to_trust_store + logs_to_events "AKS.CSE.rcv1p.installCertsToTrustStore" install_certs_to_trust_store || { + echo "ERROR: failed to install rcv1p CA certificates into trust store" >&2 + emit_event "AKS.CSE.rcv1p.installCertsFailed" "failed to install rcv1p CA certificates" "Error" + exit 1 + } else echo "ERROR: failed to retrieve rcv1p certificates from wireserver after retries" emit_event "AKS.CSE.rcv1p.retrieveCertsFailed" "failed to retrieve rcv1p certificates" "Error" From 892116ad15063876fb7ab536c1873141826ed43b Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 12:18:42 -0700 Subject: [PATCH 098/103] fix(e2e): use empty default for subscriptionId param to avoid cyclical variable reference Previously the subscriptionId parameter defaulted to $(E2E_SUBSCRIPTION_ID) and the template defined a variable E2E_SUBSCRIPTION_ID with value ${{parameters.subscriptionId}}, creating a cycle when no caller explicitly passed subscriptionId. Use an empty-string sentinel default and only override the pipeline variable when the parameter is non-empty. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/e2e-tme.yaml | 4 ++-- .pipelines/templates/e2e-template.yaml | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.pipelines/e2e-tme.yaml b/.pipelines/e2e-tme.yaml index 97a6d742e69..f3367060235 100644 --- a/.pipelines/e2e-tme.yaml +++ b/.pipelines/e2e-tme.yaml @@ -2,8 +2,8 @@ name: $(Date:yyyyMMdd)$(Rev:.r) parameters: - name: subscriptionId type: string - displayName: Subscription ID to use for E2E tests - default: $(E2E_SUBSCRIPTION_ID) + displayName: Subscription ID to use for E2E tests (empty = use variable group default) + default: "" variables: SKIP_E2E_TESTS: false diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index 4ebad74981d..c33ce05bb9d 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -12,8 +12,8 @@ parameters: default: ab-e2e - name: subscriptionId type: string - displayName: Subscription ID to use for E2E tests - default: $(E2E_SUBSCRIPTION_ID) + displayName: Subscription ID to use for E2E tests (empty = use variable group default) + default: "" - name: rcv1pTagsAutoInjected type: string displayName: Whether the platform auto-injects RCV1P opt-in tags on all VMSSes @@ -24,11 +24,12 @@ jobs: condition: and(succeeded(), ne(variables.SKIP_E2E_TESTS, 'true')) variables: - group: ${{parameters.variableGroup}} # all variables prefixed with E2E_* come from this variable group - # Override E2E_SUBSCRIPTION_ID from the variable group with the per-run parameter - # so callers (e.g. aks-rp orchestrator passing --subscription-id) can target a - # different subscription (e.g. RCV1P) than the variable group's default. - - name: E2E_SUBSCRIPTION_ID - value: ${{parameters.subscriptionId}} + # When a caller (e.g. 
aks-rp orchestrator) explicitly passes subscriptionId, + # override E2E_SUBSCRIPTION_ID from the variable group so the run targets the + # requested subscription (e.g. RCV1P). When empty, keep the variable group default. + - ${{ if ne(parameters.subscriptionId, '') }}: + - name: E2E_SUBSCRIPTION_ID + value: ${{parameters.subscriptionId}} pool: name: $(E2E_POOL_NAME) timeoutInMinutes: 90 From 5ddbc1edc3bf2f0ea8707cb7087a2252e1f9c9eb Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 12:40:02 -0700 Subject: [PATCH 099/103] fix(rcv1p): address copilot review comments - init-aks-custom-cloud.sh: sanitize cert_filename via basename to prevent path traversal from wireserver response (defense-in-depth) - init-aks-custom-cloud.sh: quote $certs in retrieve_legacy_certs to avoid word-splitting on JSON payload - scenario_rcv1p_win_test.go / validators.go: update misleading comments that claimed Windows cert store import; current validators only check files in C:\ca Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_rcv1p_win_test.go | 7 ++++--- e2e/validators.go | 4 +++- .../cloud-init/artifacts/init-aks-custom-cloud.sh | 14 ++++++++++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index e8960fa731f..4aafcc82ac9 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -1,8 +1,9 @@ // scenario_rcv1p_win_test.go contains end-to-end tests for the RCV1P cert mode on Windows. // Windows uses a different cert installation path than Linux: certificates are downloaded to -// C:\ca and imported into the Windows certificate store (Cert:\LocalMachine\Root) via -// Import-Certificate. A scheduled task (aks-ca-certs-refresh-task) is registered to -// periodically refresh the certificates. +// C:\ca by the CSE provisioning script. 
A scheduled task (aks-ca-certs-refresh-task) is +// registered to periodically refresh the certificates. These tests validate that the files +// land in C:\ca and the scheduled task is registered; they do not currently validate +// installation into the Windows certificate store (Cert:\LocalMachine\Root). // // These tests run against the E2E subscription and require the same VM opt-in tag // as the Linux tests (see scenario_rcv1p_test.go for details on the two-layer access control). diff --git a/e2e/validators.go b/e2e/validators.go index 94a41c1cc94..61b2f39082e 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -3182,7 +3182,9 @@ func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") - // Validate CA certificates were installed to the Windows certificate store + // Validate CA certificate files exist in C:\ca (the on-disk drop location). + // Note: this does not verify import into the Windows certificate store + // (Cert:\LocalMachine\Root); see scenario_rcv1p_win_test.go header. 
command := []string{ "$ErrorActionPreference = 'Stop'", "$caFolder = 'C:\\ca'", diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index a23b87b6dec..d1598c24dcb 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -205,8 +205,8 @@ function retrieve_legacy_certs { IFS_backup=$IFS IFS=$'\r\n' - cert_names=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) - cert_bodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) + cert_names=($(echo "$certs" | grep -oP '(?<=Name\": \")[^\"]*')) + cert_bodies=($(echo "$certs" | grep -oP '(?<=CertBody\": \")[^\"]*')) for i in ${!cert_bodies[@]}; do echo ${cert_bodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${cert_names[$i]} | sed 's/.cer/.crt/g')" done @@ -237,6 +237,16 @@ function process_cert_operations { for cert_filename in "${cert_filenames[@]}"; do echo "Processing certificate file: $cert_filename" + # Defense-in-depth: sanitize filename to a basename to prevent path traversal + # if wireserver ever returns a value containing path separators. 
+ local sanitized_filename + sanitized_filename=$(basename "$cert_filename") + if [ "$sanitized_filename" != "$cert_filename" ]; then + echo "Warning: rejecting certificate filename with path separators: $cert_filename" + continue + fi + cert_filename="$sanitized_filename" + local filename="${cert_filename%.*}" local extension="${cert_filename##*.}" local cert_content From bba679372e9452c152eba824725e7963f7139344 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 19:26:56 -0700 Subject: [PATCH 100/103] fix(e2e): override RCV1P-incompatible settings when running TME e2e against RCV1P sub When the aks-rp orchestrator routes e2e-tme.yaml to the RCV1P testing subscription via --subscription-id, the variables inherited from variable group ab-e2e-tme are incompatible with the RCV1P sub: - BLOB_STORAGE_ACCOUNT_PREFIX=abe2etme yields globally-taken account name abe2etmewestus3 (already owned by the regular TME E2E sub), causing StorageAccountAlreadyTaken on every Linux RCV1P scenario. - RCV1P_TAGS_AUTO_INJECTED defaults true, but the RCV1P sub does not auto-inject opt-in tags; the framework must stamp them explicitly. - IGNORE_SCENARIOS_WITH_MISSING_VHD=false fails Windows RCV1P scenarios when the Linux orchestrator only published Linux VHDs. Detect the RCV1P sub at runtime in e2e_run.sh and override these three settings so the same e2e-tme.yaml pipeline works for both regular and RCV1P targets without requiring orchestrator changes. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/scripts/e2e_run.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index b6f83df5fab..25a28311b04 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -20,6 +20,29 @@ echo "Using subscription ${E2E_SUBSCRIPTION_ID} for e2e tests" # Map E2E_SUBSCRIPTION_ID to SUBSCRIPTION_ID which the Go test framework reads export SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID}" +# When the orchestrator routes e2e-tme.yaml to the RCV1P testing subscription via +# --subscription-id, override settings that are otherwise tied to the regular TME +# variable group (ab-e2e-tme): +# * BLOB_STORAGE_ACCOUNT_PREFIX must yield a globally-unique storage account name +# (the default "abe2etme" prefix is already taken by another subscription, so +# account creation in the RCV1P sub fails with StorageAccountAlreadyTaken). +# * RCV1P_TAGS_AUTO_INJECTED must be false: the RCV1P testing subscription does +# not have platform auto-injection enabled, so the framework must explicitly +# stamp opt-in tags on each VMSS. +# * IGNORE_SCENARIOS_WITH_MISSING_VHD must be true: the Linux orchestrator only +# publishes Linux VHDs, so Windows RCV1P scenarios should skip (not fail) when +# the matching Windows image is absent from the gallery. 
+RCV1P_SUBSCRIPTION_ID="38d77129-fc18-4f21-9ce1-79dd1fe50fc6" +if [ "${E2E_SUBSCRIPTION_ID}" = "${RCV1P_SUBSCRIPTION_ID}" ]; then + echo "Detected RCV1P testing subscription; applying RCV1P-specific overrides" + export BLOB_STORAGE_ACCOUNT_PREFIX="abe2etmercv1p" + export RCV1P_TAGS_AUTO_INJECTED="false" + export IGNORE_SCENARIOS_WITH_MISSING_VHD="true" + echo " BLOB_STORAGE_ACCOUNT_PREFIX=${BLOB_STORAGE_ACCOUNT_PREFIX}" + echo " RCV1P_TAGS_AUTO_INJECTED=${RCV1P_TAGS_AUTO_INJECTED}" + echo " IGNORE_SCENARIOS_WITH_MISSING_VHD=${IGNORE_SCENARIOS_WITH_MISSING_VHD}" +fi + # Setup go export GOPATH="$(go env GOPATH)" go version From f05669c0753eff7bb7e00ae168fff5fd8adca3ca Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 19:28:40 -0700 Subject: [PATCH 101/103] fix(e2e): detect RCV1P sub via E2E_SUBSCRIPTION_ID_RCV1P instead of hardcoded GUID Replace the hardcoded RCV1P subscription GUID with a comparison against E2E_SUBSCRIPTION_ID_RCV1P sourced from the variable group. This keeps subscription identity out of the script, lets the value rotate without code changes, and makes the override path a no-op for pipelines (e.g. the MSFT-tenant default) that do not define E2E_SUBSCRIPTION_ID_RCV1P. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/scripts/e2e_run.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 25a28311b04..9684f0b6ed0 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -20,9 +20,11 @@ echo "Using subscription ${E2E_SUBSCRIPTION_ID} for e2e tests" # Map E2E_SUBSCRIPTION_ID to SUBSCRIPTION_ID which the Go test framework reads export SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID}" -# When the orchestrator routes e2e-tme.yaml to the RCV1P testing subscription via -# --subscription-id, override settings that are otherwise tied to the regular TME -# variable group (ab-e2e-tme): +# When the aks-rp orchestrator routes e2e-tme.yaml to the RCV1P testing subscription +# via --subscription-id, the variables inherited from variable group ab-e2e-tme are +# incompatible with the RCV1P sub. The variable group exposes E2E_SUBSCRIPTION_ID_RCV1P +# as the canonical ID of the RCV1P testing subscription; if the active subscription +# matches it, apply RCV1P-specific overrides: # * BLOB_STORAGE_ACCOUNT_PREFIX must yield a globally-unique storage account name # (the default "abe2etme" prefix is already taken by another subscription, so # account creation in the RCV1P sub fails with StorageAccountAlreadyTaken). @@ -32,9 +34,11 @@ export SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID}" # * IGNORE_SCENARIOS_WITH_MISSING_VHD must be true: the Linux orchestrator only # publishes Linux VHDs, so Windows RCV1P scenarios should skip (not fail) when # the matching Windows image is absent from the gallery. 
-RCV1P_SUBSCRIPTION_ID="38d77129-fc18-4f21-9ce1-79dd1fe50fc6" -if [ "${E2E_SUBSCRIPTION_ID}" = "${RCV1P_SUBSCRIPTION_ID}" ]; then - echo "Detected RCV1P testing subscription; applying RCV1P-specific overrides" +# In the default MSFT-tenant E2E path, E2E_SUBSCRIPTION_ID_RCV1P is either unset or +# does not match the active subscription, so this block is a no-op and the existing +# auto-injection / shared storage account behavior is preserved. +if [ -n "${E2E_SUBSCRIPTION_ID_RCV1P:-}" ] && [ "${E2E_SUBSCRIPTION_ID}" = "${E2E_SUBSCRIPTION_ID_RCV1P}" ]; then + echo "Active subscription matches E2E_SUBSCRIPTION_ID_RCV1P; applying RCV1P-specific overrides" export BLOB_STORAGE_ACCOUNT_PREFIX="abe2etmercv1p" export RCV1P_TAGS_AUTO_INJECTED="false" export IGNORE_SCENARIOS_WITH_MISSING_VHD="true" From 95a16c33895dcf3466933b51d4dce4f1fcdfa9ee Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 19:34:06 -0700 Subject: [PATCH 102/103] docs(e2e): expand reviewer comments on RCV1P override block Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/scripts/e2e_run.sh | 101 ++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 17 deletions(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 9684f0b6ed0..44a6cd39376 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -20,25 +20,92 @@ echo "Using subscription ${E2E_SUBSCRIPTION_ID} for e2e tests" # Map E2E_SUBSCRIPTION_ID to SUBSCRIPTION_ID which the Go test framework reads export SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID}" -# When the aks-rp orchestrator routes e2e-tme.yaml to the RCV1P testing subscription -# via --subscription-id, the variables inherited from variable group ab-e2e-tme are -# incompatible with the RCV1P sub. 
The variable group exposes E2E_SUBSCRIPTION_ID_RCV1P -# as the canonical ID of the RCV1P testing subscription; if the active subscription -# matches it, apply RCV1P-specific overrides: -# * BLOB_STORAGE_ACCOUNT_PREFIX must yield a globally-unique storage account name -# (the default "abe2etme" prefix is already taken by another subscription, so -# account creation in the RCV1P sub fails with StorageAccountAlreadyTaken). -# * RCV1P_TAGS_AUTO_INJECTED must be false: the RCV1P testing subscription does -# not have platform auto-injection enabled, so the framework must explicitly -# stamp opt-in tags on each VMSS. -# * IGNORE_SCENARIOS_WITH_MISSING_VHD must be true: the Linux orchestrator only -# publishes Linux VHDs, so Windows RCV1P scenarios should skip (not fail) when -# the matching Windows image is absent from the gallery. -# In the default MSFT-tenant E2E path, E2E_SUBSCRIPTION_ID_RCV1P is either unset or -# does not match the active subscription, so this block is a no-op and the existing -# auto-injection / shared storage account behavior is preserved. +# ----------------------------------------------------------------------------- +# RCV1P testing-subscription override block +# ----------------------------------------------------------------------------- +# Context: +# AgentBaker's RCV1P (root-cert v1 platform-injection) end-to-end tests need +# to run in a subscription where Microsoft.Compute/PlatformSettingsOverride +# is registered. Two distinct environments expose that capability: +# +# 1. The MSFT-tenant default E2E subscription (variable group: ab-e2e-tme). +# PlatformSettingsOverride is registered AND every VMSS is auto-tagged +# with the RCV1P opt-in tag by an Azure Policy at create time. The +# framework therefore does NOT stamp tags itself; it relies on the +# platform to inject them. +# +# 2. The dedicated RCV1P testing subscription (id pinned in the +# ab-e2e-tme variable group as E2E_SUBSCRIPTION_ID_RCV1P). 
+# PlatformSettingsOverride is registered but NO auto-tagging policy +# is attached, so the framework MUST stamp tags itself. +# +# The aks-rp orchestrator runs both flows against the same AgentBaker +# pipeline (e2e-tme.yaml) and switches between them via the --subscription-id +# parameter on the pipeline. That parameter ultimately lands in +# E2E_SUBSCRIPTION_ID below. +# +# Why this block exists: +# When the orchestrator points e2e-tme.yaml at the RCV1P sub, the variables +# inherited from ab-e2e-tme (which target the default sub) are wrong: +# +# a) BLOB_STORAGE_ACCOUNT_PREFIX=abe2etme yields the account name +# "abe2etme", which is already owned by the default sub. +# Storage account names are globally unique, so trying to (re)create +# it under the RCV1P sub fails with StorageAccountAlreadyTaken and +# every Linux RCV1P scenario aborts before provisioning. +# +# b) The Go test framework reads RCV1P_TAGS_AUTO_INJECTED to decide +# whether to stamp opt-in tags on VMSSes it creates. The default +# (true) is correct for the MSFT-tenant flow but wrong for the +# RCV1P sub, where the framework must do the stamping itself. +# +# c) The RCV1P stage of the Linux orchestrator only publishes Linux +# VHDs to the gallery, but the test selector still picks up +# Test_RCV1P_Windows*. Without IGNORE_SCENARIOS_WITH_MISSING_VHD, +# those Windows scenarios fail hard with "image does not exist in +# gallery" instead of skipping. +# +# How detection works: +# E2E_SUBSCRIPTION_ID is the active subscription this run will use (set +# from the variable group, possibly overridden by the orchestrator's +# --subscription-id). E2E_SUBSCRIPTION_ID_RCV1P is a constant defined in +# the ab-e2e-tme variable group identifying the RCV1P sub. We compare +# them; no subscription GUID is hardcoded in the script. +# +# For pipelines whose variable group does not define +# E2E_SUBSCRIPTION_ID_RCV1P (e.g. 
non-TME variable groups), the first +# condition is empty and the block is a no-op -- the default behavior +# (auto-injection on, shared storage account, missing-VHD = failure) +# is preserved. +# +# What gets overridden: +# * BLOB_STORAGE_ACCOUNT_PREFIX = "abe2etmercv1p" +# The framework computes the storage account name as +# (see e2e/config/config.go:BlobStorageAccount). +# "abe2etmercv1p" is unique globally, so the storage account is created +# inside the RCV1P sub on first run and reused thereafter. +# * RCV1P_TAGS_AUTO_INJECTED = "false" +# Tells the framework to stamp opt-in tags on each VMSS it creates, +# and lets Test_RCV1P_*_NotOptedIn actually run (they self-skip when +# this flag is true because a not-opted-in VMSS is impossible under +# auto-injection). +# * IGNORE_SCENARIOS_WITH_MISSING_VHD = "true" +# Surfaces missing VHDs as SKIP, not FAIL (see e2e/test_helpers.go). +# Needed because the Linux RCV1P orchestrator run does not produce +# Windows VHDs; the Windows RCV1P scenarios would otherwise fail. +# +# Long-term plan: +# Replace this runtime override with a dedicated RCV1P pipeline +# (e2e-rcv1p.yaml already exists and wires the correct variable group +# ab-e2e-tme-rcv1p and rcv1pTagsAutoInjected=false). That requires an +# aks-rp orchestrator change to queue e2e-rcv1p.yaml instead of +# e2e-tme.yaml. Until then, this block keeps the single-pipeline flow +# working from inside AgentBaker only. +# ----------------------------------------------------------------------------- if [ -n "${E2E_SUBSCRIPTION_ID_RCV1P:-}" ] && [ "${E2E_SUBSCRIPTION_ID}" = "${E2E_SUBSCRIPTION_ID_RCV1P}" ]; then echo "Active subscription matches E2E_SUBSCRIPTION_ID_RCV1P; applying RCV1P-specific overrides" + # See "What gets overridden" in the comment block above for the rationale + # behind each of these three settings. 
export BLOB_STORAGE_ACCOUNT_PREFIX="abe2etmercv1p" export RCV1P_TAGS_AUTO_INJECTED="false" export IGNORE_SCENARIOS_WITH_MISSING_VHD="true" From 8b75bf92c4b1c6347b8d6dad00d886dc7a518990 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 25 Jun 2026 20:48:36 -0700 Subject: [PATCH 103/103] fix(e2e): make blob storage account name subscription-unique MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Storage account names are globally unique in Azure. When a pipeline that normally targets one subscription is redirected at runtime to another (e.g. orchestrator-routed RCV1P run via --subscription-id), the existing account name collides with StorageAccountAlreadyTaken and the run fails before any test executes. Append a deterministic 6-hex suffix derived from SubscriptionID to make the account name unique per subscription with zero per-environment configuration. The framework remains agnostic to which subscription is running — no subscription identity check anywhere in this repo. Also reverts the RCV1P-specific override block from e2e_run.sh (which coupled AgentBaker to an aks-rp variable group). Remaining RCV1P-sub mismatches (RCV1P_TAGS_AUTO_INJECTED default, missing-VHD handling) will be addressed in a follow-up. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .pipelines/scripts/e2e_run.sh | 94 ----------------------------------- e2e/config/config.go | 44 +++++++++++++++- 2 files changed, 43 insertions(+), 95 deletions(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 44a6cd39376..b6f83df5fab 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -20,100 +20,6 @@ echo "Using subscription ${E2E_SUBSCRIPTION_ID} for e2e tests" # Map E2E_SUBSCRIPTION_ID to SUBSCRIPTION_ID which the Go test framework reads export SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID}" -# ----------------------------------------------------------------------------- -# RCV1P testing-subscription override block -# ----------------------------------------------------------------------------- -# Context: -# AgentBaker's RCV1P (root-cert v1 platform-injection) end-to-end tests need -# to run in a subscription where Microsoft.Compute/PlatformSettingsOverride -# is registered. Two distinct environments expose that capability: -# -# 1. The MSFT-tenant default E2E subscription (variable group: ab-e2e-tme). -# PlatformSettingsOverride is registered AND every VMSS is auto-tagged -# with the RCV1P opt-in tag by an Azure Policy at create time. The -# framework therefore does NOT stamp tags itself; it relies on the -# platform to inject them. -# -# 2. The dedicated RCV1P testing subscription (id pinned in the -# ab-e2e-tme variable group as E2E_SUBSCRIPTION_ID_RCV1P). -# PlatformSettingsOverride is registered but NO auto-tagging policy -# is attached, so the framework MUST stamp tags itself. -# -# The aks-rp orchestrator runs both flows against the same AgentBaker -# pipeline (e2e-tme.yaml) and switches between them via the --subscription-id -# parameter on the pipeline. That parameter ultimately lands in -# E2E_SUBSCRIPTION_ID below. 
-# -# Why this block exists: -# When the orchestrator points e2e-tme.yaml at the RCV1P sub, the variables -# inherited from ab-e2e-tme (which target the default sub) are wrong: -# -# a) BLOB_STORAGE_ACCOUNT_PREFIX=abe2etme yields the account name -# "abe2etme", which is already owned by the default sub. -# Storage account names are globally unique, so trying to (re)create -# it under the RCV1P sub fails with StorageAccountAlreadyTaken and -# every Linux RCV1P scenario aborts before provisioning. -# -# b) The Go test framework reads RCV1P_TAGS_AUTO_INJECTED to decide -# whether to stamp opt-in tags on VMSSes it creates. The default -# (true) is correct for the MSFT-tenant flow but wrong for the -# RCV1P sub, where the framework must do the stamping itself. -# -# c) The RCV1P stage of the Linux orchestrator only publishes Linux -# VHDs to the gallery, but the test selector still picks up -# Test_RCV1P_Windows*. Without IGNORE_SCENARIOS_WITH_MISSING_VHD, -# those Windows scenarios fail hard with "image does not exist in -# gallery" instead of skipping. -# -# How detection works: -# E2E_SUBSCRIPTION_ID is the active subscription this run will use (set -# from the variable group, possibly overridden by the orchestrator's -# --subscription-id). E2E_SUBSCRIPTION_ID_RCV1P is a constant defined in -# the ab-e2e-tme variable group identifying the RCV1P sub. We compare -# them; no subscription GUID is hardcoded in the script. -# -# For pipelines whose variable group does not define -# E2E_SUBSCRIPTION_ID_RCV1P (e.g. non-TME variable groups), the first -# condition is empty and the block is a no-op -- the default behavior -# (auto-injection on, shared storage account, missing-VHD = failure) -# is preserved. -# -# What gets overridden: -# * BLOB_STORAGE_ACCOUNT_PREFIX = "abe2etmercv1p" -# The framework computes the storage account name as -# (see e2e/config/config.go:BlobStorageAccount). 
-# "abe2etmercv1p" is unique globally, so the storage account is created -# inside the RCV1P sub on first run and reused thereafter. -# * RCV1P_TAGS_AUTO_INJECTED = "false" -# Tells the framework to stamp opt-in tags on each VMSS it creates, -# and lets Test_RCV1P_*_NotOptedIn actually run (they self-skip when -# this flag is true because a not-opted-in VMSS is impossible under -# auto-injection). -# * IGNORE_SCENARIOS_WITH_MISSING_VHD = "true" -# Surfaces missing VHDs as SKIP, not FAIL (see e2e/test_helpers.go). -# Needed because the Linux RCV1P orchestrator run does not produce -# Windows VHDs; the Windows RCV1P scenarios would otherwise fail. -# -# Long-term plan: -# Replace this runtime override with a dedicated RCV1P pipeline -# (e2e-rcv1p.yaml already exists and wires the correct variable group -# ab-e2e-tme-rcv1p and rcv1pTagsAutoInjected=false). That requires an -# aks-rp orchestrator change to queue e2e-rcv1p.yaml instead of -# e2e-tme.yaml. Until then, this block keeps the single-pipeline flow -# working from inside AgentBaker only. -# ----------------------------------------------------------------------------- -if [ -n "${E2E_SUBSCRIPTION_ID_RCV1P:-}" ] && [ "${E2E_SUBSCRIPTION_ID}" = "${E2E_SUBSCRIPTION_ID_RCV1P}" ]; then - echo "Active subscription matches E2E_SUBSCRIPTION_ID_RCV1P; applying RCV1P-specific overrides" - # See "What gets overridden" in the comment block above for the rationale - # behind each of these three settings. 
- export BLOB_STORAGE_ACCOUNT_PREFIX="abe2etmercv1p" - export RCV1P_TAGS_AUTO_INJECTED="false" - export IGNORE_SCENARIOS_WITH_MISSING_VHD="true" - echo " BLOB_STORAGE_ACCOUNT_PREFIX=${BLOB_STORAGE_ACCOUNT_PREFIX}" - echo " RCV1P_TAGS_AUTO_INJECTED=${RCV1P_TAGS_AUTO_INJECTED}" - echo " IGNORE_SCENARIOS_WITH_MISSING_VHD=${IGNORE_SCENARIOS_WITH_MISSING_VHD}" -fi - # Setup go export GOPATH="$(go env GOPATH)" go version diff --git a/e2e/config/config.go b/e2e/config/config.go index de200dc66df..e4ad4cd7dca 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -94,12 +94,54 @@ type Configuration struct { } func (c *Configuration) BlobStorageAccount() string { + // Storage account names are GLOBALLY unique in Azure (not per-subscription). + // Two subscriptions cannot own an account with the same name; the second + // one to call BeginCreate fails with StorageAccountAlreadyTaken even + // though BeginCreate is otherwise idempotent within a single sub. + // + // This bites whenever a pipeline that normally targets subscription A + // (with BLOB_STORAGE_ACCOUNT_PREFIX baked into a variable group) is + // redirected at runtime to subscription B — for example when the aks-rp + // orchestrator routes the RCV1P phase to its dedicated testing sub via + // --subscription-id. The prefix was chosen for sub A, the account + // "<BlobStorageAccountPrefix><DefaultLocation>" already exists in sub A, and creation in sub B + // fails before any test can run. + // + // Embedding a deterministic subscription-derived suffix in the account + // name guarantees a globally-unique name per subscription with zero + // per-environment configuration: every new sub gets its own account + // the first time it runs and reuses it thereafter. The framework stays + // completely agnostic to which subscription is "special" — there is no + // subscription identity check anywhere in this repo.
+ // + // DefaultLocation is included for the historical reason captured below + // (the blob client is keyed off the storage account URL, which is per + // location, even though tests run across multiple locations). + // // Here DefaultLocation is used because the azure blob client requires the // full URL to the storage account, which means creating a new client per // location. While everything else for running AB tests is sharded per // location, but we continue to use the same storage account for all // locations. - return c.BlobStorageAccountPrefix + c.DefaultLocation + return c.BlobStorageAccountPrefix + c.DefaultLocation + subscriptionSuffix(c.SubscriptionID) +} + +// subscriptionSuffix returns a short, deterministic suffix derived from the +// subscription ID, suitable for embedding in resource names that have +// global-uniqueness constraints (e.g. storage accounts). It takes the first +// 6 hex characters of the subscription UUID after stripping hyphens, which +// keeps the resulting name within the Azure storage account length limit +// (3-24 chars) while giving ~16M-way collision resistance — sufficient for +// the handful of subscriptions this test framework will ever run against. +// +// Lowercase hex is also valid for storage account names (lowercase +// alphanumeric only), so no further sanitization is required. +func subscriptionSuffix(subscriptionID string) string { + cleaned := strings.ToLower(strings.ReplaceAll(subscriptionID, "-", "")) + if len(cleaned) < 6 { + return cleaned + } + return cleaned[:6] } func (c *Configuration) IsLocalBuild() bool {