/* SPDX-License-Identifier: BSD-2-Clause OR GPL-2.0-only */
/* SPDX-FileCopyrightText: Copyright Amazon.com, Inc. or its affiliates. All rights reserved. */
// Milestones let a newer build abort older builds of the same job, which is
// what we want when the user force pushes to the pull request.
def buildNumber = env.BUILD_NUMBER as int
if (buildNumber > 1) {
    // Pass the previous build's milestone first.
    milestone(buildNumber - 1)
}
// Then pass this build's own milestone.
milestone(buildNumber)


import groovy.transform.Field
// Overall pass/fail flag; the 'check build_ok' stage reads it to set the final build result.
// @Field makes it a script-level field so the methods defined in this file can access it.
@Field boolean build_ok = true


def get_portafiducia_download_path() {
    /*
     * Return the S3 URI of the stable PortaFiducia tarball.
     * The bucket name embeds the AWS account id reported by
     * `aws sts get-caller-identity`; `tr -dc 0-9` keeps only the digits,
     * which also drops the trailing newline of the command output.
     */
    def account_id = sh(
        returnStdout: true,
        script: "aws sts get-caller-identity --query Account --output text | tr -dc 0-9"
    )
    def bucket = "libfabric-ci-${account_id}-us-west-2"
    return "s3://${bucket}/portafiducia/portafiducia.tar.gz"
}

def download_and_extract_portafiducia(outputDir) {
    /*
     * Fetch the PortaFiducia tarball from S3 and unpack it.
     * param@ outputDir: directory (created if missing) that receives the extracted files
     */
    def downloadPath = get_portafiducia_download_path()
    // The tarball is staged at a fixed location before extraction.
    def tempPath = "/tmp/portafiducia.tar.gz"
    sh(script: """
        mkdir -p ${outputDir}
        aws s3 cp ${downloadPath} ${tempPath}
        tar xf ${tempPath} -C ${outputDir}
    """)
}

def install_porta_fiducia() {
    /*
     * Create a virtual environment named "venv" in the current directory,
     * upgrade pip and awscli inside it, and install the PortaFiducia
     * checkout in editable mode.
     */
    sh(script: '''
        python3 -m venv venv
        . venv/bin/activate
        pip install --upgrade pip
        pip install --upgrade awscli
        pip install -e PortaFiducia
    ''')
}

def run_test_orchestrator_once(run_name, build_tag, os, instance_type, instance_count, region, addl_args) {
    /*
     * Run PortaFiducia/tests/test_orchestrator.py once for a freshly named cluster.
     * param@ run_name: name of the run (not used by this function; kept for callers)
     * param@ build_tag: the BUILD_TAG env generated by Jenkins, used to derive the cluster name
     * param@ os: the operating system passed via --os
     * param@ instance_type: the instance type passed via --instance-type
     * param@ instance_count: the number of instances passed via --instance-count
     * param@ region: the aws region passed via --region
     * param@ addl_args: str, additional command line arguments appended verbatim
     *
     * A non-zero exit status does not throw. It sets build_ok to false and marks
     * the enclosing stage and the build as FAILURE, so that the remaining stages
     * (including 'check build_ok') still run.
     */
    def cluster_name = get_cluster_name(build_tag, os, instance_type)
    def args = "--os ${os} --instance-type ${instance_type} --instance-count ${instance_count} --region ${region} --cluster-name ${cluster_name} ${addl_args} --junit-xml outputs/${cluster_name}.xml"
    // returnStatus makes sh hand back the exit code instead of throwing on failure.
    def ret = sh(
        script: ". venv/bin/activate; cd PortaFiducia/tests && ./test_orchestrator.py ${args}",
        returnStatus: true
    )
    if (ret != 0) {
        // Record the failure for the final 'check build_ok' stage.
        build_ok = false
        // Surface the failure on this stage without aborting the pipeline.
        catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
            error("test_orchestrator.py exited with status ${ret} (cluster ${cluster_name})")
        }
    }
}

def get_random_string(len) {
    /*
     * Return len random characters from the set A-Za-z0-9, read from /dev/urandom.
     * param@ len: number of characters to produce
     */
    return sh(
        returnStdout: true,
        script: "cat /dev/urandom | LC_ALL=C tr -dc A-Za-z0-9 | head -c ${len}"
    )
}

def get_cluster_name_prefix(build_tag) {
    /*
     * Derive the cluster name prefix from the Jenkins build tag: strip the
     * leading "jenkins-", remove spaces, dots and newlines, and keep at most
     * the first 28 characters.
     * param@ build_tag: the BUILD_TAG env generated by Jenkins
     * return@: the prefix shared by all cluster names of this build
     */
    // Declared with def so the value stays local instead of leaking into the
    // script binding, which is shared by all parallel branches.
    def prefix = sh(
                script: "echo ${build_tag} | sed \"s/^jenkins-//g\" | sed \"s/ //g\" | tr -d '.\\n'",
                returnStdout: true
            )
    return prefix.take(28)
}

def get_cluster_name(build_tag, os, instance_type) {
    /*
     * Compose a cluster name of the form <prefix>-<os>-<instance type>-<random>.
     * Pcluster requires a cluster name under 60 characters, so the os and
     * instance type are cut to 10 characters each and 8 random characters
     * are appended to the prefix derived from the build tag.
     * A cluster name cannot have ".". Jenkins does not allow groovy to use
     * the replace() method of string, so the shell (tr) strips "." and the
     * trailing newline instead.
     */
    def name_prefix = get_cluster_name_prefix(build_tag)
    def os_part = os.take(10)
    def instance_part = instance_type.take(10)
    def random_suffix = get_random_string(8)

    return sh(
        returnStdout: true,
        script: "echo '${name_prefix}-${os_part}-${instance_part}-'${random_suffix} | tr -d '.\\n'"
    )
}

def get_single_node_windows_test_stage_with_lock(stage_name, lock_label) {
    /*
     * Build the Windows test stage.
     * param@ stage_name: the name of the stage
     * param@ lock_label: label of the lockable resource; one unit is held while the test runs
     * return@: a closure that runs test_orchestrator_windows.py for the current pull request
     */
    def windows_stage = {
        stage("${stage_name}") {
            lock(label: lock_label, quantity: 1) {
                sh(script: """
                    . venv/bin/activate;
                    cd PortaFiducia/scripts;
                    export PULL_REQUEST_ID=${env.CHANGE_ID};
                    env AWS_DEFAULT_REGION=us-west-2 ./test_orchestrator_windows.py --ci public --s3-bucket-name libfabric-ci-windows-prod-test-output --pull-request-id ${env.CHANGE_ID};
                """)
            }
        }
    }
    return windows_stage
}

def get_test_stage_with_lock(stage_name, build_tag, os, instance_type, instance_count, region, lock_label, addl_args) {
    /*
     * Build one test stage that runs test_orchestrator.py while holding a lock.
     * param@ stage_name: the name of the stage (also passed on as the run name)
     * param@ build_tag: the BUILD_TAG env generated by Jenkins
     * param@ os: the operating system for the test stage
     * param@ instance_type: the instance type for the test stage
     * param@ instance_count: number of instances to use; also the lock quantity requested
     * param@ region: the (default) aws region where the tests are run
     * param@ lock_label: label of the lockable resource guarding this instance type
     * param@ addl_args: additional arguments passed to test_orchestrator.py
     * return@: the test stage, as a closure to be executed later
     */
    def test_stage = {
        stage("${stage_name}") {
            lock(label: lock_label, quantity: instance_count) {
                run_test_orchestrator_once(stage_name, build_tag, os, instance_type, instance_count, region, addl_args)
            }
        }
    }
    return test_stage
}

/*
 * Pull request pipeline: set up PortaFiducia on the agent, run all test stages
 * in parallel (each guarded by a lockable resource), publish the JUnit results
 * and finally try to delete every cluster created by this build.
 */
pipeline {
    agent {
        ecs {
            inheritFrom 'fargate-large'
        }
    }
    options {
        buildDiscarder(logRotator(daysToKeepStr: "90"))
        timeout(time: 10, unit: 'HOURS')
        skipDefaultCheckout()
    }
    stages {
        // Cleanup workspace before job start.
        stage("Clean up workspace") {
            steps{
                deleteDir()
            }
        }
        stage("Checkout SCM repo") {
            steps {
                checkout scm
            }
        }
        stage("Download and extract PortaFiducia") {
            steps {
                script {
                    download_and_extract_portafiducia('PortaFiducia')
                }
            }
        }
        stage("Install PortaFiducia") {
            steps {
                script {
                    install_porta_fiducia()
                }

            }
        }
        stage("Test EFA provider") {
            steps {
                script {
                    def stages = [:]
                    def timeout = "--timeout 270"
                    def generic_pf = "--cluster-type manual_cluster --test-target libfabric --test-type pr --test-libfabric-pr $env.CHANGE_ID"
                    // onesided tests are covered by imb, collective tests are covered by omb
                    def mpi_collective_tests = "'test_omb and not onesided'"
                    def libfabric_tests = "test_efa_ut test_fabtests_functional test_fork_support test_backward_compatibility"
                    def one_sided_tests = "'test_imb and not collective'"
                    def libfabric_and_onesided_tests = "${libfabric_tests} ${one_sided_tests}"
                    def efa_direct_tests = "'test_fabtests_functional and efa-direct'"

                    def efa_provider = "--test-libfabric-provider efa"
                    def addl_args_efa_libfabric_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}"
                    def addl_args_efa_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${mpi_collective_tests}"
                    def addl_args_efa_libfabric_and_onesided_mpi = "${timeout} ${generic_pf} ${efa_provider} --test-list ${libfabric_and_onesided_tests}"
                    def addl_args_efa_direct = "${timeout} ${generic_pf} ${efa_provider} --test-list ${efa_direct_tests}"

                    def shm_provider = "--test-libfabric-provider shm"
                    def addl_args_shm = "${timeout} ${generic_pf} ${shm_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}"

                    def tcp_provider = "--test-libfabric-provider tcp --enable-efa false"
                    def addl_args_tcp = "${timeout} ${generic_pf} ${tcp_provider} --test-list ${mpi_collective_tests} ${libfabric_and_onesided_tests}"

                    // Use lockable resources to limit the number of jobs that can get executed in parallel
                    def g4dn8x_lock_label = "g4dn8x"
                    def g4dn12x_lock_label  = "g4dn12x"
                    def c52x_lock_label  = "c52x"
                    def hpc6a48x_lock_label  = "hpc6a48x"
                    def c6gn16x_lock_label  = "c6gn16x"
                    def c5n18x_lock_label  = "c5n18x"
                    def c6g2x_lock_label  = "c6g2x"
                    def trn132x_lock_label  = "trn132x"

                    // Single Node Tests - EFA
                    stages["1_g4dn_alinux2-efa"] = get_test_stage_with_lock("1_g4dn_alinux2_efa", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi)
                    stages["1_g4dn_alinux2023-efa"] = get_test_stage_with_lock("1_g4dn_alinux2023_efa", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi)
                    stages["1_g4dn_ubuntu2004-efa"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_efa", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi)
                    stages["1_g4dn_rhel8-efa"] = get_test_stage_with_lock("1_g4dn_rhel8_efa", env.BUILD_TAG, "rhel8", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_efa_libfabric_mpi)

                    // Single Node Tests - SHM
                    stages["1_g4dn_alinux2_shm"] = get_test_stage_with_lock("1_g4dn_alinux2_shm", env.BUILD_TAG, "alinux2", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm)
                    stages["1_g4dn_alinux2023_shm"] = get_test_stage_with_lock("1_g4dn_alinux2023_shm", env.BUILD_TAG, "alinux2023", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm)
                    stages["1_g4dn_ubuntu2004_shm"] = get_test_stage_with_lock("1_g4dn_ubuntu2004_shm", env.BUILD_TAG, "ubuntu2004", "g4dn.8xlarge", 1, "us-east-1", g4dn8x_lock_label, addl_args_shm)
                    stages["1_c5_rhel8_shm"] = get_test_stage_with_lock("1_c5_rhel8_shm", env.BUILD_TAG, "rhel8", "c5.2xlarge", 1, "us-east-1", c52x_lock_label, addl_args_shm + " --enable-efa false")
                    stages["1_c5_ubuntu2004_shm_disable-cma"] = get_test_stage_with_lock("1_c5_ubuntu2004_shm_disable-cma", env.BUILD_TAG, "ubuntu2004", "c5.2xlarge", 1, "us-east-1", c52x_lock_label, addl_args_shm + " --enable-cma false --enable-efa false")

                    // Single Node Windows Test
                    stages["EFA_Windows_Test"] = get_single_node_windows_test_stage_with_lock("EFA_Windows_Test", c5n18x_lock_label)

                    // Multi Node Tests - EFA
                    stages["2_hpc6a_alinux2_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi)
                    stages["2_hpc6a_alinux2_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_alinux2_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    stages["2_hpc6a_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi)
                    stages["2_hpc6a_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    stages["2_c6gn_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_mpi)
                    stages["2_c6gn_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c6gn_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    stages["2_c5n_alinux2_efa_mpi"] = get_test_stage_with_lock("2_c5n_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_mpi)
                    stages["2_c5n_alinux2_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c5n_alinux2_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    stages["2_c5n_alinux2023_efa_mpi"] = get_test_stage_with_lock("2_c5n_alinux2023_efa_mpi", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_mpi)
                    stages["2_c5n_alinux2023_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_c5n_alinux2023_efa_libfabric_and_one_sided", env.BUILD_TAG, "alinux2023", "c5n.18xlarge", 2, "us-east-1", c5n18x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    stages["2_hpc6a_ubuntu2004_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa_mpi", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi)
                    stages["2_hpc6a_ubuntu2004_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_ubuntu2004_efa_libfabric_and_one_sided", env.BUILD_TAG, "ubuntu2004", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    stages["2_hpc6a_rhel8_efa_mpi"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa_mpi", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_mpi)
                    stages["2_hpc6a_rhel8_efa_libfabric_and_one_sided"] = get_test_stage_with_lock("2_hpc6a_rhel8_efa_libfabric_and_one_sided", env.BUILD_TAG, "rhel8", "hpc6a.48xlarge", 2, "eu-north-1", hpc6a48x_lock_label, addl_args_efa_libfabric_and_onesided_mpi)
                    def addl_args_trn1_odcr_efa_direct = " --odcr cr-097fd3374f511c972 ${addl_args_efa_direct}"
                    stages["2_trn1_ubuntu2004_efa_direct"] = get_test_stage_with_lock("2_trn1_ubuntu2004_efa_direct", env.BUILD_TAG, "ubuntu2004", "trn1.32xlarge", 2, "us-west-2", trn132x_lock_label, addl_args_trn1_odcr_efa_direct)

                    // cg6n AL2 builds are the slowest b/c they have asan turned on with debug, and have slower memcpy speeds
                    // split "libfabric tests" into "fabtests", and imb
                    def addl_args_efa_one_sided_only = "${timeout} ${generic_pf} ${efa_provider} --test-list ${one_sided_tests}"
                    def addl_args_efa_libfabric_only = "${timeout} ${generic_pf} ${efa_provider} --test-list ${libfabric_tests}"
                    stages["2_c6gn_alinux2_efa_mpi"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_mpi", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_mpi)
                    stages["2_c6gn_alinux2_efa_one_sided"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_one_sided", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_one_sided_only)
                    stages["2_c6gn_alinux2_efa_libfabric"] = get_test_stage_with_lock("2_c6gn_alinux2_efa_libfabric", env.BUILD_TAG, "alinux2", "c6gn.16xlarge", 2, "us-west-2", c6gn16x_lock_label, addl_args_efa_libfabric_only)

                    // Multi Node Tests - TCP
                    stages["2_c6g_alinux2_tcp"] = get_test_stage_with_lock("2_c6g_alinux2_tcp", env.BUILD_TAG, "alinux2", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp)
                    stages["2_c6g_alinux2023_tcp"] = get_test_stage_with_lock("2_c6g_alinux2023_tcp", env.BUILD_TAG, "alinux2023", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp)
                    stages["2_c6g_ubuntu2004_tcp"] = get_test_stage_with_lock("2_c6g_ubuntu2004_tcp", env.BUILD_TAG, "ubuntu2004", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp)
                    stages["2_c6g_rhel8_tcp"] = get_test_stage_with_lock("2_c6g_rhel8_tcp", env.BUILD_TAG, "rhel8", "c6g.2xlarge", 2, "us-west-2", c6g2x_lock_label, addl_args_tcp)
                    stages["3_g4dn_alinux2_tcp"] = get_test_stage_with_lock("3_g4dn_alinux2_tcp", env.BUILD_TAG, "alinux2", "g4dn.12xlarge", 3, "us-east-1", g4dn12x_lock_label, addl_args_tcp + " --test-list test_nccl_tests --test-iterations fastest")

                    // Run every collected stage concurrently; the locks above bound the real parallelism.
                    parallel stages
                }
            }
        }
        stage('check build_ok') {
            steps {
                script {
                    if (build_ok) {
                        currentBuild.result = "SUCCESS"
                    }
                    else {
                        currentBuild.result = "FAILURE"
                    }
                }
            }
        }
    }
    post {
        always {
            // Publishing is wrapped in catchError: a reporting failure (for example no
            // XML reports because an early stage failed) still marks the build as
            // FAILURE, but must not skip the cluster cleanup below.
            catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
                sh 'find PortaFiducia/tests/outputs -name "*.xml" | xargs du -shc'
                junit testResults: 'PortaFiducia/tests/outputs/**/*.xml', keepLongStdio: false
                archiveArtifacts artifacts: 'PortaFiducia/tests/outputs/**/*.*'
            }
            script {
                // Try To Cleanup Resources
                def regions = ["us-east-1", "eu-north-1", "us-west-2"]
                // def keeps the prefix local instead of leaking it into the script binding.
                def cluster_name_prefix = get_cluster_name_prefix(env.BUILD_TAG)
                regions.each { region ->
                    // One failing region must not prevent cleanup of the others.
                    catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
                        sh ". venv/bin/activate; ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name '${cluster_name_prefix}*' --region ${region}"
                    }
                }
                // Windows Cluster, has a different name.
                // The pattern is quoted so the shell passes the "*" through unexpanded.
                catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
                    sh """
                        . venv/bin/activate
                        ./PortaFiducia/scripts/delete_manual_cluster.py --cluster-name 'WindowsLibfabricCi_${env.CHANGE_ID}_*'
                    """
                }
            }
        }
        cleanup {
            deleteDir()
        }
    }
}
