simplyblock · RaunakJalan · Aug 25, 2024 · Aug 25, 2024 · Aug 25, 2024 · Aug 25, 2024
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
@@ -21,6 +21,22 @@ on:
         description: 'Name of test to run. Empty to run all'
         required: false
         default: ''
+      run_failed_tests_only:
+        description: "Run only the failed tests from the last run"
+        required: false
+        default: false
+        type: boolean
+      run_unexecuted_tests:
+        description: "Run tests that were not executed in the last run"
+        required: false
+        default: false
+        type: boolean
+      retry_count:
+        description: "Number of retries for failed test cases"
+        required: false
+        default: 1
+        type: number
+
 jobs:
   e2e:
     runs-on: self-hosted
@@ -70,7 +86,7 @@ jobs:
               -backend-config="dynamodb_table=${TFSTATE_DYNAMODB_TABLE}" \
               -backend-config="encrypt=true"
 
-      - name: select or create workspace
+      - name: Select or Create Workspace
         run: |
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
           terraform workspace select -or-create ghiaction-sbclie2e
@@ -83,8 +99,10 @@ jobs:
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
           terraform plan \
             -var "mgmt_nodes=1" -var "storage_nodes=3" -var "volumes_per_storage_nodes=3" \
-            -var "extra_nodes=1" -var "extra_nodes_instance_type=m6id.large" \
-            -var "region=us-east-2" -var "sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}" -out=tfplan
+            -var mgmt_nodes_instance_type="m6id.xlarge" -var storage_nodes_instance_type="m6id.xlarge" \
+            -var "extra_nodes=1" -var "extra_nodes_instance_type=m6id.xlarge" \
+            -var storage_nodes_ebs_size2=100 -var "region=us-east-2" \
+            -var "sbcli_cmd=${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}" -out=tfplan
       - name: Apply Terraform Changes
         run: |
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
@@ -100,20 +118,26 @@ jobs:
       - name: Bootstrap Cluster
         run: |
           cd $GITHUB_WORKSPACE/simplyBlockDeploy
-          ./bootstrap-cluster.sh --max-lvol 10 --max-snap 10 --max-prov 450g --number-of-devices 3
+          ./bootstrap-cluster.sh --max-lvol 100 --max-snap 100 --max-prov 360G --number-of-devices 3
         id: bootstrap_cluster
         env:
           SBCLI_CMD: ${{ github.event.inputs.sbcli_cmd || 'sbcli-dev' }}
 
       - name: Setup Tests & Run Tests
-        timeout-minutes: 60
+        timeout-minutes: 120
         run: |
           cd $GITHUB_WORKSPACE/e2e
           sudo apt-get install -y python3.12-venv
           python3 -m venv myenv
           source myenv/bin/activate
           python3 -m pip install -r requirements.txt
           echo "Running tests in namespace ${{ steps.get-namespace.outputs.namespace }}"
+
+          # Sanitize branch name: replace slashes with underscores so it is safe to use in a filename
+          BRANCH_NAME_SAFE=$(echo "${{ github.ref_name }}" | tr '/' '_')
+          export BRANCH_NAME=${BRANCH_NAME_SAFE}
+          export FAILED_CASES_FILE="failed_cases_${BRANCH_NAME_SAFE}.json"
+          export EXECUTED_CASES_FILE="executed_cases_${BRANCH_NAME_SAFE}.json"
           export CLUSTER_ID=${{ steps.bootstrap_cluster.outputs.cluster_id }}
           export CLUSTER_SECRET=${{ steps.bootstrap_cluster.outputs.cluster_secret }}
           export CLUSTER_IP=${{ steps.bootstrap_cluster.outputs.cluster_ip }}
@@ -128,7 +152,13 @@ jobs:
           if [ -n "${{ github.event.inputs.testname }}" ]; then
             TESTNAME="--testname ${{ github.event.inputs.testname }}"
           fi
-          python3 e2e.py $TESTNAME
+          if [ "${{ github.event.inputs.run_failed_tests_only }}" = "true" ]; then
+            python3 e2e.py --failed_only --retry ${{ github.event.inputs.retry_count }} --branch $BRANCH_NAME $TESTNAME
+          elif [ "${{ github.event.inputs.run_unexecuted_tests }}" = "true" ]; then
+            python3 e2e.py --unexecuted_only --retry ${{ github.event.inputs.retry_count }} --branch $BRANCH_NAME $TESTNAME
+          else
+            python3 e2e.py --retry ${{ github.event.inputs.retry_count }} --branch $BRANCH_NAME $TESTNAME
+          fi
       - name: Upload automation and docker logs to s3
         run: |
           cd $GITHUB_WORKSPACE/e2e/logs
@@ -243,4 +273,4 @@ jobs:
           ls -la ./
           rm -rf ./* || true
           rm -rf ./.??* || true
-          ls -la ./
+          ls -la ./
diff --git a/e2e/e2e.py b/e2e/e2e.py
@@ -1,83 +1,139 @@
 ### simplyblock e2e tests
 import argparse
+import os
+import json
 import traceback
 from __init__ import get_all_tests
 from logger_config import setup_logger
-from exceptions.custom_exception import (
-    TestNotFoundException,
-    MultipleExceptions
-)
+from exceptions.custom_exception import TestNotFoundException, MultipleExceptions
 from e2e_tests.cluster_test_base import TestClusterBase
 from utils.sbcli_utils import SbcliUtils
 from utils.ssh_utils import SshUtils
 
 
 def main():
-    """Run complete test suite
-    """
+    """Run the complete test suite or specific tests."""
     parser = argparse.ArgumentParser(description="Run simplyBlock's E2E Test Framework")
     parser.add_argument('--testname', type=str, help="The name of the test to run", default=None)
     parser.add_argument('--fio_debug', type=bool, help="Add debug flag to fio", default=False)
+    parser.add_argument('--failed_only', action='store_true', help="Run only failed tests from last run", default=False)
+    parser.add_argument('--unexecuted_only', action='store_true', help="Run only unexecuted tests from last run", default=False)
+    parser.add_argument('--branch', type=str, help="Branch name to uniquely store test results", required=True)
+    parser.add_argument('--retry', type=int, help="Number of retries for failed cases", default=1)
 
     args = parser.parse_args()
 
     tests = get_all_tests()
-    # Find the test class based on the provided test name
-    test_class_run = []
-    if args.testname is None or len(args.testname.strip()) == 0:
-        test_class_run = tests
+
+    # Files that store the failed and executed test cases for the specific branch
+    base_dir = os.path.join(os.path.expanduser('~'), 'e2e_test_runs_fail_unexec_json')
+    if not os.path.exists(base_dir):
+        os.makedirs(base_dir)
+    failed_cases_file = os.path.join(base_dir,
+                                     f'failed_cases_{args.branch}.json')
+    executed_cases_file = os.path.join(base_dir,
+                                       f'executed_cases_{args.branch}.json')
+
+    logger.info(f"Failed only: {args.failed_only}")
+    logger.info(f"Unexecuted only: {args.unexecuted_only}")
+    logger.info(f"Failed case file: {failed_cases_file}")
+    logger.info(f"File exists: {os.path.exists(failed_cases_file)}")
+
+    onlyfiles = [f for f in os.listdir(base_dir) if os.path.isfile(os.path.join(base_dir, f))]
+    logger.info(f"List of files: {onlyfiles}")
+
+    # Load previously failed cases if '--failed_only' is set
+    if args.failed_only and os.path.exists(failed_cases_file):
+        logger.info("Running failed cases only")
+        with open(failed_cases_file, 'r', encoding='utf-8') as file:
+            failed_tests = json.load(file)
+            test_class_run = [cls for cls in tests 
+                              if any(ft in f'{cls.__name__}' for ft in failed_tests)]
+
+            logger.info(f"Running failed cases only: {test_class_run}")
+    elif args.unexecuted_only and os.path.exists(executed_cases_file):
+        logger.info("Running unexecuted cases only")
+        with open(executed_cases_file, 'r', encoding='utf-8') as file:
+            executed_tests = json.load(file)
+            test_class_run = [cls for cls in tests 
+                              if all(unet not in f'{cls.__name__}' for unet in executed_tests)]
+            logger.info(f"Running unexecuted cases only: {test_class_run}")
     else:
-        for cls in tests:
-            if args.testname.lower() in cls.__name__.lower():
-                test_class_run.append(cls)
+        # Run all tests or selected ones
+        logger.info("Running all or selected cases")
+        test_class_run = []
+        if args.testname is None or len(args.testname.strip()) == 0:
+            test_class_run = tests
+        else:
+            for cls in tests:
+                if args.testname.lower() in cls.__name__.lower():
+                    test_class_run.append(cls)
 
+    logger.info(f"List of tests to run: {test_class_run}")
     if not test_class_run:
         available_tests = ', '.join(cls.__name__ for cls in tests)
         logger.info(f"Test '{args.testname}' not found. Available tests are: {available_tests}")
         raise TestNotFoundException(args.testname, available_tests)
-    
+
     errors = {}
+    executed_tests = []
     for test in test_class_run:
         logger.info(f"Running Test {test}")
         test_obj = test(fio_debug=args.fio_debug)
-        try:
-            test_obj.setup()
-            test_obj.run()
-        except Exception as exp:
-            logger.error(traceback.format_exc())
-            errors[f"{test.__name__}"] = [exp]
+
+        for attempt in range(args.retry):
+            try:
+                test_obj.setup()
+                executed_tests.append(test.__name__)
+                test_obj.run()
+                logger.info(f"Test {test.__name__} passed on attempt {attempt + 1}")
+                if f"{test.__name__}" in errors:
+                    del errors[f"{test.__name__}"]
+                break  # Test passed, no need for more retries
+            except Exception as exp:
+                logger.error(f"Attempt {attempt + 1} failed for test {test.__name__}")
+                logger.error(traceback.format_exc())
+                errors[f"{test.__name__}"] = [exp]
+
         try:
             test_obj.teardown()
-            # pass
         except Exception as _:
             logger.error(f"Error During Teardown for test: {test.__name__}")
             logger.error(traceback.format_exc())
-            # errors[f"{test.__name__}"].append(exp)
         finally:
             if check_for_dumps():
                 logger.info("Found a core dump during test execution. "
                             "Cannot execute more tests as cluster is not stable. Exiting")
                 break
 
     failed_cases = list(errors.keys())
+
+    # Save failed cases for next run
+    if failed_cases:
+        with open(failed_cases_file, 'w') as file:
+            json.dump(failed_cases, file)
+    else:
+        if os.path.exists(failed_cases_file):
+            os.remove(failed_cases_file)  # Clear file if all tests passed
+
+    # Save executed cases for next run
+    if executed_tests:
+        with open(executed_cases_file, 'w') as file:
+            json.dump(executed_tests, file)
+    else:
+        if os.path.exists(executed_cases_file):
+            os.remove(executed_cases_file)  # Clear file if no tests were executed in this run
+
     logger.info(f"Number of Total Cases: {len(test_class_run)}")
     logger.info(f"Number of Passed Cases: {len(test_class_run) - len(failed_cases)}")
     logger.info(f"Number of Failed Cases: {len(failed_cases)}")
-
-    logger.info("Test Wise run status:")
-    for test in test_class_run:
-        if test.__name__ not in failed_cases:
-            logger.info(f"{test.__name__} PASSED CASE.")
-        else:
-            logger.info(f"{test.__name__} FAILED CASE.")
-
 
     if errors:
         raise MultipleExceptions(errors)
-    
+
 
 def check_for_dumps():
-    """Validates whether core dumps present on machines
+    """Validates whether core dumps are present on machines
 
     Returns:
         bool: If there are core dumps or not

diff --git a/e2e/e2e_tests/single_node_failure.py b/e2e/e2e_tests/single_node_failure.py
@@ -65,6 +65,7 @@ def run(self):
             distr_ndcs=2,
             distr_npcs=1
         )
+        # raise Exception("Testing failure runs and retry runs")
         lvols = self.sbcli_utils.list_lvols()
         assert self.lvol_name in list(lvols.keys()), \
             f"Lvol {self.lvol_name} not present in list of lvols post add: {lvols}"