From bbb9da650d4e8f59197fbd514cbbe43bce3e6a38 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 16 Dec 2025 16:30:59 +0300 Subject: [PATCH 01/16] Adds backup controller and cli _1 --- simplyblock_cli/cli-reference.yaml | 24 +++++ simplyblock_cli/cli.py | 37 ++++++++ simplyblock_cli/clibase.py | 14 ++- .../controllers/backup_controller.py | 90 +++++++++++++++++++ 4 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 simplyblock_core/controllers/backup_controller.py diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index d5cad51c0..c17aad796 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1947,3 +1947,27 @@ commands: type: str required: false default: "" + - name: "backup" + help: "FDB Backup operations" + weight: 800 + subcommands: + - name: create + help: "Creates an fdb backup" + arguments: + - name: "--tag-name" + help: "backup tag name" + dest: tag_name + type: str + required: false + default: "" + - name: list + help: "Lists all fdb backups" + - name: status + help: "get backup status" + - name: restore + help: "restore a backup" + arguments: + - name: "name" + help: "backup class name" + dest: name + type: str diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 2bfd792ec..77827e5a7 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -28,6 +28,7 @@ def __init__(self): self.init_storage_pool() self.init_snapshot() self.init_qos() + self.init_backup() super().__init__() def init_storage_node(self): @@ -795,6 +796,29 @@ def init_qos__delete(self, subparser): subcommand.add_argument('cluster_id', help='Cluster UUID', type=str, default='') + def init_backup(self): + subparser = self.add_command('backup', 'FDB Backup operations') + self.init_backup__create(subparser) + self.init_backup__list(subparser) + self.init_backup__status(subparser) + self.init_backup__restore(subparser) + + + def init_backup__create(self, subparser): + subcommand = self.add_sub_command(subparser, 'create', 'Creates an fdb backup') + argument = subcommand.add_argument('--tag-name', help='backup tag name', type=str, default='', dest='tag_name', required=False) + + def init_backup__list(self, subparser): + subcommand = self.add_sub_command(subparser, 'list', 'Lists all fdb backups') + + def init_backup__status(self, subparser): + subcommand = self.add_sub_command(subparser, 'status', 'get backup status') + + def init_backup__restore(self, subparser): + subcommand = self.add_sub_command(subparser, 'restore', 'restore a backup') + subcommand.add_argument('name', help='backup class name', type=str) + + def run(self): args = self.parser.parse_args() if args.debug: @@ -1127,6 +1151,19 @@ def run(self): else: self.parser.print_help() + elif args.command in ['backup']: + sub_command = args_dict['backup'] + if sub_command in ['create']: + ret = self.backup__create(sub_command, args) + elif sub_command in ['list']: + ret = self.backup__list(sub_command, args) + elif sub_command in ['status']: + ret = self.backup__status(sub_command, args) + elif sub_command in ['restore']: + ret = self.backup__restore(sub_command, args) + else: + self.parser.print_help() + else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 277984c6a..86e076dfe 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -12,7 +12,7 @@ from simplyblock_core import storage_node_ops as storage_ops from simplyblock_core import mgmt_node_ops as mgmt_ops from simplyblock_core.controllers import pool_controller, lvol_controller, snapshot_controller, device_controller, \ - tasks_controller, qos_controller + tasks_controller, qos_controller, backup_controller from simplyblock_core.controllers import health_controller from simplyblock_core.models.pool import Pool from simplyblock_core.models.cluster import Cluster @@ -666,6 +666,18 @@ def qos__list(self, sub_command, args): def qos__delete(self, sub_command, args): return qos_controller.delete_class(args.name, args.cluster_id) + def backup__create(self, sub_command, args): + return backup_controller.create_backup(args.tag_name) + + def backup__list(self, sub_command, args): + return backup_controller.list_backups() + + def backup__status(self, sub_command, args): + return backup_controller.backup_status() + + def backup__restore(self, sub_command, args): + return backup_controller.backup_restore(args.name) + def storage_node_list_devices(self, args): node_id = args.node_id is_json = args.json diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py new file mode 100644 index 000000000..f09f625cb --- /dev/null +++ b/simplyblock_core/controllers/backup_controller.py @@ -0,0 +1,90 @@ +# coding=utf-8 +import datetime +import logging as lg + +import docker + +from simplyblock_core import utils, constants +from simplyblock_core.db_controller import DBController + + +logger = lg.getLogger() +db_controller = DBController() + +backup_path = "file:///etc/foundationdb/backup" + + +def __get_fdb_cont(): + snode = db_controller.get_mgmt_nodes()[0] + if not snode: + return False + node_docker = docker.DockerClient(base_url=f"tcp://{snode.docker_ip_port}", version="auto") + for container in node_docker.containers.list(): + if container.name.startswith("app_fdb-server"): + return container + + +def create_backup(tag_name): + container = __get_fdb_cont() + if container: + res = container.exec_run(cmd=f"fdbbackup start -d {backup_path}") + cont = res.output.decode("utf-8") + logger.info(cont) + # logger.info(f"backup start: {backup_path}") + + return True + + + +def list_backups(): + container = __get_fdb_cont() + data = [] + if container: + res = container.exec_run(cmd=f"fdbbackup list -b {backup_path}") + logger.info(f"backup list from : {backup_path}") + cont = res.output.decode("utf-8") + for line in cont.splitlines(): + if not line or "backup-" not in line: + continue + + name = line.split("/")[-1].strip() + size = 0 + restorable = 0 + res = container.exec_run(cmd=f"fdbbackup describe -d {line}") + cont = res.output.decode("utf-8") + for line in cont.splitlines(): + if line and line.startswith("SnapshotBytes"): + size = line.split()[1].strip() + if line and line.startswith("Restorable"): + restorable = line.split()[1].strip() + date = datetime.datetime.strptime(name.replace("backup-",""), "%Y-%m-%d-%H-%M-%S.%f").strftime( + "%H:%M:%S, %d/%m/%Y") + + data.append({ + "Name": name, + "Size": utils.humanbytes(size), + "Restorable": restorable, + "Date": date, + }) + + return utils.print_table(data) + + return True + + + +def backup_status(): + container = __get_fdb_cont() + if container: + res = container.exec_run(cmd="fdbbackup status") + cont = res.output.decode("utf-8") + logger.info(f"backup status: \n{cont.strip()}") + + +def backup_restore(backup_name): + container = __get_fdb_cont() + if container: + res = container.exec_run(cmd=f"fdbrestore start -r {backup_path}/{backup_name} --dest-cluster-file {constants.KVD_DB_FILE_PATH}") + cont = res.output.decode("utf-8") + logger.info(cont.strip()) + From 2d591d67f16ce1da46d2c2923e5d7b31f134f708 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Tue, 16 Dec 2025 17:03:16 +0300 Subject: [PATCH 02/16] Fix linter and type checker issues --- simplyblock_core/controllers/backup_controller.py | 2 +- simplyblock_web/api/internal/storage_node/docker.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index f09f625cb..198352d06 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -20,7 +20,7 @@ def __get_fdb_cont(): return False node_docker = docker.DockerClient(base_url=f"tcp://{snode.docker_ip_port}", version="auto") for container in node_docker.containers.list(): - if container.name.startswith("app_fdb-server"): + if container.name.startswith("app_fdb-server"): # type: ignore[union-attr] return container diff --git a/simplyblock_web/api/internal/storage_node/docker.py b/simplyblock_web/api/internal/storage_node/docker.py index af74a76b8..0a384cedb 100644 --- a/simplyblock_web/api/internal/storage_node/docker.py +++ b/simplyblock_web/api/internal/storage_node/docker.py @@ -4,7 +4,6 @@ import math import os from pathlib import Path -import subprocess import time from typing import List, Optional, Union From 3e364926f60e48eb839c18bfef333226578372fa Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 08:41:31 +0300 Subject: [PATCH 03/16] wip 2 wip --- simplyblock_cli/cli-reference.yaml | 12 ++++ simplyblock_cli/cli.py | 8 +++ simplyblock_cli/clibase.py | 3 + simplyblock_core/constants.py | 1 + .../controllers/backup_controller.py | 29 +++++--- simplyblock_core/models/cluster.py | 3 + .../services/tasks_runner_fdb_backup.py | 68 +++++++++++++++++++ 7 files changed, 114 insertions(+), 10 deletions(-) create mode 100644 simplyblock_core/services/tasks_runner_fdb_backup.py diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index c17aad796..6cad46108 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1971,3 +1971,15 @@ commands: help: "backup class name" dest: name type: str + - name: config + help: "restore a backup" + arguments: + - name: "backup_path" + help: 'backup path, defaults to local: file:///etc/foundationdb/backup, can be s3 bucket blobstore link, + blobstore://{API_KEY}:{API_SECRET}@s3.us-east-2.amazonaws.com/backup-fdb?bucket=fdb-backup-dir®ion=us-east-2&sc=0' + dest: backup_path + type: str + - name: "backup_frequency" + help: "backup frequency, can be 3h, 1d" + dest: backup_frequency + type: str diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 77827e5a7..f22f6321b 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -802,6 +802,7 @@ def init_backup(self): self.init_backup__list(subparser) self.init_backup__status(subparser) self.init_backup__restore(subparser) + self.init_backup__config(subparser) def init_backup__create(self, subparser): @@ -818,6 +819,11 @@ def init_backup__restore(self, subparser): subcommand = self.add_sub_command(subparser, 'restore', 'restore a backup') subcommand.add_argument('name', help='backup class name', type=str) + def init_backup__config(self, subparser): + subcommand = self.add_sub_command(subparser, 'config', 'restore a backup') + subcommand.add_argument('backup_path', help='backup path, defaults to local: file:///etc/foundationdb/backup, can be s3 bucket blobstore link, blobstore://[API_KEY]:[API_SECRET]@s3.us-east-2.amazonaws.com/backup-2025-12-28?bucket=fdb-backup-dir®ion=us-east-2&sc=0', type=str) + subcommand.add_argument('backup_frequency', help='backup frequency, can be 3h, 1d', type=str) + def run(self): args = self.parser.parse_args() @@ -1161,6 +1167,8 @@ def run(self): ret = self.backup__status(sub_command, args) elif sub_command in ['restore']: ret = self.backup__restore(sub_command, args) + elif sub_command in ['config']: + ret = self.backup__config(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 86e076dfe..a1f98af7f 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -678,6 +678,9 @@ def backup__status(self, sub_command, args): def backup__restore(self, sub_command, args): return backup_controller.backup_restore(args.name) + def backup__config(self, sub_command, args): + return backup_controller.backup_configure(args.backup_path, args.backup_frequency) + def storage_node_list_devices(self, args): node_id = args.node_id is_json = args.json diff --git a/simplyblock_core/constants.py b/simplyblock_core/constants.py index b9921d6ac..b8c05346d 100644 --- a/simplyblock_core/constants.py +++ b/simplyblock_core/constants.py @@ -26,6 +26,7 @@ def get_config_var(name, default=None): KVD_DB_VERSION = 730 KVD_DB_FILE_PATH = os.getenv('FDB_CLUSTER_FILE', '/etc/foundationdb/fdb.cluster') KVD_DB_TIMEOUT_MS = 10000 +KVD_DB_BACKUP_PATH = "file:///etc/foundationdb/backup" SPK_DIR = '/home/ec2-user/spdk' LOG_LEVEL = logging.INFO LOG_WEB_LEVEL = logging.DEBUG diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index 198352d06..971f58048 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -11,30 +11,30 @@ logger = lg.getLogger() db_controller = DBController() -backup_path = "file:///etc/foundationdb/backup" - +backup_path = constants.KVD_DB_BACKUP_PATH +clusters = db_controller.get_clusters() +if clusters: + backup_path = clusters[0].backup_path +if not backup_path: + backup_path = constants.KVD_DB_BACKUP_PATH def __get_fdb_cont(): snode = db_controller.get_mgmt_nodes()[0] if not snode: - return False + return node_docker = docker.DockerClient(base_url=f"tcp://{snode.docker_ip_port}", version="auto") for container in node_docker.containers.list(): if container.name.startswith("app_fdb-server"): # type: ignore[union-attr] return container - def create_backup(tag_name): container = __get_fdb_cont() if container: res = container.exec_run(cmd=f"fdbbackup start -d {backup_path}") cont = res.output.decode("utf-8") logger.info(cont) - # logger.info(f"backup start: {backup_path}") - - return True - - + return True + return False def list_backups(): container = __get_fdb_cont() @@ -84,7 +84,16 @@ def backup_status(): def backup_restore(backup_name): container = __get_fdb_cont() if container: - res = container.exec_run(cmd=f"fdbrestore start -r {backup_path}/{backup_name} --dest-cluster-file {constants.KVD_DB_FILE_PATH}") + res = container.exec_run(cmd=f"fdbcli --exec \"writemode on; clearrange \"\" \xff\"; fdbrestore start -r {backup_path} --dest-cluster-file {constants.KVD_DB_FILE_PATH} -v {backup_name}") cont = res.output.decode("utf-8") logger.info(cont.strip()) + +def backup_configure(backup_path, backup_frequency): + clusters = db_controller.get_clusters() + if clusters: + clusters[0].backup_path = backup_path + clusters[0].backup_frequency_seconds = backup_frequency + clusters[0].write_to_db() + return True + diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 620309f77..b924ddc95 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -2,6 +2,7 @@ from typing import List +from simplyblock_core import constants from simplyblock_core.models.base_model import BaseModel @@ -69,6 +70,8 @@ class Cluster(BaseModel): is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False + backup_path: str = constants.KVD_DB_BACKUP_PATH + backup_frequency_seconds: int = 3*60*60 def get_status_code(self): if self.status in self.STATUS_CODE_MAP: diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py new file mode 100644 index 000000000..eb9282f61 --- /dev/null +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -0,0 +1,68 @@ +# coding=utf-8 +import time + + +from simplyblock_core import db_controller, storage_node_ops, utils, constants +from simplyblock_core.controllers import backup_controller +from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.cluster import Cluster + + +logger = utils.get_logger(__name__) + +# get DB controller +db = db_controller.DBController() + + +def process_task(task): + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return False + + if task.retry >= task.max_retry: + task.function_result = "max retry reached" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return True + + if db.get_cluster_by_id(cl.get_id()).status == Cluster.STATUS_IN_ACTIVATION: + task.function_result = "Cluster is in_activation, waiting" + task.status = JobSchedule.STATUS_NEW + task.write_to_db(db.kv_store) + return False + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + try: + res = storage_node_ops.add_node(**task.function_params) + msg = f"Node add result: {res}" + logger.info(msg) + task.function_result = msg + if res: + task.status = JobSchedule.STATUS_DONE + else: + task.retry += 1 + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return True + except Exception as e: + logger.error(e) + return False + + +logger.info("Starting Tasks runner fdb backup...") + +while True: + clusters = db.get_clusters() + if not clusters: + logger.error("No clusters found!") + else: + + b = backup_controller.create_backup() + time.sleep(clusters[0].backup_frequency) + + time.sleep(constants.TASK_EXEC_INTERVAL_SEC) From d21d20e91d00671b79e6804d27c47a1fe04c84ad Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 09:18:46 +0300 Subject: [PATCH 04/16] fix --- simplyblock_core/controllers/backup_controller.py | 5 ++--- simplyblock_core/env_var | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index 971f58048..e2eb1bfce 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -15,8 +15,7 @@ clusters = db_controller.get_clusters() if clusters: backup_path = clusters[0].backup_path -if not backup_path: - backup_path = constants.KVD_DB_BACKUP_PATH + def __get_fdb_cont(): snode = db_controller.get_mgmt_nodes()[0] @@ -62,7 +61,7 @@ def list_backups(): data.append({ "Name": name, - "Size": utils.humanbytes(size), + "Size": utils.humanbytes(int(size)), "Restorable": restorable, "Date": date, }) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 027f0d179..c2d20c223 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -1,5 +1,5 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.30 -SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:R25.10-Hotfix +SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:R25.10-Hotfix-fdb-backup SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:R25.10-Hotfix-latest From a28da7df2bb04674c7c2bded7b3e2f35633a4962 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 12:19:38 +0300 Subject: [PATCH 05/16] wip --- simplyblock_cli/cli-reference.yaml | 15 +++++++++++++-- simplyblock_cli/clibase.py | 2 +- .../controllers/backup_controller.py | 19 ++++++++++++++----- simplyblock_core/models/cluster.py | 12 +++++++++++- 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 6cad46108..eaf01d512 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1975,11 +1975,22 @@ commands: help: "restore a backup" arguments: - name: "backup_path" - help: 'backup path, defaults to local: file:///etc/foundationdb/backup, can be s3 bucket blobstore link, - blobstore://{API_KEY}:{API_SECRET}@s3.us-east-2.amazonaws.com/backup-fdb?bucket=fdb-backup-dir®ion=us-east-2&sc=0' + help: 'local backup path, defaults to /etc/foundationdb/backup' dest: backup_path type: str - name: "backup_frequency" help: "backup frequency, can be 3h, 1d" dest: backup_frequency type: str + - name: "--s3-bucket" + help: 'AWS S3 bucket name' + dest: bucket_name + type: str + - name: "--s3-region" + help: 'AWS S3 region' + dest: region_name + type: str + - name: "--s3-credentials" + help: 'AWS S3 API key and secret, should be supplied like this: [API_KEY]:[API_SECRET]' + dest: backup_credentials + type: str \ No newline at end of file diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index a1f98af7f..347dd002d 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -679,7 +679,7 @@ def backup__restore(self, sub_command, args): return backup_controller.backup_restore(args.name) def backup__config(self, sub_command, args): - return backup_controller.backup_configure(args.backup_path, args.backup_frequency) + return backup_controller.backup_configure(args.backup_path, args.backup_frequency, args.bucket_name, args.region_name, args.backup_credentials) def storage_node_list_devices(self, args): node_id = args.node_id diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index e2eb1bfce..3fdb1504e 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -10,11 +10,10 @@ logger = lg.getLogger() db_controller = DBController() - -backup_path = constants.KVD_DB_BACKUP_PATH +# backup_path = constants.KVD_DB_BACKUP_PATH clusters = db_controller.get_clusters() if clusters: - backup_path = clusters[0].backup_path + cluster = clusters[0] def __get_fdb_cont(): @@ -29,6 +28,11 @@ def __get_fdb_cont(): def create_backup(tag_name): container = __get_fdb_cont() if container: + backup_path = cluster.backup_local_path + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + folder = f"backup-{str(datetime.datetime.now())}" + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}&{cluster.backup_s3_region}&sc=0" + res = container.exec_run(cmd=f"fdbbackup start -d {backup_path}") cont = res.output.decode("utf-8") logger.info(cont) @@ -39,6 +43,7 @@ def list_backups(): container = __get_fdb_cont() data = [] if container: + backup_path = cluster.get_backup_path() res = container.exec_run(cmd=f"fdbbackup list -b {backup_path}") logger.info(f"backup list from : {backup_path}") cont = res.output.decode("utf-8") @@ -83,16 +88,20 @@ def backup_status(): def backup_restore(backup_name): container = __get_fdb_cont() if container: + backup_path = cluster.get_backup_path() res = container.exec_run(cmd=f"fdbcli --exec \"writemode on; clearrange \"\" \xff\"; fdbrestore start -r {backup_path} --dest-cluster-file {constants.KVD_DB_FILE_PATH} -v {backup_name}") cont = res.output.decode("utf-8") logger.info(cont.strip()) -def backup_configure(backup_path, backup_frequency): +def backup_configure(backup_path, backup_frequency, bucket_name, region_name, backup_credentials): clusters = db_controller.get_clusters() if clusters: - clusters[0].backup_path = backup_path + clusters[0].backup_local_path = backup_path clusters[0].backup_frequency_seconds = backup_frequency + clusters[0].backup_s3_region = region_name + clusters[0].backup_s3_bucket = bucket_name + clusters[0].backup_s3_cred = backup_credentials clusters[0].write_to_db() return True diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index b924ddc95..389d3c07c 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -70,8 +70,11 @@ class Cluster(BaseModel): is_re_balancing: bool = False full_page_unmap: bool = True is_single_node: bool = False - backup_path: str = constants.KVD_DB_BACKUP_PATH + backup_local_path: str = constants.KVD_DB_BACKUP_PATH backup_frequency_seconds: int = 3*60*60 + backup_s3_bucket: str = "" + backup_s3_region: str = "" + backup_s3_cred: str = "" def get_status_code(self): if self.status in self.STATUS_CODE_MAP: @@ -93,3 +96,10 @@ def is_qos_set(self) -> bool: return True return False + def get_backup_path(self): + if self.backup_s3_bucket and self.backup_s3_cred: + backup_path = f"blobstore://{self.backup_s3_cred}@s3.{self.backup_s3_region}.amazonaws.com/?bucket={self.backup_s3_bucket}" \ + + f"®ion={self.backup_s3_region}&sc=0" + else: + backup_path = self.backup_local_path + return backup_path From fa27bf938aaf7edb8cee6dc55de7af13b71caf7e Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 12:27:46 +0300 Subject: [PATCH 06/16] wip --- simplyblock_cli/cli-reference.yaml | 4 ++-- simplyblock_cli/cli.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index eaf01d512..6aa55d212 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1974,11 +1974,11 @@ commands: - name: config help: "restore a backup" arguments: - - name: "backup_path" + - name: "--backup-path" help: 'local backup path, defaults to /etc/foundationdb/backup' dest: backup_path type: str - - name: "backup_frequency" + - name: "--backup-frequency" help: "backup frequency, can be 3h, 1d" dest: backup_frequency type: str diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index f22f6321b..6ed8e4a9d 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -821,8 +821,11 @@ def init_backup__restore(self, subparser): def init_backup__config(self, subparser): subcommand = self.add_sub_command(subparser, 'config', 'restore a backup') - subcommand.add_argument('backup_path', help='backup path, defaults to local: file:///etc/foundationdb/backup, can be s3 bucket blobstore link, blobstore://[API_KEY]:[API_SECRET]@s3.us-east-2.amazonaws.com/backup-2025-12-28?bucket=fdb-backup-dir®ion=us-east-2&sc=0', type=str) - subcommand.add_argument('backup_frequency', help='backup frequency, can be 3h, 1d', type=str) + argument = subcommand.add_argument('--backup-path', help='local backup path, defaults to /etc/foundationdb/backup', type=str, dest='backup_path') + argument = subcommand.add_argument('--backup-frequency', help='backup frequency, can be 3h, 1d', type=str, dest='backup_frequency') + argument = subcommand.add_argument('--s3-bucket', help='AWS S3 bucket name', type=str, dest='bucket_name') + argument = subcommand.add_argument('--s3-region', help='AWS S3 region', type=str, dest='region_name') + argument = subcommand.add_argument('--s3-credentials', help='AWS S3 API key and secret, should be supplied like this: [API_KEY]:[API_SECRET]', type=str, dest='backup_credentials') def run(self): From f690a30d59e9c716cc47855521d202cd072f6898 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 13:52:07 +0300 Subject: [PATCH 07/16] wip --- .../controllers/backup_controller.py | 33 +++++++++++++++---- simplyblock_core/models/cluster.py | 6 ++-- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index 3fdb1504e..f99786581 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -31,7 +31,10 @@ def create_backup(tag_name): backup_path = cluster.backup_local_path if cluster.backup_s3_bucket and cluster.backup_s3_cred: folder = f"backup-{str(datetime.datetime.now())}" - backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}&{cluster.backup_s3_region}&sc=0" + folder = folder.replace(" ", "-") + folder = folder.replace(":", "-") + folder = folder.split(".")[0] + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" res = container.exec_run(cmd=f"fdbbackup start -d {backup_path}") cont = res.output.decode("utf-8") @@ -52,20 +55,33 @@ def list_backups(): continue name = line.split("/")[-1].strip() + name = name.split("?")[0] size = 0 restorable = 0 - res = container.exec_run(cmd=f"fdbbackup describe -d {line}") + date = "" + version = 0 + res = container.exec_run(cmd=f"fdbbackup describe -d {cluster.get_backup_path(name)} --version-timestamps") cont = res.output.decode("utf-8") for line in cont.splitlines(): if line and line.startswith("SnapshotBytes"): size = line.split()[1].strip() if line and line.startswith("Restorable"): restorable = line.split()[1].strip() - date = datetime.datetime.strptime(name.replace("backup-",""), "%Y-%m-%d-%H-%M-%S.%f").strftime( - "%H:%M:%S, %d/%m/%Y") + if line and line.startswith("Snapshot:"): + for param in line.split(): + if param.startswith("startVersion"): + version = param.split("=")[1].strip() + elif param.startswith("(") and param.endswith(")") and not date:# 2025/12/28.10:10:20+0000 + try: + date = datetime.datetime.strptime(param[1:-1], "%Y/%m/%d.%H:%M:%S+0000").strftime("%Y-%m-%d %H:%M:%S") + except Exception: + date = name.replace("backup-","") + if not date: + date = name.replace("backup-", "") data.append({ "Name": name, + "Version": version, "Size": utils.humanbytes(int(size)), "Restorable": restorable, "Date": date, @@ -83,15 +99,20 @@ def backup_status(): res = container.exec_run(cmd="fdbbackup status") cont = res.output.decode("utf-8") logger.info(f"backup status: \n{cont.strip()}") + return True def backup_restore(backup_name): container = __get_fdb_cont() if container: - backup_path = cluster.get_backup_path() - res = container.exec_run(cmd=f"fdbcli --exec \"writemode on; clearrange \"\" \xff\"; fdbrestore start -r {backup_path} --dest-cluster-file {constants.KVD_DB_FILE_PATH} -v {backup_name}") + backup_path = cluster.get_backup_path(backup_name) + res = container.exec_run(cmd=f"fdbcli --exec \"writemode on; clearrange \\\"\\\" \\xff\"") cont = res.output.decode("utf-8") logger.info(cont.strip()) + res = container.exec_run(cmd=f"fdbrestore start -r \"{backup_path}\" --dest-cluster-file {constants.KVD_DB_FILE_PATH}") + cont = res.output.decode("utf-8") + logger.info(cont.strip()) + return True def backup_configure(backup_path, backup_frequency, bucket_name, region_name, backup_credentials): diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 389d3c07c..239403bae 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -96,10 +96,10 @@ def is_qos_set(self) -> bool: return True return False - def get_backup_path(self): + def get_backup_path(self, path=""): if self.backup_s3_bucket and self.backup_s3_cred: - backup_path = f"blobstore://{self.backup_s3_cred}@s3.{self.backup_s3_region}.amazonaws.com/?bucket={self.backup_s3_bucket}" \ + backup_path = f"blobstore://{self.backup_s3_cred}@s3.{self.backup_s3_region}.amazonaws.com/{path}?bucket={self.backup_s3_bucket}" \ + f"®ion={self.backup_s3_region}&sc=0" else: - backup_path = self.backup_local_path + backup_path = path.join([self.backup_local_path, path]) return backup_path From 9851b7754ab0c7dac9ac1e64fc17c47eb9f714c1 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 14:21:55 +0300 Subject: [PATCH 08/16] wip --- simplyblock_core/controllers/backup_controller.py | 13 ++++++++----- simplyblock_core/models/cluster.py | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index f99786581..3378d0ebd 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -118,11 +118,14 @@ def backup_restore(backup_name): def backup_configure(backup_path, backup_frequency, bucket_name, region_name, backup_credentials): clusters = db_controller.get_clusters() if clusters: - clusters[0].backup_local_path = backup_path - clusters[0].backup_frequency_seconds = backup_frequency - clusters[0].backup_s3_region = region_name - clusters[0].backup_s3_bucket = bucket_name - clusters[0].backup_s3_cred = backup_credentials + if backup_path: + if not backup_path.startswith("file://"): + backup_path = f"file://{backup_path}" + clusters[0].backup_local_path = backup_path + clusters[0].backup_frequency_seconds = backup_frequency if backup_frequency else "" + clusters[0].backup_s3_region = region_name if region_name else "" + clusters[0].backup_s3_bucket = bucket_name if bucket_name else "" + clusters[0].backup_s3_cred = backup_credentials if backup_credentials else "" clusters[0].write_to_db() return True diff --git a/simplyblock_core/models/cluster.py b/simplyblock_core/models/cluster.py index 239403bae..02277feb0 100644 --- a/simplyblock_core/models/cluster.py +++ b/simplyblock_core/models/cluster.py @@ -1,5 +1,5 @@ # coding=utf-8 - +import os.path from typing import List from simplyblock_core import constants @@ -101,5 +101,5 @@ def get_backup_path(self, path=""): backup_path = f"blobstore://{self.backup_s3_cred}@s3.{self.backup_s3_region}.amazonaws.com/{path}?bucket={self.backup_s3_bucket}" \ + f"®ion={self.backup_s3_region}&sc=0" else: - backup_path = path.join([self.backup_local_path, path]) + backup_path = os.path.join(self.backup_local_path, path) return backup_path From b16ab46b5273b90d88cc99d277a1c10f70977ed1 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 14:59:46 +0300 Subject: [PATCH 09/16] wip --- simplyblock_cli/clibase.py | 2 +- .../controllers/backup_controller.py | 32 +++++++++++++++++-- .../services/tasks_runner_fdb_backup.py | 4 +-- 3 files changed, 32 insertions(+), 6 deletions(-) diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 347dd002d..f0096a7bf 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -667,7 +667,7 @@ def qos__delete(self, sub_command, args): return qos_controller.delete_class(args.name, args.cluster_id) def backup__create(self, sub_command, args): - return backup_controller.create_backup(args.tag_name) + return backup_controller.create_backup() def backup__list(self, sub_command, args): return backup_controller.list_backups() diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index 3378d0ebd..038c0c664 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -1,6 +1,7 @@ # coding=utf-8 import datetime import logging as lg +import re import docker @@ -25,7 +26,7 @@ def __get_fdb_cont(): if container.name.startswith("app_fdb-server"): # type: ignore[union-attr] return container -def create_backup(tag_name): +def create_backup(): container = __get_fdb_cont() if container: backup_path = cluster.backup_local_path @@ -106,7 +107,7 @@ def backup_restore(backup_name): container = __get_fdb_cont() if container: backup_path = cluster.get_backup_path(backup_name) - res = container.exec_run(cmd=f"fdbcli --exec \"writemode on; clearrange \\\"\\\" \\xff\"") + res = container.exec_run(cmd="fdbcli --exec \"writemode on; clearrange \\\"\\\" \\xff\"") cont = res.output.decode("utf-8") logger.info(cont.strip()) res = container.exec_run(cmd=f"fdbrestore start -r \"{backup_path}\" --dest-cluster-file {constants.KVD_DB_FILE_PATH}") @@ -115,6 +116,29 @@ def backup_restore(backup_name): return True +def parse_history_param(history_string): + if not history_string: + logger.error("Invalid history value") + return False + results = re.search(r'^(\d+[hmd])(\d+[hmd])?$', history_string.lower()) + if not results: + logger.error(f"Error parsing history string: {history_string}") + return False + total_seconds = 0 + for s in results.groups(): + if not s: + continue + ind = s[-1] + v = int(s[:-1]) + if ind == 'd': + total_seconds += v*60*60*24 + if ind == 'h': + total_seconds += v*60*60 + if ind == 'm': + total_seconds += v*60 + return int(total_seconds) + + def backup_configure(backup_path, backup_frequency, bucket_name, region_name, backup_credentials): clusters = db_controller.get_clusters() if clusters: @@ -122,7 +146,9 @@ def backup_configure(backup_path, backup_frequency, bucket_name, region_name, ba if not backup_path.startswith("file://"): backup_path = f"file://{backup_path}" clusters[0].backup_local_path = backup_path - clusters[0].backup_frequency_seconds = backup_frequency if backup_frequency else "" + if backup_frequency: + total_seconds = parse_history_param(backup_frequency) + clusters[0].backup_frequency_seconds = total_seconds clusters[0].backup_s3_region = region_name if region_name else "" clusters[0].backup_s3_bucket = bucket_name if bucket_name else "" clusters[0].backup_s3_cred = backup_credentials if backup_credentials else "" diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py index eb9282f61..6487fdb34 100644 --- a/simplyblock_core/services/tasks_runner_fdb_backup.py +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -27,7 +27,7 @@ def process_task(task): task.write_to_db(db.kv_store) return True - if db.get_cluster_by_id(cl.get_id()).status == Cluster.STATUS_IN_ACTIVATION: + if db.get_cluster_by_id(task.cluster_id).status == Cluster.STATUS_IN_ACTIVATION: task.function_result = "Cluster is in_activation, waiting" task.status = JobSchedule.STATUS_NEW task.write_to_db(db.kv_store) @@ -63,6 +63,6 @@ def process_task(task): else: b = backup_controller.create_backup() - time.sleep(clusters[0].backup_frequency) + time.sleep(clusters[0].backup_frequency_seconds) time.sleep(constants.TASK_EXEC_INTERVAL_SEC) From d6d41adeee40fea264648902b46cc9f8b6ca1555 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Sun, 28 Dec 2025 15:27:11 +0300 Subject: [PATCH 10/16] wip --- simplyblock_core/test/test_utils.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/simplyblock_core/test/test_utils.py b/simplyblock_core/test/test_utils.py index 514ba0bbe..5cdb66b1a 100644 --- a/simplyblock_core/test/test_utils.py +++ b/simplyblock_core/test/test_utils.py @@ -1,9 +1,11 @@ import uuid from typing import ContextManager +from unittest.mock import patch import pytest from simplyblock_core import utils, storage_node_ops +from simplyblock_core.db_controller import DBController from simplyblock_core.models.nvme_device import JMDevice, RemoteJMDevice from simplyblock_core.models.storage_node import StorageNode from simplyblock_core.utils import helpers, parse_thread_siblings_list @@ -153,24 +155,26 @@ def test_parse_thread_siblings_list(input, expected): -def test_get_node_jm_names(): +@patch.object(DBController, 'get_jm_device_by_id') +def test_get_node_jm_names(db_controller_get_jm_device_by_id): + jm_devices = [] node_1_jm = JMDevice() node_1_jm.uuid = "node_1_jm_id" node_1_jm.jm_bdev = "node_1_jm" - + jm_devices.append(node_1_jm) node_2_jm = JMDevice() node_2_jm.uuid = "node_2_jm_id" node_2_jm.jm_bdev = "node_2_jm" - + jm_devices.append(node_2_jm) node_3_jm = JMDevice() node_3_jm.uuid = "node_3_jm_id" node_3_jm.jm_bdev = "node_3_jm" - + jm_devices.append(node_3_jm) node_4_jm = JMDevice() node_4_jm.uuid = "node_4_jm_id" node_4_jm.jm_bdev = "node_4_jm" - + jm_devices.append(node_4_jm) node_1 = StorageNode() node_1.uuid = str(uuid.uuid4()) node_1.enable_ha_jm = True @@ -178,6 +182,13 @@ def test_get_node_jm_names(): node_1.jm_device = node_1_jm node_1.jm_ids = ["node_2_jm_id", "node_3_jm_id", "node_4_jm_id"] + def get_jm_device_by_id(jm_id): + for dev in jm_devices: + if dev.uuid == jm_id: + return dev + + db_controller_get_jm_device_by_id.side_effect = get_jm_device_by_id + remote_node = StorageNode() remote_node.uuid = str(uuid.uuid4()) remote_node.enable_ha_jm = True From bc672309254bfe2dc8fe200c197cdfb7aa4282b0 Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Mon, 29 Dec 2025 22:12:58 +0300 Subject: [PATCH 11/16] Adds backup service def to docker swarm and k8s --- simplyblock_cli/cli-reference.yaml | 10 +-- simplyblock_cli/cli.py | 4 +- simplyblock_cli/clibase.py | 2 +- simplyblock_core/cluster_ops.py | 15 ++++ .../controllers/backup_controller.py | 6 +- .../controllers/tasks_controller.py | 35 ++++++++- simplyblock_core/models/job_schedule.py | 1 + .../scripts/charts/templates/app_k8s.yaml | 51 ++++++++++++ .../scripts/docker-compose-swarm.yml | 14 ++++ simplyblock_core/services/cap_monitor.py | 16 +++- .../services/tasks_runner_fdb_backup.py | 77 ++++++++----------- 11 files changed, 170 insertions(+), 61 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 6aa55d212..39f0e8058 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1954,12 +1954,10 @@ commands: - name: create help: "Creates an fdb backup" arguments: - - name: "--tag-name" - help: "backup tag name" - dest: tag_name + - name: "cluster_id" + help: "Cluster ID to create db backup for" + dest: cluster_id type: str - required: false - default: "" - name: list help: "Lists all fdb backups" - name: status @@ -1972,7 +1970,7 @@ commands: dest: name type: str - name: config - help: "restore a backup" + help: "Set backup configuration" arguments: - name: "--backup-path" help: 'local backup path, defaults to /etc/foundationdb/backup' diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index 6ed8e4a9d..d98e3cdc1 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -807,7 +807,7 @@ def init_backup(self): def init_backup__create(self, subparser): subcommand = self.add_sub_command(subparser, 'create', 'Creates an fdb backup') - argument = subcommand.add_argument('--tag-name', help='backup tag name', type=str, default='', dest='tag_name', required=False) + subcommand.add_argument('cluster_id', help='Cluster ID to create db backup for', type=str) def init_backup__list(self, subparser): subcommand = self.add_sub_command(subparser, 'list', 'Lists all fdb backups') @@ -820,7 +820,7 @@ def init_backup__restore(self, subparser): subcommand.add_argument('name', help='backup class name', type=str) def init_backup__config(self, subparser): - subcommand = self.add_sub_command(subparser, 'config', 'restore a backup') + subcommand = self.add_sub_command(subparser, 'config', 'Set backup configuration') argument = subcommand.add_argument('--backup-path', help='local backup path, defaults to /etc/foundationdb/backup', type=str, dest='backup_path') argument = subcommand.add_argument('--backup-frequency', help='backup frequency, can be 3h, 1d', type=str, dest='backup_frequency') argument = subcommand.add_argument('--s3-bucket', help='AWS S3 bucket name', type=str, dest='bucket_name') diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index f0096a7bf..48194d39b 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -667,7 +667,7 @@ def qos__delete(self, sub_command, args): return qos_controller.delete_class(args.name, args.cluster_id) def backup__create(self, sub_command, args): - return backup_controller.create_backup() + return tasks_controller.add_backup_task(args.cluster_id) def backup__list(self, sub_command, args): return backup_controller.list_backups() diff --git a/simplyblock_core/cluster_ops.py b/simplyblock_core/cluster_ops.py index ceff1374d..bf1c3ca00 100644 --- a/simplyblock_core/cluster_ops.py +++ b/simplyblock_core/cluster_ops.py @@ -1193,6 +1193,13 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, service_file="python simplyblock_core/services/tasks_runner_jc_comp.py", service_image=service_image) + if "app_BackupService" not in service_names: + utils.create_docker_service( + cluster_docker=cluster_docker, + service_name="app_BackupService", + service_file="python simplyblock_core/services/tasks_runner_fdb_backup.py", + service_image=service_image) + logger.info("Done updating mgmt cluster") elif cluster.mode == "kubernetes": @@ -1239,6 +1246,14 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None, service_file="simplyblock_core/services/snapshot_monitor.py", container_image=service_image) + if "simplyblock-backup-service" not in deployment_names: + utils.create_k8s_service( + namespace=namespace, + deployment_name="simplyblock-backup-service", + container_name="snapshot-monitor", + service_file="simplyblock_core/services/tasks_runner_fdb_backup.py", + container_image=service_image) + # Update DaemonSets daemonsets = apps_v1.list_namespaced_daemon_set(namespace=namespace) for ds in daemonsets.items: diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index 038c0c664..da1568d12 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -2,12 +2,14 @@ import datetime import logging as lg import re +import time +import uuid import docker from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController - +from simplyblock_core.models.job_schedule import JobSchedule logger = lg.getLogger() db_controller = DBController() @@ -37,7 +39,7 @@ def create_backup(): folder = folder.split(".")[0] backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" - res = container.exec_run(cmd=f"fdbbackup start -d {backup_path}") + res = container.exec_run(cmd=f"fdbbackup start -d {backup_path} -w") cont = res.output.decode("utf-8") logger.info(cont) return True diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index dab539943..e8c509e72 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -160,8 +160,7 @@ def list_tasks(cluster_id, is_json=False, limit=50, **kwargs): return False data = [] - tasks = db.get_job_tasks(cluster_id, reverse=True) - tasks.reverse() + tasks = db.get_job_tasks(cluster_id, reverse=False) if is_json is True: for t in tasks: if t.function_name == JobSchedule.FN_DEV_MIG: @@ -420,3 +419,35 @@ def get_lvol_sync_del_task(cluster_id, node_id, lvol_bdev_name=None): return task.uuid return False + +def add_backup_task(cluster_id): + try: + cluster = db.get_cluster_by_id(cluster_id) + except Exception as e: + logger.error(f"Failed to get cluster {cluster_id}: {e}") + return False + + tasks = get_backup_tasks(cluster_id) + for task in tasks: + if task.status != JobSchedule.STATUS_DONE: + logger.info(f"Backup task found: {tasks[0].uuid}") + return False + + task_obj = JobSchedule() + task_obj.uuid = str(uuid.uuid4()) + task_obj.cluster_id = cluster.get_id() + task_obj.date = int(time.time()) + task_obj.max_retry = constants.TASK_EXEC_RETRY_COUNT + task_obj.status = JobSchedule.STATUS_NEW + task_obj.write_to_db() + logger.info(f"Backup task created: {task_obj.uuid}") + return task_obj.uuid + + +def get_backup_tasks(cluster_id): + backup_tasks = [] + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_FDB_BACKUP: + backup_tasks.append(task) + return backup_tasks diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index bbdcd7871..e16424987 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -23,6 +23,7 @@ class JobSchedule(BaseModel): FN_BALANCING_AFTER_DEV_EXPANSION = "balancing_on_dev_add" FN_JC_COMP_RESUME = "jc_comp_resume" FN_LVOL_SYNC_DEL = "lvol_sync_del" + FN_FDB_BACKUP = "fdb_backup" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/scripts/charts/templates/app_k8s.yaml b/simplyblock_core/scripts/charts/templates/app_k8s.yaml index ab11562d9..458afa322 100644 --- a/simplyblock_core/scripts/charts/templates/app_k8s.yaml +++ b/simplyblock_core/scripts/charts/templates/app_k8s.yaml @@ -1158,6 +1158,57 @@ spec: - key: cluster-file path: fdb.cluster --- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simplyblock-backup-service + namespace: {{ .Release.Namespace }} +spec: + replicas: 1 + selector: + matchLabels: + app: simplyblock-backup-service + template: + metadata: + annotations: + log-collector/enabled: "true" + reloader.stakater.com/auto: "true" + reloader.stakater.com/configmap: "simplyblock-fdb-cluster-config" + labels: + app: simplyblock-backup-service + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: backup-service + image: "{{ .Values.image.simplyblock.repository }}:{{ .Values.image.simplyblock.tag }}" + imagePullPolicy: "{{ .Values.image.simplyblock.pullPolicy }}" + command: ["python", "simplyblock_core/services/tasks_runner_fdb_backup.py"] + env: + - name: SIMPLYBLOCK_LOG_LEVEL + valueFrom: + configMapKeyRef: + name: simplyblock-config + key: LOG_LEVEL + volumeMounts: + - name: fdb-cluster-file + mountPath: /etc/foundationdb/fdb.cluster + subPath: fdb.cluster + resources: + requests: + cpu: "200m" + memory: "256Mi" + limits: + cpu: "400m" + memory: "1Gi" + volumes: + - name: fdb-cluster-file + configMap: + name: simplyblock-fdb-cluster-config + items: + - key: cluster-file + path: fdb.cluster +--- apiVersion: apps/v1 kind: DaemonSet diff --git a/simplyblock_core/scripts/docker-compose-swarm.yml b/simplyblock_core/scripts/docker-compose-swarm.yml index eb29d68b6..f6b5e579a 100644 --- a/simplyblock_core/scripts/docker-compose-swarm.yml +++ b/simplyblock_core/scripts/docker-compose-swarm.yml @@ -364,6 +364,20 @@ services: environment: SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + BackupService: + <<: *service-base + image: $SIMPLYBLOCK_DOCKER_IMAGE + command: "python simplyblock_core/services/tasks_runner_fdb_backup.py" + deploy: + placement: + constraints: [node.role == manager] + volumes: + - "/etc/foundationdb:/etc/foundationdb" + networks: + - hostnet + environment: + SIMPLYBLOCK_LOG_LEVEL: "$LOG_LEVEL" + networks: monitoring-net: external: true diff --git a/simplyblock_core/services/cap_monitor.py b/simplyblock_core/services/cap_monitor.py index c68049256..b9632a0fb 100644 --- a/simplyblock_core/services/cap_monitor.py +++ b/simplyblock_core/services/cap_monitor.py @@ -4,9 +4,9 @@ from datetime import datetime, timezone from simplyblock_core import db_controller, constants, cluster_ops, utils -from simplyblock_core.controllers import cluster_events +from simplyblock_core.controllers import cluster_events, tasks_controller from simplyblock_core.models.cluster import Cluster - +from simplyblock_core.models.job_schedule import JobSchedule logger = utils.get_logger(__name__) @@ -14,10 +14,22 @@ db = db_controller.DBController() last_event: dict[str, dict] = {} + +def create_fdb_backup_if_needed(cluster): + last_backup_task = None + tasks = tasks_controller.get_backup_tasks(cluster.get_id()) + if tasks: + last_backup_task = tasks[0] + if last_backup_task and last_backup_task.status == JobSchedule.STATUS_DONE: + if last_backup_task.date + cluster.backup_frequency_seconds > time.time(): + tasks_controller.add_backup_task(cluster.get_id()) + + logger.info("Starting capacity monitoring service...") while True: clusters = db.get_clusters() for cl in clusters: + create_fdb_backup_if_needed(cl) logger.info(f"Checking cluster: {cl.get_id()}") records = db.get_cluster_capacity(cl, 1) if not records: diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py index 6487fdb34..bc365435c 100644 --- a/simplyblock_core/services/tasks_runner_fdb_backup.py +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -2,8 +2,8 @@ import time -from simplyblock_core import db_controller, storage_node_ops, utils, constants -from simplyblock_core.controllers import backup_controller +from simplyblock_core import db_controller, utils, constants +from simplyblock_core.controllers import backup_controller, tasks_controller from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster @@ -13,47 +13,6 @@ # get DB controller db = db_controller.DBController() - -def process_task(task): - if task.canceled: - task.function_result = "canceled" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - return False - - if task.retry >= task.max_retry: - task.function_result = "max retry reached" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - return True - - if db.get_cluster_by_id(task.cluster_id).status == Cluster.STATUS_IN_ACTIVATION: - task.function_result = "Cluster is in_activation, waiting" - task.status = JobSchedule.STATUS_NEW - task.write_to_db(db.kv_store) - return False - - if task.status != JobSchedule.STATUS_RUNNING: - task.status = JobSchedule.STATUS_RUNNING - task.write_to_db(db.kv_store) - - try: - res = storage_node_ops.add_node(**task.function_params) - msg = f"Node add result: {res}" - logger.info(msg) - task.function_result = msg - if res: - task.status = JobSchedule.STATUS_DONE - else: - task.retry += 1 - task.status = JobSchedule.STATUS_SUSPENDED - task.write_to_db(db.kv_store) - return True - except Exception as e: - logger.error(e) - return False - - logger.info("Starting Tasks runner fdb backup...") while True: @@ -61,8 +20,34 @@ def process_task(task): if not clusters: logger.error("No clusters found!") else: - - b = backup_controller.create_backup() - time.sleep(clusters[0].backup_frequency_seconds) + for cl in clusters: + if cl.status == Cluster.STATUS_IN_ACTIVATION: + continue + + tasks = tasks_controller.get_backup_tasks(cl.get_id()) + for task in tasks: + if task.status != JobSchedule.STATUS_DONE: + task = db.get_task_by_id(task.uuid) + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + continue + + if task.retry >= task.max_retry: + task.function_result = "max retry reached, stopping task" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + continue + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + ret = backup_controller.create_backup() + if ret: + task.function_result = "Backup created" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) time.sleep(constants.TASK_EXEC_INTERVAL_SEC) From c9b13f2510e6eca40f048dd96b57936a6584711a Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 8 Jan 2026 03:04:36 +0300 Subject: [PATCH 12/16] wip --- simplyblock_cli/cli-reference.yaml | 12 +++ simplyblock_cli/cli.py | 8 ++ simplyblock_cli/clibase.py | 3 + .../controllers/backup_controller.py | 18 ++++ .../controllers/snapshot_controller.py | 23 +++- .../controllers/tasks_controller.py | 37 +++++++ simplyblock_core/env_var | 2 +- simplyblock_core/models/job_schedule.py | 1 + simplyblock_core/rpc_client.py | 8 ++ .../services/tasks_runner_fdb_backup.py | 102 +++++++++++++----- 10 files changed, 188 insertions(+), 26 deletions(-) diff --git a/simplyblock_cli/cli-reference.yaml b/simplyblock_cli/cli-reference.yaml index 39f0e8058..c0d08a263 100644 --- a/simplyblock_cli/cli-reference.yaml +++ b/simplyblock_cli/cli-reference.yaml @@ -1899,6 +1899,18 @@ commands: dest: resize type: size default: "0" + - name: backup + help: "create backup task for snapshot" + arguments: + - name: "snapshot_id" + help: "Snapshot id" + dest: snapshot_id + type: str + - name: "--delete-after-finish" + help: "Delete the snapshot after backup is completed" + dest: delete_after_finish + type: bool + action: store_true - name: "qos" help: "qos commands" weight: 700 diff --git a/simplyblock_cli/cli.py b/simplyblock_cli/cli.py index d98e3cdc1..a3e778fc1 100644 --- a/simplyblock_cli/cli.py +++ b/simplyblock_cli/cli.py @@ -749,6 +749,7 @@ def init_snapshot(self): self.init_snapshot__list(subparser) self.init_snapshot__delete(subparser) self.init_snapshot__clone(subparser) + self.init_snapshot__backup(subparser) def init_snapshot__add(self, subparser): @@ -771,6 +772,11 @@ def init_snapshot__clone(self, subparser): subcommand.add_argument('lvol_name', help='Logical volume name', type=str) argument = subcommand.add_argument('--resize', help='New logical volume size: 10M, 10G, 10(bytes). Can only increase.', type=size_type(), default='0', dest='resize') + def init_snapshot__backup(self, subparser): + subcommand = self.add_sub_command(subparser, 'backup', 'create backup task for snapshot') + subcommand.add_argument('snapshot_id', help='Snapshot id', type=str) + argument = subcommand.add_argument('--delete-after-finish', help='Delete the snapshot after backup is completed', dest='delete_after_finish', action='store_true') + def init_qos(self): subparser = self.add_command('qos', 'qos commands') @@ -1146,6 +1152,8 @@ def run(self): ret = self.snapshot__delete(sub_command, args) elif sub_command in ['clone']: ret = self.snapshot__clone(sub_command, args) + elif sub_command in ['backup']: + ret = self.snapshot__backup(sub_command, args) else: self.parser.print_help() diff --git a/simplyblock_cli/clibase.py b/simplyblock_cli/clibase.py index 48194d39b..ed9168f16 100644 --- a/simplyblock_cli/clibase.py +++ b/simplyblock_cli/clibase.py @@ -657,6 +657,9 @@ def snapshot__clone(self, sub_command, args): success, details = snapshot_controller.clone(args.snapshot_id, args.lvol_name, new_size) return details + def snapshot__backup(self, sub_command, args): + return snapshot_controller.create_backup(args.snapshot_id, args.delete_after_finish) + def qos__add(self, sub_command, args): return qos_controller.add_class(args.name, args.weight, args.cluster_id) diff --git a/simplyblock_core/controllers/backup_controller.py b/simplyblock_core/controllers/backup_controller.py index da1568d12..470d39511 100644 --- a/simplyblock_core/controllers/backup_controller.py +++ b/simplyblock_core/controllers/backup_controller.py @@ -10,6 +10,7 @@ from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.snapshot import SnapShot logger = lg.getLogger() db_controller = DBController() @@ -157,3 +158,20 @@ def backup_configure(backup_path, backup_frequency, bucket_name, region_name, ba clusters[0].write_to_db() return True + +def create_snapshot_backup(snapshot: SnapShot): + container = __get_fdb_cont() + if container: + backup_path = cluster.backup_local_path + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + folder = f"backup-{str(datetime.datetime.now())}" + folder = folder.replace(" ", "-") + folder = folder.replace(":", "-") + folder = folder.split(".")[0] + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" + + res = container.exec_run(cmd=f"fdbbackup start -d {backup_path} -w") + cont = res.output.decode("utf-8") + logger.info(cont) + return True + return False diff --git a/simplyblock_core/controllers/snapshot_controller.py b/simplyblock_core/controllers/snapshot_controller.py index d3eca0e00..24576f8ed 100644 --- a/simplyblock_core/controllers/snapshot_controller.py +++ b/simplyblock_core/controllers/snapshot_controller.py @@ -3,7 +3,7 @@ import time import uuid -from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller +from simplyblock_core.controllers import lvol_controller, snapshot_events, pool_controller, tasks_controller from simplyblock_core import utils, constants from simplyblock_core.db_controller import DBController @@ -219,6 +219,10 @@ def add(lvol_id, snapshot_name): logger.info("Done") snapshot_events.snapshot_create(snap) + + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + tasks_controller.add_snap_backup_task(snap) + return snap.uuid, False @@ -587,3 +591,20 @@ def clone(snapshot_id, clone_name, new_size=0, pvc_name=None, pvc_namespace=None if new_size: lvol_controller.resize_lvol(lvol.get_id(), new_size) return lvol.uuid, False + + +def create_backup(snapshot_uuid, delete_after_finish=False): + try: + snap = db_controller.get_snapshot_by_id(snapshot_uuid) + except KeyError: + logger.error(f"Snapshot not found {snapshot_uuid}") + return False + + cluster = db_controller.get_cluster_by_id(snap.cluster_id) + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + tasks_controller.add_snap_backup_task(snap, delete_after_finish) + logger.info("Done") + return True + else: + logger.error("Cluster S3 backup is not configured") + return False diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index e8c509e72..932eb2c7b 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -9,6 +9,7 @@ from simplyblock_core.controllers import tasks_events, device_controller from simplyblock_core.models.cluster import Cluster from simplyblock_core.models.job_schedule import JobSchedule +from simplyblock_core.models.snapshot import SnapShot from simplyblock_core.models.storage_node import StorageNode logger = logging.getLogger() @@ -451,3 +452,39 @@ def get_backup_tasks(cluster_id): if task.function_name == JobSchedule.FN_FDB_BACKUP: backup_tasks.append(task) return backup_tasks + + +def add_snap_backup_task(snapshot: SnapShot, delete_after_finish: bool = False): + try: + cluster = db.get_cluster_by_id(snapshot.cluster_id) + except Exception as e: + logger.error(f"Failed to get cluster {snapshot.cluster_id}: {e}") + return False + + tasks = get_snap_backup_task(snapshot.cluster_id) + for task in tasks: + if task.status != JobSchedule.STATUS_DONE and task.function_params.get("snapshot_id") == snapshot.get_id(): + logger.info(f"Snapshot Backup task found: {task.uuid}") + return False + + task_obj = JobSchedule() + task_obj.uuid = str(uuid.uuid4()) + task_obj.cluster_id = cluster.get_id() + task_obj.date = int(time.time()) + task_obj.max_retry = constants.TASK_EXEC_RETRY_COUNT + task_obj.status = JobSchedule.STATUS_NEW + task_obj.function_name = JobSchedule.FN_SNAPSHOT_BACKUP + task_obj.function_params ={"snapshot_id": snapshot.get_id(), "delete_after_finish": delete_after_finish} + task_obj.write_to_db() + + logger.info(f"Backup task created: {task_obj.uuid}") + return task_obj.uuid + + +def get_snap_backup_task(cluster_id): + backup_tasks = [] + tasks = db.get_job_tasks(cluster_id) + for task in tasks: + if task.function_name == JobSchedule.FN_SNAPSHOT_BACKUP: + backup_tasks.append(task) + return backup_tasks diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index c2d20c223..1dd006dad 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -2,4 +2,4 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.30 SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:R25.10-Hotfix-fdb-backup -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:R25.10-Hotfix-latest +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:s3-backup-latest diff --git a/simplyblock_core/models/job_schedule.py b/simplyblock_core/models/job_schedule.py index e16424987..575ad705a 100644 --- a/simplyblock_core/models/job_schedule.py +++ b/simplyblock_core/models/job_schedule.py @@ -24,6 +24,7 @@ class JobSchedule(BaseModel): FN_JC_COMP_RESUME = "jc_comp_resume" FN_LVOL_SYNC_DEL = "lvol_sync_del" FN_FDB_BACKUP = "fdb_backup" + FN_SNAPSHOT_BACKUP = "snapshot_backup" canceled: bool = False cluster_id: str = "" diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 293a76b8d..316c2359d 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -1234,3 +1234,11 @@ def nvmf_port_unblock_rdma(self, port): def nvmf_get_blocked_ports_rdma(self): return self._request("nvmf_get_blocked_ports") + + def bdev_lvol_s3_backup(self, s3_id, cluster_batch, snapshots): + params = { + "s3_id": s3_id, + "cluster_batch": cluster_batch, + "snapshots": snapshots + } + return self._request("bdev_lvol_s3_backup", params) diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py index bc365435c..b426444b4 100644 --- a/simplyblock_core/services/tasks_runner_fdb_backup.py +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -6,13 +6,85 @@ from simplyblock_core.controllers import backup_controller, tasks_controller from simplyblock_core.models.job_schedule import JobSchedule from simplyblock_core.models.cluster import Cluster - +from simplyblock_core.models.storage_node import StorageNode logger = utils.get_logger(__name__) # get DB controller db = db_controller.DBController() + +def process_fdb_backup_task(task): + task = db.get_task_by_id(task.uuid) + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if task.retry >= task.max_retry: + task.function_result = "max retry reached, stopping task" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + ret = backup_controller.create_backup() + if ret: + task.function_result = "Backup created" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + +def process_snap_backup_task(task): + + task = db.get_task_by_id(task.uuid) + + if task.canceled: + task.function_result = "canceled" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + node = db.get_storage_node_by_id(task.node_id) + + if not node: + task.function_result = "node not found" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + return + + if node.status not in [StorageNode.STATUS_DOWN, StorageNode.STATUS_ONLINE]: + msg = f"Node is {node.status}, retry task" + logger.info(msg) + task.function_result = msg + task.status = JobSchedule.STATUS_SUSPENDED + task.write_to_db(db.kv_store) + return + + if task.status != JobSchedule.STATUS_RUNNING: + task.status = JobSchedule.STATUS_RUNNING + task.write_to_db(db.kv_store) + + snapshot_id = task.function_params["snapshot_id"] + + logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True) + if not ret: + if "code" in err and err["code"] == -19: + logger.error(f"Sync delete completed with error: {err}") + else: + logger.error( + f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + + task.function_result = f"bdev {lvol_bdev_name} deleted" + task.status = JobSchedule.STATUS_DONE + task.write_to_db(db.kv_store) + + logger.info("Starting Tasks runner fdb backup...") while True: @@ -24,30 +96,12 @@ if cl.status == Cluster.STATUS_IN_ACTIVATION: continue - tasks = tasks_controller.get_backup_tasks(cl.get_id()) + tasks = db.get_job_tasks(cl.get_id()) for task in tasks: if task.status != JobSchedule.STATUS_DONE: - task = db.get_task_by_id(task.uuid) - if task.canceled: - task.function_result = "canceled" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - if task.retry >= task.max_retry: - task.function_result = "max retry reached, stopping task" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) - continue - - if task.status != JobSchedule.STATUS_RUNNING: - task.status = JobSchedule.STATUS_RUNNING - task.write_to_db(db.kv_store) - - ret = backup_controller.create_backup() - if ret: - task.function_result = "Backup created" - task.status = JobSchedule.STATUS_DONE - task.write_to_db(db.kv_store) + if task.function_name == JobSchedule.FN_FDB_BACKUP: + process_fdb_backup_task(task) + elif task.function_name == JobSchedule.FN_SNAPSHOT_BACKUP: + process_snap_backup_task(task) time.sleep(constants.TASK_EXEC_INTERVAL_SEC) From 4e025688cbf1d7b0dada1a33d4ab01501ec4656d Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 8 Jan 2026 03:24:38 +0300 Subject: [PATCH 13/16] wip --- simplyblock_core/env_var | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/simplyblock_core/env_var b/simplyblock_core/env_var index 1dd006dad..ac68a1969 100644 --- a/simplyblock_core/env_var +++ b/simplyblock_core/env_var @@ -2,4 +2,4 @@ SIMPLY_BLOCK_COMMAND_NAME=sbcli-dev SIMPLY_BLOCK_VERSION=19.2.30 SIMPLY_BLOCK_DOCKER_IMAGE=public.ecr.aws/simply-block/simplyblock:R25.10-Hotfix-fdb-backup -SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=public.ecr.aws/simply-block/ultra:s3-backup-latest +SIMPLY_BLOCK_SPDK_ULTRA_IMAGE=simplyblock/spdk:s3-backup-latest From 17809ba36c289649a2da464f667d438b8c97a86f Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 8 Jan 2026 03:50:33 +0300 Subject: [PATCH 14/16] wip --- simplyblock_core/controllers/tasks_controller.py | 1 + 1 file changed, 1 insertion(+) diff --git a/simplyblock_core/controllers/tasks_controller.py b/simplyblock_core/controllers/tasks_controller.py index 932eb2c7b..11f568c20 100644 --- a/simplyblock_core/controllers/tasks_controller.py +++ b/simplyblock_core/controllers/tasks_controller.py @@ -440,6 +440,7 @@ def add_backup_task(cluster_id): task_obj.date = int(time.time()) task_obj.max_retry = constants.TASK_EXEC_RETRY_COUNT task_obj.status = JobSchedule.STATUS_NEW + task_obj.function_name = JobSchedule.FN_FDB_BACKUP task_obj.write_to_db() logger.info(f"Backup task created: {task_obj.uuid}") return task_obj.uuid From 42cd0f5c66cb999b857f0d9234b8172a17b3534f Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 8 Jan 2026 04:08:15 +0300 Subject: [PATCH 15/16] wip --- simplyblock_core/rpc_client.py | 6 ++--- .../services/tasks_runner_fdb_backup.py | 22 +++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/simplyblock_core/rpc_client.py b/simplyblock_core/rpc_client.py index 316c2359d..a13871bcb 100644 --- a/simplyblock_core/rpc_client.py +++ b/simplyblock_core/rpc_client.py @@ -1235,10 +1235,10 @@ def nvmf_port_unblock_rdma(self, port): def nvmf_get_blocked_ports_rdma(self): return self._request("nvmf_get_blocked_ports") - def bdev_lvol_s3_backup(self, s3_id, cluster_batch, snapshots): + def bdev_lvol_s3_backup(self, s3_id, snapshots): params = { "s3_id": s3_id, - "cluster_batch": cluster_batch, + "cluster_batch": 16, "snapshots": snapshots } - return self._request("bdev_lvol_s3_backup", params) + return self._request2("bdev_lvol_s3_backup", params) diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py index b426444b4..417ec95ef 100644 --- a/simplyblock_core/services/tasks_runner_fdb_backup.py +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -69,18 +69,22 @@ def process_snap_backup_task(task): task.status = JobSchedule.STATUS_RUNNING task.write_to_db(db.kv_store) + cluster = db.get_cluster_by_id(task.cluster_id) snapshot_id = task.function_params["snapshot_id"] - - logger.info(f"Sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") - ret, err = node.rpc_client().delete_lvol(lvol_bdev_name, del_async=True) - if not ret: - if "code" in err and err["code"] == -19: - logger.error(f"Sync delete completed with error: {err}") - else: + snapshot = db.get_snapshot_by_id(snapshot_id) + logger.info(f"backing up: {snapshot_id}") + if cluster.backup_s3_bucket and cluster.backup_s3_cred: + folder = f"backup-{snapshot_id}" + folder = folder.replace(" ", "-") + folder = folder.replace(":", "-") + folder = folder.split(".")[0] + backup_path = f"blobstore://{cluster.backup_s3_cred}@s3.{cluster.backup_s3_region}.amazonaws.com/{folder}?bucket={cluster.backup_s3_bucket}®ion={cluster.backup_s3_region}&sc=0" + ret, err = node.rpc_client().bdev_lvol_s3_backup(backup_path, snapshot.snap_bdev) + if not ret: logger.error( - f"Failed to sync delete bdev: {lvol_bdev_name} from node: {node.get_id()}") + f"Failed to backup snapshot: {snapshot_id}, error: {err}") - task.function_result = f"bdev {lvol_bdev_name} deleted" + task.function_result = f"snap backup done" task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store) From a462712520d17889c07d24d34f87caa3345a92cb Mon Sep 17 00:00:00 2001 From: hamdykhader Date: Thu, 8 Jan 2026 04:39:42 +0300 Subject: [PATCH 16/16] wip --- simplyblock_core/services/cap_monitor.py | 2 +- simplyblock_core/services/tasks_runner_fdb_backup.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/simplyblock_core/services/cap_monitor.py b/simplyblock_core/services/cap_monitor.py index b9632a0fb..61a4beea9 100644 --- a/simplyblock_core/services/cap_monitor.py +++ b/simplyblock_core/services/cap_monitor.py @@ -21,7 +21,7 @@ def create_fdb_backup_if_needed(cluster): if tasks: last_backup_task = tasks[0] if last_backup_task and last_backup_task.status == JobSchedule.STATUS_DONE: - if last_backup_task.date + cluster.backup_frequency_seconds > time.time(): + if last_backup_task.date + cluster.backup_frequency_seconds < time.time(): tasks_controller.add_backup_task(cluster.get_id()) diff --git a/simplyblock_core/services/tasks_runner_fdb_backup.py b/simplyblock_core/services/tasks_runner_fdb_backup.py index 417ec95ef..6806682e3 100644 --- a/simplyblock_core/services/tasks_runner_fdb_backup.py +++ b/simplyblock_core/services/tasks_runner_fdb_backup.py @@ -84,7 +84,10 @@ def process_snap_backup_task(task): logger.error( f"Failed to backup snapshot: {snapshot_id}, error: {err}") - task.function_result = f"snap backup done" + task.function_result = f"snap backup error" + else: + task.function_result = f"snap backup done" + task.status = JobSchedule.STATUS_DONE task.write_to_db(db.kv_store)