diff --git a/.codecov.yml b/.codecov.yml deleted file mode 100644 index 046261603d..0000000000 --- a/.codecov.yml +++ /dev/null @@ -1,2 +0,0 @@ -ignore: - - "**/tensorflow/tensorflow_serving/.*" diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 35f9af99dd..0000000000 --- a/.coveragerc +++ /dev/null @@ -1,5 +0,0 @@ -[run] -concurrency = threading -omit = sagemaker/tests/* -timid = True -disable_warnings = module-not-measured diff --git a/.dictionary b/.dictionary deleted file mode 100644 index 8907f4ff9b..0000000000 --- a/.dictionary +++ /dev/null @@ -1,38 +0,0 @@ -args -arn -autoscaling -aws -bool -boolean -boto -botocore -clienterror -cloudwatch -cron -config -dataset -datasets -datetime -desc -docstring -entrypoint -env -iam -hyperparameter -hyperparameters -jupyter -kms -kwargs -neo -noqa -rc -runtime -sagemaker -stdout -str -subdirectories -subnet -subnets -unexpectedstatusexception -uri -vpc diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 53e43383ac..0000000000 --- a/.flake8 +++ /dev/null @@ -1,6 +0,0 @@ -[flake8] -application_import_names = sagemaker, tests -import-order-style = google -per-file-ignores = - tests/unit/test_tuner.py: F405 - src/sagemaker/config/config_schema.py: E501 diff --git a/.githooks/pre-push b/.githooks/pre-push deleted file mode 100755 index 995ab70108..0000000000 --- a/.githooks/pre-push +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/sh -# this pre-push hook runs style checks and unit tests in python 3.8, 3.9, and 3.10 using tox. - -set -e - -TOX_PARALLEL_NO_SPINNER=1, -PY_COLORS=0 -start_time=`date +%s` -tox -e flake8,pylint,docstyle,black-check,twine --parallel all -./ci-scripts/displaytime.sh 'flake8,pylint,docstyle,black-check,twine' $start_time -start_time=`date +%s` -tox -e sphinx,doc8 --parallel all -./ci-scripts/displaytime.sh 'sphinx,doc8' $start_time -start_time=`date +%s` -tox -e py38,py39,py310 --parallel all -- tests/unit -./ci-scripts/displaytime.sh 'py38,py39,py310 unit' $start_time diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 048133d265..746d95a2aa 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,6 +7,10 @@ assignees: '' --- +**PySDK Version** +- [ ] PySDK V2 (2.x) +- [ ] PySDK V3 (3.x) + **Describe the bug** A clear and concise description of what the bug is. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index e659c40513..0000000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,27 +0,0 @@ -*Issue #, if available:* - -*Description of changes:* - -*Testing done:* - -## Merge Checklist - -_Put an `x` in the boxes that apply. You can also fill these out after creating the PR. If you're unsure about any of them, don't hesitate to ask. We're here to help! This is simply a reminder of what we are going to look for before merging your pull request._ - -#### General - -- [ ] I have read the [CONTRIBUTING](https://github.com/aws/sagemaker-python-sdk/blob/master/CONTRIBUTING.md) doc -- [ ] I certify that the changes I am introducing will be backward compatible, and I have discussed concerns about this, if any, with the Python SDK team -- [ ] I used the commit message format described in [CONTRIBUTING](https://github.com/aws/sagemaker-python-sdk/blob/master/CONTRIBUTING.md#committing-your-change) -- [ ] I have passed the region in to all S3 and STS clients that I've initialized as part of this change. 
-- [ ] I have updated any necessary documentation, including [READMEs](https://github.com/aws/sagemaker-python-sdk/blob/master/README.rst) and [API docs](https://github.com/aws/sagemaker-python-sdk/tree/master/doc) (if appropriate) - -#### Tests - -- [ ] I have added tests that prove my fix is effective or that my feature works (if appropriate) -- [ ] I have added unit and/or integration tests as appropriate to ensure backward compatibility of the changes -- [ ] I have checked that my tests are not configured for a specific region or account (if appropriate) -- [ ] I have used [`unique_name_from_base`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/utils.py#L77) to create resource names in integ tests (if appropriate) -- [ ] If adding any dependency in requirements.txt files, I have spell checked and ensured they exist in PyPi - -By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. diff --git a/.github/workflows/ci-health.yml b/.github/workflows/ci-health.yml new file mode 100644 index 0000000000..db94ce084a --- /dev/null +++ b/.github/workflows/ci-health.yml @@ -0,0 +1,38 @@ +name: CI Health +on: + schedule: + - cron: "0 */3 * * *" + workflow_dispatch: + +permissions: + id-token: write # This is required for requesting the JWT + +jobs: + canaries-v3: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Canaries V3 + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-canaries-v3 + source-version: refs/heads/master + canaries-v2: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Canaries V2 + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: sagemaker-python-sdk-ci-health-canaries-v2 + source-version: refs/heads/master-v2 diff --git a/.github/workflows/codebuild-ci-health.yml b/.github/workflows/codebuild-ci-health.yml deleted file mode 100644 index 7ecefd310f..0000000000 --- a/.github/workflows/codebuild-ci-health.yml +++ /dev/null @@ -1,84 +0,0 @@ -name: CI Health -on: - schedule: - - cron: "0 */3 * * *" - workflow_dispatch: - -permissions: - id-token: write # This is required for requesting the JWT - -jobs: - codestyle-doc-tests: - runs-on: ubuntu-latest - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Codestyle & Doc Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: sagemaker-python-sdk-ci-health-codestyle-doc-tests - unit-tests: - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python-version: ["py38", "py39", "py310", "py311"] - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Unit Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: sagemaker-python-sdk-ci-health-unit-tests - env-vars-for-codebuild: | - PY_VERSION - env: - PY_VERSION: ${{ 
matrix.python-version }} - integ-tests: - runs-on: ubuntu-latest - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Integ Tests - uses: aws-actions/aws-codebuild-run-build@v1 - id: codebuild - with: - project-name: sagemaker-python-sdk-ci-health-integ-tests - slow-tests: - runs-on: ubuntu-latest - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Slow Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: sagemaker-python-sdk-ci-health-slow-tests - localmode-tests: - runs-on: ubuntu-latest - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Local Mode Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: sagemaker-python-sdk-ci-health-localmode-tests \ No newline at end of file diff --git a/.github/workflows/codebuild-ci.yml b/.github/workflows/codebuild-ci.yml deleted file mode 100644 index 8c6bd6b337..0000000000 --- a/.github/workflows/codebuild-ci.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: PR Checks -on: - pull_request_target: - branches: - - "master*" - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} - cancel-in-progress: true - -permissions: - id-token: write # This is required for requesting the JWT - -jobs: - collab-check: - runs-on: ubuntu-latest - outputs: - approval-env: ${{ steps.collab-check.outputs.result }} - steps: - - name: Collaborator Check - uses: actions/github-script@v7 - id: collab-check - with: - github-token: ${{ secrets.COLLAB_CHECK_TOKEN }} - result-encoding: string - script: | - try { - const res = await github.rest.repos.checkCollaborator({ - owner: context.repo.owner, - repo: context.repo.repo, - username: "${{ github.event.pull_request.user.login }}", - }); - console.log("Verifed ${{ github.event.pull_request.user.login }} is a repo collaborator. Auto Approving PR Checks.") - return res.status == "204" ? "auto-approve" : "manual-approval" - } catch (error) { - console.log("${{ github.event.pull_request.user.login }} is not a collaborator. Requiring Manual Approval to run PR Checks.") - return "manual-approval" - } - wait-for-approval: - runs-on: ubuntu-latest - needs: [collab-check] - environment: ${{ needs.collab-check.outputs.approval-env }} - steps: - - run: echo "Workflow Approved! Starting PR Checks." 
- codestyle-doc-tests: - runs-on: ubuntu-latest - needs: [wait-for-approval] - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Codestyle & Doc Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: ${{ github.event.repository.name }}-ci-codestyle-doc-tests - source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' - unit-tests: - runs-on: ubuntu-latest - needs: [wait-for-approval] - strategy: - fail-fast: false - matrix: - python-version: ["py38","py39","py310","py311"] - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Unit Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: ${{ github.event.repository.name }}-ci-unit-tests - source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' - env-vars-for-codebuild: | - PY_VERSION - env: - PY_VERSION: ${{ matrix.python-version }} - integ-tests: - runs-on: ubuntu-latest - needs: [wait-for-approval] - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} - aws-region: us-west-2 - role-duration-seconds: 10800 - - name: Run Integ Tests - uses: aws-actions/aws-codebuild-run-build@v1 - with: - project-name: ${{ github.event.repository.name }}-ci-integ-tests - source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 8fbf42803b..c6df60d630 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -32,4 +32,4 @@ jobs: - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@4b1d7da102ff94aca014c0245062b1a463356d72 with: - category: "/language:${{matrix.language}}" + category: "/language:${{matrix.language}}" \ No newline at end of file diff --git a/.github/workflows/pr-checks-master-v2.yml b/.github/workflows/pr-checks-master-v2.yml new file mode 100644 index 0000000000..2004af5ea5 --- /dev/null +++ b/.github/workflows/pr-checks-master-v2.yml @@ -0,0 +1,97 @@ +name: Sagemaker PR Checks (Master-v2) +on: + pull_request_target: + branches: + - "master-v2" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} + cancel-in-progress: true + +permissions: + id-token: write + +jobs: + collab-check: + runs-on: ubuntu-latest + outputs: + approval-env: ${{ steps.collab-check.outputs.result }} + steps: + - name: Collaborator Check + uses: actions/github-script@v7 + id: collab-check + with: + github-token: ${{ secrets.COLLAB_CHECK_TOKEN }} + result-encoding: string + script: | + try { + const res = await github.rest.repos.checkCollaborator({ + owner: context.repo.owner, + repo: context.repo.repo, + username: "${{ github.event.pull_request.user.login }}", + }); + console.log("Verifed ${{ github.event.pull_request.user.login }} is a repo collaborator. Auto Approving PR Checks.") + return res.status == "204" ? 
"auto-approve" : "manual-approval" + } catch (error) { + console.log("${{ github.event.pull_request.user.login }} is not a collaborator. Requiring Manual Approval to run PR Checks.") + return "manual-approval" + } + wait-for-approval: + runs-on: ubuntu-latest + needs: [collab-check] + environment: ${{ needs.collab-check.outputs.approval-env }} + steps: + - run: echo "Workflow Approved! Starting PR Checks." + codestyle-doc-tests: + runs-on: ubuntu-latest + needs: [wait-for-approval] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Codestyle & Doc Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-codestyle-doc-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + unit-tests: + runs-on: ubuntu-latest + needs: [wait-for-approval] + strategy: + fail-fast: false + matrix: + python-version: ["py39","py310","py311","py312"] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Unit Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-unit-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + env-vars-for-codebuild: | + PY_VERSION + env: + PY_VERSION: ${{ matrix.python-version }} + integ-tests: + runs-on: ubuntu-latest + needs: [wait-for-approval] + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + - name: Run Integ Tests + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-integ-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' diff --git a/.github/workflows/pr-checks-master.yml b/.github/workflows/pr-checks-master.yml new file mode 100644 index 0000000000..4f63ad0b9a --- /dev/null +++ b/.github/workflows/pr-checks-master.yml @@ -0,0 +1,217 @@ +name: Sagemaker PR Checks (Master) +on: + pull_request_target: + branches: + - "master" + paths: + - 'sagemaker-train/**' + - 'sagemaker-serve/**' + - 'sagemaker-mlops/**' + - 'sagemaker-core/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.head_ref }} + cancel-in-progress: true + +permissions: + id-token: write + +jobs: + collab-check: + runs-on: ubuntu-latest + outputs: + approval-env: ${{ steps.collab-check.outputs.result }} + steps: + - name: Collaborator Check + uses: actions/github-script@v7 + id: collab-check + with: + github-token: ${{ secrets.COLLAB_CHECK_TOKEN }} + result-encoding: string + script: | + try { + const res = await github.rest.repos.checkCollaborator({ + owner: context.repo.owner, + repo: context.repo.repo, + username: "${{ github.event.pull_request.user.login }}", + }); + console.log("Verifed ${{ github.event.pull_request.user.login }} is a repo collaborator. Auto Approving PR Checks.") + return res.status == "204" ? 
"auto-approve" : "manual-approval" + } catch (error) { + console.log("${{ github.event.pull_request.user.login }} is not a collaborator. Requiring Manual Approval to run PR Checks.") + return "manual-approval" + } + wait-for-approval: + runs-on: ubuntu-latest + needs: [ collab-check ] + environment: ${{ needs.collab-check.outputs.approval-env }} + steps: + - run: echo "Workflow Approved! Starting PR Checks." + detect-changes: + runs-on: ubuntu-latest + needs: [wait-for-approval] + outputs: + submodules: ${{ steps.check-changes.outputs.submodules }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.GH_PAT }} + ref: ${{ github.event.pull_request.base.ref }} + - name: Detect Changes + id: check-changes + run: | + set -e + + echo "Target Branch: ${{ github.event.pull_request.base.ref }}" + echo "Current Target SHA: $(git rev-parse HEAD)" + echo "PR Number: ${{ github.event.pull_request.number }}" + echo "PR Latest SHA: ${{ github.event.pull_request.head.sha }}" + + git fetch origin pull/${{ github.event.pull_request.number }}/head + CHANGES=$(git diff --name-only HEAD FETCH_HEAD) + + echo "Changed files:" + echo "$CHANGES" + + # Function to extract dependencies from pyproject.toml + get_dependencies() { + local module=$1 + grep "sagemaker-" "$module/pyproject.toml" | grep -o 'sagemaker-[a-z]*' | sort -u + } + + # Function to find all modules that depend on a given module (recursively) + find_dependents() { + local target=$1 + local all_modules=("sagemaker-core" "sagemaker-train" "sagemaker-serve" "sagemaker-mlops") + local dependents=() + + for module in "${all_modules[@]}"; do + if [ "$module" != "$target" ]; then + if get_dependencies "$module" | grep -q "^$target$"; then + dependents+=("$module") + fi + fi + done + + echo "${dependents[@]}" + } + + # Initialize set of submodules to test (using associative array) + declare -A SUBMODULES_SET + + # Function to recursively add module and all its dependents + add_module_and_dependents() { + local module=$1 + + if [ -z "${SUBMODULES_SET[$module]}" ]; then + SUBMODULES_SET["$module"]=1 + echo "Adding $module to test set" + + # Find all modules that depend on this one and add them recursively + local dependents=$(find_dependents "$module") + for dependent in $dependents; do + add_module_and_dependents "$dependent" + done + fi + } + + # Check which submodules changed and add them plus their dependents + if echo "$CHANGES" | grep -q "^sagemaker-core/"; then + echo "sagemaker-core changed - will add core and all dependents" + add_module_and_dependents "sagemaker-core" + fi + + if echo "$CHANGES" | grep -q "^sagemaker-train/"; then + echo "sagemaker-train changed - will add train and all dependents" + add_module_and_dependents "sagemaker-train" + fi + + if echo "$CHANGES" | grep -q "^sagemaker-serve/"; then + echo "sagemaker-serve changed - will add serve and all dependents" + add_module_and_dependents "sagemaker-serve" + fi + + if echo "$CHANGES" | grep -q "^sagemaker-mlops/"; then + echo "sagemaker-mlops changed - will add mlops" + add_module_and_dependents "sagemaker-mlops" + fi + + # Convert associative array to JSON array + SUBMODULES='[]' + for submodule in "${!SUBMODULES_SET[@]}"; do + if [ "$SUBMODULES" = '[]' ]; then + SUBMODULES="[\"$submodule\"]" + else + SUBMODULES=$(echo $SUBMODULES | sed "s/\]$/,\"$submodule\"\]/") + fi + done + + echo "Final SUBMODULES: $SUBMODULES" + echo "submodules=$SUBMODULES" >> $GITHUB_OUTPUT + + codestyle-doc-tests: + runs-on: ubuntu-latest + needs: [detect-changes] + if: 
needs.detect-changes.outputs.submodules != '[]' + strategy: + fail-fast: false + matrix: + submodule: ${{ fromJson(needs.detect-changes.outputs.submodules) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run CodeBuild for ${{ matrix.submodule }} + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-codestyle-doc-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + + unit-tests: + runs-on: ubuntu-latest + needs: [detect-changes] + if: needs.detect-changes.outputs.submodules != '[]' + strategy: + fail-fast: false + matrix: + submodule: ${{ fromJson(needs.detect-changes.outputs.submodules) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run Unit Tests for ${{ matrix.submodule }} + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-unit-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' + + integ-tests: + runs-on: ubuntu-latest + needs: [detect-changes] + if: needs.detect-changes.outputs.submodules != '[]' + strategy: + fail-fast: false + matrix: + submodule: ${{ fromJson(needs.detect-changes.outputs.submodules) }} + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }} + aws-region: us-west-2 + role-duration-seconds: 10800 + + - name: Run Integ Tests for ${{ matrix.submodule }} + uses: aws-actions/aws-codebuild-run-build@v1 + with: + project-name: ${{ github.event.repository.name }}-ci-${{ matrix.submodule }}-integ-tests + source-version-override: 'refs/pull/${{ github.event.pull_request.number }}/head^{${{ github.event.pull_request.head.sha }}}' diff --git a/.github/workflows/security-monitoring.yml b/.github/workflows/security-monitoring.yml index ecce0643e6..8e44b426c2 100644 --- a/.github/workflows/security-monitoring.yml +++ b/.github/workflows/security-monitoring.yml @@ -118,4 +118,4 @@ jobs: aws cloudwatch put-metric-data --metric-name SecretScanningAlert --namespace SecurityMonitoringMetrics --value 1 --unit Count --dimensions ProjectName=sagemaker-python-sdk else aws cloudwatch put-metric-data --metric-name SecretScanningAlert --namespace SecurityMonitoringMetrics --value 0 --unit Count --dimensions ProjectName=sagemaker-python-sdk - fi + fi \ No newline at end of file diff --git a/.gitignore b/.gitignore index fc07847fba..09935a1dc9 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,7 @@ dist/ **/*.pyc **.pyc scratch*.py +scratch/ .eggs *.egg examples/tensorflow/distributed_mnist/data @@ -32,9 +33,6 @@ env/ .python-version *.html **/_repack_script_launcher.sh -src/sagemaker/modules/train/container_drivers/sm_train.sh -src/sagemaker/modules/train/container_drivers/sourcecode.json -src/sagemaker/modules/train/container_drivers/distributed.json -tests/data/**/_repack_model.py -tests/data/experiment/sagemaker-dev-1.0.tar.gz -src/sagemaker/serve/tmp_workspace \ No newline at end of file 
+sagemaker_train/src/**/container_drivers/sm_train.sh +sagemaker_train/src/**/container_drivers/sourcecode.json +sagemaker_train/src/**/container_drivers/distributed.json diff --git a/.pydocstylerc b/.pydocstylerc deleted file mode 100644 index 9ed879a760..0000000000 --- a/.pydocstylerc +++ /dev/null @@ -1,5 +0,0 @@ -[pydocstyle] -inherit = false -ignore = D104,D107,D202,D203,D213,D214,D400,D401,D404,D406,D407,D411,D413,D414,D415,D417 -match = (?!record_pb2).*\.py -match-dir = (?!.*test).* \ No newline at end of file diff --git a/.pylintrc b/.pylintrc index 5428b86be0..223580f4d3 100644 --- a/.pylintrc +++ b/.pylintrc @@ -94,7 +94,24 @@ disable= useless-object-inheritance, # TODO: Enable this check and fix code once Python 2 is no longer supported. super-with-arguments, raise-missing-from, - E1136, + C0116, # Missing function or method docstring + C0209, # Use f-string instead of format + E0015, # Unrecognized option found in config + E0702, # Raising a string instead of an exception + E1101, # Module has no member (likely dynamic attr) + E1136, # Value assigned to something inferred as None + R0022, # Useless option value in config + R1710, # Inconsistent return statements + R1714, # Consider using `in` with comparisons + R1729, # Use a generator + R1732, + R1735, # Consider using a dict or list literal + W0237, # Argument renamed in override + W0613, # Unused argument + W0621, # Redefining name from outer scope + W0719 + W1404, # Implicit string concatenation + W1514, # `open()` used without encoding [REPORTS] # Set the output format. Available formats are text, parseable, colorized, msvs @@ -436,4 +453,4 @@ analyse-fallback-blocks=no # Exceptions that will emit a warning when being caught. Defaults to # "Exception" -overgeneral-exceptions=Exception +overgeneral-exceptions=builtins.Exception diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index 0a6e3928b5..0000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# ReadTheDocs environment customization to allow us to use conda to install -# libraries which have C dependencies for the doc build. See: -# https://docs.readthedocs.io/en/latest/config-file/v2.html - -version: 2 - -build: - os: ubuntu-20.04 - tools: - python: "3.9" - - -python: - install: - - method: pip - path: . 
- - requirements: doc/requirements.txt - - -sphinx: - configuration: doc/conf.py - fail_on_warning: true # http://www.sphinx-doc.org/en/master/man/sphinx-build.html#id6 diff --git a/CHANGELOG.md b/CHANGELOG.md index e68653ce0d..1ef97b034b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,473 @@ # Changelog +## v3.3.0 (2025-12-19) + +### Features + * AWS_Batch: queueing of training jobs with ModelTrainer +### Bug fixes and Other Changes + * Fixes for model registry with ModelBuilder + +## v3.2.0 (2025-12-18) + +### Features + * Evaluator handshake with trainer + * Datasets Format validation +### Bug fixes and Other Changes + * Add xgboost 3.0-5 to release + * Fix get_child_process_ids parsing issue + +## v3.1.1 (2025-12-10) + +### Bug fixes and Other Changes +* Add validation to bedrock reward models +* Hyperparameter issue fixes, Add validation s3 output path +* Fix the recipe selection for multiple recipe scenario +* Train wait() timeout exception handling +* Update example notebooks to reflect recent code changes +* Update `model_package_group_name` param to `model_package_group` in finetuning interfaces +* remove `dataset` param for benchmark evaluator + +## v3.1.0 (2025-12-03) + +### Features + +* Fine-tuning SDK: SFT, RLVR, and RLAIF techniques with standardized parameter design +* AIRegistry Integration: Added CRUD operations for datasets and evaluators +* Enhanced Training Experience: Implemented MLFlow metrics tracking and deployment workflows + +## v3.0.1 (2025-11-19) + +* Update project dependencies to include submodules: sagemaker-core, sagemaker-train, sagemaker-serve, sagemaker-mlops + +## v3.0.0 (2025-11-19) + +### Major Version Release + +#### ⚠️ Breaking Changes + +#### Important: Please review these breaking changes before upgrading. + +* Version 3.0.0 represents a significant milestone in our product's evolution. This major release introduces a modernized architecture, enhanced performance, and powerful new features while maintaining our commitment to user experience and reliability. +* Older interfaces such as Estimator, Model, Predictor and all their subclasses will not be supported in V3. +* Please review documentation of interfaces for parameters support (especially ModelBuilder) + +## v2.254.1 (2025-10-31) + +### Bug Fixes and Other Changes + + * update get_execution_role to directly return the ExecutionRoleArn if it presents in the resource metadata file + * [hf] HF PT Training DLCs + +## v2.254.0 (2025-10-29) + +### Features + + * Triton v25.09 DLC + +### Bug Fixes and Other Changes + + * Add Numpy 2.0 support + * add HF Optimum Neuron DLCs + * [Hugging Face][Pytorch] Inference DLC 4.51.3 + * [hf] HF Inference TGI + +## v2.253.1 (2025-10-14) + +### Bug Fixes and Other Changes + + * Update instance type regex to also include hyphens + * Revert the change "Add Numpy 2.0 support" + * [hf-tei] add image uri to utils + * add TEI 1.8.2 + +## v2.253.0 (2025-10-10) + +### Features + + * Added condition to allow eval recipe. 
+ * add model_type hyperparameter support for Nova recipes + +### Bug Fixes and Other Changes + + * Fix for a failed slow test: numpy fix + * Add numpy 2.0 support + * chore: domain support for eu-isoe-west-1 + * Adding default identity implementations to InferenceSpec + * djl regions fixes #5273 + * Fix flaky integ test + +## v2.252.0 (2025-09-29) + +### Features + + * change S3 endpoint env name + * add eval custom lambda arn to hyperparameters + +### Bug Fixes and Other Changes + + * merge rba without the iso region changes + * handle trial component status message longer than API supports + * Add nova custom lambda in hyperparameter from estimator + * add retryable option to emr step in SageMaker Pipelines + * Feature/js mlops telemetry + * latest tgi + +## v2.251.1 (2025-08-29) + +### Bug Fixes and Other Changes + + * chore: onboard tei 1.8.0 + +## v2.251.0 (2025-08-21) + +### Features + + * support pipeline versioning + +### Bug Fixes and Other Changes + + * GPT OSS Hotfix + * dockerfile stuck on interactive shell + * add sleep for model deployment + +## v2.250.0 (2025-08-08) + +### Features + + * Add support for InstancePlacementConfig in Estimator for training jobs running on ultraserver capacity + +### Bug Fixes and Other Changes + + * Add more constraints to test requirements + +## v2.249.0 (2025-07-31) + +### Features + + * AWS Batch for SageMaker Training jobs + +### Bug Fixes and Other Changes + + * Directly use customer-provided endpoint name for ModelBuilder deployment. + * update image_uri_configs 07-23-2025 07:18:25 PST + +## v2.248.2 (2025-07-22) + +### Bug Fixes and Other Changes + + * Relax boto3 version requirement + * update image_uri_configs 07-22-2025 07:18:25 PST + * update image_uri_configs 07-18-2025 07:18:28 PST + * add hard dependency on sagemaker-core pypi lib + * When rootlessDocker is enabled, return a fixed SageMaker IP + +## v2.248.1 (2025-07-16) + +### Bug Fixes and Other Changes + + * Nova training support + +## v2.248.0 (2025-07-15) + +### Features + + * integrate amtviz for visualization of tuning jobs + +### Bug Fixes and Other Changes + + * build(deps): bump requests in /tests/data/serve_resources/mlflow/pytorch + * build(deps): bump protobuf from 4.25.5 to 4.25.8 in /requirements/extras + * build(deps): bump mlflow in /tests/data/serve_resources/mlflow/xgboost + * build(deps): bump torch in /tests/data/modules/script_mode + * sanitize git clone repo input url + * Adding Hyperpod feature to enable hyperpod telemetry + * Adding Hyperpod feature to enable hyperpod telemetry + * Bump SMD version to enable custom workflow deployment. 
+ * Update TF DLC python version to py312 + * update image_uri_configs 07-04-2025 07:18:27 PST + * update image_uri_configs 06-26-2025 07:18:35 PST + * relax protobuf to <6.32 + +## v2.247.1 (2025-06-23) + +### Bug Fixes and Other Changes + + * update image_uri_configs 06-19-2025 07:18:34 PST + +## v2.247.0 (2025-06-13) + +### Features + + * Add support for MetricDefinitions in ModelTrainer + +### Bug Fixes and Other Changes + + * update jumpstart region_config, update image_uri_configs 06-12-2025 07:18:12 PST + * Add ignore_patterns in ModelTrainer to ignore specific files/folders + * Allow import failure for internal _hashlib module + +## v2.246.0 (2025-06-04) + +### Features + + * Triton v25.04 DLC + +### Bug Fixes and Other Changes + + * Update Attrs version to widen support + * update estimator documentation regarding hyperparameters for source_dir + +## v2.245.0 (2025-05-28) + +### Features + + * Correct mypy type checking through PEP 561 + +### Bug Fixes and Other Changes + + * MLFLow update for dependabot + * addWaiterTimeoutHandling + * merge method inputs with class inputs + * update image_uri_configs 05-20-2025 07:18:17 PST + +## v2.244.2 (2025-05-19) + +### Bug Fixes and Other Changes + + * include model channel for gated uncompressed models + * clarify model monitor one time schedule bug + * update jumpstart region_config 05-15-2025 07:18:15 PST + * update image_uri_configs 05-14-2025 07:18:16 PST + * Add image configs and region config for TPE (ap-east-2) + * Improve defaults handling in ModelTrainer + +## v2.244.1 (2025-05-15) + +### Bug Fixes and Other Changes + + * Fix Flask-Limiter version + * Fix test_huggingface_tei_uris() + * huggingface-llm-neuronx dlc + * huggingface-neuronx dlc image_uri + * huggingface-tei dlc image_uri + * Fix test_deploy_with_update_endpoint() + * add AG v1.3 + * parameter mismatch in update_endpoint + * remove --strip-component for untar source tar.gz + * Fix type annotations + * chore: Allow omegaconf >=2.2,<3 + * honor json serialization of HPs + * Map llama models to correct script + * pin test dependency + * fix bad initialization script error message + * Improve error logging and documentation for issue 4007 + * build(deps): bump scikit-learn + * build(deps): bump mlflow + * build(deps): bump mlflow in /tests/data/serve_resources/mlflow/pytorch + * chore: Add tei 1.6.0 image + +## v2.244.0 (2025-05-02) + +### Features + + * support custom workflow deployment in ModelBuilder using SMD image. 
+ +### Bug Fixes and Other Changes + + * Add Owner ID check for bucket with path when prefix is provided + * Add model server timeout + * pin mamba version to 24.11.3-2 to avoid inconsistent test runs + * Update ModelTrainer to support s3 uri and tar.gz file as source_dir + * chore: add huggingface images + +## v2.243.3 (2025-04-23) + +### Bug Fixes and Other Changes + + * update readme to reflect py312 upgrade + * Revert the PR changes 5122 + * Py312 upgrade step 2: Update dependencies, integ tests and unit tests + * update pr test to deprecate py38 and add py312 + * update image_uri_configs 04-16-2025 07:18:18 PST + * update image_uri_configs 04-15-2025 07:18:10 PST + * update image_uri_configs 04-11-2025 07:18:19 PST + +## v2.243.2 (2025-04-16) + +### Bug Fixes and Other Changes + + * tgi image uri unit tests + * Fix deepdiff dependencies + +## v2.243.1 (2025-04-11) + +### Bug Fixes and Other Changes + + * Added handler for pipeline variable while creating process job + * Fix issue #4856 by copying environment variables + * remove historical job_name caching which causes long job name + * Update instance gpu info + * Master + * Add mlflow tracking arn telemetry + * chore: fix semantic versioning for wildcard identifier + * flaky test + +### Documentation Changes + + * update pipelines step caching examples to include more steps + * update ModelStep data dependency info + +## v2.243.0 (2025-03-27) + +### Features + + * Enabled update_endpoint through model_builder + +### Bug Fixes and Other Changes + + * Update for PT 2.5.1, SMP 2.8.0 + * chore: move jumpstart region definitions to json file + * fix flaky clarify model monitor test + * fix flaky spark processor integ + * use temp file in unit tests + * Update transformers version + * Aligned disable_output_compression for @remote with Estimator + * Update Jinja version + * update image_uri_configs 03-26-2025 07:18:16 PST + * chore: fix integ tests to use latest version of model + * update image_uri_configs 03-25-2025 07:18:13 PST + * Skip tests failed due to deprecated instance type + * update image_uri_configs 03-21-2025 07:17:55 PST + * factor in set instance type when building JumpStart models in ModelBuilder. 
+ * ADD Documentation to ReadtheDocs for Upgrading torch versions + * add new regions to JUMPSTART_LAUNCHED_REGIONS + +## v2.242.0 (2025-03-14) + +### Features + + * add integ tests for training JumpStart models in private hub + +### Bug Fixes and Other Changes + + * Torch upgrade + * Prevent RunContext overlap between test_run tests + * remove s3 output location requirement from hub class init + * Fixing Pytorch training python version in tests + * update image_uri_configs 03-11-2025 07:18:09 PST + * resolve infinite loop in _find_config on Windows systems + * pipeline definition function doc update + +## v2.241.0 (2025-03-06) + +### Features + + * Make DistributedConfig Extensible + * support training for JumpStart model references as part of Curated Hub Phase 2 + * Allow ModelTrainer to accept hyperparameters file + +### Bug Fixes and Other Changes + + * Skip tests with deprecated instance type + * Ensure Model.is_repack() returns a boolean + * Fix error when there is no session to call _create_model_request() + * Use sagemaker session's s3_resource in download_folder + * Added check for the presence of model package group before creating one + * Fix key error in _send_metrics() + +## v2.240.0 (2025-02-25) + +### Features + + * Add support for TGI Neuronx 0.0.27 and HF PT 2.3.0 image in PySDK + +### Bug Fixes and Other Changes + + * Remove main function entrypoint in ModelBuilder dependency manager. + * forbid extras in Configs + * altconfig hubcontent and reenable integ test + * Merge branch 'master-rba' into local_merge + * py_version doc fixes + * Add backward compatibility for RecordSerializer and RecordDeserializer + * update image_uri_configs 02-21-2025 06:18:10 PST + * update image_uri_configs 02-20-2025 06:18:08 PST + +### Documentation Changes + + * Removed a line about python version requirements of training script which can misguide users.
+ +## v2.239.3 (2025-02-19) + +### Bug Fixes and Other Changes + + * added ap-southeast-7 and mx-central-1 for Jumpstart + * update image_uri_configs 02-19-2025 06:18:15 PST + +## v2.239.2 (2025-02-18) + +### Bug Fixes and Other Changes + + * Add warning about not supporting torch.nn.SyncBatchNorm + * pass in inference_ami_version to model_based endpoint type + * Fix hyperparameter strategy docs + * Add framework_version to all TensorFlowModel examples + * Move RecordSerializer and RecordDeserializer to sagemaker.serializers and sagemaker.deserialzers + +## v2.239.1 (2025-02-14) + +### Bug Fixes and Other Changes + + * keep sagemaker_session from being overridden to None + * Fix all type hint and docstrings for callable + * Fix the workshop link for Step Functions + * Fix Tensorflow doc link + * Fix FeatureGroup docstring + * Add type hint for ProcessingOutput + * Fix sourcedir.tar.gz filenames in docstrings + * Fix documentation for local mode + * bug in get latest version was getting the max sorted alphabetically + * Add cleanup logic to model builder integ tests for endpoints + * Fixed pagination failing while listing collections + * fix ValueError when updating a data quality monitoring schedule + * Add docstring for image_uris.retrieve + * Create GitHub action to trigger canaries + * update image_uri_configs 02-04-2025 06:18:00 PST + +## v2.239.0 (2025-02-01) + +### Features + + * Add support for deepseek recipes + +### Bug Fixes and Other Changes + + * mpirun protocol - distributed training with @remote decorator + * Allow telemetry only in supported regions + * Fix ssh host policy + +## v2.238.0 (2025-01-29) + +### Features + + * use jumpstart deployment config image as default optimization image + +### Bug Fixes and Other Changes + + * chore: add new images for HF TGI + * update image_uri_configs 01-29-2025 06:18:08 PST + * skip TF tests for unsupported versions + * Merge branch 'master-rba' into local_merge + * Add missing attributes to local resourceconfig + * update image_uri_configs 01-27-2025 06:18:13 PST + * update image_uri_configs 01-24-2025 06:18:11 PST + * add missing schema definition in docs + * Omegaconf upgrade + * SageMaker @remote function: Added multi-node functionality + * remove option + * fix typo + * fix tests + * Add an option for user to remove inputs and container artifacts when using local model trainer + ## v2.237.3 (2025-01-09) ### Bug Fixes and Other Changes diff --git a/CODEOWNERS b/CODEOWNERS deleted file mode 100644 index 7f7ac28644..0000000000 --- a/CODEOWNERS +++ /dev/null @@ -1 +0,0 @@ -* @aws/sagemaker-ml-frameworks diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index 3b64466870..0000000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,4 +0,0 @@ -## Code of Conduct -This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). -For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact -opensource-codeofconduct@amazon.com with any additional questions or comments. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 24226af4ee..6a78a25c21 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,7 @@ information to effectively respond to your bug report or contribution. 
* [Run the Unit Tests](#run-the-unit-tests) * [Run the Integration Tests](#run-the-integration-tests) * [Make and Test Your Change](#make-and-test-your-change) + * [Lint Your Change](#lint-your-change) * [Commit Your Change](#commit-your-change) * [Send a Pull Request](#send-a-pull-request) * [Documentation Guidelines](#documentation-guidelines) @@ -61,6 +62,10 @@ Before sending us a pull request, please ensure that: 1. Follow the instructions at [Modifying an EBS Volume Using Elastic Volumes (Console)](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/requesting-ebs-volume-modifications.html#modify-ebs-volume) to increase the EBS volume size associated with the newly created EC2 instance. 1. Wait 5-10min for the new EBS volume increase to finalize. 1. Allow EC2 to claim the additional space by stopping and then starting your EC2 host. +2. Set up a venv to manage dependencies: + 1. `python -m venv ~/.venv/myproject-env` to create the venv + 2. `source ~/.venv/myproject-env/bin/activate` to activate the venv + 3. `deactivate` to exit the venv ### Pull Down the Code @@ -74,8 +79,8 @@ Before sending us a pull request, please ensure that: ### Run the Unit Tests 1. Install tox using `pip install tox` -1. Install coverage using `pip install .[test]` -1. cd into the sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk` +1. cd into the github project sagemaker-python-sdk folder: `cd sagemaker-python-sdk` or `cd /environment/sagemaker-python-sdk` +1. Install coverage using `pip install '.[test]'` 1. Run the following tox command and verify that all code checks and unit tests pass: `tox tests/unit` 1. You can also run a single test with the following command: `tox -e py310 -- -s -vv ::` 1. You can run coverage via runcvoerage env : `tox -e runcoverage -- tests/unit` or `tox -e py310 -- tests/unit --cov=sagemaker --cov-append --cov-report xml` @@ -113,6 +118,13 @@ If you are writing or modifying a test that creates a SageMaker job (training, t 1. If your changes include documentation changes, please see the [Documentation Guidelines](#documentation-guidelines). 1. If you include integration tests, do not mark them as canaries if they will not run in all regions. +### Lint Your Change + +Before submitting, ensure your code meets our quality and style guidelines. Run: +```shell +tox -e flake8,pylint,docstyle,black-check,twine --parallel all +``` +Address any errors or warnings before opening a pull request. ### Commit Your Change diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..f49a4e16e6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index 0633468f44..0000000000 --- a/LICENSE.txt +++ /dev/null @@ -1,193 +0,0 @@ - Apache License - Version 2.0, January 2004 - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. 
- - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. 
This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - ====================================================================================== - Amazon SageMaker Examples Subcomponents: - - The Amazon SageMaker Examples project contains subcomponents with separate - copyright notices and license terms. Your use of the source code for the - these subcomponents is subject to the terms and conditions of the following - licenses. See licenses/ for text of these licenses. - - If a folder hierarchy is listed as subcomponent, separate listings of - further subcomponents (files or folder hierarchies) part of the hierarchy - take precedence. 
- - ======================================================================================= - 2-clause BSD license - ======================================================================================= - _static/kendrasearchtools.js - _templates/search.html diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 28f1569c35..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,18 +0,0 @@ -recursive-include src/sagemaker *.py - -include src/sagemaker/image_uri_config/*.json -include src/sagemaker/pytorch/training_recipes.json -include src/sagemaker/serve/schema/*.json -include src/sagemaker/serve/requirements.txt -include src/sagemaker/modules/train/sm_recipes/training_recipes.json -recursive-include requirements * - -include VERSION -include LICENSE.txt -include README.rst -include hatch_build.py - -prune tests - -recursive-exclude * __pycache__ -recursive-exclude * *.py[co] diff --git a/NOTICE.txt b/NOTICE.txt deleted file mode 100644 index 46da7e5caa..0000000000 --- a/NOTICE.txt +++ /dev/null @@ -1,2 +0,0 @@ -Amazon SageMaker Python SDK -Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. diff --git a/README.rst b/README.rst index 68cf79c55b..e39f955deb 100644 --- a/README.rst +++ b/README.rst @@ -10,10 +10,6 @@ SageMaker Python SDK :target: https://pypi.python.org/pypi/sagemaker :alt: Latest Version -.. image:: https://img.shields.io/conda/vn/conda-forge/sagemaker-python-sdk.svg - :target: https://anaconda.org/conda-forge/sagemaker-python-sdk - :alt: Conda-Forge Version - .. image:: https://img.shields.io/pypi/pyversions/sagemaker.svg :target: https://pypi.python.org/pypi/sagemaker :alt: Supported Python Versions @@ -32,40 +28,170 @@ SageMaker Python SDK SageMaker Python SDK is an open source library for training and deploying machine learning models on Amazon SageMaker. -With the SDK, you can train and deploy models using popular deep learning frameworks **Apache MXNet** and **TensorFlow**. +With the SDK, you can train and deploy models using popular deep learning frameworks **Apache MXNet** and **PyTorch**. You can also train and deploy models with **Amazon algorithms**, which are scalable implementations of core machine learning algorithms that are optimized for SageMaker and GPU training. If you have **your own algorithms** built into SageMaker compatible Docker containers, you can train and host models using these as well. -For detailed documentation, including the API reference, see `Read the Docs `_. +To install SageMaker Python SDK, see `Installing SageMaker Python SDK <#installing-the-sagemaker-python-sdk>`_. + +❗🔥 SageMaker V3 Release +------------------------- + +Version 3.0.0 represents a significant milestone in our product's evolution. This major release introduces a modernized architecture, enhanced performance, and powerful new features while maintaining our commitment to user experience and reliability. + +**Important: Please review these breaking changes before upgrading.** + +* Older interfaces such as Estimator, Model, Predictor and all their subclasses will not be supported in V3. +* Please see our `V3 examples folder `__ for example notebooks and usage patterns. + + +Migrating to V3 +---------------- + +**Upgrading to 3.x** + +To upgrade to the latest version of SageMaker Python SDK 3.x: + +:: + + pip install --upgrade sagemaker + +If you prefer to downgrade to the 2.x version: + +:: + + pip install sagemaker==2.* + +See `SageMaker V2 Examples <#sagemaker-v2-examples>`__ for V2 documentation and examples. 
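+
+After switching versions, a quick import check confirms which major line is active. This is a minimal sketch; the exact patch version will differ per environment.
+
+.. code:: python
+
+    import sagemaker
+
+    # The package exposes its version string; it starts with "3." on the V3 line
+    # and with "2." after a downgrade to the V2 line.
+    print(sagemaker.__version__)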
+ +**Key Benefits of 3.x** + +* **Modular Architecture**: Separate PyPI packages for core, training, and serving capabilities + + * `sagemaker-core `__ + * `sagemaker-train `__ + * `sagemaker-serve `__ + * `sagemaker-mlops `__ + +* **Unified Training & Inference**: Single classes (ModelTrainer, ModelBuilder) replace multiple framework-specific classes +* **Object-Oriented API**: Structured interface with auto-generated configs aligned with AWS APIs +* **Simplified Workflows**: Reduced boilerplate and more intuitive interfaces + +**Training Experience** + +V3 introduces the unified ModelTrainer class to reduce complexity of initial setup and deployment for model training. This replaces the V2 Estimator class and framework-specific classes (PyTorchEstimator, SKLearnEstimator, etc.). + +This example shows how to train a model using a custom training container with training data from S3. + +*SageMaker Python SDK 2.x:* + +.. code:: python + + from sagemaker.estimator import Estimator + estimator = Estimator( + image_uri="my-training-image", + role="arn:aws:iam::123456789012:role/SageMakerRole", + instance_count=1, + instance_type="ml.m5.xlarge", + output_path="s3://my-bucket/output" + ) + estimator.fit({"training": "s3://my-bucket/train"}) + +*SageMaker Python SDK 3.x:* + +.. code:: python + + from sagemaker.train import ModelTrainer + from sagemaker.train.configs import InputData + + trainer = ModelTrainer( + training_image="my-training-image", + role="arn:aws:iam::123456789012:role/SageMakerRole" + ) + + train_data = InputData( + channel_name="training", + data_source="s3://my-bucket/train" + ) + + trainer.train(input_data_config=[train_data]) + +**See more examples:** `SageMaker V3 Examples <#sagemaker-v3-examples>`__ + +**Inference Experience** + +V3 introduces the unified ModelBuilder class for model deployment and inference. This replaces the V2 Model class and framework-specific classes (PyTorchModel, TensorFlowModel, SKLearnModel, XGBoostModel, etc.). + +This example shows how to deploy a trained model for real-time inference. + +*SageMaker Python SDK 2.x:* + +.. code:: python + + from sagemaker.model import Model + from sagemaker.predictor import Predictor + model = Model( + image_uri="my-inference-image", + model_data="s3://my-bucket/model.tar.gz", + role="arn:aws:iam::123456789012:role/SageMakerRole" + ) + predictor = model.deploy( + initial_instance_count=1, + instance_type="ml.m5.xlarge" + ) + result = predictor.predict(data) + +*SageMaker Python SDK 3.x:* + +.. code:: python + + from sagemaker.serve import ModelBuilder + model_builder = ModelBuilder( + model="my-model", + model_path="s3://my-bucket/model.tar.gz" + ) + endpoint = model_builder.build() + result = endpoint.invoke(...) + +**See more examples:** `SageMaker V3 Examples <#sagemaker-v3-examples>`__ + +SageMaker V3 Examples +--------------------- + +**Training Examples** + +#. `Custom Distributed Training Example `__ +#. `Distributed Local Training Example `__ +#. `Hyperparameter Training Example `__ +#. `JumpStart Training Example `__ +#. `Local Training Example `__ + +**Inference Examples** + +#. `HuggingFace Example `__ +#. `In-Process Mode Example `__ +#. `Inference Spec Example `__ +#. `JumpStart E2E Training Example `__ +#. `JumpStart Example `__ +#. `Local Mode Example `__ +#. `Optimize Example `__ +#. `Train Inference E2E Example `__ + +**ML Ops Examples** + +#. `V3 Hyperparameter Tuning Example `__ +#. `V3 Hyperparameter Tuning Pipeline `__ +#. `V3 Model Registry Example `__ +#. 
`V3 PyTorch Processing Example `__ +#. `V3 Pipeline Train Create Registry `__ +#. `V3 Processing Job Sklearn `__ +#. `V3 SageMaker Clarify `__ +#. `V3 Transform Job Example `__ + +**Looking for V2 Examples?** See `SageMaker V2 Examples <#sagemaker-v2-examples>`__ below. -Table of Contents ------------------ -#. `Installing SageMaker Python SDK <#installing-the-sagemaker-python-sdk>`__ -#. `Using the SageMaker Python SDK `__ -#. `Using MXNet `__ -#. `Using TensorFlow `__ -#. `Using Chainer `__ -#. `Using PyTorch `__ -#. `Using Scikit-learn `__ -#. `Using XGBoost `__ -#. `SageMaker Reinforcement Learning Estimators `__ -#. `SageMaker SparkML Serving <#sagemaker-sparkml-serving>`__ -#. `Amazon SageMaker Built-in Algorithm Estimators `__ -#. `Using SageMaker AlgorithmEstimators `__ -#. `Consuming SageMaker Model Packages `__ -#. `BYO Docker Containers with SageMaker Estimators `__ -#. `SageMaker Automatic Model Tuning `__ -#. `SageMaker Batch Transform `__ -#. `Secure Training and Inference with VPC `__ -#. `BYO Model `__ -#. `Inference Pipelines `__ -#. `Amazon SageMaker Operators in Apache Airflow `__ -#. `SageMaker Autopilot `__ -#. `Model Monitoring `__ -#. `SageMaker Debugger `__ -#. `SageMaker Processing `__ Installing the SageMaker Python SDK @@ -94,10 +220,10 @@ Supported Python Versions SageMaker Python SDK is tested on: -- Python 3.8 - Python 3.9 - Python 3.10 - Python 3.11 +- Python 3.12 Telemetry ~~~~~~~~~~~~~~~ @@ -191,9 +317,9 @@ Setup a Python environment, and install the dependencies listed in ``doc/require :: # conda - conda create -n sagemaker python=3.7 + conda create -n sagemaker python=3.12 conda activate sagemaker - conda install sphinx=3.1.1 sphinx_rtd_theme=0.5.0 + conda install sphinx=5.1.1 sphinx_rtd_theme=0.5.0 # pip pip install -r doc/requirements.txt @@ -254,3 +380,77 @@ For more information about the different ``content-type`` and ``Accept`` formats ``schema`` that SageMaker SparkML Serving recognizes, please see `SageMaker SparkML Serving Container`_. .. _SageMaker SparkML Serving Container: https://github.com/aws/sagemaker-sparkml-serving-container + + +SageMaker V2 Examples +--------------------- + +#. `Using the SageMaker Python SDK `__ +#. `Using MXNet `__ +#. `Using TensorFlow `__ +#. `Using Chainer `__ +#. `Using PyTorch `__ +#. `Using Scikit-learn `__ +#. `Using XGBoost `__ +#. `SageMaker Reinforcement Learning Estimators `__ +#. `SageMaker SparkML Serving <#sagemaker-sparkml-serving>`__ +#. `Amazon SageMaker Built-in Algorithm Estimators `__ +#. `Using SageMaker AlgorithmEstimators `__ +#. `Consuming SageMaker Model Packages `__ +#. `BYO Docker Containers with SageMaker Estimators `__ +#. `SageMaker Automatic Model Tuning `__ +#. `SageMaker Batch Transform `__ +#. `Secure Training and Inference with VPC `__ +#. `BYO Model `__ +#. `Inference Pipelines `__ +#. `Amazon SageMaker Operators in Apache Airflow `__ +#. `SageMaker Autopilot `__ +#. `Model Monitoring `__ +#. `SageMaker Debugger `__ +#. `SageMaker Processing `__ + +🚀 Model Fine-Tuning Support Now Available in V3 +------------------------------------------------- + +We're excited to announce model fine-tuning capabilities in SageMaker Python SDK V3! + +**What's New** + +Four new trainer classes for fine-tuning foundation models: + +* SFTTrainer - Supervised fine-tuning +* DPOTrainer - Direct preference optimization +* RLAIFTrainer - RL from AI feedback +* RLVRTrainer - RL from verifiable rewards + +**Quick Example** + +.. 
code:: python + + from sagemaker.train import SFTTrainer + from sagemaker.train.common import TrainingType + + trainer = SFTTrainer( + model="meta-llama/Llama-2-7b-hf", + training_type=TrainingType.LORA, + model_package_group_name="my-models", + training_dataset="s3://bucket/train.jsonl" + ) + + training_job = trainer.train() + +**Key Features** + +* ✨ LoRA & full fine-tuning +* 📊 MLflow integration with real-time metrics +* 🚀 Deploy to SageMaker or Bedrock +* 📈 Built-in evaluation (11 benchmarks) +* ☁️ Serverless training + +**Get Started** + +.. code:: python + + pip install sagemaker>=3.1.0 + +`📓 Example notebooks `__ \ No newline at end of file diff --git a/VERSION b/VERSION index 1ca006360a..15a2799817 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.237.4.dev0 +3.3.0 diff --git a/bin/README b/bin/README deleted file mode 100644 index 6accc729fc..0000000000 --- a/bin/README +++ /dev/null @@ -1,5 +0,0 @@ -Put your python scripts into this directory. - -Any script that has a shebang line with python in it and is executable -will be automatically included in your package. All others must be -declared explicitly in the setup.py file. diff --git a/ci-scripts/displaytime.sh b/ci-scripts/displaytime.sh deleted file mode 100755 index 6e1e474b4a..0000000000 --- a/ci-scripts/displaytime.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. - -set -euo pipefail - -echo =================== $1 execution time =================== - -start_time=$2 -end_time=`date +%s` -total_time=$(expr $end_time - $start_time + 1) -hours=$((total_time/60/60%24)) -minutes=$((total_time/60%60)) -secs=$((total_time%60)) - -(( $hours > 0 )) && printf '%d hours ' $hours -(( $minutes > 0 )) && printf '%d minutes ' $minutes -(( $hours > 0 || $minutes > 0 )) && printf 'and ' -printf '%d seconds\n\n' $secs diff --git a/ci-scripts/queue_build.py b/ci-scripts/queue_build.py deleted file mode 100644 index fcff0b9a9b..0000000000 --- a/ci-scripts/queue_build.py +++ /dev/null @@ -1,161 +0,0 @@ -# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import os -import re -import time - -import boto3 - -account = boto3.client( - "sts", region_name="us-west-2", endpoint_url="https://sts.us-west-2.amazonaws.com" -).get_caller_identity()["Account"] -bucket_name = "sagemaker-us-west-2-%s" % account - -MAX_IN_PROGRESS_BUILDS = 3 -INTERVAL_BETWEEN_CONCURRENT_RUNS = 15 # minutes -CLEAN_UP_TICKETS_OLDER_THAN = 8 # hours - - -def queue_build(): - ticket_number = int(1000 * time.time()) - files = _list_tickets() - _cleanup_tickets_older_than(files) - _wait_for_other_builds(ticket_number) - - -def _build_info_from_file(file): - filename = file.key.split("/")[2] - ticket_number, build_id, source_version = filename.split("_") - return int(ticket_number), build_id, source_version - - -def _wait_for_other_builds(ticket_number): - sorted_files = _list_tickets() - - print("build queue status:") - print() - - for order, file in enumerate(sorted_files): - file_ticket_number, build_id, source_version = _build_info_from_file(file) - print( - "%s -> %s %s, ticket number: %s status: %s" - % (order, build_id, source_version, file_ticket_number, file.key.split("/")[1]) - ) - print() - build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID")) - source_version = re.sub( - "[_/]", - "-", - os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"), - ) - filename = "%s_%s_%s" % (ticket_number, build_id, source_version) - s3_file_obj = _write_ticket(filename, status="waiting") - print("Build %s waiting to be scheduled" % filename) - - while True: - _cleanup_tickets_with_terminal_states() - waiting_tickets = _list_tickets("waiting") - if waiting_tickets: - first_waiting_ticket_number, _, _ = _build_info_from_file(_list_tickets("waiting")[0]) - else: - first_waiting_ticket_number = ticket_number - - if ( - len(_list_tickets(status="in-progress")) < 3 - and last_in_progress_elapsed_time_check() - and first_waiting_ticket_number == ticket_number - ): - # put the build in progress - print("Scheduling build %s for running.." % filename) - s3_file_obj.delete() - _write_ticket(filename, status="in-progress") - break - else: - # wait - time.sleep(30) - - -def last_in_progress_elapsed_time_check(): - in_progress_tickets = _list_tickets("in-progress") - if not in_progress_tickets: - return True - last_in_progress_ticket, _, _ = _build_info_from_file(_list_tickets("in-progress")[-1]) - _elapsed_time = int(1000 * time.time()) - last_in_progress_ticket - last_in_progress_elapsed_time = int(_elapsed_time / (1000 * 60)) # in minutes - return last_in_progress_elapsed_time > INTERVAL_BETWEEN_CONCURRENT_RUNS - - -def _cleanup_tickets_with_terminal_states(): - files = _list_tickets() - build_ids = [] - for file in files: - _, build_id, _ = _build_info_from_file(file) - build_ids.append(build_id) - - client = boto3.client("codebuild") - response = client.batch_get_builds(ids=build_ids) - - for file, build_details in zip(files, response["builds"]): - _, _build_id_from_file, _ = _build_info_from_file(file) - build_status = build_details["buildStatus"] - - if build_status != "IN_PROGRESS" and _build_id_from_file == build_details["id"]: - print( - "Build %s in terminal state: %s, deleting lock" - % (_build_id_from_file, build_status) - ) - file.delete() - - -def _cleanup_tickets_older_than(files): - oldfiles = list(filter(_file_older_than, files)) - for file in oldfiles: - print("object %s older than 8 hours. 
Deleting" % file.key) - file.delete() - return files - - -def _list_tickets(status=None): - s3 = boto3.resource("s3") - bucket = s3.Bucket(bucket_name) - prefix = "ci-integ-queue/{}/".format(status) if status else "ci-integ-queue/" - objects = [file for file in bucket.objects.filter(Prefix=prefix)] - files = list(filter(lambda x: x != prefix, objects)) - sorted_files = list(sorted(files, key=lambda y: y.key)) - return sorted_files - - -def _file_older_than(file): - timelimit = 1000 * 60 * 60 * CLEAN_UP_TICKETS_OLDER_THAN - file_ticket_number, build_id, source_version = _build_info_from_file(file) - return int(1000 * time.time()) - file_ticket_number > timelimit - - -def _write_ticket(filename, status="waiting"): - file_path = "ci-integ-queue/{}".format(status) - if not os.path.exists(file_path): - os.makedirs(file_path) - - file_full_path = file_path + "/" + filename - with open(file_full_path, "w") as file: - file.write(filename) - s3_file_obj = boto3.Session().resource("s3").Object(bucket_name, file_full_path) - s3_file_obj.upload_file(file_full_path) - print("Build %s is now in state %s" % (filename, status)) - return s3_file_obj - - -if __name__ == "__main__": - queue_build() diff --git a/doc/_static/js/datatable.js b/doc/_static/js/datatable.js deleted file mode 100644 index 897204e8df..0000000000 --- a/doc/_static/js/datatable.js +++ /dev/null @@ -1,4 +0,0 @@ -$(document).ready( function () { - $('table.datatable').DataTable(); - $('a.external').attr('target', '_blank'); -} ); \ No newline at end of file diff --git a/doc/_static/kendrasearchtools.js b/doc/_static/kendrasearchtools.js deleted file mode 100644 index 4920607010..0000000000 --- a/doc/_static/kendrasearchtools.js +++ /dev/null @@ -1,700 +0,0 @@ -/* - * kendrasearchtools.js - * ~~~~~~~~~~~~~~~~ - * - * A modification of searchtools.js (https://github.com/sphinx-doc/sphinx/blob/275d9/sphinx/themes/basic/static/searchtools.js) - * where the default full-text search implemented in searchtools.js is replaced with AWS Kendra searching over multiple - * websites. The default full-text search is still kept and implemented as a fallback in the case that the Kendra search doesn't work. - * - * :copyright: Copyright 2007-2021 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ - -if (!Scorer) { - /** - * Simple result scoring code. - */ - var Scorer = { - // Implement the following function to further tweak the score for each result - // The function takes a result array [filename, title, anchor, descr, score] - // and returns the new score. - /* - score: function(result) { - return result[4]; - }, - */ - - // query matches the full name of an object - objNameMatch: 11, - // or matches in the last dotted part of the object name - objPartialMatch: 6, - // Additive scores depending on the priority of the object - objPrio: {0: 15, // used to be importantResults - 1: 5, // used to be objectResults - 2: -5}, // used to be unimportantResults - // Used when the priority is not in the mapping. 
- objPrioDefault: 0, - - // query found in title - title: 15, - partialTitle: 7, - // query found in terms - term: 5, - partialTerm: 2 - }; -} - -if (!splitQuery) { - function splitQuery(query) { - return query.split(/\s+/); - } -} - -/** - * default rtd search (used as fallback) - */ -var Search = { - - _index : null, - _queued_query : null, - _pulse_status : -1, - - htmlToText : function(htmlString) { - var virtualDocument = document.implementation.createHTMLDocument('virtual'); - var htmlElement = $(htmlString, virtualDocument); - htmlElement.find('.headerlink').remove(); - docContent = htmlElement.find('[role=main]')[0]; - if(docContent === undefined) { - console.warn("Content block not found. Sphinx search tries to obtain it " + - "via '[role=main]'. Could you check your theme or template."); - return ""; - } - return docContent.textContent || docContent.innerText; - }, - - init : function() { - var params = $.getQueryParameters(); - if (params.q) { - var query = params.q[0]; - $('input[name="q"]')[0].value = query; - // this.performSearch(query); - } - }, - - loadIndex : function(url) { - $.ajax({type: "GET", url: url, data: null, - dataType: "script", cache: true, - complete: function(jqxhr, textstatus) { - if (textstatus != "success") { - document.getElementById("searchindexloader").src = url; - } - }}); - }, - - setIndex : function(index) { - var q; - this._index = index; - if ((q = this._queued_query) !== null) { - this._queued_query = null; - Search.query(q); - } - }, - - hasIndex : function() { - return this._index !== null; - }, - - deferQuery : function(query) { - this._queued_query = query; - }, - - stopPulse : function() { - this._pulse_status = 0; - }, - - startPulse : function() { - if (this._pulse_status >= 0) - return; - function pulse() { - var i; - Search._pulse_status = (Search._pulse_status + 1) % 4; - var dotString = ''; - for (i = 0; i < Search._pulse_status; i++) - dotString += '.'; - Search.dots.text(dotString); - if (Search._pulse_status > -1) - window.setTimeout(pulse, 500); - } - pulse(); - }, - - /** - * perform a search for something (or wait until index is loaded) - */ - performSearch : function(query) { - // create the required interface elements - this.out = $('#search-results'); - this.title = $('#search-results h2:first'); // $('

<h2>' + _('Searching') + '</h2>').appendTo(this.out); - this.dots = $('#search-results span:first'); //$('<span></span>').appendTo(this.title); - this.status = $('#search-results p:first'); // $('<p class="search-summary">&nbsp;</p>').appendTo(this.out); - this.output = $('#search-results ul:first'); //$('