diff --git a/.github/workflows/build-seqrepo-slim.yaml b/.github/workflows/build-seqrepo-slim.yaml
new file mode 100644
index 00000000..c10441d0
--- /dev/null
+++ b/.github/workflows/build-seqrepo-slim.yaml
@@ -0,0 +1,99 @@
+name: Build SeqRepo Slim Container
+
+# Publishes the SeqRepo-backed vrs-python images (one per reference assembly)
+# to the GitHub Container Registry whenever a release is published.
+on:
+  release:
+    types: [published]
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}/seqrepo-slim
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    strategy:
+      matrix:
+        assembly: [GRCh38, GRCh37]
+
+    env:
+      ASSEMBLY: ${{ matrix.assembly }}
+      BASE_TAG: ghcr.io/${{ github.repository }}/seqrepo-slim
+
+    steps:
+      - name: Checkout vrs-python repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: |
+            image=moby/buildkit:latest
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      # Each stage is pushed as its own image with inline cache metadata so
+      # that the `cache-from` references below have something to import on the
+      # next run.
+      - name: Build and push data stage
+        uses: docker/build-push-action@v5
+        with:
+          context: misc/containers
+          file: misc/containers/Dockerfile
+          platforms: linux/amd64
+          target: data
+          build-args: |
+            ASSEMBLY=${{ env.ASSEMBLY }}
+          cache-from: type=registry,ref=${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-data
+          cache-to: type=inline
+          tags: ${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-data
+          push: true
+
+      - name: Build and push build stage
+        uses: docker/build-push-action@v5
+        with:
+          context: misc/containers
+          file: misc/containers/Dockerfile
+          platforms: linux/amd64
+          target: build
+          build-args: |
+            ASSEMBLY=${{ env.ASSEMBLY }}
+          cache-from: |
+            type=registry,ref=${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-data
+            type=registry,ref=${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-build
+          cache-to: type=inline
+          tags: ${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-build
+          push: true
+
+      - name: Build and push final stage
+        uses: docker/build-push-action@v5
+        with:
+          context: misc/containers
+          file: misc/containers/Dockerfile
+          platforms: linux/amd64
+          target: vrs-python
+          build-args: |
+            ASSEMBLY=${{ env.ASSEMBLY }}
+          cache-from: |
+            type=registry,ref=${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-data
+            type=registry,ref=${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}-build
+            type=registry,ref=${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}
+          cache-to: type=inline
+          # Both matrix jobs run this step; only GRCh38 may move `latest`,
+          # otherwise the tag would point at whichever job finished last.
+          tags: |
+            ${{ env.BASE_TAG }}:${{ env.ASSEMBLY }}
+            ${{ matrix.assembly == 'GRCh38' && format('{0}:latest', env.BASE_TAG) || '' }}
+          push: true
diff --git a/docker-compose.yml b/docker-compose.yml
index 43d4348e..b11ceebc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -26,7 +26,7 @@ services:
     volumes:
       - uta_vol:/var/lib/postgresql/data
     ports:
-      - 5432:5432
+      - "5433:5432"
 
 volumes:
   seqrepo_vol:
diff --git a/misc/containers/Dockerfile b/misc/containers/Dockerfile
new file mode 100644
index 00000000..6c1c3150
--- /dev/null
+++ b/misc/containers/Dockerfile
@@ -0,0 +1,92 @@
+###
+# podman build --platform linux/amd64,linux/arm64 --build-arg ASSEMBLY=GRCh38 -t docker.io/ga4gh/vrs-python:GRCh38 -f ./Dockerfile .
+# podman build --platform linux/arm64 --build-arg ASSEMBLY=GRCh38 --target build -t docker.io/ga4gh/vrs-python:GRCh38-build -f ./Dockerfile .
+###
+# Data layer - downloads genomic reference files
+FROM python:3.12-slim AS data
+
+# Either 'GRCh38' or 'GRCh37'
+ARG ASSEMBLY="GRCh38"
+
+# Tell build-seqrepo where to put the data
+ENV SEQREPO_ROOT_DIR=/seqrepo-${ASSEMBLY}
+
+# Install curl for downloading
+RUN apt-get update && apt-get install -y curl \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /data
+
+# Download the appropriate genomic reference file based on assembly.
+# The helper script is written for bash (it enables `pipefail`), so it is
+# sourced from an explicit bash rather than the image's default /bin/sh.
+COPY build-${ASSEMBLY}.bash /data/
+RUN bash -c '. "/data/build-${ASSEMBLY}.bash" && download_reference'
+
+# Builder image
+FROM python:3.12-slim AS build
+
+# Either 'GRCh38' or 'GRCh37'
+ARG ASSEMBLY="GRCh38"
+
+# Install packages needed for the build
+RUN apt-get update && apt-get upgrade -y && apt-get install -y \
+    curl \
+    git \
+    libpq-dev \
+    python3-pip \
+    python3-venv \
+    tabix \
+    rsync \
+    zlib1g-dev \
+    postgresql-client \
+    unzip \
+    libhts3 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /vrs-python
+
+# Copy downloaded genomic files from data layer
+COPY --from=data /data/* /data/
+
+# Setup the virtual env for vrs-python
+RUN python3 -m venv /vrs-python/venv
+ENV PATH=/vrs-python/venv/bin:$PATH
+
+# Tell build-seqrepo where to put the data
+ENV SEQREPO_ROOT_DIR=/seqrepo-${ASSEMBLY}
+
+# Install vrs-python
+RUN /vrs-python/venv/bin/python3 -m pip install -U setuptools 'ga4gh.vrs[extras]' biocommons.seqrepo
+
+# Build the seqrepo data using provided function (bash, for the same reason
+# as in the data stage)
+COPY build-${ASSEMBLY}.bash /tmp/build-${ASSEMBLY}.bash
+RUN bash -c 'cd /data && . "/tmp/build-${ASSEMBLY}.bash" && build_seqrepo'
+
+# Final image
+FROM python:3.12-slim AS vrs-python
+ARG ASSEMBLY="GRCh38"
+ENV ASSEMBLY=${ASSEMBLY}
+
+# Install runtime required packages
+RUN apt-get update && apt-get install -y libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy over artifacts from the builder
+COPY --from=build /vrs-python /vrs-python
+COPY --from=build /seqrepo-${ASSEMBLY} /seqrepo-${ASSEMBLY}
+
+# Copy over run script
+COPY ./entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+# Set environment variables
+ENV GA4GH_VRS_DATAPROXY_URI="seqrepo+file:///seqrepo-${ASSEMBLY}/master"
+ENV SEQREPO_ROOT_DIR=/seqrepo-${ASSEMBLY}
+ENV VIRTUAL_ENV=/vrs-python/venv
+ENV PATH=/vrs-python/venv/bin:$PATH
+
+WORKDIR /
+
+ENTRYPOINT [ "/entrypoint.sh" ]
diff --git a/misc/containers/build-GRCh37.bash b/misc/containers/build-GRCh37.bash
new file mode 100644
index 00000000..830af257
--- /dev/null
+++ b/misc/containers/build-GRCh37.bash
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Helper functions for downloading the GRCh37 reference and loading it into SeqRepo.
+# Sourced by the container build; SEQREPO_ROOT_DIR must be set beforehand.
+set -xeuo pipefail
+
+# `${VAR:-}` keeps `set -u` from aborting before the message below is printed
+if [ -z "${SEQREPO_ROOT_DIR:-}" ]; then
+    echo "Must set SEQREPO_ROOT_DIR" >&2
+    exit 1
+fi
+
+reference_url=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.13_GRCh37/GCF_000001405.13_GRCh37_genomic.fna.gz
+reference_fname=$(basename "$reference_url")
+
+# Download the reference FASTA into the current directory
+download_reference() {
+    # --fail turns HTTP errors into a non-zero exit instead of saving the error page
+    curl --fail --location --retry 3 -O "$reference_url"
+    echo "$reference_url"
+}
+
+# Create the SeqRepo instance and load the reference from the current directory
+build_seqrepo() {
+    # Load reference genome from pre-downloaded file
+    # File should already be present from Docker data layer
+    seqrepo -r "$SEQREPO_ROOT_DIR" init
+    seqrepo -r "$SEQREPO_ROOT_DIR" load -n NCBI "$reference_fname"
+    seqrepo -r "$SEQREPO_ROOT_DIR" add-assembly-names
+}
diff --git a/misc/containers/build-GRCh38.bash b/misc/containers/build-GRCh38.bash
new file mode 100644
index 00000000..6f52ff14
--- /dev/null
+++ b/misc/containers/build-GRCh38.bash
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Helper functions for downloading the GRCh38 reference and loading it into SeqRepo.
+# Sourced by the container build; SEQREPO_ROOT_DIR must be set beforehand.
+set -xeuo pipefail
+
+# `${VAR:-}` keeps `set -u` from aborting before the message below is printed
+if [ -z "${SEQREPO_ROOT_DIR:-}" ]; then
+    echo "Must set SEQREPO_ROOT_DIR" >&2
+    exit 1
+fi
+
+reference_url=https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.26_GRCh38/GCF_000001405.26_GRCh38_genomic.fna.gz
+reference_fname=$(basename "$reference_url")
+
+# Download the reference FASTA into the current directory
+download_reference() {
+    # --fail turns HTTP errors into a non-zero exit instead of saving the error page
+    curl --fail --location --retry 3 -O "$reference_url"
+    echo "$reference_url"
+}
+
+# Create the SeqRepo instance and load the reference from the current directory
+build_seqrepo() {
+    # Load reference genome from pre-downloaded file
+    # File should already be present from Docker data layer
+    seqrepo -r "$SEQREPO_ROOT_DIR" init
+    seqrepo -r "$SEQREPO_ROOT_DIR" load -n NCBI "$reference_fname"
+    seqrepo -r "$SEQREPO_ROOT_DIR" add-assembly-names
+}
diff --git a/misc/containers/build-with-tar.sh b/misc/containers/build-with-tar.sh
new file mode 100755
index 00000000..3d304585
--- /dev/null
+++ b/misc/containers/build-with-tar.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+# Build script that creates a tar.gz with only necessary files for container build
+# and pipes it to docker/podman as the build context.
+# Usage (from the repository root): ./misc/containers/build-with-tar.sh [ASSEMBLY]
+# Optional environment overrides:
+#   IMAGE_REPO - image repository to tag (default: ghcr.io/theferrit32/vrs-python)
+#   PLATFORMS  - comma-separated target platforms (default: linux/arm64,linux/amd64)
+
+set -euo pipefail
+
+ASSEMBLY=${1:-GRCh38}
+IMAGE_REPO=${IMAGE_REPO:-ghcr.io/theferrit32/vrs-python}
+PLATFORMS=${PLATFORMS:-linux/arm64,linux/amd64}
+TAR_NAME="build-context.tar.gz"
+BUILD_DIR="build-context"
+
+echo "Building container with assembly: $ASSEMBLY"
+echo "Creating build context tar.gz..."
+
+# Clean up any existing build context
+rm -rf "$BUILD_DIR" "$TAR_NAME"
+
+# Create build directory
+mkdir -p "$BUILD_DIR"
+
+# Copy necessary files for the container build.
+# The Dockerfile COPYs its inputs relative to the context root, so everything
+# is staged flat at the top of the archive.
+echo "Copying files to build context..."
+cp misc/containers/Dockerfile "$BUILD_DIR/"
+cp misc/containers/entrypoint.sh "$BUILD_DIR/"
+cp "misc/containers/build-${ASSEMBLY}.bash" "$BUILD_DIR/"
+
+# Create the tar.gz
+echo "Creating tar.gz..."
+tar -czf "$TAR_NAME" -C "$BUILD_DIR" .
+
+# Clean up build directory
+rm -rf "$BUILD_DIR"
+
+echo "Build context created: $TAR_NAME"
+
+# Detect container runtime
+if command -v docker >/dev/null 2>&1; then
+    CONTAINER_CMD="docker"
+    echo "Using Docker for build..."
+elif command -v podman >/dev/null 2>&1; then
+    CONTAINER_CMD="podman"
+    echo "Using Podman for build..."
+else
+    echo "Error: Neither docker nor podman found in PATH" >&2
+    exit 1
+fi
+
+# Build one Dockerfile stage using the tarball as build context.
+#   $1 - Dockerfile target stage
+#   $2 - suffix appended to the image tag (empty for the final image)
+build_stage() {
+    local target=$1
+    local suffix=$2
+    # The trailing "-" tells the runtime to read the build context from stdin;
+    # the Dockerfile is picked up from the root of the archive.
+    "$CONTAINER_CMD" build \
+        --platform "$PLATFORMS" \
+        --build-arg ASSEMBLY="$ASSEMBLY" \
+        --target "$target" \
+        -t "${IMAGE_REPO}:${ASSEMBLY}${suffix}" \
+        - < "$TAR_NAME"
+}
+
+build_stage data "-data"
+build_stage build "-build"
+build_stage vrs-python ""
+
+# Clean up tar file
+# rm -f "$TAR_NAME"
+
+echo "Build completed successfully!"
diff --git a/misc/containers/entrypoint.sh b/misc/containers/entrypoint.sh
new file mode 100644
index 00000000..1a745408
--- /dev/null
+++ b/misc/containers/entrypoint.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# Run vrs-annotate on VCF input for the assembly set in the image's ASSEMBLY
+# variable; container arguments are forwarded unchanged.
+exec /vrs-python/venv/bin/vrs-annotate vcf --assembly "${ASSEMBLY}" "$@"