Files
life-echo/.github/workflows/docker-build-deploy.yml
penghanyuan 8bbe6367ae fix(ci): retry SSH setup steps in remote candidate preparation
Retry remote docker login, bootstrap SSH commands, and scp uploads to handle transient connection timeout and banner exchange failures in GitHub runner environments.

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-14 17:23:31 +02:00

434 lines
16 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# API Docker:main → Staging 机(无前缀 SSH_* / DEPLOY_PATH);Tag v*.*.* → Prod 机(PROD_*)。
# 在 Repo → Settings → Secrets and variables → Actions 中配置,无需 GitHub Environments。
# Staging:SSH_HOST / SSH_USER / SSH_PRIVATE_KEY / SSH_PORT / DEPLOY_PATH
# Production:PROD_SSH_HOST / PROD_SSH_USER / PROD_SSH_PRIVATE_KEY / PROD_SSH_PORT / PROD_DEPLOY_PATH
# 阿里云镜像仍为仓库级:ALIYUN_CR_USERNAME / ALIYUN_CR_PASSWORD
#
# 勿把 PROD 私钥与 Staging 混用:staging 只读 SSH_PRIVATE_KEY,prod 只读 PROD_SSH_PRIVATE_KEY。
#
# 旧库 pg_dump 一次性迁入当前 schema:见 workflow「Legacy DB migrate (one-shot)」(手动运行,非每次构建)。
#
# 发布策略:
# - merge / push 到 main:构建并部署到 Staging 机;使用仓库中的 api/.env.staging,上传后切换为运行时 .env。
# - 手动创建并推送 tag vMAJOR.MINOR.PATCH:构建并部署到 Production;使用仓库中的 api/.env.production,上传后切换为运行时 .env。
#
# 注意:paths 过滤在 tag push 时按「被指向的 commit」判断;若该 commit 未改 api/ 与本 workflow,不会触发。
# 此时可用 workflow_dispatch 补跑 main(Staging)或 vMAJOR.MINOR.PATCH tag(Production)。
name: Docker Build and Deploy

# Triggers: pushes to main or to v*.*.* tags that touch api/ or the workflows
# directory, plus manual runs that may override the ref to deploy.
on:
  push:
    branches:
      - main
    tags:
      - 'v*.*.*'
    paths:
      - 'api/**'
      - '.github/workflows/**'
  workflow_dispatch:
    inputs:
      branch:
        description: '部署 ref分支名或 tag如 main / v1.0.0);留空则使用当前运行所选 ref'
        required: false
        type: string
        default: ''

concurrency:
  # Key the group on the ref that is actually deployed (the manual `branch`
  # input when given, otherwise the ref that started the run), so a manual run
  # overriding the ref is serialized with push-triggered runs for the same ref
  # instead of landing in a different group.
  group: docker-api-${{ github.event.inputs.branch || github.ref_name }}
  # Queue runs instead of cancelling one that is already in progress.
  cancel-in-progress: false

env:
  IMAGE_NAME: lifecho-api
  REGISTRY: crpi-u2903xccyzd6nqnc.cn-shanghai.personal.cr.aliyuncs.com
  REGISTRY_NAMESPACE: huaga
  # Environment values are strings; quote so no YAML loader retypes it.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'
jobs:
  # Maps the requested ref to a deploy target (staging / prod) and an image
  # tag, and rejects any other ref.
  resolve-deploy-target:
    name: Resolve deploy target
    runs-on: ubuntu-latest
    # This job only runs a shell snippet; it needs no token scopes.
    permissions: {}
    outputs:
      deploy_ref: ${{ steps.deploy_target.outputs.deploy_ref }}
      image_tag: ${{ steps.deploy_target.outputs.image_tag }}
      target: ${{ steps.deploy_target.outputs.target }}
    steps:
      - name: Determine deploy target
        id: deploy_target
        env:
          # Passed through the environment rather than spliced into the script
          # text, so a crafted ref name cannot be executed as shell code.
          REQUESTED_REF: ${{ github.event.inputs.branch }}
          RUN_REF_NAME: ${{ github.ref_name }}
        run: |
          # The manual input wins when non-empty; otherwise use the run's ref.
          REF_NAME="${REQUESTED_REF:-$RUN_REF_NAME}"
          echo "deploy_ref=$REF_NAME" >> "$GITHUB_OUTPUT"
          if [[ "$REF_NAME" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
            # vMAJOR.MINOR.PATCH -> production; image tag is the version without "v".
            echo "target=prod" >> "$GITHUB_OUTPUT"
            echo "image_tag=${REF_NAME#v}" >> "$GITHUB_OUTPUT"
          elif [ "$REF_NAME" = "main" ] || [ "$REF_NAME" = "master" ]; then
            echo "target=staging" >> "$GITHUB_OUTPUT"
            echo "image_tag=latest" >> "$GITHUB_OUTPUT"
          else
            echo "::error::不支持部署 ref '$REF_NAME'。Staging release 只允许 mainProduction release 只允许 vMAJOR.MINOR.PATCH tag。"
            exit 1
          fi

  # Runs the API test suite against the ref that will be deployed.
  test:
    name: API tests
    needs: resolve-deploy-target
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - uses: actions/checkout@v5
        with:
          ref: ${{ needs.resolve-deploy-target.outputs.deploy_ref }}
      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          version: "0.7.3"
      - name: Sync deps and run pytest
        working-directory: api
        run: |
          uv sync --dev
          uv run pytest --tb=short -q

  # Builds ./api/Dockerfile and pushes the image to the registry.
  build-and-push:
    name: Build and Push Docker Image
    needs:
      - resolve-deploy-target
      - test
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
        with:
          ref: ${{ needs.resolve-deploy-target.outputs.deploy_ref }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Log in to Alibaba Cloud Container Registry
        env:
          REGISTRY: ${{ env.REGISTRY }}
          USERNAME: ${{ secrets.ALIYUN_CR_USERNAME }}
          PASSWORD: ${{ secrets.ALIYUN_CR_PASSWORD }}
        run: |
          echo "正在登录到阿里云容器镜像服务..."
          echo "Registry: $REGISTRY"
          echo "Username: $USERNAME"
          # The password length is intentionally not logged: it would disclose
          # information about the secret.
          # Use printf so the password is passed through verbatim (including
          # special characters).
          printf '%s\n' "$PASSWORD" | docker login "$REGISTRY" --username="$USERNAME" --password-stdin
          echo "✅ 登录成功!"
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.REGISTRY_NAMESPACE }}/${{ env.IMAGE_NAME }}
          # The raw tag from resolve-deploy-target is the one the deploy job
          # pulls; it already yields `latest` for staging. There is
          # deliberately no separate default-branch `latest` rule: it would
          # also fire for a production build dispatched from main with a tag
          # override and repoint `latest` at that build.
          # NOTE(review): the ref/sha rules presumably follow the ref that
          # started the run, not the checked-out deploy_ref - confirm for
          # manual runs that override the ref.
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=sha,prefix=sha-
            type=raw,value=${{ needs.resolve-deploy-target.outputs.image_tag }}
      - name: Build and push Docker image
        uses: docker/build-push-action@v6
        with:
          context: ./api
          file: ./api/Dockerfile
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Uploads a candidate compose file and env file to the target host over SSH,
  # pulls the image there, swaps the candidate in, and waits for the api
  # container to report healthy.
  deploy:
    name: Deploy to Remote Server
    runs-on: ubuntu-latest
    needs:
      - resolve-deploy-target
      - build-and-push
    if: github.event_name != 'pull_request'
    # Least privilege, consistent with the test and build jobs: only the
    # checkout needs the token.
    permissions:
      contents: read
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
        with:
          ref: ${{ needs.resolve-deploy-target.outputs.deploy_ref }}
      - name: Ensure production SSH secret is set
        if: needs.resolve-deploy-target.outputs.target == 'prod'
        env:
          PROD_SSH_PRIVATE_KEY: ${{ secrets.PROD_SSH_PRIVATE_KEY }}
        run: |
          if [ -z "$PROD_SSH_PRIVATE_KEY" ]; then
            echo "::error::PROD_SSH_PRIVATE_KEY 未配置或为空,无法部署生产。请在 Repository secrets 中设置 PROD_SSH_*。"
            exit 1
          fi
      - name: Ensure staging SSH secret is set
        if: needs.resolve-deploy-target.outputs.target != 'prod'
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
        run: |
          if [ -z "$SSH_PRIVATE_KEY" ]; then
            echo "::error::SSH_PRIVATE_KEY 未配置或为空,无法部署 staging。请在 Repository secrets 中设置 SSH_HOST / SSH_USER / SSH_PRIVATE_KEY / SSH_PORT / DEPLOY_PATH。"
            exit 1
          fi
      # Do not use `prod && PROD_KEY || SSH_KEY`: when the PROD key is empty
      # the expression falls back to the staging key, and the production host
      # then answers "Permission denied".
      - name: Set up SSH (production)
        if: needs.resolve-deploy-target.outputs.target == 'prod'
        uses: webfactory/ssh-agent@v0.9.1
        with:
          ssh-private-key: ${{ secrets.PROD_SSH_PRIVATE_KEY }}
      - name: Set up SSH (staging)
        if: needs.resolve-deploy-target.outputs.target != 'prod'
        uses: webfactory/ssh-agent@v0.9.1
        with:
          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
      - name: Export deploy connection env
        env:
          # Secrets reach the script as environment variables instead of being
          # interpolated into its source text.
          DEPLOY_TARGET: ${{ needs.resolve-deploy-target.outputs.target }}
          PROD_SSH_HOST: ${{ secrets.PROD_SSH_HOST }}
          PROD_SSH_USER: ${{ secrets.PROD_SSH_USER }}
          PROD_SSH_PORT: ${{ secrets.PROD_SSH_PORT || '22' }}
          PROD_DEPLOY_PATH: ${{ secrets.PROD_DEPLOY_PATH || '/opt/life-echo' }}
          STAGING_SSH_HOST: ${{ secrets.SSH_HOST }}
          STAGING_SSH_USER: ${{ secrets.SSH_USER }}
          STAGING_SSH_PORT: ${{ secrets.SSH_PORT || '22' }}
          STAGING_DEPLOY_PATH: ${{ secrets.DEPLOY_PATH || '/opt/life-echo' }}
        run: |
          set -euo pipefail
          if [ "$DEPLOY_TARGET" = "prod" ]; then
            DEPLOY_HOST="$PROD_SSH_HOST"
            DEPLOY_USER="$PROD_SSH_USER"
            DEPLOY_PORT="$PROD_SSH_PORT"
            DEPLOY_DIR="$PROD_DEPLOY_PATH"
          else
            DEPLOY_HOST="$STAGING_SSH_HOST"
            DEPLOY_USER="$STAGING_SSH_USER"
            DEPLOY_PORT="$STAGING_SSH_PORT"
            DEPLOY_DIR="$STAGING_DEPLOY_PATH"
          fi
          # Fail here with a clear message instead of letting a later ssh call
          # fail on an empty host or user.
          if [ -z "$DEPLOY_HOST" ] || [ -z "$DEPLOY_USER" ]; then
            echo "::error::部署目标 '$DEPLOY_TARGET' 的 SSH 主机或用户未配置或为空。"
            exit 1
          fi
          # Later steps read these as SSH_HOST / SSH_USER / SSH_PORT /
          # COMPOSE_DIR / SSH_BASE_OPTS.
          {
            echo "SSH_HOST=$DEPLOY_HOST"
            echo "SSH_USER=$DEPLOY_USER"
            echo "SSH_PORT=$DEPLOY_PORT"
            echo "COMPOSE_DIR=$DEPLOY_DIR"
            echo "SSH_BASE_OPTS=-o BatchMode=yes -o ConnectTimeout=15 -o ConnectionAttempts=3 -o ServerAliveInterval=20 -o ServerAliveCountMax=6 -o TCPKeepAlive=yes"
          } >> "$GITHUB_ENV"
      - name: Add server to known hosts
        run: |
          set -euo pipefail
          mkdir -p ~/.ssh
          touch ~/.ssh/known_hosts
          SCAN_OUT="$(mktemp)"
          KEYSCAN_OK=0
          for i in 1 2 3; do
            echo "ssh-keyscan attempt ${i}/3: ${SSH_HOST}:${SSH_PORT:-22}"
            # Require non-empty output as well as a zero exit status:
            # ssh-keyscan may exit 0 without having fetched any key, which
            # would leave known_hosts without an entry for the host.
            if ssh-keyscan -T 10 -H -p "${SSH_PORT:-22}" "${SSH_HOST}" > "$SCAN_OUT" 2>/tmp/ssh-keyscan.err \
              && [ -s "$SCAN_OUT" ]; then
              cat "$SCAN_OUT" >> ~/.ssh/known_hosts
              KEYSCAN_OK=1
              break
            fi
            sleep 2
          done
          rm -f "$SCAN_OUT"
          if [ "$KEYSCAN_OK" -eq 1 ]; then
            echo "SSH_COMMON_OPTS=${SSH_BASE_OPTS}" >> "$GITHUB_ENV"
          else
            # NOTE(review): this fallback turns host key verification off for
            # the rest of the run, and the registry password is later piped
            # over that connection. Kept as a deliberate best-effort; consider
            # pinning the host key instead.
            echo "::warning::ssh-keyscan failed. Falling back to non-strict host checking for this run."
            if [ -f /tmp/ssh-keyscan.err ]; then
              echo "--- ssh-keyscan stderr ---"
              cat /tmp/ssh-keyscan.err || true
              echo "--------------------------"
            fi
            echo "SSH_COMMON_OPTS=${SSH_BASE_OPTS} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" >> "$GITHUB_ENV"
          fi
      - name: Prepare remote candidate release
        env:
          IMAGE_TAG: ${{ env.REGISTRY }}/${{ env.REGISTRY_NAMESPACE }}/${{ env.IMAGE_NAME }}:${{ needs.resolve-deploy-target.outputs.image_tag }}
          REGISTRY: ${{ env.REGISTRY }}
          ALIYUN_CR_USERNAME: ${{ secrets.ALIYUN_CR_USERNAME }}
          ALIYUN_CR_PASSWORD: ${{ secrets.ALIYUN_CR_PASSWORD }}
          DEPLOY_TARGET: ${{ needs.resolve-deploy-target.outputs.target }}
        run: |
          set -euo pipefail
          echo "准备候选版本..."
          echo "镜像标签: $IMAGE_TAG"
          echo "部署目录: $COMPOSE_DIR/api"

          # retry LABEL CMD...: run CMD up to 3 times, 3 s apart; on the third
          # failure emit an ::error:: annotation and return non-zero.
          retry() {
            local label="$1"
            shift
            local attempt
            for attempt in 1 2 3; do
              echo "${label}尝试 ${attempt}/3..."
              if "$@"; then
                return 0
              fi
              sleep 3
            done
            echo "::error::${label}连续 3 次失败。"
            return 1
          }

          # Log the remote Docker daemon in to the registry; the password is
          # sent on stdin. SSH_COMMON_OPTS is left unquoted on purpose so it
          # splits into separate ssh options.
          remote_docker_login() {
            printf '%s\n' "$ALIYUN_CR_PASSWORD" | ssh ${SSH_COMMON_OPTS:-} -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" \
              "docker login '$REGISTRY' --username='$ALIYUN_CR_USERNAME' --password-stdin"
          }

          # Create the deploy directories and the shared Docker network if missing.
          remote_bootstrap() {
            ssh ${SSH_COMMON_OPTS:-} -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" "
              set -euo pipefail
              mkdir -p '$COMPOSE_DIR/api'
              mkdir -p '$COMPOSE_DIR/api/backups'
              docker network inspect api_life-echo-network >/dev/null 2>&1 || docker network create api_life-echo-network
            "
          }

          # Upload the compose file and the selected env file under candidate names.
          upload_candidate_files() {
            scp ${SSH_COMMON_OPTS:-} -P "$SSH_PORT" ./api/docker-compose.yml "$SSH_USER@$SSH_HOST:$COMPOSE_DIR/api/docker-compose.candidate.yml" \
              && scp ${SSH_COMMON_OPTS:-} -P "$SSH_PORT" "$ENV_SRC" "$SSH_USER@$SSH_HOST:$COMPOSE_DIR/api/.env.candidate"
          }

          # Pull the image remotely and point the candidate compose file at it.
          remote_pull_candidate() {
            ssh ${SSH_COMMON_OPTS:-} -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" "
              set -euo pipefail
              cd '$COMPOSE_DIR/api'
              echo '拉取候选镜像: $IMAGE_TAG'
              docker pull '$IMAGE_TAG'
              sed -i.tmp 's|image:.*lifecho-api.*|image: $IMAGE_TAG|g' docker-compose.candidate.yml
              sed -i.tmp 's|image:.*life-echo-api.*|image: $IMAGE_TAG|g' docker-compose.candidate.yml
              rm -f docker-compose.candidate.yml.tmp 2>/dev/null || true
            "
          }

          retry "远端 docker login " remote_docker_login
          retry "远端目录与网络初始化" remote_bootstrap

          if [ "$DEPLOY_TARGET" = "prod" ]; then
            ENV_SRC="api/.env.production"
          else
            ENV_SRC="api/.env.staging"
          fi
          if [ ! -f "$ENV_SRC" ]; then
            echo "::error::缺少 $ENV_SRC无法部署。"
            exit 1
          fi

          # Only inspect effective config lines: full-line comments are dropped
          # so that `# KEY=your_*` examples do not block the deploy. The lines
          # are captured once and matched from a here-string, so no
          # `grep | grep -q` pipeline can be misreported under pipefail.
          ACTIVE_LINES="$(grep -Ev '^[[:space:]]*#' "$ENV_SRC" || true)"
          # A placeholder is a value that starts with your_ / replace_with_, or
          # that is exactly "...".
          if grep -Eq '^[[:space:]]*[A-Za-z_][A-Za-z0-9_]*=(your_|replace_with_|\.{3}$)' <<<"$ACTIVE_LINES"; then
            echo "::error::$ENV_SRC 仍包含占位符值,请先完善环境文件。"
            exit 1
          fi
          if grep -Eq '^DATABASE_URL=.*@localhost:|^REDIS_URL=redis://localhost' <<<"$ACTIVE_LINES"; then
            echo "::error::$ENV_SRC 包含 localhost 数据库或 Redis 地址,容器内将无法连接。"
            exit 1
          fi
          if grep -Eq '^DATABASE_URL=.*@postgresql:' <<<"$ACTIVE_LINES"; then
            echo "::error::$ENV_SRC 仍引用过期主机名 postgresql当前 compose 服务名应为 postgres。"
            exit 1
          fi

          echo "上传候选 compose 与环境文件..."
          retry "上传候选文件" upload_candidate_files
          retry "远端拉取候选镜像" remote_pull_candidate
      - name: Promote candidate release
        env:
          COMPOSE_FILE: docker-compose.yml
        run: |
          set -euo pipefail
          echo "切换线上版本,容器启动时将自动执行 Alembic..."
          # Unescaped $VARS below are expanded on the runner before the script
          # is sent; \$-escaped ones are evaluated on the remote host.
          ssh ${SSH_COMMON_OPTS:-} -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" "
            set -euo pipefail
            cd '$COMPOSE_DIR/api'
            # Keep .predeploy copies of the files that are about to be replaced.
            if [ -f '$COMPOSE_FILE' ]; then
              cp '$COMPOSE_FILE' '${COMPOSE_FILE}.predeploy'
            fi
            if [ -f '.env.production' ]; then
              cp '.env.production' '.env.production.predeploy'
            fi
            if [ -f '.env' ]; then
              cp '.env' '.env.predeploy'
            fi
            mv 'docker-compose.candidate.yml' '$COMPOSE_FILE'
            mv '.env.candidate' '.env'
            if ! docker compose -f '$COMPOSE_FILE' up -d --remove-orphans; then
              # Dump container state and recent logs before failing the step.
              echo 'docker compose up 失败,输出 api 状态与最近日志...'
              docker compose -f '$COMPOSE_FILE' ps || true
              API_CID=\$(docker compose -f '$COMPOSE_FILE' ps -q api || true)
              if [ -n \"\$API_CID\" ]; then
                docker inspect -f '{{json .State}}' \"\$API_CID\" || true
              fi
              docker compose -f '$COMPOSE_FILE' logs --tail=120 api || true
              docker compose -f '$COMPOSE_FILE' logs --tail=80 celery-worker || true
              exit 1
            fi
            echo '等待服务启动...'
            sleep 20
            docker image prune -f || true
            docker compose -f '$COMPOSE_FILE' ps
          "
      - name: Verify deployment
        run: |
          set -euo pipefail
          echo "验证部署状态..."
          ssh ${SSH_COMMON_OPTS:-} -p "$SSH_PORT" "$SSH_USER@$SSH_HOST" "
            set -euo pipefail
            cd '$COMPOSE_DIR/api'
            docker compose ps
            API_CID=\$(docker compose ps -q api)
            if [ -z \"\$API_CID\" ]; then
              echo '未找到 api 容器'
              docker compose logs --tail=80 api || true
              exit 1
            fi
            API_HEALTH=''
            # Poll up to 24 times, 5 s apart. When the container defines no
            # health check the template prints .State.Status instead, which is
            # never 'healthy', so the step fails once the polls are used up.
            for _ in \$(seq 1 24); do
              API_HEALTH=\$(docker inspect -f '{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}' \"\$API_CID\")
              echo \"api health: \$API_HEALTH\"
              if [ \"\$API_HEALTH\" = 'healthy' ]; then
                break
              fi
              sleep 5
            done
            if [ \"\$API_HEALTH\" != 'healthy' ]; then
              echo 'api 容器未在预期时间内变为 healthy'
              docker inspect -f '{{json .State}}' \"\$API_CID\" || true
              docker compose logs --tail=80 api || true
              exit 1
            fi
            docker compose logs --tail=50 api
          "