~cytrogen/gstack

gstack/.github/workflows/evals-periodic.yml -rw-r--r-- 3.8 KiB
9c5f4797 — Cytrogen fork: 频率分级路由 + 触发式描述符重写 2 days ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Weekly (and on-demand) run of the full "periodic" eval tier.
name: Periodic Evals
on:
  schedule:
    - cron: '0 6 * * 1'  # Monday 6 AM UTC
  # Allows the workflow to be started manually as well.
  workflow_dispatch:

# All runs share one group: starting a new run cancels a run still in progress.
concurrency:
  group: evals-periodic
  cancel-in-progress: true

# Workflow-wide environment, exported to every step of every job.
env:
  # Base image repository in GHCR; the build-image job appends the tag.
  IMAGE: ghcr.io/${{ github.repository }}/ci
  EVALS_TIER: periodic
  # Quoted on purpose: environment values are strings, so avoid implicit
  # integer typing by the YAML parser.
  EVALS_ALL: "1"  # Ignore diff — run all periodic tests

jobs:
  # Makes sure the CI container image exists in GHCR. The image tag is the
  # hash of Dockerfile.ci + package.json, so the image is only built and pushed
  # when no image for the current contents has been published yet. The resolved
  # reference is exposed as the `image-tag` job output.
  build-image:
    runs-on: ubicloud-standard-2
    permissions:
      contents: read
      packages: write  # required to push the image to ghcr.io
    outputs:
      image-tag: ${{ steps.meta.outputs.tag }}
    steps:
      - uses: actions/checkout@v4

      # Derive the image reference. Registry repository names must be
      # lowercase, but github.repository keeps the owner's original casing, so
      # normalize it once here and reuse the result everywhere below.
      # $IMAGE comes from the workflow-level `env` block.
      - id: meta
        env:
          IMAGE_HASH: ${{ hashFiles('.github/docker/Dockerfile.ci', 'package.json') }}
        run: |
          image="$(printf '%s' "$IMAGE" | tr '[:upper:]' '[:lower:]')"
          {
            echo "image=${image}"
            echo "tag=${image}:${IMAGE_HASH}"
          } >> "$GITHUB_OUTPUT"

      - uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Probe the registry for the content-addressed tag. Any inspect failure
      # (including a missing image) is reported as exists=false, which leads to
      # a rebuild. The tag is passed through the environment and quoted rather
      # than being spliced into the script text.
      - name: Check if image exists
        id: check
        env:
          IMAGE_TAG: ${{ steps.meta.outputs.tag }}
        run: |
          if docker manifest inspect "$IMAGE_TAG" > /dev/null 2>&1; then
            echo "exists=true" >> "$GITHUB_OUTPUT"
          else
            echo "exists=false" >> "$GITHUB_OUTPUT"
          fi

      # The build context is .github/docker, so package.json is copied into it
      # first (presumably Dockerfile.ci reads it — confirm against the
      # Dockerfile).
      - if: steps.check.outputs.exists == 'false'
        run: cp package.json .github/docker/

      # Push under both the content-hash tag and the moving `latest` tag.
      - if: steps.check.outputs.exists == 'false'
        uses: docker/build-push-action@v6
        with:
          context: .github/docker
          file: .github/docker/Dockerfile.ci
          push: true
          tags: |
            ${{ steps.meta.outputs.tag }}
            ${{ steps.meta.outputs.image }}:latest

  # Runs every periodic E2E eval suite as its own matrix job, inside the CI
  # image resolved by the build-image job.
  evals:
    runs-on: ubicloud-standard-2
    needs: build-image
    # Least privilege: this job handles provider API keys and executes
    # repository test code, so the token is limited to reading the repository
    # (checkout) and reading packages (pulling the container image).
    permissions:
      contents: read
      packages: read
    container:
      image: ${{ needs.build-image.outputs.image-tag }}
      credentials:
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}
      # Run the container as the `runner` user.
      options: --user runner
    timeout-minutes: 25
    strategy:
      # Let the remaining suites finish even if one of them fails.
      fail-fast: false
      matrix:
        suite:
          - name: e2e-plan
            file: test/skill-e2e-plan.test.ts
          - name: e2e-design
            file: test/skill-e2e-design.test.ts
          - name: e2e-qa-bugs
            file: test/skill-e2e-qa-bugs.test.ts
          - name: e2e-qa-workflow
            file: test/skill-e2e-qa-workflow.test.ts
          - name: e2e-review
            file: test/skill-e2e-review.test.ts
          - name: e2e-workflow
            file: test/skill-e2e-workflow.test.ts
          - name: e2e-routing
            file: test/skill-routing-e2e.test.ts
          - name: e2e-codex
            file: test/codex-e2e.test.ts
          - name: e2e-gemini
            file: test/gemini-e2e.test.ts
    steps:
      # Full history is fetched (fetch-depth: 0) instead of a shallow clone.
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      # Point Bun's install cache, Bun's temp dir and TMPDIR at directories
      # under /home/runner for all later steps (presumably because the default
      # locations are not writable by the `runner` user — confirm).
      - name: Fix bun temp
        run: |
          mkdir -p /home/runner/.cache/bun
          {
            echo "BUN_INSTALL_CACHE_DIR=/home/runner/.cache/bun"
            echo "BUN_TMPDIR=/home/runner/.cache/bun"
            echo "TMPDIR=/home/runner/.cache"
          } >> "$GITHUB_ENV"

      # Reuse the node_modules tree at /opt/node_modules_cache when the
      # package.json recorded next to it matches the checked-out one;
      # otherwise fall back to a regular install.
      - name: Restore deps
        run: |
          if [ -d /opt/node_modules_cache ] && diff -q /opt/node_modules_cache/.package.json package.json >/dev/null 2>&1; then
            ln -s /opt/node_modules_cache node_modules
          else
            bun install
          fi

      - run: bun run build

      # EVALS_CONCURRENCY is the single source of truth for the concurrency
      # limit: it is exported to the tests and also feeds --max-concurrency.
      - name: Run ${{ matrix.suite.name }}
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          EVALS_CONCURRENCY: "40"
          PLAYWRIGHT_BROWSERS_PATH: /opt/playwright-browsers
        run: EVALS=1 bun test --retry 2 --concurrent --max-concurrency "$EVALS_CONCURRENCY" ${{ matrix.suite.file }}

      # Upload results even when the suite failed; one artifact per suite.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-periodic-${{ matrix.suite.name }}
          path: ~/.gstack-dev/evals/*.json
          retention-days: 90