Skip to content

CI/CD Integration

Integrate youBencha into your CI/CD pipeline to run AI agent evaluations automatically on every pull request or on a schedule.

.github/workflows/youbencha.yml
# Runs the youBencha suite for pull requests targeting main and once a day,
# fails the job when any evaluator did not pass, and uploads the run artifacts
# regardless of the outcome.
name: youBencha Evaluation

on:
  pull_request:
    branches: [main]
  schedule:
    - cron: '0 6 * * *'  # Daily at 6 AM UTC

jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install youBencha
        run: npm install -g youbencha

      - name: Run Evaluation
        run: yb run -c suite.yaml
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Check Results
        run: |
          FAILED=$(jq '.summary.failed' .youbencha-workspace/run-*/artifacts/results.json)
          # Fail closed: a missing field (jq prints "null") or several matching
          # result files (multi-line output) would otherwise make the numeric
          # comparison below error out and fall through to the success message.
          if ! [[ "$FAILED" =~ ^[0-9]+$ ]]; then
            echo "::error::Could not read a single .summary.failed count (got: '$FAILED')"
            exit 1
          fi
          if [ "$FAILED" -gt 0 ]; then
            echo "❌ Evaluation failed: $FAILED evaluators did not pass"
            exit 1
          fi
          echo "✅ All evaluators passed"

      - name: Upload Artifacts
        uses: actions/upload-artifact@v4
        # Upload even when an earlier step failed so the results can be inspected.
        if: always()
        with:
          name: evaluation-results
          path: .youbencha-workspace/run-*/artifacts/
          # The workspace directory name starts with a dot; upload-artifact v4
          # skips hidden files and directories unless this is enabled.
          include-hidden-files: true

For evaluations using GitHub Copilot CLI:

.github/workflows/youbencha-copilot.yml
# Runs the youBencha suite with the GitHub Copilot CLI on pull requests
# targeting main, generates a report from the results, and uploads both.
name: youBencha with Copilot

on:
  pull_request:
    branches: [main]

jobs:
  evaluate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-node@v4
        with:
          node-version: '20'

      - name: Install Dependencies
        # NOTE(review): confirm this is the Copilot CLI package youBencha expects.
        run: |
          npm install -g youbencha
          npm install -g @githubnext/github-copilot-cli

      - name: Authenticate GitHub CLI
        # Pass the token through an environment variable instead of expanding
        # the secret directly into the script text. The variable is deliberately
        # not named GH_TOKEN/GITHUB_TOKEN, because `gh auth login` refuses to
        # store credentials while either of those is set.
        run: gh auth login --with-token <<< "$GH_AUTH_TOKEN"
        env:
          GH_AUTH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Run Evaluation
        run: yb run -c suite.yaml
        env:
          # The Copilot token is exposed to the run under the GITHUB_TOKEN name.
          GITHUB_TOKEN: ${{ secrets.COPILOT_TOKEN }}

      - name: Generate Report
        run: yb report --from .youbencha-workspace/run-*/artifacts/results.json

      - name: Upload Results
        uses: actions/upload-artifact@v4
        # Upload even when an earlier step failed so the results can be inspected.
        if: always()
        with:
          name: evaluation-report
          path: |
            .youbencha-workspace/run-*/artifacts/report.md
            .youbencha-workspace/run-*/artifacts/results.json
          # The workspace directory name starts with a dot; upload-artifact v4
          # skips hidden files and directories unless this is enabled.
          include-hidden-files: true
# Step fragment: runs the suite, then fails the step unless the overall status
# recorded in results.json is "passed".
- name: Evaluate and Gate
  run: |
    yb run -c suite.yaml
    STATUS=$(jq -r '.summary.overall_status' .youbencha-workspace/run-*/artifacts/results.json)
    if [ "$STATUS" != "passed" ]; then
      # "::error::" is a workflow command that surfaces the message as an error annotation.
      echo "::error::youBencha evaluation failed"
      exit 1
    fi
# Step fragment: posts the generated youBencha report as a pull request comment.
# NOTE(review): creating a comment needs a token with `pull-requests: write`;
# confirm the enclosing job grants it.
- name: Comment on PR
  uses: actions/github-script@v7
  if: github.event_name == 'pull_request'
  with:
    script: |
      const fs = require('fs');
      const path = require('path');

      // Node's fs does not expand shell globs, so resolve the run-* directory
      // explicitly instead of passing the pattern to readFileSync.
      const workspace = '.youbencha-workspace';
      const reports = fs.existsSync(workspace)
        ? fs.readdirSync(workspace)
            .filter((entry) => entry.startsWith('run-'))
            .map((entry) => path.join(workspace, entry, 'artifacts', 'report.md'))
            .filter((candidate) => fs.existsSync(candidate))
            .sort()
        : [];

      if (reports.length === 0) {
        core.setFailed(`No report.md found under ${workspace}/run-*/artifacts/`);
        return;
      }

      // With several runs, take the last name in sort order.
      // NOTE(review): assumes run directory names sort chronologically; confirm.
      const report = fs.readFileSync(reports[reports.length - 1], 'utf8');

      await github.rest.issues.createComment({
        owner: context.repo.owner,
        repo: context.repo.repo,
        issue_number: context.issue.number,
        body: `## youBencha Evaluation Results\n\n${report}`
      });
# Runs the regression suite once a day and commits the results file into
# history/ under the current date.
name: Daily Regression

on:
  schedule:
    - cron: '0 6 * * *'  # 6 AM UTC daily

jobs:
  regression:
    runs-on: ubuntu-latest
    # `git push` below uses the workflow token, which needs write access to
    # repository contents.
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Run Regression Suite
        run: |
          npm install -g youbencha
          yb run -c suites/regression.yaml

      - name: Store Results
        run: |
          DATE=$(date +%Y-%m-%d)
          # Create the directory first; cp fails if history/ does not exist yet.
          mkdir -p history
          cp .youbencha-workspace/run-*/artifacts/results.json "./history/${DATE}.json"

      - name: Commit History
        run: |
          git config user.name "github-actions"
          # NOTE(review): the published address was garbled by e-mail
          # obfuscation; this is the conventional bot address - confirm.
          git config user.email "github-actions@github.com"
          git add history/
          # "|| true" keeps the step green when there is nothing new to commit.
          git commit -m "Add regression results for $(date +%Y-%m-%d)" || true
          git push

Run multiple suites in parallel:

# Workflow fragment: runs one job per suite name in the matrix and uploads
# each suite's artifacts under its own artifact name.
jobs:
  evaluate:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        suite:
          - auth-feature
          - api-endpoints
          - database-migrations
    steps:
      - uses: actions/checkout@v4

      - name: Run Suite
        run: |
          npm install -g youbencha
          yb run -c suites/${{ matrix.suite }}.yaml

      - name: Upload Results
        uses: actions/upload-artifact@v4
        # Upload even when the suite failed so the results can be inspected.
        if: always()
        with:
          # Artifact names must be unique per job, hence the suite suffix.
          name: results-${{ matrix.suite }}
          path: .youbencha-workspace/run-*/artifacts/
          # The workspace directory name starts with a dot; upload-artifact v4
          # skips hidden files and directories unless this is enabled.
          include-hidden-files: true

Store sensitive values in GitHub Secrets:

# Step fragment: exposes repository secrets to the evaluation run as
# environment variables scoped to this step.
- name: Run Evaluation
  run: yb run -c suite.yaml
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    COPILOT_TOKEN: ${{ secrets.COPILOT_TOKEN }}
    SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
suite.yaml
# youBencha suite for CI: runs the copilot-cli agent with a prompt file, checks
# the resulting diff and an agentic-judge assertion, then calls a webhook.
name: ci-evaluation
# NOTE(review): `${{ ... }}` is GitHub Actions expression syntax, which GitHub
# only evaluates inside workflow files. Confirm youBencha (or a templating
# step) substitutes these values; also note `github.head_ref` is only set for
# pull request events.
repo: 'https://github.com/${{ github.repository }}.git'
branch: '${{ github.head_ref }}'

agent:
  type: copilot-cli
  config:
    prompt_file: ./prompts/ci-task.md

evaluators:
  - name: git-diff
    config:
      assertions:
        max_files_changed: 20
  - name: agentic-judge
    config:
      type: copilot-cli
      assertions:
        ci_ready: "Changes are CI-ready. Score 0-1."

post_evaluation:
  - name: webhook
    config:
      # Placeholder for the SLACK_WEBHOOK_URL environment variable; quoted so
      # the scalar is always read as a string.
      url: '${SLACK_WEBHOOK_URL}'
  1. Use secrets for tokens and webhooks
  2. Upload artifacts for debugging failures
  3. Set appropriate timeouts for agent execution
  4. Use matrix builds for multiple suites
  5. Store history for trend analysis