|
# Example caller workflow: extracts text from bills on a daily schedule.
# To use it, copy this file into your state data repository as
# .github/workflows/extract-text.yml

name: Text Extraction from Bills

on:
  # Manual runs from the Actions tab (also the entry point for re-dispatches).
  workflow_dispatch:
  schedule:
    # Daily at 8 AM UTC (~3 AM ET, ~12 AM PT)
    - cron: '0 8 * * *'
7 | 10 |
|
8 | 11 | jobs: |
9 | 12 | extract-text: |
10 | 13 | name: Text Extraction |
11 | 14 | runs-on: ubuntu-latest |
12 | | - timeout-minutes: 330 # 5.5 hours (recommended for large datasets) |
| 15 | + timeout-minutes: 355 # ~5.9 hours (close to GitHub's 6-hour limit) |
| 16 | + continue-on-error: true # Don't fail workflow on timeout - let restart job handle it |
13 | 17 | permissions: |
14 | 18 | contents: write |
15 | 19 |
|
|
20 | 24 | fetch-depth: 0 |
21 | 25 |
|
22 | 26 | - name: Run text extraction action |
| 27 | + id: extract |
23 | 28 | uses: windy-civi/toolkit/actions/extract@main |
24 | 29 | with: |
25 | | - state: vt # Vermont |
| 30 | + state: vt # VT |
26 | 31 | github-token: ${{ secrets.GITHUB_TOKEN }} |
27 | 32 | force-update: "false" |
28 | 33 |
|
|
35 | 40 | else |
36 | 41 | echo "⚠️ Summary file not found" |
37 | 42 | fi |
| 43 | +
|
# Auto-restart job: when the extraction job was cancelled (timeout) or failed,
# check whether unprocessed bills remain and, if so, dispatch this workflow
# again so extraction continues where it left off.
  check-and-restart:
    name: Check Status & Restart if Needed
    needs: extract-text
    runs-on: ubuntu-latest
    # always() is required here. Without a status-check function GitHub
    # implicitly prepends success(), which would skip this job in exactly the
    # failed/cancelled cases it exists to handle.
    # NOTE(review): extract-text sets `continue-on-error: true`. Confirm that
    # needs.extract-text.result still reports 'failure'/'cancelled' with that
    # setting; if it reports 'success', remove continue-on-error from that job.
    if: >-
      always() &&
      (needs.extract-text.result == 'cancelled' ||
      needs.extract-text.result == 'failure')
    permissions:
      actions: write # needed to dispatch the workflow again
      contents: read

    steps:
      - name: Checkout to check completion status
        uses: actions/checkout@v4
        with:
          # Check out the current branch tip instead of the commit that
          # triggered this run, so progress pushed during the run is visible.
          # (Presumably the extract action commits results to this branch,
          # given its `contents: write` permission - confirm.)
          ref: ${{ github.ref }}

      - name: Check if more work remains
        id: check
        shell: bash
        run: |
          # Count bills that still need text extraction.
          # Bills without _processing.text_extraction_latest_update need processing.
          # NOTE(review): this is a plain text match on the key name; a key that
          # is present but null would count as processed - confirm against the
          # metadata format written by the extractor.
          shopt -s nullglob # an unmatched glob expands to nothing, not itself
          bill_files=(country:us/state:*/sessions/*/bills/*/metadata.json)
          TOTAL_BILLS=${#bill_files[@]}
          PENDING_BILLS=0

          if [ "$TOTAL_BILLS" -gt 0 ]; then
            # printf is a shell builtin, so a very large file list cannot hit
            # the exec argument-size limit; xargs batches the grep calls.
            # grep -L prints the files that do NOT contain the marker. Its exit
            # status is not meaningful here, hence `|| true` under pipefail.
            PENDING_BILLS=$(
              { printf '%s\0' "${bill_files[@]}" \
                  | xargs -0 grep -L -- 'text_extraction_latest_update' || true; } \
                | wc -l
            )
          fi

          if [ "$PENDING_BILLS" -gt 0 ]; then
            echo "📊 Found $TOTAL_BILLS bills total, $PENDING_BILLS still need text extraction"
            echo "needs_restart=true" >> "$GITHUB_OUTPUT"
          else
            echo "⚠️ No bills found or extraction complete"
            echo "needs_restart=false" >> "$GITHUB_OUTPUT"
          fi

      - name: Trigger workflow restart
        if: steps.check.outputs.needs_restart == 'true'
        env:
          # workflow_dispatch is one of the two events GITHUB_TOKEN is allowed
          # to trigger (the other is repository_dispatch), and this job grants
          # `actions: write`, so the built-in token is enough. A PAT stored as
          # PAT_WORKFLOW_TRIGGER is still preferred when the secret is set.
          # See: https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow
          GH_TOKEN: ${{ secrets.PAT_WORKFLOW_TRIGGER || github.token }}
          # Passed through env rather than interpolated into the script text.
          REPO: ${{ github.repository }}
          REF: ${{ github.ref_name }}
        run: |
          echo "⚠️ Previous run was cancelled/failed. Restarting to continue extraction..."
          echo " (Incremental processing will skip already-completed bills)"
          # --ref restarts on the branch this run used (the default branch for
          # scheduled runs), matching the checkout above.
          gh workflow run extract-text.yml --repo "$REPO" --ref "$REF"
0 commit comments