Skip to content

Commit 0e17cc3

Browse files
Sync extract-text workflow with latest template
Updates: (1) display the extraction summary from the .extraction_summary.txt file; (2) use the @main branch for toolkit actions; (3) include an auto-restart job with PAT support; (4) configure timeout and error handling properly. State: md
1 parent 994cfa5 commit 0e17cc3

File tree

1 file changed

+50
-3
lines changed

1 file changed

+50
-3
lines changed

.github/workflows/extract-text.yml

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
1+
# Example caller workflow for text extraction
2+
# Copy this to your state data repo as .github/workflows/extract-text.yml
3+
14
name: Text Extraction from Bills
25

36
on:
47
schedule:
5-
- cron: "0 4 * * *" # Daily at 4 AM UTC (adjust as needed)
8+
- cron: "0 8 * * *" # Daily at 8 AM UTC (~3 AM ET, ~12 AM PT)
69
workflow_dispatch:
710

811
jobs:
912
extract-text:
1013
name: Text Extraction
1114
runs-on: ubuntu-latest
12-
timeout-minutes: 330 # 5.5 hours (recommended for large datasets)
15+
timeout-minutes: 355 # ~5.9 hours (close to GitHub's 6-hour limit)
16+
continue-on-error: true # Don't fail workflow on timeout - let restart job handle it
1317
permissions:
1418
contents: write
1519

@@ -20,9 +24,10 @@ jobs:
2024
fetch-depth: 0
2125

2226
- name: Run text extraction action
27+
id: extract
2328
uses: windy-civi/toolkit/actions/extract@main
2429
with:
25-
state: md # Maryland
30+
state: md # MD
2631
github-token: ${{ secrets.GITHUB_TOKEN }}
2732
force-update: "false"
2833

@@ -35,3 +40,45 @@ jobs:
3540
else
3641
echo "⚠️ Summary file not found"
3742
fi
43+
44+
# Auto-restart job: re-dispatches this workflow when the extraction job was
# cancelled (e.g. by its timeout) or failed, and unprocessed bills remain.
# This entry belongs under the top-level `jobs:` mapping.
#
# NOTE(review): `extract-text` sets `continue-on-error: true`; confirm that
# `needs.extract-text.result` still reports 'failure'/'cancelled' in that
# configuration, otherwise the condition below can never match.
check-and-restart:
  name: Check Status & Restart if Needed
  needs: extract-text
  runs-on: ubuntu-latest
  # `always()` is required: without a status-check function GitHub implicitly
  # applies `success()`, which skips this job exactly when the job it depends
  # on failed or was cancelled.
  if: >-
    always() &&
    (needs.extract-text.result == 'cancelled' ||
    needs.extract-text.result == 'failure')
  permissions:
    actions: write
    contents: read

  steps:
    - name: Checkout to check completion status
      uses: actions/checkout@v4
      with:
        # Check out the current branch tip rather than the commit that
        # triggered the run. Presumably the extraction job pushes its
        # results during the run — TODO confirm.
        ref: ${{ github.ref }}

    - name: Check if more work remains
      id: check
      shell: bash
      run: |
        # Collect every bill metadata file. With nullglob an unmatched
        # pattern expands to nothing instead of a literal path.
        shopt -s nullglob
        bill_files=(country:us/state:*/sessions/*/bills/*/metadata.json)
        TOTAL_BILLS=${#bill_files[@]}

        # A bill still needs processing when its metadata has no
        # _processing.text_extraction_latest_update entry.
        # NOTE(review): assumes that marker is stored in metadata.json —
        # confirm. If it is not, every bill counts as remaining.
        REMAINING=0
        if [ "$TOTAL_BILLS" -gt 0 ]; then
          # grep -L lists files WITHOUT a match; `|| true` keeps a non-zero
          # grep/xargs status from aborting the step under `-eo pipefail`.
          REMAINING=$(printf '%s\0' "${bill_files[@]}" \
            | { xargs -0 grep -L -- 'text_extraction_latest_update' || true; } \
            | wc -l)
        fi

        if [ "$REMAINING" -gt 0 ]; then
          echo "📊 Found $TOTAL_BILLS bills total, $REMAINING still need text extraction"
          echo "needs_restart=true" >> "$GITHUB_OUTPUT"
        else
          echo "⚠️ No bills found or extraction complete"
          echo "needs_restart=false" >> "$GITHUB_OUTPUT"
        fi

    - name: Trigger workflow restart
      if: steps.check.outputs.needs_restart == 'true'
      env:
        # Prefer a Personal Access Token (PAT) with 'workflow' scope when one
        # is configured as the PAT_WORKFLOW_TRIGGER secret. GITHUB_TOKEN is
        # the fallback: events it creates do not start new workflow runs,
        # except workflow_dispatch and repository_dispatch, so it can run
        # `gh workflow run` given the `actions: write` permission above.
        # See: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#using-the-github_token-in-a-workflow
        GH_TOKEN: ${{ secrets.PAT_WORKFLOW_TRIGGER || github.token }}
      run: |
        echo "⚠️ Previous run was cancelled/failed. Restarting to continue extraction..."
        echo " (Incremental processing will skip already-completed bills)"
        # Use the runner-provided variable instead of interpolating an
        # expression into the script text.
        gh workflow run extract-text.yml --repo "$GITHUB_REPOSITORY"

0 commit comments

Comments
 (0)