Refactor Codacy workflow for UTF-8 sanitization #138
Workflow file for this run
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
# This workflow checks out the code, re-encodes tracked text files that are
# not valid UTF-8, runs a Codacy security scan, and integrates the results
# with the GitHub Advanced Security code scanning feature.
#
# For more information on the Codacy security scan action usage and
# parameters, see https://github.com/codacy/codacy-analysis-cli-action.
# For more information on the Codacy Analysis CLI in general, see
# https://github.com/codacy/codacy-analysis-cli.
name: Codacy Security Scan

# Runs on pushes and pull requests targeting main, plus a weekly schedule
# (07:44 UTC every Sunday).
on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '44 7 * * 0'

# Workflow-wide default; the job below widens it only where needed.
permissions:
  contents: read

jobs:
  codacy-security-scan:
    permissions:
      contents: read          # lets the checkout step fetch the code
      security-events: write  # lets the upload step publish SARIF results
      actions: read           # NOTE(review): presumably needed by upload-sarif on private repos - confirm
    name: Codacy Security Scan
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Re-encodes, in the runner's working copy only, every tracked text file
      # that is not valid UTF-8. Nothing is committed back to the repository.
      - name: Sanitize non-UTF-8 files (convert to UTF-8)
        run: |
          set -euo pipefail

          echo "Scanning tracked files for non-UTF-8 contents..."

          # Files above this size (10 MiB) are skipped rather than inspected.
          readonly max_bytes=$((10 * 1024 * 1024))
          converted=0
          skipped=0
          errors=0

          # Scratch files live outside the checkout so a failed conversion can
          # never leave stray files behind for the scanner to pick up.
          file_list=$(mktemp)
          scratch=$(mktemp)
          trap 'rm -f -- "$file_list" "$scratch"' EXIT

          # Materialise the list first: if `git ls-files` fails, `set -e`
          # aborts the step before any file has been touched.
          git ls-files -z > "$file_list"

          # The loop reads the NUL-delimited list from fd 3 instead of a
          # pipeline. A pipeline would run the loop in a subshell and throw
          # away the counters, making the summary and the error gate useless.
          while IFS= read -r -d '' f <&3; do
            [[ -n "$f" ]] || continue

            # Only regular files can be re-encoded. This skips symlinks,
            # submodule entries and paths deleted from the working tree.
            if [[ -L "$f" || ! -f "$f" ]]; then
              echo "SKIP (not a regular file): $f"
              skipped=$((skipped+1))
              continue
            fi

            # Skip very large files. `--` protects paths that start with '-'.
            filesize=$(stat -c %s -- "$f" || echo 0)
            if (( filesize > max_bytes )); then
              echo "SKIP (too large): $f"
              skipped=$((skipped+1))
              continue
            fi

            # Skip obvious binary types. A failing `file` yields an empty
            # string, which simply does not match the pattern.
            mime=$(file -b --mime -- "$f" || true)
            if grep -qiE 'charset=binary|application/octet-stream|image/|audio/|video/|executable' <<<"$mime"; then
              echo "SKIP (binary mime): $f -> $mime"
              skipped=$((skipped+1))
              continue
            fi

            # iconv exits non-zero on invalid UTF-8 sequences; its diagnostics
            # are intentionally discarded. Reading via stdin avoids any
            # option parsing of the file name.
            if iconv -f UTF-8 -t UTF-8 < "$f" > /dev/null 2>&1; then
              # already valid UTF-8
              continue
            fi

            echo "CONVERT: $f (non-UTF-8 detected: $mime)"

            # First attempt: treat the content as ISO-8859-1 (latin1). The
            # result is copied back with `cat >` rather than `mv` so the
            # original file keeps its mode bits (e.g. the executable bit).
            if iconv -f ISO-8859-1 -t UTF-8 < "$f" > "$scratch" 2>/dev/null \
              && cat -- "$scratch" > "$f"; then
              echo " -> iconv ISO-8859-1 -> UTF-8 succeeded for $f"
              converted=$((converted+1))
              continue
            fi

            # Fallback: decode as UTF-8 with replacement characters so the
            # result is guaranteed to be valid UTF-8. The path is passed as
            # sys.argv[1]; a single-line -c program avoids the indentation
            # pitfalls of a heredoc inside a YAML block scalar.
            if python3 -c 'import pathlib, sys; p = pathlib.Path(sys.argv[1]); p.write_bytes(p.read_bytes().decode("utf-8", "replace").encode("utf-8"))' "$f"; then
              echo " -> re-encoded with replacement for $f"
              converted=$((converted+1))
              continue
            else
              echo " -> failed to convert $f"
              errors=$((errors+1))
            fi
          done 3< "$file_list"

          echo "Sanitization complete. Converted: $converted, Skipped: $skipped, Errors: $errors"
          if (( errors != 0 )); then
            echo "One or more files could not be sanitized. Please inspect the repository." >&2
            exit 2
          fi
        shell: bash

      # The action is pinned to a full commit SHA.
      - name: Run Codacy Analysis CLI
        uses: codacy/codacy-analysis-cli-action@d840f886c4bd4edc059706d09c6a1586111c540b
        env:
          # NOTE(review): presumably makes the JVM-based CLI read sources as UTF-8 - confirm
          JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8
        with:
          project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
          verbose: true
          output: results.sarif
          format: sarif
          gh-code-scanning-compat: true
          # 2147483647 is the largest signed 32-bit integer; presumably chosen
          # so the number of issues never fails this step - confirm.
          max-allowed-issues: 2147483647

      # Publishes the SARIF file written by the previous step.
      - name: Upload SARIF results file
        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: results.sarif