Skyline/work/20251019 utf8 no bom #3651
# Validate UTF-8 BOM compliance against approved list
#
# Usage: .\validate-bom-compliance.ps1
# Returns: Exit code 0 if compliant, 1 if unexpected BOMs found
#
# This script is intended to be used in CI/commit hooks to prevent
# unwanted BOM introduction.

$ErrorActionPreference = "Stop"
$utf8Bom = @(0xEF, 0xBB, 0xBF)

# Approved files that are allowed to have a BOM, with explanations
$approvedBomFiles = @{
    # Visual Studio auto-generated COM type library files
    "pwiz_aux/msrc/utility/vendor_api/thermo/xrawfile2.tli" = "Visual Studio generated COM type library"
    "pwiz_tools/Skyline/Executables/BuildMethod/BuildLTQMethod/ltmethod.tlh" = "Visual Studio generated COM type library"
    "pwiz_tools/Skyline/Executables/BuildMethod/BuildLTQMethod/ltmethod.tli" = "Visual Studio generated COM type library"
    "pwiz_tools/Skyline/Executables/BuildMethod/BuildTSQEZMethod/tsqezmethod.tlh" = "Visual Studio generated COM type library"
    "pwiz_tools/Skyline/Executables/BuildMethod/BuildTSQEZMethod/tsqezmethod.tli" = "Visual Studio generated COM type library"

    # Agilent vendor data format test files (represent real instrument output)
    "pwiz/data/vendor_readers/Agilent/Reader_Agilent_Test.data/TOFsulfasMS4GHzDualMode+DADSpectra+UVSignal272-NoProfile.d/AcqData/DefaultMassCal.xml" = "Agilent vendor data format"
    "pwiz/data/vendor_readers/Agilent/Reader_Agilent_Test.data/TOFsulfasMS4GHzDualMode+DADSpectra+UVSignal272-NoProfile.d/AcqData/Devices.xml" = "Agilent vendor data format"
    "pwiz/data/vendor_readers/Agilent/Reader_Agilent_Test.data/TOFsulfasMS4GHzDualMode+DADSpectra+UVSignal272-NoProfile.d/AcqData/MSTS.xml" = "Agilent vendor data format"
    "pwiz/data/vendor_readers/Agilent/Reader_Agilent_Test.data/TOFsulfasMS4GHzDualMode+DADSpectra+UVSignal272-NoProfile.d/AcqData/acqmethod.xml" = "Agilent vendor data format"
    "pwiz_tools/Bumbershoot/bumberdash/Tests/Data/AgilentTest.d/AcqData/Devices.xml" = "Agilent vendor data format"
    "pwiz_tools/Bumbershoot/bumberdash/Tests/Data/AgilentTest.d/AcqData/acqmethod.xml" = "Agilent vendor data format"
}

Write-Host "Validating UTF-8 BOM compliance..." -ForegroundColor Cyan
Write-Host ""

# Get all Git-tracked files
$gitFiles = @(git ls-files)
$filesWithBom = @()

foreach ($file in $gitFiles) {
    # Convert forward slashes to backslashes for Windows
    $filePath = $file.Replace('/', '\')
    $fullPath = Join-Path (Get-Location) $filePath

    if (-not [System.IO.File]::Exists($fullPath)) {
        continue
    }

    try {
        # Read the first 3 bytes and compare them to the UTF-8 BOM signature
        $stream = [System.IO.File]::OpenRead($fullPath)
        try {
            $bytes = New-Object byte[] 3
            $bytesRead = $stream.Read($bytes, 0, 3)
        } finally {
            # Close in finally so a failed read cannot leak the file handle
            $stream.Close()
        }

        if ($bytesRead -ge 3 -and
            $bytes[0] -eq $utf8Bom[0] -and
            $bytes[1] -eq $utf8Bom[1] -and
            $bytes[2] -eq $utf8Bom[2]) {
            $filesWithBom += $file
        }
    } catch {
        # Skip files that can't be read
    }
}

Write-Host "Found $($filesWithBom.Count) files with UTF-8 BOM" -ForegroundColor $(if ($filesWithBom.Count -eq 0) { "Green" } else { "Yellow" })
Write-Host ""

# Check for unexpected BOMs
$unexpectedBoms = @()
$approvedBoms = @()

foreach ($file in $filesWithBom) {
    $normalizedPath = $file.Replace('\', '/')
    if ($approvedBomFiles.ContainsKey($normalizedPath)) {
        $approvedBoms += @{
            File = $file
            Reason = $approvedBomFiles[$normalizedPath]
        }
    } else {
        $unexpectedBoms += $file
    }
}

# Report approved BOMs
if ($approvedBoms.Count -gt 0) {
    Write-Host "=== Approved BOMs ($($approvedBoms.Count)) ===" -ForegroundColor Green
    foreach ($entry in $approvedBoms) {
        Write-Host "  [OK] $($entry.File)" -ForegroundColor Green
        Write-Host "       Reason: $($entry.Reason)" -ForegroundColor Gray
    }
    Write-Host ""
}

# Report unexpected BOMs
if ($unexpectedBoms.Count -gt 0) {
    Write-Host "=== UNEXPECTED BOMs FOUND ($($unexpectedBoms.Count)) ===" -ForegroundColor Red
    Write-Host ""
    Write-Host "The following files have UTF-8 BOMs but are not on the approved list:" -ForegroundColor Red
    Write-Host ""
    foreach ($file in $unexpectedBoms) {
        Write-Host "  [ERROR] $file" -ForegroundColor Red
    }
    Write-Host ""
    Write-Host "Action required:" -ForegroundColor Yellow
    Write-Host "  1. If these files should NOT have BOM, remove it using scripts/misc/remove-bom.ps1" -ForegroundColor Yellow
    Write-Host "  2. If these files MUST have BOM, add them to the approved list in this script" -ForegroundColor Yellow
    Write-Host ""
    Write-Host "For more information, see todos/active/TODO-20251019_utf8_no_bom.md" -ForegroundColor Yellow
    Write-Host ""
    exit 1
}

# Success
Write-Host "=== VALIDATION PASSED ===" -ForegroundColor Green
Write-Host "All files with BOM are on the approved list." -ForegroundColor Green
Write-Host "Project is BOM-compliant!" -ForegroundColor Green
Write-Host ""
exit 0
# TODO: Compress vendor test data directories

**Status:** Backlog
**Priority:** Low
**Estimated Effort:** Small to Medium (1-2 days)

## Problem

Vendor data format test directories (`.d`, `.raw`, etc.) contain XML and other text files that have binary integrity requirements. These files are:

- Sensitive to encoding changes (BOM addition/removal changes SHA-1 hashes)
- Sensitive to line ending changes (`\n` vs `\r\n` changes byte offsets)
- Treated as "binary files masquerading as text" by developers
- Large and take up significant repository space

Currently, some test data like the Agilent `.d` directories contain XML files with UTF-8 BOMs that represent real instrument output and must not be modified. However, Git treats them as text files, so they are vulnerable to inadvertent modification during bulk operations (like BOM removal).
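The hash sensitivity described above can be illustrated with a short sketch (Python used here purely for illustration; the payload is a made-up minimal XML, not project data):

```python
import hashlib

# The same XML payload with and without a UTF-8 BOM prefix.
data = b'<?xml version="1.0"?><root/>'
with_bom = b'\xef\xbb\xbf' + data

# The BOM is three extra bytes at the front, so any content-addressed
# check (e.g. a SHA-1 hash) sees a completely different file.
print(hashlib.sha1(data).hexdigest())
print(hashlib.sha1(with_bom).hexdigest())
```

The two hexdigests differ, which is why adding or stripping a BOM breaks hash-based integrity checks even though the visible text is unchanged.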
## Current State

Test data is stored uncompressed in directories like:

- `pwiz/data/vendor_readers/Agilent/Reader_Agilent_Test.data/*.d/`
- `pwiz/data/vendor_readers/Waters/Reader_Waters_Test.data/*.raw/`
- Similar directories for other vendors

## Proposed Solution

Store vendor test data directories in compressed archives (.zip, .7z, or .gz):

1. **Compress test data**
   - One-time compression of existing `.d` and other vendor directories
   - Use .zip (widely supported) or .7z (better compression)
   - Expected compression: 80-90% size reduction for XML-heavy formats

2. **Update test harness**
   - Modify `VendorReaderTestHarness.cpp` to:
     - Detect if test data is compressed (check for .zip/.7z extension)
     - Extract to a temp directory before running the test
     - Run the test on the extracted data
     - Clean up the temp directory after the test
   - Handle extraction failures gracefully with clear error messages

3. **Update build system**
   - Ensure extraction tools are available in the CI environment
   - Add an extraction step if needed for build-time test data generation
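The detect/extract/run/clean-up flow in the test-harness step could look roughly like this. This is a Python sketch of the control flow only; the real harness is `VendorReaderTestHarness.cpp`, and the function name and zip-only handling here are illustrative assumptions:

```python
import shutil
import tempfile
import zipfile

def run_with_test_data(data_path, run_test):
    """Run a test against vendor data, extracting it first if compressed."""
    if not data_path.endswith('.zip'):
        # Uncompressed directory: run against it directly.
        return run_test(data_path)
    # Compressed archive: extract to a temp directory, run, always clean up.
    tmp_dir = tempfile.mkdtemp(prefix='vendor_test_')
    try:
        try:
            with zipfile.ZipFile(data_path) as archive:
                archive.extractall(tmp_dir)
        except zipfile.BadZipFile as e:
            # Fail with a clear message rather than a confusing downstream error.
            raise RuntimeError(f'Could not extract test data {data_path}: {e}')
        return run_test(tmp_dir)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
```

The `finally` block guarantees cleanup even when the test itself throws, which matters for CI machines that run many tests per session.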
## Benefits

- **Binary integrity**: Compressed archives are explicitly binary, preventing accidental text modifications
- **Repository size**: Significant space savings (XML compresses extremely well)

  > **Reviewer comment:** There are no large vendor test data XMLs. Only the reference mzMLs are large and I am against going back to zipping those unless there's a way to easily diff stuff within zips.

- **Consistency**: Matches how vendor reader DLLs are stored in `pwiz_aux/msrc/utility`

  > **Reviewer comment:** Vendor DLLs are stored encrypted. I would love to not have to do that because we could update individual DLLs instead of the entire archive. I suppose we could have encrypted each DLL separately, but that would have been pretty awkward.

- **Protection**: Prevents inadvertent BOM removal, line ending changes, or other text transformations
- **Git performance**: Fewer objects to track, faster clones/pulls

  > **Reviewer comment:** This is probably a fair point for the directory-based vendor data (not the mzMLs), because those can have tons of small files. But the biggest ones are already stored in tar.bz2 files. I see one Agilent .d, TOFsulfasMS4GHzDualMode+DADSpectra+UVSignal272-NoProfile.d, got committed outside the tarball probably because updating the tarball would mean a 9mb change when the new .d was only 1mb. So if we're going to go back to tarballing all vendor data (and I'm not convinced we should for formats like Bruker TDF with only 2-4 files per .d), it should be per raw file rather than per vendor. That's really the only infrastructure change that would be needed in this TODO, because the existing infrastructure is per vendor.
## Risks & Mitigation

| Risk | Mitigation |
|------|------------|
| Slower tests (extraction overhead) | Extract once per test run, reuse for multiple tests |
| Extraction failure in CI | Clear error messages, verify extraction tools in CI setup |
| Developer workflow disruption | Document how to add/update test data |
| Debugging difficulty | Keep some uncompressed data for quick inspection, or add extraction utility |
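The "extract once per test run, reuse" mitigation is essentially a per-archive cache. A minimal sketch of that idea (Python for illustration; the names and zip-only support are assumptions, not the project's API):

```python
import shutil
import tempfile
import zipfile

# Cache of archive path -> extracted directory, so each archive is
# unpacked at most once per run even if several tests use it.
_extracted = {}

def get_extracted_dir(archive_path):
    """Return a directory holding the archive's contents, extracting lazily."""
    if archive_path not in _extracted:
        tmp_dir = tempfile.mkdtemp(prefix='vendor_cache_')
        with zipfile.ZipFile(archive_path) as archive:
            archive.extractall(tmp_dir)
        _extracted[archive_path] = tmp_dir
    return _extracted[archive_path]

def cleanup_extracted():
    """Remove all cached extractions at the end of the run."""
    for tmp_dir in _extracted.values():
        shutil.rmtree(tmp_dir, ignore_errors=True)
    _extracted.clear()
```

Repeated calls for the same archive return the same directory, so only the first test in a run pays the extraction cost.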
## Implementation Plan

### Phase 1: Proof of Concept (4 hours)

1. Compress one `.d` directory (e.g., TOFsulfasMS4GHzDualMode test data)
2. Update `VendorReaderTestHarness.cpp` to handle extraction for that one test
3. Verify the test passes with compressed data
4. Measure compression ratio and test time impact

### Phase 2: Full Implementation (1 day)

1. Compress all vendor test data directories
2. Update all vendor reader tests
3. Update documentation for adding new test data
4. Verify all tests pass on a local machine

### Phase 3: CI Integration (2-4 hours)

1. Ensure CI environments have extraction tools
2. Run the full test suite on CI
3. Monitor for any extraction-related failures
4. Document any CI-specific requirements
## Related

- Similar to how vendor reader DLLs are stored in `pwiz_aux/msrc/utility`
- Related to BOM standardization work (TODO-20251019_utf8_no_bom.md)
- May want to exclude compressed archives from certain Git operations

## Future Considerations

- Could use this approach for other large test data
- Could add a utility script to easily extract/re-compress for debugging
- Could use Git LFS for very large compressed archives if needed
> **Reviewer comment:** We used to zip (tarball) all vendor test data and their reference mzMLs, and I stopped doing that because it was annoying to update, maintain, and diff. There is already architecture in place to use .tar.bz2 files for this. A lot of the old test data is still maintained in tar.bz2 files, and only the updated reference mzMLs are submitted plaintext. I would definitely want to keep doing that for reference mzMLs. The vendor data itself probably would be ok to put back in tarballs, but I'm not sure it's really necessary or useful except for "modify everything in the repository" scripts like you've done here, which are exceedingly unusual. For git itself, there's a gitattributes file that tells it to treat everything in vendor_readers as binary:
> `pwiz\data\vendor_readers.gitattributes`