Skip to content

Commit 762ef41

Browse files
Basic Xorb creation + Add xet-core WASM bindings (#1616)
This doesn't handle dedup at all yet, will not be used as is even if I merge it.

What it does:
- Write chunks into xorb
- Write chunk headers
- Handle compression
- Handle xorb size limit and xorb chunk count limit

Questions:
- ~~Are there xorb headers? I'm writing chunk headers before each chunk, but is there a xorb header at the beginning of the xorb to write?~~ No, thanks @assafvayner
- For now I only test lz4 compression length against uncompressed length, I do not bother with bg4. Should I test both?
- Is the xet backend hardened against invalid chunks/xorbs?

cc @assafvayner @seanses @hoytak @sirahd @rajatarya for viz and if you have answers to the questions :)

---------

Co-authored-by: Sylvestre Bouchot <Kakulukian@users.noreply.github.com>
1 parent fdf8bb7 commit 762ef41

16 files changed

+2780
-51
lines changed

packages/hub/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
xet-core-wasm-build

packages/hub/.prettierignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@ pnpm-lock.yaml
22
# In order to avoid code samples to have tabs, they don't display well on npm
33
README.md
44
dist
5-
sha256.js
5+
sha256.js
6+
src/vendor/xet-chunk/chunker_wasm_bg.js

packages/hub/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
"prepare": "pnpm run build",
4242
"test": "vitest run",
4343
"test:browser": "vitest run --browser.name=chrome --browser.headless --config vitest-browser.config.mts",
44-
"check": "tsc"
44+
"check": "tsc",
45+
"build:xet-wasm": "./scripts/build-xet-wasm.sh -t bundler -c -b hoytak/250714-eliminate-mdb-v1"
4546
},
4647
"files": [
4748
"src",
packages/hub/scripts/build-xet-wasm.sh

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
#!/bin/bash
#
# Regenerate xet-chunk wasm files directly from xet-core codebase

# Stop on the first failing command, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Configuration. These values are never reassigned by the script, so they are
# marked readonly to catch accidental overwrites.
readonly REPO_URL="https://github.com/huggingface/xet-core.git"
readonly DEFAULT_BRANCH="main"
readonly DEFAULT_PACKAGE="hf_xet_thin_wasm"
readonly DEFAULT_JS_TARGET="web"
# Directory the repository is cloned into, relative to the invocation directory.
readonly CLONE_DIR="xet-core-wasm-build"

# Colors for output (escape sequences are expanded later by `echo -e`)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
20+
21+
# Function to print colored output
22+
log() {
    # $1: message to print, prefixed with a green [INFO] tag on stdout.
    local message="$1"
    echo -e "${GREEN}[INFO]${NC} ${message}"
}
25+
26+
warn() {
    # $1: warning text, prefixed with a yellow [WARN] tag.
    # Written to stderr so warnings are not mixed into captured stdout.
    echo -e "${YELLOW}[WARN]${NC} $1" >&2
}
29+
30+
error() {
    # $1: error text, prefixed with a red [ERROR] tag.
    # Written to stderr, then the script terminates with exit status 1.
    echo -e "${RED}[ERROR]${NC} $1" >&2
    exit 1
}
34+
35+
# Function to check if a command exists
36+
command_exists() {
    # $1: command name. Succeeds (status 0) when `command -v` can resolve it;
    # all output of the lookup is discarded.
    local name="$1"
    command -v "$name" &>/dev/null
}
39+
40+
# Help function
41+
show_help() {
    # Print the usage text to stdout. The heredoc is unquoted so $0 and the
    # DEFAULT_* values are expanded into the text.
    # The REQUIREMENTS section describes what the script actually does: it
    # requires rustup and installs wasm-pack / wasm-bindgen-cli through cargo
    # when they are missing; it does not install a nightly toolchain itself.
    cat << EOF
Usage: $0 [OPTIONS]

Build WASM packages from xet-core repository.

OPTIONS:
-b, --branch BRANCH Git branch to checkout (default: $DEFAULT_BRANCH)
-p, --package PACKAGE WASM package to build: hf_xet_thin_wasm or hf_xet_wasm (default: $DEFAULT_PACKAGE)
-t, --target TARGET JavaScript target: web, nodejs, bundler, no-modules, deno (default: $DEFAULT_JS_TARGET)
-o, --output DIR Output directory to copy built WASM files
-c, --clean Clean clone directory before starting
-h, --help Show this help message

EXAMPLES:
$0 # Build hf_xet_thin_wasm from main branch
$0 -b feature-branch # Build from specific branch
$0 -p hf_xet_wasm # Build the full WASM package
$0 -o ./my-project/wasm # Copy output to specific directory
$0 -t nodejs -o ./dist # Build for Node.js and copy to dist

REQUIREMENTS:
- Git
- Rust installed via rustup (wasm-pack and wasm-bindgen-cli are installed with cargo if missing)
- Internet connection for downloading dependencies

EOF
}
69+
70+
# Parse command line arguments
BRANCH="$DEFAULT_BRANCH"
PACKAGE="$DEFAULT_PACKAGE"
JS_TARGET="$DEFAULT_JS_TARGET"
OUTPUT_DIR=""
CLEAN=false
# Remembered so the script can return here after changing into the clone.
ORIGINAL_DIR=$(pwd)

# Fail with a readable message when an option that takes a value is the last
# argument. Without this, `set -u` aborts on "$2" with "unbound variable".
#   $1: option as typed by the user   $2: number of arguments still unparsed
require_option_value() {
    if [[ "$2" -lt 2 ]]; then
        error "Option $1 requires a value. Use -h for help."
    fi
}

while [[ $# -gt 0 ]]; do
    case $1 in
        -b|--branch)
            require_option_value "$1" "$#"
            BRANCH="$2"
            shift 2
            ;;
        -p|--package)
            require_option_value "$1" "$#"
            PACKAGE="$2"
            if [[ "$PACKAGE" != "hf_xet_thin_wasm" && "$PACKAGE" != "hf_xet_wasm" ]]; then
                error "Invalid package: $PACKAGE. Must be 'hf_xet_thin_wasm' or 'hf_xet_wasm'"
            fi
            shift 2
            ;;
        -t|--target)
            require_option_value "$1" "$#"
            JS_TARGET="$2"
            # Same set of targets that the help text advertises.
            case "$JS_TARGET" in
                web|nodejs|bundler|no-modules|deno) ;;
                *)
                    error "Invalid target: $JS_TARGET. Must be one of: web, nodejs, bundler, no-modules, deno"
                    ;;
            esac
            shift 2
            ;;
        -o|--output)
            require_option_value "$1" "$#"
            OUTPUT_DIR="$2"
            shift 2
            ;;
        -c|--clean)
            CLEAN=true
            shift
            ;;
        -h|--help)
            show_help
            exit 0
            ;;
        *)
            error "Unknown option: $1. Use -h for help."
            ;;
    esac
done
112+
113+
# Check prerequisites: each missing tool aborts the script through error().
log "Checking prerequisites..."

command_exists git || error "Git is not installed. Please install Git first."

command_exists rustup || error "Rustup is not installed. Please install Rust from https://rustup.rs/"
123+
124+
# Clean previous build if requested
if [[ "$CLEAN" == true && -d "$CLONE_DIR" ]]; then
    log "Cleaning previous build directory: $CLONE_DIR"
    rm -rf "$CLONE_DIR"
fi

# Clone the repository, or refresh an existing clone
if [[ -d "$CLONE_DIR" ]]; then
    log "Directory $CLONE_DIR already exists. Using existing clone."
    cd "$CLONE_DIR"
    log "Fetching latest changes..."
    # The clone below is created with --depth=1, which implies a single-branch
    # clone: a plain `git fetch origin` only updates the branch that was
    # cloned first, so origin/$BRANCH may not exist for another -b value.
    # Fetch the requested ref explicitly and check out exactly what was fetched.
    git fetch --depth=1 origin "$BRANCH"
    # --force discards local modifications to tracked files, as the previous
    # `git reset --hard` did.
    git checkout --force --detach FETCH_HEAD
    log "Repository updated successfully. Current directory: $(pwd)"
else
    log "Cloning xet-core repository (branch: $BRANCH, depth: 1)..."
    git clone --depth=1 --branch="$BRANCH" "$REPO_URL" "$CLONE_DIR"
    cd "$CLONE_DIR"
    log "Repository cloned successfully. Current directory: $(pwd)"
fi
145+
146+
# Install required Rust toolchain and components
log "Setting up Rust toolchain..."

# The toolchain steps below are intentionally left disabled, as in the
# original script:
#   rustup toolchain install nightly
#   rustup target add wasm32-unknown-unknown --toolchain nightly
#   rustup component add rust-src --toolchain nightly

# Install a cargo crate unless the binary it provides is already on PATH.
#   $1: binary name to look for   $2: crate name passed to `cargo install`
ensure_cargo_tool() {
    local binary="$1"
    local crate="$2"
    if command_exists "$binary"; then
        log "$crate already installed"
        return 0
    fi
    cargo install "$crate"
}

# Install required tools
log "Installing wasm-pack and wasm-bindgen-cli..."
ensure_cargo_tool wasm-pack wasm-pack
ensure_cargo_tool wasm-bindgen wasm-bindgen-cli
174+
175+
# Change to the package directory (relative to the root of the clone)
log "Building WASM package: $PACKAGE"
cd "$PACKAGE"

# Export the JS target into the environment of the build script started below.
# NOTE(review): build_wasm.sh is presumably what reads JS_TARGET — confirm in xet-core.
export JS_TARGET

# Build the WASM package. Both supported packages are built the same way, by
# running the build_wasm.sh found in the package directory, so no
# per-package branching is needed.
log "Starting WASM build (target: $JS_TARGET)..."
chmod +x build_wasm.sh
./build_wasm.sh

log "WASM build completed successfully!"
195+
196+
# The current directory is the package directory inside the clone. Resolve a
# relative -o/--output path against the directory the script was started
# from, so that e.g. `-o ./dist` does not end up inside the clone.
if [[ -n "$OUTPUT_DIR" && "$OUTPUT_DIR" != /* ]]; then
    OUTPUT_DIR="$ORIGINAL_DIR/$OUTPUT_DIR"
fi
# Tracks whether files were really copied, so the summary does not claim a
# copy that never happened.
OUTPUT_COPIED=false

# Check if pkg directory exists (created by wasm-pack)
if [[ -d "pkg" ]]; then
    log "Generated files in pkg directory:"
    ls -la pkg/

    # Copy to output directory if specified
    if [[ -n "$OUTPUT_DIR" ]]; then
        log "Copying WASM files to output directory: $OUTPUT_DIR"
        mkdir -p "$OUTPUT_DIR"
        cp -r pkg/* "$OUTPUT_DIR/"
        OUTPUT_COPIED=true
        log "Files copied to $OUTPUT_DIR"
        log "Contents of output directory:"
        ls -la "$OUTPUT_DIR"
    fi
else
    warn "pkg directory not found. Build may have failed or used different output location."
fi

# Return to original directory
cd "$ORIGINAL_DIR"

log "Build process completed!"
log "Built package: $PACKAGE"
log "Branch: $BRANCH"
log "JavaScript target: $JS_TARGET"
if [[ "$OUTPUT_COPIED" == true ]]; then
    log "Output copied to: $OUTPUT_DIR"
fi
224+
225+
# Vendor the build output into the hub package: the generated JS glue file is
# copied as-is, and the .wasm binary is embedded as base64 in a TS module.
# Paths are relative to the directory the script was started from.
PKG_DIR="$CLONE_DIR/$PACKAGE/pkg"
# File names follow the package that was built rather than being hard-coded
# to hf_xet_thin_wasm.
# NOTE(review): assumes the build names its outputs <package>_bg.js and
# <package>_bg.wasm; the check below fails loudly if it does not.
GLUE_JS="$PKG_DIR/${PACKAGE}_bg.js"
WASM_BIN="$PKG_DIR/${PACKAGE}_bg.wasm"
VENDOR_DIR="./src/vendor/xet-chunk"
BASE64_TS="$VENDOR_DIR/chunker_wasm_bg.wasm.base64.ts"

if [[ ! -f "$GLUE_JS" || ! -f "$WASM_BIN" ]]; then
    error "Expected build outputs not found: $GLUE_JS and $WASM_BIN"
fi

cp "$GLUE_JS" "$VENDOR_DIR/chunker_wasm_bg.js"

# Generate the TS module in one redirection so a failure cannot leave a
# half-appended file from several separate writes.
{
    echo "// Generated by build-xet-wasm.sh"
    echo "export const wasmBase64 = atob("
    echo "\`"
    # Read from stdin and strip newlines first: GNU base64 wraps its output
    # while BSD/macOS base64 does not, so normalise before folding.
    base64 < "$WASM_BIN" | tr -d '\n' | fold -w 100
    # fold leaves the last line unterminated once the newlines are stripped.
    echo
    # The whitespace clean-up must run on the base64 TEXT, inside the atob()
    # call. Applied to the decoded result it would delete every 0x0A byte
    # from the binary and corrupt it.
    cat << 'EOF'
`
		.trim()
		.replaceAll("\n", "")
);
const wasmBinary = new Uint8Array(wasmBase64.length);
for (let i = 0; i < wasmBase64.length; i++) {
	wasmBinary[i] = wasmBase64.charCodeAt(i);
}
export { wasmBinary };
EOF
} > "$BASE64_TS"

echo -e "\n${GREEN}🎉 Success!${NC} Your WASM package is ready to use."

packages/hub/src/utils/XetBlob.spec.ts

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { describe, expect, it } from "vitest";
22
import type { ReconstructionInfo } from "./XetBlob";
3-
import { bg4_regoup_bytes, XetBlob } from "./XetBlob";
3+
import { bg4_regroup_bytes, bg4_split_bytes, XetBlob } from "./XetBlob";
44
import { sum } from "./sum";
55

66
describe("XetBlob", () => {
@@ -173,30 +173,72 @@ describe("XetBlob", () => {
173173

174174
// Checks bg4_regroup_bytes against fixed input/output pairs, one for each
// possible remainder of the input length modulo 4.
// The suite title now matches the renamed function (was "bg4_regoup_bytes").
describe("bg4_regroup_bytes", () => {
	it("should regroup bytes when the array is %4 length", () => {
		expect(bg4_regroup_bytes(new Uint8Array([1, 5, 2, 6, 3, 7, 4, 8]))).toEqual(
			new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8])
		);
	});

	it("should regroup bytes when the array is %4 + 1 length", () => {
		expect(bg4_regroup_bytes(new Uint8Array([1, 5, 9, 2, 6, 3, 7, 4, 8]))).toEqual(
			new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9])
		);
	});

	it("should regroup bytes when the array is %4 + 2 length", () => {
		expect(bg4_regroup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 4, 8]))).toEqual(
			new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
		);
	});

	it("should regroup bytes when the array is %4 + 3 length", () => {
		expect(bg4_regroup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8]))).toEqual(
			new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
		);
	});
});
199199

200+
describe("bg4_split_bytes", () => {
	// One row per remainder of the input length modulo 4:
	// [length label used in the test name, input bytes, expected output bytes]
	const splitCases: Array<[string, number[], number[]]> = [
		["%4", [1, 2, 3, 4, 5, 6, 7, 8], [1, 5, 2, 6, 3, 7, 4, 8]],
		["%4 + 1", [1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 5, 9, 2, 6, 3, 7, 4, 8]],
		["%4 + 2", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [1, 5, 9, 2, 6, 10, 3, 7, 4, 8]],
		["%4 + 3", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8]],
	];

	for (const [lengthLabel, inputBytes, expectedBytes] of splitCases) {
		it(`should split bytes when the array is ${lengthLabel} length`, () => {
			const actual = bg4_split_bytes(new Uint8Array(inputBytes));
			expect(actual).toEqual(new Uint8Array(expectedBytes));
		});
	}

	it("should be the inverse of bg4_regroup_bytes", () => {
		// Splitting then regrouping must give back the input, including for
		// inputs shorter than four bytes.
		const roundTripInputs = [
			[1, 2, 3, 4, 5, 6, 7, 8],
			[1, 2, 3, 4, 5, 6, 7, 8, 9],
			[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
			[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
			[42],
			[1, 2],
			[1, 2, 3],
		].map((bytes) => new Uint8Array(bytes));

		for (const original of roundTripInputs) {
			const roundTripped = bg4_regroup_bytes(bg4_split_bytes(original));
			expect(roundTripped).toEqual(original);
		}
	});
});
241+
200242
describe("when mocked", () => {
201243
describe("loading many chunks every read", () => {
202244
it("should load different slices", async () => {

0 commit comments

Comments
 (0)