Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/deploy_sphinx_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.10" ]
python-version: [ "3.11" ]
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down Expand Up @@ -54,7 +54,7 @@ jobs:
name: SphinxDoc
path: 'docs/sphinx_doc/build'
- uses: peaceiris/actions-gh-pages@v3
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
if: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/')) }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: 'docs/sphinx_doc/build'
40 changes: 0 additions & 40 deletions .github/workflows/sphinx_docs_linkcheck.yml

This file was deleted.

2 changes: 1 addition & 1 deletion docs/awesome_llm_data.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Due to the rapid development in the field, this repository and our paper are con
## News
+ 🎉 [2025-06-04] Our [Data-Model Co-development Survey](https://ieeexplore.ieee.org/document/11027559) has been accepted by IEEE Transactions on Pattern Analysis and Machine Intelligence (**TPAMI**)! Welcome to explore and contribute to this awesome-list.
+ [2025-05-25] We added 20 academic papers related to this survey.
+ ![new](https://img.alicdn.com/imgextra/i4/O1CN01kUiDtl1HVxN6G56vN_!!6000000000764-2-tps-43-19.png) [2024-10-23] We built a [dynamic table](https://modelscope.github.io/data-juicer/_static/awesome-list.html) based on the [paper list](#paper-list) that supports filtering and searching.
+ ![new](https://img.alicdn.com/imgextra/i4/O1CN01kUiDtl1HVxN6G56vN_!!6000000000764-2-tps-43-19.png) [2024-10-23] We built a [dynamic table](https://modelscope.github.io/data-juicer/en/main/_static/awesome-list.html) based on the [paper list](#paper-list) that supports filtering and searching.
+ ![new](https://img.alicdn.com/imgextra/i4/O1CN01kUiDtl1HVxN6G56vN_!!6000000000764-2-tps-43-19.png) [2024-10-22] We restructured our [paper list](#paper-list) to provide more streamlined information.


Expand Down
1 change: 1 addition & 0 deletions docs/sphinx_doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ doc, please run the following commands:
```bash
# $~/data_juicer/docs/sphinx_doc
# 1. install the sphinx requirements and init the sphinx-quickstart
# Note: Please run this in a Python>=3.11 environment
uv pip install "py-data-juicer[dev]"

# 2. auto generate and build the doc
Expand Down
1 change: 1 addition & 0 deletions docs/sphinx_doc/README_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Data-Juicer 借助 Sphinx 构建文档。
```bash
# $~/data_juicer/docs/sphinx_doc
# 1.安装 sphinx 的依赖并初始化 sphinx-quickstart
# 注意:请在python>=3.11环境下运行
uv pip install "py-data-juicer[dev]"
# 2. 运行文档构建脚本
./build_doc.sh
Expand Down
73 changes: 69 additions & 4 deletions docs/sphinx_doc/build_doc.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,73 @@
#!/bin/bash

# Repository root: two directories above the location of this script
# (the script is expected to live in docs/sphinx_doc).
PROJECT_ROOT=$(cd "$(dirname "$0")/../.." && pwd)
# Directory under the repository root whose entries are cleaned up as worktrees.
WORKTREES_DIR="${PROJECT_ROOT}/.worktrees"

#######################################
# Cleanup function: remove leftover git worktrees under WORKTREES_DIR.
#
# Git is invoked with `git -C "$PROJECT_ROOT"` instead of changing directory,
# so the caller's working directory is never touched, even when PROJECT_ROOT
# does not exist.
#
# Globals:
#   PROJECT_ROOT  (read) - repository root used for the git operations
#   WORKTREES_DIR (read) - directory whose sub-directories are removed;
#                          expected to be an absolute path
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0. Every step is best-effort: git failures are deliberately silenced and
#   fall back to plain directory deletion.
#######################################
cleanup_worktrees() {
  local wt_dir wt_name

  echo "Cleaning up git worktrees..."
  echo "Project root: $PROJECT_ROOT"

  # 1. Prune invalid worktree references
  git -C "$PROJECT_ROOT" worktree prune 2>/dev/null || true

  # 2. Force remove all worktrees in .worktrees directory
  if [ -d "$WORKTREES_DIR" ]; then
    echo "Found .worktrees directory at: $WORKTREES_DIR"
    for wt_dir in "$WORKTREES_DIR"/*; do
      if [ -d "$wt_dir" ]; then
        wt_name=$(basename "$wt_dir")
        echo " Removing worktree: $wt_name"
        # Try normal removal first
        git -C "$PROJECT_ROOT" worktree remove --force "$wt_dir" 2>/dev/null || {
          # If git does not know the directory (or git fails), delete it.
          # ${wt_dir:?} aborts instead of expanding to an empty path.
          echo " Force deleting directory: $wt_dir"
          rm -rf -- "${wt_dir:?}"
        }
      fi
    done
    # Remove empty .worktrees directory
    rmdir "$WORKTREES_DIR" 2>/dev/null || true
  fi

  # 3. Prune worktree references again
  git -C "$PROJECT_ROOT" worktree prune 2>/dev/null || true

  echo "Worktree cleanup completed"
}

# Error handler: announce the failure, clean up any worktrees, then abort
# the script with exit status 1.
handle_error() {
  printf '%s\n' "Error occurred during build process, cleaning up..."
  cleanup_worktrees
  exit 1
}

# Set up error handling: a failing top-level command runs handle_error,
# which cleans up the worktrees and exits with status 1.
trap handle_error ERR

# Store current directory (reported below for diagnostics)
ORIGINAL_DIR=$(pwd)

# Resolve the directory containing this script (docs/sphinx_doc) before any
# directory change, so a relative $0 is still interpreted correctly.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"

# Pre-cleanup: ensure clean environment before starting
echo "Pre-cleanup before build..."
echo "Current directory: $ORIGINAL_DIR"
cleanup_worktrees

# Execute the build from docs/sphinx_doc. `make clean` and build_versions.py
# are resolved relative to the working directory, so switch to the script's
# own directory instead of relying on where the script was invoked from.
cd "$SCRIPT_DIR"
echo "Starting build..."
make clean
python build_versions.py

# # Post-build cleanup (optional, as build_versions.py already has cleanup logic)
# echo "Build completed, performing final cleanup..."
# cleanup_worktrees

echo "All operations completed successfully!"
151 changes: 151 additions & 0 deletions docs/sphinx_doc/build_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env python3
import os
import re
import shutil
import subprocess
from pathlib import Path
from packaging import version as pv

# Repository structure and build configuration
# REPO_ROOT is two directory levels above the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[2]
SITE_DIR = REPO_ROOT / "docs" / "sphinx_doc" / "build" # Build output directory; per-language/per-version output goes below it
WORKTREES_DIR = REPO_ROOT / ".worktrees" # Temporary worktree directory for version builds
DOCS_REL = Path("docs/sphinx_doc") # Docs directory, relative to a repository/worktree root
LANGS = ["en", "zh_CN"] # Supported documentation languages
MIN_TAG = "v1.4.0" # Minimum version tag to build
REMOTE = "origin" # Git remote name

# Build options
KEEP_WORKTREES = False # Whether to keep worktrees after build (default: cleanup)
HAS_SUBMODULES = False # Set True if repo uses submodules and needs initialization

def run(cmd, cwd=None, env=None, check=True):
    """Print a command as a ``[RUN]`` log line, then execute it.

    Args:
        cmd: Sequence of command arguments; each item is stringified for the log.
        cwd: Working directory passed through to ``subprocess.run``.
        env: Environment mapping passed through to ``subprocess.run``.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.
    """
    printable = " ".join(str(part) for part in cmd)
    print(f"[RUN] {printable}")
    subprocess.run(cmd, cwd=cwd, env=env, check=check)

def is_valid_tag(tag: str) -> bool:
    """Tell whether ``tag`` is a ``vX.Y.Z`` tag that is not older than ``MIN_TAG``.

    Returns ``False`` when the tag does not match the pattern or when the
    version comparison raises.
    """
    if re.match(r"^v\d+\.\d+\.\d+$", tag) is None:
        return False
    try:
        meets_minimum = pv.parse(tag) >= pv.parse(MIN_TAG)
    except Exception:
        return False
    return meets_minimum

def get_tags():
    """Fetch tags from ``REMOTE`` and return the local tags accepted by ``is_valid_tag``.

    The order of the returned list follows the output of ``git tag``.
    """
    run(["git", "fetch", "--tags", "--force", REMOTE])
    listing = subprocess.check_output(["git", "tag"], text=True).strip()
    accepted = []
    for name in listing.splitlines():
        # Skip blank lines before validating.
        if name and is_valid_tag(name):
            accepted.append(name)
    return accepted

def ensure_clean_worktree(path: Path):
    """Get rid of a pre-existing worktree directory at ``path``.

    Does nothing when ``path`` does not exist. Otherwise asks git to remove
    the worktree and, if that raises, deletes the directory tree directly.
    """
    if not path.exists():
        return
    try:
        run(["git", "worktree", "remove", "--force", str(path)])
    except Exception:
        # git could not remove it; delete the directory ourselves.
        shutil.rmtree(path, ignore_errors=True)

def copy_docs_source_to(wt_root: Path):
    """Copy current docs source to worktree to unify templates and extensions.

    Copies ``REPO_ROOT / DOCS_REL`` over ``wt_root / DOCS_REL``, merging into
    an existing destination. Entries named ``.git`` or ``build``, compiled
    ``*.pyc`` files and ``__pycache__`` directories are not copied.

    Args:
        wt_root: Root directory of the worktree that receives the docs source.
    """
    src = REPO_ROOT / DOCS_REL
    dst = wt_root / DOCS_REL
    dst.parent.mkdir(parents=True, exist_ok=True)
    print(f"[COPY] {src} -> {dst}")
    shutil.copytree(
        src,
        dst,
        dirs_exist_ok=True,
        # Patterns are glob-style: "*.pyc" is needed to match bytecode files,
        # a bare ".pyc" would only match a file with exactly that name.
        ignore=shutil.ignore_patterns(".git", "build", "*.pyc", "__pycache__"),
    )

def maybe_init_submodules(wt_root: Path):
    """Run a recursive ``git submodule update --init`` inside ``wt_root``.

    Only acts when ``HAS_SUBMODULES`` is true. A failure is reported as a
    ``[WARN]`` line and does not propagate.
    """
    if not HAS_SUBMODULES:
        return
    try:
        run(["git", "submodule", "update", "--init", "--recursive"], cwd=wt_root)
    except Exception as exc:
        print(f"[WARN] submodule init failed: {exc}")

def copy_markdown_files(wt_root: Path):
    """Mirror the worktree's Markdown files into the Sphinx source directory.

    Every ``*.md`` file below ``wt_root`` is copied to
    ``wt_root / DOCS_REL / "source" / <path relative to wt_root>`` unless the
    target already exists. Files whose worktree-relative path contains
    ``outputs``, ``sphinx_doc`` or ``.github`` are skipped.

    Args:
        wt_root: Root directory of the worktree to scan.
    """
    exclude_paths = ("outputs", "sphinx_doc", ".github")
    source_root = wt_root / DOCS_REL / "source"
    # Snapshot the matches first: copies are written below wt_root, so the
    # tree would otherwise change while it is being traversed.
    for md_file in list(wt_root.rglob("*.md")):
        rel_path = md_file.relative_to(wt_root)
        # Match against the worktree-relative path only; the absolute prefix
        # (where the repository happens to be checked out) must not trigger
        # an exclusion.
        if any(marker in str(rel_path) for marker in exclude_paths):
            continue
        if not md_file.is_file():
            # e.g. a directory whose name ends in ".md"
            continue
        target = source_root / rel_path
        target.parent.mkdir(parents=True, exist_ok=True)
        if not target.exists():
            shutil.copy2(md_file, target)

def build_one(ref: str, ref_label: str, available_versions: list[str]):
    """Build documentation for a single version/branch.

    A git worktree for ``ref`` is created under ``WORKTREES_DIR / ref_label``,
    the docs are built from it for every language in ``LANGS``, and the
    worktree is removed afterwards unless ``KEEP_WORKTREES`` is set. The
    worktree is also removed when a build step raises, so a failed build does
    not leave a stale checkout behind; the original exception is re-raised.

    Args:
        ref: Git reference to check out (branch or tag name).
        ref_label: Label used for the worktree directory, the output
            sub-directory and the ``DOCS_VERSION`` environment variable.
        available_versions: All version labels, exported comma-joined as
            ``AVAILABLE_VERSIONS`` for the build.

    Raises:
        Exception: whatever a build step raises (e.g.
            ``subprocess.CalledProcessError`` from a failed command), or the
            error from removing the worktree after a successful build.
    """
    # Create and setup worktree for the specific git reference
    wt = WORKTREES_DIR / ref_label
    ensure_clean_worktree(wt)
    run(["git", "worktree", "add", "--force", str(wt), ref])

    try:
        _build_docs_in_worktree(wt, ref, ref_label, available_versions)
    except BaseException:
        # Best-effort cleanup so the build error itself is what propagates.
        if not KEEP_WORKTREES:
            _remove_worktree(wt, strict=False)
        raise

    # Cleanup worktree after a successful (or skipped) build
    if not KEEP_WORKTREES:
        _remove_worktree(wt, strict=True)


def _build_docs_in_worktree(wt: Path, ref: str, ref_label: str, available_versions: list[str]):
    """Prepare the docs source inside worktree ``wt`` and run the Sphinx builds."""
    maybe_init_submodules(wt)

    # Override docs/sphinx_doc with current repo version for unified templates
    copy_docs_source_to(wt)
    copy_markdown_files(wt)

    src = wt / DOCS_REL / "source"
    if not src.exists():
        print(f"[SKIP] {ref_label}: {src} not found")
        return

    # Setup environment variables for Sphinx build (identical for all languages)
    env = os.environ.copy()
    env["DOCS_VERSION"] = ref_label  # Documentation version label (e.g., latest, v1.5.0)
    env["GIT_REF_FOR_LINKS"] = ref  # Git reference for GitHub links
    env["AVAILABLE_VERSIONS"] = ",".join(available_versions)  # All available versions for switcher
    env["REPO_ROOT"] = str(wt)  # Version-specific repo root for copying markdown files
    env["CODE_ROOT"] = str(wt)  # Version-specific code root for autodoc imports

    # Generate the API rst files. The command does not depend on the language,
    # so it runs once per version rather than once per language.
    api_cmd = [
        "sphinx-apidoc",
        "-o", str(wt / DOCS_REL / "source" / "api"),
        str(wt / "data_juicer"),
        "-t", "_templates",
        "-e",
    ]
    run(api_cmd, env=env)

    # Build documentation for each supported language
    for lang in LANGS:
        out_dir = SITE_DIR / lang / ref_label
        out_dir.mkdir(parents=True, exist_ok=True)
        cmd = [
            "sphinx-build",
            "-b", "html",  # HTML builder
            "-D", f"language={lang}",  # Set language for this build
            "-j", "auto",
            str(src),  # Source directory
            str(out_dir),  # Output directory
        ]
        run(cmd, env=env)


def _remove_worktree(wt: Path, strict: bool):
    """Remove worktree ``wt`` and prune references; re-raise removal errors only if ``strict``."""
    try:
        run(["git", "worktree", "remove", "--force", str(wt)])
    except Exception as exc:
        if strict:
            raise
        print(f"[WARN] failed to remove worktree {wt}: {exc}")
    try:
        run(["git", "worktree", "prune"])  # Clean up worktree references
    except Exception as exc:
        print(f"[WARN] git worktree prune failed: {exc}")

def main():
    """Entry point: build the docs for ``main`` and then for every valid tag.

    Tags are ordered from newest to oldest; the full version list
    (``main`` first) is handed to each individual build.
    """
    WORKTREES_DIR.mkdir(exist_ok=True)
    newest_first = sorted(get_tags(), key=pv.parse, reverse=True)
    versions = ["main", *newest_first]

    # "main" leads the list, so it is built first, followed by the tags.
    for ref in versions:
        build_one(ref, ref, versions)

# Run the multi-version build only when this file is executed as a script.
if __name__ == "__main__":
main()
28 changes: 28 additions & 0 deletions docs/sphinx_doc/source/_static/sidebar-menu.css
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,31 @@
.sidebar-bottom-menu .section dd a.active {
font-weight: bold;
}

/* Version list in the sidebar bottom menu: cap the height and scroll the
   overflow vertically, with the scrollbar hidden by default. */
.sidebar-bottom-menu .section.version dd {
max-height: calc(3 * (1.6em + 0.6em)); /* 3 rows */
overflow-y: auto;
overscroll-behavior: contain;
scrollbar-gutter: stable;
scrollbar-width: none;
}
/* On hover, show a thin scrollbar (standard scrollbar-* properties). */
.sidebar-bottom-menu .section.version dd:hover {
scrollbar-width: thin;
scrollbar-color: var(--color-foreground-secondary) transparent;
}

/* Same hide-by-default behaviour via the ::-webkit-scrollbar pseudo-elements:
   zero-sized scrollbar when not hovered. */
.sidebar-bottom-menu .section.version dd::-webkit-scrollbar {
width: 0;
height: 0;
}
/* 6px scrollbar while the list is hovered. */
.sidebar-bottom-menu .section.version dd:hover::-webkit-scrollbar {
width: 6px;
height: 6px;
}
/* Semi-transparent grey, rounded scrollbar thumb. */
.sidebar-bottom-menu .section.version dd:hover::-webkit-scrollbar-thumb {
background-color: rgba(128,128,128,0.35);
border-radius: 6px;
}
/* Thumb becomes more opaque when the pointer is over it. */
.sidebar-bottom-menu .section.version dd:hover::-webkit-scrollbar-thumb:hover {
background-color: rgba(128,128,128,0.55);
}
Loading