diff --git a/docs/docs.json b/docs/docs.json
index 6fd9790dc7..5c4c995e30 100644
--- a/docs/docs.json
+++ b/docs/docs.json
@@ -18,24 +18,14 @@
"groups": [
{
"group": "Getting started",
- "pages": [
- "introduction",
- "quick-start",
- "video-walkthrough",
- "how-it-works",
- "limits"
- ]
+ "pages": ["introduction", "quick-start", "video-walkthrough", "how-it-works", "limits"]
},
{
"group": "Fundamentals",
"pages": [
{
"group": "Tasks",
- "pages": [
- "tasks/overview",
- "tasks/schemaTask",
- "tasks/scheduled"
- ]
+ "pages": ["tasks/overview", "tasks/schemaTask", "tasks/scheduled"]
},
"triggering",
"runs",
@@ -50,13 +40,7 @@
"errors-retrying",
{
"group": "Wait",
- "pages": [
- "wait",
- "wait-for",
- "wait-until",
- "wait-for-event",
- "wait-for-request"
- ]
+ "pages": ["wait", "wait-for", "wait-until", "wait-for-event", "wait-for-request"]
},
"queue-concurrency",
"versioning",
@@ -100,9 +84,7 @@
},
{
"group": "Development",
- "pages": [
- "cli-dev"
- ]
+ "pages": ["cli-dev"]
},
{
"group": "Deployment",
@@ -113,9 +95,7 @@
"deployment/atomic-deployment",
{
"group": "Deployment integrations",
- "pages": [
- "vercel-integration"
- ]
+ "pages": ["vercel-integration"]
}
]
},
@@ -166,12 +146,7 @@
},
{
"group": "Using the Dashboard",
- "pages": [
- "run-tests",
- "troubleshooting-alerts",
- "replaying",
- "bulk-actions"
- ]
+ "pages": ["run-tests", "troubleshooting-alerts", "replaying", "bulk-actions"]
},
{
"group": "Troubleshooting",
@@ -197,11 +172,7 @@
},
{
"group": "Help",
- "pages": [
- "community",
- "help-slack",
- "help-email"
- ]
+ "pages": ["community", "help-slack", "help-email"]
}
]
},
@@ -222,10 +193,7 @@
},
{
"group": "Tasks API",
- "pages": [
- "management/tasks/trigger",
- "management/tasks/batch-trigger"
- ]
+ "pages": ["management/tasks/trigger", "management/tasks/batch-trigger"]
},
{
"group": "Runs API",
@@ -271,9 +239,7 @@
"groups": [
{
"group": "Introduction",
- "pages": [
- "guides/introduction"
- ]
+ "pages": ["guides/introduction"]
},
{
"group": "Frameworks",
@@ -340,7 +306,8 @@
"guides/example-projects/claude-thinking-chatbot",
"guides/example-projects/realtime-fal-ai",
"guides/example-projects/realtime-csv-importer",
- "guides/example-projects/vercel-ai-sdk-image-generator"
+ "guides/example-projects/vercel-ai-sdk-image-generator",
+ "guides/python/python-crawl4ai"
]
},
{
@@ -386,10 +353,7 @@
"href": "https://trigger.dev"
},
"api": {
- "openapi": [
- "openapi.yml",
- "v3-openapi.yaml"
- ],
+ "openapi": ["openapi.yml", "v3-openapi.yaml"],
"playground": {
"display": "simple"
}
@@ -564,4 +528,4 @@
"destination": "/management/overview"
}
]
-}
\ No newline at end of file
+}
diff --git a/docs/guides/examples/puppeteer.mdx b/docs/guides/examples/puppeteer.mdx
index 63e802aa02..7bd59db283 100644
--- a/docs/guides/examples/puppeteer.mdx
+++ b/docs/guides/examples/puppeteer.mdx
@@ -205,7 +205,7 @@ There's no payload required for this task so you can just click "Run test" from
## Proxying
-If you're using Trigger.dev Cloud and Puppeteer or any other tool to scrape content from websites you don't own, you'll need to proxy your requests. **If you don't you'll risk getting our IP address blocked and we will ban you from our service.**
+If you're using Trigger.dev Cloud and Puppeteer or any other tool to scrape content from websites you don't own, you'll need to proxy your requests. **If you don't you'll risk getting our IP address blocked and we will ban you from our service. You must always have permission from the website owner to scrape their content.**
Here are a list of proxy services we recommend:
diff --git a/docs/guides/introduction.mdx b/docs/guides/introduction.mdx
index eebc14b921..0a72541a8f 100644
--- a/docs/guides/introduction.mdx
+++ b/docs/guides/introduction.mdx
@@ -45,6 +45,7 @@ Example projects are full projects with example repos you can fork and use. Thes
| [Realtime Fal.ai image generation](/guides/example-projects/realtime-fal-ai) | Generate an image from a prompt using Fal.ai and show the progress of the task on the frontend using Realtime. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-fal-ai-image-generation) |
| [Realtime CSV Importer](/guides/example-projects/realtime-csv-importer) | Upload a CSV file and see the progress of the task streamed to the frontend. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-csv-importer) |
| [Vercel AI SDK image generator](/guides/example-projects/vercel-ai-sdk-image-generator) | Use the Vercel AI SDK to generate images from a prompt. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/vercel-ai-sdk-image-generator) |
+| [Python web crawler](/guides/python/python-crawl4ai) | Use Python, Crawl4AI and Playwright to create a headless web crawler with Trigger.dev. | — | [View the repo](https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai) |
## Example tasks
diff --git a/docs/guides/python/python-crawl4ai.mdx b/docs/guides/python/python-crawl4ai.mdx
new file mode 100644
index 0000000000..b233506177
--- /dev/null
+++ b/docs/guides/python/python-crawl4ai.mdx
@@ -0,0 +1,177 @@
+---
+title: "Python headless browser web crawler example"
+sidebarTitle: "Python headless web crawler"
+description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev."
+---
+
+import ScrapingWarning from "/snippets/web-scraping-warning.mdx";
+import PythonLearnMore from "/snippets/python-learn-more.mdx";
+
+## Prerequisites
+
+- A project with [Trigger.dev initialized](/quick-start)
+- [Python](https://www.python.org/) installed on your local machine
+
+## Overview
+
+This demo showcases how to use Trigger.dev with Python to build a web crawler that uses a headless browser to navigate websites and extract content.
+
+## Features
+
+- [Trigger.dev](https://trigger.dev) for background task orchestration
+- Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script
+- [Crawl4AI](https://github.com/unclecode/crawl4ai), an open source LLM friendly web crawler
+- A custom [Playwright extension](https://playwright.dev/) to create a headless Chromium browser
+
+<ScrapingWarning />
+
+## GitHub repo
+
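+<Card
+  title="View the project on GitHub"
+  icon="GitHub"
+  href="https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai"
+>
+  Click here to view the full code for this project in our examples repository on GitHub. You can
+  fork it and use it as a starting point for your own project.
+</Card>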
+
+## The code
+
+### Build configuration
+
+After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file:
+
+```ts trigger.config.ts
+import { defineConfig } from "@trigger.dev/sdk/v3";
+import { pythonExtension } from "@trigger.dev/python/extension";
+import type { BuildContext, BuildExtension } from "@trigger.dev/core/v3/build";
+
+export default defineConfig({
+  project: "<project ref>",
+ // Your other config settings...
+ build: {
+ extensions: [
+ // This is required to use the Python extension
+ pythonExtension(),
+ // This is required to create a headless chromium browser with Playwright
+ installPlaywrightChromium(),
+ ],
+ },
+});
+
+// This is a custom build extension to install Playwright and Chromium
+export function installPlaywrightChromium(): BuildExtension {
+ return {
+ name: "InstallPlaywrightChromium",
+ onBuildComplete(context: BuildContext) {
+ const instructions = [
+ // Base and Chromium dependencies
+ `RUN apt-get update && apt-get install -y --no-install-recommends \
+ curl unzip npm libnspr4 libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \
+ libasound2 libnss3 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \
+ libgbm1 libxkbcommon0 \
+ && apt-get clean && rm -rf /var/lib/apt/lists/*`,
+
+ // Install Playwright and Chromium
+ `RUN npm install -g playwright`,
+ `RUN mkdir -p /ms-playwright`,
+ `RUN PLAYWRIGHT_BROWSERS_PATH=/ms-playwright python -m playwright install --with-deps chromium`,
+ ];
+
+ context.addLayer({
+ id: "playwright",
+ image: { instructions },
+ deploy: {
+ env: {
+ PLAYWRIGHT_BROWSERS_PATH: "/ms-playwright",
+ PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1",
+ PLAYWRIGHT_SKIP_BROWSER_VALIDATION: "1",
+ },
+ override: true,
+ },
+ });
+ },
+ };
+}
+```
+
+Learn more about the [trigger.config.ts](/config/config-file) file including setting default retry settings, customizing the build environment, and more.
+
+### Task code
+
+This task uses the `python.runScript` method to run the `crawl-url.py` script with the given URL as an argument. You can see the original task in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/trigger/pythonTasks.ts).
+
+```ts src/trigger/pythonTasks.ts
+import { logger, schemaTask, task } from "@trigger.dev/sdk/v3";
+import { python } from "@trigger.dev/python";
+import { z } from "zod";
+
+export const convertUrlToMarkdown = schemaTask({
+ id: "convert-url-to-markdown",
+ schema: z.object({
+ url: z.string().url(),
+ }),
+ run: async (payload) => {
+ const result = await python.runScript("./src/python/crawl-url.py", [payload.url]);
+
+ logger.debug("convert-url-to-markdown", {
+ url: payload.url,
+ result,
+ });
+
+ return result.stdout;
+ },
+});
+```
+
+### Add a requirements.txt file
+
+Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies.
+
+```txt requirements.txt
+crawl4ai
+playwright
+urllib3<2.0.0
+```
+
+### The Python script
+
+The Python script is a simple script using Crawl4AI that takes a URL and returns the markdown content of the page. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/python/crawl-url.py).
+
+```python src/python/crawl-url.py
+import asyncio
+import sys
+from crawl4ai import *
+
+async def main(url: str):
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url=url,
+ )
+ print(result.markdown)
+
+if __name__ == "__main__":
+ if len(sys.argv) < 2:
+        print("Usage: python crawl-url.py <url>")
+ sys.exit(1)
+ url = sys.argv[1]
+ asyncio.run(main(url))
+```
+
+## Testing your task
+
+1. Create a virtual environment `python -m venv venv`
+2. Activate the virtual environment, depending on your OS: On Mac/Linux: `source venv/bin/activate`, on Windows: `venv\Scripts\activate`
+3. Install the Python dependencies `pip install -r requirements.txt`
+4. If you haven't already, copy your project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file.
+5. Run the Trigger.dev dev CLI command with `npx trigger.dev@latest dev` (it may ask you to authorize the CLI if you haven't already).
+6. Test the task in the dashboard, using a URL of your choice.
+
+<ScrapingWarning />
+
+## Deploying your task
+
+Deploy the task to production using the CLI command `npx trigger.dev@latest deploy`
+
+<PythonLearnMore />
diff --git a/docs/snippets/python-learn-more.mdx b/docs/snippets/python-learn-more.mdx
new file mode 100644
index 0000000000..8277c4d2af
--- /dev/null
+++ b/docs/snippets/python-learn-more.mdx
@@ -0,0 +1,6 @@
+## Learn more about using Python with Trigger.dev
+
+<Card title="Python build extension" href="/config/extensions/pythonExtension">
+  Learn how to use our built-in Python build extension to install dependencies and run your Python
+  code.
+</Card>