From a1ea09d6e2fe15ee36282b106eee08e589e580e7 Mon Sep 17 00:00:00 2001 From: D-K-P <8297864+D-K-P@users.noreply.github.com> Date: Tue, 11 Mar 2025 11:54:28 +0000 Subject: [PATCH 1/2] Added Python crawling (slithering?) example --- docs/docs.json | 62 ++------- docs/guides/examples/puppeteer.mdx | 2 +- docs/guides/introduction.mdx | 1 + docs/guides/python/python-crawl4ai.mdx | 177 +++++++++++++++++++++++++ docs/snippets/python-learn-more.mdx | 6 + 5 files changed, 198 insertions(+), 50 deletions(-) create mode 100644 docs/guides/python/python-crawl4ai.mdx create mode 100644 docs/snippets/python-learn-more.mdx diff --git a/docs/docs.json b/docs/docs.json index 6fd9790dc7..5c4c995e30 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -18,24 +18,14 @@ "groups": [ { "group": "Getting started", - "pages": [ - "introduction", - "quick-start", - "video-walkthrough", - "how-it-works", - "limits" - ] + "pages": ["introduction", "quick-start", "video-walkthrough", "how-it-works", "limits"] }, { "group": "Fundamentals", "pages": [ { "group": "Tasks", - "pages": [ - "tasks/overview", - "tasks/schemaTask", - "tasks/scheduled" - ] + "pages": ["tasks/overview", "tasks/schemaTask", "tasks/scheduled"] }, "triggering", "runs", @@ -50,13 +40,7 @@ "errors-retrying", { "group": "Wait", - "pages": [ - "wait", - "wait-for", - "wait-until", - "wait-for-event", - "wait-for-request" - ] + "pages": ["wait", "wait-for", "wait-until", "wait-for-event", "wait-for-request"] }, "queue-concurrency", "versioning", @@ -100,9 +84,7 @@ }, { "group": "Development", - "pages": [ - "cli-dev" - ] + "pages": ["cli-dev"] }, { "group": "Deployment", @@ -113,9 +95,7 @@ "deployment/atomic-deployment", { "group": "Deployment integrations", - "pages": [ - "vercel-integration" - ] + "pages": ["vercel-integration"] } ] }, @@ -166,12 +146,7 @@ }, { "group": "Using the Dashboard", - "pages": [ - "run-tests", - "troubleshooting-alerts", - "replaying", - "bulk-actions" - ] + "pages": ["run-tests", 
"troubleshooting-alerts", "replaying", "bulk-actions"] }, { "group": "Troubleshooting", @@ -197,11 +172,7 @@ }, { "group": "Help", - "pages": [ - "community", - "help-slack", - "help-email" - ] + "pages": ["community", "help-slack", "help-email"] } ] }, @@ -222,10 +193,7 @@ }, { "group": "Tasks API", - "pages": [ - "management/tasks/trigger", - "management/tasks/batch-trigger" - ] + "pages": ["management/tasks/trigger", "management/tasks/batch-trigger"] }, { "group": "Runs API", @@ -271,9 +239,7 @@ "groups": [ { "group": "Introduction", - "pages": [ - "guides/introduction" - ] + "pages": ["guides/introduction"] }, { "group": "Frameworks", @@ -340,7 +306,8 @@ "guides/example-projects/claude-thinking-chatbot", "guides/example-projects/realtime-fal-ai", "guides/example-projects/realtime-csv-importer", - "guides/example-projects/vercel-ai-sdk-image-generator" + "guides/example-projects/vercel-ai-sdk-image-generator", + "guides/python/python-crawl4ai" ] }, { @@ -386,10 +353,7 @@ "href": "https://trigger.dev" }, "api": { - "openapi": [ - "openapi.yml", - "v3-openapi.yaml" - ], + "openapi": ["openapi.yml", "v3-openapi.yaml"], "playground": { "display": "simple" } @@ -564,4 +528,4 @@ "destination": "/management/overview" } ] -} \ No newline at end of file +} diff --git a/docs/guides/examples/puppeteer.mdx b/docs/guides/examples/puppeteer.mdx index 63e802aa02..7bd59db283 100644 --- a/docs/guides/examples/puppeteer.mdx +++ b/docs/guides/examples/puppeteer.mdx @@ -205,7 +205,7 @@ There's no payload required for this task so you can just click "Run test" from ## Proxying -If you're using Trigger.dev Cloud and Puppeteer or any other tool to scrape content from websites you don't own, you'll need to proxy your requests. **If you don't you'll risk getting our IP address blocked and we will ban you from our service.** +If you're using Trigger.dev Cloud and Puppeteer or any other tool to scrape content from websites you don't own, you'll need to proxy your requests. 
**If you don't you'll risk getting our IP address blocked and we will ban you from our service. You must always have permission from the website owner to scrape their content.** Here are a list of proxy services we recommend: diff --git a/docs/guides/introduction.mdx b/docs/guides/introduction.mdx index eebc14b921..f7e34837b7 100644 --- a/docs/guides/introduction.mdx +++ b/docs/guides/introduction.mdx @@ -45,6 +45,7 @@ Example projects are full projects with example repos you can fork and use. Thes | [Realtime Fal.ai image generation](/guides/example-projects/realtime-fal-ai) | Generate an image from a prompt using Fal.ai and show the progress of the task on the frontend using Realtime. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-fal-ai-image-generation) | | [Realtime CSV Importer](/guides/example-projects/realtime-csv-importer) | Upload a CSV file and see the progress of the task streamed to the frontend. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-csv-importer) | | [Vercel AI SDK image generator](/guides/example-projects/vercel-ai-sdk-image-generator) | Use the Vercel AI SDK to generate images from a prompt. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/vercel-ai-sdk-image-generator) | +| [Python web crawler](/guides/example-projects/python-web-crawler) | Use Python, Crawl4AI and Playwright to create a headless web crawler with Trigger.dev. 
| — | [View the repo](https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai) | ## Example tasks diff --git a/docs/guides/python/python-crawl4ai.mdx b/docs/guides/python/python-crawl4ai.mdx new file mode 100644 index 0000000000..b233506177 --- /dev/null +++ b/docs/guides/python/python-crawl4ai.mdx @@ -0,0 +1,177 @@ +--- +title: "Python headless browser web crawler example" +sidebarTitle: "Python headless web crawler" +description: "Learn how to use Python, Crawl4AI and Playwright to create a headless browser web crawler with Trigger.dev." +--- + +import ScrapingWarning from "/snippets/web-scraping-warning.mdx"; +import PythonLearnMore from "/snippets/python-learn-more.mdx"; + +## Prerequisites + +- A project with [Trigger.dev initialized](/quick-start) +- [Python](https://www.python.org/) installed on your local machine + +## Overview + +This demo showcases how to use Trigger.dev with Python to build a web crawler that uses a headless browser to navigate websites and extract content. + +## Features + +- [Trigger.dev](https://trigger.dev) for background task orchestration +- Our [Python build extension](/config/extensions/pythonExtension) to install the dependencies and run the Python script +- [Crawl4AI](https://github.com/unclecode/crawl4ai), an open source LLM friendly web crawler +- A custom [Playwright extension](https://playwright.dev/) to create a headless chromium browser + + + +## GitHub repo + + + Click here to view the full code for this project in our examples repository on GitHub. You can + fork it and use it as a starting point for your own project. 
+ + +## The code + +### Build configuration + +After you've initialized your project with Trigger.dev, add these build settings to your `trigger.config.ts` file: + +```ts trigger.config.ts +import { defineConfig } from "@trigger.dev/sdk/v3"; +import { pythonExtension } from "@trigger.dev/python/extension"; +import type { BuildContext, BuildExtension } from "@trigger.dev/core/v3/build"; + +export default defineConfig({ + project: "", + // Your other config settings... + build: { + extensions: [ + // This is required to use the Python extension + pythonExtension(), + // This is required to create a headless chromium browser with Playwright + installPlaywrightChromium(), + ], + }, +}); + +// This is a custom build extension to install Playwright and Chromium +export function installPlaywrightChromium(): BuildExtension { + return { + name: "InstallPlaywrightChromium", + onBuildComplete(context: BuildContext) { + const instructions = [ + // Base and Chromium dependencies + `RUN apt-get update && apt-get install -y --no-install-recommends \ + curl unzip npm libnspr4 libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 \ + libasound2 libnss3 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 \ + libgbm1 libxkbcommon0 \ + && apt-get clean && rm -rf /var/lib/apt/lists/*`, + + // Install Playwright and Chromium + `RUN npm install -g playwright`, + `RUN mkdir -p /ms-playwright`, + `RUN PLAYWRIGHT_BROWSERS_PATH=/ms-playwright python -m playwright install --with-deps chromium`, + ]; + + context.addLayer({ + id: "playwright", + image: { instructions }, + deploy: { + env: { + PLAYWRIGHT_BROWSERS_PATH: "/ms-playwright", + PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD: "1", + PLAYWRIGHT_SKIP_BROWSER_VALIDATION: "1", + }, + override: true, + }, + }); + }, + }; +} +``` + +Learn more about the [trigger.config.ts](/config/config-file) file including setting default retry settings, customizing the build environment, and more. 
+ +### Task code + +This task uses the `python.runScript` method to run the `crawl-url.py` script with the given URL as an argument. You can see the original task in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/trigger/pythonTasks.ts). + +```ts src/trigger/pythonTasks.ts +import { logger, schemaTask, task } from "@trigger.dev/sdk/v3"; +import { python } from "@trigger.dev/python"; +import { z } from "zod"; + +export const convertUrlToMarkdown = schemaTask({ + id: "convert-url-to-markdown", + schema: z.object({ + url: z.string().url(), + }), + run: async (payload) => { + const result = await python.runScript("./src/python/crawl-url.py", [payload.url]); + + logger.debug("convert-url-to-markdown", { + url: payload.url, + result, + }); + + return result.stdout; + }, +}); +``` + +### Add a requirements.txt file + +Add the following to your `requirements.txt` file. This is required in Python projects to install the dependencies. + +```txt requirements.txt +crawl4ai +playwright +urllib3<2.0.0 +``` + +### The Python script + +The Python script is a simple script using Crawl4AI that takes a URL and returns the markdown content of the page. You can see the original script in our examples repository [here](https://github.com/triggerdotdev/examples/blob/main/python-crawl4ai/src/python/crawl-url.py). + +```python src/python/crawl-url.py +import asyncio +import sys +from crawl4ai import * + +async def main(url: str): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url=url, + ) + print(result.markdown) + +if __name__ == "__main__": + if len(sys.argv) < 2: + print("Usage: python crawl-url.py ") + sys.exit(1) + url = sys.argv[1] + asyncio.run(main(url)) +``` + +## Testing your task + +1. Create a virtual environment `python -m venv venv` +2. Activate the virtual environment, depending on your OS: On Mac/Linux: `source venv/bin/activate`, on Windows: `venv\Scripts\activate` +3. 
Install the Python dependencies `pip install -r requirements.txt` +4. If you haven't already, copy your project ref from your [Trigger.dev dashboard](https://cloud.trigger.dev) and add it to the `trigger.config.ts` file. +5. Run the Trigger.dev dev CLI command with `npx trigger.dev@latest dev` (it may ask you to authorize the CLI if you haven't already). +6. Test the task in the dashboard, using a URL of your choice. + + + +## Deploying your task + +Deploy the task to production using the CLI command `npx trigger.dev@latest deploy` + + diff --git a/docs/snippets/python-learn-more.mdx b/docs/snippets/python-learn-more.mdx new file mode 100644 index 0000000000..8277c4d2af --- /dev/null +++ b/docs/snippets/python-learn-more.mdx @@ -0,0 +1,6 @@ +## Learn more about using Python with Trigger.dev + + + Learn how to use our built-in Python build extension to install dependencies and run your Python + code. + From c3ef5beb9168546d4082b78bfb88aeeeee40b169 Mon Sep 17 00:00:00 2001 From: D-K-P <8297864+D-K-P@users.noreply.github.com> Date: Tue, 11 Mar 2025 14:13:14 +0000 Subject: [PATCH 2/2] Fixed link --- docs/guides/introduction.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/guides/introduction.mdx b/docs/guides/introduction.mdx index f7e34837b7..0a72541a8f 100644 --- a/docs/guides/introduction.mdx +++ b/docs/guides/introduction.mdx @@ -45,7 +45,7 @@ Example projects are full projects with example repos you can fork and use. Thes | [Realtime Fal.ai image generation](/guides/example-projects/realtime-fal-ai) | Generate an image from a prompt using Fal.ai and show the progress of the task on the frontend using Realtime. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-fal-ai-image-generation) | | [Realtime CSV Importer](/guides/example-projects/realtime-csv-importer) | Upload a CSV file and see the progress of the task streamed to the frontend. 
| Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/realtime-csv-importer) | | [Vercel AI SDK image generator](/guides/example-projects/vercel-ai-sdk-image-generator) | Use the Vercel AI SDK to generate images from a prompt. | Next.js | [View the repo](https://github.com/triggerdotdev/examples/tree/main/vercel-ai-sdk-image-generator) | -| [Python web crawler](/guides/example-projects/python-web-crawler) | Use Python, Crawl4AI and Playwright to create a headless web crawler with Trigger.dev. | — | [View the repo](https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai) | +| [Python web crawler](/guides/python/python-crawl4ai) | Use Python, Crawl4AI and Playwright to create a headless web crawler with Trigger.dev. | — | [View the repo](https://github.com/triggerdotdev/examples/tree/main/python-crawl4ai) | ## Example tasks