diff --git a/__tests__/client/extraction.test.ts b/__tests__/client/extraction.test.ts index 7376eb5..cca7296 100644 --- a/__tests__/client/extraction.test.ts +++ b/__tests__/client/extraction.test.ts @@ -39,8 +39,8 @@ Deno.test('extract: fails due to invalid config', async () => { new ExtractionConfig({ body: html, content_type: 'text/html', - ephemeral_template: { source: 'html' }, - template: 'template', + extraction_ephemeral_template: { source: 'html' }, + extraction_template: 'template', }), ); }, diff --git a/__tests__/config/extraction.test.ts b/__tests__/config/extraction.test.ts index b6d2be5..42cb464 100644 --- a/__tests__/config/extraction.test.ts +++ b/__tests__/config/extraction.test.ts @@ -58,11 +58,11 @@ Deno.test('url param generation: sets charset', async () => { }); }); -Deno.test('url param generation: sets template', async () => { +Deno.test('url param generation: sets extraction_template', async () => { const config = new ExtractionConfig({ body: input_html, content_type: input_content_type, - template: 'my_template', + extraction_template: 'my_template', }); const params = config.toApiParams({ key: '1234' }); assertEquals(params, { @@ -72,11 +72,11 @@ Deno.test('url param generation: sets template', async () => { }); }); -Deno.test('url param generation: sets ephemeral_template', async () => { +Deno.test('url param generation: sets extraction_ephemeral_template', async () => { const config = new ExtractionConfig({ body: input_html, content_type: input_content_type, - ephemeral_template: { source: 'html', selectors: [] }, + extraction_ephemeral_template: { source: 'html', selectors: [] }, }); const params = config.toApiParams({ key: '1234' }); assertEquals(params, { diff --git a/__tests__/config/scrape.test.ts b/__tests__/config/scrape.test.ts index 6d959de..ab16a5a 100644 --- a/__tests__/config/scrape.test.ts +++ b/__tests__/config/scrape.test.ts @@ -3,6 +3,7 @@ import { HttpMethod } from '../../src/types.ts'; import { ScrapeConfigError } 
from '../../src/errors.ts'; import { assertEquals, assertThrows } from "https://deno.land/std@0.224.0/assert/mod.ts"; +const input_content_type = 'text/html'; Deno.test('scrapeconfig loads', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get' }); @@ -15,8 +16,6 @@ Deno.test('scrapeconfig throws on unknown options', () => { }, ScrapeConfigError, "Invalid option provided: foobar"); }); - - Deno.test('scrapeconfig allowed methods', () => { (['GET', 'POST', 'PUT', 'PATCH', 'HEAD'] as HttpMethod[]).forEach((method) => { const config = new ScrapeConfig({ @@ -360,6 +359,58 @@ Deno.test('url param generation: proxy_pool sets', () => { }); }); +Deno.test('url param generation: sets extraction_template', async () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + extraction_template: 'my_template', + }); + const params = config.toApiParams({ key: '1234' }); + assertEquals(params, { + key: '1234', + url: 'http://httpbin.dev/get', + extraction_template: 'my_template', + }); +}); + +Deno.test('url param generation: sets extraction_ephemeral_template', async () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + extraction_ephemeral_template: { source: 'html', selectors: [] }, + }); + const params = config.toApiParams({ key: '1234' }); + assertEquals(params, { + key: '1234', + url: 'http://httpbin.dev/get', + extraction_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0', + }); +}); + +Deno.test('url param generation: sets extraction_prompt', async () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', + extraction_prompt: 'summarize the document', + }); + const params = config.toApiParams({ key: '1234' }); + assertEquals(params, { + key: '1234', + url: 'http://httpbin.dev/get', + extraction_prompt: 'summarize the document', + }); +}); + +Deno.test('url param generation: sets extraction_model', async () => { + const config = new ScrapeConfig({ + url: 'http://httpbin.dev/get', 
+ extraction_model: 'review_list', + }); + const params = config.toApiParams({ key: '1234' }); + assertEquals(params, { + key: '1234', + url: 'http://httpbin.dev/get', + extraction_model: 'review_list', + }); +}); + Deno.test('url param generation: session sets', () => { const config = new ScrapeConfig({ url: 'http://httpbin.dev/get', diff --git a/examples/deno/deno_examples.ts b/examples/deno/deno_examples.ts index 66203e6..4118c97 100644 --- a/examples/deno/deno_examples.ts +++ b/examples/deno/deno_examples.ts @@ -81,6 +81,28 @@ export async function JSRender(apiKey: string) { console.log(scrape_result.result.browser_data); } +/* Use AI extraction capabilities with the web scraping API + * all Extraction API methods are supported, see below examples for more + */ +export async function scrapeExtraction(apiKey: string) { + const client = new ScrapflyClient({ key: apiKey}); + + let scrape_result = await client.scrape( + new ScrapeConfig({ + url: 'https://web-scraping.dev/product/1', + // enable browsers: + render_js: true, + // use LLM prompt for auto parsing + extraction_prompt: "Extract the product specification in json format", + }) + ); + + // access the extraction result + console.log("extraction result:"); + console.log(scrape_result.result.extracted_data); +} + + /* Scrapfly Extraction API offers LLM (Language Learning Model) based extraction * This example demonstrates how to use LLM query HTML files * https://scrapfly.io/docs/extraction-api/llm-prompt @@ -190,7 +212,7 @@ export async function extractionTemplates(apiKey: string){ body: html, content_type: "text/html", // provide template: - ephemeral_template: template, + extraction_ephemeral_template: template, }) ); console.log('product extract'); diff --git a/examples/node_commonjs/commonjs_examples.cjs b/examples/node_commonjs/commonjs_examples.cjs index 2fddb30..2b7422e 100644 --- a/examples/node_commonjs/commonjs_examples.cjs +++ b/examples/node_commonjs/commonjs_examples.cjs @@ -81,6 +81,27 @@
async function JSRender(apiKey) { console.log(scrape_result.result.browser_data); } +/* Use AI extraction capabilities with the web scraping API + * all Extraction API methods are supported, see below examples for more + */ +async function scrapeExtraction(apiKey) { + const client = new ScrapflyClient({ key: apiKey}); + + let scrape_result = await client.scrape( + new ScrapeConfig({ + url: 'https://web-scraping.dev/product/1', + // enable browsers: + render_js: true, + // use LLM prompt for auto parsing + extraction_prompt: "Extract the product specification in json format", + }) + ); + + // access the extraction result + console.log("extraction result:"); + console.log(scrape_result.result.extracted_data); +} + /* Scrapfly Extraction API offers LLM (Language Learning Model) based extraction * This example demonstrates how to use LLM query HTML files * https://scrapfly.io/docs/extraction-api/llm-prompt @@ -190,7 +211,7 @@ async function extractionTemplates(apiKey){ body: html, content_type: "text/html", // provide template: - ephemeral_template: template, + extraction_ephemeral_template: template, }) ); console.log('product extract'); @@ -233,6 +254,7 @@ module.exports = { getAccount, basicGet, JSRender, + scrapeExtraction, extractionLLM, extractionAutoExtract, extractionTemplates, diff --git a/src/extractionconfig.ts b/src/extractionconfig.ts index c2b1791..77776ee 100644 --- a/src/extractionconfig.ts +++ b/src/extractionconfig.ts @@ -1,5 +1,6 @@ import * as errors from './errors.ts'; import { urlsafe_b64encode } from './utils.ts'; +import { ExtractionConfigError } from './errors.ts'; export enum CompressionFormat { /** @@ -21,13 +22,17 @@ type ExtractionConfigOptions = { content_type: string; url?: string; charset?: string; - template?: string; // saved template name - ephemeral_template?: object; // ephemeraly declared json template + extraction_template?: string; // saved template name + extraction_ephemeral_template?: object; // ephemerally declared json
template extraction_prompt?: string; extraction_model?: string; is_document_compressed?: boolean; document_compression_format?: 'gzip' | 'zstd' | 'deflate' | CompressionFormat; webhook?: string; + + // deprecated options + template?: string; + ephemeral_template?: object; }; export class ExtractionConfig { @@ -35,16 +40,37 @@ export class ExtractionConfig { content_type: string; url?: string; charset?: string; - template?: string; // saved template name - ephemeral_template?: object; // ephemeraly declared json template + extraction_template?: string; // saved template name + extraction_ephemeral_template?: object; // ephemerally declared json template extraction_prompt?: string; extraction_model?: string; is_document_compressed?: boolean; document_compression_format?: 'gzip' | 'zstd' | 'deflate' | CompressionFormat; webhook?: string; + // deprecated options + template?: string; + ephemeral_template?: object; + constructor(options: ExtractionConfigOptions) { this.validateOptions(options); + if (options.template) { + console.warn( + `Deprecation warning: 'template' is deprecated. Use 'extraction_template' instead.` + ); + this.extraction_template = options.template; + } else { + this.extraction_template = options.extraction_template; + } + if (options.ephemeral_template) { + console.warn( + `Deprecation warning: 'ephemeral_template' is deprecated. Use 'extraction_ephemeral_template' instead.` + ); + this.extraction_ephemeral_template = options.ephemeral_template; + } else { + this.extraction_ephemeral_template = options.extraction_ephemeral_template; + } + if ( options.document_compression_format && !Object.values(CompressionFormat).includes(options.document_compression_format as CompressionFormat) @@ -57,8 +83,8 @@ export class ExtractionConfig { this.content_type = options.content_type; this.url = options.url ?? this.url; this.charset = options.charset ?? this.charset; - this.template = options.template ?? 
this.template; - this.ephemeral_template = options.ephemeral_template ?? this.ephemeral_template; + this.extraction_template = options.extraction_template ?? this.extraction_template; + this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template; this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt; this.extraction_model = options.extraction_model ?? this.extraction_model; this.is_document_compressed = options.is_document_compressed ?? this.is_document_compressed; @@ -90,18 +116,18 @@ export class ExtractionConfig { params.charset = this.charset; } - if (this.template && this.ephemeral_template) { - throw new errors.ExtractionConfigError( - 'You cannot pass both parameters template and ephemeral_template. You must choose', + if (this.extraction_template && this.extraction_ephemeral_template) { + throw new ExtractionConfigError( + 'You cannot pass both parameters extraction_template and extraction_ephemeral_template. 
You must choose', ); } - if (this.template) { - params.extraction_template = this.template; + if (this.extraction_template) { + params.extraction_template = this.extraction_template; } - if (this.ephemeral_template) { - params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.ephemeral_template)); + if (this.extraction_ephemeral_template) { + params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template)); } if (this.extraction_prompt) { diff --git a/src/scrapeconfig.ts b/src/scrapeconfig.ts index e3be1f8..a745a69 100644 --- a/src/scrapeconfig.ts +++ b/src/scrapeconfig.ts @@ -61,6 +61,10 @@ type ScrapeConfigOptions = { tags?: string[]; format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format; format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[]; + extraction_template?: string; // saved template name + extraction_ephemeral_template?: object; // ephemerally declared json template + extraction_prompt?: string; + extraction_model?: string; correlation_id?: string; cookies?: Rec; body?: string; @@ -104,6 +108,10 @@ export class ScrapeConfig { tags: Set = new Set(); format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format; format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[]; + extraction_template?: string; // saved template name + extraction_ephemeral_template?: object; // ephemerally declared json template + extraction_prompt?: string; + extraction_model?: string; correlation_id?: string; cookies?: Rec; body?: string; @@ -163,6 +171,10 @@ export class ScrapeConfig { this.tags = new Set(options.tags) ?? this.tags; this.format = options.format ?? this.format; this.format_options = options.format_options ?? this.format_options; + this.extraction_template = options.extraction_template ?? this.extraction_template; + this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? 
this.extraction_ephemeral_template; + this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt; + this.extraction_model = options.extraction_model ?? this.extraction_model; this.correlation_id = options.correlation_id ?? this.correlation_id; this.cookies = options.cookies ? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v])) @@ -338,6 +350,27 @@ export class ScrapeConfig { params.format += ':' + this.format_options.join(','); } } + if (this.extraction_template && this.extraction_ephemeral_template) { + throw new ScrapeConfigError( + 'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose', + ); + } + + if (this.extraction_template) { + params.extraction_template = this.extraction_template; + } + + if (this.extraction_ephemeral_template) { + params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template)); + } + + if (this.extraction_prompt) { + params.extraction_prompt = this.extraction_prompt; + } + + if (this.extraction_model) { + params.extraction_model = this.extraction_model; + } if (this.correlation_id) { params.correlation_id = this.correlation_id; } diff --git a/tsconfig.json b/tsconfig.json index f7eb880..f8e74d6 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -5,7 +5,7 @@ "types": ["node", "jest"], "skipLibCheck": true, "module": "ESNext", - "lib": ["ES2022", "ESNext"], + "lib": ["ES2022", "ESNext", "dom"], "moduleResolution": "Node", "rootDir": ".", "outDir": "build",