Skip to content

add scrape extraction params and rename extraction template options #13

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions __tests__/client/extraction.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ Deno.test('extract: fails due to invalid config', async () => {
new ExtractionConfig({
body: html,
content_type: 'text/html',
ephemeral_template: { source: 'html' },
template: 'template',
extraction_ephemeral_template: { source: 'html' },
extraction_template: 'template',
}),
);
},
Expand Down
8 changes: 4 additions & 4 deletions __tests__/config/extraction.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ Deno.test('url param generation: sets charset', async () => {
});
});

Deno.test('url param generation: sets template', async () => {
Deno.test('url param generation: sets extraction_template', async () => {
const config = new ExtractionConfig({
body: input_html,
content_type: input_content_type,
template: 'my_template',
extraction_template: 'my_template',
});
const params = config.toApiParams({ key: '1234' });
assertEquals(params, {
Expand All @@ -72,11 +72,11 @@ Deno.test('url param generation: sets template', async () => {
});
});

Deno.test('url param generation: sets ephemeral_template', async () => {
Deno.test('url param generation: sets extraction_ephemeral_template', async () => {
const config = new ExtractionConfig({
body: input_html,
content_type: input_content_type,
ephemeral_template: { source: 'html', selectors: [] },
extraction_ephemeral_template: { source: 'html', selectors: [] },
});
const params = config.toApiParams({ key: '1234' });
assertEquals(params, {
Expand Down
55 changes: 53 additions & 2 deletions __tests__/config/scrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import { HttpMethod } from '../../src/types.ts';
import { ScrapeConfigError } from '../../src/errors.ts';
import { assertEquals, assertThrows } from "https://deno.land/std@0.224.0/assert/mod.ts";

const input_content_type = 'text/html';

Deno.test('scrapeconfig loads', () => {
const config = new ScrapeConfig({ url: 'http://httpbin.dev/get' });
Expand All @@ -15,8 +16,6 @@ Deno.test('scrapeconfig throws on unknown options', () => {
}, ScrapeConfigError, "Invalid option provided: foobar");
});



Deno.test('scrapeconfig allowed methods', () => {
(['GET', 'POST', 'PUT', 'PATCH', 'HEAD'] as HttpMethod[]).forEach((method) => {
const config = new ScrapeConfig({
Expand Down Expand Up @@ -360,6 +359,58 @@ Deno.test('url param generation: proxy_pool sets', () => {
});
});

// A saved extraction template name given to ScrapeConfig must be passed
// through unchanged as the `extraction_template` API parameter.
// The callback is synchronous because nothing in the test is awaited.
Deno.test('url param generation: sets extraction_template', () => {
  const config = new ScrapeConfig({
    url: 'http://httpbin.dev/get',
    extraction_template: 'my_template',
  });
  const params = config.toApiParams({ key: '1234' });
  assertEquals(params, {
    key: '1234',
    url: 'http://httpbin.dev/get',
    extraction_template: 'my_template',
  });
});

// An inline (ephemeral) template object given to ScrapeConfig must be emitted
// under the `extraction_template` API parameter as the literal prefix
// `ephemeral:` followed by the encoded template, as pinned below.
// The callback is synchronous because nothing in the test is awaited.
Deno.test('url param generation: sets extraction_ephemeral_template', () => {
  const config = new ScrapeConfig({
    url: 'http://httpbin.dev/get',
    extraction_ephemeral_template: { source: 'html', selectors: [] },
  });
  const params = config.toApiParams({ key: '1234' });
  assertEquals(params, {
    key: '1234',
    url: 'http://httpbin.dev/get',
    extraction_template: 'ephemeral:eyJzb3VyY2UiOiJodG1sIiwic2VsZWN0b3JzIjpbXX0',
  });
});

// An `extraction_prompt` given to ScrapeConfig must be passed through
// unchanged as the `extraction_prompt` API parameter.
// The callback is synchronous because nothing in the test is awaited.
Deno.test('url param generation: sets extraction_prompt', () => {
  const config = new ScrapeConfig({
    url: 'http://httpbin.dev/get',
    extraction_prompt: 'summarize the document',
  });
  const params = config.toApiParams({ key: '1234' });
  assertEquals(params, {
    key: '1234',
    url: 'http://httpbin.dev/get',
    extraction_prompt: 'summarize the document',
  });
});

// An `extraction_model` given to ScrapeConfig must be passed through
// unchanged as the `extraction_model` API parameter.
// The callback is synchronous because nothing in the test is awaited.
Deno.test('url param generation: sets extraction_model', () => {
  const config = new ScrapeConfig({
    url: 'http://httpbin.dev/get',
    extraction_model: 'review_list',
  });
  const params = config.toApiParams({ key: '1234' });
  assertEquals(params, {
    key: '1234',
    url: 'http://httpbin.dev/get',
    extraction_model: 'review_list',
  });
});

Deno.test('url param generation: session sets', () => {
const config = new ScrapeConfig({
url: 'http://httpbin.dev/get',
Expand Down
24 changes: 23 additions & 1 deletion examples/deno/deno_examples.ts
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,28 @@ export async function JSRender(apiKey: string) {
console.log(scrape_result.result.browser_data);
}

/* Use AI extraction capabilities with the web scraping API.
 * All Extraction API methods are supported; see the examples below for more.
 *
 * Scrapes a single product page with an `extraction_prompt` set and prints
 * the `extracted_data` field of the scrape result.
 */
export async function scrapeExtraction(apiKey: string): Promise<void> {
  const client = new ScrapflyClient({ key: apiKey });

  // the result is never reassigned, so bind it with const
  const scrape_result = await client.scrape(
    new ScrapeConfig({
      url: 'https://web-scraping.dev/product/1',
      // enable browsers:
      render_js: true,
      // use LLM prompt for auto parsing
      extraction_prompt: "Extract the product specification in json format",
    })
  );

  // access the extraction result
  console.log("extraction result:");
  console.log(scrape_result.result.extracted_data);
}


/* Scrapfly Extraction API offers LLM (Large Language Model) based extraction
* This example demonstrates how to use LLM query HTML files
* https://scrapfly.io/docs/extraction-api/llm-prompt
Expand Down Expand Up @@ -190,7 +212,7 @@ export async function extractionTemplates(apiKey: string){
body: html,
content_type: "text/html",
// provide template:
ephemeral_template: template,
extraction_ephemeral_template: template,
})
);
console.log('product extract');
Expand Down
24 changes: 23 additions & 1 deletion examples/node_commonjs/commonjs_examples.cjs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,27 @@ async function JSRender(apiKey) {
console.log(scrape_result.result.browser_data);
}

/* Use AI extraction capabilities with the web scraping API.
 * All Extraction API methods are supported; see the examples below for more.
 *
 * Scrapes a single product page with an `extraction_prompt` set and prints
 * the `extracted_data` field of the scrape result.
 */
async function scrapeExtraction(apiKey) {
  const client = new ScrapflyClient({ key: apiKey });

  // the result is never reassigned, so bind it with const
  const scrape_result = await client.scrape(
    new ScrapeConfig({
      url: 'https://web-scraping.dev/product/1',
      // enable browsers:
      render_js: true,
      // use LLM prompt for auto parsing
      extraction_prompt: "Extract the product specification in json format",
    })
  );

  // access the extraction result
  console.log("extraction result:");
  console.log(scrape_result.result.extracted_data);
}

/* Scrapfly Extraction API offers LLM (Language Learning Model) based extraction
* This example demonstrates how to use LLM query HTML files
* https://scrapfly.io/docs/extraction-api/llm-prompt
Expand Down Expand Up @@ -190,7 +211,7 @@ async function extractionTemplates(apiKey){
body: html,
content_type: "text/html",
// provide template:
ephemeral_template: template,
extraction_ephemeral_template: template,
})
);
console.log('product extract');
Expand Down Expand Up @@ -233,6 +254,7 @@ module.exports = {
getAccount,
basicGet,
JSRender,
scrapeExtraction,
extractionLLM,
extractionAutoExtract,
extractionTemplates,
Expand Down
52 changes: 39 additions & 13 deletions src/extractionconfig.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import * as errors from './errors.ts';
import { urlsafe_b64encode } from './utils.ts';
import { ExtractionConfigError } from './errors.ts';

export enum CompressionFormat {
/**
Expand All @@ -21,30 +22,55 @@ type ExtractionConfigOptions = {
content_type: string;
url?: string;
charset?: string;
template?: string; // saved template name
ephemeral_template?: object; // ephemeraly declared json template
extraction_template?: string; // saved template name
extraction_ephemeral_template?: object; // ephemerally declared json template
extraction_prompt?: string;
extraction_model?: string;
is_document_compressed?: boolean;
document_compression_format?: 'gzip' | 'zstd' | 'deflate' | CompressionFormat;
webhook?: string;

// deprecated options
template?: string;
ephemeral_template?: object;
};

export class ExtractionConfig {
body: string | Uint8Array;
content_type: string;
url?: string;
charset?: string;
template?: string; // saved template name
ephemeral_template?: object; // ephemeraly declared json template
extraction_template?: string; // saved template name
extraction_ephemeral_template?: object; // ephemerally declared json template
extraction_prompt?: string;
extraction_model?: string;
is_document_compressed?: boolean;
document_compression_format?: 'gzip' | 'zstd' | 'deflate' | CompressionFormat;
webhook?: string;

// deprecated options
template?: string;
ephemeral_template?: object;

constructor(options: ExtractionConfigOptions) {
this.validateOptions(options);
if (options.template) {
console.warn(
`Deprecation warning: 'template' is deprecated. Use 'extraction_template' instead.`
);
this.extraction_template = options.template;
} else {
this.extraction_template = options.extraction_template;
}
if (options.ephemeral_template) {
console.warn(
`Deprecation warning: 'ephemeral_template' is deprecated. Use 'extraction_ephemeral_template' instead.`
);
this.extraction_ephemeral_template = options.ephemeral_template;
} else {
this.extraction_ephemeral_template = options.extraction_ephemeral_template;
}

if (
options.document_compression_format &&
!Object.values(CompressionFormat).includes(options.document_compression_format as CompressionFormat)
Expand All @@ -57,8 +83,8 @@ export class ExtractionConfig {
this.content_type = options.content_type;
this.url = options.url ?? this.url;
this.charset = options.charset ?? this.charset;
this.template = options.template ?? this.template;
this.ephemeral_template = options.ephemeral_template ?? this.ephemeral_template;
this.extraction_template = options.extraction_template ?? this.extraction_template;
this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template;
this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt;
this.extraction_model = options.extraction_model ?? this.extraction_model;
this.is_document_compressed = options.is_document_compressed ?? this.is_document_compressed;
Expand Down Expand Up @@ -90,18 +116,18 @@ export class ExtractionConfig {
params.charset = this.charset;
}

if (this.template && this.ephemeral_template) {
throw new errors.ExtractionConfigError(
'You cannot pass both parameters template and ephemeral_template. You must choose',
if (this.extraction_template && this.extraction_ephemeral_template) {
throw new ExtractionConfigError(
'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose',
);
}

if (this.template) {
params.extraction_template = this.template;
if (this.extraction_template) {
params.extraction_template = this.extraction_template;
}

if (this.ephemeral_template) {
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.ephemeral_template));
if (this.extraction_ephemeral_template) {
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template));
}

if (this.extraction_prompt) {
Expand Down
33 changes: 33 additions & 0 deletions src/scrapeconfig.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ type ScrapeConfigOptions = {
tags?: string[];
format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format;
format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[];
extraction_template?: string; // saved template name
extraction_ephemeral_template?: object; // ephemerally declared json template
extraction_prompt?: string;
extraction_model?: string;
correlation_id?: string;
cookies?: Rec<string>;
body?: string;
Expand Down Expand Up @@ -104,6 +108,10 @@ export class ScrapeConfig {
tags: Set<string> = new Set<string>();
format?: 'json' | 'text' | 'markdown' | 'clean_html' | 'raw' | Format;
format_options?: ('no_links' | 'no_images' | 'only_content' | FormatOption)[];
extraction_template?: string; // saved template name
extraction_ephemeral_template?: object; // ephemerally declared json template
extraction_prompt?: string;
extraction_model?: string;
correlation_id?: string;
cookies?: Rec<string>;
body?: string;
Expand Down Expand Up @@ -163,6 +171,10 @@ export class ScrapeConfig {
this.tags = new Set(options.tags) ?? this.tags;
this.format = options.format ?? this.format;
this.format_options = options.format_options ?? this.format_options;
this.extraction_template = options.extraction_template ?? this.extraction_template;
this.extraction_ephemeral_template = options.extraction_ephemeral_template ?? this.extraction_ephemeral_template;
this.extraction_prompt = options.extraction_prompt ?? this.extraction_prompt;
this.extraction_model = options.extraction_model ?? this.extraction_model;
this.correlation_id = options.correlation_id ?? this.correlation_id;
this.cookies = options.cookies
? Object.fromEntries(Object.entries(options.cookies).map(([k, v]) => [k.toLowerCase(), v]))
Expand Down Expand Up @@ -338,6 +350,27 @@ export class ScrapeConfig {
params.format += ':' + this.format_options.join(',');
}
}
if (this.extraction_template && this.extraction_ephemeral_template) {
throw new ScrapeConfigError(
'You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose',
);
}

if (this.extraction_template) {
params.extraction_template = this.extraction_template;
}

if (this.extraction_ephemeral_template) {
params.extraction_template = 'ephemeral:' + urlsafe_b64encode(JSON.stringify(this.extraction_ephemeral_template));
}

if (this.extraction_prompt) {
params.extraction_prompt = this.extraction_prompt;
}

if (this.extraction_model) {
params.extraction_model = this.extraction_model;
}
if (this.correlation_id) {
params.correlation_id = this.correlation_id;
}
Expand Down
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"types": ["node", "jest"],
"skipLibCheck": true,
"module": "ESNext",
"lib": ["ES2022", "ESNext"],
"lib": ["ES2022", "ESNext", "dom"],
"moduleResolution": "Node",
"rootDir": ".",
"outDir": "build",
Expand Down
Loading