@@ -24,37 +24,41 @@ export const scrape = (maxCharsPerElem: number) =>
24
24
/**
 * Loads `url` via `loadPage` and converts what was loaded into a markdown tree.
 *
 * Responses whose `content-type` header mentions plain text, markdown, JSON,
 * XML or CSV are wrapped as a single `p` element and converted directly;
 * everything else is parsed in the page with `spatialParser`.
 *
 * The page is always closed before this function settles, whether it returns
 * or throws.
 *
 * @param url - Address passed to `loadPage`.
 * @param maxCharsPerElem - Forwarded unchanged to `htmlToMarkdownTree`.
 * @returns An object with the page `title` and its `markdownTree`; for parsed
 *   pages it also carries the remaining fields produced by `spatialParser`.
 * @throws Error "Failed to load page" when `loadPage` yields no response.
 * @throws Error "Parsing failed" (original error attached as `cause`) when the
 *   in-page parse, its timeout, or the markdown conversion fails.
 */
export async function scrapeUrl(url: string, maxCharsPerElem: number) {
  const { res, page } = await loadPage(url);

  try {
    if (!res) throw Error("Failed to load page");

    // Check if it's a non-html content type that we can handle directly
    // TODO: direct mappings to markdown can be added for markdown, csv and others
    const contentType = res.headers()["content-type"] ?? "";
    if (
      contentType.includes("text/plain") ||
      contentType.includes("text/markdown") ||
      contentType.includes("application/json") ||
      contentType.includes("application/xml") ||
      contentType.includes("text/csv")
    ) {
      const title = await page.title();
      const content = await page.content();
      return {
        title,
        markdownTree: htmlToMarkdownTree(
          title,
          [{ tagName: "p", attributes: {}, content: [content] }],
          maxCharsPerElem
        ),
      };
    }

    try {
      // The second argument bounds how long the in-page parse may run
      // (presumably milliseconds — confirm against `timeout`'s definition).
      const { elements, ...parsed } = await timeout(
        page.evaluate(spatialParser),
        2000
      );
      return {
        ...parsed,
        markdownTree: htmlToMarkdownTree(parsed.title, elements, maxCharsPerElem),
      };
    } catch (cause) {
      throw Error("Parsing failed", { cause });
    }
  } finally {
    // Await the close so the page is released before callers continue and a
    // failing close surfaces here instead of as an unhandled rejection.
    await page.close();
  }
}
0 commit comments