1
1
import { config } from 'dotenv' ;
2
2
import { createPool , Pool , PoolConfig } from 'mysql' ;
3
3
import { schedule } from 'node-cron' ;
4
- import Puppeteer , { launch , LaunchOptions } from 'puppeteer' ;
4
+ import { LaunchOptions } from 'puppeteer' ;
5
+ import Puppeteer from 'puppeteer-extra' ;
5
6
6
7
7
8
/** Only use .env files when running in dev mode */
9
+ const isProduction = process . env . production ?. toString ( ) === 'true' || process . env . NODE_ENV === 'production' ;
8
10
if ( ! process . env . production ) config ( ) ;
9
11
10
- export const url = '' ;
12
+ /** Additional Puppeteer options and plugins */
13
+ const AnonymizeUAPlugin = require ( 'puppeteer-extra-plugin-anonymize-ua' ) ; // Add anonymize user agent plugin (changes user agent to a random one)
14
+ const AdblockerPlugin = require ( 'puppeteer-extra-plugin-adblocker' ) ; // Add adblocker plugin to block all ads and trackers (saves bandwidth)
15
+ const RecaptchaPlugin = require ( 'puppeteer-extra-plugin-recaptcha' ) ; // Add recaptcha plugin (solves recaptchas automagically)
16
+ const StealthPlugin = require ( 'puppeteer-extra-plugin-stealth' ) ; // Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
17
+
18
+ Puppeteer . use ( RecaptchaPlugin ( { provider : { id : '2captcha' , token : process . env . TWO_CAPTCHA_API_KEY } , visualFeedback : true } ) )
19
+ Puppeteer . use ( AnonymizeUAPlugin ( { makeWindows : true , stripHeadless : true } ) )
20
+ Puppeteer . use ( AdblockerPlugin ( { blockTrackers : true } ) ) ;
21
+ Puppeteer . use ( StealthPlugin ( ) ) ;
22
+
23
+ /** Launch options */
24
+ const launchOptions : LaunchOptions = {
25
+ headless : isProduction , // Run headless in production mode
26
+ args : [
27
+ '--disable-gpu' , '--disable-dev-shm-usage' , '--disable-setuid-sandbox' , '--no-sandbox' ,
28
+ '--window-size=1920,1080' , /* '--window-position=1920,0' */ // Activate this if you want to have the browser window on a second screen
29
+ ] ,
30
+ ignoreHTTPSErrors : true ,
31
+ devtools : ! isProduction , // Open devtools in development mode
32
+ slowMo : 0 , // Slow down puppeteer operations by X milliseconds (useful for debugging)
33
+ timeout : 0 // Disable timeouts
34
+ }
35
+
36
+
37
+ /** Url to scrape */
38
+ export const url = 'https://bot.sannysoft.com/' ; // Example url to test the scraper's fingerprint
11
39
12
40
/**
13
41
* @param pool - MySQL connection pool (could also be made global)
@@ -17,12 +45,7 @@ export const url = '';
17
45
* example of how to do this.
18
46
*/
19
47
async function scrape ( pool : Pool ) {
20
- const browser = await Puppeteer . launch ( < LaunchOptions > {
21
- headless : true ,
22
- args : [ '--no-sandbox' , '--disable-gpu' ] ,
23
- timeout : 0
24
- } ) ;
25
-
48
+ const browser = await Puppeteer . launch ( launchOptions ) ;
26
49
const page = await browser . newPage ( ) ;
27
50
await page . goto ( url ) ;
28
51
@@ -39,18 +62,20 @@ async function scrape(pool: Pool) {
39
62
* debugging is hard.
40
63
*/
41
64
try {
42
- // Do stuff
65
+ // Do stuff ...
66
+ throw new Error ( 'Error while scraping...' ) ;
43
67
} catch ( error ) {
44
68
if ( error instanceof Error ) {
45
69
/** Save a screenshot if possible */
46
- try { await page . screenshot ( { path : `log/err-${ new Date ( ) . getTime ( ) } .png` } ) } catch ( error ) { }
70
+ try { await page . screenshot ( { path : `log/err-${ new Date ( ) . getTime ( ) } .png` } ) } catch ( error ) { }
47
71
console . error ( error . message ) ;
48
72
}
49
73
}
50
74
51
75
await browser . close ( ) ;
52
76
}
53
77
78
+ /** Create MySQL connection pool so we can reuse connections */
54
79
const pool : Pool = createPool ( < PoolConfig > {
55
80
host : process . env . HOST ,
56
81
user : process . env . USER ,
@@ -59,9 +84,13 @@ const pool: Pool = createPool(<PoolConfig>{
59
84
port : process . env . PORT
60
85
} ) ;
61
86
62
- // Scrape every 15 minutes if production mode is enabled (https://crontab.guru is your best friend)
87
+ /*
88
+ * Scrape every 30 minutes if production mode is enabled or once
89
+ * if not.
90
+ * (https://crontab.guru is your best friend)
91
+ */
63
92
const interval = process . env . production ? '*/30 * * * *' : '* * * * *' ;
64
- console . log ( `Scraping every ${ process . env . production ? '15 minutes' : 'minute ' } .` ) ;
93
+ console . log ( `Scraping ${ process . env . production ? 'every 30 minutes' : 'once. ' } .` ) ;
65
94
66
- if ( ! process . env . production ) scrape ( pool ) ;
67
- schedule ( interval , ( ) => scrape ( pool ) ) ;
95
+ if ( ! process . env . production ) scrape ( pool )
96
+ else schedule ( interval , ( ) => scrape ( pool ) ) ;
0 commit comments