Skip to content

Commit b696965

Browse files
committed
added QOL plugins
1 parent 119354a commit b696965

File tree

3 files changed

+55
-17
lines changed

3 files changed

+55
-17
lines changed

.env_template

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,9 @@ MAIL_PASSWORD=<password>
1414
MAIL_TARGET=<example@unicorn.com>
1515
MAIL_SUBJECT=<subject>
1616

17-
# Openai api key
18-
OPENAI_API_KEY=<openai api key>
17+
# Openai api key (https://openai.com)
18+
OPENAI_API_KEY=<openai api key>
19+
20+
21+
# 2captcha api key (https://2captcha.com)
22+
TWO_CAPTCHA_API_KEY=<2captcha api key>

package.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,12 @@
2626
"node-cron": "^2.0.3",
2727
"nodemailer": "^6.4.11",
2828
"openai": "^3.2.1",
29-
"puppeteer": "^5.2.1"
29+
"puppeteer": "^5.2.1",
30+
"puppeteer-extra": "^3.3.6",
31+
"puppeteer-extra-plugin-adblocker": "^2.13.6",
32+
"puppeteer-extra-plugin-anonymize-ua": "^2.4.6",
33+
"puppeteer-extra-plugin-recaptcha": "^3.6.8",
34+
"puppeteer-extra-plugin-stealth": "^2.11.2"
3035
},
3136
"devDependencies": {
3237
"@types/dotenv": "^8.2.0",

src/scraper.ts

Lines changed: 43 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,41 @@
11
import { config } from 'dotenv';
22
import { createPool, Pool, PoolConfig } from 'mysql';
33
import { schedule } from 'node-cron';
4-
import Puppeteer, { launch, LaunchOptions } from 'puppeteer';
4+
import { LaunchOptions } from 'puppeteer';
5+
import Puppeteer from 'puppeteer-extra';
56

67

78
/** Only use .env files when running in dev mode */
9+
const isProduction = process.env.production?.toString() === 'true' || process.env.NODE_ENV === 'production';
810
if (!process.env.production) config();
911

10-
export const url = '';
12+
/** Additional Puppeteer options and plugins */
13+
const AnonymizeUAPlugin = require('puppeteer-extra-plugin-anonymize-ua'); // Add anonymize user agent plugin (changes user agent to a random one)
14+
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker'); // Add adblocker plugin to block all ads and trackers (saves bandwidth)
15+
const RecaptchaPlugin = require('puppeteer-extra-plugin-recaptcha'); // Add recaptcha plugin (solves recaptchas automagically)
16+
const StealthPlugin = require('puppeteer-extra-plugin-stealth'); // Add stealth plugin and use defaults (all tricks to hide puppeteer usage)
17+
18+
Puppeteer.use(RecaptchaPlugin({provider: {id: '2captcha',token: process.env.TWO_CAPTCHA_API_KEY }, visualFeedback: true }))
19+
Puppeteer.use(AnonymizeUAPlugin({ makeWindows: true, stripHeadless: true }))
20+
Puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
21+
Puppeteer.use(StealthPlugin());
22+
23+
/** Launch options */
24+
const launchOptions: LaunchOptions = {
25+
headless: isProduction, // Run headless in production mode
26+
args: [
27+
'--disable-gpu', '--disable-dev-shm-usage', '--disable-setuid-sandbox', '--no-sandbox',
28+
'--window-size=1920,1080', /* '--window-position=1920,0' */ // Activate this if you want to have the browser window on a second screen
29+
],
30+
ignoreHTTPSErrors: true,
31+
devtools: !isProduction, // Open devtools in development mode
32+
slowMo: 0, // Slow down puppeteer operations by X milliseconds (useful for debugging)
33+
timeout: 0 // Disable timeouts
34+
}
35+
36+
37+
/** Url to scrape */
38+
export const url = 'https://bot.sannysoft.com/'; // Example url to test the scraper's fingerprint
1139

1240
/**
1341
* @param pool - MySQL connection pool (could also be made global)
@@ -17,12 +45,7 @@ export const url = '';
1745
* example of how to do this.
1846
*/
1947
async function scrape(pool: Pool) {
20-
const browser = await Puppeteer.launch(<LaunchOptions>{
21-
headless: true,
22-
args: ['--no-sandbox', '--disable-gpu'],
23-
timeout: 0
24-
});
25-
48+
const browser = await Puppeteer.launch(launchOptions);
2649
const page = await browser.newPage();
2750
await page.goto(url);
2851

@@ -39,18 +62,20 @@ async function scrape(pool: Pool) {
3962
* debugging is hard.
4063
*/
4164
try {
42-
// Do stuff
65+
// Do stuff ...
66+
throw new Error('Error while scraping...');
4367
} catch (error) {
4468
if (error instanceof Error) {
4569
/** Save a screenshot if possible */
46-
try { await page.screenshot({ path: `log/err-${new Date().getTime()}.png` }) } catch (error) {}
70+
try { await page.screenshot({ path: `log/err-${new Date().getTime()}.png` }) } catch (error) { }
4771
console.error(error.message);
4872
}
4973
}
5074

5175
await browser.close();
5276
}
5377

78+
/** Create MySQL connection pool so we can reuse connections */
5479
const pool: Pool = createPool(<PoolConfig>{
5580
host: process.env.HOST,
5681
user: process.env.USER,
@@ -59,9 +84,13 @@ const pool: Pool = createPool(<PoolConfig>{
5984
port: process.env.PORT
6085
});
6186

62-
// Scrape every 15 minutes if production mode is enabled (https://crontab.guru is your best friend)
87+
/*
88+
* Scrape every 30 minutes if production mode is enabled or once
89+
* if not.
90+
* (https://crontab.guru is your best friend)
91+
*/
6392
const interval = process.env.production ? '*/30 * * * *' : '* * * * *';
64-
console.log(`Scraping every ${process.env.production ? '15 minutes' : 'minute'}.`);
93+
console.log(`Scraping ${process.env.production ? 'every 30 minutes' : 'once.'}.`);
6594

66-
if (!process.env.production) scrape(pool);
67-
schedule(interval, () => scrape(pool));
95+
if (!process.env.production) scrape(pool)
96+
else schedule(interval, () => scrape(pool));

0 commit comments

Comments
 (0)