Skip to content

Commit 05bfb9d

Browse files
committed
improved docker deployment
1 parent ca54255 commit 05bfb9d

File tree

9 files changed

+113
-36
lines changed

9 files changed

+113
-36
lines changed
Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Settings for database
2-
HOST=<host of dev database>
3-
USER=<user for dev database>
4-
PASSWORD=<password for dev database>
5-
DATABASE=<database of dev database>
6-
PORT=<port of dev database>
2+
DB_HOST=<host of dev database>
3+
DB_USER=<user for dev database>
4+
DB_PASSWORD=<password for dev database>
5+
DB_DATABASE=<database of dev database>
6+
DB_PORT=<port of dev database>
77

88
# Settings for email notification
99
MAIL_HOST=<host>
@@ -17,6 +17,6 @@ MAIL_SUBJECT=<subject>
1717
# Openai api key (https://openai.com)
1818
OPENAI_API_KEY=<openai api key>
1919

20-
2120
# 2captcha api key (https://2captcha.com)
22-
TWO_CAPTCHA_API_KEY=<2captcha api key>
21+
TWO_CAPTCHA_API_KEY=<2captcha api key>
22+

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ typings/
7373

7474
# dotenv environment variables file
7575
.env
76+
database.env
77+
phpmyadmin.env
7678
.env.test
7779

7880
# parcel-bundler cache (https://parceljs.org/)

Dockerfile

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ RUN apt-get update \
99
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
1010
&& apt-get update \
1111
&& apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
12-
--no-install-recommends \
12+
--no-install-recommends \
1313
&& rm -rf /var/lib/apt/lists/*
1414

1515
# If running Docker >= 1.13.0 use docker run's --init arg to reap zombie processes, otherwise
@@ -21,25 +21,31 @@ RUN apt-get update \
2121
# Uncomment to skip the chromium download when installing puppeteer. If you do,
2222
# you'll need to launch puppeteer with:
2323
# browser.launch({executablePath: 'google-chrome-stable'})
24-
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
25-
26-
WORKDIR /usr/src/app
27-
COPY package*.json ./
28-
RUN npm install
29-
RUN npm ci --only=production
30-
COPY . .
24+
# ENV PUPPETEER_SKIP_DOWNLOAD true
3125

3226
# Install puppeteer so it's available in the container.
33-
# Add user so we don't need --no-sandbox.
34-
# same layer as npm install to keep re-chowned files from using up several hundred MBs more space
35-
# RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
27+
# RUN npm init -y && \
28+
# npm i puppeteer \
29+
# # Add user so we don't need --no-sandbox.
30+
# # same layer as npm install to keep re-chowned files from using up several hundred MBs more space
31+
# && groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
3632
# && mkdir -p /home/pptruser/Downloads \
3733
# && chown -R pptruser:pptruser /home/pptruser \
3834
# && chown -R pptruser:pptruser /node_modules \
3935
# && chown -R pptruser:pptruser /package.json \
4036
# && chown -R pptruser:pptruser /package-lock.json
4137

38+
WORKDIR /usr/src/app
39+
COPY package*.json ./
40+
RUN npm install
41+
RUN npm ci --only=production
42+
COPY . .
43+
4244
# Run everything after as non-privileged user.
4345
# USER pptruser
4446

45-
CMD [ "npm", "start" ]
47+
ENV NODE_ENV=production
48+
49+
RUN npm run compile
50+
51+
CMD [ "npm", "run", "start:prod" ]

README.md

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@ This advanced TypeScript Puppeteer web scraper template offers a comprehensive s
44

55
## Features
66

7-
- **Environment-Specific Configuration**: Leverages `.env` files for differentiating between development and production environments.
87
- **Puppeteer Plugins Integration**: Includes plugins like `puppeteer-extra-plugin-anonymize-ua`, `puppeteer-extra-plugin-adblocker`, `puppeteer-extra-plugin-recaptcha`, and `puppeteer-extra-plugin-stealth` for enhanced scraping capabilities.
98
- **Automated Scheduling**: Utilizes `node-cron` for scheduling scraping tasks, customizable for different intervals.
9+
- **Environment-Specific Configuration**: Leverages `.env` files for differentiating between development and production environments.
1010
- **MySQL Database Integration**: Features integration with MySQL using a connection pool for efficient data handling.
1111
- **Error Handling and Debugging**: Advanced error handling with screenshot capabilities for debugging, along with options to open devtools and slow down Puppeteer operations for detailed inspection.
12+
- **Automated Deployment**: Includes a docker-compose file for automated deployment of the scraper. This will automatically build the scraper, a MySQL database, and a phpMyAdmin instance for database management.
1213

1314
## Getting Started
1415

@@ -31,10 +32,10 @@ This advanced TypeScript Puppeteer web scraper template offers a comprehensive s
3132

3233
### Configuration
3334

34-
1. Create a `.env` file in the root directory.
35-
2. Add the necessary environment variables (as declared in the `.env.example` file) to the `.env` file or environment variables (recommended).
35+
1. Create the three env files `.env`, `database.env` and `phpmyadmin.env` in the root directory.
36+
2. Add the necessary environment variables (as declared in the `*.env.example` files) to the corresponding env files, or set them directly as environment variables.
3637

37-
### Usage
38+
### Local Usage
3839

3940
- Compile the scraper:
4041

@@ -52,6 +53,15 @@ This advanced TypeScript Puppeteer web scraper template offers a comprehensive s
5253
npm start
5354
```
5455

56+
### Docker Usage
57+
58+
- Build the scraper, MySQL database, and phpMyAdmin instance:
59+
60+
```sh
61+
docker-compose up
62+
```
63+
Make sure to add the necessary environment variables to the `database.env` and `phpmyadmin.env` files.
64+
5565
## TypeScript and Puppeteer Integration
5666

5767
- **TypeScript Support**: Fully supported with TypeScript for type safety and easier code management.

data/readme.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
This folder is mounted inside a volume, so it will be persisted even if the container is removed. This is useful for storing data
2+
that you want to keep between container restarts but don't want to store in the database.

database.env.template

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
MYSQL_ROOT_PASSWORD=
2+
MYSQL_USER=
3+
MYSQL_PASSWORD=
4+
MYSQL_DATABASE=
5+
DB_PORT=

docker-compose.yml

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
version: '2'
2+
3+
volumes:
4+
scraper-db-data:
5+
scraper-data:
6+
7+
services:
8+
scraper:
9+
container_name: scraper
10+
build: ./
11+
depends_on:
12+
- db
13+
env_file:
14+
- .env
15+
volumes:
16+
- scraper-data:/usr/src/app/data
17+
18+
phpmyadmin:
19+
container_name: phpmyadmin
20+
image: phpmyadmin/phpmyadmin
21+
depends_on:
22+
- db
23+
ports:
24+
- 9999:80
25+
expose:
26+
- 9999
27+
restart: always
28+
env_file:
29+
- database.env
30+
- phpmyadmin.env
31+
32+
db:
33+
container_name: scraper-db
34+
image: mysql:latest
35+
command:
36+
--default-authentication-plugin=mysql_native_password
37+
env_file:
38+
- database.env
39+
volumes:
40+
- scraper-db-data:/var/lib/mysql
41+
ports:
42+
- 7706:3306
43+
expose:
44+
- 7706

phpmyadmin.env.template

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
PMA_HOST=
2+
PMA_PORT=
3+
PMA_ARBITRARY=

src/scraper.ts

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
import { config } from 'dotenv';
22
import { createPool, Pool, PoolConfig } from 'mysql';
33
import { schedule } from 'node-cron';
4-
import { LaunchOptions } from 'puppeteer';
4+
import { PuppeteerLaunchOptions } from 'puppeteer';
55
import Puppeteer from 'puppeteer-extra';
66

7+
import { query } from './storage';
8+
79

810
/** Only use .env files when running in dev mode */
911
const isProduction = process.env.production?.toString() === 'true' || process.env.NODE_ENV === 'production';
@@ -21,13 +23,13 @@ Puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
2123
Puppeteer.use(StealthPlugin());
2224

2325
/** Launch options */
24-
const launchOptions: LaunchOptions = {
26+
const launchOptions: PuppeteerLaunchOptions = {
2527
headless: isProduction, // Run headless in production mode
2628
args: [
2729
'--disable-gpu', '--disable-dev-shm-usage', '--disable-setuid-sandbox', '--no-sandbox',
2830
'--window-size=1920,1080', /* '--window-position=1920,0' */ // Activate this if you want to have the browser window on a second screen
2931
],
30-
ignoreHTTPSErrors: true,
32+
ignoreHTTPSErrors: true, // Ignore HTTPS errors
3133
devtools: !isProduction, // Open devtools in development mode
3234
slowMo: 0, // Slow down puppeteer operations by X milliseconds (useful for debugging)
3335
timeout: 0 // Disable timeouts
@@ -67,7 +69,7 @@ async function scrape(pool: Pool) {
6769
} catch (error) {
6870
if (error instanceof Error) {
6971
/** Save a screenshot if possible */
70-
try { await page.screenshot({ path: `log/err-${new Date().getTime()}.png` }) } catch (error) { }
72+
try { await page.screenshot({ path: `data/err-${new Date().getTime()}.png` }) } catch (error) { }
7173
console.error(error.message);
7274
}
7375
}
@@ -77,20 +79,23 @@ async function scrape(pool: Pool) {
7779

7880
/** Create MySQL connection pool so we can reuse connections */
7981
const pool: Pool = createPool(<PoolConfig>{
80-
host: process.env.HOST,
81-
user: process.env.USER,
82-
password: process.env.PASSWORD,
83-
database: process.env.DATABASE,
84-
port: process.env.PORT
82+
host: process.env.DB_HOST,
83+
user: process.env.DB_USER,
84+
password: process.env.DB_PASSWORD,
85+
database: process.env.DB_DATABASE,
86+
port: process.env.DB_PORT
8587
});
8688

89+
/* Test connection */
90+
query('SHOW TABLES FROM data;', [], (e, r) => {console.log(e ? e : `You have the following tables: ${r[0]}`);}, pool);
91+
8792
/*
8893
* Scrape every 30 minutes if production mode is enabled or once
8994
* if not.
9095
* (https://crontab.guru is your best friend)
9196
*/
92-
const interval = process.env.production ? '*/30 * * * *' : '* * * * *';
93-
console.log(`Scraping ${process.env.production ? 'every 30 minutes' : 'once.'}.`);
97+
const interval = isProduction ? '*/30 * * * *' : '* * * * *';
98+
console.log(`Scraping ${isProduction ? 'every 30 minutes' : 'once'} in ${isProduction ? 'production' : 'dev'} mode.`);
9499

95-
if (!process.env.production) scrape(pool)
96-
else schedule(interval, () => scrape(pool));
100+
if (isProduction) schedule(interval, () => scrape(pool));
101+
else scrape(pool);

0 commit comments

Comments
 (0)