Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions browser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import path from 'path';

chromium.use(stealth());

/**
 * Launch a persistent Chromium context for the given profile directory.
 *
 * @param {string} userDataDir - Profile directory; resolved to an absolute path.
 * @param {boolean} headless - Passed through as the `headless` launch option.
 * @param {string|object} [userAgent] - A user-agent string, or any value whose
 *   `toString()` result should be used as the user agent.
 * @returns {Promise<*>} Whatever `chromium.launchPersistentContext` resolves to.
 */
export const createBrowser = async (userDataDir, headless, userAgent) => {
  // Strings are used as-is; anything else is stringified when it is able to.
  let uaString;
  if (typeof userAgent === 'string') {
    uaString = userAgent;
  } else {
    uaString = userAgent?.toString?.();
  }

  const launchArgs = [
    '--disable-features=IsolateOrigins,site-per-process',
    '--disable-blink-features=AutomationControlled',
    '--no-sandbox',
    '--disable-infobars',
    '--disable-extensions',
    '--start-maximized',
    '--window-size=1280,720',
  ];

  const profileDir = path.resolve(userDataDir);
  const contextOptions = {
    headless,
    channel: 'chromium',
    acceptDownloads: true,
    args: launchArgs,
    userAgent: uaString,
    viewport: { width: 1280, height: 720 },
    deviceScaleFactor: 1,
  };

  return chromium.launchPersistentContext(profileDir, contextOptions);
};
2 changes: 0 additions & 2 deletions download/.gitignore

This file was deleted.

125 changes: 72 additions & 53 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,29 +1,39 @@
import { chromium } from 'playwright-extra'
import stealth from 'puppeteer-extra-plugin-stealth'
import path from 'path'
import { moveFile } from 'move-file'
import fsP from 'node:fs/promises'
import fs from 'node:fs'
import { exiftool } from 'exiftool-vendored'
import ua from 'user-agents'

const userAgent = new ua({
platform: 'MacIntel', // 'Win32', 'Linux ...'
deviceCategory: 'desktop', // 'mobile', 'tablet'
});

chromium.use(stealth())

const timeoutValue = 300000
const userDataDir = './session'
const downloadPath = './download'

let headless = true

// accept --headless=false argument to run in headful mode
if (process.argv[2] === '--headless=false') {
headless = false
}
import { exiftool } from 'exiftool-vendored'
import ua from 'user-agents'
import { program, InvalidArgumentError } from 'commander';
import { createBrowser } from './browser.js';

const userAgent = new ua({
platform: 'MacIntel', // 'Win32', 'Linux ...'
deviceCategory: 'desktop', // 'mobile', 'tablet'
});

const timeoutValue = 300000
const userDataDir = './session'
const downloadPath = './download'

/**
 * Interpret a command-line option argument as a boolean.
 *
 * @param {*} value - Raw option argument; `undefined` counts as `true`.
 * @returns {boolean} `true` for true/1/yes/y, `false` for false/0/no/n
 *   (compared case-insensitively).
 * @throws {InvalidArgumentError} When the value matches none of the spellings.
 */
const parseBooleanOption = (value) => {
  if (value === undefined) {
    return true;
  }

  switch (String(value).toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'y':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'n':
      return false;
    default:
      throw new InvalidArgumentError('Expected a boolean value');
  }
};

/**
 * Parse the `--max-num-photos-in-session` argument as a base-10 integer.
 *
 * Without this check a non-numeric argument becomes NaN, and the later
 * `photosDownloadedInSession >= maxNumPhotosInSession` comparison is then
 * always false, so the session limit would be silently ignored.
 *
 * @param {string} value - Raw option argument from the command line.
 * @returns {number} The parsed integer.
 * @throws {InvalidArgumentError} When the value does not start with an integer.
 */
const parseMaxNumPhotosOption = (value) => {
  const parsed = Number.parseInt(value, 10);
  if (Number.isNaN(parsed)) {
    throw new InvalidArgumentError('Expected an integer value');
  }
  return parsed;
};

// Configure command-line arguments
program
  .option('--headless [boolean]', 'Run in headless mode', parseBooleanOption, true)
  .option('--ignore-errors', 'Ignore errors and continue', false)
  // -1 (the default) means "no per-session limit"; the download loop checks for it explicitly.
  .option('--max-num-photos-in-session <number>', 'Maximum number of photos to download per session', parseMaxNumPhotosOption, -1)
  .parse(process.argv);

const options = program.opts();
const headless = options.headless;
const ignoreErrors = options.ignoreErrors;
// Already a number: either the -1 default or the parser's validated result.
const maxNumPhotosInSession = options.maxNumPhotosInSession;

/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<undefined>} Resolves once the timer fires.
 */
const sleep = (ms) => new Promise((wake) => {
  setTimeout(wake, ms);
});

Expand Down Expand Up @@ -84,25 +94,9 @@ const getMonthAndYear = async (metadata, page) => {
const startLink = await getProgress()
console.log('Starting from:', new URL(startLink).href)

const browser = await chromium.launchPersistentContext(path.resolve(userDataDir), {
headless,
channel: 'chromium',
acceptDownloads: true,
args: [
'--disable-features=IsolateOrigins,site-per-process',
'--disable-blink-features=AutomationControlled',
'--no-sandbox', // May help in some environments
'--disable-infobars', // Prevent infobars
'--disable-extensions', // Disable extensions
'--start-maximized', // Start maximized
'--window-size=1280,720' // Set a specific window size
],
userAgent: userAgent.toString(),
viewport: { width: 1280, height: 720 },
deviceScaleFactor: 1,
})
let browser = await createBrowser(userDataDir, headless, userAgent)

const page = await browser.newPage()
let page = await browser.newPage()

await page.goto('https://photos.google.com')

Expand All @@ -117,6 +111,9 @@ const getMonthAndYear = async (metadata, page) => {
*/
await downloadPhoto(page, true)

let photosDownloaded = 0; // Counter for downloaded photos
let photosDownloadedInSession = 0;

while (true) {
const currentUrl = await page.url()

Expand All @@ -131,18 +128,40 @@ const getMonthAndYear = async (metadata, page) => {
Note: Both pressing the left-arrow key and clicking the left arrow directly with Playwright's click method were tried,
but neither worked. The click is therefore triggered from inside the page by calling click() on the element via injected JavaScript.
*/
await page.evaluate(() => document.getElementsByClassName('SxgK2b OQEhnd')[0].click())

// we wait until new photo is loaded
await page.waitForURL((url) => {
return url.host === 'photos.google.com' && url.href !== currentUrl
},
{
timeout: timeoutValue,
})

await downloadPhoto(page)
await saveProgress(page)
try {
await page.evaluate(() => document.getElementsByClassName('SxgK2b OQEhnd')[0].click())

// we wait until new photo is loaded
await page.waitForURL((url) => {
return url.host === 'photos.google.com' && url.href !== currentUrl
},
{
timeout: timeoutValue,
})

await downloadPhoto(page)
await saveProgress(page)

photosDownloaded++; // Increment the counter
photosDownloadedInSession++;

// Check if the maximum number of photos in session has been reached
if (maxNumPhotosInSession !== -1 && photosDownloadedInSession >= maxNumPhotosInSession) {
console.log(`Downloaded ${photosDownloadedInSession} photos in this session. Restarting browser...`);
await browser.close();
photosDownloadedInSession = 0;
browser = await createBrowser(userDataDir, headless, userAgent)
page = await browser.newPage();
await page.goto(currentUrl);
}
Comment on lines +148 to +156
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Session restart uses stale URL; resume from the current page instead.

currentUrl was captured before navigation. On restart you’ll revisit the old photo and risk duplicates.

-      if (maxNumPhotosInSession !== -1 && photosDownloadedInSession >= maxNumPhotosInSession) {
+      if (maxNumPhotosInSession !== -1 && photosDownloadedInSession >= maxNumPhotosInSession) {
         console.log(`Downloaded ${photosDownloadedInSession} photos in this session. Restarting browser...`);
-        await browser.close();
+        const resumeUrl = await page.url(); // capture current page before closing
+        await browser.close();
         photosDownloadedInSession = 0;
         browser = await createBrowser(userDataDir, headless, userAgent)
         page = await browser.newPage();
-        await page.goto(currentUrl);
+        await page.goto(resumeUrl);
       }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Check if the maximum number of photos in session has been reached
if (maxNumPhotosInSession !== -1 && photosDownloadedInSession >= maxNumPhotosInSession) {
console.log(`Downloaded ${photosDownloadedInSession} photos in this session. Restarting browser...`);
await browser.close();
photosDownloadedInSession = 0;
browser = await createBrowser(userDataDir, headless, userAgent)
page = await browser.newPage();
await page.goto(currentUrl);
}
// Check if the maximum number of photos in session has been reached
if (maxNumPhotosInSession !== -1 && photosDownloadedInSession >= maxNumPhotosInSession) {
console.log(`Downloaded ${photosDownloadedInSession} photos in this session. Restarting browser...`);
const resumeUrl = await page.url(); // capture current page before closing
await browser.close();
photosDownloadedInSession = 0;
browser = await createBrowser(userDataDir, headless, userAgent)
page = await browser.newPage();
await page.goto(resumeUrl);
}
🤖 Prompt for AI Agents
In index.js around lines 148-156, the restart logic uses the stale variable
currentUrl captured earlier, causing a revisit of the old photo; before closing
the browser capture the actual current page URL (e.g., const resumeUrl = await
page.url()), then after creating the new browser and page navigate to that
captured resumeUrl instead of currentUrl so scraping resumes where it left off.

} catch (error) {
console.error('An error occurred:', error);
await saveProgress(page);
if (!ignoreErrors) {
console.log('Aborting due to error. Use --ignore-errors to continue on errors.');
process.exit(1);
}
}
}
await browser.close()
await exiftool.end()
Expand Down
10 changes: 10 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"photo"
],
"dependencies": {
"commander": "^14.0.1",
"exiftool-vendored": "^22.0.0",
"move-file": "^3.1.0",
"pino": "^8.14.1",
Expand Down