-
Notifications
You must be signed in to change notification settings - Fork 1.1k
feat: better IFrame and shadowDom elements scraping #409
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
414ff5c
cdb550c
f0f4141
e254624
22a3dc3
58a893c
8b82d33
83afef7
1a8cfbc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -762,6 +762,22 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { | |
One, | ||
} | ||
|
||
// One shadow-DOM boundary crossed while walking from the target element up to the top document. | ||
// Recorded by getBoundaryPath when an element's getRootNode() is a ShadowRoot. | ||
type ShadowBoundary = { | ||
// Discriminant used to tell shadow boundaries apart from iframe boundaries. | ||
type: 'shadow'; | ||
// The element that hosts the shadow root (rootNode.host). | ||
host: HTMLElement; | ||
// The shadow root that contains `element`. | ||
root: ShadowRoot; | ||
// The element that was being inspected when this boundary was recorded. | ||
element: HTMLElement; | ||
}; | ||
|
||
// One iframe boundary crossed while walking from the target element up to the top document. | ||
// Recorded by getBoundaryPath when the element's owner document is hosted by a frame element. | ||
type IframeBoundary = { | ||
// Discriminant used to tell iframe boundaries apart from shadow boundaries. | ||
type: 'iframe'; | ||
// The frame element hosting the document (ownerDocument.defaultView.frameElement). | ||
frame: HTMLIFrameElement; | ||
// The owner document of `element`, i.e. the document rendered inside `frame`. | ||
document: Document; | ||
// The element that was being inspected when this boundary was recorded. | ||
element: HTMLElement; | ||
}; | ||
|
||
// Either kind of boundary; consumers branch on the `type` field ('shadow' or 'iframe'). | ||
type Boundary = ShadowBoundary | IframeBoundary; | ||
|
||
type Options = { | ||
root: Element; | ||
idName: (name: string) => boolean; | ||
|
@@ -1340,64 +1356,43 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { | |
} | ||
}; | ||
|
||
// Helper function to generate selectors for shadow DOM elements | ||
const genSelectorForShadowDOM = (element: HTMLElement) => { | ||
// Get complete path up to document root | ||
const getShadowPath = (el: HTMLElement) => { | ||
const path = []; | ||
let current = el; | ||
let depth = 0; | ||
const MAX_DEPTH = 4; | ||
|
||
while (current && depth < MAX_DEPTH) { | ||
const rootNode = current.getRootNode(); | ||
if (rootNode instanceof ShadowRoot) { | ||
path.unshift({ | ||
host: rootNode.host as HTMLElement, | ||
root: rootNode, | ||
element: current | ||
}); | ||
current = rootNode.host as HTMLElement; | ||
depth++; | ||
} else { | ||
break; | ||
} | ||
const getBoundaryPath = (element: HTMLElement): Boundary[] => { | ||
const path: Boundary[] = []; | ||
let current = element; | ||
let depth = 0; | ||
const MAX_DEPTH = 4; | ||
|
||
while (current && depth < MAX_DEPTH) { | ||
const rootNode = current.getRootNode(); | ||
if (rootNode instanceof ShadowRoot) { | ||
path.unshift({ | ||
type: 'shadow', | ||
host: rootNode.host as HTMLElement, | ||
root: rootNode, | ||
element: current | ||
}); | ||
current = rootNode.host as HTMLElement; | ||
depth++; | ||
continue; | ||
} | ||
return path; | ||
}; | ||
|
||
const shadowPath = getShadowPath(element); | ||
if (shadowPath.length === 0) return null; | ||
|
||
try { | ||
const selectorParts: string[] = []; | ||
|
||
// Generate selector for each shadow DOM boundary | ||
shadowPath.forEach((context, index) => { | ||
// Get selector for the host element | ||
const hostSelector = finder(context.host, { | ||
root: index === 0 ? document.body : (shadowPath[index - 1].root as unknown as Element) | ||
|
||
const ownerDocument = current.ownerDocument; | ||
const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; | ||
if (frameElement) { | ||
path.unshift({ | ||
type: 'iframe', | ||
frame: frameElement, | ||
document: ownerDocument, | ||
element: current | ||
}); | ||
|
||
// For the last context, get selector for target element | ||
if (index === shadowPath.length - 1) { | ||
const elementSelector = finder(element, { | ||
root: context.root as unknown as Element | ||
}); | ||
selectorParts.push(`${hostSelector} >> ${elementSelector}`); | ||
} else { | ||
selectorParts.push(hostSelector); | ||
} | ||
}); | ||
|
||
return { | ||
fullSelector: selectorParts.join(' >> '), | ||
mode: shadowPath[shadowPath.length - 1].root.mode | ||
}; | ||
} catch (e) { | ||
console.warn('Error generating shadow DOM selector:', e); | ||
return null; | ||
current = frameElement; | ||
depth++; | ||
continue; | ||
} | ||
|
||
break; | ||
} | ||
return path; | ||
}; | ||
|
||
const genSelectors = (element: HTMLElement | null) => { | ||
|
@@ -1406,55 +1401,107 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { | |
} | ||
|
||
const href = element.getAttribute('href'); | ||
const boundaryPath = getBoundaryPath(element); | ||
|
||
const getRootElement = (index: number): Element => { | ||
if (index === 0) { | ||
return document.body; | ||
} | ||
|
||
const previousBoundary = boundaryPath[index - 1]; | ||
if (!previousBoundary) { | ||
return document.body; | ||
} | ||
|
||
if (previousBoundary.type === 'shadow') { | ||
return previousBoundary.root as unknown as Element; | ||
} | ||
return previousBoundary.document.body as Element; | ||
}; | ||
|
||
const generateBoundaryAwareSelector = (elementOptions = {}) => { | ||
if (boundaryPath.length === 0) { | ||
return finder(element, elementOptions); | ||
} | ||
|
||
const selectorParts: string[] = []; | ||
|
||
boundaryPath.forEach((context, index) => { | ||
const root = getRootElement(index); | ||
|
||
if (context.type === 'shadow') { | ||
const hostSelector = finder(context.host, { root }); | ||
|
||
if (index === boundaryPath.length - 1) { | ||
const elementSelector = finder(element, { | ||
...elementOptions, | ||
root: context.root as unknown as Element | ||
}); | ||
selectorParts.push(`${hostSelector} >> ${elementSelector}`); | ||
} else { | ||
selectorParts.push(hostSelector); | ||
} | ||
} else { | ||
const frameSelector = finder(context.frame, { root }); | ||
|
||
if (index === boundaryPath.length - 1) { | ||
const elementSelector = finder(element, { | ||
...elementOptions, | ||
root: context.document.body as Element | ||
}); | ||
selectorParts.push(`${frameSelector} :>> ${elementSelector}`); | ||
} else { | ||
selectorParts.push(frameSelector); | ||
} | ||
} | ||
}); | ||
|
||
const lastBoundary = boundaryPath[boundaryPath.length - 1]; | ||
const delimiter = lastBoundary.type === 'shadow' ? ' >> ' : ' :>> '; | ||
return selectorParts.join(delimiter); | ||
}; | ||
|
||
let generalSelector = null; | ||
try { | ||
generalSelector = finder(element); | ||
generalSelector = generateBoundaryAwareSelector(); | ||
} catch (e) { | ||
} | ||
|
||
let attrSelector = null; | ||
try { | ||
attrSelector = finder(element, { attr: () => true }); | ||
attrSelector = generateBoundaryAwareSelector({ attr: () => true }); | ||
} catch (e) { | ||
} | ||
|
||
// const iframeSelector = genSelectorForIframe(element); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Remove commented-out code. The code contains commented-out sections related to the old iframe selector implementation. Since this functionality has been replaced with the new boundary-aware selector, these commented-out lines should be removed to maintain code cleanliness. -// const iframeSelector = genSelectorForIframe(element);
-// iframeSelector: iframeSelector ? {
-// full: iframeSelector.fullSelector,
-// isIframe: iframeSelector.isFrameContent,
-// } : null, Also applies to: 1519-1522 |
||
|
||
const iframeSelector = genSelectorForIframe(element); | ||
const shadowSelector = genSelectorForShadowDOM(element); | ||
|
||
const hrefSelector = genSelectorForAttributes(element, ['href']); | ||
const formSelector = genSelectorForAttributes(element, [ | ||
'name', | ||
'placeholder', | ||
'for', | ||
]); | ||
const accessibilitySelector = genSelectorForAttributes(element, [ | ||
'aria-label', | ||
'alt', | ||
'title', | ||
]); | ||
|
||
const testIdSelector = genSelectorForAttributes(element, [ | ||
'data-testid', | ||
'data-test-id', | ||
'data-testing', | ||
'data-test', | ||
'data-qa', | ||
'data-cy', | ||
]); | ||
const hrefSelector = generateBoundaryAwareSelector({ | ||
attr: genValidAttributeFilter(element, ['href']) | ||
}); | ||
const formSelector = generateBoundaryAwareSelector({ | ||
attr: genValidAttributeFilter(element, ['name', 'placeholder', 'for']) | ||
}); | ||
const accessibilitySelector = generateBoundaryAwareSelector({ | ||
attr: genValidAttributeFilter(element, ['aria-label', 'alt', 'title']) | ||
}); | ||
|
||
const testIdSelector = generateBoundaryAwareSelector({ | ||
attr: genValidAttributeFilter(element, [ | ||
'data-testid', | ||
'data-test-id', | ||
'data-testing', | ||
'data-test', | ||
'data-qa', | ||
'data-cy', | ||
]) | ||
}); | ||
|
||
// We won't use an id selector if the id is invalid (starts with a number) | ||
let idSelector = null; | ||
try { | ||
idSelector = | ||
isAttributesDefined(element, ['id']) && | ||
!isCharacterNumber(element.id?.[0]) | ||
? // Certain apps don't have unique ids (ex. youtube) | ||
finder(element, { | ||
attr: (name) => name === 'id', | ||
}) | ||
: null; | ||
idSelector = isAttributesDefined(element, ['id']) && !isCharacterNumber(element.id?.[0]) | ||
? generateBoundaryAwareSelector({ attr: (name: string) => name === 'id' }) | ||
: null; | ||
} catch (e) { | ||
} | ||
|
||
|
@@ -1469,14 +1516,10 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { | |
hrefSelector, | ||
accessibilitySelector, | ||
formSelector, | ||
iframeSelector: iframeSelector ? { | ||
full: iframeSelector.fullSelector, | ||
isIframe: iframeSelector.isFrameContent, | ||
} : null, | ||
shadowSelector: shadowSelector ? { | ||
full: shadowSelector.fullSelector, | ||
mode: shadowSelector.mode | ||
} : null | ||
// iframeSelector: iframeSelector ? { | ||
// full: iframeSelector.fullSelector, | ||
// isIframe: iframeSelector.isFrameContent, | ||
// } : null, | ||
}; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add origin check for iframe access.
When accessing iframe content, it's important to check the iframe's origin to prevent potential security issues with cross-origin frames. Consider adding origin validation before attempting to access iframe content.
📝 Committable suggestion