Skip to content

feat: better IFrame and shadowDom elements scraping #409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 139 additions & 96 deletions server/src/workflow-management/selector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -762,6 +762,22 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
One,
}

// One shadow-DOM boundary crossed while walking upward from a target element.
type ShadowBoundary = {
// Discriminant tag distinguishing this variant from an iframe boundary.
type: 'shadow';
// The shadow host, i.e. the `host` of the ShadowRoot below.
host: HTMLElement;
// The ShadowRoot that `element.getRootNode()` returned.
root: ShadowRoot;
// The node inside `root` from which the boundary was detected.
element: HTMLElement;
};

// One iframe boundary crossed while walking upward from a target element.
type IframeBoundary = {
// Discriminant tag distinguishing this variant from a shadow boundary.
type: 'iframe';
// The frame element taken from `ownerDocument.defaultView.frameElement`.
frame: HTMLIFrameElement;
// The `ownerDocument` of `element`, i.e. the document rendered inside `frame`.
document: Document;
// The node inside `document` from which the boundary was detected.
element: HTMLElement;
};

// Either kind of boundary; narrow on the `type` field ('shadow' | 'iframe').
type Boundary = ShadowBoundary | IframeBoundary;

type Options = {
root: Element;
idName: (name: string) => boolean;
Expand Down Expand Up @@ -1340,64 +1356,43 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
}
};

// Helper function to generate selectors for shadow DOM elements
const genSelectorForShadowDOM = (element: HTMLElement) => {
// Get complete path up to document root
const getShadowPath = (el: HTMLElement) => {
const path = [];
let current = el;
let depth = 0;
const MAX_DEPTH = 4;

while (current && depth < MAX_DEPTH) {
const rootNode = current.getRootNode();
if (rootNode instanceof ShadowRoot) {
path.unshift({
host: rootNode.host as HTMLElement,
root: rootNode,
element: current
});
current = rootNode.host as HTMLElement;
depth++;
} else {
break;
}
const getBoundaryPath = (element: HTMLElement): Boundary[] => {
const path: Boundary[] = [];
let current = element;
let depth = 0;
const MAX_DEPTH = 4;

while (current && depth < MAX_DEPTH) {
const rootNode = current.getRootNode();
if (rootNode instanceof ShadowRoot) {
path.unshift({
type: 'shadow',
host: rootNode.host as HTMLElement,
root: rootNode,
element: current
});
current = rootNode.host as HTMLElement;
depth++;
continue;
}
return path;
};

const shadowPath = getShadowPath(element);
if (shadowPath.length === 0) return null;

try {
const selectorParts: string[] = [];

// Generate selector for each shadow DOM boundary
shadowPath.forEach((context, index) => {
// Get selector for the host element
const hostSelector = finder(context.host, {
root: index === 0 ? document.body : (shadowPath[index - 1].root as unknown as Element)

const ownerDocument = current.ownerDocument;
const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement;
if (frameElement) {
path.unshift({
type: 'iframe',
frame: frameElement,
document: ownerDocument,
element: current
});

// For the last context, get selector for target element
if (index === shadowPath.length - 1) {
const elementSelector = finder(element, {
root: context.root as unknown as Element
});
selectorParts.push(`${hostSelector} >> ${elementSelector}`);
} else {
selectorParts.push(hostSelector);
}
});

return {
fullSelector: selectorParts.join(' >> '),
mode: shadowPath[shadowPath.length - 1].root.mode
};
} catch (e) {
console.warn('Error generating shadow DOM selector:', e);
return null;
current = frameElement;
depth++;
continue;
}
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add origin check for iframe access.

When accessing iframe content, it's important to check the iframe's origin to prevent potential security issues with cross-origin frames. Consider adding origin validation before attempting to access iframe content.

 const ownerDocument = current.ownerDocument;
 const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement;
 if (frameElement) {
+  try {
+    // Check if we can access the iframe's origin
+    const iframeOrigin = new URL(frameElement.src).origin;
+    const currentOrigin = window.location.origin;
+    if (iframeOrigin !== currentOrigin) {
+      console.warn(`Skipping cross-origin iframe: ${iframeOrigin}`);
+      break;
+    }
+
     path.unshift({
       type: 'iframe',
       frame: frameElement,
       document: ownerDocument,
       element: current
     });
     current = frameElement;
     depth++;
     continue;
+  } catch (error) {
+    console.warn('Cannot access iframe origin:', error);
+    break;
+  }
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const ownerDocument = current.ownerDocument;
const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement;
if (frameElement) {
path.unshift({
type: 'iframe',
frame: frameElement,
document: ownerDocument,
element: current
});
// For the last context, get selector for target element
if (index === shadowPath.length - 1) {
const elementSelector = finder(element, {
root: context.root as unknown as Element
});
selectorParts.push(`${hostSelector} >> ${elementSelector}`);
} else {
selectorParts.push(hostSelector);
}
});
return {
fullSelector: selectorParts.join(' >> '),
mode: shadowPath[shadowPath.length - 1].root.mode
};
} catch (e) {
console.warn('Error generating shadow DOM selector:', e);
return null;
current = frameElement;
depth++;
continue;
}
const ownerDocument = current.ownerDocument;
const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement;
if (frameElement) {
try {
// Check if we can access the iframe's origin
const iframeOrigin = new URL(frameElement.src).origin;
const currentOrigin = window.location.origin;
if (iframeOrigin !== currentOrigin) {
console.warn(`Skipping cross-origin iframe: ${iframeOrigin}`);
break;
}
path.unshift({
type: 'iframe',
frame: frameElement,
document: ownerDocument,
element: current
});
current = frameElement;
depth++;
continue;
} catch (error) {
console.warn('Cannot access iframe origin:', error);
break;
}
}


break;
}
return path;
};

const genSelectors = (element: HTMLElement | null) => {
Expand All @@ -1406,55 +1401,107 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
}

const href = element.getAttribute('href');
const boundaryPath = getBoundaryPath(element);

/**
 * Picks the root that scopes selector generation for the boundary at `index`.
 *
 * The first boundary (and any index without a preceding entry) is scoped to
 * `document.body`. Every later boundary is scoped to the interior of the
 * boundary before it: that boundary's shadow root, or the body of its
 * iframe document.
 */
const getRootElement = (index: number): Element => {
  const enclosing = index > 0 ? boundaryPath[index - 1] : undefined;
  if (!enclosing) {
    return document.body;
  }

  // ShadowRoot is not an Element, hence the double cast for shadow roots.
  return enclosing.type === 'shadow'
    ? (enclosing.root as unknown as Element)
    : (enclosing.document.body as Element);
};

/**
 * Builds a selector for `element` that spells out every shadow-DOM and iframe
 * boundary recorded in `boundaryPath`.
 *
 * Output shape: one selector per boundary (the shadow host or the frame
 * element), each followed by that boundary's own delimiter, then the selector
 * of `element` relative to the innermost boundary:
 *   - ' >> '  after a shadow host
 *   - ' :>> ' after an iframe
 *
 * Previously all parts were joined with a single delimiter chosen from the
 * LAST boundary, so a mixed path (e.g. an iframe containing a shadow host)
 * was emitted with the wrong delimiter on the outer boundaries. Paths with a
 * single boundary type produce exactly the same string as before.
 * NOTE(review): confirm that selector consumers accept mixed delimiters.
 *
 * @param elementOptions finder options applied only to the target element's
 *   own selector; `root` is always overridden with the innermost boundary.
 * @returns the boundary-aware selector, or a plain `finder` selector when the
 *   element is not behind any boundary.
 */
const generateBoundaryAwareSelector = (elementOptions = {}) => {
  // No boundary crossed: behave exactly like a direct finder call.
  if (boundaryPath.length === 0) {
    return finder(element, elementOptions);
  }

  // Outer-to-inner chain of boundary selectors, each with its own delimiter.
  let boundaryPrefix = '';
  boundaryPath.forEach((context, index) => {
    const root = getRootElement(index);

    if (context.type === 'shadow') {
      boundaryPrefix += `${finder(context.host, { root })} >> `;
    } else {
      boundaryPrefix += `${finder(context.frame, { root })} :>> `;
    }
  });

  // The target element is resolved inside the innermost boundary.
  const lastBoundary = boundaryPath[boundaryPath.length - 1];
  const innermostRoot =
    lastBoundary.type === 'shadow'
      ? (lastBoundary.root as unknown as Element)
      : (lastBoundary.document.body as Element);

  const elementSelector = finder(element, {
    ...elementOptions,
    root: innermostRoot,
  });

  return boundaryPrefix + elementSelector;
};

let generalSelector = null;
try {
generalSelector = finder(element);
generalSelector = generateBoundaryAwareSelector();
} catch (e) {
}

let attrSelector = null;
try {
attrSelector = finder(element, { attr: () => true });
attrSelector = generateBoundaryAwareSelector({ attr: () => true });
} catch (e) {
}

// const iframeSelector = genSelectorForIframe(element);
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Remove commented-out code.

The code contains commented-out sections related to the old iframe selector implementation. Since this functionality has been replaced with the new boundary-aware selector, these comments should be removed to maintain code cleanliness.

-// const iframeSelector = genSelectorForIframe(element);

-// iframeSelector: iframeSelector ? {
-//   full: iframeSelector.fullSelector,
-//   isIframe: iframeSelector.isFrameContent,
-// } : null,

Also applies to: 1519-1522


const iframeSelector = genSelectorForIframe(element);
const shadowSelector = genSelectorForShadowDOM(element);

const hrefSelector = genSelectorForAttributes(element, ['href']);
const formSelector = genSelectorForAttributes(element, [
'name',
'placeholder',
'for',
]);
const accessibilitySelector = genSelectorForAttributes(element, [
'aria-label',
'alt',
'title',
]);

const testIdSelector = genSelectorForAttributes(element, [
'data-testid',
'data-test-id',
'data-testing',
'data-test',
'data-qa',
'data-cy',
]);
const hrefSelector = generateBoundaryAwareSelector({
attr: genValidAttributeFilter(element, ['href'])
});
const formSelector = generateBoundaryAwareSelector({
attr: genValidAttributeFilter(element, ['name', 'placeholder', 'for'])
});
const accessibilitySelector = generateBoundaryAwareSelector({
attr: genValidAttributeFilter(element, ['aria-label', 'alt', 'title'])
});

const testIdSelector = generateBoundaryAwareSelector({
attr: genValidAttributeFilter(element, [
'data-testid',
'data-test-id',
'data-testing',
'data-test',
'data-qa',
'data-cy',
])
});

// We won't use an id selector if the id is invalid (starts with a number)
let idSelector = null;
try {
idSelector =
isAttributesDefined(element, ['id']) &&
!isCharacterNumber(element.id?.[0])
? // Certain apps don't have unique ids (ex. youtube)
finder(element, {
attr: (name) => name === 'id',
})
: null;
idSelector = isAttributesDefined(element, ['id']) && !isCharacterNumber(element.id?.[0])
? generateBoundaryAwareSelector({ attr: (name: string) => name === 'id' })
: null;
} catch (e) {
}

Expand All @@ -1469,14 +1516,10 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
hrefSelector,
accessibilitySelector,
formSelector,
iframeSelector: iframeSelector ? {
full: iframeSelector.fullSelector,
isIframe: iframeSelector.isFrameContent,
} : null,
shadowSelector: shadowSelector ? {
full: shadowSelector.fullSelector,
mode: shadowSelector.mode
} : null
// iframeSelector: iframeSelector ? {
// full: iframeSelector.fullSelector,
// isIframe: iframeSelector.isFrameContent,
// } : null,
};
}

Expand Down
9 changes: 0 additions & 9 deletions server/src/workflow-management/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,6 @@ export const getBestSelectorForAction = (action: Action) => {
case ActionType.DragAndDrop: {
const selectors = action.selectors;


if (selectors?.iframeSelector?.full) {
return selectors.iframeSelector.full;
}

if (selectors?.shadowSelector?.full) {
return selectors.shadowSelector.full;
}

// less than 25 characters, and element only has text inside
const textSelector =
selectors?.text?.length != null &&
Expand Down