Skip to content

feat: better IFrame and shadowDom elements scraping #409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 9 commits into
base: develop
Choose a base branch
from
146 changes: 63 additions & 83 deletions server/src/workflow-management/selector.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1340,121 +1340,105 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
}
};

// Helper function to generate selectors for shadow DOM elements
const genSelectorForShadowDOM = (element: HTMLElement) => {
// Get complete path up to document root
const getShadowPath = (el: HTMLElement) => {
const path = [];
let current = el;
let depth = 0;
const MAX_DEPTH = 4;

while (current && depth < MAX_DEPTH) {
const rootNode = current.getRootNode();
if (rootNode instanceof ShadowRoot) {
path.unshift({
host: rootNode.host as HTMLElement,
root: rootNode,
element: current
});
current = rootNode.host as HTMLElement;
depth++;
} else {
break;
}
const getShadowPath = (element: HTMLElement) => {
const path = [];
let current = element;
let depth = 0;
const MAX_DEPTH = 4;

while (current && depth < MAX_DEPTH) {
const rootNode = current.getRootNode();
if (rootNode instanceof ShadowRoot) {
path.unshift({
host: rootNode.host as HTMLElement,
root: rootNode,
element: current
});
current = rootNode.host as HTMLElement;
depth++;
} else {
break;
}
return path;
};
}
return path;
};

const genSelectors = (element: HTMLElement | null) => {
if (element == null) {
return null;
}

const href = element.getAttribute('href');
const shadowPath = getShadowPath(element);
if (shadowPath.length === 0) return null;

try {
const generateShadowAwareSelector = (elementOptions = {}) => {
if (shadowPath.length === 0) {
return finder(element, elementOptions);
}

const selectorParts: string[] = [];

// Generate selector for each shadow DOM boundary
shadowPath.forEach((context, index) => {
// Get selector for the host element
const hostSelector = finder(context.host, {
root: index === 0 ? document.body : (shadowPath[index - 1].root as unknown as Element)
});

// For the last context, get selector for target element

if (index === shadowPath.length - 1) {
const elementSelector = finder(element, {
...elementOptions,
root: context.root as unknown as Element
});
selectorParts.push(`${hostSelector} >> ${elementSelector}`);
} else {
selectorParts.push(hostSelector);
}
});

return selectorParts.join(' >> ');
};

return {
fullSelector: selectorParts.join(' >> '),
mode: shadowPath[shadowPath.length - 1].root.mode
};
} catch (e) {
console.warn('Error generating shadow DOM selector:', e);
return null;
}
};

const genSelectors = (element: HTMLElement | null) => {
if (element == null) {
return null;
}

const href = element.getAttribute('href');

let generalSelector = null;
try {
generalSelector = finder(element);
generalSelector = generateShadowAwareSelector();
} catch (e) {
}

let attrSelector = null;
try {
attrSelector = finder(element, { attr: () => true });
attrSelector = generateShadowAwareSelector({ attr: () => true });
} catch (e) {
}


const iframeSelector = genSelectorForIframe(element);
const shadowSelector = genSelectorForShadowDOM(element);

const hrefSelector = genSelectorForAttributes(element, ['href']);
const formSelector = genSelectorForAttributes(element, [
'name',
'placeholder',
'for',
]);
const accessibilitySelector = genSelectorForAttributes(element, [
'aria-label',
'alt',
'title',
]);

const testIdSelector = genSelectorForAttributes(element, [
'data-testid',
'data-test-id',
'data-testing',
'data-test',
'data-qa',
'data-cy',
]);

const hrefSelector = generateShadowAwareSelector({
attr: genValidAttributeFilter(element, ['href'])
});
const formSelector = generateShadowAwareSelector({
attr: genValidAttributeFilter(element, ['name', 'placeholder', 'for'])
});
const accessibilitySelector = generateShadowAwareSelector({
attr: genValidAttributeFilter(element, ['aria-label', 'alt', 'title'])
});

const testIdSelector = generateShadowAwareSelector({
attr: genValidAttributeFilter(element, [
'data-testid',
'data-test-id',
'data-testing',
'data-test',
'data-qa',
'data-cy',
])
});

// We won't use an id selector if the id is invalid (starts with a number)
let idSelector = null;
try {
idSelector =
isAttributesDefined(element, ['id']) &&
!isCharacterNumber(element.id?.[0])
? // Certain apps don't have unique ids (ex. youtube)
finder(element, {
attr: (name) => name === 'id',
})
: null;
idSelector = isAttributesDefined(element, ['id']) && !isCharacterNumber(element.id?.[0])
? generateShadowAwareSelector({ attr: (name: string) => name === 'id' })
: null;
} catch (e) {
}

Expand All @@ -1473,10 +1457,6 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
full: iframeSelector.fullSelector,
isIframe: iframeSelector.isFrameContent,
} : null,
shadowSelector: shadowSelector ? {
full: shadowSelector.fullSelector,
mode: shadowSelector.mode
} : null
};
}

Expand Down
4 changes: 0 additions & 4 deletions server/src/workflow-management/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ export const getBestSelectorForAction = (action: Action) => {
if (selectors?.iframeSelector?.full) {
return selectors.iframeSelector.full;
}

if (selectors?.shadowSelector?.full) {
return selectors.shadowSelector.full;
}

// less than 25 characters, and element only has text inside
const textSelector =
Expand Down