Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: release v0.0.5 #261

Draft
wants to merge 29 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
171d669
feat: pass context script for webdriver
amhsirak Dec 13, 2024
cd05ddf
chore: lint
amhsirak Dec 13, 2024
c49e70a
chrome and chromium user agent
amhsirak Dec 13, 2024
b173ce3
chore: remove commented code
amhsirak Dec 13, 2024
0618401
feat: args
amhsirak Dec 14, 2024
4469325
chore: sync compose master <-> develop
amhsirak Dec 14, 2024
7f48464
feat: add page navigation timeout
RohitR311 Dec 14, 2024
bdf908e
feat: add domcontentloaded wait load state
RohitR311 Dec 14, 2024
f38230d
feat: revert to networkidle for wait load state
RohitR311 Dec 14, 2024
7ce7a15
feat: check for selector visibility in getState
RohitR311 Dec 14, 2024
ea2c36f
Merge pull request #257 from getmaxun/item0-fix
amhsirak Dec 14, 2024
e22c019
feat: rotate user agents
amhsirak Dec 14, 2024
320f24e
feat: shm & sandbox args
amhsirak Dec 14, 2024
ffe87b0
feat: user getUserAgent()
amhsirak Dec 14, 2024
e701452
feat: remove container tags
amhsirak Dec 14, 2024
cb09653
feat: accept getList in getRect and getElementInfo
amhsirak Dec 14, 2024
ddb880d
fix: capture text selection
amhsirak Dec 14, 2024
97e7c89
feat: re-add listSelector empty check for child selection
amhsirak Dec 14, 2024
0c3b1e3
feat: paass listSelect
amhsirak Dec 14, 2024
e147693
fix: dont pass listSelector to non unique
amhsirak Dec 16, 2024
4a94960
feat: push parentSelector
amhsirak Dec 16, 2024
23ac134
fix: add pair to workflow before decision socket emission
RohitR311 Dec 16, 2024
94df794
feat: conditionally compute non unique
amhsirak Dec 16, 2024
52b7671
feat: !push parentSelector
amhsirak Dec 16, 2024
647cd62
feat: add listSelector param
RohitR311 Dec 17, 2024
a9dc4c8
feat: add conditional check to collect matching classes
RohitR311 Dec 17, 2024
0ed4c8a
Merge pull request #264 from getmaxun/listout-fix
amhsirak Dec 17, 2024
fa2d609
Merge pull request #259 from getmaxun/webdrive-patch
amhsirak Dec 17, 2024
c25975b
Merge pull request #266 from getmaxun/sel-fix
amhsirak Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,6 @@ services:
- redis
- minio
volumes:
- ./server:/app/server # Mount server source code for hot reloading
- ./maxun-core:/app/maxun-core # Mount maxun-core for any shared code updates
- /var/run/dbus:/var/run/dbus

frontend:
Expand All @@ -79,13 +77,10 @@ services:
environment:
PUBLIC_URL: ${PUBLIC_URL}
BACKEND_URL: ${BACKEND_URL}
volumes:
- ./:/app # Mount entire frontend app directory for hot reloading
- /app/node_modules # Anonymous volume to prevent overwriting node_modules
depends_on:
- backend

volumes:
postgres_data:
minio_data:
redis_data:
redis_data:
95 changes: 63 additions & 32 deletions maxun-core/src/browserSide/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const scrapedData = [];

while (scrapedData.length < limit) {
// Get all parent elements matching the listSelector
const parentElements = Array.from(document.querySelectorAll(listSelector));

// Iterate through each parent element
for (const parent of parentElements) {
if (scrapedData.length >= limit) break;
const record = {};

// For each field, select the corresponding element within the parent
for (const [label, { selector, attribute }] of Object.entries(fields)) {
const fieldElement = parent.querySelector(selector);

if (fieldElement) {
if (attribute === 'innerText') {
record[label] = fieldElement.innerText.trim();
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, window.location.origin).href : null;
} else if (attribute === 'href') {
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, window.location.origin).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
let parentElements = Array.from(document.querySelectorAll(listSelector));

// If we only got one element or none, try a more generic approach
if (limit > 1 && parentElements.length <= 1) {
const [containerSelector, _] = listSelector.split('>').map(s => s.trim());
const container = document.querySelector(containerSelector);

if (container) {
const allChildren = Array.from(container.children);

const firstMatch = document.querySelector(listSelector);
if (firstMatch) {
// Get classes from the first matching element
const firstMatchClasses = Array.from(firstMatch.classList);

// Find similar elements by matching most of their classes
parentElements = allChildren.filter(element => {
const elementClasses = Array.from(element.classList);

// Element should share at least 70% of classes with the first match
const commonClasses = firstMatchClasses.filter(cls =>
elementClasses.includes(cls));
return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7);
});
}
}
}
}
scrapedData.push(record);
}

// Iterate through each parent element
for (const parent of parentElements) {
if (scrapedData.length >= limit) break;
const record = {};

// For each field, select the corresponding element within the parent
for (const [label, { selector, attribute }] of Object.entries(fields)) {
const fieldElement = parent.querySelector(selector);

if (fieldElement) {
if (attribute === 'innerText') {
record[label] = fieldElement.innerText.trim();
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, window.location.origin).href : null;
} else if (attribute === 'href') {
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, window.location.origin).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
}
}
}
scrapedData.push(record);
}

// If we've processed all available elements and still haven't reached the limit,
// break to avoid infinite loop
if (parentElements.length === 0 || scrapedData.length >= parentElements.length) {
break;
}
}
return scrapedData
};
return scrapedData;
};


/**
Expand Down
19 changes: 16 additions & 3 deletions maxun-core/src/interpret.ts
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ export default class Interpreter extends EventEmitter {
// const actionable = async (selector: string): Promise<boolean> => {
// try {
// const proms = [
// page.isEnabled(selector, { timeout: 5000 }),
// page.isVisible(selector, { timeout: 5000 }),
// page.isEnabled(selector, { timeout: 10000 }),
// page.isVisible(selector, { timeout: 10000 }),
// ];

// return await Promise.all(proms).then((bools) => bools.every((x) => x));
Expand All @@ -214,6 +214,17 @@ export default class Interpreter extends EventEmitter {
// return [];
// }),
// ).then((x) => x.flat());

const presentSelectors: SelectorArray = await Promise.all(
selectors.map(async (selector) => {
try {
await page.waitForSelector(selector, { state: 'attached' });
return [selector];
} catch (e) {
return [];
}
}),
).then((x) => x.flat());

const action = workflowCopy[workflowCopy.length - 1];

Expand All @@ -233,7 +244,7 @@ export default class Interpreter extends EventEmitter {
...p,
[cookie.name]: cookie.value,
}), {}),
selectors,
selectors: presentSelectors,
};
}

Expand Down Expand Up @@ -767,6 +778,8 @@ export default class Interpreter extends EventEmitter {
public async run(page: Page, params?: ParamType): Promise<void> {
this.log('Starting the workflow.', Level.LOG);
const context = page.context();

page.setDefaultNavigationTimeout(100000);

// Check proxy settings from context options
const contextOptions = (context as any)._options;
Expand Down
95 changes: 54 additions & 41 deletions server/src/browser-management/classes/RemoteBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ export class RemoteBrowser {
} catch {
return url;
}
}
}

/**
* Determines if a URL change is significant enough to emit
Expand All @@ -130,11 +130,11 @@ export class RemoteBrowser {
});

// Handle page load events with retry mechanism
page.on('load', async () => {
page.on('load', async () => {
const injectScript = async (): Promise<boolean> => {
try {
await page.waitForLoadState('networkidle', { timeout: 5000 });

await page.evaluate(getInjectableScript());
return true;
} catch (error: any) {
Expand All @@ -148,44 +148,37 @@ export class RemoteBrowser {
});
}

/**
 * Picks one desktop user-agent string at random from a fixed pool.
 *
 * The selection uses Math.random(), so successive calls may return
 * different entries and the result is not reproducible.
 *
 * @returns one of the hard-coded user-agent strings below
 */
private getUserAgent() {
  const userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.140 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.1938.81 Safari/537.36 Edg/116.0.1938.81',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.96 Safari/537.36 OPR/101.0.4843.25',
    // Browsers on Windows 11 still report "Windows NT 10.0"; an "NT 11.0"
    // token does not occur in real user agents and marks the string as spoofed.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:118.0) Gecko/20100101 Firefox/118.0',
  ];

  // Uniform pick: floor(random * length) is always a valid index into the pool.
  return userAgents[Math.floor(Math.random() * userAgents.length)];
}

/**
* An asynchronous constructor for asynchronously initialized properties.
* Must be called right after creating an instance of RemoteBrowser class.
* @param options remote browser options to be used when launching the browser
* @returns {Promise<void>}
*/
public initialize = async (userId: string): Promise<void> => {
// const launchOptions = {
// headless: true,
// proxy: options.launchOptions?.proxy,
// chromiumSandbox: false,
// args: [
// '--no-sandbox',
// '--disable-setuid-sandbox',
// '--headless=new',
// '--disable-gpu',
// '--disable-dev-shm-usage',
// '--disable-software-rasterizer',
// '--in-process-gpu',
// '--disable-infobars',
// '--single-process',
// '--no-zygote',
// '--disable-notifications',
// '--disable-extensions',
// '--disable-background-timer-throttling',
// ...(options.launchOptions?.args || [])
// ],
// env: {
// ...process.env,
// CHROMIUM_FLAGS: '--disable-gpu --no-sandbox --headless=new'
// }
// };
// console.log('Launch options before:', options.launchOptions);
// this.browser = <Browser>(await options.browser.launch(launchOptions));

// console.log('Launch options after:', options.launchOptions)
this.browser = <Browser>(await chromium.launch({
headless: true,
args: [
"--disable-blink-features=AutomationControlled",
"--disable-web-security",
"--disable-features=IsolateOrigins,site-per-process",
"--disable-site-isolation-trials",
"--disable-extensions",
"--no-sandbox",
"--disable-dev-shm-usage",
],
}));
const proxyConfig = await getDecryptedProxyConfig(userId);
let proxyOptions: { server: string, username?: string, password?: string } = { server: '' };
Expand All @@ -201,7 +194,7 @@ export class RemoteBrowser {
const contextOptions: any = {
viewport: { height: 400, width: 900 },
// recordVideo: { dir: 'videos/' }
// Force reduced motion to prevent animation issues
// Force reduced motion to prevent animation issues
reducedMotion: 'reduce',
// Force JavaScript to be enabled
javaScriptEnabled: true,
Expand All @@ -210,7 +203,8 @@ export class RemoteBrowser {
// Disable hardware acceleration
forcedColors: 'none',
isMobile: false,
hasTouch: false
hasTouch: false,
userAgent: this.getUserAgent(),
};

if (proxyOptions.server) {
Expand All @@ -220,18 +214,37 @@ export class RemoteBrowser {
password: proxyOptions.password ? proxyOptions.password : undefined,
};
}
const browserUserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.38 Safari/537.36";


contextOptions.userAgent = browserUserAgent;
this.context = await this.browser.newContext(contextOptions);
await this.context.addInitScript(
`const defaultGetter = Object.getOwnPropertyDescriptor(
Navigator.prototype,
"webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
set: undefined,
enumerable: true,
configurable: true,
get: new Proxy(defaultGetter, {
apply: (target, thisArg, args) => {
Reflect.apply(target, thisArg, args);
return false;
},
}),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
Navigator.prototype,
"webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();`
);
this.currentPage = await this.context.newPage();

await this.setupPageEventListeners(this.currentPage);

// await this.currentPage.setExtraHTTPHeaders({
// 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
// });
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
await blocker.enableBlockingInPage(this.currentPage);
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
Expand Down Expand Up @@ -456,7 +469,7 @@ export class RemoteBrowser {
this.currentPage = newPage;
if (this.currentPage) {
await this.setupPageEventListeners(this.currentPage);

this.client = await this.currentPage.context().newCDPSession(this.currentPage);
await this.subscribeToScreencast();
} else {
Expand Down
Loading