1
This commit is contained in:
201
server/services/puppeteer/puppeteer_runner.js
Normal file
201
server/services/puppeteer/puppeteer_runner.js
Normal file
@@ -0,0 +1,201 @@
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import puppeteer from 'puppeteer';
|
||||
import { get_app_config } from '../config/app_config.js';
|
||||
import { apply_page_stealth_defaults, get_stealth_puppeteer } from './puppeteer_stealth.js';
|
||||
|
||||
// Module-level cache of the launched browser: assigned by get_or_create_browser and reset to
// null by invoke_extension_action when cfg.crawler.auto_close_browser is enabled.
let browser_singleton = null;
|
||||
|
||||
/**
 * Looks up the per-action timeout in the application config.
 *
 * @returns {*} The value stored at `crawler.action_timeout_ms`
 *   (presumably a number of milliseconds — confirm against the config schema).
 */
function get_action_timeout_ms() {
  const { crawler } = get_app_config();
  return crawler.action_timeout_ms;
}
|
||||
|
||||
/**
 * Looks up the configured location of the extension source.
 *
 * @returns {*} The value stored at `crawler.crx_src_path` in the application config.
 */
function get_crx_src_path() {
  const crawler_settings = get_app_config().crawler;
  return crawler_settings.crx_src_path;
}
|
||||
|
||||
/**
 * Finds the extension id in the first target whose URL uses the `chrome-extension://` scheme.
 *
 * @param {Iterable<{url: function(): *}>|null|undefined} targets - Objects exposing a `url()`
 *   method; `null`/`undefined` is treated as an empty list.
 * @returns {string|null} The host part of the first `chrome-extension://` URL, or `null` when
 *   no target has such a URL.
 */
function get_extension_id_from_targets(targets) {
  for (const target of targets ?? []) {
    const url = target.url();
    // Non-string values (undefined, null, ...) cannot be extension URLs.
    if (typeof url !== 'string') continue;

    // The id is everything after the scheme up to the first "/", "?" or "#".
    // A trailing slash is not required, so a bare "chrome-extension://<id>" also matches.
    const match = /^chrome-extension:\/\/([^/?#]+)/.exec(url);
    if (match) return match[1];
  }
  return null;
}
|
||||
|
||||
/**
 * Resolves the id of the loaded extension, waiting for an extension target if none exists yet.
 *
 * The targets the browser already reports are checked first. If none yields an id, the
 * function waits for a target whose URL starts with `chrome-extension://`.
 *
 * @param {object} browser - Object exposing `targets()` and `waitForTarget(predicate, options)`.
 * @param {number} timeout_ms - Forwarded as the `timeout` option of `waitForTarget`.
 * @returns {Promise<string|null>} The extension id, or `null` when waiting rejected (for
 *   example on timeout) or the found target yields no id.
 */
async function wait_for_extension_id(browser, timeout_ms) {
  const already_loaded_id = get_extension_id_from_targets(browser.targets());
  if (already_loaded_id) return already_loaded_id;

  const is_extension_target = (candidate) => {
    const candidate_url = candidate.url();
    return typeof candidate_url === 'string' && candidate_url.startsWith('chrome-extension://');
  };

  const pending_target = browser.waitForTarget(is_extension_target, { timeout: timeout_ms });

  let extension_target;
  try {
    extension_target = await pending_target;
  } catch (wait_error) {
    // Any rejection while waiting is reported to the caller as "not found".
    extension_target = null;
  }

  return extension_target ? get_extension_id_from_targets([extension_target]) : null;
}
|
||||
|
||||
/**
 * Resolves the configured Chrome binary location to an absolute path.
 *
 * @returns {string} `crawler.chrome_executable_path` from the application config, passed
 *   through `path.resolve`.
 */
function get_chrome_executable_path() {
  const { chrome_executable_path: configured_path } = get_app_config().crawler;
  return path.resolve(configured_path);
}
|
||||
|
||||
/**
 * Returns the shared browser, launching Chrome with the unpacked extension on first use.
 *
 * A cached browser is reused only while it is still connected. If it has crashed or been
 * closed, the stale reference is dropped and a new browser is launched.
 *
 * @returns {Promise<object>} The launched (or cached) Puppeteer browser.
 * @throws {Error} When the configured Chrome executable does not exist, or when the extension
 *   directory contains no `manifest.json`.
 */
export async function get_or_create_browser() {
  if (browser_singleton) {
    // Newer Puppeteer exposes a `connected` getter, older versions `isConnected()`.
    // If neither is present, keep the previous behaviour and trust the cached instance.
    let still_connected = true;
    if (typeof browser_singleton.connected === 'boolean') {
      still_connected = browser_singleton.connected;
    } else if (typeof browser_singleton.isConnected === 'function') {
      still_connected = browser_singleton.isConnected();
    }
    if (still_connected) {
      return browser_singleton;
    }
    browser_singleton = null;
  }

  const chrome_executable_path = get_chrome_executable_path();
  if (!fs.existsSync(chrome_executable_path)) {
    throw new Error(`Chrome 不存在: ${chrome_executable_path}`);
  }

  const raw_extension_path = path.resolve(get_crx_src_path());
  const manifest_path = path.resolve(raw_extension_path, 'manifest.json');
  if (!fs.existsSync(manifest_path)) {
    throw new Error(`扩展 manifest.json 不存在: ${manifest_path}`);
  }

  const cfg = get_app_config();
  // Chrome receives the extension path with forward slashes, including on Windows.
  const extension_path = raw_extension_path.replace(/\\/g, '/');
  const headless = cfg.crawler.puppeteer_headless;

  const pptr = cfg.crawler.enable_stealth ? get_stealth_puppeteer(puppeteer) : puppeteer;

  const launched = await pptr.launch({
    executablePath: chrome_executable_path,
    headless,
    args: [
      '--enable-extensions',
      `--disable-extensions-except=${extension_path}`,
      `--load-extension=${extension_path}`,
      '--no-default-browser-check',
      '--disable-popup-blocking',
      '--disable-dev-shm-usage',
      // NOTE(review): these feature switches look intended to keep Manifest V2 extensions
      // loadable — confirm they are still honoured by the Chrome version in use.
      '--disable-features=ExtensionManifestV2Disabled,ExtensionManifestV2Unsupported',
      '--enable-features=AllowLegacyMV2Extensions'
    ]
  });

  // Forget the cached instance as soon as Chrome goes away, so the next call relaunches it.
  launched.once('disconnected', () => {
    if (browser_singleton === launched) {
      browser_singleton = null;
    }
  });

  browser_singleton = launched;
  return launched;
}
|
||||
|
||||
/**
 * Runs one extension action through the extension's bridge page and returns its result.
 *
 * Opens a new page, navigates to `chrome-extension://<id>/bridge/bridge.html`, and calls
 * `window.server_bridge_invoke(action_name, action_payload)` inside that page with a timeout.
 * The page is always closed afterwards. When `cfg.crawler.auto_close_browser` is set, the
 * browser is closed as well.
 *
 * @param {string} action_name - Action identifier passed to the bridge.
 * @param {object} [action_payload] - Payload passed to the bridge; a falsy value becomes `{}`.
 * @returns {Promise<*>} Whatever the bridge call resolves to, as returned by `page.evaluate`.
 * @throws {Error} When no extension target is found, the bridge function is missing, the
 *   action exceeds the timeout (`action_timeout`), or navigation/evaluation fails.
 */
export async function invoke_extension_action(action_name, action_payload) {
  // How long to wait for a chrome-extension:// target before giving up.
  const extension_wait_timeout_ms = 15000;

  const cfg = get_app_config();
  const browser = await get_or_create_browser();

  const started_at = Date.now();

  const log_enabled = cfg.crawler.log_invoke_action;
  if (log_enabled) {
    // eslint-disable-next-line no-console
    console.log('[invoke_extension_action] start', {
      action_name,
      has_payload: !!action_payload,
      keys: action_payload && typeof action_payload === 'object' ? Object.keys(action_payload).slice(0, 20) : []
    });
  }

  let page = null;
  try {
    page = await browser.newPage();
    if (cfg.crawler.enable_stealth) {
      await apply_page_stealth_defaults(page);
    }
    await page.goto('about:blank');

    // Try opening chrome://extensions first to trigger extension initialisation
    // (in some environments the extension target does not appear immediately).
    try {
      await page.goto('chrome://extensions/', { waitUntil: 'domcontentloaded' });
    } catch (err) {
      // Best-effort warm-up only; a failure here must not abort the action.
    }

    const extension_id = await wait_for_extension_id(browser, extension_wait_timeout_ms);
    if (!extension_id) {
      throw new Error(
        '未找到扩展 extension_id:Chrome 未加载扩展(常见原因:MV2 被禁用/企业策略未生效/CRX_SRC_PATH 不正确/使用了 headless)'
      );
    }

    const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;
    await page.goto(bridge_url, { waitUntil: 'domcontentloaded' });

    const timeout_ms = get_action_timeout_ms();
    const action_res = await page.evaluate(
      // This function runs inside the bridge page, so it can only use its own arguments.
      async (action, payload, timeout) => {
        if (!window.server_bridge_invoke) {
          throw new Error('bridge 未注入 window.server_bridge_invoke');
        }

        // Promise.resolve tolerates a bridge that returns a plain value instead of a promise.
        const pending = Promise.resolve(window.server_bridge_invoke(action, payload));

        let timer = null;
        const timed_out = new Promise((_, reject) => {
          timer = setTimeout(() => reject(new Error('action_timeout')), timeout);
        });

        try {
          return await Promise.race([pending, timed_out]);
        } finally {
          clearTimeout(timer);
        }
      },
      action_name,
      action_payload || {},
      timeout_ms
    );

    if (log_enabled) {
      // eslint-disable-next-line no-console
      console.log('[invoke_extension_action] ok', { action_name, cost_ms: Date.now() - started_at });
    }

    return action_res;
  } catch (err) {
    if (log_enabled) {
      // eslint-disable-next-line no-console
      console.log('[invoke_extension_action] fail', {
        action_name,
        cost_ms: Date.now() - started_at,
        error: (err && err.message) || String(err)
      });
    }
    throw err;
  } finally {
    if (page) {
      try {
        await page.close();
      } catch (err) {
        // Best-effort cleanup; a close failure must not mask the action's outcome.
      }
    }

    if (cfg.crawler.auto_close_browser) {
      // Drop the cached reference before awaiting close(), so a concurrent caller cannot pick
      // up a browser that is shutting down. Leave a different cached browser untouched.
      if (browser_singleton === browser) {
        browser_singleton = null;
      }
      try {
        await browser.close();
      } catch (err) {
        // Best-effort cleanup.
      }
    }
  }
}
|
||||
27
server/services/puppeteer/puppeteer_stealth.js
Normal file
27
server/services/puppeteer/puppeteer_stealth.js
Normal file
@@ -0,0 +1,27 @@
|
||||
import puppeteer_extra from 'puppeteer-extra';
|
||||
import stealth_plugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
// Register the plugin only once globally
|
||||
// Set to true by get_stealth_puppeteer after its first puppeteer_extra.use(...) call.
let inited = false;
|
||||
|
||||
/**
 * Returns the shared puppeteer-extra instance with the stealth plugin registered.
 *
 * The plugin is added on the first call only. Every call assigns `puppeteer_core` to the
 * instance's `puppeteer` property before returning it.
 *
 * @param {object} puppeteer_core - The puppeteer module the caller wants to launch with.
 * @returns {object} The module-wide `puppeteer_extra` instance.
 */
export function get_stealth_puppeteer(puppeteer_core) {
  const plugin_missing = !inited;
  if (plugin_missing) {
    puppeteer_extra.use(stealth_plugin());
    inited = true;
  }

  // Intended to reuse the caller's puppeteer Chromium/Chrome binding (keeping existing
  // capabilities such as executablePath).
  // NOTE(review): puppeteer-extra documents `addExtra(puppeteer)` for binding a specific
  // puppeteer module — confirm that assigning `.puppeteer` has the intended effect.
  return Object.assign(puppeteer_extra, { puppeteer: puppeteer_core });
}
|
||||
|
||||
/**
 * Applies a fixed viewport, user agent and Accept-Language header to a page.
 *
 * Called with only `page`, it applies the same three values as before. The optional
 * `overrides` argument lets a caller replace any of them, for example to keep the user agent
 * in line with the Chrome build that is actually launched.
 *
 * @param {object} page - Object exposing `setViewport`, `setUserAgent` and `setExtraHTTPHeaders`.
 * @param {object} [overrides] - Optional replacements for the defaults.
 * @param {object} [overrides.viewport] - Argument for `page.setViewport`.
 * @param {string} [overrides.user_agent] - Argument for `page.setUserAgent`.
 * @param {string} [overrides.accept_language] - Value of the `Accept-Language` header.
 * @returns {Promise<void>} Resolves once all three settings have been applied, in that order.
 */
export async function apply_page_stealth_defaults(page, overrides = {}) {
  const {
    viewport = { width: 1366, height: 768, deviceScaleFactor: 1 },
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
    accept_language = 'zh-CN,zh;q=0.9,en;q=0.8'
  } = overrides ?? {};

  // Generic, lightweight "fingerprint consistency" settings.
  await page.setViewport(viewport);
  await page.setUserAgent(user_agent);
  await page.setExtraHTTPHeaders({
    'Accept-Language': accept_language
  });
}
|
||||
@@ -1,148 +0,0 @@
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import puppeteer from 'puppeteer';
|
||||
import { get_app_config } from '../config/app_config.js';
|
||||
|
||||
// Module-level cache of the browser launched by get_or_create_browser; later calls reuse it.
let browser_singleton = null;
|
||||
|
||||
/**
 * Looks up the per-action timeout in the application config.
 *
 * @returns {*} The value stored at `crawler.action_timeout_ms`
 *   (presumably a number of milliseconds — confirm against the config schema).
 */
function get_action_timeout_ms() {
  const { crawler } = get_app_config();
  return crawler.action_timeout_ms;
}
|
||||
|
||||
/**
 * Looks up the configured location of the extension source.
 *
 * @returns {*} The value stored at `crawler.crx_src_path` in the application config.
 */
function get_crx_src_path() {
  const crawler_settings = get_app_config().crawler;
  return crawler_settings.crx_src_path;
}
|
||||
|
||||
/**
 * Finds the extension id in the first target whose URL uses the `chrome-extension://` scheme.
 *
 * @param {Iterable<{url: function(): *}>|null|undefined} targets - Objects exposing a `url()`
 *   method; `null`/`undefined` is treated as an empty list.
 * @returns {string|null} The host part of the first `chrome-extension://` URL, or `null` when
 *   no target has such a URL.
 */
function get_extension_id_from_targets(targets) {
  for (const target of targets ?? []) {
    const url = target.url();
    // Non-string values (undefined, null, ...) cannot be extension URLs.
    if (typeof url !== 'string') continue;

    // The id is everything after the scheme up to the first "/", "?" or "#".
    // A trailing slash is not required, so a bare "chrome-extension://<id>" also matches.
    const match = /^chrome-extension:\/\/([^/?#]+)/.exec(url);
    if (match) return match[1];
  }
  return null;
}
|
||||
|
||||
/**
 * Resolves the id of the loaded extension, waiting for an extension target if none exists yet.
 *
 * The targets the browser already reports are checked first. If none yields an id, the
 * function waits for a target whose URL starts with `chrome-extension://`.
 *
 * @param {object} browser - Object exposing `targets()` and `waitForTarget(predicate, options)`.
 * @param {number} timeout_ms - Forwarded as the `timeout` option of `waitForTarget`.
 * @returns {Promise<string|null>} The extension id, or `null` when waiting rejected (for
 *   example on timeout) or the found target yields no id.
 */
async function wait_for_extension_id(browser, timeout_ms) {
  const already_loaded_id = get_extension_id_from_targets(browser.targets());
  if (already_loaded_id) return already_loaded_id;

  const is_extension_target = (candidate) => {
    const candidate_url = candidate.url();
    return typeof candidate_url === 'string' && candidate_url.startsWith('chrome-extension://');
  };

  const pending_target = browser.waitForTarget(is_extension_target, { timeout: timeout_ms });

  let extension_target;
  try {
    extension_target = await pending_target;
  } catch (wait_error) {
    // Any rejection while waiting is reported to the caller as "not found".
    extension_target = null;
  }

  return extension_target ? get_extension_id_from_targets([extension_target]) : null;
}
|
||||
|
||||
/**
 * Resolves the configured Chrome binary location to an absolute path.
 *
 * @returns {string} `crawler.chrome_executable_path` from the application config, passed
 *   through `path.resolve`.
 */
function get_chrome_executable_path() {
  const { chrome_executable_path: configured_path } = get_app_config().crawler;
  return path.resolve(configured_path);
}
|
||||
|
||||
/**
 * Returns the shared browser, launching Chrome with the unpacked extension on first use.
 *
 * A cached browser is reused only while it is still connected. If it has crashed or been
 * closed, the stale reference is dropped and a new browser is launched.
 *
 * @returns {Promise<object>} The launched (or cached) Puppeteer browser.
 * @throws {Error} When the configured Chrome executable does not exist, or when the extension
 *   directory contains no `manifest.json`.
 */
export async function get_or_create_browser() {
  if (browser_singleton) {
    // Newer Puppeteer exposes a `connected` getter, older versions `isConnected()`.
    // If neither is present, keep the previous behaviour and trust the cached instance.
    let still_connected = true;
    if (typeof browser_singleton.connected === 'boolean') {
      still_connected = browser_singleton.connected;
    } else if (typeof browser_singleton.isConnected === 'function') {
      still_connected = browser_singleton.isConnected();
    }
    if (still_connected) {
      return browser_singleton;
    }
    browser_singleton = null;
  }

  const chrome_executable_path = get_chrome_executable_path();
  if (!fs.existsSync(chrome_executable_path)) {
    throw new Error(`Chrome 不存在: ${chrome_executable_path}`);
  }

  const raw_extension_path = path.resolve(get_crx_src_path());
  const manifest_path = path.resolve(raw_extension_path, 'manifest.json');
  if (!fs.existsSync(manifest_path)) {
    throw new Error(`扩展 manifest.json 不存在: ${manifest_path}`);
  }

  const cfg = get_app_config();
  // Chrome receives the extension path with forward slashes, including on Windows.
  const extension_path = raw_extension_path.replace(/\\/g, '/');
  const headless = cfg.crawler.puppeteer_headless;
  // The browser profile lives in "puppeteer_profile" under the current working directory.
  const user_data_dir = path.resolve(process.cwd(), 'puppeteer_profile');

  const launched = await puppeteer.launch({
    executablePath: chrome_executable_path,
    headless,
    args: [
      `--user-data-dir=${user_data_dir}`,
      '--enable-extensions',
      `--disable-extensions-except=${extension_path}`,
      `--load-extension=${extension_path}`,
      '--no-default-browser-check',
      '--disable-popup-blocking',
      '--disable-dev-shm-usage'
    ]
  });

  // Forget the cached instance as soon as Chrome goes away, so the next call relaunches it.
  launched.once('disconnected', () => {
    if (browser_singleton === launched) {
      browser_singleton = null;
    }
  });

  browser_singleton = launched;
  return launched;
}
|
||||
|
||||
/**
 * Runs one extension action through the extension's bridge page and returns its result.
 *
 * Opens a new page, navigates to `chrome-extension://<id>/bridge/bridge.html`, and calls
 * `window.server_bridge_invoke(action_name, action_payload)` inside that page with a timeout.
 * The page is closed on every exit path, including failures.
 *
 * @param {string} action_name - Action identifier passed to the bridge.
 * @param {object} [action_payload] - Payload passed to the bridge; a falsy value becomes `{}`.
 * @returns {Promise<*>} Whatever the bridge call resolves to, as returned by `page.evaluate`.
 * @throws {Error} When no extension target is found, the bridge function is missing, the
 *   action exceeds the timeout (`action_timeout`), or navigation/evaluation fails.
 */
export async function invoke_extension_action(action_name, action_payload) {
  const browser = await get_or_create_browser();

  const page = await browser.newPage();
  try {
    await page.goto('about:blank');

    // Try opening chrome://extensions first to trigger extension initialisation
    // (in some environments the extension target does not appear immediately).
    try {
      await page.goto('chrome://extensions/', { waitUntil: 'domcontentloaded' });
    } catch (err) {
      // Best-effort warm-up only; a failure here must not abort the action.
    }

    const extension_id = await wait_for_extension_id(browser, 15000);
    if (!extension_id) {
      throw new Error(
        '未找到扩展 extension_id:Chrome 未加载扩展(常见原因:MV2 被禁用/企业策略未生效/CRX_SRC_PATH 不正确/使用了 headless)'
      );
    }

    const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;
    await page.goto(bridge_url, { waitUntil: 'domcontentloaded' });

    const timeout_ms = get_action_timeout_ms();
    return await page.evaluate(
      // This function runs inside the bridge page, so it can only use its own arguments.
      async (action, payload, timeout) => {
        if (!window.server_bridge_invoke) {
          throw new Error('bridge 未注入 window.server_bridge_invoke');
        }

        // Promise.resolve tolerates a bridge that returns a plain value instead of a promise.
        const pending = Promise.resolve(window.server_bridge_invoke(action, payload));

        let timer = null;
        const timed_out = new Promise((_, reject) => {
          timer = setTimeout(() => reject(new Error('action_timeout')), timeout);
        });

        try {
          return await Promise.race([pending, timed_out]);
        } finally {
          clearTimeout(timer);
        }
      },
      action_name,
      action_payload || {},
      timeout_ms
    );
  } finally {
    // Close the page on every path so a failed navigation or action does not leak a tab.
    try {
      await page.close();
    } catch (err) {
      // Best-effort cleanup; a close failure must not mask the action's outcome.
    }
  }
}
|
||||
@@ -1,6 +1,6 @@
|
||||
import { crawl_run_record } from '../models/index.js';
|
||||
import { safe_json_stringify } from './json_utils.js';
|
||||
import { invoke_extension_action } from './puppeteer_runner.js';
|
||||
import { invoke_extension_action } from './puppeteer/puppeteer_runner.js';
|
||||
import { persist_amazon_result } from './amazon_persist.js';
|
||||
|
||||
export async function execute_action_and_record(params) {
|
||||
@@ -13,18 +13,15 @@ export async function execute_action_and_record(params) {
|
||||
let error_message = null;
|
||||
|
||||
try {
|
||||
const result = await invoke_extension_action(action_name, action_payload || {});
|
||||
|
||||
|
||||
console.log( 'invoke_extension_action-start', action_name, action_payload );
|
||||
const res_invoke = await invoke_extension_action(action_name, action_payload || {});
|
||||
console.log( 'invoke_extension_action-end', action_name, result );
|
||||
ok = true;
|
||||
result_payload = safe_json_stringify(res_invoke);
|
||||
result_payload = safe_json_stringify(result);
|
||||
|
||||
// 按 stage 自动入库(不影响原始 run_record 记录)
|
||||
await persist_amazon_result(res_invoke.result);
|
||||
await persist_amazon_result(result);
|
||||
|
||||
return res_invoke;
|
||||
return result;
|
||||
} catch (err) {
|
||||
ok = false;
|
||||
error_message = (err && err.message) || String(err);
|
||||
|
||||
Reference in New Issue
Block a user