This commit is contained in:
张成
2026-03-18 16:28:26 +08:00
parent 30d127ac0b
commit 6d75720a89
8 changed files with 697 additions and 158 deletions

View File

@@ -0,0 +1,201 @@
import fs from 'node:fs';
import path from 'node:path';
import puppeteer from 'puppeteer';
import { get_app_config } from '../config/app_config.js';
import { apply_page_stealth_defaults, get_stealth_puppeteer } from './puppeteer_stealth.js';
let browser_singleton = null;
/**
 * Read the per-action timeout from the application config.
 *
 * @returns {*} The value stored at `crawler.action_timeout_ms`; it is later
 *   passed to `setTimeout` inside the bridge page, so it is presumably a
 *   number of milliseconds — confirm against the config schema.
 */
function get_action_timeout_ms() {
  const { crawler } = get_app_config();
  return crawler.action_timeout_ms;
}
/**
 * Read the configured location of the unpacked extension sources.
 *
 * @returns {*} The value stored at `crawler.crx_src_path` (used by the caller
 *   as a filesystem path to the directory holding `manifest.json`).
 */
function get_crx_src_path() {
  const { crawler } = get_app_config();
  return crawler.crx_src_path;
}
/**
 * Find the first extension id among a list of browser targets.
 *
 * Each target is asked for its URL once, in order. A target contributes an id
 * only when its URL has the form `chrome-extension://<id>/...` (the slash
 * after the id is required).
 *
 * @param {Iterable<{url: function(): string}>} targets - Objects exposing `url()`.
 * @returns {string|null} The extension id, or `null` when no target matches.
 */
function get_extension_id_from_targets(targets) {
  for (const candidate of targets) {
    const target_url = candidate.url();
    if (!target_url) continue;
    if (!target_url.startsWith('chrome-extension://')) continue;

    const [, extension_id] = target_url.match(/^chrome-extension:\/\/([^/]+)\//) || [];
    if (extension_id) return extension_id;
  }
  return null;
}
/**
 * Resolve the id of the loaded extension, waiting for its target if needed.
 *
 * First inspects the targets the browser already has. If none belongs to an
 * extension, waits up to `timeout_ms` for a target whose URL starts with
 * `chrome-extension://`. Any rejection from the wait (including a timeout) is
 * converted to `null`.
 *
 * @param {object} browser - Browser exposing `targets()` and `waitForTarget()`.
 * @param {number} timeout_ms - Passed as the `timeout` option of `waitForTarget`.
 * @returns {Promise<string|null>} The extension id, or `null` if none was found.
 */
async function wait_for_extension_id(browser, timeout_ms) {
  const already_loaded = get_extension_id_from_targets(browser.targets());
  if (already_loaded) return already_loaded;

  const is_extension_target = (candidate) => {
    const candidate_url = candidate.url();
    return typeof candidate_url === 'string' && candidate_url.startsWith('chrome-extension://');
  };

  const late_target = await browser
    .waitForTarget(is_extension_target, { timeout: timeout_ms })
    .catch(() => null);

  return late_target ? get_extension_id_from_targets([late_target]) : null;
}
/**
 * Resolve the configured Chrome binary location to an absolute path.
 *
 * @returns {string} `crawler.chrome_executable_path` passed through
 *   `path.resolve` (relative values resolve against the current working
 *   directory).
 */
function get_chrome_executable_path() {
  const { crawler } = get_app_config();
  return path.resolve(crawler.chrome_executable_path);
}
// Launch currently in progress, shared by every caller that arrives before it
// settles so that only one Chrome process is ever started at a time.
let browser_launch_promise = null;

/**
 * Validate the configured paths and launch Chrome with the unpacked extension.
 *
 * @returns {Promise<object>} The launched Puppeteer browser.
 * @throws {Error} When the Chrome executable or the extension's
 *   `manifest.json` does not exist on disk, or when the launch itself fails.
 */
async function launch_browser_with_extension() {
  const chrome_executable_path = get_chrome_executable_path();
  if (!fs.existsSync(chrome_executable_path)) {
    throw new Error(`Chrome 不存在: ${chrome_executable_path}`);
  }

  const raw_extension_path = path.resolve(get_crx_src_path());
  const manifest_path = path.resolve(raw_extension_path, 'manifest.json');
  if (!fs.existsSync(manifest_path)) {
    throw new Error(`扩展 manifest.json 不存在: ${manifest_path}`);
  }

  const cfg = get_app_config();
  // Chrome switches are given forward slashes even when the path came from Windows.
  const extension_path = raw_extension_path.replace(/\\/g, '/');
  const headless = cfg.crawler.puppeteer_headless;
  const pptr = cfg.crawler.enable_stealth ? get_stealth_puppeteer(puppeteer) : puppeteer;

  return pptr.launch({
    executablePath: chrome_executable_path,
    headless,
    args: [
      '--enable-extensions',
      `--disable-extensions-except=${extension_path}`,
      `--load-extension=${extension_path}`,
      '--no-default-browser-check',
      '--disable-popup-blocking',
      '--disable-dev-shm-usage',
      // NOTE(review): these feature switches look intended to keep Manifest V2
      // extensions loadable — confirm they are honoured by the Chrome build in use.
      '--disable-features=ExtensionManifestV2Disabled,ExtensionManifestV2Unsupported',
      '--enable-features=AllowLegacyMV2Extensions'
    ]
  });
}

/**
 * Return the shared browser, launching it on first use.
 *
 * Concurrent callers that arrive while a launch is in flight all await the
 * same launch instead of each starting their own Chrome process. When the
 * browser later disconnects (closed or crashed) the cached instance is
 * dropped so the next call launches a fresh one.
 *
 * @returns {Promise<object>} The shared Puppeteer browser.
 * @throws {Error} When the Chrome executable or the extension's
 *   `manifest.json` is missing, or when the launch fails. A failed launch is
 *   not cached; the next call retries.
 */
export async function get_or_create_browser() {
  if (browser_singleton) {
    return browser_singleton;
  }

  if (!browser_launch_promise) {
    browser_launch_promise = launch_browser_with_extension()
      .then((browser) => {
        browser_singleton = browser;
        // Forget a dead browser; only clear if it is still the cached one.
        browser.once('disconnected', () => {
          if (browser_singleton === browser) {
            browser_singleton = null;
          }
        });
        return browser;
      })
      .finally(() => {
        browser_launch_promise = null;
      });
  }

  return browser_launch_promise;
}
/**
 * Close a page or browser, discarding any error raised while closing.
 *
 * @param {{close: function(): Promise<void>}} closable - Resource to close.
 * @returns {Promise<void>}
 */
async function close_quietly(closable) {
  try {
    await closable.close();
  } catch (err) {
    // Best-effort cleanup: a failed close must not mask the action's outcome.
  }
}

/**
 * Run one extension action through the extension's bridge page.
 *
 * Opens a new tab in the shared browser, navigates it to
 * `chrome-extension://<id>/bridge/bridge.html`, and calls
 * `window.server_bridge_invoke(action_name, payload)` inside that page with an
 * in-page timeout. The tab is always closed afterwards; the browser is also
 * closed (and the cached instance cleared) when `crawler.auto_close_browser`
 * is set.
 *
 * @param {string} action_name - Action identifier forwarded to the bridge.
 * @param {*} [action_payload] - Payload forwarded to the bridge; a falsy value
 *   is replaced with `{}`.
 * @returns {Promise<*>} Whatever the bridge call resolves to.
 * @throws {Error} When no extension target appears, when the bridge page does
 *   not define `window.server_bridge_invoke`, when the action times out
 *   (`action_timeout`), or when any navigation/evaluation step fails.
 */
export async function invoke_extension_action(action_name, action_payload) {
  const cfg = get_app_config();
  const browser = await get_or_create_browser();
  const started_at = Date.now();
  const log_enabled = cfg.crawler.log_invoke_action;

  // Details are built lazily so nothing is computed when logging is disabled.
  const log = (label, build_details) => {
    if (!log_enabled) return;
    // eslint-disable-next-line no-console
    console.log(label, build_details());
  };

  log('[invoke_extension_action] start', () => ({
    action_name,
    has_payload: !!action_payload,
    keys: action_payload && typeof action_payload === 'object' ? Object.keys(action_payload).slice(0, 20) : []
  }));

  let page = null;
  try {
    page = await browser.newPage();
    if (cfg.crawler.enable_stealth) {
      await apply_page_stealth_defaults(page);
    }
    await page.goto('about:blank');

    // Visit chrome://extensions first to nudge extension initialisation; in
    // some environments the extension target does not show up right away.
    try {
      await page.goto('chrome://extensions/', { waitUntil: 'domcontentloaded' });
    } catch (err) {
      // Failure here is tolerated; the wait below decides whether we can go on.
    }

    const extension_id = await wait_for_extension_id(browser, 15000);
    if (!extension_id) {
      throw new Error(
        '未找到扩展 extension_idChrome 未加载扩展常见原因MV2 被禁用/企业策略未生效/CRX_SRC_PATH 不正确/使用了 headless'
      );
    }

    await page.goto(`chrome-extension://${extension_id}/bridge/bridge.html`, {
      waitUntil: 'domcontentloaded'
    });

    const timeout_ms = get_action_timeout_ms();
    const action_res = await page.evaluate(
      // Runs inside the bridge page, so it must be fully self-contained.
      async (action, payload, timeout) => {
        function with_timeout(pending, limit_ms) {
          return new Promise((settle_ok, settle_err) => {
            const timer_id = setTimeout(() => settle_err(new Error('action_timeout')), limit_ms);
            pending
              .then((value) => {
                clearTimeout(timer_id);
                settle_ok(value);
              })
              .catch((reason) => {
                clearTimeout(timer_id);
                settle_err(reason);
              });
          });
        }
        if (!window.server_bridge_invoke) {
          throw new Error('bridge 未注入 window.server_bridge_invoke');
        }
        return await with_timeout(window.server_bridge_invoke(action, payload), timeout);
      },
      action_name,
      action_payload || {},
      timeout_ms
    );

    log('[invoke_extension_action] ok', () => ({ action_name, cost_ms: Date.now() - started_at }));
    return action_res;
  } catch (err) {
    log('[invoke_extension_action] fail', () => ({
      action_name,
      cost_ms: Date.now() - started_at,
      error: (err && err.message) || String(err)
    }));
    throw err;
  } finally {
    if (page) {
      await close_quietly(page);
    }
    if (cfg.crawler.auto_close_browser) {
      await close_quietly(browser);
      browser_singleton = null;
    }
  }
}

View File

@@ -0,0 +1,27 @@
import puppeteer_extra from 'puppeteer-extra';
import stealth_plugin from 'puppeteer-extra-plugin-stealth';
// 全局只注册一次插件
let inited = false;
/**
 * Register the stealth plugin on `puppeteer_extra` the first time it is needed.
 * Later calls are no-ops thanks to the module-level `inited` flag.
 */
function ensure_stealth_plugin_registered() {
  if (inited) return;
  puppeteer_extra.use(stealth_plugin());
  inited = true;
}

/**
 * Return the `puppeteer-extra` instance with the stealth plugin registered.
 *
 * @param {object} puppeteer_core - The Puppeteer module the caller already
 *   uses; it is stored on `puppeteer_extra.puppeteer`.
 * @returns {object} The shared `puppeteer_extra` instance.
 */
export function get_stealth_puppeteer(puppeteer_core) {
  ensure_stealth_plugin_registered();
  // Intended to make puppeteer-extra reuse the caller's Puppeteer binding
  // (so options such as executablePath keep working).
  // NOTE(review): puppeteer-extra documents `addExtra(puppeteer)` for wrapping
  // a specific Puppeteer instance — confirm this property assignment has an effect.
  puppeteer_extra.puppeteer = puppeteer_core;
  return puppeteer_extra;
}
/**
 * Apply a fixed viewport, user agent and Accept-Language header to a page.
 *
 * The three setters are awaited one after another, in this order:
 * `setViewport`, `setUserAgent`, `setExtraHTTPHeaders`.
 *
 * @param {object} page - Page exposing the three setters above.
 * @returns {Promise<void>}
 */
export async function apply_page_stealth_defaults(page) {
  // Generic, lightweight "fingerprint consistency" settings.
  const viewport = { width: 1366, height: 768, deviceScaleFactor: 1 };
  const user_agent =
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';
  const extra_headers = { 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8' };

  await page.setViewport(viewport);
  await page.setUserAgent(user_agent);
  await page.setExtraHTTPHeaders(extra_headers);
}

View File

@@ -1,148 +0,0 @@
import fs from 'node:fs';
import path from 'node:path';
import puppeteer from 'puppeteer';
import { get_app_config } from '../config/app_config.js';
let browser_singleton = null;
/**
 * Look up the timeout applied to a single extension action.
 *
 * @returns {*} The `crawler.action_timeout_ms` config value (handed to
 *   `setTimeout` in the bridge page, so presumably milliseconds — confirm).
 */
function get_action_timeout_ms() {
  return get_app_config().crawler.action_timeout_ms;
}
/**
 * Look up where the unpacked extension sources live.
 *
 * @returns {*} The `crawler.crx_src_path` config value.
 */
function get_crx_src_path() {
  return get_app_config().crawler.crx_src_path;
}
/**
 * Extract the extension id from the first target that has one.
 *
 * A target qualifies when its URL looks like `chrome-extension://<id>/...`;
 * targets with an empty URL or another scheme are skipped.
 *
 * @param {Iterable<{url: function(): string}>} targets - Objects exposing `url()`.
 * @returns {string|null} The id of the first qualifying target, else `null`.
 */
function get_extension_id_from_targets(targets) {
  const id_pattern = /^chrome-extension:\/\/([^/]+)\//;
  for (const entry of targets) {
    const href = entry.url();
    if (href && href.startsWith('chrome-extension://')) {
      const found = href.match(id_pattern);
      if (found?.[1]) {
        return found[1];
      }
    }
  }
  return null;
}
/**
 * Obtain the extension id, waiting for an extension target when none exists yet.
 *
 * @param {object} browser - Browser exposing `targets()` and `waitForTarget()`.
 * @param {number} timeout_ms - Upper bound handed to `waitForTarget` as `timeout`.
 * @returns {Promise<string|null>} The id, or `null` when the wait fails or the
 *   target's URL carries no id.
 */
async function wait_for_extension_id(browser, timeout_ms) {
  let extension_id = get_extension_id_from_targets(browser.targets());

  if (!extension_id) {
    // Nothing loaded yet: wait for the first chrome-extension:// target.
    // Any rejection (e.g. timeout) is mapped to "no target".
    const target = await browser
      .waitForTarget(
        (t) => {
          const href = t.url();
          return typeof href === 'string' && href.startsWith('chrome-extension://');
        },
        { timeout: timeout_ms }
      )
      .catch(() => null);
    extension_id = target ? get_extension_id_from_targets([target]) : null;
  }

  return extension_id;
}
/**
 * Absolute path of the Chrome binary named in the config.
 *
 * @returns {string} `crawler.chrome_executable_path` after `path.resolve`.
 */
function get_chrome_executable_path() {
  const configured_path = get_app_config().crawler.chrome_executable_path;
  return path.resolve(configured_path);
}
// Launch that is currently in flight, if any. Callers arriving before it
// settles share it instead of starting additional Chrome processes.
let pending_browser_launch = null;

/**
 * Return the shared browser, launching Chrome with the extension on first use.
 *
 * The launch uses a persistent profile at `<cwd>/puppeteer_profile`. Concurrent
 * calls made while the launch is in progress await the same launch. When the
 * browser disconnects (closed or crashed), the cached instance is cleared so a
 * later call starts a new one.
 *
 * @returns {Promise<object>} The shared Puppeteer browser.
 * @throws {Error} When the Chrome executable or the extension's
 *   `manifest.json` does not exist, or when the launch fails. A failed launch
 *   is not cached.
 */
export async function get_or_create_browser() {
  if (browser_singleton) {
    return browser_singleton;
  }
  if (pending_browser_launch) {
    return pending_browser_launch;
  }

  pending_browser_launch = (async () => {
    const chrome_executable_path = get_chrome_executable_path();
    if (!fs.existsSync(chrome_executable_path)) {
      throw new Error(`Chrome 不存在: ${chrome_executable_path}`);
    }

    const raw_extension_path = path.resolve(get_crx_src_path());
    const manifest_path = path.resolve(raw_extension_path, 'manifest.json');
    if (!fs.existsSync(manifest_path)) {
      throw new Error(`扩展 manifest.json 不存在: ${manifest_path}`);
    }

    const cfg = get_app_config();
    // Chrome switches are given forward slashes even for Windows-style paths.
    const extension_path = raw_extension_path.replace(/\\/g, '/');
    const headless = cfg.crawler.puppeteer_headless;
    const user_data_dir = path.resolve(process.cwd(), 'puppeteer_profile');

    const browser = await puppeteer.launch({
      executablePath: chrome_executable_path,
      headless,
      args: [
        `--user-data-dir=${user_data_dir}`,
        '--enable-extensions',
        `--disable-extensions-except=${extension_path}`,
        `--load-extension=${extension_path}`,
        '--no-default-browser-check',
        '--disable-popup-blocking',
        '--disable-dev-shm-usage'
      ]
    });

    browser_singleton = browser;
    // Drop the cache when this browser goes away, unless it was already replaced.
    browser.once('disconnected', () => {
      if (browser_singleton === browser) {
        browser_singleton = null;
      }
    });
    return browser;
  })();

  try {
    return await pending_browser_launch;
  } finally {
    pending_browser_launch = null;
  }
}
/**
 * Run one extension action through the extension's bridge page.
 *
 * Opens a new tab, navigates it to `chrome-extension://<id>/bridge/bridge.html`
 * and calls `window.server_bridge_invoke(action_name, payload)` inside the page
 * with an in-page timeout. The tab is closed in a `finally` block so it is
 * released on every exit path, including failed navigations and timeouts.
 *
 * @param {string} action_name - Action identifier forwarded to the bridge.
 * @param {*} [action_payload] - Payload forwarded to the bridge; a falsy value
 *   is replaced with `{}`.
 * @returns {Promise<*>} Whatever the bridge call resolves to.
 * @throws {Error} When no extension target appears, when the bridge page does
 *   not define `window.server_bridge_invoke`, when the action times out
 *   (`action_timeout`), or when a navigation/evaluation step fails.
 */
export async function invoke_extension_action(action_name, action_payload) {
  const browser = await get_or_create_browser();
  const page = await browser.newPage();
  try {
    await page.goto('about:blank');

    // Visit chrome://extensions first to nudge extension initialisation; in
    // some environments the extension target does not show up right away.
    try {
      await page.goto('chrome://extensions/', { waitUntil: 'domcontentloaded' });
    } catch (err) {
      // Failure here is tolerated; the wait below decides whether we can go on.
    }

    const extension_id = await wait_for_extension_id(browser, 15000);
    if (!extension_id) {
      throw new Error(
        '未找到扩展 extension_idChrome 未加载扩展常见原因MV2 被禁用/企业策略未生效/CRX_SRC_PATH 不正确/使用了 headless'
      );
    }

    const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;
    await page.goto(bridge_url, { waitUntil: 'domcontentloaded' });

    const timeout_ms = get_action_timeout_ms();
    return await page.evaluate(
      // Runs inside the bridge page, so it must be fully self-contained.
      async (action, payload, timeout) => {
        function with_timeout(promise, timeout_ms_inner) {
          return new Promise((resolve, reject) => {
            const timer = setTimeout(() => reject(new Error('action_timeout')), timeout_ms_inner);
            promise
              .then((v) => {
                clearTimeout(timer);
                resolve(v);
              })
              .catch((e) => {
                clearTimeout(timer);
                reject(e);
              });
          });
        }
        if (!window.server_bridge_invoke) {
          throw new Error('bridge 未注入 window.server_bridge_invoke');
        }
        return await with_timeout(window.server_bridge_invoke(action, payload), timeout);
      },
      action_name,
      action_payload || {},
      timeout_ms
    );
  } finally {
    try {
      await page.close();
    } catch (err) {
      // Best-effort cleanup: a failed close must not mask the action's outcome.
    }
  }
}

View File

@@ -1,6 +1,6 @@
import { crawl_run_record } from '../models/index.js';
import { safe_json_stringify } from './json_utils.js';
import { invoke_extension_action } from './puppeteer_runner.js';
import { invoke_extension_action } from './puppeteer/puppeteer_runner.js';
import { persist_amazon_result } from './amazon_persist.js';
export async function execute_action_and_record(params) {
@@ -13,18 +13,15 @@ export async function execute_action_and_record(params) {
let error_message = null;
try {
const result = await invoke_extension_action(action_name, action_payload || {});
console.log( 'invoke_extension_action-start', action_name, action_payload );
const res_invoke = await invoke_extension_action(action_name, action_payload || {});
console.log( 'invoke_extension_action-end', action_name, result );
ok = true;
result_payload = safe_json_stringify(res_invoke);
result_payload = safe_json_stringify(result);
// 按 stage 自动入库(不影响原始 run_record 记录)
await persist_amazon_result(res_invoke.result);
await persist_amazon_result(result);
return res_invoke;
return result;
} catch (err) {
ok = false;
error_message = (err && err.message) || String(err);