This commit is contained in:
张成
2026-03-18 15:25:34 +08:00
parent 5b671d320b
commit 37e39d35b8
17 changed files with 368 additions and 167 deletions

View File

@@ -0,0 +1,110 @@
import {
amazon_product,
amazon_search_item,
amazon_review
} from '../models/index.js';
import { safe_json_stringify } from './json_utils.js';
function build_batch_key(prefix) {
return `${prefix}_${Date.now()}_${Math.random().toString().slice(2, 8)}`;
}
function pick_asin_from_url(url) {
if (!url) return null;
const m = String(url).match(/\/dp\/([A-Z0-9]{8,16})/i);
return m && m[1] ? m[1].toUpperCase() : null;
}
export async function persist_amazon_result(result) {
if (!result || !result.stage) {
return;
}
if (result.stage === 'detail') {
const asin = result.asin || pick_asin_from_url(result.url);
if (!asin) {
return;
}
await amazon_product.upsert({
asin,
url: result.url || '',
title: result.title || null,
price: result.price || null,
sku: result.sku || null,
sku_color: result.sku_color || null,
sku_size: result.sku_size || null,
brand_line: result.brand_line || null,
brand_store_url: result.brand_store_url || null,
ac_badge: result.ac_badge || null,
bestseller_hint: result.bestseller_hint || null,
delivery_hint: result.delivery_hint || null,
social_proof: result.social_proof || null,
sustainability_hint: result.sustainability_hint || null,
rating_stars: result.rating_stars || null,
review_count_text: result.review_count_text || null,
main_image: result.main_image || null,
images_json: safe_json_stringify(result.images || []),
bullets_json: safe_json_stringify(result.bullets || []),
product_info_json: safe_json_stringify(result.product_info || {}),
detail_extra_lines_json: safe_json_stringify(result.detail_extra_lines || [])
});
return;
}
if (result.stage === 'list') {
const batch_key = build_batch_key('list');
const items = Array.isArray(result.items) ? result.items : [];
for (const it of items) {
const asin = it.asin || pick_asin_from_url(it.url);
if (!asin || !it.url) continue;
await amazon_search_item.create({
asin,
url: it.url,
title: it.title || null,
price: it.price || null,
rating: typeof it.rating === 'number' ? it.rating : null,
rating_text: it.rating_text || null,
review_count: typeof it.review_count === 'number' ? it.review_count : null,
review_count_text: it.review_count_text || null,
rank_index: typeof it.index === 'number' ? it.index : null,
batch_key,
batch_total: typeof result.total === 'number' ? result.total : null,
batch_limit: typeof result.limit === 'number' ? result.limit : null
});
}
return;
}
if (result.stage === 'reviews') {
const batch_key = build_batch_key('reviews');
const asin = pick_asin_from_url(result.url);
const items = Array.isArray(result.items) ? result.items : [];
for (const it of items) {
const review_id = it.review_id;
if (!review_id) continue;
const asin_value = asin || pick_asin_from_url(it.url) || pick_asin_from_url(result.url);
await amazon_review.upsert({
asin: asin_value || null,
url: result.url || '',
review_id,
author: it.author || null,
title: it.title || null,
body: it.body || null,
rating_text: it.rating_text || null,
review_date: it.date || null,
review_index: typeof it.index === 'number' ? it.index : null,
batch_key,
batch_total: typeof result.total === 'number' ? result.total : null,
batch_limit: typeof result.limit === 'number' ? result.limit : null
});
}
}
}

View File

@@ -1,30 +1 @@
import cron from 'node-cron';
const task_id_to_cron_job = new Map();
export function stop_all_cron_jobs() {
for (const job of task_id_to_cron_job.values()) {
job.stop();
}
task_id_to_cron_job.clear();
}
export function upsert_cron_job(schedule_task_id, cron_expression, on_tick) {
const existing = task_id_to_cron_job.get(schedule_task_id);
if (existing) {
existing.stop();
task_id_to_cron_job.delete(schedule_task_id);
}
const job = cron.schedule(cron_expression, on_tick, { scheduled: true });
task_id_to_cron_job.set(schedule_task_id, job);
}
export function remove_cron_job(schedule_task_id) {
const job = task_id_to_cron_job.get(schedule_task_id);
if (!job) {
return;
}
job.stop();
task_id_to_cron_job.delete(schedule_task_id);
}
// 已废弃:按需求改为写死定时任务(见 config/cron_tasks.js

View File

@@ -1,4 +1,5 @@
import dotenv from 'dotenv';
import fs from 'node:fs';
import path from 'node:path';
import puppeteer from 'puppeteer';
@@ -30,17 +31,64 @@ function get_extension_id_from_targets(targets) {
return null;
}
async function wait_for_extension_id(browser, timeout_ms) {
const existing = get_extension_id_from_targets(browser.targets());
if (existing) {
return existing;
}
const target = await browser
.waitForTarget((t) => {
const url = t.url();
return typeof url === 'string' && url.startsWith('chrome-extension://');
}, { timeout: timeout_ms })
.catch(() => null);
if (!target) {
return null;
}
return get_extension_id_from_targets([target]);
}
function get_chrome_executable_path() {
// 优先环境变量,方便你后续切换版本
const from_env = process.env.CHROME_EXECUTABLE_PATH;
if (from_env) {
return path.resolve(from_env);
}
// 默认使用项目根目录的 chrome-win/chrome.exe
// 当前进程 cwd 通常是 server/,所以回到上一级
return path.resolve(process.cwd(), '../chrome-win/chrome.exe');
}
export async function get_or_create_browser() {
if (browser_singleton) {
return browser_singleton;
}
const extension_path = path.resolve(get_crx_src_path());
const chrome_executable_path = get_chrome_executable_path();
if (!fs.existsSync(chrome_executable_path)) {
throw new Error(`Chrome 不存在: ${chrome_executable_path}`);
}
const raw_extension_path = path.resolve(get_crx_src_path());
const manifest_path = path.resolve(raw_extension_path, 'manifest.json');
if (!fs.existsSync(manifest_path)) {
throw new Error(`扩展 manifest.json 不存在: ${manifest_path}`);
}
const extension_path = raw_extension_path.replace(/\\/g, '/');
const headless = String(process.env.PUPPETEER_HEADLESS || 'false') === 'true';
const user_data_dir = path.resolve(process.cwd(), 'puppeteer_profile');
browser_singleton = await puppeteer.launch({
executablePath: chrome_executable_path,
headless,
args: [
`--user-data-dir=${user_data_dir}`,
'--enable-extensions',
`--disable-extensions-except=${extension_path}`,
`--load-extension=${extension_path}`,
'--no-default-browser-check',
@@ -58,11 +106,19 @@ export async function invoke_extension_action(action_name, action_payload) {
const page = await browser.newPage();
await page.goto('about:blank');
const targets = await browser.targets();
const extension_id = get_extension_id_from_targets(targets);
// 尝试先打开 chrome://extensions 触发扩展初始化(某些环境下扩展 target 不会立刻出现)
try {
await page.goto('chrome://extensions/', { waitUntil: 'domcontentloaded' });
} catch (err) {
// ignore
}
const extension_id = await wait_for_extension_id(browser, 15000);
if (!extension_id) {
await page.close();
throw new Error('未找到扩展 extension_id请确认 CRX_SRC_PATH 指向 src 且成功加载)');
throw new Error(
'未找到扩展 extension_idChrome 未加载扩展常见原因MV2 被禁用/企业策略未生效/CRX_SRC_PATH 不正确/使用了 headless'
);
}
const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;

View File

@@ -1,33 +1,30 @@
import { schedule_task } from '../models/index.js';
import { safe_json_parse } from './json_utils.js';
import cron from 'node-cron';
import { cron_task_list } from '../config/cron_tasks.js';
import { execute_action_and_record } from './task_executor.js';
import { remove_cron_job, upsert_cron_job } from './cron_manager.js';
export async function reload_all_schedules() {
const rows = await schedule_task.findAll();
const cron_jobs = [];
for (const row of rows) {
if (!row.enabled) {
remove_cron_job(row.id);
continue;
}
upsert_cron_job(row.id, row.cron_expression, async () => {
export function start_all_cron_tasks() {
for (const task of cron_task_list) {
const job = cron.schedule(task.cron_expression, async () => {
try {
await schedule_task.update(
{ last_run_at: new Date() },
{ where: { id: row.id } }
);
await execute_action_and_record({
action_name: row.action_name,
action_payload: safe_json_parse(row.payload_json) || {},
source: 'cron',
schedule_task_id: row.id
action_name: task.action_name,
action_payload: task.action_payload || {},
source: 'cron'
});
} catch (err) {
// cron 执行失败在 crawl_run_record 落库,避免重复抛出影响其它任务
// 失败在 crawl_run_record 落库
}
});
cron_jobs.push(job);
}
}
export function stop_all_cron_tasks() {
for (const job of cron_jobs) {
job.stop();
}
cron_jobs.length = 0;
}

View File

@@ -1,14 +1,10 @@
import { crawl_run_record } from '../models/index.js';
import { safe_json_stringify } from './json_utils.js';
import { invoke_extension_action } from './puppeteer_runner.js';
import { persist_amazon_result } from './amazon_persist.js';
export async function execute_action_and_record(params) {
const {
action_name,
action_payload,
source,
schedule_task_id
} = params;
const { action_name, action_payload, source } = params;
const request_payload = safe_json_stringify(action_payload || {});
@@ -18,8 +14,13 @@ export async function execute_action_and_record(params) {
try {
const result = await invoke_extension_action(action_name, action_payload || {});
ok = true;
result_payload = safe_json_stringify(result);
// 按 stage 自动入库(不影响原始 run_record 记录)
await persist_amazon_result(result);
return result;
} catch (err) {
ok = false;
@@ -32,8 +33,7 @@ export async function execute_action_and_record(params) {
ok,
result_payload,
error_message,
source,
schedule_task_id: schedule_task_id || null
source
});
}
}