1
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,2 +1,4 @@
|
||||
server/node_modules/*
|
||||
.env
|
||||
server/puppeteer_profile/*
|
||||
.env
|
||||
chrome-win/*
|
||||
@@ -11,3 +11,5 @@ CRX_SRC_PATH=d:/项目/电商抓取项目/mv2_simple_crx/src
|
||||
SERVER_PORT=38080
|
||||
ACTION_TIMEOUT_MS=300000
|
||||
PUPPETEER_HEADLESS=false
|
||||
# 可选:指定浏览器路径(不填默认用 ../chrome-win/chrome.exe)
|
||||
CHROME_EXECUTABLE_PATH=
|
||||
|
||||
40
server/README.md
Normal file
40
server/README.md
Normal file
@@ -0,0 +1,40 @@
|
||||
## server(Koa + Sequelize + MySQL)
|
||||
|
||||
### 功能
|
||||
- `POST /api/crawl/run_action`:服务端调用扩展 action,返回结果,并按 `stage` 自动入库
|
||||
- **定时任务写死配置**:`config/cron_tasks.js`(不走数据库)
|
||||
|
||||
### 运行
|
||||
1. 安装依赖
|
||||
|
||||
```bash
|
||||
cd server
|
||||
npm install
|
||||
```
|
||||
|
||||
2. 配置环境变量
|
||||
|
||||
```bash
|
||||
copy .env.example .env
|
||||
```
|
||||
|
||||
3. 初始化/更新表结构
|
||||
|
||||
```bash
|
||||
node scripts/db_sync.js
|
||||
```
|
||||
|
||||
4. 启动
|
||||
|
||||
```bash
|
||||
node app.js
|
||||
```
|
||||
|
||||
### 定时任务
|
||||
编辑 `config/cron_tasks.js` 的 `cron_task_list`,重启服务即可生效。
|
||||
|
||||
### 落库表
|
||||
- `crawl_run_record`:所有 action 调用的原始请求/响应
|
||||
- `amazon_product`:`stage=detail` 详情
|
||||
- `amazon_search_item`:`stage=list` 列表 item
|
||||
- `amazon_review`:`stage=reviews` 评论
|
||||
@@ -4,8 +4,7 @@ import body_parser from 'koa-bodyparser';
|
||||
|
||||
import { sequelize } from './models/index.js';
|
||||
import { crawl_router } from './routes/crawl.js';
|
||||
import { schedule_task_router } from './routes/schedule_task.js';
|
||||
import { reload_all_schedules } from './services/schedule_loader.js';
|
||||
import { start_all_cron_tasks } from './services/schedule_loader.js';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
@@ -13,7 +12,6 @@ const app = new Koa();
|
||||
app.use(body_parser({ jsonLimit: '10mb' }));
|
||||
|
||||
app.use(crawl_router.routes()).use(crawl_router.allowedMethods());
|
||||
app.use(schedule_task_router.routes()).use(schedule_task_router.allowedMethods());
|
||||
|
||||
app.use(async (ctx) => {
|
||||
ctx.status = 404;
|
||||
@@ -24,7 +22,7 @@ const port = Number(process.env.SERVER_PORT || 38080);
|
||||
|
||||
await sequelize.authenticate();
|
||||
await sequelize.sync();
|
||||
await reload_all_schedules();
|
||||
start_all_cron_tasks();
|
||||
|
||||
app.listen(port);
|
||||
// eslint-disable-next-line no-console
|
||||
|
||||
14
server/config/cron_tasks.js
Normal file
14
server/config/cron_tasks.js
Normal file
@@ -0,0 +1,14 @@
|
||||
/**
|
||||
* 写死定时任务配置(不走数据库)
|
||||
* cron_expression 参考 node-cron
|
||||
*/
|
||||
|
||||
export const cron_task_list = [
|
||||
// 示例:每 6 小时跑一次列表抓取
|
||||
// {
|
||||
// name: 'amazon_search_list_every_6h',
|
||||
// cron_expression: '0 */6 * * *',
|
||||
// action_name: 'amazon_search_list',
|
||||
// action_payload: { keyword: '午餐包', limit: 100 }
|
||||
// }
|
||||
];
|
||||
39
server/models/amazon_product.js
Normal file
39
server/models/amazon_product.js
Normal file
@@ -0,0 +1,39 @@
|
||||
import { DataTypes } from 'sequelize';
|
||||
|
||||
export function define_amazon_product(sequelize) {
|
||||
return sequelize.define(
|
||||
'amazon_product',
|
||||
{
|
||||
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
|
||||
asin: { type: DataTypes.STRING(32), allowNull: false },
|
||||
url: { type: DataTypes.TEXT, allowNull: false },
|
||||
title: { type: DataTypes.TEXT, allowNull: true },
|
||||
price: { type: DataTypes.STRING(64), allowNull: true },
|
||||
sku: { type: DataTypes.STRING(256), allowNull: true },
|
||||
sku_color: { type: DataTypes.STRING(128), allowNull: true },
|
||||
sku_size: { type: DataTypes.STRING(128), allowNull: true },
|
||||
brand_line: { type: DataTypes.TEXT, allowNull: true },
|
||||
brand_store_url: { type: DataTypes.TEXT, allowNull: true },
|
||||
ac_badge: { type: DataTypes.STRING(128), allowNull: true },
|
||||
bestseller_hint: { type: DataTypes.TEXT, allowNull: true },
|
||||
delivery_hint: { type: DataTypes.TEXT, allowNull: true },
|
||||
social_proof: { type: DataTypes.TEXT, allowNull: true },
|
||||
sustainability_hint: { type: DataTypes.TEXT, allowNull: true },
|
||||
rating_stars: { type: DataTypes.STRING(64), allowNull: true },
|
||||
review_count_text: { type: DataTypes.STRING(64), allowNull: true },
|
||||
main_image: { type: DataTypes.TEXT, allowNull: true },
|
||||
images_json: { type: DataTypes.TEXT('long'), allowNull: true },
|
||||
bullets_json: { type: DataTypes.TEXT('long'), allowNull: true },
|
||||
product_info_json: { type: DataTypes.TEXT('long'), allowNull: true },
|
||||
detail_extra_lines_json: { type: DataTypes.TEXT('long'), allowNull: true }
|
||||
},
|
||||
{
|
||||
tableName: 'amazon_product',
|
||||
indexes: [
|
||||
{ unique: true, fields: ['asin'] },
|
||||
{ fields: ['created_at'] },
|
||||
{ fields: ['updated_at'] }
|
||||
]
|
||||
}
|
||||
);
|
||||
}
|
||||
31
server/models/amazon_review.js
Normal file
31
server/models/amazon_review.js
Normal file
@@ -0,0 +1,31 @@
|
||||
import { DataTypes } from 'sequelize';
|
||||
|
||||
export function define_amazon_review(sequelize) {
|
||||
return sequelize.define(
|
||||
'amazon_review',
|
||||
{
|
||||
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
|
||||
asin: { type: DataTypes.STRING(32), allowNull: true },
|
||||
url: { type: DataTypes.TEXT, allowNull: false },
|
||||
review_id: { type: DataTypes.STRING(64), allowNull: false },
|
||||
author: { type: DataTypes.STRING(256), allowNull: true },
|
||||
title: { type: DataTypes.TEXT, allowNull: true },
|
||||
body: { type: DataTypes.TEXT('long'), allowNull: true },
|
||||
rating_text: { type: DataTypes.STRING(64), allowNull: true },
|
||||
review_date: { type: DataTypes.STRING(128), allowNull: true },
|
||||
review_index: { type: DataTypes.INTEGER, allowNull: true },
|
||||
batch_key: { type: DataTypes.STRING(64), allowNull: false },
|
||||
batch_total: { type: DataTypes.INTEGER, allowNull: true },
|
||||
batch_limit: { type: DataTypes.INTEGER, allowNull: true }
|
||||
},
|
||||
{
|
||||
tableName: 'amazon_review',
|
||||
indexes: [
|
||||
{ unique: true, fields: ['review_id'] },
|
||||
{ fields: ['asin'] },
|
||||
{ fields: ['batch_key'] },
|
||||
{ fields: ['created_at'] }
|
||||
]
|
||||
}
|
||||
);
|
||||
}
|
||||
30
server/models/amazon_search_item.js
Normal file
30
server/models/amazon_search_item.js
Normal file
@@ -0,0 +1,30 @@
|
||||
import { DataTypes } from 'sequelize';
|
||||
|
||||
export function define_amazon_search_item(sequelize) {
|
||||
return sequelize.define(
|
||||
'amazon_search_item',
|
||||
{
|
||||
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
|
||||
asin: { type: DataTypes.STRING(32), allowNull: false },
|
||||
url: { type: DataTypes.TEXT, allowNull: false },
|
||||
title: { type: DataTypes.TEXT, allowNull: true },
|
||||
price: { type: DataTypes.STRING(64), allowNull: true },
|
||||
rating: { type: DataTypes.FLOAT, allowNull: true },
|
||||
rating_text: { type: DataTypes.STRING(64), allowNull: true },
|
||||
review_count: { type: DataTypes.INTEGER, allowNull: true },
|
||||
review_count_text: { type: DataTypes.STRING(64), allowNull: true },
|
||||
rank_index: { type: DataTypes.INTEGER, allowNull: true, comment: '列表中的 index 字段' },
|
||||
batch_key: { type: DataTypes.STRING(64), allowNull: false, comment: '一次列表抓取的批次 key' },
|
||||
batch_total: { type: DataTypes.INTEGER, allowNull: true },
|
||||
batch_limit: { type: DataTypes.INTEGER, allowNull: true }
|
||||
},
|
||||
{
|
||||
tableName: 'amazon_search_item',
|
||||
indexes: [
|
||||
{ fields: ['asin'] },
|
||||
{ fields: ['batch_key'] },
|
||||
{ fields: ['created_at'] }
|
||||
]
|
||||
}
|
||||
);
|
||||
}
|
||||
@@ -15,8 +15,7 @@ export function define_crawl_run_record(sequelize) {
|
||||
allowNull: false,
|
||||
defaultValue: 'api',
|
||||
comment: 'api | cron'
|
||||
},
|
||||
schedule_task_id: { type: DataTypes.BIGINT.UNSIGNED, allowNull: true }
|
||||
}
|
||||
},
|
||||
{
|
||||
tableName: 'crawl_run_record',
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
import { Sequelize } from 'sequelize';
|
||||
import { get_sequelize_options } from '../config/database.js';
|
||||
import { define_crawl_run_record } from './crawl_run_record.js';
|
||||
import { define_schedule_task } from './schedule_task.js';
|
||||
import { define_amazon_product } from './amazon_product.js';
|
||||
import { define_amazon_search_item } from './amazon_search_item.js';
|
||||
import { define_amazon_review } from './amazon_review.js';
|
||||
|
||||
const sequelize_options = get_sequelize_options();
|
||||
const { database, username, password, ...rest } = sequelize_options;
|
||||
@@ -9,7 +11,7 @@ const { database, username, password, ...rest } = sequelize_options;
|
||||
export const sequelize = new Sequelize(database, username, password, rest);
|
||||
|
||||
export const crawl_run_record = define_crawl_run_record(sequelize);
|
||||
export const schedule_task = define_schedule_task(sequelize);
|
||||
|
||||
schedule_task.hasMany(crawl_run_record, { foreignKey: 'schedule_task_id', as: 'records' });
|
||||
crawl_run_record.belongsTo(schedule_task, { foreignKey: 'schedule_task_id', as: 'schedule_task' });
|
||||
export const amazon_product = define_amazon_product(sequelize);
|
||||
export const amazon_search_item = define_amazon_search_item(sequelize);
|
||||
export const amazon_review = define_amazon_review(sequelize);
|
||||
|
||||
@@ -1,19 +1 @@
|
||||
import { DataTypes } from 'sequelize';
|
||||
|
||||
export function define_schedule_task(sequelize) {
|
||||
return sequelize.define(
|
||||
'schedule_task',
|
||||
{
|
||||
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
|
||||
name: { type: DataTypes.STRING(128), allowNull: false },
|
||||
cron_expression: { type: DataTypes.STRING(64), allowNull: false },
|
||||
action_name: { type: DataTypes.STRING(128), allowNull: false },
|
||||
payload_json: { type: DataTypes.TEXT, allowNull: true },
|
||||
enabled: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: true },
|
||||
last_run_at: { type: DataTypes.DATE, allowNull: true }
|
||||
},
|
||||
{
|
||||
tableName: 'schedule_task'
|
||||
}
|
||||
);
|
||||
}
|
||||
// 已废弃:按需求改为写死定时任务(见 config/cron_tasks.js)
|
||||
|
||||
@@ -1,73 +1 @@
|
||||
import Router from '@koa/router';
|
||||
import { schedule_task } from '../models/index.js';
|
||||
import { safe_json_parse, safe_json_stringify } from '../services/json_utils.js';
|
||||
|
||||
export const schedule_task_router = new Router();
|
||||
|
||||
schedule_task_router.post('/api/schedule_task/create', async (ctx) => {
|
||||
const { name, cron_expression, action_name, payload } = ctx.request.body || {};
|
||||
if (!name || !cron_expression || !action_name) {
|
||||
ctx.status = 400;
|
||||
ctx.body = { ok: false, error: '缺少 name/cron_expression/action_name' };
|
||||
return;
|
||||
}
|
||||
|
||||
const payload_json = payload ? safe_json_stringify(payload) : null;
|
||||
const row = await schedule_task.create({
|
||||
name,
|
||||
cron_expression,
|
||||
action_name,
|
||||
payload_json,
|
||||
enabled: true
|
||||
});
|
||||
|
||||
ctx.body = { ok: true, data: { id: row.id } };
|
||||
});
|
||||
|
||||
schedule_task_router.post('/api/schedule_task/list', async (ctx) => {
|
||||
const rows = await schedule_task.findAll({ order: [['id', 'desc']] });
|
||||
ctx.body = {
|
||||
ok: true,
|
||||
data: rows.map((r) => ({
|
||||
id: r.id,
|
||||
name: r.name,
|
||||
cron_expression: r.cron_expression,
|
||||
action_name: r.action_name,
|
||||
payload: safe_json_parse(r.payload_json),
|
||||
enabled: r.enabled,
|
||||
last_run_at: r.last_run_at
|
||||
}))
|
||||
};
|
||||
});
|
||||
|
||||
schedule_task_router.post('/api/schedule_task/set_enabled', async (ctx) => {
|
||||
const { id, enabled } = ctx.request.body || {};
|
||||
if (!id || typeof enabled !== 'boolean') {
|
||||
ctx.status = 400;
|
||||
ctx.body = { ok: false, error: '缺少 id/enabled(boolean)' };
|
||||
return;
|
||||
}
|
||||
|
||||
const row = await schedule_task.findByPk(id);
|
||||
if (!row) {
|
||||
ctx.status = 404;
|
||||
ctx.body = { ok: false, error: '任务不存在' };
|
||||
return;
|
||||
}
|
||||
|
||||
row.enabled = enabled;
|
||||
await row.save();
|
||||
ctx.body = { ok: true };
|
||||
});
|
||||
|
||||
schedule_task_router.post('/api/schedule_task/delete', async (ctx) => {
|
||||
const { id } = ctx.request.body || {};
|
||||
if (!id) {
|
||||
ctx.status = 400;
|
||||
ctx.body = { ok: false, error: '缺少 id' };
|
||||
return;
|
||||
}
|
||||
|
||||
await schedule_task.destroy({ where: { id } });
|
||||
ctx.body = { ok: true };
|
||||
});
|
||||
// 已废弃:按需求改为写死定时任务(见 config/cron_tasks.js)
|
||||
|
||||
110
server/services/amazon_persist.js
Normal file
110
server/services/amazon_persist.js
Normal file
@@ -0,0 +1,110 @@
|
||||
import {
|
||||
amazon_product,
|
||||
amazon_search_item,
|
||||
amazon_review
|
||||
} from '../models/index.js';
|
||||
import { safe_json_stringify } from './json_utils.js';
|
||||
|
||||
function build_batch_key(prefix) {
|
||||
return `${prefix}_${Date.now()}_${Math.random().toString().slice(2, 8)}`;
|
||||
}
|
||||
|
||||
function pick_asin_from_url(url) {
|
||||
if (!url) return null;
|
||||
const m = String(url).match(/\/dp\/([A-Z0-9]{8,16})/i);
|
||||
return m && m[1] ? m[1].toUpperCase() : null;
|
||||
}
|
||||
|
||||
export async function persist_amazon_result(result) {
|
||||
if (!result || !result.stage) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (result.stage === 'detail') {
|
||||
const asin = result.asin || pick_asin_from_url(result.url);
|
||||
if (!asin) {
|
||||
return;
|
||||
}
|
||||
|
||||
await amazon_product.upsert({
|
||||
asin,
|
||||
url: result.url || '',
|
||||
title: result.title || null,
|
||||
price: result.price || null,
|
||||
sku: result.sku || null,
|
||||
sku_color: result.sku_color || null,
|
||||
sku_size: result.sku_size || null,
|
||||
brand_line: result.brand_line || null,
|
||||
brand_store_url: result.brand_store_url || null,
|
||||
ac_badge: result.ac_badge || null,
|
||||
bestseller_hint: result.bestseller_hint || null,
|
||||
delivery_hint: result.delivery_hint || null,
|
||||
social_proof: result.social_proof || null,
|
||||
sustainability_hint: result.sustainability_hint || null,
|
||||
rating_stars: result.rating_stars || null,
|
||||
review_count_text: result.review_count_text || null,
|
||||
main_image: result.main_image || null,
|
||||
images_json: safe_json_stringify(result.images || []),
|
||||
bullets_json: safe_json_stringify(result.bullets || []),
|
||||
product_info_json: safe_json_stringify(result.product_info || {}),
|
||||
detail_extra_lines_json: safe_json_stringify(result.detail_extra_lines || [])
|
||||
});
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (result.stage === 'list') {
|
||||
const batch_key = build_batch_key('list');
|
||||
const items = Array.isArray(result.items) ? result.items : [];
|
||||
|
||||
for (const it of items) {
|
||||
const asin = it.asin || pick_asin_from_url(it.url);
|
||||
if (!asin || !it.url) continue;
|
||||
|
||||
await amazon_search_item.create({
|
||||
asin,
|
||||
url: it.url,
|
||||
title: it.title || null,
|
||||
price: it.price || null,
|
||||
rating: typeof it.rating === 'number' ? it.rating : null,
|
||||
rating_text: it.rating_text || null,
|
||||
review_count: typeof it.review_count === 'number' ? it.review_count : null,
|
||||
review_count_text: it.review_count_text || null,
|
||||
rank_index: typeof it.index === 'number' ? it.index : null,
|
||||
batch_key,
|
||||
batch_total: typeof result.total === 'number' ? result.total : null,
|
||||
batch_limit: typeof result.limit === 'number' ? result.limit : null
|
||||
});
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (result.stage === 'reviews') {
|
||||
const batch_key = build_batch_key('reviews');
|
||||
const asin = pick_asin_from_url(result.url);
|
||||
const items = Array.isArray(result.items) ? result.items : [];
|
||||
|
||||
for (const it of items) {
|
||||
const review_id = it.review_id;
|
||||
if (!review_id) continue;
|
||||
|
||||
const asin_value = asin || pick_asin_from_url(it.url) || pick_asin_from_url(result.url);
|
||||
|
||||
await amazon_review.upsert({
|
||||
asin: asin_value || null,
|
||||
url: result.url || '',
|
||||
review_id,
|
||||
author: it.author || null,
|
||||
title: it.title || null,
|
||||
body: it.body || null,
|
||||
rating_text: it.rating_text || null,
|
||||
review_date: it.date || null,
|
||||
review_index: typeof it.index === 'number' ? it.index : null,
|
||||
batch_key,
|
||||
batch_total: typeof result.total === 'number' ? result.total : null,
|
||||
batch_limit: typeof result.limit === 'number' ? result.limit : null
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,30 +1 @@
|
||||
import cron from 'node-cron';
|
||||
|
||||
const task_id_to_cron_job = new Map();
|
||||
|
||||
export function stop_all_cron_jobs() {
|
||||
for (const job of task_id_to_cron_job.values()) {
|
||||
job.stop();
|
||||
}
|
||||
task_id_to_cron_job.clear();
|
||||
}
|
||||
|
||||
export function upsert_cron_job(schedule_task_id, cron_expression, on_tick) {
|
||||
const existing = task_id_to_cron_job.get(schedule_task_id);
|
||||
if (existing) {
|
||||
existing.stop();
|
||||
task_id_to_cron_job.delete(schedule_task_id);
|
||||
}
|
||||
|
||||
const job = cron.schedule(cron_expression, on_tick, { scheduled: true });
|
||||
task_id_to_cron_job.set(schedule_task_id, job);
|
||||
}
|
||||
|
||||
export function remove_cron_job(schedule_task_id) {
|
||||
const job = task_id_to_cron_job.get(schedule_task_id);
|
||||
if (!job) {
|
||||
return;
|
||||
}
|
||||
job.stop();
|
||||
task_id_to_cron_job.delete(schedule_task_id);
|
||||
}
|
||||
// 已废弃:按需求改为写死定时任务(见 config/cron_tasks.js)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import dotenv from 'dotenv';
|
||||
import fs from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import puppeteer from 'puppeteer';
|
||||
|
||||
@@ -30,17 +31,64 @@ function get_extension_id_from_targets(targets) {
|
||||
return null;
|
||||
}
|
||||
|
||||
async function wait_for_extension_id(browser, timeout_ms) {
|
||||
const existing = get_extension_id_from_targets(browser.targets());
|
||||
if (existing) {
|
||||
return existing;
|
||||
}
|
||||
|
||||
const target = await browser
|
||||
.waitForTarget((t) => {
|
||||
const url = t.url();
|
||||
return typeof url === 'string' && url.startsWith('chrome-extension://');
|
||||
}, { timeout: timeout_ms })
|
||||
.catch(() => null);
|
||||
|
||||
if (!target) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return get_extension_id_from_targets([target]);
|
||||
}
|
||||
|
||||
function get_chrome_executable_path() {
|
||||
// 优先环境变量,方便你后续切换版本
|
||||
const from_env = process.env.CHROME_EXECUTABLE_PATH;
|
||||
if (from_env) {
|
||||
return path.resolve(from_env);
|
||||
}
|
||||
|
||||
// 默认使用项目根目录的 chrome-win/chrome.exe
|
||||
// 当前进程 cwd 通常是 server/,所以回到上一级
|
||||
return path.resolve(process.cwd(), '../chrome-win/chrome.exe');
|
||||
}
|
||||
|
||||
export async function get_or_create_browser() {
|
||||
if (browser_singleton) {
|
||||
return browser_singleton;
|
||||
}
|
||||
|
||||
const extension_path = path.resolve(get_crx_src_path());
|
||||
const chrome_executable_path = get_chrome_executable_path();
|
||||
if (!fs.existsSync(chrome_executable_path)) {
|
||||
throw new Error(`Chrome 不存在: ${chrome_executable_path}`);
|
||||
}
|
||||
|
||||
const raw_extension_path = path.resolve(get_crx_src_path());
|
||||
const manifest_path = path.resolve(raw_extension_path, 'manifest.json');
|
||||
if (!fs.existsSync(manifest_path)) {
|
||||
throw new Error(`扩展 manifest.json 不存在: ${manifest_path}`);
|
||||
}
|
||||
|
||||
const extension_path = raw_extension_path.replace(/\\/g, '/');
|
||||
const headless = String(process.env.PUPPETEER_HEADLESS || 'false') === 'true';
|
||||
const user_data_dir = path.resolve(process.cwd(), 'puppeteer_profile');
|
||||
|
||||
browser_singleton = await puppeteer.launch({
|
||||
executablePath: chrome_executable_path,
|
||||
headless,
|
||||
args: [
|
||||
`--user-data-dir=${user_data_dir}`,
|
||||
'--enable-extensions',
|
||||
`--disable-extensions-except=${extension_path}`,
|
||||
`--load-extension=${extension_path}`,
|
||||
'--no-default-browser-check',
|
||||
@@ -58,11 +106,19 @@ export async function invoke_extension_action(action_name, action_payload) {
|
||||
const page = await browser.newPage();
|
||||
await page.goto('about:blank');
|
||||
|
||||
const targets = await browser.targets();
|
||||
const extension_id = get_extension_id_from_targets(targets);
|
||||
// 尝试先打开 chrome://extensions 触发扩展初始化(某些环境下扩展 target 不会立刻出现)
|
||||
try {
|
||||
await page.goto('chrome://extensions/', { waitUntil: 'domcontentloaded' });
|
||||
} catch (err) {
|
||||
// ignore
|
||||
}
|
||||
|
||||
const extension_id = await wait_for_extension_id(browser, 15000);
|
||||
if (!extension_id) {
|
||||
await page.close();
|
||||
throw new Error('未找到扩展 extension_id(请确认 CRX_SRC_PATH 指向 src 且成功加载)');
|
||||
throw new Error(
|
||||
'未找到扩展 extension_id:Chrome 未加载扩展(常见原因:MV2 被禁用/企业策略未生效/CRX_SRC_PATH 不正确/使用了 headless)'
|
||||
);
|
||||
}
|
||||
|
||||
const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;
|
||||
|
||||
@@ -1,33 +1,30 @@
|
||||
import { schedule_task } from '../models/index.js';
|
||||
import { safe_json_parse } from './json_utils.js';
|
||||
import cron from 'node-cron';
|
||||
import { cron_task_list } from '../config/cron_tasks.js';
|
||||
import { execute_action_and_record } from './task_executor.js';
|
||||
import { remove_cron_job, upsert_cron_job } from './cron_manager.js';
|
||||
|
||||
export async function reload_all_schedules() {
|
||||
const rows = await schedule_task.findAll();
|
||||
const cron_jobs = [];
|
||||
|
||||
for (const row of rows) {
|
||||
if (!row.enabled) {
|
||||
remove_cron_job(row.id);
|
||||
continue;
|
||||
}
|
||||
|
||||
upsert_cron_job(row.id, row.cron_expression, async () => {
|
||||
export function start_all_cron_tasks() {
|
||||
for (const task of cron_task_list) {
|
||||
const job = cron.schedule(task.cron_expression, async () => {
|
||||
try {
|
||||
await schedule_task.update(
|
||||
{ last_run_at: new Date() },
|
||||
{ where: { id: row.id } }
|
||||
);
|
||||
|
||||
await execute_action_and_record({
|
||||
action_name: row.action_name,
|
||||
action_payload: safe_json_parse(row.payload_json) || {},
|
||||
source: 'cron',
|
||||
schedule_task_id: row.id
|
||||
action_name: task.action_name,
|
||||
action_payload: task.action_payload || {},
|
||||
source: 'cron'
|
||||
});
|
||||
} catch (err) {
|
||||
// cron 执行失败已在 crawl_run_record 落库,避免重复抛出影响其它任务
|
||||
// 失败会在 crawl_run_record 落库
|
||||
}
|
||||
});
|
||||
|
||||
cron_jobs.push(job);
|
||||
}
|
||||
}
|
||||
|
||||
export function stop_all_cron_tasks() {
|
||||
for (const job of cron_jobs) {
|
||||
job.stop();
|
||||
}
|
||||
cron_jobs.length = 0;
|
||||
}
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
import { crawl_run_record } from '../models/index.js';
|
||||
import { safe_json_stringify } from './json_utils.js';
|
||||
import { invoke_extension_action } from './puppeteer_runner.js';
|
||||
import { persist_amazon_result } from './amazon_persist.js';
|
||||
|
||||
export async function execute_action_and_record(params) {
|
||||
const {
|
||||
action_name,
|
||||
action_payload,
|
||||
source,
|
||||
schedule_task_id
|
||||
} = params;
|
||||
const { action_name, action_payload, source } = params;
|
||||
|
||||
const request_payload = safe_json_stringify(action_payload || {});
|
||||
|
||||
@@ -18,8 +14,13 @@ export async function execute_action_and_record(params) {
|
||||
|
||||
try {
|
||||
const result = await invoke_extension_action(action_name, action_payload || {});
|
||||
|
||||
ok = true;
|
||||
result_payload = safe_json_stringify(result);
|
||||
|
||||
// 按 stage 自动入库(不影响原始 run_record 记录)
|
||||
await persist_amazon_result(result);
|
||||
|
||||
return result;
|
||||
} catch (err) {
|
||||
ok = false;
|
||||
@@ -32,8 +33,7 @@ export async function execute_action_and_record(params) {
|
||||
ok,
|
||||
result_payload,
|
||||
error_message,
|
||||
source,
|
||||
schedule_task_id: schedule_task_id || null
|
||||
source
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user