1
This commit is contained in:
13
server/.env.example
Normal file
13
server/.env.example
Normal file
@@ -0,0 +1,13 @@
|
||||
# MySQL
|
||||
MYSQL_HOST=127.0.0.1
|
||||
MYSQL_PORT=3306
|
||||
MYSQL_USER=root
|
||||
MYSQL_PASSWORD=
|
||||
MYSQL_DATABASE=ecom_crawl
|
||||
|
||||
# 扩展目录(未打包,含 manifest.json)
|
||||
CRX_SRC_PATH=d:/项目/电商抓取项目/mv2_simple_crx/src
|
||||
|
||||
SERVER_PORT=38080
|
||||
ACTION_TIMEOUT_MS=300000
|
||||
PUPPETEER_HEADLESS=false
|
||||
31
server/app.js
Normal file
31
server/app.js
Normal file
@@ -0,0 +1,31 @@
|
||||
import dotenv from 'dotenv';
import Koa from 'koa';
import body_parser from 'koa-bodyparser';

import { sequelize } from './models/index.js';
import { crawl_router } from './routes/crawl.js';
import { schedule_task_router } from './routes/schedule_task.js';
import { reload_all_schedules } from './services/schedule_loader.js';

dotenv.config();

const app = new Koa();
// Crawl results posted back through the API can be large; raise the JSON limit.
app.use(body_parser({ jsonLimit: '10mb' }));

app.use(crawl_router.routes()).use(crawl_router.allowedMethods());
app.use(schedule_task_router.routes()).use(schedule_task_router.allowedMethods());

// Catch-all 404 for any route not matched above.
app.use(async (ctx) => {
  ctx.status = 404;
  ctx.body = { ok: false, error: 'not_found' };
});

const port = Number(process.env.SERVER_PORT || 38080);

// Fail fast with a readable message when the database is unreachable or the
// schedules cannot be loaded, instead of dying with an unhandled top-level
// rejection and a bare stack trace.
try {
  await sequelize.authenticate();
  await sequelize.sync();
  await reload_all_schedules();
} catch (err) {
  // eslint-disable-next-line no-console
  console.error('startup failed:', err);
  process.exit(1);
}

app.listen(port);
// eslint-disable-next-line no-console
console.log(`server listening on ${port}`);
|
||||
22
server/config/database.js
Normal file
22
server/config/database.js
Normal file
@@ -0,0 +1,22 @@
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
/**
 * Build the Sequelize connection options from environment variables.
 * Every field falls back to a local-development default.
 * @returns {object} options object accepted by the Sequelize constructor
 */
export function get_sequelize_options() {
  const { MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DATABASE } = process.env;

  const connection = {
    host: MYSQL_HOST || '127.0.0.1',
    port: Number(MYSQL_PORT || 3306),
    username: MYSQL_USER || 'root',
    password: MYSQL_PASSWORD || '',
    database: MYSQL_DATABASE || 'ecom_crawl'
  };

  // Map camelCase model attributes to snake_case columns and use
  // created_at / updated_at for the automatic timestamp columns.
  const model_defaults = {
    underscored: true,
    timestamps: true,
    createdAt: 'created_at',
    updatedAt: 'updated_at'
  };

  return {
    ...connection,
    dialect: 'mysql',
    logging: false,
    define: model_defaults,
    timezone: '+08:00'
  };
}
|
||||
26
server/models/crawl_run_record.js
Normal file
26
server/models/crawl_run_record.js
Normal file
@@ -0,0 +1,26 @@
|
||||
import { DataTypes } from 'sequelize';
|
||||
|
||||
/**
 * Define the `crawl_run_record` model: one row per executed crawl action,
 * whether it was triggered through the HTTP API or by a cron schedule.
 * @param {import('sequelize').Sequelize} sequelize - connected instance
 * @returns the initialized model
 */
export function define_crawl_run_record(sequelize) {
  const attributes = {
    id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
    action_name: { type: DataTypes.STRING(128), allowNull: false },
    request_payload: { type: DataTypes.TEXT, allowNull: true, comment: 'JSON 请求体' },
    ok: { type: DataTypes.BOOLEAN, allowNull: false },
    result_payload: { type: DataTypes.TEXT('long'), allowNull: true, comment: 'JSON 结果' },
    error_message: { type: DataTypes.TEXT, allowNull: true },
    source: {
      type: DataTypes.STRING(32),
      allowNull: false,
      defaultValue: 'api',
      comment: 'api | cron'
    },
    // Set only for cron-triggered runs; null for ad-hoc API runs.
    schedule_task_id: { type: DataTypes.BIGINT.UNSIGNED, allowNull: true }
  };

  const options = {
    tableName: 'crawl_run_record',
    indexes: [{ fields: ['action_name'] }, { fields: ['created_at'] }]
  };

  return sequelize.define('crawl_run_record', attributes, options);
}
|
||||
15
server/models/index.js
Normal file
15
server/models/index.js
Normal file
@@ -0,0 +1,15 @@
|
||||
import { Sequelize } from 'sequelize';
import { get_sequelize_options } from '../config/database.js';
import { define_crawl_run_record } from './crawl_run_record.js';
import { define_schedule_task } from './schedule_task.js';

// The Sequelize constructor takes database/user/password positionally;
// everything else rides in the trailing options object.
const { database, username, password, ...connection_options } = get_sequelize_options();

export const sequelize = new Sequelize(database, username, password, connection_options);

// Model initialization.
export const crawl_run_record = define_crawl_run_record(sequelize);
export const schedule_task = define_schedule_task(sequelize);

// A schedule task owns the run records it produced.
schedule_task.hasMany(crawl_run_record, { foreignKey: 'schedule_task_id', as: 'records' });
crawl_run_record.belongsTo(schedule_task, { foreignKey: 'schedule_task_id', as: 'schedule_task' });
|
||||
19
server/models/schedule_task.js
Normal file
19
server/models/schedule_task.js
Normal file
@@ -0,0 +1,19 @@
|
||||
import { DataTypes } from 'sequelize';
|
||||
|
||||
/**
 * Define the `schedule_task` model: a named cron schedule that fires a
 * crawl action with an optional JSON payload.
 * @param {import('sequelize').Sequelize} sequelize - connected instance
 * @returns the initialized model
 */
export function define_schedule_task(sequelize) {
  const attributes = {
    id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
    name: { type: DataTypes.STRING(128), allowNull: false },
    cron_expression: { type: DataTypes.STRING(64), allowNull: false },
    action_name: { type: DataTypes.STRING(128), allowNull: false },
    payload_json: { type: DataTypes.TEXT, allowNull: true },
    enabled: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: true },
    last_run_at: { type: DataTypes.DATE, allowNull: true }
  };

  return sequelize.define('schedule_task', attributes, { tableName: 'schedule_task' });
}
|
||||
2461
server/package-lock.json
generated
Normal file
2461
server/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
20
server/package.json
Normal file
20
server/package.json
Normal file
@@ -0,0 +1,20 @@
|
||||
{
|
||||
"name": "ecom_crawl_server",
|
||||
"version": "1.0.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"start": "node app.js",
|
||||
"db_sync": "node scripts/db_sync.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"@koa/router": "^12.0.1",
|
||||
"dotenv": "^16.4.5",
|
||||
"koa": "^2.15.3",
|
||||
"koa-bodyparser": "^4.4.1",
|
||||
"mysql2": "^3.11.0",
|
||||
"node-cron": "^3.0.3",
|
||||
"puppeteer": "^23.4.1",
|
||||
"sequelize": "^6.37.3"
|
||||
}
|
||||
}
|
||||
26
server/routes/crawl.js
Normal file
26
server/routes/crawl.js
Normal file
@@ -0,0 +1,26 @@
|
||||
import Router from '@koa/router';
import { execute_action_and_record } from '../services/task_executor.js';

export const crawl_router = new Router();

/**
 * POST /api/crawl/run_action
 * Body: { action_name: string, action_payload?: object }
 * Runs the named extension action once and records the attempt.
 */
crawl_router.post('/api/crawl/run_action', async (ctx) => {
  const body = ctx.request.body || {};
  const { action_name, action_payload } = body;

  // Guard clause: a target action is mandatory.
  if (!action_name) {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 action_name' };
    return;
  }

  try {
    const data = await execute_action_and_record({
      action_name,
      action_payload: action_payload || {},
      source: 'api'
    });
    ctx.body = { ok: true, data };
  } catch (err) {
    ctx.status = 500;
    ctx.body = { ok: false, error: (err && err.message) || String(err) };
  }
});
|
||||
73
server/routes/schedule_task.js
Normal file
73
server/routes/schedule_task.js
Normal file
@@ -0,0 +1,73 @@
|
||||
import Router from '@koa/router';
import { schedule_task } from '../models/index.js';
import { safe_json_parse, safe_json_stringify } from '../services/json_utils.js';
import { remove_cron_job } from '../services/cron_manager.js';
import { reload_all_schedules } from '../services/schedule_loader.js';

export const schedule_task_router = new Router();

/**
 * POST /api/schedule_task/create
 * Body: { name, cron_expression, action_name, payload? }
 * Creates an enabled task and registers its cron job immediately.
 * (Previously the job only started after a server restart, because
 * schedules were loaded once at boot.)
 */
schedule_task_router.post('/api/schedule_task/create', async (ctx) => {
  const { name, cron_expression, action_name, payload } = ctx.request.body || {};
  if (!name || !cron_expression || !action_name) {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 name/cron_expression/action_name' };
    return;
  }

  const payload_json = payload ? safe_json_stringify(payload) : null;
  const row = await schedule_task.create({
    name,
    cron_expression,
    action_name,
    payload_json,
    enabled: true
  });

  // Apply the new schedule right away instead of waiting for a restart.
  await reload_all_schedules();

  ctx.body = { ok: true, data: { id: row.id } };
});

/**
 * POST /api/schedule_task/list
 * Returns all tasks, newest first, with payload_json decoded.
 */
schedule_task_router.post('/api/schedule_task/list', async (ctx) => {
  const rows = await schedule_task.findAll({ order: [['id', 'desc']] });
  ctx.body = {
    ok: true,
    data: rows.map((r) => ({
      id: r.id,
      name: r.name,
      cron_expression: r.cron_expression,
      action_name: r.action_name,
      payload: safe_json_parse(r.payload_json),
      enabled: r.enabled,
      last_run_at: r.last_run_at
    }))
  };
});

/**
 * POST /api/schedule_task/set_enabled
 * Body: { id, enabled: boolean }
 * Persists the flag and re-syncs the cron registry so the change takes
 * effect immediately.
 */
schedule_task_router.post('/api/schedule_task/set_enabled', async (ctx) => {
  const { id, enabled } = ctx.request.body || {};
  if (!id || typeof enabled !== 'boolean') {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 id/enabled(boolean)' };
    return;
  }

  const row = await schedule_task.findByPk(id);
  if (!row) {
    ctx.status = 404;
    ctx.body = { ok: false, error: '任务不存在' };
    return;
  }

  row.enabled = enabled;
  await row.save();

  // Start/stop the corresponding cron job without requiring a restart.
  await reload_all_schedules();

  ctx.body = { ok: true };
});

/**
 * POST /api/schedule_task/delete
 * Body: { id }
 * Deletes the row AND unregisters its cron job — previously a deleted
 * task's job kept firing until the process was restarted.
 */
schedule_task_router.post('/api/schedule_task/delete', async (ctx) => {
  const { id } = ctx.request.body || {};
  if (!id) {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 id' };
    return;
  }

  await schedule_task.destroy({ where: { id } });
  // Jobs are keyed by the numeric row id; the request body may carry a string.
  remove_cron_job(Number(id));

  ctx.body = { ok: true };
});
|
||||
6
server/scripts/db_sync.js
Normal file
6
server/scripts/db_sync.js
Normal file
@@ -0,0 +1,6 @@
|
||||
import { sequelize } from '../models/index.js';

// One-shot schema sync used by `npm run db_sync`:
// alter existing tables in place to match the current model definitions.
try {
  await sequelize.sync({ alter: true });
  // eslint-disable-next-line no-console
  console.log('sync ok');
} catch (err) {
  // Report failure with a non-zero exit code instead of dying on an
  // unhandled top-level rejection.
  // eslint-disable-next-line no-console
  console.error('sync failed:', err);
  process.exitCode = 1;
} finally {
  // Always release the connection pool so the process exits promptly,
  // even when sync fails (the original skipped close() on failure).
  await sequelize.close();
}
|
||||
30
server/services/cron_manager.js
Normal file
30
server/services/cron_manager.js
Normal file
@@ -0,0 +1,30 @@
|
||||
import cron from 'node-cron';

// Registry of live cron jobs, keyed by schedule_task id.
const task_id_to_cron_job = new Map();

/** Stop every registered job and empty the registry. */
export function stop_all_cron_jobs() {
  task_id_to_cron_job.forEach((job) => job.stop());
  task_id_to_cron_job.clear();
}

/**
 * Register (or re-register) the job for one schedule task.
 * Any previous job under the same id is stopped first, so a task can
 * never run on two schedules at once.
 * @param {number} schedule_task_id - row id of the task
 * @param {string} cron_expression - node-cron expression
 * @param {Function} on_tick - callback fired on each tick
 */
export function upsert_cron_job(schedule_task_id, cron_expression, on_tick) {
  remove_cron_job(schedule_task_id);
  const job = cron.schedule(cron_expression, on_tick, { scheduled: true });
  task_id_to_cron_job.set(schedule_task_id, job);
}

/** Stop and forget the job for one schedule task; no-op when absent. */
export function remove_cron_job(schedule_task_id) {
  const job = task_id_to_cron_job.get(schedule_task_id);
  if (!job) {
    return;
  }
  job.stop();
  task_id_to_cron_job.delete(schedule_task_id);
}
|
||||
18
server/services/json_utils.js
Normal file
18
server/services/json_utils.js
Normal file
@@ -0,0 +1,18 @@
|
||||
/**
 * JSON.stringify that never throws: when serialization fails (circular
 * references, BigInt, …) it returns a JSON error envelope instead.
 * @param {*} value - anything
 * @returns {string} JSON text (possibly the error envelope)
 */
export function safe_json_stringify(value) {
  try {
    return JSON.stringify(value);
  } catch (err) {
    const fallback = { error: 'json_stringify_failed', message: String(err) };
    return JSON.stringify(fallback);
  }
}
|
||||
|
||||
/**
 * JSON.parse that never throws: returns null for empty/missing input and
 * for any string that is not valid JSON.
 * @param {?string} text - JSON text, or null/undefined/''
 * @returns {*} the parsed value, or null
 */
export function safe_json_parse(text) {
  const is_blank = text === null || text === undefined || text === '';
  if (is_blank) {
    return null;
  }
  try {
    return JSON.parse(text);
  } catch (err) {
    return null;
  }
}
|
||||
102
server/services/puppeteer_runner.js
Normal file
102
server/services/puppeteer_runner.js
Normal file
@@ -0,0 +1,102 @@
|
||||
import dotenv from 'dotenv';
|
||||
import path from 'node:path';
|
||||
import puppeteer from 'puppeteer';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
let browser_singleton = null;
|
||||
|
||||
// Per-action timeout in milliseconds; defaults to 5 minutes.
function get_action_timeout_ms() {
  const raw = process.env.ACTION_TIMEOUT_MS;
  return Number(raw || 300000);
}
|
||||
|
||||
// Path to the unpacked extension source directory (must contain
// manifest.json), taken from CRX_SRC_PATH. Throws when unset because the
// runner cannot work without an extension to load.
function get_crx_src_path() {
  const crx_src_path = process.env.CRX_SRC_PATH;
  if (crx_src_path) {
    return crx_src_path;
  }
  throw new Error('缺少环境变量 CRX_SRC_PATH');
}
|
||||
|
||||
/**
 * Scan browser targets for one whose URL is a chrome-extension:// page and
 * extract its extension id.
 * @param {Array<{url: () => string}>} targets - puppeteer browser targets
 * @returns {?string} the extension id, or null when none is loaded
 */
function get_extension_id_from_targets(targets) {
  const id_pattern = /^chrome-extension:\/\/([^/]+)\//;
  for (const target of targets) {
    const url = target.url();
    if (!url || !url.startsWith('chrome-extension://')) {
      continue;
    }
    const match = url.match(id_pattern);
    if (match && match[1]) {
      return match[1];
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Lazily launch (and cache) a Chromium instance with the crawl extension
 * loaded from CRX_SRC_PATH; subsequent calls reuse the same browser.
 * @returns {Promise<import('puppeteer').Browser>}
 */
export async function get_or_create_browser() {
  if (browser_singleton) {
    return browser_singleton;
  }

  const extension_path = path.resolve(get_crx_src_path());
  // Only the literal string 'true' turns on headless mode.
  const headless = String(process.env.PUPPETEER_HEADLESS || 'false') === 'true';

  const launch_args = [
    `--disable-extensions-except=${extension_path}`,
    `--load-extension=${extension_path}`,
    '--no-default-browser-check',
    '--disable-popup-blocking',
    '--disable-dev-shm-usage'
  ];

  browser_singleton = await puppeteer.launch({ headless, args: launch_args });
  return browser_singleton;
}
|
||||
|
||||
/**
 * Drive one extension action through the bridge page.
 *
 * Opens a fresh tab, locates the loaded extension, navigates to its
 * bridge.html (which is expected to expose `window.server_bridge_invoke`),
 * invokes the named action with the payload, and returns the result.
 *
 * Fix: the tab is now closed in a `finally` block — the original code only
 * closed it on the success path and on the missing-extension branch, so any
 * failure in `goto`/`evaluate` leaked a page per call.
 *
 * @param {string} action_name - action id understood by the extension
 * @param {object} [action_payload] - JSON-serializable arguments
 * @returns {Promise<*>} whatever the bridge resolves with
 * @throws when the extension is not loaded, the bridge is not injected,
 *         or the action exceeds ACTION_TIMEOUT_MS
 */
export async function invoke_extension_action(action_name, action_payload) {
  const browser = await get_or_create_browser();
  const page = await browser.newPage();

  try {
    await page.goto('about:blank');

    const targets = await browser.targets();
    const extension_id = get_extension_id_from_targets(targets);
    if (!extension_id) {
      throw new Error('未找到扩展 extension_id(请确认 CRX_SRC_PATH 指向 src 且成功加载)');
    }

    const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;
    await page.goto(bridge_url, { waitUntil: 'domcontentloaded' });

    const timeout_ms = get_action_timeout_ms();
    // Runs inside the bridge page; only JSON-serializable values cross the
    // evaluate boundary in either direction.
    return await page.evaluate(
      async (action, payload, timeout) => {
        function with_timeout(promise, timeout_ms_inner) {
          return new Promise((resolve, reject) => {
            const timer = setTimeout(() => reject(new Error('action_timeout')), timeout_ms_inner);
            promise
              .then((v) => {
                clearTimeout(timer);
                resolve(v);
              })
              .catch((e) => {
                clearTimeout(timer);
                reject(e);
              });
          });
        }

        if (!window.server_bridge_invoke) {
          throw new Error('bridge 未注入 window.server_bridge_invoke');
        }

        return await with_timeout(window.server_bridge_invoke(action, payload), timeout);
      },
      action_name,
      action_payload || {},
      timeout_ms
    );
  } finally {
    // Guarantee tab cleanup on every exit path.
    await page.close();
  }
}
|
||||
33
server/services/schedule_loader.js
Normal file
33
server/services/schedule_loader.js
Normal file
@@ -0,0 +1,33 @@
|
||||
import { schedule_task } from '../models/index.js';
|
||||
import { safe_json_parse } from './json_utils.js';
|
||||
import { execute_action_and_record } from './task_executor.js';
|
||||
import { remove_cron_job, upsert_cron_job } from './cron_manager.js';
|
||||
|
||||
/**
 * (Re)register cron jobs for every schedule task in the database.
 * Disabled tasks get their job removed; enabled tasks get their job
 * (re)created with the current cron expression and payload.
 */
export async function reload_all_schedules() {
  const rows = await schedule_task.findAll();

  for (const row of rows) {
    if (!row.enabled) {
      remove_cron_job(row.id);
      continue;
    }

    upsert_cron_job(row.id, row.cron_expression, async () => {
      try {
        await schedule_task.update(
          { last_run_at: new Date() },
          { where: { id: row.id } }
        );

        await execute_action_and_record({
          action_name: row.action_name,
          action_payload: safe_json_parse(row.payload_json) || {},
          source: 'cron',
          schedule_task_id: row.id
        });
      } catch (err) {
        // Action failures are persisted in crawl_run_record, and we don't
        // rethrow so one failing task can't break the others — but the
        // original fully silent swallow also hid DB/record-keeping
        // failures, so at least log what went wrong.
        // eslint-disable-next-line no-console
        console.error(`cron task ${row.id} (${row.action_name}) failed:`, err);
      }
    });
  }
}
|
||||
39
server/services/task_executor.js
Normal file
39
server/services/task_executor.js
Normal file
@@ -0,0 +1,39 @@
|
||||
import { crawl_run_record } from '../models/index.js';
|
||||
import { safe_json_stringify } from './json_utils.js';
|
||||
import { invoke_extension_action } from './puppeteer_runner.js';
|
||||
|
||||
/**
 * Run one extension action and persist a crawl_run_record row describing
 * the attempt — success or failure — before returning or rethrowing.
 *
 * @param {object} params
 * @param {string} params.action_name - action id understood by the extension
 * @param {object} [params.action_payload] - JSON-serializable arguments
 * @param {'api'|'cron'} params.source - what triggered the run
 * @param {number} [params.schedule_task_id] - owning task for cron runs
 * @returns {Promise<*>} the action result on success
 * @throws the original action error on failure
 */
export async function execute_action_and_record(params) {
  const { action_name, action_payload, source, schedule_task_id } = params;

  const request_payload = safe_json_stringify(action_payload || {});

  let ok = false;
  let result_payload = null;
  let error_message = null;

  try {
    const result = await invoke_extension_action(action_name, action_payload || {});
    ok = true;
    result_payload = safe_json_stringify(result);
    return result;
  } catch (err) {
    error_message = (err && err.message) || String(err);
    throw err;
  } finally {
    // Record-keeping must not mask the action's own outcome: if this insert
    // threw inside `finally`, it would replace the error (or even the return
    // value) the caller actually cares about. Log and move on instead.
    try {
      await crawl_run_record.create({
        action_name,
        request_payload,
        ok,
        result_payload,
        error_message,
        source,
        schedule_task_id: schedule_task_id || null
      });
    } catch (record_err) {
      // eslint-disable-next-line no-console
      console.error('failed to persist crawl_run_record:', record_err);
    }
  }
}
|
||||
Reference in New Issue
Block a user