This commit is contained in:
张成
2026-03-18 14:18:41 +08:00
parent 54341f0a0b
commit 5b671d320b
21 changed files with 4404 additions and 42 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
server/node_modules/*
.env

View File

@@ -86,7 +86,7 @@ export function injected_amazon_switch_language(params) {
export function injected_amazon_search_list(params) {
const start_url = params && params.url ? String(params.url) : location.href;
const category_keyword = params && params.category_keyword ? String(params.category_keyword).trim() : '';
const category_keyword = params && params.category_keyword ? String(params.category_keyword).trim() : '';
const sort_by = params && params.sort_by ? String(params.sort_by).trim() : '';
function pick_number(text) {
@@ -106,15 +106,15 @@ export function injected_amazon_search_list(params) {
if (mm) return Math.round(parseFloat(mm[1]) * 1000000);
const digits = raw.replace(/[^\d]/g, '');
return digits ? Number(digits) : null;
}
}
function abs_url(href) {
try {
return new URL(href, location.origin).toString();
} catch (_) {
return href;
}
}
function abs_url(href) {
try {
return new URL(href, location.origin).toString();
} catch (_) {
return href;
}
}
function parse_asin_from_url(url) {
if (!url || typeof url !== 'string') return null;
@@ -161,20 +161,20 @@ export function injected_amazon_search_list(params) {
const n = pick_int(review_count_text);
return Number.isFinite(n) ? n : null;
})();
items.push({
items.push({
index: idx + 1,
asin: asin || parse_asin_from_url(item_url),
title,
title,
url: item_url,
price,
price,
rating,
rating_text,
review_count,
review_count_text,
});
});
return items;
}
});
});
return items;
}
function pick_next_url() {
const a = document.querySelector('a.s-pagination-next');
@@ -191,7 +191,7 @@ export function injected_amazon_search_list(params) {
return {
start_url,
href: location.href,
category_keyword,
category_keyword,
sort_by,
total: items.length,
items,
@@ -273,18 +273,90 @@ export function injected_amazon_product_detail() {
if (t) bullets.push(t);
});
const variants = {};
document.querySelectorAll('[id^="variation_"]').forEach((block) => {
const key = block.id.replace(/^variation_/, '') || block.id;
const sel =
block.querySelector('.selection') ||
block.querySelector('.a-button-selected .a-button-text') ||
block.querySelector('[class*="dropdown"]');
if (sel) {
const v = norm(sel.textContent);
if (v) variants[key] = v;
/** Whether a variation id suffix names the color dimension (only color and size are extracted; other dimensions are not collected). */
function is_sku_color_key(k) {
  const lowered = String(k).toLowerCase();
  if (lowered === 'color' || lowered === 'color_name') {
    return true;
  }
  return lowered.endsWith('_color_name');
}
/** Whether a variation id suffix names the size dimension (case-insensitive). */
function is_sku_size_key(k) {
  const lowered = String(k).toLowerCase();
  if (lowered === 'size' || lowered === 'size_name') {
    return true;
  }
  return lowered.endsWith('_size_name');
}
/** 变体维度:颜色 / 尺寸 各为选项列表 */
const sku = { color: [], size: [] };
const twister_plus_root = document.querySelector('#twister-plus-desktop-twister-container');
if (twister_plus_root) {
const color_row = twister_plus_root.querySelector('#inline-twister-row-color_name');
if (color_row) {
const seen_c = new Set();
color_row.querySelectorAll('li').forEach((li) => {
const img = li.querySelector('img[alt]');
if (!img) return;
const v = norm(img.getAttribute('alt'));
if (v && !seen_c.has(v)) {
seen_c.add(v);
sku.color.push(v);
}
});
}
});
if (!sku.color.length) {
const dim = twister_plus_root.querySelector('#inline-twister-expanded-dimension-text-color_name');
const v = dim && norm(dim.textContent);
if (v) sku.color.push(v);
}
const size_row = twister_plus_root.querySelector('#inline-twister-row-size_name');
if (size_row) {
const seen_s = new Set();
size_row.querySelectorAll('li').forEach((li) => {
const el = li.querySelector('.swatch-title-text-display, .swatch-title-text-single-line');
const v = el ? norm(el.textContent) : null;
if (v && !seen_s.has(v)) {
seen_s.add(v);
sku.size.push(v);
}
});
}
if (!sku.size.length) {
const dim = twister_plus_root.querySelector('#inline-twister-expanded-dimension-text-size_name');
const v = dim && norm(dim.textContent);
if (v) sku.size.push(v);
}
} else {
let cur_color = null;
let cur_size = null;
document.querySelectorAll('[id^="variation_"]').forEach((block) => {
const key = block.id.replace(/^variation_/, '') || block.id;
if (!is_sku_color_key(key) && !is_sku_size_key(key)) return;
const sel =
block.querySelector('.selection') ||
block.querySelector('.a-button-selected .a-button-text') ||
block.querySelector('[class*="dropdown"]');
if (!sel) return;
const v = norm(sel.textContent);
if (!v) return;
if (is_sku_color_key(key) && !cur_color) cur_color = v;
if (is_sku_size_key(key) && !cur_size) cur_size = v;
});
document.querySelectorAll('div.inline-twister-row[id^="inline-twister-row-"]').forEach((row) => {
const id = row.id || '';
const key = id.replace(/^inline-twister-row-/, '') || id;
if (!is_sku_color_key(key) && !is_sku_size_key(key)) return;
const selected =
row.querySelector('.a-button-selected .swatch-title-text-display') ||
row.querySelector('.a-button-selected .a-button-text') ||
row.querySelector('.a-button-selected');
if (!selected) return;
const v = norm(selected.textContent);
if (!v) return;
if (is_sku_color_key(key) && !cur_color) cur_color = v;
if (is_sku_size_key(key) && !cur_size) cur_size = v;
});
if (cur_color) sku.color.push(cur_color);
if (cur_size) sku.size.push(cur_size);
}
let delivery_hint = null;
const del = document.querySelector(
@@ -292,11 +364,6 @@ export function injected_amazon_product_detail() {
);
if (del) delivery_hint = norm(del.innerText).slice(0, 500);
let sku = null;
Object.keys(product_info).forEach((k) => {
if (/^sku$/i.test(k) || /item model|型号|part number|制造商型号/i.test(k)) sku = product_info[k];
});
const images = [];
const seen_img = new Set();
function add_img(u) {
@@ -338,7 +405,6 @@ export function injected_amazon_product_detail() {
product_info,
detail_extra_lines,
bullets,
variants,
delivery_hint,
sku,
images,
@@ -492,8 +558,8 @@ export function amazon_search_list(data, sendResponse) {
chrome.tabs.update(tab.id, { url: next_url, active: true }, () => {
if (chrome.runtime.lastError) return reject_nav(new Error(chrome.runtime.lastError.message));
resolve_nav(true);
});
});
});
});
await wait_tab_complete(tab.id);
}
const injected_result_list = await tab.execute_script(
@@ -518,10 +584,10 @@ export function amazon_search_list(data, sendResponse) {
total: unique_map.size,
items: Array.from(unique_map.values()).slice(0, limit),
};
const result = {
code: 0,
status: true,
message: 'ok',
const result = {
code: 0,
status: true,
message: 'ok',
data: { tab_id: tab.id, url, category_keyword, sort_by: sort_by || 'featured', limit, result: list_result },
};
send_action('amazon_search_list', result);
@@ -610,7 +676,7 @@ export function amazon_set_language(data, sendResponse) {
data: null,
documentURI: AMAZON_HOME_FOR_LANG,
});
reject(err);
reject(err);
}
});
}
@@ -686,7 +752,8 @@ export function amazon_product_detail(data, sendResponse) {
return run_pdp_action(data && data.product_url, injected_amazon_product_detail, [], 'amazon_product_detail', sendResponse);
}
amazon_product_detail.desc = 'Amazon 商品详情标题、价格、品牌、SKU、要点、变体、配送摘要等';
amazon_product_detail.desc =
'Amazon 商品详情标题、价格、品牌、sku{color[],size[]}、要点、配送摘要等)';
amazon_product_detail.params = {
product_url: {
type: 'string',

View File

@@ -0,0 +1,11 @@
<!doctype html>
<html lang="zh-CN">
<head>
<meta charset="utf-8" />
<title>server_bridge</title>
</head>
<body>
<!-- After Puppeteer opens this page it calls window.server_bridge_invoke through evaluate (same message protocol as the background page) -->
<script src="bridge.js"></script>
</body>
</html>

View File

@@ -0,0 +1,32 @@
/**
 * Bridge script: the server-side Puppeteer talks to the extension background through this page
 * (equivalent to a UI page calling chrome.runtime.sendMessage).
 * If heavy computation is ever needed, a Web Worker could be driven from here via postMessage;
 * for now, talking to the background directly is enough for commands and results.
 */
(() => {
  /**
   * Send `{ action, data }` to the background and settle with its reply.
   * Resolves with `res.data` when `res.ok` is truthy; rejects with an Error otherwise.
   */
  const server_bridge_invoke = (action, data) =>
    new Promise((resolve, reject) => {
      if (!action) {
        reject(new Error('缺少 action'));
        return;
      }
      const message = { action, data: data || {} };
      chrome.runtime.sendMessage(message, (res) => {
        // Delivery failures are checked first, before looking at the reply itself.
        const last_error = chrome.runtime.lastError;
        if (last_error) {
          reject(new Error(last_error.message));
        } else if (!res) {
          reject(new Error('background 无响应'));
        } else if (res.ok) {
          resolve(res.data);
        } else {
          reject(new Error(res.error || 'action 失败'));
        }
      });
    });

  window.server_bridge_invoke = server_bridge_invoke;
})();

13
server/.env.example Normal file
View File

@@ -0,0 +1,13 @@
# MySQL
MYSQL_HOST=127.0.0.1
MYSQL_PORT=3306
MYSQL_USER=root
MYSQL_PASSWORD=
MYSQL_DATABASE=ecom_crawl
# 扩展目录(未打包,含 manifest.json)
CRX_SRC_PATH=d:/项目/电商抓取项目/mv2_simple_crx/src
SERVER_PORT=38080
ACTION_TIMEOUT_MS=300000
PUPPETEER_HEADLESS=false

31
server/app.js Normal file
View File

@@ -0,0 +1,31 @@
import dotenv from 'dotenv';
import Koa from 'koa';
import body_parser from 'koa-bodyparser';
import { sequelize } from './models/index.js';
import { crawl_router } from './routes/crawl.js';
import { schedule_task_router } from './routes/schedule_task.js';
import { reload_all_schedules } from './services/schedule_loader.js';
// Load .env before SERVER_PORT is read below.
dotenv.config();

const port = Number(process.env.SERVER_PORT || 38080);

/** Fallback middleware: any request that no router answered gets a JSON 404. */
async function respond_not_found(ctx) {
  ctx.status = 404;
  ctx.body = { ok: false, error: 'not_found' };
}

const app = new Koa();
app.use(body_parser({ jsonLimit: '10mb' }));
for (const router of [crawl_router, schedule_task_router]) {
  app.use(router.routes()).use(router.allowedMethods());
}
app.use(respond_not_found);

// Startup order: check the DB connection, sync models, register cron jobs, then listen.
await sequelize.authenticate();
await sequelize.sync();
await reload_all_schedules();

app.listen(port);
// eslint-disable-next-line no-console
console.log(`server listening on ${port}`);

22
server/config/database.js Normal file
View File

@@ -0,0 +1,22 @@
import dotenv from 'dotenv';
// Populate process.env from the local .env file before the options below are read.
dotenv.config();
/**
 * Build the Sequelize constructor options from MYSQL_* environment variables,
 * falling back to local-development defaults when a variable is unset or empty.
 *
 * @returns {object} Connection settings plus model defaults (snake_case columns,
 *   created_at / updated_at timestamps) and a fixed +08:00 timezone.
 */
export function get_sequelize_options() {
  const env = process.env;

  const model_defaults = {
    underscored: true,
    timestamps: true,
    createdAt: 'created_at',
    updatedAt: 'updated_at'
  };

  const options = {
    host: env.MYSQL_HOST || '127.0.0.1',
    port: Number(env.MYSQL_PORT || 3306),
    username: env.MYSQL_USER || 'root',
    password: env.MYSQL_PASSWORD || '',
    database: env.MYSQL_DATABASE || 'ecom_crawl',
    dialect: 'mysql',
    logging: false,
    define: model_defaults,
    timezone: '+08:00'
  };
  return options;
}

View File

@@ -0,0 +1,26 @@
import { DataTypes } from 'sequelize';
/**
 * Register the `crawl_run_record` model: one row per executed action, holding the
 * request, the outcome and where the run came from.
 *
 * @param {object} sequelize - Instance on which `define` is called.
 * @returns {*} Whatever `sequelize.define` returns (the model).
 */
export function define_crawl_run_record(sequelize) {
  const columns = {
    id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
    action_name: { type: DataTypes.STRING(128), allowNull: false },
    // Column comment text: "JSON request body".
    request_payload: { type: DataTypes.TEXT, allowNull: true, comment: 'JSON 请求体' },
    ok: { type: DataTypes.BOOLEAN, allowNull: false },
    // Column comment text: "JSON result".
    result_payload: { type: DataTypes.TEXT('long'), allowNull: true, comment: 'JSON 结果' },
    error_message: { type: DataTypes.TEXT, allowNull: true },
    source: {
      type: DataTypes.STRING(32),
      allowNull: false,
      defaultValue: 'api',
      comment: 'api | cron'
    },
    schedule_task_id: { type: DataTypes.BIGINT.UNSIGNED, allowNull: true }
  };

  const model_options = {
    tableName: 'crawl_run_record',
    indexes: [{ fields: ['action_name'] }, { fields: ['created_at'] }]
  };

  return sequelize.define('crawl_run_record', columns, model_options);
}

15
server/models/index.js Normal file
View File

@@ -0,0 +1,15 @@
import { Sequelize } from 'sequelize';
import { get_sequelize_options } from '../config/database.js';
import { define_crawl_run_record } from './crawl_run_record.js';
import { define_schedule_task } from './schedule_task.js';
// Credentials are passed positionally; every other option goes in the options object.
const { database, username, password, ...connection_options } = get_sequelize_options();

export const sequelize = new Sequelize(database, username, password, connection_options);

export const crawl_run_record = define_crawl_run_record(sequelize);
export const schedule_task = define_schedule_task(sequelize);

// One schedule_task has many run records, linked through crawl_run_record.schedule_task_id.
const task_record_key = 'schedule_task_id';
schedule_task.hasMany(crawl_run_record, { foreignKey: task_record_key, as: 'records' });
crawl_run_record.belongsTo(schedule_task, { foreignKey: task_record_key, as: 'schedule_task' });

View File

@@ -0,0 +1,19 @@
import { DataTypes } from 'sequelize';
/**
 * Register the `schedule_task` model: a named cron expression bound to an action
 * and an optional JSON payload.
 *
 * @param {object} sequelize - Instance on which `define` is called.
 * @returns {*} Whatever `sequelize.define` returns (the model).
 */
export function define_schedule_task(sequelize) {
  const columns = {
    id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
    name: { type: DataTypes.STRING(128), allowNull: false },
    cron_expression: { type: DataTypes.STRING(64), allowNull: false },
    action_name: { type: DataTypes.STRING(128), allowNull: false },
    payload_json: { type: DataTypes.TEXT, allowNull: true },
    enabled: { type: DataTypes.BOOLEAN, allowNull: false, defaultValue: true },
    last_run_at: { type: DataTypes.DATE, allowNull: true }
  };

  const model_options = { tableName: 'schedule_task' };

  return sequelize.define('schedule_task', columns, model_options);
}

2461
server/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

20
server/package.json Normal file
View File

@@ -0,0 +1,20 @@
{
"name": "ecom_crawl_server",
"version": "1.0.0",
"private": true,
"type": "module",
"scripts": {
"start": "node app.js",
"db_sync": "node scripts/db_sync.js"
},
"dependencies": {
"@koa/router": "^12.0.1",
"dotenv": "^16.4.5",
"koa": "^2.15.3",
"koa-bodyparser": "^4.4.1",
"mysql2": "^3.11.0",
"node-cron": "^3.0.3",
"puppeteer": "^23.4.1",
"sequelize": "^6.37.3"
}
}

26
server/routes/crawl.js Normal file
View File

@@ -0,0 +1,26 @@
import Router from '@koa/router';
import { execute_action_and_record } from '../services/task_executor.js';
export const crawl_router = new Router();

/**
 * POST /api/crawl/run_action
 * Body: `{ action_name, action_payload? }`. Runs the action through
 * execute_action_and_record with source 'api'.
 * Replies 400 when action_name is missing, 500 with the error message when the
 * run throws, otherwise `{ ok: true, data }`.
 */
async function handle_run_action(ctx) {
  const body = ctx.request.body || {};
  const { action_name, action_payload } = body;

  if (!action_name) {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 action_name' };
    return;
  }

  let data;
  try {
    data = await execute_action_and_record({
      action_name,
      action_payload: action_payload || {},
      source: 'api'
    });
  } catch (err) {
    ctx.status = 500;
    ctx.body = { ok: false, error: (err && err.message) || String(err) };
    return;
  }
  ctx.body = { ok: true, data };
}

crawl_router.post('/api/crawl/run_action', handle_run_action);

View File

@@ -0,0 +1,73 @@
import Router from '@koa/router';
import { schedule_task } from '../models/index.js';
import { safe_json_parse, safe_json_stringify } from '../services/json_utils.js';
export const schedule_task_router = new Router();

/**
 * POST /api/schedule_task/create
 * Body: `{ name, cron_expression, action_name, payload? }`. Inserts an enabled task
 * and replies with its id; replies 400 when a required field is missing.
 * NOTE(review): this handler only writes the row; nothing here registers a cron job
 * for it - confirm when new tasks are expected to start running.
 */
async function handle_create_schedule_task(ctx) {
  const body = ctx.request.body || {};
  const { name, cron_expression, action_name, payload } = body;

  const has_required_fields = Boolean(name && cron_expression && action_name);
  if (!has_required_fields) {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 name/cron_expression/action_name' };
    return;
  }

  const created = await schedule_task.create({
    name,
    cron_expression,
    action_name,
    payload_json: payload ? safe_json_stringify(payload) : null,
    enabled: true
  });
  ctx.body = { ok: true, data: { id: created.id } };
}

schedule_task_router.post('/api/schedule_task/create', handle_create_schedule_task);
/**
 * POST /api/schedule_task/list
 * Replies with every task, newest id first, with payload_json decoded into `payload`.
 */
schedule_task_router.post('/api/schedule_task/list', async (ctx) => {
  const to_view = (row) => {
    const view = {
      id: row.id,
      name: row.name,
      cron_expression: row.cron_expression,
      action_name: row.action_name,
      payload: safe_json_parse(row.payload_json),
      enabled: row.enabled,
      last_run_at: row.last_run_at
    };
    return view;
  };

  const rows = await schedule_task.findAll({ order: [['id', 'desc']] });
  const data = [];
  for (const row of rows) {
    data.push(to_view(row));
  }
  ctx.body = { ok: true, data };
});
/**
 * POST /api/schedule_task/set_enabled
 * Body: `{ id, enabled }` where `enabled` must be a boolean. Replies 400 on bad
 * input, 404 when the task does not exist, otherwise `{ ok: true }`.
 * NOTE(review): only the stored flag changes here; no cron job is started or
 * stopped by this handler - confirm that is intended.
 */
async function handle_set_schedule_task_enabled(ctx) {
  const { id, enabled } = ctx.request.body || {};

  const is_valid = Boolean(id) && typeof enabled === 'boolean';
  if (!is_valid) {
    ctx.status = 400;
    ctx.body = { ok: false, error: '缺少 id/enabled(boolean)' };
    return;
  }

  const row = await schedule_task.findByPk(id);
  if (row) {
    row.enabled = enabled;
    await row.save();
    ctx.body = { ok: true };
    return;
  }

  ctx.status = 404;
  ctx.body = { ok: false, error: '任务不存在' };
}

schedule_task_router.post('/api/schedule_task/set_enabled', handle_set_schedule_task_enabled);
/**
 * POST /api/schedule_task/delete
 * Body: `{ id }`. Deletes the matching row; replies `{ ok: true }` whether or not a
 * row matched, and 400 when id is missing.
 * NOTE(review): no cron job is removed by this handler - confirm that is intended.
 */
async function handle_delete_schedule_task(ctx) {
  const body = ctx.request.body || {};
  const id = body.id;

  if (id) {
    await schedule_task.destroy({ where: { id } });
    ctx.body = { ok: true };
    return;
  }

  ctx.status = 400;
  ctx.body = { ok: false, error: '缺少 id' };
}

schedule_task_router.post('/api/schedule_task/delete', handle_delete_schedule_task);

View File

@@ -0,0 +1,6 @@
import { sequelize } from '../models/index.js';
// One-off maintenance script: sync the models with `alter: true`, report success,
// then close the connection.
await sequelize.sync({ alter: true });
// eslint-disable-next-line no-console
console.log('sync ok');
// Close the connection once the sync has been reported.
await sequelize.close();

View File

@@ -0,0 +1,30 @@
import cron from 'node-cron';
// schedule_task id -> the cron job currently registered for it.
const task_id_to_cron_job = new Map();

/** Stop every registered cron job and forget all of them. */
export function stop_all_cron_jobs() {
  task_id_to_cron_job.forEach((job) => {
    job.stop();
  });
  task_id_to_cron_job.clear();
}
/**
 * Register (or replace) the cron job of one schedule task.
 *
 * The replacement job is created before the current one is touched: if
 * `cron.schedule` throws (for example on an invalid expression), the job that was
 * already running for this task stays registered instead of being lost.
 *
 * @param {*} schedule_task_id - Key under which the job is tracked.
 * @param {string} cron_expression - Expression handed to `cron.schedule`.
 * @param {Function} on_tick - Callback run on every tick.
 * @throws Whatever `cron.schedule` throws; the registry is left unchanged in that case.
 */
export function upsert_cron_job(schedule_task_id, cron_expression, on_tick) {
  const job = cron.schedule(cron_expression, on_tick, { scheduled: true });

  const existing = task_id_to_cron_job.get(schedule_task_id);
  if (existing) {
    existing.stop();
  }
  task_id_to_cron_job.set(schedule_task_id, job);
}
/**
 * Stop and unregister the cron job of one schedule task.
 * Does nothing when no job is registered under that id.
 */
export function remove_cron_job(schedule_task_id) {
  const registered = task_id_to_cron_job.get(schedule_task_id);
  if (registered) {
    registered.stop();
    task_id_to_cron_job.delete(schedule_task_id);
  }
}

View File

@@ -0,0 +1,18 @@
/**
 * JSON.stringify that does not throw: when serialization fails, the result is the
 * JSON text of `{ error: 'json_stringify_failed', message }` instead.
 *
 * @param {*} value - Anything to serialize.
 * @returns {string|undefined} JSON text (undefined when JSON.stringify itself yields undefined).
 */
export function safe_json_stringify(value) {
  let serialized;
  try {
    serialized = JSON.stringify(value);
  } catch (err) {
    const failure = { error: 'json_stringify_failed', message: String(err) };
    serialized = JSON.stringify(failure);
  }
  return serialized;
}
/**
 * JSON.parse that does not throw.
 *
 * @param {*} text - JSON text; null, undefined and '' count as "no value".
 * @returns {*} The parsed value, or null for missing or unparsable input.
 */
export function safe_json_parse(text) {
  const is_missing = text == null || text === '';
  if (is_missing) {
    return null;
  }

  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch (_err) {
    parsed = null;
  }
  return parsed;
}

View File

@@ -0,0 +1,102 @@
import dotenv from 'dotenv';
import path from 'node:path';
import puppeteer from 'puppeteer';
// Populate process.env from .env so the getters below see the configured values.
dotenv.config();
// Shared browser handle; stays null until get_or_create_browser() assigns it.
let browser_singleton = null;
/**
 * Timeout applied to one extension action, in milliseconds.
 *
 * Reads ACTION_TIMEOUT_MS and falls back to 300000 when it is unset, empty,
 * non-numeric, zero or negative. Without that guard a bad value (NaN, 0, -1) would
 * reach setTimeout and make every action time out immediately.
 *
 * @returns {number} A finite, positive number of milliseconds.
 */
function get_action_timeout_ms() {
  const default_timeout_ms = 300000;
  const configured = Number(process.env.ACTION_TIMEOUT_MS || default_timeout_ms);
  if (Number.isFinite(configured) && configured > 0) {
    return configured;
  }
  return default_timeout_ms;
}
/**
 * Directory of the extension, taken from CRX_SRC_PATH.
 *
 * @returns {string} The configured path, as-is.
 * @throws {Error} When CRX_SRC_PATH is unset or empty.
 */
function get_crx_src_path() {
  const { CRX_SRC_PATH: configured_path } = process.env;
  if (configured_path) {
    return configured_path;
  }
  throw new Error('缺少环境变量 CRX_SRC_PATH');
}
/**
 * Find the id of the first target whose URL looks like `chrome-extension://<id>/...`.
 *
 * @param {Iterable<{url: Function}>} targets - Objects exposing a `url()` method.
 * @returns {string|null} The extension id, or null when no target matches.
 */
function get_extension_id_from_targets(targets) {
  const extension_url_pattern = /^chrome-extension:\/\/([^/]+)\//;
  for (const target of targets) {
    const target_url = target.url();
    // Targets without a URL are skipped.
    const match = target_url ? target_url.match(extension_url_pattern) : null;
    if (match && match[1]) {
      return match[1];
    }
  }
  return null;
}
/**
 * Return the shared Puppeteer browser (with the extension loaded), launching it on
 * first use.
 *
 * The in-flight launch promise is what gets cached, so concurrent callers share one
 * launch instead of each starting a browser. The cache is cleared when the launch
 * fails or the browser later disconnects, so the next call starts a fresh one.
 *
 * @returns {Promise<object>} The browser instance.
 * @throws {Error} When CRX_SRC_PATH is missing or the launch fails.
 */
export async function get_or_create_browser() {
  if (browser_singleton) {
    return browser_singleton;
  }

  const extension_path = path.resolve(get_crx_src_path());
  // Headless only when PUPPETEER_HEADLESS is exactly 'true'.
  const headless = String(process.env.PUPPETEER_HEADLESS || 'false') === 'true';

  const forget_if_current = () => {
    if (browser_singleton === launching) {
      browser_singleton = null;
    }
  };

  const launching = puppeteer
    .launch({
      headless,
      args: [
        `--disable-extensions-except=${extension_path}`,
        `--load-extension=${extension_path}`,
        '--no-default-browser-check',
        '--disable-popup-blocking',
        '--disable-dev-shm-usage'
      ]
    })
    .then((browser) => {
      // A crashed or closed browser must not stay cached.
      browser.once('disconnected', forget_if_current);
      return browser;
    });

  browser_singleton = launching;
  // Clear the cache on a failed launch; the rejection still reaches the caller below.
  launching.catch(forget_if_current);
  return launching;
}
/**
 * Run one extension action through the bridge page and return its result.
 *
 * Opens a new page, finds the extension id among the browser targets, navigates to
 * the extension's bridge page and calls `window.server_bridge_invoke` there, bounded
 * by the configured action timeout.
 *
 * The page is closed in `finally`, so a failed navigation, a rejected action or a
 * timeout no longer leaves a tab open in the shared browser.
 *
 * @param {string} action_name - Action forwarded to the background.
 * @param {object} [action_payload] - Action data; defaults to `{}`.
 * @returns {Promise<*>} Whatever the bridge resolves with.
 * @throws {Error} When the extension is not found, the bridge is missing, the action
 *   fails or it times out ('action_timeout').
 */
export async function invoke_extension_action(action_name, action_payload) {
  const browser = await get_or_create_browser();
  const page = await browser.newPage();

  try {
    await page.goto('about:blank');

    const extension_id = get_extension_id_from_targets(await browser.targets());
    if (!extension_id) {
      throw new Error('未找到扩展 extension_id请确认 CRX_SRC_PATH 指向 src 且成功加载)');
    }

    const bridge_url = `chrome-extension://${extension_id}/bridge/bridge.html`;
    await page.goto(bridge_url, { waitUntil: 'domcontentloaded' });

    // The callback runs inside the page, so everything it needs is defined within it.
    return await page.evaluate(
      async (action, payload, timeout) => {
        function with_timeout(promise, timeout_ms_inner) {
          return new Promise((resolve, reject) => {
            const timer = setTimeout(() => reject(new Error('action_timeout')), timeout_ms_inner);
            promise
              .then((v) => {
                clearTimeout(timer);
                resolve(v);
              })
              .catch((e) => {
                clearTimeout(timer);
                reject(e);
              });
          });
        }

        if (!window.server_bridge_invoke) {
          throw new Error('bridge 未注入 window.server_bridge_invoke');
        }

        return await with_timeout(window.server_bridge_invoke(action, payload), timeout);
      },
      action_name,
      action_payload || {},
      get_action_timeout_ms()
    );
  } finally {
    // Best-effort cleanup: a close failure must not replace the action's own
    // result or error.
    await page.close().catch(() => {});
  }
}

View File

@@ -0,0 +1,33 @@
import { schedule_task } from '../models/index.js';
import { safe_json_parse } from './json_utils.js';
import { execute_action_and_record } from './task_executor.js';
import { remove_cron_job, upsert_cron_job } from './cron_manager.js';
/**
 * Register a cron job for every enabled schedule task and remove the job of every
 * disabled one.
 *
 * Each row is handled on its own: when a job cannot be registered (for example
 * because the stored cron expression is rejected), the failure is logged and the
 * remaining rows are still scheduled, so one bad row cannot abort startup.
 *
 * On each tick the task's last_run_at is updated and the action is executed with
 * source 'cron'. Tick errors are logged and never rethrown, so a failing task does
 * not affect the others.
 *
 * @returns {Promise<void>}
 */
export async function reload_all_schedules() {
  const rows = await schedule_task.findAll();

  for (const row of rows) {
    if (!row.enabled) {
      remove_cron_job(row.id);
      continue;
    }

    const on_tick = async () => {
      try {
        await schedule_task.update(
          { last_run_at: new Date() },
          { where: { id: row.id } }
        );
        await execute_action_and_record({
          action_name: row.action_name,
          action_payload: safe_json_parse(row.payload_json) || {},
          source: 'cron',
          schedule_task_id: row.id
        });
      } catch (err) {
        // Not rethrown, so other tasks keep running; logged so the failure is visible.
        // eslint-disable-next-line no-console
        console.error(`schedule_task ${row.id} run failed:`, (err && err.message) || err);
      }
    };

    try {
      upsert_cron_job(row.id, row.cron_expression, on_tick);
    } catch (err) {
      // eslint-disable-next-line no-console
      console.error(
        `schedule_task ${row.id} not scheduled (cron_expression "${row.cron_expression}"):`,
        (err && err.message) || err
      );
    }
  }
}

View File

@@ -0,0 +1,39 @@
import { crawl_run_record } from '../models/index.js';
import { safe_json_stringify } from './json_utils.js';
import { invoke_extension_action } from './puppeteer_runner.js';
/**
 * Run an extension action and store one crawl_run_record row describing the outcome.
 *
 * When the action fails, its error is always the one rethrown: a failure while
 * writing the record is logged instead of replacing it. (With the write in a
 * `finally` block, a database error used to hide the real cause.)
 * When the action succeeds, a failing record write still propagates to the caller.
 *
 * @param {object} params
 * @param {string} params.action_name - Action to run.
 * @param {object} [params.action_payload] - Action data; defaults to `{}`.
 * @param {string} [params.source] - Origin of the run, stored on the record.
 * @param {*} [params.schedule_task_id] - Owning schedule task, if any.
 * @returns {Promise<*>} The action result.
 * @throws The action's error, or the record-write error after a successful action.
 */
export async function execute_action_and_record(params) {
  const { action_name, action_payload, source, schedule_task_id } = params;
  const payload = action_payload || {};

  const record = {
    action_name,
    request_payload: safe_json_stringify(payload),
    ok: false,
    result_payload: null,
    error_message: null,
    source,
    schedule_task_id: schedule_task_id || null
  };

  let result;
  try {
    result = await invoke_extension_action(action_name, payload);
  } catch (err) {
    record.error_message = (err && err.message) || String(err);
    try {
      await crawl_run_record.create(record);
    } catch (record_err) {
      // Keep the action error as the primary failure; only report the write problem.
      // eslint-disable-next-line no-console
      console.error('crawl_run_record write failed:', (record_err && record_err.message) || record_err);
    }
    throw err;
  }

  record.ok = true;
  record.result_payload = safe_json_stringify(result);
  await crawl_run_record.create(record);
  return result;
}

File diff suppressed because it is too large Load Diff