This commit is contained in:
张成
2026-03-18 18:07:41 +08:00
parent 18aa083c91
commit aecb7944a8
8 changed files with 44 additions and 57 deletions

View File

@@ -58,7 +58,9 @@ export function get_app_config() {
chrome_executable_path: (get_env('CHROME_EXECUTABLE_PATH') || '').trim() || path.resolve(__dirname, '../../chrome-win/chrome.exe'), chrome_executable_path: (get_env('CHROME_EXECUTABLE_PATH') || '').trim() || path.resolve(__dirname, '../../chrome-win/chrome.exe'),
log_invoke_action: get_bool('LOG_INVOKE_ACTION', true), log_invoke_action: get_bool('LOG_INVOKE_ACTION', true),
auto_close_browser: get_bool('AUTO_CLOSE_BROWSER', true), auto_close_browser: get_bool('AUTO_CLOSE_BROWSER', true),
enable_stealth: get_bool('ENABLE_STEALTH', true) enable_stealth: get_bool('ENABLE_STEALTH', true),
log_sql: get_bool('LOG_SQL', false),
log_sql_benchmark: get_bool('LOG_SQL_BENCHMARK', false)
} }
}; };

View File

@@ -10,7 +10,18 @@ export function get_sequelize_options() {
password: cfg.mysql.password, password: cfg.mysql.password,
database: cfg.mysql.database, database: cfg.mysql.database,
dialect: 'mysql', dialect: 'mysql',
logging: false, benchmark: cfg.crawler.log_sql_benchmark === true,
logging: cfg.crawler.log_sql === true
? (sql, timing_ms) => {
if (cfg.crawler.log_sql_benchmark === true && typeof timing_ms === 'number') {
// eslint-disable-next-line no-console
console.log('[sql]', { timing_ms, sql });
return;
}
// eslint-disable-next-line no-console
console.log('[sql]', sql);
}
: false,
define: { define: {
underscored: true, underscored: true,
timestamps: true, timestamps: true,

View File

@@ -6,26 +6,24 @@ export function define_amazon_product(sequelize) {
{ {
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true }, id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
asin: { type: DataTypes.STRING(32), allowNull: false }, asin: { type: DataTypes.STRING(32), allowNull: false },
url: { type: DataTypes.TEXT, allowNull: false }, url: { type: DataTypes.STRING(2048), allowNull: false },
title: { type: DataTypes.TEXT, allowNull: true }, title: { type: DataTypes.STRING(1024), allowNull: true },
price: { type: DataTypes.STRING(64), allowNull: true }, price: { type: DataTypes.STRING(64), allowNull: true },
sku: { type: DataTypes.STRING(256), allowNull: true }, sku_json: { type: DataTypes.JSON, allowNull: true, comment: 'sku 结构化 JSON如 {color:[], size:[]}' },
sku_color: { type: DataTypes.STRING(128), allowNull: true }, brand_line: { type: DataTypes.STRING(512), allowNull: true },
sku_size: { type: DataTypes.STRING(128), allowNull: true }, brand_store_url: { type: DataTypes.STRING(2048), allowNull: true },
brand_line: { type: DataTypes.TEXT, allowNull: true },
brand_store_url: { type: DataTypes.TEXT, allowNull: true },
ac_badge: { type: DataTypes.STRING(128), allowNull: true }, ac_badge: { type: DataTypes.STRING(128), allowNull: true },
bestseller_hint: { type: DataTypes.TEXT, allowNull: true }, bestseller_hint: { type: DataTypes.STRING(512), allowNull: true },
delivery_hint: { type: DataTypes.TEXT, allowNull: true }, delivery_hint: { type: DataTypes.STRING(512), allowNull: true },
social_proof: { type: DataTypes.TEXT, allowNull: true }, social_proof: { type: DataTypes.STRING(256), allowNull: true },
sustainability_hint: { type: DataTypes.TEXT, allowNull: true }, sustainability_hint: { type: DataTypes.STRING(256), allowNull: true },
rating_stars: { type: DataTypes.STRING(64), allowNull: true }, rating_stars: { type: DataTypes.STRING(64), allowNull: true },
review_count_text: { type: DataTypes.STRING(64), allowNull: true }, review_count_text: { type: DataTypes.STRING(64), allowNull: true },
main_image: { type: DataTypes.TEXT, allowNull: true }, main_image: { type: DataTypes.STRING(2048), allowNull: true },
images_json: { type: DataTypes.TEXT('long'), allowNull: true }, images_json: { type: DataTypes.JSON, allowNull: true },
bullets_json: { type: DataTypes.TEXT('long'), allowNull: true }, bullets_json: { type: DataTypes.JSON, allowNull: true },
product_info_json: { type: DataTypes.TEXT('long'), allowNull: true }, product_info_json: { type: DataTypes.JSON, allowNull: true },
detail_extra_lines_json: { type: DataTypes.TEXT('long'), allowNull: true } detail_extra_lines_json: { type: DataTypes.JSON, allowNull: true }
}, },
{ {
tableName: 'amazon_product', tableName: 'amazon_product',

View File

@@ -6,10 +6,10 @@ export function define_amazon_review(sequelize) {
{ {
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true }, id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
asin: { type: DataTypes.STRING(32), allowNull: true }, asin: { type: DataTypes.STRING(32), allowNull: true },
url: { type: DataTypes.TEXT, allowNull: false }, url: { type: DataTypes.STRING(2048), allowNull: false },
review_id: { type: DataTypes.STRING(64), allowNull: false }, review_id: { type: DataTypes.STRING(64), allowNull: false },
author: { type: DataTypes.STRING(256), allowNull: true }, author: { type: DataTypes.STRING(256), allowNull: true },
title: { type: DataTypes.TEXT, allowNull: true }, title: { type: DataTypes.STRING(512), allowNull: true },
body: { type: DataTypes.TEXT('long'), allowNull: true }, body: { type: DataTypes.TEXT('long'), allowNull: true },
rating_text: { type: DataTypes.STRING(64), allowNull: true }, rating_text: { type: DataTypes.STRING(64), allowNull: true },
review_date: { type: DataTypes.STRING(128), allowNull: true }, review_date: { type: DataTypes.STRING(128), allowNull: true },

View File

@@ -6,8 +6,8 @@ export function define_amazon_search_item(sequelize) {
{ {
id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true }, id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true },
asin: { type: DataTypes.STRING(32), allowNull: false }, asin: { type: DataTypes.STRING(32), allowNull: false },
url: { type: DataTypes.TEXT, allowNull: false }, url: { type: DataTypes.STRING(2048), allowNull: false },
title: { type: DataTypes.TEXT, allowNull: true }, title: { type: DataTypes.STRING(1024), allowNull: true },
price: { type: DataTypes.STRING(64), allowNull: true }, price: { type: DataTypes.STRING(64), allowNull: true },
rating: { type: DataTypes.FLOAT, allowNull: true }, rating: { type: DataTypes.FLOAT, allowNull: true },
rating_text: { type: DataTypes.STRING(64), allowNull: true }, rating_text: { type: DataTypes.STRING(64), allowNull: true },

View File

@@ -1,5 +1,5 @@
import { execute_action_and_record } from '../../task_executor.js'; import { execute_action_and_record } from '../../task_executor.js';
import { map_limit, sleep_ms } from '../flow_utils.js'; import { sleep_ms } from '../flow_utils.js';
import { amazon_product, amazon_search_item, amazon_review } from '../../../models/index.js'; import { amazon_product, amazon_search_item, amazon_review } from '../../../models/index.js';
import { safe_json_stringify } from '../../json_utils.js'; import { safe_json_stringify } from '../../json_utils.js';
import { close_browser } from '../../puppeteer/puppeteer_runner.js'; import { close_browser } from '../../puppeteer/puppeteer_runner.js';
@@ -51,14 +51,15 @@ async function persist_detail(detail_res_raw) {
return; return;
} }
const sku_is_object = detail_res && detail_res.sku && typeof detail_res.sku === 'object' && !Array.isArray(detail_res.sku);
await amazon_product.upsert({ await amazon_product.upsert({
asin, asin,
url: detail_res.url || '', url: detail_res.url || '',
title: detail_res.title || null, title: detail_res.title || null,
price: detail_res.price || null, price: detail_res.price || null,
sku: detail_res.sku || null, sku: sku_is_object ? null : (detail_res.sku || null),
sku_color: detail_res.sku_color || null, sku_json: sku_is_object ? detail_res.sku : null,
sku_size: detail_res.sku_size || null,
brand_line: detail_res.brand_line || null, brand_line: detail_res.brand_line || null,
brand_store_url: detail_res.brand_store_url || null, brand_store_url: detail_res.brand_store_url || null,
ac_badge: detail_res.ac_badge || null, ac_badge: detail_res.ac_badge || null,
@@ -69,10 +70,10 @@ async function persist_detail(detail_res_raw) {
rating_stars: detail_res.rating_stars || null, rating_stars: detail_res.rating_stars || null,
review_count_text: detail_res.review_count_text || null, review_count_text: detail_res.review_count_text || null,
main_image: detail_res.main_image || null, main_image: detail_res.main_image || null,
images_json: safe_json_stringify(detail_res.images || []), images_json: Array.isArray(detail_res.images) ? detail_res.images : null,
bullets_json: safe_json_stringify(detail_res.bullets || []), bullets_json: Array.isArray(detail_res.bullets) ? detail_res.bullets : null,
product_info_json: safe_json_stringify(detail_res.product_info || {}), product_info_json: detail_res.product_info && typeof detail_res.product_info === 'object' ? detail_res.product_info : null,
detail_extra_lines_json: safe_json_stringify(detail_res.detail_extra_lines || []) detail_extra_lines_json: Array.isArray(detail_res.detail_extra_lines) ? detail_res.detail_extra_lines : null
}); });
} }

View File

@@ -1,26 +1,3 @@
export async function sleep_ms(ms) { export async function sleep_ms(ms) {
await new Promise((resolve) => setTimeout(resolve, ms)); await new Promise((resolve) => setTimeout(resolve, ms));
} }
/**
 * Map `items` through an async `worker`, returning results in input order.
 *
 * Work is pulled from a shared cursor by a pool of runner loops, so at most
 * `limit` worker calls are in flight at any time.
 *
 * @param {Array<*>} items - Values to process; any non-array is treated as an empty list.
 * @param {(item: *, index: number) => (Promise<*>|*)} worker - Called once per item with the item and its index.
 * @param {number} [limit] - Maximum number of concurrent worker calls. When omitted or
 *   not a positive integer, every item is started at once (the historical behaviour).
 * @returns {Promise<Array<*>>} Resolves to the worker results, indexed like `items`.
 *   Rejects with the first worker error, as `Promise.all` does.
 */
export async function map_limit(items, worker, limit) {
  const list = Array.isArray(items) ? items : [];
  const res = new Array(list.length);

  // Fall back to "one runner per item" so existing two-argument callers are unaffected.
  const runner_count = Number.isInteger(limit) && limit > 0
    ? Math.min(limit, list.length)
    : list.length;

  let idx = 0;

  // Each runner claims the next unprocessed index until the list is exhausted.
  // Claiming is synchronous (no await between read and increment), so no index is taken twice.
  async function run_one() {
    while (idx < list.length) {
      const cur = idx;
      idx += 1;
      res[cur] = await worker(list[cur], cur);
    }
  }

  await Promise.all(Array.from({ length: runner_count }, () => run_one()));
  return res;
}

View File

@@ -31,11 +31,9 @@ async function run_cron_task(task) {
export async function start_all_cron_tasks() { export async function start_all_cron_tasks() {
for (const task of cron_task_list) { for (const task of cron_task_list) {
// const job = cron.schedule(task.cron_expression, async () => { // const job = cron.schedule(task.cron_expression, async () => {
try {
await run_cron_task(task); await run_cron_task(task);
} catch (err) {
// action 内部已记录 crawl_run_recordflow 内部 action 也会记录
}
// }); // });
// cron_jobs.push(job); // cron_jobs.push(job);