diff --git a/server/config/app_config.js b/server/config/app_config.js index 4ae23b7..dadd515 100644 --- a/server/config/app_config.js +++ b/server/config/app_config.js @@ -58,7 +58,9 @@ export function get_app_config() { chrome_executable_path: (get_env('CHROME_EXECUTABLE_PATH') || '').trim() || path.resolve(__dirname, '../../chrome-win/chrome.exe'), log_invoke_action: get_bool('LOG_INVOKE_ACTION', true), auto_close_browser: get_bool('AUTO_CLOSE_BROWSER', true), - enable_stealth: get_bool('ENABLE_STEALTH', true) + enable_stealth: get_bool('ENABLE_STEALTH', true), + log_sql: get_bool('LOG_SQL', false), + log_sql_benchmark: get_bool('LOG_SQL_BENCHMARK', false) } }; diff --git a/server/config/database.js b/server/config/database.js index 2edf05f..f6c71ca 100644 --- a/server/config/database.js +++ b/server/config/database.js @@ -10,7 +10,18 @@ export function get_sequelize_options() { password: cfg.mysql.password, database: cfg.mysql.database, dialect: 'mysql', - logging: false, + benchmark: cfg.crawler.log_sql_benchmark === true, + logging: cfg.crawler.log_sql === true + ? (sql, timing_ms) => { + if (cfg.crawler.log_sql_benchmark === true && typeof timing_ms === 'number') { + // eslint-disable-next-line no-console + console.log('[sql]', { timing_ms, sql }); + return; + } + // eslint-disable-next-line no-console + console.log('[sql]', sql); + } + : false, define: { underscored: true, timestamps: true, diff --git a/server/models/amazon_product.js b/server/models/amazon_product.js index 65169ac..17117bf 100644 --- a/server/models/amazon_product.js +++ b/server/models/amazon_product.js @@ -6,26 +6,24 @@ export function define_amazon_product(sequelize) { { id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true }, asin: { type: DataTypes.STRING(32), allowNull: false }, - url: { type: DataTypes.TEXT, allowNull: false }, - title: { type: DataTypes.TEXT, allowNull: true }, + url: { type: DataTypes.STRING(2048), allowNull: false }, + title: { type: DataTypes.STRING(1024), allowNull: true }, price: { type: DataTypes.STRING(64), allowNull: true }, - sku: { type: DataTypes.STRING(256), allowNull: true }, - sku_color: { type: DataTypes.STRING(128), allowNull: true }, - sku_size: { type: DataTypes.STRING(128), allowNull: true }, - brand_line: { type: DataTypes.TEXT, allowNull: true }, - brand_store_url: { type: DataTypes.TEXT, allowNull: true }, + sku_json: { type: DataTypes.JSON, allowNull: true, comment: 'sku 结构化 JSON,如 {color:[], size:[]}' }, + brand_line: { type: DataTypes.STRING(512), allowNull: true }, + brand_store_url: { type: DataTypes.STRING(2048), allowNull: true }, ac_badge: { type: DataTypes.STRING(128), allowNull: true }, - bestseller_hint: { type: DataTypes.TEXT, allowNull: true }, - delivery_hint: { type: DataTypes.TEXT, allowNull: true }, - social_proof: { type: DataTypes.TEXT, allowNull: true }, - sustainability_hint: { type: DataTypes.TEXT, allowNull: true }, + bestseller_hint: { type: DataTypes.STRING(512), allowNull: true }, + delivery_hint: { type: DataTypes.STRING(512), allowNull: true }, + social_proof: { type: DataTypes.STRING(256), allowNull: true }, + sustainability_hint: { type: DataTypes.STRING(256), allowNull: true }, rating_stars: { type: DataTypes.STRING(64), allowNull: true }, review_count_text: { type: DataTypes.STRING(64), allowNull: true }, - main_image: { type: DataTypes.TEXT, allowNull: true }, - images_json: { type: DataTypes.TEXT('long'), allowNull: true }, - bullets_json: { type: DataTypes.TEXT('long'), allowNull: true }, - product_info_json: { type: DataTypes.TEXT('long'), allowNull: true }, - detail_extra_lines_json: { type: DataTypes.TEXT('long'), allowNull: true } + main_image: { type: DataTypes.STRING(2048), allowNull: true }, + images_json: { type: DataTypes.JSON, allowNull: true }, + bullets_json: { type: DataTypes.JSON, allowNull: true }, + product_info_json: { type: DataTypes.JSON, allowNull: true }, + detail_extra_lines_json: { type: DataTypes.JSON, allowNull: true } }, { tableName: 'amazon_product', diff --git a/server/models/amazon_review.js b/server/models/amazon_review.js index aca3a7f..6b40b75 100644 --- a/server/models/amazon_review.js +++ b/server/models/amazon_review.js @@ -6,10 +6,10 @@ export function define_amazon_review(sequelize) { { id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true }, asin: { type: DataTypes.STRING(32), allowNull: true }, - url: { type: DataTypes.TEXT, allowNull: false }, + url: { type: DataTypes.STRING(2048), allowNull: false }, review_id: { type: DataTypes.STRING(64), allowNull: false }, author: { type: DataTypes.STRING(256), allowNull: true }, - title: { type: DataTypes.TEXT, allowNull: true }, + title: { type: DataTypes.STRING(512), allowNull: true }, body: { type: DataTypes.TEXT('long'), allowNull: true }, rating_text: { type: DataTypes.STRING(64), allowNull: true }, review_date: { type: DataTypes.STRING(128), allowNull: true }, diff --git a/server/models/amazon_search_item.js b/server/models/amazon_search_item.js index c515823..d6fcdfb 100644 --- a/server/models/amazon_search_item.js +++ b/server/models/amazon_search_item.js @@ -6,8 +6,8 @@ export function define_amazon_search_item(sequelize) { { id: { type: DataTypes.BIGINT.UNSIGNED, primaryKey: true, autoIncrement: true }, asin: { type: DataTypes.STRING(32), allowNull: false }, - url: { type: DataTypes.TEXT, allowNull: false }, - title: { type: DataTypes.TEXT, allowNull: true }, + url: { type: DataTypes.STRING(2048), allowNull: false }, + title: { type: DataTypes.STRING(1024), allowNull: true }, price: { type: DataTypes.STRING(64), allowNull: true }, rating: { type: DataTypes.FLOAT, allowNull: true }, rating_text: { type: DataTypes.STRING(64), allowNull: true }, diff --git a/server/services/flows/amazon/amazon_search_detail_reviews_flow.js b/server/services/flows/amazon/amazon_search_detail_reviews_flow.js index 8bf04b8..71945f4 100644 --- a/server/services/flows/amazon/amazon_search_detail_reviews_flow.js +++ b/server/services/flows/amazon/amazon_search_detail_reviews_flow.js @@ -1,5 +1,5 @@ import { execute_action_and_record } from '../../task_executor.js'; -import { map_limit, sleep_ms } from '../flow_utils.js'; +import { sleep_ms } from '../flow_utils.js'; import { amazon_product, amazon_search_item, amazon_review } from '../../../models/index.js'; import { safe_json_stringify } from '../../json_utils.js'; import { close_browser } from '../../puppeteer/puppeteer_runner.js'; @@ -51,14 +51,15 @@ async function persist_detail(detail_res_raw) { return; } + const sku_is_object = detail_res && detail_res.sku && typeof detail_res.sku === 'object' && !Array.isArray(detail_res.sku); + await amazon_product.upsert({ asin, url: detail_res.url || '', title: detail_res.title || null, price: detail_res.price || null, - sku: detail_res.sku || null, - sku_color: detail_res.sku_color || null, - sku_size: detail_res.sku_size || null, + sku: sku_is_object ? null : (detail_res.sku || null), + sku_json: sku_is_object ? detail_res.sku : null, brand_line: detail_res.brand_line || null, brand_store_url: detail_res.brand_store_url || null, ac_badge: detail_res.ac_badge || null, @@ -69,10 +70,10 @@ async function persist_detail(detail_res_raw) { rating_stars: detail_res.rating_stars || null, review_count_text: detail_res.review_count_text || null, main_image: detail_res.main_image || null, - images_json: safe_json_stringify(detail_res.images || []), - bullets_json: safe_json_stringify(detail_res.bullets || []), - product_info_json: safe_json_stringify(detail_res.product_info || {}), - detail_extra_lines_json: safe_json_stringify(detail_res.detail_extra_lines || []) + images_json: Array.isArray(detail_res.images) ? detail_res.images : null, + bullets_json: Array.isArray(detail_res.bullets) ? detail_res.bullets : null, + product_info_json: detail_res.product_info && typeof detail_res.product_info === 'object' ? detail_res.product_info : null, + detail_extra_lines_json: Array.isArray(detail_res.detail_extra_lines) ? detail_res.detail_extra_lines : null }); } diff --git a/server/services/flows/flow_utils.js b/server/services/flows/flow_utils.js index 8b1790b..89b7048 100644 --- a/server/services/flows/flow_utils.js +++ b/server/services/flows/flow_utils.js @@ -1,26 +1,3 @@ export async function sleep_ms(ms) { await new Promise((resolve) => setTimeout(resolve, ms)); } - -export async function map_limit(items, worker) { - const list = Array.isArray(items) ? items : []; - - const res = new Array(list.length); - let idx = 0; - - async function run_one() { - while (idx < list.length) { - const cur = idx; - idx += 1; - res[cur] = await worker(list[cur], cur); - } - } - - const runners = []; - for (let i = 0; i < list.length; i += 1) { - runners.push(run_one()); - } - - await Promise.all(runners); - return res; -} diff --git a/server/services/schedule_loader.js b/server/services/schedule_loader.js index b06b1c7..6795c91 100644 --- a/server/services/schedule_loader.js +++ b/server/services/schedule_loader.js @@ -31,11 +31,9 @@ async function run_cron_task(task) { export async function start_all_cron_tasks() { for (const task of cron_task_list) { // const job = cron.schedule(task.cron_expression, async () => { - try { + await run_cron_task(task); - } catch (err) { - // action 内部已记录 crawl_run_record;flow 内部 action 也会记录 - } + // }); // cron_jobs.push(job);