diff --git a/1.md b/1.md index 2c7ec7a..bd7b5ce 100644 --- a/1.md +++ b/1.md @@ -41,4 +41,7 @@ -"C:\Program Files\Google\Chrome\Application\chrome.exe" --disable-features=ExtensionManifestV2Unsupported,ExtensionManifestV2Disabled \ No newline at end of file +"C:\Program Files\Google\Chrome\Application\chrome.exe" --disable-features=ExtensionManifestV2Unsupported,ExtensionManifestV2Disabled + +1.浏览器下载地址 + diff --git a/server/config/cron_tasks.js b/server/config/cron_tasks.js index 1b3bca2..5782d9f 100644 --- a/server/config/cron_tasks.js +++ b/server/config/cron_tasks.js @@ -4,11 +4,19 @@ */ export const cron_task_list = [ - // 示例:每 6 小时跑一次列表抓取 + // 任务流:先跑列表,再依赖列表 URL 跑详情+评论 { - name: 'amazon_search_list_every_6h', + name: 'amazon_search_detail_reviews_every_1h', cron_expression: '0 */1 * * *', - action_name: 'amazon_search_list', - action_payload: { keyword: '野餐包', limit: 100 } + type: 'flow', + flow_name: 'amazon_search_detail_reviews', + flow_payload: { + keyword: '野餐包', + limit: 100, + max_products: 30, + reviews_limit: 50, + concurrency: 1, + gap_ms: 500 + } } ]; diff --git a/server/services/amazon_persist.js b/server/services/amazon_persist.js deleted file mode 100644 index bc3bc60..0000000 --- a/server/services/amazon_persist.js +++ /dev/null @@ -1,110 +0,0 @@ -import { - amazon_product, - amazon_search_item, - amazon_review -} from '../models/index.js'; -import { safe_json_stringify } from './json_utils.js'; - -function build_batch_key(prefix) { - return `${prefix}_${Date.now()}_${Math.random().toString().slice(2, 8)}`; -} - -function pick_asin_from_url(url) { - if (!url) return null; - const m = String(url).match(/\/dp\/([A-Z0-9]{8,16})/i); - return m && m[1] ? m[1].toUpperCase() : null; -} - -export async function persist_amazon_result({ result }) { - if (!result || !result.stage) { - return; - } - - if (result.stage === 'detail') { - const asin = result.asin || pick_asin_from_url(result.url); - if (!asin) { - return; - } - - await amazon_product.upsert({ - asin, - url: result.url || '', - title: result.title || null, - price: result.price || null, - sku: result.sku || null, - sku_color: result.sku_color || null, - sku_size: result.sku_size || null, - brand_line: result.brand_line || null, - brand_store_url: result.brand_store_url || null, - ac_badge: result.ac_badge || null, - bestseller_hint: result.bestseller_hint || null, - delivery_hint: result.delivery_hint || null, - social_proof: result.social_proof || null, - sustainability_hint: result.sustainability_hint || null, - rating_stars: result.rating_stars || null, - review_count_text: result.review_count_text || null, - main_image: result.main_image || null, - images_json: safe_json_stringify(result.images || []), - bullets_json: safe_json_stringify(result.bullets || []), - product_info_json: safe_json_stringify(result.product_info || {}), - detail_extra_lines_json: safe_json_stringify(result.detail_extra_lines || []) - }); - - return; - } - - if (result.stage === 'list') { - const batch_key = build_batch_key('list'); - const items = Array.isArray(result.items) ? result.items : []; - - for (const it of items) { - const asin = it.asin || pick_asin_from_url(it.url); - if (!asin || !it.url) continue; - - await amazon_search_item.create({ - asin, - url: it.url, - title: it.title || null, - price: it.price || null, - rating: typeof it.rating === 'number' ? it.rating : null, - rating_text: it.rating_text || null, - review_count: typeof it.review_count === 'number' ? it.review_count : null, - review_count_text: it.review_count_text || null, - rank_index: typeof it.index === 'number' ? it.index : null, - batch_key, - batch_total: typeof result.total === 'number' ? result.total : null, - batch_limit: typeof result.limit === 'number' ? result.limit : null - }); - } - - return; - } - - if (result.stage === 'reviews') { - const batch_key = build_batch_key('reviews'); - const asin = pick_asin_from_url(result.url); - const items = Array.isArray(result.items) ? result.items : []; - - for (const it of items) { - const review_id = it.review_id; - if (!review_id) continue; - - const asin_value = asin || pick_asin_from_url(it.url) || pick_asin_from_url(result.url); - - await amazon_review.upsert({ - asin: asin_value || null, - url: result.url || '', - review_id, - author: it.author || null, - title: it.title || null, - body: it.body || null, - rating_text: it.rating_text || null, - review_date: it.date || null, - review_index: typeof it.index === 'number' ? it.index : null, - batch_key, - batch_total: typeof result.total === 'number' ? result.total : null, - batch_limit: typeof result.limit === 'number' ? result.limit : null - }); - } - } -} diff --git a/server/services/flows/amazon/amazon_search_detail_reviews_flow.js b/server/services/flows/amazon/amazon_search_detail_reviews_flow.js new file mode 100644 index 0000000..eabc839 --- /dev/null +++ b/server/services/flows/amazon/amazon_search_detail_reviews_flow.js @@ -0,0 +1,190 @@ +import { execute_action_and_record } from '../../task_executor.js'; +import { map_limit, sleep_ms } from '../flow_utils.js'; +import { amazon_product, amazon_search_item, amazon_review } from '../../../models/index.js'; +import { safe_json_stringify } from '../../json_utils.js'; + +function build_batch_key(prefix) { + return `${prefix}_${Date.now()}_${Math.random().toString().slice(2, 8)}`; +} + +function pick_asin_from_url(url) { + if (!url) return null; + const m = String(url).match(/\/dp\/([A-Z0-9]{8,16})/i); + return m && m[1] ? m[1].toUpperCase() : null; +} + +async function persist_detail(detail_res) { + if (!detail_res || detail_res.stage !== 'detail') { + return; + } + + const asin = detail_res.asin || pick_asin_from_url(detail_res.url); + if (!asin) { + return; + } + + await amazon_product.upsert({ + asin, + url: detail_res.url || '', + title: detail_res.title || null, + price: detail_res.price || null, + sku: detail_res.sku || null, + sku_color: detail_res.sku_color || null, + sku_size: detail_res.sku_size || null, + brand_line: detail_res.brand_line || null, + brand_store_url: detail_res.brand_store_url || null, + ac_badge: detail_res.ac_badge || null, + bestseller_hint: detail_res.bestseller_hint || null, + delivery_hint: detail_res.delivery_hint || null, + social_proof: detail_res.social_proof || null, + sustainability_hint: detail_res.sustainability_hint || null, + rating_stars: detail_res.rating_stars || null, + review_count_text: detail_res.review_count_text || null, + main_image: detail_res.main_image || null, + images_json: safe_json_stringify(detail_res.images || []), + bullets_json: safe_json_stringify(detail_res.bullets || []), + product_info_json: safe_json_stringify(detail_res.product_info || {}), + detail_extra_lines_json: safe_json_stringify(detail_res.detail_extra_lines || []) + }); +} + +async function persist_list(list_res) { + if (!list_res || list_res.stage !== 'list') { + return; + } + + const batch_key = build_batch_key('list'); + const items = Array.isArray(list_res.items) ? list_res.items : []; + + for (const it of items) { + const asin = it.asin || pick_asin_from_url(it.url); + if (!asin || !it.url) continue; + + await amazon_search_item.create({ + asin, + url: it.url, + title: it.title || null, + price: it.price || null, + rating: typeof it.rating === 'number' ? it.rating : null, + rating_text: it.rating_text || null, + review_count: typeof it.review_count === 'number' ? it.review_count : null, + review_count_text: it.review_count_text || null, + rank_index: typeof it.index === 'number' ? it.index : null, + batch_key, + batch_total: typeof list_res.total === 'number' ? list_res.total : null, + batch_limit: typeof list_res.limit === 'number' ? list_res.limit : null + }); + } +} + +async function persist_reviews(reviews_res) { + if (!reviews_res || reviews_res.stage !== 'reviews') { + return; + } + + const batch_key = build_batch_key('reviews'); + const asin = pick_asin_from_url(reviews_res.url); + const items = Array.isArray(reviews_res.items) ? reviews_res.items : []; + + for (const it of items) { + const review_id = it.review_id; + if (!review_id) continue; + + const asin_value = asin || pick_asin_from_url(it.url) || pick_asin_from_url(reviews_res.url); + + await amazon_review.upsert({ + asin: asin_value || null, + url: reviews_res.url || '', + review_id, + author: it.author || null, + title: it.title || null, + body: it.body || null, + rating_text: it.rating_text || null, + review_date: it.date || null, + review_index: typeof it.index === 'number' ? it.index : null, + batch_key, + batch_total: typeof reviews_res.total === 'number' ? reviews_res.total : null, + batch_limit: typeof reviews_res.limit === 'number' ? reviews_res.limit : null + }); + } +} + +function must_string(v, name) { + if (!v || typeof v !== 'string') { + throw new Error(`flow 参数 ${name} 必须是字符串`); + } + return v; +} + +function get_int(v, default_value) { + const n = Number(v); + if (Number.isNaN(n)) return default_value; + return n; +} + +export async function run_amazon_search_detail_reviews_flow(flow_payload) { + const keyword = must_string(flow_payload.keyword, 'keyword'); + const limit = get_int(flow_payload.limit, 100); + const max_products = get_int(flow_payload.max_products, 30); + const reviews_limit = get_int(flow_payload.reviews_limit, 50); + const concurrency = get_int(flow_payload.concurrency, 1); + const gap_ms = get_int(flow_payload.gap_ms, 0); + + const list_res = await execute_action_and_record({ + action_name: 'amazon_search_list', + action_payload: { keyword, limit }, + source: 'cron' + }); + + await persist_list(list_res); + + if (!list_res || list_res.stage !== 'list') { + throw new Error('amazon_search_list 返回非 list stage'); + } + + const items = Array.isArray(list_res.items) ? list_res.items : []; + const urls = items + .map((it) => (it && it.url ? String(it.url) : '')) + .filter((u) => u); + + const picked_urls = urls.slice(0, Math.max(0, max_products)); + + await map_limit(picked_urls, concurrency, async (product_url) => { + if (gap_ms > 0) { + await sleep_ms(gap_ms); + } + + const detail_res = await execute_action_and_record({ + action_name: 'amazon_product_detail', + action_payload: { product_url }, + source: 'cron' + }); + + await persist_detail(detail_res); + + if (gap_ms > 0) { + await sleep_ms(gap_ms); + } + + const reviews_res = await execute_action_and_record({ + action_name: 'amazon_product_reviews', + action_payload: { product_url, limit: reviews_limit }, + source: 'cron' + }); + + await persist_reviews(reviews_res); + + return null; + }); + + return { + stage: 'flow', + flow_name: 'amazon_search_detail_reviews', + keyword, + limit, + max_products, + reviews_limit, + total_urls: urls.length, + picked: picked_urls.length + }; +} diff --git a/server/services/flows/flow_registry.js b/server/services/flows/flow_registry.js new file mode 100644 index 0000000..0af2ce2 --- /dev/null +++ b/server/services/flows/flow_registry.js @@ -0,0 +1,13 @@ +import { run_amazon_search_detail_reviews_flow } from './amazon/amazon_search_detail_reviews_flow.js'; + +const flow_map = { + amazon_search_detail_reviews: run_amazon_search_detail_reviews_flow +}; + +export function get_flow_runner(flow_name) { + const fn = flow_map[flow_name]; + if (!fn) { + throw new Error(`未知 flow_name: ${flow_name}`); + } + return fn; +} diff --git a/server/services/flows/flow_utils.js b/server/services/flows/flow_utils.js new file mode 100644 index 0000000..9979e67 --- /dev/null +++ b/server/services/flows/flow_utils.js @@ -0,0 +1,27 @@ +export async function sleep_ms(ms) { + await new Promise((resolve) => setTimeout(resolve, ms)); +} + +export async function map_limit(items, limit, worker) { + const list = Array.isArray(items) ? items : []; + const n = Math.max(1, Number(limit || 1)); + + const res = new Array(list.length); + let idx = 0; + + async function run_one() { + while (idx < list.length) { + const cur = idx; + idx += 1; + res[cur] = await worker(list[cur], cur); + } + } + + const runners = []; + for (let i = 0; i < Math.min(n, list.length); i += 1) { + runners.push(run_one()); + } + + await Promise.all(runners); + return res; +} diff --git a/server/services/schedule_loader.js b/server/services/schedule_loader.js index b51975b..b49cc83 100644 --- a/server/services/schedule_loader.js +++ b/server/services/schedule_loader.js @@ -1,24 +1,44 @@ import cron from 'node-cron'; import { cron_task_list } from '../config/cron_tasks.js'; import { execute_action_and_record } from './task_executor.js'; +import { get_flow_runner } from './flows/flow_registry.js'; const cron_jobs = []; +async function run_cron_task(task) { + if (!task || !task.type) { + throw new Error('cron_task 缺少 type'); + } + + if (task.type === 'action') { + await execute_action_and_record({ + action_name: task.action_name, + action_payload: task.action_payload || {}, + source: 'cron' + }); + return; + } + + if (task.type === 'flow') { + const run_flow = get_flow_runner(task.flow_name); + await run_flow(task.flow_payload || {}); + return; + } + + throw new Error(`cron_task type 不支持: ${task.type}`); +} + export async function start_all_cron_tasks() { for (const task of cron_task_list) { - // const job = cron.schedule(task.cron_expression, async () => { + const job = cron.schedule(task.cron_expression, async () => { try { - await execute_action_and_record({ - action_name: task.action_name, - action_payload: task.action_payload || {}, - source: 'cron' - }); + await run_cron_task(task); } catch (err) { - // 失败会在 crawl_run_record 落库 + // action 内部已记录 crawl_run_record;flow 内部 action 也会记录 } - // }); + }); - // cron_jobs.push(job); + cron_jobs.push(job); } } diff --git a/server/services/task_executor.js b/server/services/task_executor.js index fea9fd8..cf0cbe4 100644 --- a/server/services/task_executor.js +++ b/server/services/task_executor.js @@ -1,7 +1,6 @@ import { crawl_run_record } from '../models/index.js'; import { safe_json_stringify } from './json_utils.js'; import { invoke_extension_action } from './puppeteer/puppeteer_runner.js'; -import { persist_amazon_result } from './amazon_persist.js'; export async function execute_action_and_record(params) { const { action_name, action_payload, source } = params; @@ -18,9 +17,6 @@ export async function execute_action_and_record(params) { ok = true; result_payload = safe_json_stringify(result); - // 按 stage 自动入库(不影响原始 run_record 记录) - await persist_amazon_result(result); - return result; } catch (err) { ok = false;