1
This commit is contained in:
@@ -11,11 +11,14 @@ export const cron_task_list = [
|
|||||||
type: 'flow',
|
type: 'flow',
|
||||||
flow_name: 'amazon_search_detail_reviews',
|
flow_name: 'amazon_search_detail_reviews',
|
||||||
flow_payload: {
|
flow_payload: {
|
||||||
keyword: '野餐包',
|
// 插件参数:category_keyword / sort_by / limit
|
||||||
|
category_keyword: '野餐包',
|
||||||
|
// featured / price_asc / price_desc / review / newest / bestseller
|
||||||
|
sort_by: 'bestseller',
|
||||||
limit: 100,
|
limit: 100,
|
||||||
max_products: 30,
|
|
||||||
|
// flow 自己的参数
|
||||||
reviews_limit: 50,
|
reviews_limit: 50,
|
||||||
concurrency: 1,
|
|
||||||
gap_ms: 500
|
gap_ms: 500
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ import { execute_action_and_record } from '../../task_executor.js';
|
|||||||
import { map_limit, sleep_ms } from '../flow_utils.js';
|
import { map_limit, sleep_ms } from '../flow_utils.js';
|
||||||
import { amazon_product, amazon_search_item, amazon_review } from '../../../models/index.js';
|
import { amazon_product, amazon_search_item, amazon_review } from '../../../models/index.js';
|
||||||
import { safe_json_stringify } from '../../json_utils.js';
|
import { safe_json_stringify } from '../../json_utils.js';
|
||||||
|
import { close_browser } from '../../puppeteer/puppeteer_runner.js';
|
||||||
|
|
||||||
function build_batch_key(prefix) {
|
function build_batch_key(prefix) {
|
||||||
return `${prefix}_${Date.now()}_${Math.random().toString().slice(2, 8)}`;
|
return `${prefix}_${Date.now()}_${Math.random().toString().slice(2, 8)}`;
|
||||||
@@ -13,7 +14,34 @@ function pick_asin_from_url(url) {
|
|||||||
return m && m[1] ? m[1].toUpperCase() : null;
|
return m && m[1] ? m[1].toUpperCase() : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function persist_detail(detail_res) {
|
function unwrap_action_result(res) {
|
||||||
|
// 插件返回通常是 { ..., result: { stage, items... } }
|
||||||
|
if (res && typeof res === 'object' && res.result && typeof res.result === 'object') {
|
||||||
|
return res.result;
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
function normalize_sort_by(sort_by) {
|
||||||
|
if (sort_by === undefined || sort_by === null || sort_by === '') {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const s = String(sort_by).trim();
|
||||||
|
if (!s) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const allow = new Set(['featured', 'price_asc', 'price_desc', 'review', 'newest', 'bestseller']);
|
||||||
|
if (!allow.has(s)) {
|
||||||
|
throw new Error(`sort_by 不支持: ${s}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function persist_detail(detail_res_raw) {
|
||||||
|
const detail_res = unwrap_action_result(detail_res_raw);
|
||||||
if (!detail_res || detail_res.stage !== 'detail') {
|
if (!detail_res || detail_res.stage !== 'detail') {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -48,7 +76,8 @@ async function persist_detail(detail_res) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
async function persist_list(list_res) {
|
async function persist_list(list_res_raw) {
|
||||||
|
const list_res = unwrap_action_result(list_res_raw);
|
||||||
if (!list_res || list_res.stage !== 'list') {
|
if (!list_res || list_res.stage !== 'list') {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -77,7 +106,8 @@ async function persist_list(list_res) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function persist_reviews(reviews_res) {
|
async function persist_reviews(reviews_res_raw) {
|
||||||
|
const reviews_res = unwrap_action_result(reviews_res_raw);
|
||||||
if (!reviews_res || reviews_res.stage !== 'reviews') {
|
if (!reviews_res || reviews_res.stage !== 'reviews') {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@@ -110,10 +140,10 @@ async function persist_reviews(reviews_res) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function must_string(v, name) {
|
function must_string(v, name) {
|
||||||
if (!v || typeof v !== 'string') {
|
if (typeof v !== 'string' || !v.trim()) {
|
||||||
throw new Error(`flow 参数 ${name} 必须是字符串`);
|
throw new Error(`flow 参数 ${name} 必须是字符串`);
|
||||||
}
|
}
|
||||||
return v;
|
return v.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
function get_int(v, default_value) {
|
function get_int(v, default_value) {
|
||||||
@@ -123,40 +153,47 @@ function get_int(v, default_value) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function run_amazon_search_detail_reviews_flow(flow_payload) {
|
export async function run_amazon_search_detail_reviews_flow(flow_payload) {
|
||||||
const keyword = must_string(flow_payload.keyword, 'keyword');
|
const category_keyword = must_string(flow_payload.category_keyword, 'category_keyword');
|
||||||
|
const sort_by = normalize_sort_by(flow_payload.sort_by);
|
||||||
const limit = get_int(flow_payload.limit, 100);
|
const limit = get_int(flow_payload.limit, 100);
|
||||||
const max_products = get_int(flow_payload.max_products, 30);
|
|
||||||
const reviews_limit = get_int(flow_payload.reviews_limit, 50);
|
const reviews_limit = get_int(flow_payload.reviews_limit, 50);
|
||||||
const concurrency = get_int(flow_payload.concurrency, 1);
|
|
||||||
const gap_ms = get_int(flow_payload.gap_ms, 0);
|
const gap_ms = get_int(flow_payload.gap_ms, 0);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
const list_payload = { category_keyword, limit };
|
||||||
|
if (sort_by) {
|
||||||
|
list_payload.sort_by = sort_by;
|
||||||
|
}
|
||||||
|
|
||||||
const list_res = await execute_action_and_record({
|
const list_res = await execute_action_and_record({
|
||||||
action_name: 'amazon_search_list',
|
action_name: 'amazon_search_list',
|
||||||
action_payload: { keyword, limit },
|
action_payload: list_payload,
|
||||||
source: 'cron'
|
source: 'cron'
|
||||||
});
|
});
|
||||||
|
|
||||||
await persist_list(list_res);
|
await persist_list(list_res);
|
||||||
|
|
||||||
if (!list_res || list_res.stage !== 'list') {
|
const list_result = unwrap_action_result(list_res);
|
||||||
|
if (!list_result || list_result.stage !== 'list') {
|
||||||
throw new Error('amazon_search_list 返回非 list stage');
|
throw new Error('amazon_search_list 返回非 list stage');
|
||||||
}
|
}
|
||||||
|
|
||||||
const items = Array.isArray(list_res.items) ? list_res.items : [];
|
const items = Array.isArray(list_result.items) ? list_result.items : [];
|
||||||
const urls = items
|
const urls = items
|
||||||
.map((it) => (it && it.url ? String(it.url) : ''))
|
.map((it) => (it && it.url ? String(it.url) : ''))
|
||||||
.filter((u) => u);
|
.filter((u) => u);
|
||||||
|
|
||||||
const picked_urls = urls.slice(0, Math.max(0, max_products));
|
const picked_urls = urls;
|
||||||
|
|
||||||
await map_limit(picked_urls, concurrency, async (product_url) => {
|
|
||||||
|
for (const url of picked_urls) {
|
||||||
if (gap_ms > 0) {
|
if (gap_ms > 0) {
|
||||||
await sleep_ms(gap_ms);
|
await sleep_ms(gap_ms);
|
||||||
}
|
}
|
||||||
|
|
||||||
const detail_res = await execute_action_and_record({
|
const detail_res = await execute_action_and_record({
|
||||||
action_name: 'amazon_product_detail',
|
action_name: 'amazon_product_detail',
|
||||||
action_payload: { product_url },
|
action_payload: { product_url: url },
|
||||||
source: 'cron'
|
source: 'cron'
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -168,23 +205,27 @@ export async function run_amazon_search_detail_reviews_flow(flow_payload) {
|
|||||||
|
|
||||||
const reviews_res = await execute_action_and_record({
|
const reviews_res = await execute_action_and_record({
|
||||||
action_name: 'amazon_product_reviews',
|
action_name: 'amazon_product_reviews',
|
||||||
action_payload: { product_url, limit: reviews_limit },
|
action_payload: { product_url: url, limit: reviews_limit },
|
||||||
source: 'cron'
|
source: 'cron',
|
||||||
|
keep_browser_open: true
|
||||||
});
|
});
|
||||||
|
|
||||||
await persist_reviews(reviews_res);
|
await persist_reviews(reviews_res);
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
const summary = {
|
||||||
stage: 'flow',
|
stage: 'flow',
|
||||||
flow_name: 'amazon_search_detail_reviews',
|
flow_name: 'amazon_search_detail_reviews',
|
||||||
keyword,
|
category_keyword,
|
||||||
|
sort_by: sort_by || 'featured',
|
||||||
limit,
|
limit,
|
||||||
max_products,
|
|
||||||
reviews_limit,
|
reviews_limit,
|
||||||
total_urls: urls.length,
|
total_urls: urls.length,
|
||||||
picked: picked_urls.length
|
picked: picked_urls.length
|
||||||
};
|
};
|
||||||
|
|
||||||
|
await close_browser();
|
||||||
|
|
||||||
|
return summary;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,9 +2,8 @@ export async function sleep_ms(ms) {
|
|||||||
await new Promise((resolve) => setTimeout(resolve, ms));
|
await new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function map_limit(items, limit, worker) {
|
export async function map_limit(items, worker) {
|
||||||
const list = Array.isArray(items) ? items : [];
|
const list = Array.isArray(items) ? items : [];
|
||||||
const n = Math.max(1, Number(limit || 1));
|
|
||||||
|
|
||||||
const res = new Array(list.length);
|
const res = new Array(list.length);
|
||||||
let idx = 0;
|
let idx = 0;
|
||||||
@@ -18,7 +17,7 @@ export async function map_limit(items, limit, worker) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const runners = [];
|
const runners = [];
|
||||||
for (let i = 0; i < Math.min(n, list.length); i += 1) {
|
for (let i = 0; i < list.length; i += 1) {
|
||||||
runners.push(run_one());
|
runners.push(run_one());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,18 @@ import { apply_page_stealth_defaults, get_stealth_puppeteer } from './puppeteer_
|
|||||||
|
|
||||||
let browser_singleton = null;
|
let browser_singleton = null;
|
||||||
|
|
||||||
|
export async function close_browser() {
|
||||||
|
if (!browser_singleton) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
await browser_singleton.close();
|
||||||
|
} catch (err) {
|
||||||
|
// ignore
|
||||||
|
}
|
||||||
|
browser_singleton = null;
|
||||||
|
}
|
||||||
|
|
||||||
function get_action_timeout_ms() {
|
function get_action_timeout_ms() {
|
||||||
const cfg = get_app_config();
|
const cfg = get_app_config();
|
||||||
return cfg.crawler.action_timeout_ms;
|
return cfg.crawler.action_timeout_ms;
|
||||||
@@ -94,7 +106,7 @@ export async function get_or_create_browser() {
|
|||||||
return browser_singleton;
|
return browser_singleton;
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function invoke_extension_action(action_name, action_payload) {
|
export async function invoke_extension_action(action_name, action_payload, options) {
|
||||||
const cfg = get_app_config();
|
const cfg = get_app_config();
|
||||||
const browser = await get_or_create_browser();
|
const browser = await get_or_create_browser();
|
||||||
|
|
||||||
@@ -189,13 +201,9 @@ export async function invoke_extension_action(action_name, action_payload) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cfg.crawler.auto_close_browser) {
|
const keep_browser_open = options && options.keep_browser_open === true;
|
||||||
try {
|
if (cfg.crawler.auto_close_browser && !keep_browser_open) {
|
||||||
await browser.close();
|
await close_browser();
|
||||||
} catch (err) {
|
|
||||||
// ignore
|
|
||||||
}
|
|
||||||
browser_singleton = null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,15 +30,15 @@ async function run_cron_task(task) {
|
|||||||
|
|
||||||
export async function start_all_cron_tasks() {
|
export async function start_all_cron_tasks() {
|
||||||
for (const task of cron_task_list) {
|
for (const task of cron_task_list) {
|
||||||
const job = cron.schedule(task.cron_expression, async () => {
|
// const job = cron.schedule(task.cron_expression, async () => {
|
||||||
try {
|
try {
|
||||||
await run_cron_task(task);
|
await run_cron_task(task);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
// action 内部已记录 crawl_run_record;flow 内部 action 也会记录
|
// action 内部已记录 crawl_run_record;flow 内部 action 也会记录
|
||||||
}
|
}
|
||||||
});
|
// });
|
||||||
|
|
||||||
cron_jobs.push(job);
|
// cron_jobs.push(job);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import { safe_json_stringify } from './json_utils.js';
|
|||||||
import { invoke_extension_action } from './puppeteer/puppeteer_runner.js';
|
import { invoke_extension_action } from './puppeteer/puppeteer_runner.js';
|
||||||
|
|
||||||
export async function execute_action_and_record(params) {
|
export async function execute_action_and_record(params) {
|
||||||
const { action_name, action_payload, source } = params;
|
const { action_name, action_payload, source, keep_browser_open } = params;
|
||||||
|
|
||||||
const request_payload = safe_json_stringify(action_payload || {});
|
const request_payload = safe_json_stringify(action_payload || {});
|
||||||
|
|
||||||
@@ -12,12 +12,14 @@ export async function execute_action_and_record(params) {
|
|||||||
let error_message = null;
|
let error_message = null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = await invoke_extension_action(action_name, action_payload || {});
|
const result_obj = await invoke_extension_action(action_name, action_payload || {}, {
|
||||||
|
keep_browser_open: keep_browser_open === true
|
||||||
|
});
|
||||||
|
|
||||||
ok = true;
|
ok = true;
|
||||||
result_payload = safe_json_stringify(result);
|
result_payload = safe_json_stringify(result_obj);
|
||||||
|
|
||||||
return result;
|
return result_obj.result;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
ok = false;
|
ok = false;
|
||||||
error_message = (err && err.message) || String(err);
|
error_message = (err && err.message) || String(err);
|
||||||
|
|||||||
Reference in New Issue
Block a user