feat: new file

Daniel
2026-03-18 18:57:58 +08:00
commit d0ff049899
31 changed files with 1507 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
from typing import Any
import httpx
from ..settings import settings
def _rule_based_summary(query: str, retrieved: list[dict[str, Any]]) -> dict[str, Any]:
top = retrieved[:3]
bullets = []
for r in top:
pid = r.get("product_id") or r.get("id") or "-"
title = r.get("title") or ""
follow = r.get("follow_score")
life = r.get("lifecycle")
bullets.append(f"- {pid} {title}(跟卖指数={follow} 生命周期={life}")
return {
"mode": "rules_only",
"query": query,
"retrieved": retrieved,
"answer": "基于当前向量库/指标的规则摘要:\n" + "\n".join(bullets),
}
def generate_insight(query: str, product_id: str | None, top_k: int) -> dict[str, Any]:
"""
    Minimal runnable loop for now:
    - vector retrieval (returns an empty list until implemented)
    - if OPENAI_API_KEY is set, call the LLM for structured suggestions
    - otherwise return the rule-engine summary
"""
    # TODO: wire up vector-store retrieval (Milvus/Azure, etc.). Keep the response contract for now so the frontend stays usable.
retrieved: list[dict[str, Any]] = []
if not settings.openai_api_key:
return _rule_based_summary(query, retrieved)
prompt = f"""你是电商数据分析与选品决策助手。
用户问题:{query}
请输出一个“发现爆款 -> 数据验证 -> 决策跟卖”的闭环建议,包含:
1) 结论摘要3-5条
2) 数据证据(引用关键指标:销量/增速/竞争/生命周期)
3) 风险点与反例至少3条
4) 可执行动作(选品、备货、投流、供应链)
如果没有足够数据,请明确说明缺口,并给出最小补充数据清单。
"""
headers = {"Authorization": f"Bearer {settings.openai_api_key}"}
payload = {
"model": settings.openai_model,
"input": prompt,
}
try:
with httpx.Client(timeout=30.0) as client:
r = client.post("https://api.openai.com/v1/responses", headers=headers, json=payload)
r.raise_for_status()
data = r.json()
text = ""
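            # the Responses API returns a list of output items, each with content parts; collect the text parts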
for item in data.get("output", []):
for c in item.get("content", []):
if c.get("type") in ("output_text", "text"):
text += c.get("text", "")
return {"mode": "llm", "query": query, "retrieved": retrieved, "answer": text.strip()}
except Exception as e:
out = _rule_based_summary(query, retrieved)
out["mode"] = "rules_fallback"
out["error"] = str(e)
return out
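
For reference, a minimal smoke test of the flow above (the import path is an assumption based on this file's relative imports; without OPENAI_API_KEY it exercises the rules-only path):

from app.services.insight import generate_insight  # assumed module path

result = generate_insight("find breakout pet-supply products", product_id=None, top_k=5)
# with no OPENAI_API_KEY configured, mode == "rules_only"
print(result["mode"], result["answer"])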

View File

@@ -0,0 +1,105 @@
from __future__ import annotations
import json
import logging
from typing import Any
import pandas as pd
from sqlalchemy import Engine, text
from ..db import get_engine
from ..services.schema_discovery import discover_schema
from ..settings import settings
log = logging.getLogger("db_sample")
def _truncate_value(v: Any, max_len: int) -> Any:
if v is None:
return None
if isinstance(v, (int, float, bool)):
return v
s = str(v)
if len(s) <= max_len:
return s
return s[: max_len - 3] + "..."
def _df_to_records(df: pd.DataFrame, max_str_len: int) -> list[dict[str, Any]]:
out: list[dict[str, Any]] = []
for _, row in df.iterrows():
rec: dict[str, Any] = {}
for k, v in row.items():
rec[str(k)] = _truncate_value(v, max_str_len)
out.append(rec)
return out
def _list_tables(engine: Engine) -> list[str]:
with engine.connect() as conn:
rows = conn.execute(
text(
"""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = DATABASE()
ORDER BY table_name
"""
)
).all()
return [r[0] for r in rows]
def _table_row_count(engine: Engine, table: str) -> int | None:
try:
with engine.connect() as conn:
v = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
return int(v) if v is not None else None
except Exception:
return None
def _sample_table(engine: Engine, table: str, limit: int) -> pd.DataFrame:
with engine.connect() as conn:
return pd.read_sql(text(f"SELECT * FROM {table} LIMIT {limit}"), conn)
def print_db_sample_to_logs(limit: int | None = None) -> None:
"""
    Print the database schema plus sample rows to the backend logs for analysis.
    Note: long string values are truncated to keep the logs from exploding.
"""
engine = get_engine()
schema = discover_schema(engine)
eff_limit = int(limit or settings.debug_db_sample_limit)
max_str_len = int(settings.debug_db_sample_max_str_len)
tables = _list_tables(engine)
log.warning("DB SAMPLE: discovered schema=%s", schema.model_dump())
log.warning("DB SAMPLE: tables=%s", tables)
# prioritize discovered tables first
prioritized: list[str] = []
for t in [schema.sales_table, schema.products_table]:
if t and t in tables and t not in prioritized:
prioritized.append(t)
for t in tables:
if t not in prioritized:
prioritized.append(t)
    for t in prioritized[:8]:  # slicing already caps at the list length
cnt = _table_row_count(engine, t)
try:
df = _sample_table(engine, t, eff_limit)
recs = _df_to_records(df, max_str_len=max_str_len)
log.warning(
"DB SAMPLE: table=%s rows=%s cols=%s sample=%s",
t,
cnt,
list(df.columns),
json.dumps(recs, ensure_ascii=False),
)
except Exception as e:
log.warning("DB SAMPLE: table=%s rows=%s sample_failed=%s", t, cnt, str(e))

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
def forecast_next_n(y: np.ndarray, n: int) -> np.ndarray:
"""
    Lightweight forecast: prefer Holt-Winters (reasonably stable even on short series); if fitting fails, fall back to a simple moving average.
"""
y = np.asarray(y, dtype=float)
if y.size < 3:
return np.repeat(y[-1] if y.size else 0.0, n)
try:
model = ExponentialSmoothing(
y,
trend="add",
seasonal=None,
initialization_method="estimated",
)
fit = model.fit(optimized=True)
return np.asarray(fit.forecast(n), dtype=float)
except Exception:
window = int(min(7, max(3, y.size // 2)))
avg = float(np.mean(y[-window:])) if y.size else 0.0
return np.repeat(avg, n)
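
A self-contained check of both paths on synthetic data (numbers are illustrative; the module path is an assumption):

import numpy as np

from forecast import forecast_next_n  # assumed module path

# trending series: Holt-Winters extrapolates the upward drift
print(forecast_next_n(np.array([10, 12, 15, 14, 18, 21, 24, 27], dtype=float), n=3))

# fewer than 3 points: the last value is simply repeated
print(forecast_next_n(np.array([5.0, 7.0]), n=3))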

View File

@@ -0,0 +1,223 @@
from __future__ import annotations
from pydantic import BaseModel
from sqlalchemy import Engine, text
class DiscoveredSchema(BaseModel):
# discovered table names
sales_table: str | None = None
products_table: str | None = None
# required columns (in sales_table)
sales_product_id_col: str | None = None
sales_time_col: str | None = None
sales_units_col: str | None = None
sales_amount_col: str | None = None
# optional product cols
product_title_col: str | None = None
product_created_col: str | None = None
product_rank_col: str | None = None
product_category_col: str | None = None
product_desc_col: str | None = None
@property
def overview_sql(self) -> str:
# minimal, safe aggregations
t = self.sales_table
pid = self.sales_product_id_col
ts = self.sales_time_col
units = self.sales_units_col
amount = self.sales_amount_col
return f"""
SELECT
COUNT(DISTINCT {pid}) AS products,
SUM(COALESCE({units}, 0)) AS units_30d,
SUM(COALESCE({amount}, 0)) AS gmv_30d,
COUNT(*) AS rows_30d
FROM {t}
WHERE {ts} >= (UTC_TIMESTAMP() - INTERVAL 30 DAY)
"""
@property
def timeseries_sql(self) -> str:
t = self.sales_table
pid = self.sales_product_id_col
ts = self.sales_time_col
units = self.sales_units_col
amount = self.sales_amount_col
return f"""
SELECT
DATE({ts}) AS ds,
SUM(COALESCE({units}, 0)) AS units,
SUM(COALESCE({amount}, 0)) AS gmv
FROM {t}
WHERE {pid} = :product_id
AND {ts} >= :since
GROUP BY DATE({ts})
ORDER BY ds ASC
"""
@property
def trend_candidates_sql(self) -> str:
# produce per-product last-N-day rollups; join products when available
t = self.sales_table
pid = self.sales_product_id_col
ts = self.sales_time_col
units = self.sales_units_col
amount = self.sales_amount_col
p = self.products_table
title = self.product_title_col
created = self.product_created_col
rank = self.product_rank_col
cat = self.product_category_col
join = ""
if p:
join = f"LEFT JOIN {p} p ON p.{pid} = s.{pid}" if self._products_has_same_pid_name else f""
# if we can't confidently join, still return sales-only metrics
select_p = ""
if p and join:
title_expr = f"p.{title}" if title else "NULL"
cat_expr = f"p.{cat}" if cat else "NULL"
created_expr = f"p.{created}" if created else "NULL"
rank_expr = f"p.{rank}" if rank else "NULL"
select_p = f""",
{title_expr} AS title,
{cat_expr} AS category,
{created_expr} AS created_at,
{rank_expr} AS rank_now
"""
return f"""
SELECT
s.{pid} AS product_id,
SUM(COALESCE(s.{units}, 0)) AS units,
SUM(COALESCE(s.{amount}, 0)) AS gmv,
COUNT(*) AS records,
MIN(s.{ts}) AS first_seen,
MAX(s.{ts}) AS last_seen
{select_p}
FROM {t} s
{join}
WHERE s.{ts} >= :since
GROUP BY s.{pid}
ORDER BY units DESC
LIMIT :limit
"""
@property
def _products_has_same_pid_name(self) -> bool:
# discovery sets this attribute dynamically
return getattr(self, "__products_has_same_pid_name", False)
    def set_products_pid_same(self, v: bool) -> None:
        # pydantic BaseModel rejects setattr for undeclared fields, so bypass its __setattr__
        object.__setattr__(self, "__products_has_same_pid_name", v)
SALES_UNITS_CANDIDATES = ["units", "qty", "quantity", "sales", "sold", "order_qty", "num"]
SALES_AMOUNT_CANDIDATES = ["amount", "gmv", "revenue", "pay_amount", "total", "price", "order_amount"]
TIME_CANDIDATES = ["created_at", "create_time", "created", "ts", "timestamp", "date_time", "paid_at", "order_time"]
PID_CANDIDATES = ["product_id", "item_id", "sku_id", "goods_id", "asin"]
PRODUCT_TITLE_CANDIDATES = ["title", "name", "product_name", "item_title"]
PRODUCT_DESC_CANDIDATES = ["description", "desc", "detail"]
PRODUCT_CREATED_CANDIDATES = ["created_at", "create_time", "created"]
PRODUCT_RANK_CANDIDATES = ["rank", "bsr_rank", "position"]
PRODUCT_CATEGORY_CANDIDATES = ["category", "cat", "category_name"]
def _lower(s: str | None) -> str:
return (s or "").lower()
def _pick(cols: list[str], candidates: list[str]) -> str | None:
cols_l = {_lower(c): c for c in cols}
for cand in candidates:
if cand in cols_l:
return cols_l[cand]
return None
def discover_schema(engine: Engine) -> DiscoveredSchema:
"""
    Best-effort automatic discovery when the table structure is unknown:
    - prefer a table containing product_id + timestamp + units/amount columns as sales_table
    - pick a table containing title/name-like columns as products_table
"""
with engine.connect() as conn:
rows = conn.execute(
text(
"""
SELECT table_name, column_name
FROM information_schema.columns
WHERE table_schema = DATABASE()
ORDER BY table_name, ordinal_position
"""
)
).all()
by_table: dict[str, list[str]] = {}
for t, c in rows:
by_table.setdefault(t, []).append(c)
best_sales: tuple[int, str, dict[str, str]] | None = None
best_products: tuple[int, str, dict[str, str]] | None = None
for t, cols in by_table.items():
pid = _pick(cols, PID_CANDIDATES)
ts = _pick(cols, TIME_CANDIDATES)
units = _pick(cols, SALES_UNITS_CANDIDATES)
amount = _pick(cols, SALES_AMOUNT_CANDIDATES)
score = 0
if pid:
score += 3
if ts:
score += 3
if units:
score += 2
if amount:
score += 1
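        # pid and ts are mandatory (3 + 3 points); units/amount only refine the ranking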
if score >= 6:
if best_sales is None or score > best_sales[0]:
best_sales = (score, t, {"pid": pid, "ts": ts, "units": units, "amount": amount})
title = _pick(cols, PRODUCT_TITLE_CANDIDATES)
if title:
pscore = 2
if _pick(cols, PID_CANDIDATES):
pscore += 2
if _pick(cols, PRODUCT_CATEGORY_CANDIDATES):
pscore += 1
if _pick(cols, PRODUCT_DESC_CANDIDATES):
pscore += 1
if best_products is None or pscore > best_products[0]:
best_products = (pscore, t, {"title": title})
schema = DiscoveredSchema()
if best_sales:
_, t, m = best_sales
schema.sales_table = t
schema.sales_product_id_col = m["pid"]
schema.sales_time_col = m["ts"]
schema.sales_units_col = m["units"] or m["amount"] # last resort
schema.sales_amount_col = m["amount"] or m["units"]
if best_products:
_, pt, _ = best_products
schema.products_table = pt
cols = by_table.get(pt, [])
schema.product_title_col = _pick(cols, PRODUCT_TITLE_CANDIDATES)
schema.product_desc_col = _pick(cols, PRODUCT_DESC_CANDIDATES)
schema.product_created_col = _pick(cols, PRODUCT_CREATED_CANDIDATES)
schema.product_rank_col = _pick(cols, PRODUCT_RANK_CANDIDATES)
schema.product_category_col = _pick(cols, PRODUCT_CATEGORY_CANDIDATES)
schema.set_products_pid_same(_pick(cols, PID_CANDIDATES) == schema.sales_product_id_col)
return schema
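
A sketch of running discovery against a MySQL engine and inspecting the generated SQL (the DSN and module path are placeholders):

from sqlalchemy import create_engine

from schema_discovery import discover_schema  # assumed module path

engine = create_engine("mysql+pymysql://user:pass@localhost/shop")  # placeholder DSN
schema = discover_schema(engine)
print(schema.model_dump())  # which tables/columns were picked
if schema.sales_table:
    print(schema.overview_sql)  # aggregation SQL built from the discovered names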

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
import pandas as pd
def normalize_timeseries(df: pd.DataFrame, ts_col: str, value_cols: list[str]) -> list[dict]:
df = df.copy()
df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")
df = df.dropna(subset=[ts_col]).sort_values(ts_col)
out = []
for _, r in df.iterrows():
p = {"ds": r[ts_col].to_pydatetime().isoformat()}
for c in value_cols:
            v = r.get(c)
            try:
                # pd.notna also filters NaN, which float() would otherwise pass through
                p[c] = float(v) if pd.notna(v) else 0.0
            except Exception:
                p[c] = 0.0
out.append(p)
return out
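
A minimal example of the output shape (dates and values are made up; the invalid date is coerced to NaT and dropped, and the missing value becomes 0.0):

import pandas as pd

from timeseries import normalize_timeseries  # assumed module path

df = pd.DataFrame({"ds": ["2026-03-01", "bad-date", "2026-03-02"], "units": [3, 1, None]})
print(normalize_timeseries(df, ts_col="ds", value_cols=["units"]))
# -> [{'ds': '2026-03-01T00:00:00', 'units': 3.0}, {'ds': '2026-03-02T00:00:00', 'units': 0.0}]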

View File

@@ -0,0 +1,108 @@
from __future__ import annotations
import math
import numpy as np
import pandas as pd
def _sigmoid(x: float) -> float:
return 1.0 / (1.0 + math.exp(-x))
def compute_trend_scores(df: pd.DataFrame) -> pd.DataFrame:
"""
    With an uncertain table structure, build explainable scores from whatever fields are available:
    - latent-phase detection (potential winners): newly listed / just appeared + relatively strong metrics
    - burst: blend tiktok/search style fields when present, otherwise proxy via sales/GMV scale and freshness
    - decision support: follow-sell index + lifecycle warning (missing fields are automatically down-weighted)
"""
out = df.copy()
now = pd.Timestamp.now(tz="UTC")
    # new-product / latent phase: recent first_seen plus relatively high units/gmv -> potential
first_seen = pd.to_datetime(out.get("first_seen"), errors="coerce", utc=True)
age_days = (now - first_seen).dt.total_seconds() / 86400.0
age_days = age_days.fillna(999.0).clip(lower=0.0)
units = pd.to_numeric(out.get("units"), errors="coerce").fillna(0.0)
gmv = pd.to_numeric(out.get("gmv"), errors="coerce").fillna(0.0)
    # scale normalization: log1p to damp noise
units_s = np.log1p(units)
gmv_s = np.log1p(gmv)
freshness = 1.0 / (1.0 + (age_days / 7.0)) # 0~1
scale = (units_s.rank(pct=True) * 0.6 + gmv_s.rank(pct=True) * 0.4).clip(0.0, 1.0)
out["potential_score"] = (freshness * 0.55 + scale * 0.45).clip(0.0, 1.0)
    # burst: prefer blending the optional external-heat fields
tiktok_raw = out["tiktok_hot"] if "tiktok_hot" in out.columns else pd.Series(np.nan, index=out.index)
search_raw = out["search_growth"] if "search_growth" in out.columns else pd.Series(np.nan, index=out.index)
tiktok = pd.to_numeric(tiktok_raw, errors="coerce")
search_g = pd.to_numeric(search_raw, errors="coerce")
if tiktok.notna().any() or search_g.notna().any():
tiktok_s = tiktok.fillna(tiktok.median() if tiktok.notna().any() else 0.0)
search_s = search_g.fillna(search_g.median() if search_g.notna().any() else 0.0)
        burst = (
            tiktok_s.rank(pct=True) * 0.6
            + search_s.rank(pct=True) * 0.4
        ).clip(0.0, 1.0)
else:
        # no external heat fields: use scale + freshness as a proxy
burst = (scale * 0.65 + freshness * 0.35).clip(0.0, 1.0)
out["burst_score"] = burst
    # follow-sell index: competition (more records) counts against it; margin/supply difficulty are down-weighted when missing
records = pd.to_numeric(out.get("records"), errors="coerce").fillna(0.0)
    competition = records.rank(pct=True).clip(0.0, 1.0)  # higher = more crowded
margin_raw = out["margin"] if "margin" in out.columns else pd.Series(np.nan, index=out.index)
margin = pd.to_numeric(margin_raw, errors="coerce")
if margin.notna().any():
margin_s = margin.fillna(margin.median()).rank(pct=True).clip(0.0, 1.0)
margin_w = 0.35
else:
margin_s = pd.Series(0.5, index=out.index)
margin_w = 0.15
supply_raw = out["supply_difficulty"] if "supply_difficulty" in out.columns else pd.Series(np.nan, index=out.index)
supply = pd.to_numeric(supply_raw, errors="coerce")
if supply.notna().any():
        supply_s = (1.0 - supply.fillna(supply.median()).rank(pct=True)).clip(0.0, 1.0)  # harder sourcing = lower score
supply_w = 0.20
else:
supply_s = pd.Series(0.5, index=out.index)
supply_w = 0.10
    # trend contributes positively
trend = (out["potential_score"] * 0.5 + out["burst_score"] * 0.5).clip(0.0, 1.0)
trend_w = 0.45
comp_w = 0.20
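    # NOTE: with all optional fields present the weights sum to 1.20 and rely on the final clip; with the degraded defaults they sum to 0.90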
follow = (
trend * trend_w
+ margin_s * margin_w
+ supply_s * supply_w
+ (1.0 - competition) * comp_w
)
out["follow_score"] = follow.clip(0.0, 1.0)
    # lifecycle warning (simplified): old + stagnant + high competition => red ocean / decline
lifecycle = []
for i in out.index:
a = float(age_days.loc[i])
comp = float(competition.loc[i])
tr = float(trend.loc[i])
if a > 120 and comp > 0.7 and tr < 0.4:
lifecycle.append("decline_or_red_ocean")
elif a > 60 and comp > 0.75:
lifecycle.append("red_ocean")
elif a < 21 and tr > 0.65:
lifecycle.append("early_growth")
else:
lifecycle.append("normal")
out["lifecycle"] = lifecycle
return out
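
A quick end-to-end check with a toy rollup frame (values are illustrative; optional columns such as margin and tiktok_hot are omitted, so the degraded defaults apply):

import pandas as pd

from trend_scores import compute_trend_scores  # assumed module path

df = pd.DataFrame(
    {
        "product_id": ["A", "B"],
        "units": [500, 40],
        "gmv": [9000.0, 700.0],
        "records": [120, 8],
        "first_seen": ["2026-03-10", "2025-10-01"],
    }
)
scored = compute_trend_scores(df)
print(scored[["product_id", "potential_score", "burst_score", "follow_score", "lifecycle"]])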