feat: new file

Daniel
2026-03-18 18:57:58 +08:00
commit d0ff049899
31 changed files with 1507 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
from typing import Any
import httpx
from ..settings import settings
def _rule_based_summary(query: str, retrieved: list[dict[str, Any]]) -> dict[str, Any]:
top = retrieved[:3]
bullets = []
for r in top:
pid = r.get("product_id") or r.get("id") or "-"
title = r.get("title") or ""
follow = r.get("follow_score")
life = r.get("lifecycle")
bullets.append(f"- {pid} {title}(跟卖指数={follow} 生命周期={life}")
return {
"mode": "rules_only",
"query": query,
"retrieved": retrieved,
"answer": "基于当前向量库/指标的规则摘要:\n" + "\n".join(bullets),
}
def generate_insight(query: str, product_id: str | None, top_k: int) -> dict[str, Any]:
"""
    Minimal runnable loop for now:
    - vector retrieval (returns an empty list until implemented)
    - if OPENAI_API_KEY is set, call the LLM for structured suggestions
    - otherwise return the rule-engine summary
"""
    # TODO: wire up vector-store retrieval (Milvus/Azure, etc.). Keep the response contract for now so the frontend stays usable.
retrieved: list[dict[str, Any]] = []
if not settings.openai_api_key:
return _rule_based_summary(query, retrieved)
prompt = f"""你是电商数据分析与选品决策助手。
用户问题:{query}
请输出一个“发现爆款 -> 数据验证 -> 决策跟卖”的闭环建议,包含:
1) 结论摘要3-5条
2) 数据证据(引用关键指标:销量/增速/竞争/生命周期)
3) 风险点与反例至少3条
4) 可执行动作(选品、备货、投流、供应链)
如果没有足够数据,请明确说明缺口,并给出最小补充数据清单。
"""
headers = {"Authorization": f"Bearer {settings.openai_api_key}"}
payload = {
"model": settings.openai_model,
"input": prompt,
}
try:
with httpx.Client(timeout=30.0) as client:
r = client.post("https://api.openai.com/v1/responses", headers=headers, json=payload)
r.raise_for_status()
data = r.json()
text = ""
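            # the Responses API returns a list of output items, each with content parts; collect the text parts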
for item in data.get("output", []):
for c in item.get("content", []):
if c.get("type") in ("output_text", "text"):
text += c.get("text", "")
return {"mode": "llm", "query": query, "retrieved": retrieved, "answer": text.strip()}
except Exception as e:
out = _rule_based_summary(query, retrieved)
out["mode"] = "rules_fallback"
out["error"] = str(e)
return out
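
For reference, a minimal smoke test of the flow above (the import path is an assumption based on this file's relative imports; without OPENAI_API_KEY it exercises the rules-only path):

from app.services.insight import generate_insight  # assumed module path

result = generate_insight("find breakout pet-supply products", product_id=None, top_k=5)
# with no OPENAI_API_KEY configured, mode == "rules_only"
print(result["mode"], result["answer"])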

View File

@@ -0,0 +1,105 @@
from __future__ import annotations
import json
import logging
from typing import Any
import pandas as pd
from sqlalchemy import Engine, text
from ..db import get_engine
from ..services.schema_discovery import discover_schema
from ..settings import settings
log = logging.getLogger("db_sample")
def _truncate_value(v: Any, max_len: int) -> Any:
if v is None:
return None
if isinstance(v, (int, float, bool)):
return v
s = str(v)
if len(s) <= max_len:
return s
return s[: max_len - 3] + "..."
def _df_to_records(df: pd.DataFrame, max_str_len: int) -> list[dict[str, Any]]:
out: list[dict[str, Any]] = []
for _, row in df.iterrows():
rec: dict[str, Any] = {}
for k, v in row.items():
rec[str(k)] = _truncate_value(v, max_str_len)
out.append(rec)
return out
def _list_tables(engine: Engine) -> list[str]:
with engine.connect() as conn:
rows = conn.execute(
text(
"""
SELECT table_name
FROM information_schema.tables
WHERE table_schema = DATABASE()
ORDER BY table_name
"""
)
).all()
return [r[0] for r in rows]
def _table_row_count(engine: Engine, table: str) -> int | None:
try:
with engine.connect() as conn:
v = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
return int(v) if v is not None else None
except Exception:
return None
def _sample_table(engine: Engine, table: str, limit: int) -> pd.DataFrame:
with engine.connect() as conn:
return pd.read_sql(text(f"SELECT * FROM {table} LIMIT {limit}"), conn)
def print_db_sample_to_logs(limit: int | None = None) -> None:
"""
    Print the database schema plus sample rows to the backend logs for analysis.
    Note: long string values are truncated to keep the logs from exploding.
"""
engine = get_engine()
schema = discover_schema(engine)
eff_limit = int(limit or settings.debug_db_sample_limit)
max_str_len = int(settings.debug_db_sample_max_str_len)
tables = _list_tables(engine)
log.warning("DB SAMPLE: discovered schema=%s", schema.model_dump())
log.warning("DB SAMPLE: tables=%s", tables)
# prioritize discovered tables first
prioritized: list[str] = []
for t in [schema.sales_table, schema.products_table]:
if t and t in tables and t not in prioritized:
prioritized.append(t)
for t in tables:
if t not in prioritized:
prioritized.append(t)
    for t in prioritized[:8]:  # slicing already caps at the list length
cnt = _table_row_count(engine, t)
try:
df = _sample_table(engine, t, eff_limit)
recs = _df_to_records(df, max_str_len=max_str_len)
log.warning(
"DB SAMPLE: table=%s rows=%s cols=%s sample=%s",
t,
cnt,
list(df.columns),
json.dumps(recs, ensure_ascii=False),
)
except Exception as e:
log.warning("DB SAMPLE: table=%s rows=%s sample_failed=%s", t, cnt, str(e))

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
def forecast_next_n(y: np.ndarray, n: int) -> np.ndarray:
"""
    Lightweight forecast: prefer Holt-Winters (reasonably stable even on short series); if fitting fails, fall back to a simple moving average.
"""
y = np.asarray(y, dtype=float)
if y.size < 3:
return np.repeat(y[-1] if y.size else 0.0, n)
try:
model = ExponentialSmoothing(
y,
trend="add",
seasonal=None,
initialization_method="estimated",
)
fit = model.fit(optimized=True)
return np.asarray(fit.forecast(n), dtype=float)
except Exception:
window = int(min(7, max(3, y.size // 2)))
avg = float(np.mean(y[-window:])) if y.size else 0.0
return np.repeat(avg, n)
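
A self-contained check of both paths on synthetic data (numbers are illustrative; the module path is an assumption):

import numpy as np

from forecast import forecast_next_n  # assumed module path

# trending series: Holt-Winters extrapolates the upward drift
print(forecast_next_n(np.array([10, 12, 15, 14, 18, 21, 24, 27], dtype=float), n=3))

# fewer than 3 points: the last value is simply repeated
print(forecast_next_n(np.array([5.0, 7.0]), n=3))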

View File

@@ -0,0 +1,223 @@
from __future__ import annotations
from pydantic import BaseModel
from sqlalchemy import Engine, text
class DiscoveredSchema(BaseModel):
# discovered table names
sales_table: str | None = None
products_table: str | None = None
# required columns (in sales_table)
sales_product_id_col: str | None = None
sales_time_col: str | None = None
sales_units_col: str | None = None
sales_amount_col: str | None = None
# optional product cols
product_title_col: str | None = None
product_created_col: str | None = None
product_rank_col: str | None = None
product_category_col: str | None = None
product_desc_col: str | None = None
@property
def overview_sql(self) -> str:
# minimal, safe aggregations
t = self.sales_table
pid = self.sales_product_id_col
ts = self.sales_time_col
units = self.sales_units_col
amount = self.sales_amount_col
return f"""
SELECT
COUNT(DISTINCT {pid}) AS products,
SUM(COALESCE({units}, 0)) AS units_30d,
SUM(COALESCE({amount}, 0)) AS gmv_30d,
COUNT(*) AS rows_30d
FROM {t}
WHERE {ts} >= (UTC_TIMESTAMP() - INTERVAL 30 DAY)
"""
@property
def timeseries_sql(self) -> str:
t = self.sales_table
pid = self.sales_product_id_col
ts = self.sales_time_col
units = self.sales_units_col
amount = self.sales_amount_col
return f"""
SELECT
DATE({ts}) AS ds,
SUM(COALESCE({units}, 0)) AS units,
SUM(COALESCE({amount}, 0)) AS gmv
FROM {t}
WHERE {pid} = :product_id
AND {ts} >= :since
GROUP BY DATE({ts})
ORDER BY ds ASC
"""
@property
def trend_candidates_sql(self) -> str:
# produce per-product last-N-day rollups; join products when available
t = self.sales_table
pid = self.sales_product_id_col
ts = self.sales_time_col
units = self.sales_units_col
amount = self.sales_amount_col
p = self.products_table
title = self.product_title_col
created = self.product_created_col
rank = self.product_rank_col
cat = self.product_category_col
join = ""
if p:
join = f"LEFT JOIN {p} p ON p.{pid} = s.{pid}" if self._products_has_same_pid_name else f""
# if we can't confidently join, still return sales-only metrics
select_p = ""
if p and join:
title_expr = f"p.{title}" if title else "NULL"
cat_expr = f"p.{cat}" if cat else "NULL"
created_expr = f"p.{created}" if created else "NULL"
rank_expr = f"p.{rank}" if rank else "NULL"
select_p = f""",
{title_expr} AS title,
{cat_expr} AS category,
{created_expr} AS created_at,
{rank_expr} AS rank_now
"""
return f"""
SELECT
s.{pid} AS product_id,
SUM(COALESCE(s.{units}, 0)) AS units,
SUM(COALESCE(s.{amount}, 0)) AS gmv,
COUNT(*) AS records,
MIN(s.{ts}) AS first_seen,
MAX(s.{ts}) AS last_seen
{select_p}
FROM {t} s
{join}
WHERE s.{ts} >= :since
GROUP BY s.{pid}
ORDER BY units DESC
LIMIT :limit
"""
@property
def _products_has_same_pid_name(self) -> bool:
# discovery sets this attribute dynamically
return getattr(self, "__products_has_same_pid_name", False)
    def set_products_pid_same(self, v: bool) -> None:
        # pydantic BaseModel rejects setattr for undeclared fields, so bypass its __setattr__
        object.__setattr__(self, "__products_has_same_pid_name", v)
SALES_UNITS_CANDIDATES = ["units", "qty", "quantity", "sales", "sold", "order_qty", "num"]
SALES_AMOUNT_CANDIDATES = ["amount", "gmv", "revenue", "pay_amount", "total", "price", "order_amount"]
TIME_CANDIDATES = ["created_at", "create_time", "created", "ts", "timestamp", "date_time", "paid_at", "order_time"]
PID_CANDIDATES = ["product_id", "item_id", "sku_id", "goods_id", "asin"]
PRODUCT_TITLE_CANDIDATES = ["title", "name", "product_name", "item_title"]
PRODUCT_DESC_CANDIDATES = ["description", "desc", "detail"]
PRODUCT_CREATED_CANDIDATES = ["created_at", "create_time", "created"]
PRODUCT_RANK_CANDIDATES = ["rank", "bsr_rank", "position"]
PRODUCT_CATEGORY_CANDIDATES = ["category", "cat", "category_name"]
def _lower(s: str | None) -> str:
return (s or "").lower()
def _pick(cols: list[str], candidates: list[str]) -> str | None:
cols_l = {_lower(c): c for c in cols}
for cand in candidates:
if cand in cols_l:
return cols_l[cand]
return None
def discover_schema(engine: Engine) -> DiscoveredSchema:
"""
    Best-effort automatic discovery when the table structure is unknown:
    - prefer a table containing product_id + timestamp + units/amount columns as sales_table
    - pick a table containing title/name-like columns as products_table
"""
with engine.connect() as conn:
rows = conn.execute(
text(
"""
SELECT table_name, column_name
FROM information_schema.columns
WHERE table_schema = DATABASE()
ORDER BY table_name, ordinal_position
"""
)
).all()
by_table: dict[str, list[str]] = {}
for t, c in rows:
by_table.setdefault(t, []).append(c)
best_sales: tuple[int, str, dict[str, str]] | None = None
best_products: tuple[int, str, dict[str, str]] | None = None
for t, cols in by_table.items():
pid = _pick(cols, PID_CANDIDATES)
ts = _pick(cols, TIME_CANDIDATES)
units = _pick(cols, SALES_UNITS_CANDIDATES)
amount = _pick(cols, SALES_AMOUNT_CANDIDATES)
score = 0
if pid:
score += 3
if ts:
score += 3
if units:
score += 2
if amount:
score += 1
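        # pid and ts are mandatory (3 + 3 points); units/amount only refine the ranking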
if score >= 6:
if best_sales is None or score > best_sales[0]:
best_sales = (score, t, {"pid": pid, "ts": ts, "units": units, "amount": amount})
title = _pick(cols, PRODUCT_TITLE_CANDIDATES)
if title:
pscore = 2
if _pick(cols, PID_CANDIDATES):
pscore += 2
if _pick(cols, PRODUCT_CATEGORY_CANDIDATES):
pscore += 1
if _pick(cols, PRODUCT_DESC_CANDIDATES):
pscore += 1
if best_products is None or pscore > best_products[0]:
best_products = (pscore, t, {"title": title})
schema = DiscoveredSchema()
if best_sales:
_, t, m = best_sales
schema.sales_table = t
schema.sales_product_id_col = m["pid"]
schema.sales_time_col = m["ts"]
schema.sales_units_col = m["units"] or m["amount"] # last resort
schema.sales_amount_col = m["amount"] or m["units"]
if best_products:
_, pt, _ = best_products
schema.products_table = pt
cols = by_table.get(pt, [])
schema.product_title_col = _pick(cols, PRODUCT_TITLE_CANDIDATES)
schema.product_desc_col = _pick(cols, PRODUCT_DESC_CANDIDATES)
schema.product_created_col = _pick(cols, PRODUCT_CREATED_CANDIDATES)
schema.product_rank_col = _pick(cols, PRODUCT_RANK_CANDIDATES)
schema.product_category_col = _pick(cols, PRODUCT_CATEGORY_CANDIDATES)
schema.set_products_pid_same(_pick(cols, PID_CANDIDATES) == schema.sales_product_id_col)
return schema
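
A sketch of running discovery against a MySQL engine and inspecting the generated SQL (the DSN and module path are placeholders):

from sqlalchemy import create_engine

from schema_discovery import discover_schema  # assumed module path

engine = create_engine("mysql+pymysql://user:pass@localhost/shop")  # placeholder DSN
schema = discover_schema(engine)
print(schema.model_dump())  # which tables/columns were picked
if schema.sales_table:
    print(schema.overview_sql)  # aggregation SQL built from the discovered names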

View File

@@ -0,0 +1,21 @@
from __future__ import annotations
import pandas as pd
def normalize_timeseries(df: pd.DataFrame, ts_col: str, value_cols: list[str]) -> list[dict]:
df = df.copy()
df[ts_col] = pd.to_datetime(df[ts_col], errors="coerce")
df = df.dropna(subset=[ts_col]).sort_values(ts_col)
out = []
for _, r in df.iterrows():
p = {"ds": r[ts_col].to_pydatetime().isoformat()}
for c in value_cols:
            v = r.get(c)
            try:
                # pd.notna also filters NaN, which float() would otherwise pass through
                p[c] = float(v) if pd.notna(v) else 0.0
            except Exception:
                p[c] = 0.0
out.append(p)
return out
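
A minimal example of the output shape (dates and values are made up; the invalid date is coerced to NaT and dropped, and the missing value becomes 0.0):

import pandas as pd

from timeseries import normalize_timeseries  # assumed module path

df = pd.DataFrame({"ds": ["2026-03-01", "bad-date", "2026-03-02"], "units": [3, 1, None]})
print(normalize_timeseries(df, ts_col="ds", value_cols=["units"]))
# -> [{'ds': '2026-03-01T00:00:00', 'units': 3.0}, {'ds': '2026-03-02T00:00:00', 'units': 0.0}]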

View File

@@ -0,0 +1,108 @@
from __future__ import annotations
import math
import numpy as np
import pandas as pd
def _sigmoid(x: float) -> float:
return 1.0 / (1.0 + math.exp(-x))
def compute_trend_scores(df: pd.DataFrame) -> pd.DataFrame:
"""
    With an uncertain table structure, build explainable scores from whatever fields are available:
    - latent-phase detection (potential winners): newly listed / just appeared + relatively strong metrics
    - burst: blend tiktok/search style fields when present, otherwise proxy via sales/GMV scale and freshness
    - decision support: follow-sell index + lifecycle warning (missing fields are automatically down-weighted)
"""
out = df.copy()
now = pd.Timestamp.now(tz="UTC")
    # new-product / latent phase: recent first_seen plus relatively high units/gmv -> potential
first_seen = pd.to_datetime(out.get("first_seen"), errors="coerce", utc=True)
age_days = (now - first_seen).dt.total_seconds() / 86400.0
age_days = age_days.fillna(999.0).clip(lower=0.0)
units = pd.to_numeric(out.get("units"), errors="coerce").fillna(0.0)
gmv = pd.to_numeric(out.get("gmv"), errors="coerce").fillna(0.0)
    # scale normalization: log1p to damp noise
units_s = np.log1p(units)
gmv_s = np.log1p(gmv)
freshness = 1.0 / (1.0 + (age_days / 7.0)) # 0~1
scale = (units_s.rank(pct=True) * 0.6 + gmv_s.rank(pct=True) * 0.4).clip(0.0, 1.0)
out["potential_score"] = (freshness * 0.55 + scale * 0.45).clip(0.0, 1.0)
    # burst: prefer blending the optional external-heat fields
tiktok_raw = out["tiktok_hot"] if "tiktok_hot" in out.columns else pd.Series(np.nan, index=out.index)
search_raw = out["search_growth"] if "search_growth" in out.columns else pd.Series(np.nan, index=out.index)
tiktok = pd.to_numeric(tiktok_raw, errors="coerce")
search_g = pd.to_numeric(search_raw, errors="coerce")
if tiktok.notna().any() or search_g.notna().any():
tiktok_s = tiktok.fillna(tiktok.median() if tiktok.notna().any() else 0.0)
search_s = search_g.fillna(search_g.median() if search_g.notna().any() else 0.0)
        burst = (
            tiktok_s.rank(pct=True) * 0.6
            + search_s.rank(pct=True) * 0.4
        ).clip(0.0, 1.0)
else:
        # no external heat fields: use scale + freshness as a proxy
burst = (scale * 0.65 + freshness * 0.35).clip(0.0, 1.0)
out["burst_score"] = burst
    # follow-sell index: competition (more records) counts against it; margin/supply difficulty are down-weighted when missing
records = pd.to_numeric(out.get("records"), errors="coerce").fillna(0.0)
    competition = records.rank(pct=True).clip(0.0, 1.0)  # higher = more crowded
margin_raw = out["margin"] if "margin" in out.columns else pd.Series(np.nan, index=out.index)
margin = pd.to_numeric(margin_raw, errors="coerce")
if margin.notna().any():
margin_s = margin.fillna(margin.median()).rank(pct=True).clip(0.0, 1.0)
margin_w = 0.35
else:
margin_s = pd.Series(0.5, index=out.index)
margin_w = 0.15
supply_raw = out["supply_difficulty"] if "supply_difficulty" in out.columns else pd.Series(np.nan, index=out.index)
supply = pd.to_numeric(supply_raw, errors="coerce")
if supply.notna().any():
        supply_s = (1.0 - supply.fillna(supply.median()).rank(pct=True)).clip(0.0, 1.0)  # harder sourcing = lower score
supply_w = 0.20
else:
supply_s = pd.Series(0.5, index=out.index)
supply_w = 0.10
    # trend contributes positively
trend = (out["potential_score"] * 0.5 + out["burst_score"] * 0.5).clip(0.0, 1.0)
trend_w = 0.45
comp_w = 0.20
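    # NOTE: with all optional fields present the weights sum to 1.20 and rely on the final clip; with the degraded defaults they sum to 0.90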
follow = (
trend * trend_w
+ margin_s * margin_w
+ supply_s * supply_w
+ (1.0 - competition) * comp_w
)
out["follow_score"] = follow.clip(0.0, 1.0)
    # lifecycle warning (simplified): old + stagnant + high competition => red ocean / decline
lifecycle = []
for i in out.index:
a = float(age_days.loc[i])
comp = float(competition.loc[i])
tr = float(trend.loc[i])
if a > 120 and comp > 0.7 and tr < 0.4:
lifecycle.append("decline_or_red_ocean")
elif a > 60 and comp > 0.75:
lifecycle.append("red_ocean")
elif a < 21 and tr > 0.65:
lifecycle.append("early_growth")
else:
lifecycle.append("normal")
out["lifecycle"] = lifecycle
return out
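
A quick end-to-end check with a toy rollup frame (values are illustrative; optional columns such as margin and tiktok_hot are omitted, so the degraded defaults apply):

import pandas as pd

from trend_scores import compute_trend_scores  # assumed module path

df = pd.DataFrame(
    {
        "product_id": ["A", "B"],
        "units": [500, 40],
        "gmv": [9000.0, 700.0],
        "records": [120, 8],
        "first_seen": ["2026-03-10", "2025-10-01"],
    }
)
scored = compute_trend_scores(df)
print(scored[["product_id", "potential_score", "burst_score", "follow_score", "lifecycle"]])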