fix: 修复爬虫问题

2026-03-02 17:20:31 +08:00
parent 33e4786cd0
commit 0027074b8b
21 changed files with 523 additions and 16 deletions
--- a/crawler/news_storage.py
+++ b/crawler/news_storage.py
@@ -0,0 +1,141 @@
+# -*- coding: utf-8 -*-
+"""
+资讯内容独立存储，支持历史去重
+爬虫拉回数据 → 计算 content_hash → 若已存在则跳过（去重）→ 新数据落库 news_content
+"""
+import hashlib
+import os
+import re
+import sqlite3
+from datetime import datetime, timezone
+from typing import List, Optional, Tuple
+
+from config import DB_PATH
+
+
+def _to_utc_iso(dt: datetime) -> str:
+    """Format ``dt`` as an ISO-8601 string with a fixed ``.000Z`` suffix.
+
+    Timezone-aware values are converted to UTC before formatting. Naive
+    values are formatted unchanged, i.e. their fields are emitted as they are.
+    Sub-second precision is always written as ``.000``.
+    """
+    normalized = dt.astimezone(timezone.utc) if dt.tzinfo else dt
+    return normalized.strftime("%Y-%m-%dT%H:%M:%S.000Z")
+
+
+def _normalize_for_hash(text: str) -> str:
+    """Canonicalize ``text`` before it is fed into the dedup hash.
+
+    Falsy input yields an empty string. Otherwise the value is stringified,
+    trimmed, lower-cased, has every whitespace run collapsed to one space,
+    is clipped to 600 characters, and finally has any remaining ASCII
+    control characters (code points 0-31) removed.
+    """
+    if not text:
+        return ""
+    lowered = str(text).strip().lower()
+    collapsed = re.sub(r"\s+", " ", lowered)
+    # Clipping happens before control characters are dropped, so the result
+    # may be shorter than 600 characters.
+    clipped = collapsed[:600]
+    return re.sub(r"[\x00-\x1f]", "", clipped)
+
+
+def content_hash(title: str, summary: str, url: str) -> str:
+    """Return the 32-hex-character dedup key for a news item.
+
+    The key is the truncated SHA-256 of the normalized title, the normalized
+    summary and the whitespace-trimmed URL joined with ``|``. Items whose
+    three parts are equal after that processing are treated as duplicates.
+    The URL is only trimmed; it is not lower-cased or otherwise normalized.
+    """
+    parts = (
+        _normalize_for_hash(title),
+        _normalize_for_hash(summary),
+        (url or "").strip(),
+    )
+    digest = hashlib.sha256("|".join(parts).encode("utf-8"))
+    return digest.hexdigest()[:32]
+
+
+def _ensure_table(conn: sqlite3.Connection) -> None:
+    """Create the ``news_content`` table and its indexes if missing, then commit.
+
+    The statements all use ``IF NOT EXISTS``, so calling this repeatedly on
+    the same database is harmless. Note that the final ``commit()`` also
+    commits whatever transaction is currently open on ``conn``.
+    """
+    index_statements = (
+        "CREATE UNIQUE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)",
+        "CREATE INDEX IF NOT EXISTS idx_news_content_pub ON news_content(published_at DESC)",
+    )
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS news_content (
+            id TEXT PRIMARY KEY,
+            content_hash TEXT NOT NULL UNIQUE,
+            title TEXT NOT NULL,
+            summary TEXT NOT NULL,
+            url TEXT NOT NULL DEFAULT '',
+            source TEXT NOT NULL DEFAULT '',
+            published_at TEXT NOT NULL,
+            category TEXT NOT NULL DEFAULT 'other',
+            severity TEXT NOT NULL DEFAULT 'medium',
+            created_at TEXT NOT NULL DEFAULT (datetime('now'))
+        )
+    """)
+    for ddl in index_statements:
+        try:
+            conn.execute(ddl)
+        except sqlite3.OperationalError:
+            # Index creation is best effort: a failure here is ignored and
+            # the remaining statements still run.
+            # NOTE(review): presumably guards against pre-existing tables
+            # with a different schema -- confirm.
+            continue
+    conn.commit()
+
+
+def exists_by_hash(conn: sqlite3.Connection, h: str) -> bool:
+    """Return ``True`` when ``news_content`` already holds a row whose ``content_hash`` equals ``h``."""
+    cursor = conn.execute("SELECT 1 FROM news_content WHERE content_hash = ? LIMIT 1", (h,))
+    return cursor.fetchone() is not None
+
+
+def insert_news(
+    conn: sqlite3.Connection,
+    *,
+    title: str,
+    summary: str,
+    url: str = "",
+    source: str = "",
+    published: datetime,
+    category: str = "other",
+    severity: str = "medium",
+) -> Optional[str]:
+    """Insert one news item unless an item with the same content hash exists.
+
+    The table is ensured first (which commits ``conn``), then the dedup hash
+    is computed from ``title``, ``summary`` and ``url``. Stored text is
+    clipped to 500 (title), 2000 (summary), 500 (url) and 100 (source)
+    characters. A successful insert is committed immediately.
+
+    Returns:
+        The generated ``nc_``-prefixed id, or ``None`` when the item is a
+        duplicate and was skipped.
+
+    Raises:
+        sqlite3.IntegrityError: if the insert violates a constraint for a
+            reason other than a duplicate content hash.
+    """
+    _ensure_table(conn)
+    h = content_hash(title, summary, url)
+    if exists_by_hash(conn, h):
+        return None
+    # The id only has to be unique, so the hash is salted with the current
+    # time. An aware "now" replaces the deprecated naive datetime.utcnow().
+    seed = f"{h}{datetime.now(timezone.utc).isoformat()}"
+    uid = "nc_" + hashlib.sha256(seed.encode()).hexdigest()[:14]
+    ts = _to_utc_iso(published)
+    row = (
+        uid,
+        h,
+        (title or "")[:500],
+        (summary or "")[:2000],
+        (url or "")[:500],
+        (source or "")[:100],
+        ts,
+        category,
+        severity,
+    )
+    try:
+        conn.execute(
+            """INSERT INTO news_content (id, content_hash, title, summary, url, source, published_at, category, severity)
+           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+            row,
+        )
+    except sqlite3.IntegrityError:
+        # Another writer may have stored the same hash between the existence
+        # check and this insert. _ensure_table() committed just above, so the
+        # rollback only ends the transaction opened by the failed INSERT.
+        conn.rollback()
+        if exists_by_hash(conn, h):
+            return None
+        raise
+    conn.commit()
+    return uid
+
+
+def _coerce_published(value: object) -> datetime:
+    """Turn a crawler-supplied ``published`` value into a datetime, defaulting to now (UTC)."""
+    if isinstance(value, datetime):
+        return value
+    try:
+        if isinstance(value, str):
+            return datetime.fromisoformat(value.replace("Z", "+00:00"))
+        if isinstance(value, (int, float)) and not isinstance(value, bool):
+            # Numbers are interpreted as epoch seconds.
+            return datetime.fromtimestamp(value, tz=timezone.utc)
+    except (ValueError, OverflowError, OSError):
+        # Unparseable or out-of-range input falls through to the default.
+        pass
+    return datetime.now(timezone.utc)
+
+
+def save_and_dedup(items: List[dict], db_path: Optional[str] = None) -> Tuple[List[dict], int]:
+    """Persist crawled items into ``news_content``, skipping duplicates.
+
+    Args:
+        items: dicts with ``title``, ``summary``, ``url``, ``published``,
+            ``category``, ``severity`` and optionally ``source``. A missing
+            summary falls back to the title. ``published`` may be a
+            datetime, an ISO-8601 string or epoch seconds; anything else
+            (or an unparseable value) becomes the current UTC time. A
+            missing or ``None`` category/severity becomes ``"other"`` /
+            ``"medium"``.
+        db_path: SQLite file to write to; defaults to ``DB_PATH``.
+
+    Returns:
+        ``(new_items, count)`` where ``new_items`` are the inputs that were
+        actually inserted, each copied with an added ``news_id`` key, and
+        ``count`` is their number. If the database file does not exist,
+        nothing is created and ``([], 0)`` is returned.
+    """
+    path = db_path or DB_PATH
+    if not os.path.exists(path):
+        return [], 0
+    conn = sqlite3.connect(path, timeout=10)
+    try:
+        _ensure_table(conn)
+        new_items: List[dict] = []
+        for u in items:
+            category = u.get("category")
+            severity = u.get("severity")
+            # insert_news commits each row itself, so rows stored before a
+            # later failure stay persisted.
+            uid = insert_news(
+                conn,
+                title=(u.get("title") or "")[:500],
+                summary=(u.get("summary") or u.get("title") or "")[:2000],
+                url=(u.get("url") or "")[:500],
+                source=(u.get("source") or "")[:100],
+                published=_coerce_published(u.get("published")),
+                # An explicit None would violate the NOT NULL columns.
+                category="other" if category is None else category,
+                severity="medium" if severity is None else severity,
+            )
+            if uid:
+                new_items.append({**u, "news_id": uid})
+        return new_items, len(new_items)
+    finally:
+        conn.close()