fix: 修复爬虫问题

This commit is contained in:
Daniel
2026-03-02 17:20:31 +08:00
parent 33e4786cd0
commit 0027074b8b
21 changed files with 523 additions and 16 deletions

141
crawler/news_storage.py Normal file
View File

@@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""
资讯内容独立存储,支持历史去重
爬虫拉回数据 → 计算 content_hash → 若已存在则跳过(去重)→ 新数据落库 news_content
"""
import hashlib
import os
import re
import sqlite3
from datetime import datetime, timezone
from typing import List, Optional, Tuple
from config import DB_PATH
def _to_utc_iso(dt: datetime) -> str:
if dt.tzinfo:
dt = dt.astimezone(timezone.utc)
return dt.strftime("%Y-%m-%dT%H:%M:%S.000Z")
def _normalize_for_hash(text: str) -> str:
"""归一化文本用于生成去重 hash"""
if not text:
return ""
t = re.sub(r"\s+", " ", str(text).strip().lower())[:600]
return re.sub(r"[\x00-\x1f]", "", t)
def content_hash(title: str, summary: str, url: str) -> str:
"""根据标题、摘要、URL 生成去重 hash相似内容视为重复"""
raw = _normalize_for_hash(title) + "|" + _normalize_for_hash(summary) + "|" + (url or "").strip()
return hashlib.sha256(raw.encode("utf-8")).hexdigest()[:32]
def _ensure_table(conn: sqlite3.Connection) -> None:
conn.execute("""
CREATE TABLE IF NOT EXISTS news_content (
id TEXT PRIMARY KEY,
content_hash TEXT NOT NULL UNIQUE,
title TEXT NOT NULL,
summary TEXT NOT NULL,
url TEXT NOT NULL DEFAULT '',
source TEXT NOT NULL DEFAULT '',
published_at TEXT NOT NULL,
category TEXT NOT NULL DEFAULT 'other',
severity TEXT NOT NULL DEFAULT 'medium',
created_at TEXT NOT NULL DEFAULT (datetime('now'))
)
""")
try:
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_news_content_hash ON news_content(content_hash)")
except sqlite3.OperationalError:
pass
try:
conn.execute("CREATE INDEX IF NOT EXISTS idx_news_content_pub ON news_content(published_at DESC)")
except sqlite3.OperationalError:
pass
conn.commit()
def exists_by_hash(conn: sqlite3.Connection, h: str) -> bool:
row = conn.execute("SELECT 1 FROM news_content WHERE content_hash = ? LIMIT 1", (h,)).fetchone()
return row is not None
def insert_news(
conn: sqlite3.Connection,
*,
title: str,
summary: str,
url: str = "",
source: str = "",
published: datetime,
category: str = "other",
severity: str = "medium",
) -> Optional[str]:
"""
插入资讯,若 content_hash 已存在则跳过(去重)
返回: 新插入的 id或 None 表示重复跳过
"""
_ensure_table(conn)
h = content_hash(title, summary, url)
if exists_by_hash(conn, h):
return None
uid = "nc_" + hashlib.sha256(f"{h}{datetime.utcnow().isoformat()}".encode()).hexdigest()[:14]
ts = _to_utc_iso(published)
conn.execute(
"""INSERT INTO news_content (id, content_hash, title, summary, url, source, published_at, category, severity)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(uid, h, (title or "")[:500], (summary or "")[:2000], (url or "")[:500], (source or "")[:100], ts, category, severity),
)
conn.commit()
return uid
def save_and_dedup(items: List[dict], db_path: Optional[str] = None) -> Tuple[List[dict], int]:
"""
去重后落库 news_content
items: [{"title","summary","url","published","category","severity","source"?}, ...]
返回: (通过去重的新项列表, 实际新增条数)
"""
path = db_path or DB_PATH
if not os.path.exists(path):
return [], 0
conn = sqlite3.connect(path, timeout=10)
try:
_ensure_table(conn)
new_items: List[dict] = []
count = 0
for u in items:
title = (u.get("title") or "")[:500]
summary = (u.get("summary") or u.get("title") or "")[:2000]
url = (u.get("url") or "")[:500]
source = (u.get("source") or "")[:100]
pub = u.get("published")
if isinstance(pub, str):
try:
pub = datetime.fromisoformat(pub.replace("Z", "+00:00"))
except ValueError:
pub = datetime.now(timezone.utc)
elif pub is None:
pub = datetime.now(timezone.utc)
cat = u.get("category", "other")
sev = u.get("severity", "medium")
uid = insert_news(
conn,
title=title,
summary=summary,
url=url,
source=source,
published=pub,
category=cat,
severity=sev,
)
if uid:
count += 1
new_items.append({**u, "news_id": uid})
return new_items, count
finally:
conn.close()