Files
usa/scripts/check-crawler-data.cjs
2026-03-03 22:42:21 +08:00

141 lines
4.9 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
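/**
 * Inspect the data written by the crawler: row counts plus the most recent rows
 * of situation_update, news_content and gdelt_events.
 * Usage (from the project root): node scripts/check-crawler-data.cjs
 * Optional: start the crawler first (npm run gdelt), then start the API,
 * or simply run this script on its own to read the DB directly.
 */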
// Node built-ins: path resolution and the HTTP client used for the status probe.
const path = require('path')
const http = require('http')
// The project root is the parent of the directory containing this script.
const projectRoot = path.resolve(__dirname, '..')
// Switch the working directory to the project root before loading the DB module.
// NOTE(review): presumably server/db resolves its database file relative to cwd — confirm.
process.chdir(projectRoot)
// Project-local database module; this script uses db.initDb() and db.prepare().
const db = require('../server/db')
// Base URL of the crawler service; overridable through the CRAWLER_URL env var.
const CRAWLER_URL = process.env.CRAWLER_URL || 'http://localhost:8000'
// Row limit bound to the LIMIT ? placeholder of the situation_update and news_content queries.
const SHOW_ROWS = 10
/**
 * Best-effort probe of the crawler service's `GET /crawler/status` endpoint.
 *
 * The returned promise never rejects: any failure (invalid URL, connection
 * error, timeout, aborted response, non-JSON body) resolves to `null`, so the
 * caller can treat `null` as "crawler not running or unreachable".
 *
 * @param {string} [baseUrl=CRAWLER_URL] - Base URL of the crawler service.
 * @param {number} [timeoutMs=3000] - Socket inactivity timeout in milliseconds.
 * @returns {Promise<object|null>} Parsed JSON status, or `null` on any failure.
 */
function fetchCrawlerStatus(baseUrl = CRAWLER_URL, timeoutMs = 3000) {
  return new Promise((resolve) => {
    try {
      const url = new URL(`${baseUrl}/crawler/status`)
      // Pick the client matching the URL scheme; `https` is loaded lazily
      // because the default crawler URL is plain http.
      const transport = url.protocol === 'https:' ? require('https') : http
      // Passing the URL object keeps host, port (scheme default) and query string intact.
      const req = transport.request(url, { method: 'GET', timeout: timeoutMs }, (res) => {
        // Decode as a stream so multi-byte UTF-8 characters that straddle
        // chunk boundaries are not corrupted.
        res.setEncoding('utf8')
        let body = ''
        res.on('data', (chunk) => {
          body += chunk
        })
        res.on('end', () => {
          try {
            resolve(JSON.parse(body))
          } catch {
            resolve(null)
          }
        })
        res.on('error', () => resolve(null))
      })
      // The `timeout` option only emits an event; the request has to be
      // torn down explicitly or the promise would stay pending forever.
      req.on('timeout', () => {
        resolve(null)
        req.destroy()
      })
      req.on('error', () => resolve(null))
      req.end()
    } catch {
      // Invalid URL or unsupported protocol: report as unreachable.
      resolve(null)
    }
  })
}
/**
 * Shorten text for single-line display, appending an ellipsis only when
 * characters were actually removed.
 *
 * @param {string|null|undefined} text - Raw text; falsy values become ''.
 * @param {number} max - Maximum number of characters to keep.
 * @returns {string} The original text, or its first `max` characters plus '…'.
 */
function clipText(text, max) {
  const value = text || ''
  return value.length > max ? `${value.slice(0, max)}…` : value
}

/**
 * Read the total row count and a sample of recent rows from one table.
 * A failure is logged and leaves whatever was not read at its default
 * (count 0 / empty rows), so one missing table does not abort the report.
 *
 * @param {string} table - Table name (hard-coded by callers, never user input).
 * @param {string} rowsSql - Query returning the recent rows.
 * @param {...*} params - Values bound to the placeholders in `rowsSql`.
 * @returns {{count: number, rows: object[]}} Count and rows read so far.
 */
function readTable(table, rowsSql, ...params) {
  const result = { count: 0, rows: [] }
  try {
    result.count = db.prepare(`SELECT COUNT(*) as c FROM ${table}`).get().c
    result.rows = db.prepare(rowsSql).all(...params)
  } catch (e) {
    console.log(`${table} 表读取失败:`, e.message)
  }
  return result
}

/**
 * Print the crawler data report: the crawler service status (when reachable),
 * then the row count and most recent rows of situation_update, news_content
 * and gdelt_events.
 *
 * @returns {Promise<void>} Resolves once the whole report has been printed.
 */
async function run() {
  console.log('========================================')
  console.log('爬虫数据检查(条数 + 最近内容)')
  console.log('========================================\n')

  // ---------- Crawler service status (optional) ----------
  const status = await fetchCrawlerStatus()
  if (status) {
    console.log('--- 爬虫服务状态 GET /crawler/status ---')
    console.log(' db_path:', status.db_path)
    console.log(' db_exists:', status.db_exists)
    console.log(' situation_update_count:', status.situation_update_count)
    console.log(' last_fetch_items:', status.last_fetch_items, '(本轮抓取条数)')
    console.log(' last_fetch_inserted:', status.last_fetch_inserted, '(去重后新增)')
    if (status.last_fetch_error) console.log(' last_fetch_error:', status.last_fetch_error)
    console.log('')
  } else {
    console.log('--- 爬虫服务 ---')
    console.log(' 未启动或不可达:', CRAWLER_URL)
    console.log('')
  }

  // ---------- situation_update ----------
  const situation = readTable(
    'situation_update',
    'SELECT id, timestamp, category, summary, severity FROM situation_update ORDER BY timestamp DESC LIMIT ?',
    SHOW_ROWS
  )
  console.log('--- situation_update事件脉络---')
  console.log(' 总条数:', situation.count)
  if (situation.rows.length > 0) {
    console.log(' 最近', situation.rows.length, '条:')
    situation.rows.forEach((r, i) => {
      console.log(` ${i + 1}. [${r.timestamp}] ${r.category}/${r.severity} ${clipText(r.summary, 50)}`)
    })
  }
  console.log('')

  // ---------- news_content ----------
  const news = readTable(
    'news_content',
    'SELECT title, summary, source, published_at, category, severity FROM news_content ORDER BY published_at DESC LIMIT ?',
    SHOW_ROWS
  )
  console.log('--- news_content资讯表---')
  console.log(' 总条数:', news.count)
  if (news.rows.length > 0) {
    console.log(' 最近', news.rows.length, '条:')
    news.rows.forEach((r, i) => {
      console.log(` ${i + 1}. [${r.published_at || ''}] ${r.source || ''} ${clipText(r.title, 45)}`)
      if (r.summary) console.log(` summary: ${clipText(r.summary, 60)}`)
    })
  }
  console.log('')

  // ---------- gdelt_events ----------
  const gdelt = readTable(
    'gdelt_events',
    'SELECT event_id, event_time, title, impact_score FROM gdelt_events ORDER BY event_time DESC LIMIT 5'
  )
  console.log('--- gdelt_events地图冲突点---')
  console.log(' 总条数:', gdelt.count)
  if (gdelt.rows.length > 0) {
    // Report the number of rows actually returned, which may be below the limit.
    console.log(' 最近', gdelt.rows.length, '条:')
    gdelt.rows.forEach((r, i) => {
      console.log(` ${i + 1}. [${r.event_time}] impact=${r.impact_score} ${clipText(r.title, 50)}`)
    })
  }
  console.log('========================================')
}
// Entry point: initialise the database, then print the report.
// Any rejection from either step is reported on stderr and ends the process with exit code 1.
const reportFailure = (err) => {
  console.error('失败:', err.message)
  process.exit(1)
}
db.initDb()
  .then(() => run())
  .catch(reportFailure)