from __future__ import annotations

import json
import re
from datetime import datetime, timedelta, timezone
from pathlib import Path

from dateutil import parser as date_parser
from pydantic import ValidationError

from app.core.config import get_settings
from app.core.logging import logger
from app.domain.schemas import ExtractResponse, JobCard, Salary, SkillScore, WorkerCard
from app.services.llm_client import LLMClient
from app.utils.ids import generate_id
from app.utils.prompts import load_prompt


class ExtractionService:
    """Turn free-form text into ``JobCard`` / ``WorkerCard`` objects.

    Each public ``extract_*`` method first asks the LLM client for a
    structured result (with schema-aware repair retries) and, when that
    yields nothing usable, falls back to keyword/regex rules driven by the
    sample ``skills.json`` / ``categories.json`` / ``regions.json`` files.
    """

    # Fixed UTC+8 offset used to resolve relative dates such as "明天".
    _SHANGHAI_TZ = timezone(timedelta(hours=8))

    # Period words that shift an explicit 1-11 o'clock hour into the afternoon.
    _PM_MARKERS = ("下午", "傍晚", "晚上")

    _JOB_TAGS = ("女生优先", "男生优先", "有经验优先", "沟通好", "可连做优先")
    _EXPERIENCE_TAGS = ("商场", "会展", "活动执行", "物流", "零售", "客服中心", "快消", "校园推广")
    _LOCATION_MARKERS = ("会展中心", "商场", "地铁站", "园区", "写字楼", "仓库", "门店")
    _CATEGORY_SKILLS = {
        "活动执行": ("签到", "引导", "登记"),
        "促销": ("促销", "导购", "陈列"),
        "配送": ("配送", "装卸", "司机协助"),
        "客服": ("客服", "电话邀约", "线上客服"),
    }

    def __init__(self) -> None:
        """Load settings, the sample lookup tables and the LLM client."""
        self.settings = get_settings()
        self.skills = self._load_sample("skills.json")
        self.categories = self._load_sample("categories.json")
        self.regions = self._load_sample("regions.json")
        self.llm_client = LLMClient(self.settings)

    def _load_sample(self, filename: str):
        """Parse one UTF-8 JSON file from ``settings.sample_data_dir``."""
        path = self.settings.sample_data_dir / filename
        return json.loads(path.read_text(encoding="utf-8"))

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract_job(self, text: str) -> ExtractResponse:
        """Extract a ``JobCard`` from *text* (LLM first, rules as fallback)."""
        logger.info("extract_job request text=%s", text)
        return self._extract_card(text, "job_extract.md", JobCard, self._extract_job_rule, "job")

    def extract_worker(self, text: str) -> ExtractResponse:
        """Extract a ``WorkerCard`` from *text* (LLM first, rules as fallback)."""
        logger.info("extract_worker request text=%s", text)
        return self._extract_card(text, "worker_extract.md", WorkerCard, self._extract_worker_rule, "worker")

    def _extract_card(self, text: str, prompt_name: str, schema_cls, rule_extractor, label: str) -> ExtractResponse:
        """Shared LLM-then-rules pipeline behind ``extract_job`` / ``extract_worker``.

        Args:
            text: Raw user text.
            prompt_name: File name of the prompt inside ``settings.prompt_dir``.
            schema_cls: Card model the LLM output must validate against.
            rule_extractor: Rule-based fallback taking *text* and returning a card.
            label: Word used in the failure log message ("job" / "worker").

        Returns:
            A successful response carrying the card, or a failed response
            listing validation errors when the rule-based card is invalid.
        """
        llm_card = self._llm_extract_with_retry(text, self.settings.prompt_dir / prompt_name, schema_cls)
        if llm_card:
            return ExtractResponse(success=True, data=llm_card)
        try:
            card = rule_extractor(text)
            return ExtractResponse(success=True, data=card)
        except ValidationError as exc:
            logger.exception("Rule %s extraction validation failed", label)
            return ExtractResponse(success=False, errors=[str(exc)], missing_fields=self._missing_fields(exc))

    # ------------------------------------------------------------------
    # LLM path
    # ------------------------------------------------------------------

    def _llm_extract(self, text: str, prompt_path: Path):
        """Run one LLM extraction; return ``None`` (after logging) on any failure."""
        try:
            return self.llm_client.extract_json(load_prompt(prompt_path), text)
        except Exception:
            logger.exception("LLM extraction failed, fallback to rule-based extraction")
            return None

    def _validate_llm_payload(self, schema_cls, payload):
        """Try to build ``schema_cls`` from an LLM payload.

        Returns:
            ``(card, [])`` on success, otherwise ``(None, error_fields)``.
            A payload that is not a dict cannot be expanded into keyword
            arguments, so it is reported as invalid with no specific fields
            instead of raising ``TypeError``.
        """
        if not isinstance(payload, dict):
            return None, []
        try:
            return schema_cls(**payload), []
        except ValidationError as exc:
            return None, self._missing_fields(exc)

    def _llm_extract_with_retry(self, text: str, prompt_path: Path, schema_cls):
        """Extract via the LLM and repair schema violations with bounded retries.

        The first answer is validated against *schema_cls*. If it is invalid,
        up to ``settings.extraction_llm_max_retries`` repair rounds are made,
        each one feeding the previous output and the offending fields back to
        the model.

        Returns:
            A validated ``schema_cls`` instance, or ``None`` when the LLM path
            fails for any reason (the caller then falls back to rules).
        """
        llm_result = self._llm_extract(text, prompt_path)
        if not llm_result:
            return None

        last_output = llm_result.content
        card, error_fields = self._validate_llm_payload(schema_cls, last_output)
        if card is not None:
            return card
        logger.warning("LLM extraction validation failed, trying schema-aware retry")

        # The prompt is only needed for repair rounds; load it here, guarded,
        # so a prompt-loading problem degrades to the rule-based fallback
        # instead of escaping to the caller.
        try:
            base_prompt = load_prompt(prompt_path)
        except Exception:
            logger.exception("LLM schema-aware retry failed")
            return None

        for _ in range(self.settings.extraction_llm_max_retries):
            repair_prompt = self._build_repair_prompt(base_prompt, schema_cls, error_fields)
            try:
                repair_result = self.llm_client.extract_json(
                    repair_prompt,
                    self._build_repair_input(text, last_output, error_fields),
                )
            except Exception:
                logger.exception("LLM schema-aware retry failed")
                return None
            if not repair_result:
                return None
            last_output = repair_result.content
            card, error_fields = self._validate_llm_payload(schema_cls, last_output)
            if card is not None:
                return card
            logger.warning("LLM schema-aware retry still invalid missing_fields=%s", error_fields)
        return None

    def _build_repair_prompt(self, base_prompt: str, schema_cls, missing_fields: list[str]) -> str:
        """Compose the system prompt for a repair round.

        Appends repair instructions, the offending field list (or
        ``unknown``) and the model's JSON schema to *base_prompt*.
        """
        schema_json = json.dumps(schema_cls.model_json_schema(), ensure_ascii=False)
        return (
            f"{base_prompt}\n\n"
            "你是结构化修复助手。请严格输出可被 JSON 解析的对象,不要输出解释文字。\n"
            "目标是根据给定 schema 修复字段缺失和类型错误,优先保证必填字段完整。\n"
            f"缺失或错误字段: {', '.join(missing_fields) if missing_fields else 'unknown'}\n"
            f"JSON Schema: {schema_json}\n"
        )

    def _build_repair_input(self, original_text: str, last_output: dict, missing_fields: list[str]) -> str:
        """Compose the user message for a repair round.

        Contains the original text, the previous (invalid) extraction and the
        fields to focus on, the latter two serialized as JSON.
        """
        return (
            f"原始文本:\n{original_text}\n\n"
            f"上一次抽取结果:\n{json.dumps(last_output, ensure_ascii=False)}\n\n"
            f"请重点修复字段:\n{json.dumps(missing_fields, ensure_ascii=False)}"
        )

    # ------------------------------------------------------------------
    # Rule-based path
    # ------------------------------------------------------------------

    def _extract_job_rule(self, text: str) -> JobCard:
        """Build a ``JobCard`` from keyword and regex matches in *text*.

        Fields that cannot be found fall back to fixed defaults (category
        "活动执行", one person, 4 hours, tag "有经验优先").

        Raises:
            ValidationError: If the assembled values violate the ``JobCard`` schema.
        """
        skill_hits = [item for item in self.skills if item in text]
        category = next((item for item in self.categories if item in text), "活动执行")
        region = self._extract_region(text)
        salary = self._extract_salary(text)
        # The look-ahead keeps "4个小时" (a duration) from being read as 4 people.
        headcount = self._extract_number(text, [r"(\d+)\s*[个名人位](?!\s*(?:小时|钟头))"], default=1)
        # "个?" also accepts the colloquial "4个小时" form.
        duration = self._extract_number(text, [r"(\d+(?:\.\d+)?)\s*个?\s*小时"], default=4.0, cast=float)
        tags = [tag for tag in self._JOB_TAGS if tag in text]
        title = f"{category}{skill_hits[0]}兼职" if skill_hits else f"{category}兼职"
        return JobCard(
            job_id=generate_id("job"),
            title=title,
            category=category,
            description=text,
            skills=skill_hits[:5] or self._guess_category_skills(category),
            city=region["city"],
            region=region["region"],
            location_detail=self._extract_location(text, region),
            start_time=self._extract_job_time(text),
            duration_hours=duration,
            headcount=int(headcount),
            salary=salary,
            work_mode="排班制" if "排班" in text else "兼职",
            tags=tags or ["有经验优先"],
            confidence=self._compute_confidence(skill_hits, region, salary.amount > 0),
        )

    def _extract_worker_rule(self, text: str) -> WorkerCard:
        """Build a ``WorkerCard`` from keyword matches in *text*.

        At most six skills are kept; each gets a score that starts at 0.72 and
        grows by 0.04 per position. Cities/regions default to 深圳/南山 when
        none is mentioned.

        Raises:
            ValidationError: If the assembled values violate the ``WorkerCard`` schema.
        """
        skill_hits = [item for item in self.skills if item in text][:6]
        region_hits = [item for item in self.regions if item["region"] in text or item["city"] in text]
        # dict.fromkeys de-duplicates while keeping first-seen order.
        city_names = list(dict.fromkeys(item["city"] for item in region_hits)) or ["深圳"]
        region_names = list(dict.fromkeys(item["region"] for item in region_hits)) or ["南山"]
        experience = [item for item in self._EXPERIENCE_TAGS if item in text]
        scored_skills = [
            SkillScore(name=item, score=round(0.72 + index * 0.04, 2))
            for index, item in enumerate(skill_hits or ["活动执行", "引导", "登记"])
        ]
        return WorkerCard(
            worker_id=generate_id("worker"),
            name=self._extract_name(text),
            description=text,
            skills=scored_skills,
            cities=city_names,
            regions=region_names,
            availability=self._extract_availability(text),
            experience_tags=experience or ["活动执行"],
            reliability_score=0.76,
            profile_completion=0.68,
            confidence=self._compute_confidence(
                skill_hits, {"city": city_names[0], "region": region_names[0]}, True
            ),
        )

    def _extract_region(self, text: str) -> dict:
        """Return the first known region mentioned in *text*.

        An entry whose city AND region both appear wins over an entry matched
        by region name alone; with no match the 深圳/南山 default is returned.
        """
        for item in self.regions:
            if item["city"] in text and item["region"] in text:
                return item
        for item in self.regions:
            if item["region"] in text:
                return item
        return {"city": "深圳", "region": "南山"}

    def _extract_location(self, text: str, region: dict) -> str:
        """Return city + region + the first venue keyword found, else a placeholder."""
        prefix = f"{region['city']}{region['region']}"
        marker = next((m for m in self._LOCATION_MARKERS if m in text), None)
        if marker is not None:
            return f"{prefix}{marker}"
        return f"{prefix}待定点位"

    def _extract_salary(self, text: str) -> Salary:
        """Parse the pay amount and decide between hourly and daily pay.

        The amount is the first number followed by 元/块 (default 150). Pay is
        hourly when the text contains "/小时" or "时薪", or a currency unit
        directly followed by a per-hour marker ("20元每小时", "20块一小时");
        otherwise it is daily.
        """
        amount = self._extract_number(text, [r"(\d+(?:\.\d+)?)\s*(?:元|块)"], default=150.0, cast=float)
        is_hourly = (
            "/小时" in text
            or "时薪" in text
            or re.search(r"(?:元|块)钱?\s*[／每一]\s*个?小时", text) is not None
        )
        return Salary(type="hourly" if is_hourly else "daily", amount=amount, currency="CNY")

    def _extract_number(self, text: str, patterns: list[str], default, cast=int):
        """Return ``cast(group 1)`` of the first pattern that matches, else *default*."""
        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return cast(match.group(1))
        return default

    def _extract_job_time(self, text: str) -> datetime:
        """Resolve the job start time mentioned in *text* (UTC+8, on the hour).

        Never raises on impossible dates or hours found in the text; such
        values are ignored and the defaults (tomorrow, 09:00) apply instead.
        """
        now = datetime.now(self._SHANGHAI_TZ)
        base = self._resolve_job_date(text, now)
        hour = self._resolve_job_hour(text)
        # Minutes are intentionally dropped: start times are on the hour.
        return base.replace(hour=hour, minute=0, second=0, microsecond=0)

    def _resolve_job_date(self, text: str, now: datetime) -> datetime:
        """Pick the calendar day for the job; defaults to tomorrow."""
        tomorrow = now + timedelta(days=1)
        if "明天" in text:
            return tomorrow
        # "大后天" contains "后天", so it has to be tested first.
        if "大后天" in text:
            return now + timedelta(days=3)
        if "后天" in text:
            return now + timedelta(days=2)

        month_day = re.search(r"(\d{1,2})月(\d{1,2})[日号]", text)
        if month_day:
            month, day = int(month_day.group(1)), int(month_day.group(2))
            # Try this year first; a date that is already past (or does not
            # exist this year, e.g. Feb 29) is retried in the following year.
            for year in (now.year, now.year + 1):
                try:
                    candidate = now.replace(year=year, month=month, day=day)
                except ValueError:
                    continue
                if candidate.date() >= now.date():
                    return candidate
        return tomorrow

    def _resolve_job_hour(self, text: str) -> int:
        """Pick the start hour: explicit "N点"/"N:"/"N时" wins over period defaults."""
        hour = 9
        if "下午" in text:
            hour = 13
        elif "晚上" in text:
            hour = 19

        explicit = re.search(r"(下午|傍晚|晚上)?\s*(\d{1,2})[:：点时]", text)
        if explicit:
            value = int(explicit.group(2))
            # Only a marker directly in front of the hour shifts it, so
            # "上午9点到下午5点" still resolves to 9.
            if explicit.group(1) in self._PM_MARKERS and 1 <= value < 12:
                value += 12
            # Out-of-range matches such as "99点" keep the default hour.
            if 0 <= value <= 23:
                hour = value
        return hour

    def _extract_availability(self, text: str) -> list[str]:
        """Map availability keywords in *text* to tags; defaults to ``["anytime"]``."""
        tags = []
        if "周末" in text:
            tags.append("weekend")
        if "上午" in text:
            tags.append("weekday_am")
        if "下午" in text:
            tags.append("weekday_pm")
        if any(word in text for word in ("随时", "都能", "全天")):
            tags.append("anytime")
        return tags or ["anytime"]

    def _extract_name(self, text: str) -> str:
        """Return the 2-4 Chinese characters after "我叫" (preferred) or "我是".

        Falls back to "匿名候选人" when neither phrase is present.
        """
        for intro in ("我叫", "我是"):
            if match := re.search(intro + r"([\u4e00-\u9fa5]{2,4})", text):
                return match.group(1)
        return "匿名候选人"

    def _guess_category_skills(self, category: str) -> list[str]:
        """Return default skills for *category*, or a generic pair if unknown."""
        # A fresh list is returned each call so callers can never mutate the
        # shared class-level table.
        return list(self._CATEGORY_SKILLS.get(category, ("活动执行", "沟通")))

    def _compute_confidence(self, skill_hits: list[str], region: dict, has_salary: bool) -> float:
        """Score extraction confidence: 0.55 base plus bonuses, capped at 0.95.

        Bonuses: +0.15 for any skill hit, +0.15 for a region with a city,
        +0.10 when *has_salary* is true.
        """
        score = 0.55
        if skill_hits:
            score += 0.15
        if region.get("city"):
            score += 0.15
        if has_salary:
            score += 0.1
        return min(round(score, 2), 0.95)

    def _missing_fields(self, exc: ValidationError) -> list[str]:
        """Return the dotted location path of every error in *exc*."""
        return [".".join(str(part) for part in item["loc"]) for item in exc.errors()]