"""Interview quality helpers: known facts, persona threads, and anti-repeat guard.""" from __future__ import annotations import re from collections.abc import Iterable from langchain_core.messages import AIMessage, BaseMessage from app.agents.stage_constants import STAGE_DISPLAY_ZH, STAGE_SLOT_KEYS from app.agents.state_schema import ( KnownFact, MemoirStateSchema, PersonaThread, narrative_coverage_state, ) # 与 `apply_duplicate_question_guard` 中整段替换句一致;用于判定是否需触发二次生成。 DUPLICATE_QUESTION_GUARD_FALLBACK_ZH = "这一段我记住了。" _QUESTION_SPLIT_RE = re.compile(r"[??]+") _SENTENCE_SPLIT_RE = re.compile(r"(?<=[。!?!?])") _PUNCT_RE = re.compile(r"[\s,。!?;:、“”‘’()()《》【】\\[\\],.!?:;\"'`~·…-]+") # 「我演罗密欧」等扮演亲历,但排除「我演示…」类口癖 _AUTOBIO_IYAN_NOT_DEMO_RE = re.compile(r"我演(?!示)") _TRAIT_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = ( ("执着坚持", ("坚持", "执着", "咬牙", "熬过", "顶住", "训练", "反复")), ("规划目标感", ("计划", "规划", "目标", "打算", "一步步", "安排", "准备")), ("求真较真", ("弄明白", "搞清楚", "想通", "为什么", "较真", "求证")), ("行动力", ("决定", "创业", "开始做", "尝试", "报名", "跑去", "去做")), ( "家庭责任感", ("家里", "父母", "妈妈", "爸爸", "妻子", "丈夫", "孩子", "照顾", "支持"), ), ("即时反馈驱动", ("反馈", "看到结果", "成就感", "立刻", "马上见效")), ("自由天性", ("自由", "无拘无束", "满世界跑", "疯玩", "野", "不管")), ("动手创造", ("自己动手", "搭", "做", "造", "修", "拆", "烤", "生火", "种")), ("重感情念旧", ("想起来", "怀念", "舍不得", "还记得", "那时候", "小时候")), ("好胜争先", ("比赛", "赢", "比", "第一", "不服输", "较劲")), ) _SCENE_CUE_WORDS: tuple[tuple[str, str], ...] = ( ("田野", "田野的泥土和青草气息"), ("河里", "河水的凉意"), ("海边", "海风和咸咸的空气"), ("溜冰", "冰面上咔嚓咔嚓的声响"), ("游泳", "一头扎进水里的畅快"), ("烤红薯", "红薯外焦里糯、掰开冒热气的香味"), ("烤", "火堆噼啪响、烟气里混着食物焦香"), ("打水漂", "石片在水面跳跃、一圈一圈涟漪散开"), ("捉", "追着跑、手心攥紧怕跑掉的紧张"), ("雪", "雪花落在脸上化成水珠的凉"), ("风", "风灌进领子里的感觉"), ("下雨", "雨点打在屋顶上的声音"), ("自行车", "骑车下坡风呼呼吹过耳朵"), ("火车", "绿皮车厢里混着泡面和橘子皮的味道"), ("学校", "教室里粉笔灰飘在阳光里的样子"), ("考试", "翻卷子时纸张沙沙响"), ("工厂", "机器轰鸣、油污和铁锈的气味"), ("做饭", "锅铲碰锅底的声响、油花溅起来的滋滋声"), ) def extract_scene_cues(user_message: str) -> list[str]: msg = (user_message or "").strip() if not msg: return [] cues: list[str] = [] for keyword, description in _SCENE_CUE_WORDS: if keyword in msg: cues.append(description) return cues[:3] _SLOT_REPEAT_PATTERNS: dict[str, tuple[str, ...]] = { "place": ("哪里长大", "家乡", "老家", "在哪长大", "什么地方长大"), "people": ("谁对你影响", "家里都有谁", "小时候和谁", "身边有什么人"), "daily_life": ("平时怎么过", "日常都做什么", "小时候都玩什么"), "emotion": ("那时候什么感觉", "当时什么感受", "小时候开心吗"), "turning_event": ("印象最深的事", "难忘的事", "转折"), "school": ("什么学校", "在哪上学", "读的什么学校", "上什么学校"), "city": ("在哪个城市", "去了哪里读书", "在哪读书"), "motivation": ("为什么想学", "为什么选这个", "动力是什么"), "challenge": ("遇到什么困难", "最大的难处", "辛苦吗"), "change": ("后来有什么变化", "这件事怎么改变你", "之后有什么不同"), "job": ("做什么工作", "具体做什么", "工作内容是什么"), "environment": ("工作环境", "在哪工作", "什么单位"), "decision": ("为什么做这个决定", "怎么决定的"), "pressure": ("压力大吗", "最难的时候", "最大的压力"), "growth": ("学到了什么", "成长在哪里", "后来有什么提升"), "relationship": ("和家人关系怎么样", "和伴侣关系怎么样"), "conflict": ("有什么矛盾", "怎么吵起来的", "冲突"), "support": ("谁支持你", "谁帮过你", "怎么支持你的"), "responsibility": ("承担什么责任", "家里靠谁", "你主要负责什么"), "value": ("你最看重什么", "信念是什么", "原则是什么"), "regret": ("最大的遗憾", "后悔过吗"), "pride": ("最骄傲的事", "最自豪的事"), "lesson": ("学到了什么道理", "最大的感悟", "给你什么启发"), } def _normalize_text(text: str) -> str: return _PUNCT_RE.sub("", (text or "").strip().lower()) def _dedupe_keep_last(items: Iterable[str], *, limit: int) -> list[str]: out: list[str] = [] seen: set[str] = set() for raw in reversed([str(x).strip() for x in items if str(x).strip()]): key = _normalize_text(raw) if not key or key in seen: continue seen.add(key) out.append(raw) if len(out) >= limit: break out.reverse() return out def _merge_known_facts( existing: Iterable[KnownFact], additions: Iterable[KnownFact], *, limit: int = 24, ) -> list[KnownFact]: merged: dict[tuple[str, str, str], KnownFact] = {} for item in list(existing) + list(additions): key = ( (item.stage or "").strip(), (item.slot_name or "").strip(), _normalize_text(f"{item.label}:{item.value}"), ) if not key[2]: continue merged[key] = item values = list(merged.values())[-limit:] return values def _merge_persona_threads( existing: Iterable[PersonaThread], additions: Iterable[PersonaThread], *, limit: int = 12, ) -> list[PersonaThread]: merged: dict[tuple[str, str], PersonaThread] = {} for item in list(existing) + list(additions): key = (_normalize_text(item.trait), _normalize_text(item.evidence)) if not key[0]: continue merged[key] = item values = list(merged.values())[-limit:] return values def _trim_sentence(text: str, *, limit: int = 80) -> str: s = re.sub(r"\s+", " ", (text or "").strip()) if len(s) <= limit: return s return s[: limit - 1].rstrip() + "…" def build_runtime_interview_state( state: MemoirStateSchema, *, user_message: str, active_stage: str, birth_year: int | None = None, birth_place: str = "", grew_up_place: str = "", occupation: str = "", ) -> MemoirStateSchema: """Merge current-turn hints into a prompt-only state view.""" additions: list[KnownFact] = [] if birth_year: additions.append( KnownFact( label="出生年份", value=f"{birth_year}年", source="profile", ) ) if birth_place: additions.append( KnownFact( label="出生地", value=birth_place.strip(), source="profile", stage="childhood", slot_name="place", ) ) if grew_up_place: additions.append( KnownFact( label="成长地", value=grew_up_place.strip(), source="profile", stage="childhood", slot_name="place", ) ) if occupation: additions.append( KnownFact( label="职业背景", value=occupation.strip(), source="profile", stage="career", slot_name="job", ) ) msg = _trim_sentence(user_message, limit=120) if msg: additions.append( KnownFact( label="本轮新信息", value=msg, source="current_turn", stage=active_stage, ) ) persona_additions: list[PersonaThread] = [] narrative_state = narrative_coverage_state(state) haystack = " ".join( [msg] + [fact.value for fact in state.known_facts[-8:]] + list(narrative_state.filled_slots_for_stage(active_stage).values())[:4] ) for trait, markers in _TRAIT_HINTS: for marker in markers: if marker and marker in haystack: persona_additions.append( PersonaThread( trait=trait, evidence=_trim_sentence( marker if marker in msg else haystack, limit=70 ), source="heuristic", stage=active_stage, ) ) break return state.model_copy( update={ "known_facts": _merge_known_facts(state.known_facts, additions), "persona_threads": _merge_persona_threads( state.persona_threads, persona_additions ), } ) def extract_recent_questions( messages: Iterable[BaseMessage], *, limit: int = 4 ) -> list[str]: questions: list[str] = [] for msg in messages: if not isinstance(msg, AIMessage): continue text = str(getattr(msg, "content", "") or "").strip() if not text: continue for part in _QUESTION_SPLIT_RE.split(text): part = part.strip() if not part: continue if any(w in text for w in ("?", "?")): questions.append(_trim_sentence(part + "?", limit=50)) return _dedupe_keep_last(questions, limit=limit) def update_recent_questions( existing: Iterable[str], generated_segments: Iterable[str], *, limit: int = 4, ) -> list[str]: fresh: list[str] = list(existing) for seg in generated_segments: text = str(seg or "").strip() if not text or ("?" not in text and "?" not in text): continue parts = [p.strip() for p in _QUESTION_SPLIT_RE.split(text) if p.strip()] if not parts: continue fresh.append(_trim_sentence(parts[-1] + "?", limit=50)) return _dedupe_keep_last(fresh, limit=limit) def apply_duplicate_question_guard( segments: Iterable[str], *, state: MemoirStateSchema, recent_questions: Iterable[str], ) -> tuple[list[str], bool]: """Downgrade obvious repeated-fact questions into acknowledgment-only text.""" recent_norms = {_normalize_text(q) for q in recent_questions if _normalize_text(q)} known_patterns: list[str] = [] for fact in state.known_facts: slot_patterns = _SLOT_REPEAT_PATTERNS.get(fact.slot_name or "", ()) known_patterns.extend(slot_patterns) if fact.label == "本轮新信息": known_patterns.append(fact.value) cleaned: list[str] = [] touched = False for seg in segments: text = str(seg or "").strip() if not text: continue text_norm = _normalize_text(text) repeated = False if ("?" in text or "?" in text) and text_norm: if any(q and (q in text_norm or text_norm in q) for q in recent_norms): repeated = True if not repeated: for pattern in known_patterns: pat_norm = _normalize_text(pattern) if pat_norm and pat_norm in text_norm: repeated = True break if repeated: sentences = [s.strip() for s in _SENTENCE_SPLIT_RE.split(text) if s.strip()] kept = [s for s in sentences if "?" not in s and "?" not in s] replacement = kept[0] if kept else DUPLICATE_QUESTION_GUARD_FALLBACK_ZH if not replacement.endswith(("。", "!", "…")): replacement += "。" cleaned.append(replacement) touched = True else: cleaned.append(text) if not cleaned: cleaned = [DUPLICATE_QUESTION_GUARD_FALLBACK_ZH] return cleaned, touched def segments_are_only_duplicate_guard_fallback(segments: Iterable[str]) -> bool: """是否为「仅兜底_ack、无实质承接」——适合再打一枪模型。""" parts = [str(s or "").strip() for s in segments if str(s or "").strip()] return len(parts) == 1 and parts[0] == DUPLICATE_QUESTION_GUARD_FALLBACK_ZH # 助手可见回复中,明显声称「我本人有过某种人生经历」的高置信子串(偏保守、宁可漏网不误伤泛化共情)。 _AUTOBIOGRAPHICAL_MARKERS_ZH: tuple[str, ...] = ( "我小时候", "我小学", "我中学", "我初中", "我高中", "我大学", "我上学那", "我念书", "我读书那", "我暗恋", "我当时暗恋", "我爸妈", "我父亲", "我母亲", "我爹", "我妈", "我爷爷", "我奶奶", "我外公", "我外婆", "我前任", "我老公", "我老婆", "我丈夫", "我妻子", "我男友", "我女友", "我对象", "我儿子", "我女儿", "我孩子", "我以前也", "我当时也", "我那时候也", "我也经历过", "我也有过", "我也演过", "我也上台", "我演过", "我饰演", "我演出", "我演的是", "我演的", "我扮演", "感觉我熟", "这我熟", ) AUTOBIOGRAPHICAL_BOUNDARY_FALLBACK_ZH = ( "你刚说的这段很有画面,我想多听你讲讲那时候你心里是什么感觉。" ) def _segment_has_autobiographical_claim_zh(text: str) -> bool: s = (text or "").strip() if not s: return False if _AUTOBIO_IYAN_NOT_DEMO_RE.search(s): return True return any(m and m in s for m in _AUTOBIOGRAPHICAL_MARKERS_ZH) def apply_autobiographical_boundary_guard( segments: Iterable[str], ) -> tuple[list[str], bool]: """将明显带有「助手自传式经历」的段落替换为中性承接,避免身份越界。""" cleaned: list[str] = [] touched = False for seg in segments: text = str(seg or "").strip() if not text: continue if _segment_has_autobiographical_claim_zh(text): cleaned.append(AUTOBIOGRAPHICAL_BOUNDARY_FALLBACK_ZH) touched = True else: cleaned.append(text) if not cleaned: cleaned = [AUTOBIOGRAPHICAL_BOUNDARY_FALLBACK_ZH] touched = True return cleaned, touched def stage_slot_hint_lines(stage: str) -> list[str]: keys = STAGE_SLOT_KEYS.get(stage, ()) stage_zh = STAGE_DISPLAY_ZH.get(stage, stage) return [f"{stage_zh}:{key}" for key in keys]