soupstick committed
Commit ccb470a · 1 Parent(s): 112f78a

refactor: scaffold modular split (agent, tools, modules/*, llm_provider, mcp, validation, security)

agent.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement agent.py (split from app_monolith_backup.py)"""
app.py CHANGED
@@ -1,670 +1 @@
- """
- Fraud Detector Analyst — LangChain + (optional) MCP
- Advanced “prototype-first” build:
- - Chat uses chat-completion models (LangChain ChatHuggingFace).
- - AI Summary shows a notice when no inference is connected.
-
- LLM env (serverless friendly):
-   HF_TOKEN (or HF_SPACES)
-   LC_CHAT_MODEL (default: "Qwen/Qwen2.5-0.5B-Instruct")
-   LC_CHAT_MODEL_FALLBACK (default: "mistralai/Mistral-7B-Instruct")
-
- Summary behavior:
-   If no working inference/token -> summary fields display:
-   "🔌 Please connect to an inference point to generate summary."
-
- Optional MCP:
-   ENABLE_MCP=1
-   MCP_SANCTIONS_URL, MCP_HIGH_RISK_MCC_URL
-   MCP_AUTH_HEADER="Authorization: Bearer <token>"
-
- Run:
-   pip install -r requirements.txt
-   python app.py
- On Spaces:
-   Add secret HF_TOKEN (or HF_SPACES). Launch.
- """
-
- from __future__ import annotations
-
- import os, io, re, json, math, unicodedata, logging
- from typing import Optional, Tuple, List, Dict
-
- import numpy as np
- import pandas as pd
- import gradio as gr
- from dotenv import load_dotenv
-
- # LangChain
- from langchain.tools import tool
- from langchain_core.tools import Tool
- from langchain.agents import initialize_agent, AgentType
- from langchain.schema import HumanMessage, SystemMessage
-
- from pydantic import BaseModel, Field
- from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
-
- # Phone normalization
- try:
-     import phonenumbers
-     HAVE_PHONENUM = True
- except Exception:
-     HAVE_PHONENUM = False
-
- # ------------------------
- # Setup
- # ------------------------
- load_dotenv()
- logging.basicConfig(level=logging.INFO)
- log = logging.getLogger("fraud-analyst")
-
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HF_SPACES")
-
- # Chat models (chat-completions)
- DEFAULT_CHAT_MODEL = os.getenv("LC_CHAT_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
- FALLBACK_CHAT_MODEL = os.getenv("LC_CHAT_MODEL_FALLBACK", "mistralai/Mistral-7B-Instruct")
-
- SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
- CHAT_NOTICE = "🔌 Chat model not configured. Set HF_TOKEN and LC_CHAT_MODEL to enable chat."
-
- # ------------------------
- # LLM builders
- # ------------------------
- def _mk_chat_llm(model_id: str) -> ChatHuggingFace:
-     """
-     ChatHuggingFace uses HF Inference under the hood.
-     Although the backend task is 'text-generation', this wrapper handles chat-style messages.
-     """
-     base = HuggingFaceEndpoint(
-         repo_id=model_id,
-         task="text-generation",
-         huggingfacehub_api_token=HF_TOKEN,
-         max_new_tokens=256,
-         temperature=0.2,
-         repetition_penalty=1.05,
-         timeout=60,
-     )
-     return ChatHuggingFace(llm=base)
-
- def _heartbeat_chat(model_id: str) -> bool:
-     try:
-         chat = _mk_chat_llm(model_id)
-         _ = chat.invoke([HumanMessage(content="ok")])
-         return True
-     except Exception as e:
-         log.warning(f"Heartbeat failed for {model_id}: {str(e)[:160]}")
-         return False
-
- def build_chat_llm() -> Optional[ChatHuggingFace]:
-     """
-     Returns a working ChatHuggingFace or None (if token/permissions missing).
-     """
-     log.info(f"HF token present: {bool(HF_TOKEN)} len={len(HF_TOKEN) if HF_TOKEN else 0}")
-     if HF_TOKEN and _heartbeat_chat(DEFAULT_CHAT_MODEL):
-         log.info(f"Using chat model: {DEFAULT_CHAT_MODEL}")
-         return _mk_chat_llm(DEFAULT_CHAT_MODEL)
-     if HF_TOKEN and _heartbeat_chat(FALLBACK_CHAT_MODEL):
-         log.info(f"Using fallback chat model: {FALLBACK_CHAT_MODEL}")
-         return _mk_chat_llm(FALLBACK_CHAT_MODEL)
-     log.warning("No working chat model; chat will show a notice.")
-     return None
-
- CHAT_LLM = build_chat_llm()
-
- # ------------------------
- # Normalization helpers
- # ------------------------
- def _norm_colname(c: str) -> str:
-     c = c.strip().lower()
-     c = re.sub(r"\s+", "_", c)
-     c = re.sub(r"[^\w]+", "_", c)
-     return c.strip("_")
-
- def _nfkc(s: str) -> str:
-     return unicodedata.normalize("NFKC", s)
-
- def _collapse_ws(s: str) -> str:
-     return re.sub(r"\s+", " ", s).strip()
-
- def _clean_str(x):
-     if pd.isna(x): return x
-     return _collapse_ws(_nfkc(str(x)))
-
- def _is_email(s: str) -> bool:
-     return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
-
- def _clean_phone(s: str, default_region: str = "IN"):
-     if s is None or str(s).strip() == "":
-         return None, "missing_phone"
-     raw = re.sub(r"[^\d+]", "", str(s))
-     if HAVE_PHONENUM:
-         try:
-             pn = phonenumbers.parse(raw, default_region)
-             if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
-                 return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
-             return raw, "invalid_phone"
-         except Exception:
-             return raw, "invalid_phone"
-     digits = re.sub(r"\D", "", raw)
-     return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
-
- def _parse_datetime(s):
-     try:
-         return pd.to_datetime(s, errors="coerce", utc=True)
-     except Exception:
-         return pd.NaT
-
- def _to_numeric(series: pd.Series):
-     coerced = pd.to_numeric(series, errors="coerce")
-     return coerced, (coerced.isna() & series.notna())
-
- def _read_csv_any(file_obj) -> pd.DataFrame:
-     if file_obj is None:
-         raise ValueError("No file uploaded.")
-     if hasattr(file_obj, "name"):
-         p = file_obj.name
-         try: return pd.read_csv(p)
-         except Exception: return pd.read_csv(p, encoding="latin-1")
-     try: return pd.read_csv(file_obj)
-     except Exception:
-         file_obj.seek(0)
-         return pd.read_csv(file_obj, encoding="latin-1")
-
- def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
-     df = df.copy()
-     df.columns = [_norm_colname(c) for c in df.columns]
-     for c in df.select_dtypes(include=["object"]).columns:
-         df[c] = df[c].apply(_clean_str)
-     return df
-
- def _prepare_generic(df: pd.DataFrame, expected: Dict[str, List[str]]):
-     issues = []
-     df0 = _standardize_df(df)
-
-     # Synonym mapping
-     colmap = {}
-     cols = set(df0.columns)
-     for canon, syns in expected.items():
-         found = None
-         for s in [canon] + syns:
-             s = _norm_colname(s)
-             if s in cols:
-                 found = s; break
-         if found: colmap[canon] = found
-
-     # Email/phone quality
-     for c in list(df0.columns):
-         if "email" in c:
-             df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
-             for idx, v in df0[c].items():
-                 if pd.isna(v) or str(v).strip()=="":
-                     issues.append({"row": idx, "field": c, "issue":"missing_email","value":""})
-                 elif not _is_email(v):
-                     issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)})
-         if "phone" in c or "mobile" in c:
-             vals = []
-             for idx, v in df0[c].items():
-                 e164, prob = _clean_phone(v)
-                 vals.append(e164)
-                 if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)})
-             df0[c] = vals
-
-     # Datetime parsing
-     for c in df0.columns:
-         if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]):
-             parsed = _parse_datetime(df0[c])
-             bad = parsed.isna() & df0[c].notna()
-             for idx in df0.index[bad]:
-                 issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])})
-             df0[c] = parsed
-
-     # Numeric coercions for common fields
-     for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]:
-         for c in df0.columns:
-             if c == nc or c.endswith("_"+nc) or nc in c:
-                 coerced, badmask = _to_numeric(df0[c])
-                 for idx in df0.index[badmask]:
-                     issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])})
-                 df0[c] = coerced
-
-     issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"])
-     missing = [k for k in expected.keys() if k not in colmap]
-     quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
-     return df0, issues_df, quality_summary, colmap
-
- # ------------------------
- # Modules & Rules
- # ------------------------
- TX_EXPECTED = {
-     "transaction_id":["txn_id","transactionid","id","tx_id"],
-     "customer_id":["cust_id","user_id","client_id"],
-     "amount":["amt","amount_inr","value"],
-     "timestamp":["date","event_time","created_at","tx_time"],
-     "merchant_category":["mcc","merchant_cat","category"]
- }
- def prepare_transactions(df): return _prepare_generic(df, TX_EXPECTED)
-
- def detect_transactions(clean_df, colmap, high_risk_mcc: Optional[List[str]]=None):
-     high_risk = set(["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"])
-     if high_risk_mcc:
-         high_risk.update([_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc])
-     if not all(k in colmap for k in ["customer_id","amount"]):
-         return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
-     df = clean_df.copy()
-     reasons = []
-     amtcol = colmap.get("amount")
-     if amtcol is not None:
-         reasons.append(("large_amount>10k", df[amtcol] > 10000))
-         reasons.append(("negative_amount", df[amtcol] < 0))
-     if "merchant_category" in colmap:
-         mcc = colmap["merchant_category"]
-         high = df[mcc].astype(str).str.upper().str.replace(" ","_", regex=False).isin(high_risk)
-         reasons.append(("merchant_category_high_risk", high))
-     if all(k in colmap for k in ["customer_id","timestamp","amount"]):
-         cid, ts, amt = colmap["customer_id"], colmap["timestamp"], colmap["amount"]
-         daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
-         reasons.append(("daily_sum_per_customer>50k", daily > 50000))
-     mask = None
-     for _, m in reasons:
-         mask = m if mask is None else (mask | m)
-     flagged = df[mask] if mask is not None else pd.DataFrame()
-     if not flagged.empty:
-         rr=[]
-         for _, row in flagged.iterrows():
-             hits=[]
-             a = row[amtcol] if amtcol in flagged.columns else None
-             if pd.notna(a) and a>10000: hits.append("large_amount")
-             if pd.notna(a) and a<0: hits.append("negative_amount")
-             if "merchant_category" in colmap:
-                 val = str(row[colmap["merchant_category"]]).upper().replace(" ","_")
-                 if val in high_risk: hits.append("mcc_high_risk")
-             # daily sum check reconstructed
-             try:
-                 if all(k in colmap for k in ["customer_id","timestamp","amount"]):
-                     sub = df[(df[colmap["customer_id"]]==row[colmap["customer_id"]]) &
-                              (df[colmap["timestamp"]].dt.date==pd.to_datetime(row[colmap["timestamp"]], errors="coerce").date())]
-                     if sub[colmap["amount"]].sum() > 50000: hits.append("daily_sum>50k")
-             except Exception: pass
-             rr.append(", ".join(sorted(set(hits))) or "rule_hit")
-         flagged = flagged.assign(risk_reason=rr)
-     stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
-     return flagged, stats
-
- KYC_EXPECTED = {
-     "customer_id":["cust_id","user_id","client_id"],
-     "name":["full_name","customer_name"],
-     "email":["email_address","mail"],
-     "phone":["phone_number","mobile","contact"],
-     "dob":["date_of_birth","birthdate"]
- }
- def prepare_kyc(df): return _prepare_generic(df, KYC_EXPECTED)
-
- def _age_years(dob: pd.Series) -> pd.Series:
-     now = pd.Timestamp.utcnow()
-     return (now - dob).dt.days / 365.25
-
- def detect_kyc(clean_df, colmap):
-     if not all(k in colmap for k in ["customer_id","name"]):
-         return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
-     df = clean_df.copy()
-     reasons=[]
-     if "email" in colmap:
-         dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
-         reasons.append(("duplicate_email", dupe_email))
-     if "phone" in colmap:
-         dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
-         reasons.append(("duplicate_phone", dupe_phone))
-     if "dob" in colmap:
-         age = _age_years(df[colmap["dob"]])
-         invalid = (df[colmap["dob"]].isna()) | (df[colmap["dob"]] > pd.Timestamp.utcnow()) | (age > 120)
-         reasons.append(("invalid_dob", invalid))
-     if "name" in colmap:
-         name = df[colmap["name"]].astype(str)
-         susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
-         reasons.append(("suspicious_name", susp))
-     mask = None
-     for _, m in reasons:
-         mask = m if mask is None else (mask | m)
-     flagged = df[mask] if mask is not None else pd.DataFrame()
-     if not flagged.empty:
-         flagged = flagged.assign(risk_reason="kyc_rule_hit")
-     stats = f"KYC flagged: {len(flagged)} of {len(df)}."
-     return flagged, stats
-
- SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
- def prepare_sanctions(df): return _prepare_generic(df, SAN_EXPECTED)
-
- DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
-
- def token_overlap(a: str, b: str) -> int:
-     at = set(re.findall(r"[A-Za-z0-9]+", a.lower()))
-     bt = set(re.findall(r"[A-Za-z0-9]+", b.lower()))
-     return len(at & bt)
-
- def detect_sanctions(clean_df, colmap, sanctions_df: Optional[pd.DataFrame]=None):
-     if "name" not in colmap:
-         return pd.DataFrame(), "Required column missing for Sanctions (need name)."
-     df = clean_df.copy()
-     sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
-     sanc = _standardize_df(sanc)
-     if "name" not in sanc.columns:
-         for c in sanc.columns:
-             if "name" in c: sanc = sanc.rename(columns={c:"name"}); break
-     sanc_names = sanc["name"].dropna().astype(str).tolist()
-     matches=[]
-     for idx, row in df.iterrows():
-         nm = str(row[colmap["name"]] or "").strip()
-         if not nm: continue
-         if any(nm.lower()==s.lower() for s in sanc_names):
-             matches.append((idx,"exact")); continue
-         if any(token_overlap(nm, s) >= 2 for s in sanc_names):
-             matches.append((idx,"fuzzy"))
-     flagged = df.loc[[i for i,_ in matches]].copy() if matches else pd.DataFrame()
-     if not flagged.empty:
-         mt = {i:t for i,t in matches}
-         flagged = flagged.assign(match_type=[mt.get(i,"") for i in flagged.index])
-     stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
-     return flagged, stats
-
- CR_EXPECTED = {
-     "customer_id":["cust_id","user_id","client_id"],
-     "credit_score":["creditscore","score"],
-     "utilization":["util","credit_utilization","utilization_ratio"],
-     "dti":["debt_to_income","debt_to_income_ratio"],
-     "recent_defaults":["defaults","recentdefaults"],
-     "income":["annual_income","salary"]
- }
- def prepare_credit(df): return _prepare_generic(df, CR_EXPECTED)
-
- def detect_credit(clean_df, colmap):
-     needed = ["credit_score","utilization","dti","recent_defaults","income"]
-     if not any(k in colmap for k in needed):
-         return pd.DataFrame(), "Required columns missing for Credit Risk."
-     df = clean_df.copy()
-     cs = df[colmap.get("credit_score","credit_score")] if "credit_score" in colmap else pd.Series([np.nan]*len(df))
-     util= df[colmap.get("utilization","utilization")] if "utilization" in colmap else pd.Series([np.nan]*len(df))
-     dti = df[colmap.get("dti","dti")] if "dti" in colmap else pd.Series([np.nan]*len(df))
-     rde = df[colmap.get("recent_defaults","recent_defaults")] if "recent_defaults" in colmap else pd.Series([np.nan]*len(df))
-     inc = df[colmap.get("income","income")] if "income" in colmap else pd.Series([np.nan]*len(df))
-     out=[]
-     for i in range(len(df)):
-         hits=0; reasons=[]
-         if pd.notna(cs.iloc[i]) and cs.iloc[i] < 600: hits+=1; reasons.append("credit_score<600")
-         if pd.notna(util.iloc[i]) and util.iloc[i] > 0.8: hits+=1; reasons.append("utilization>0.8")
-         if pd.notna(dti.iloc[i]) and dti.iloc[i] > 0.4: hits+=1; reasons.append("DTI>0.4")
-         if pd.notna(rde.iloc[i]) and rde.iloc[i] > 0: hits+=1; reasons.append("recent_defaults>0")
-         if pd.notna(inc.iloc[i]) and inc.iloc[i] < 30000: hits+=1; reasons.append("income<30000")
-         level = "High" if hits>=3 else ("Medium" if hits==2 else ("Low" if hits==1 else "None"))
-         out.append((hits, level, ", ".join(reasons)))
-     risk_score=[x[0] for x in out]; risk_level=[x[1] for x in out]; reason=[x[2] for x in out]
-     res = df.assign(risk_score=risk_score, risk_level=risk_level, risk_reason=reason)
-     flagged = res[res["risk_level"].isin(["High","Medium","Low"]) & (res["risk_level"]!="None")]
-     stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
-     return flagged, stats
-
- # ------------------------
- # Summarizer (notice-first)
- # ------------------------
- SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
-
- def summarize_ai(context: str) -> str:
-     """
-     If chat LLM is available, use it to generate a short summary.
-     Otherwise return the prototype notice string.
-     """
-     if CHAT_LLM is None:
-         return SUMMARY_NOTICE
-     try:
-         out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
-         if hasattr(out, "content"): return out.content
-         return str(out)
-     except Exception as e:
-         msg = str(e)
-         if "401" in msg or "403" in msg:
-             return SUMMARY_NOTICE
-         return SUMMARY_NOTICE
-
- # ------------------------
- # Optional MCP
- # ------------------------
- from urllib.request import Request, urlopen
- def _mcp_get_json(url: str, auth_header: Optional[str]):
-     try:
-         req = Request(url)
-         if auth_header:
-             k, v = auth_header.split(":", 1)
-             req.add_header(k.strip(), v.strip())
-         with urlopen(req, timeout=10) as r:
-             return json.loads(r.read().decode("utf-8"))
-     except Exception as e:
-         log.warning(f"MCP fetch failed: {e}")
-         return None
-
- def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
-     if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
-     url = os.getenv("MCP_SANCTIONS_URL")
-     if not url: return None
-     data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
-     if not data: return None
-     if isinstance(data, list):
-         if all(isinstance(x, dict) for x in data):
-             rows = [{"name": x.get("name") or x.get("Name")} for x in data if x.get("name") or x.get("Name")]
-             return pd.DataFrame(rows) if rows else None
-         if all(isinstance(x, str) for x in data):
-             return pd.DataFrame({"name": data})
-     return None
-
- def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
-     if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
-     url = os.getenv("MCP_HIGH_RISK_MCC_URL")
-     if not url: return None
-     data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
-     return [str(x) for x in data] if isinstance(data, list) else None
-
- # ------------------------
- # Pipelines (per tab)
- # ------------------------
- def run_transactions(file):
-     try:
-         df = _read_csv_any(file)
-         clean, issues, quality, colmap = prepare_transactions(df)
-         mcc = mcp_fetch_high_risk_mcc()
-         flagged, stats = detect_transactions(clean, colmap, mcc)
-         ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
-         ai = summarize_ai(ctx)
-         return ai, stats, flagged, issues
-     except Exception as e:
-         return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
-
- def run_kyc(file):
-     try:
-         df = _read_csv_any(file)
-         clean, issues, quality, colmap = prepare_kyc(df)
-         flagged, stats = detect_kyc(clean, colmap)
-         ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
-         ai = summarize_ai(ctx)
-         return ai, stats, flagged, issues
-     except Exception as e:
-         return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
-
- def run_sanctions(customers_file, sanctions_file):
-     try:
-         df = _read_csv_any(customers_file)
-         clean, issues, quality, colmap = prepare_sanctions(df)
-         sanc_df = mcp_fetch_sanctions()
-         if sanc_df is None and sanctions_file is not None:
-             sanc_df = _read_csv_any(sanctions_file)
-         flagged, stats = detect_sanctions(clean, colmap, sanc_df)
-         ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
-         ai = summarize_ai(ctx)
-         return ai, stats, flagged, issues
-     except Exception as e:
-         return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
-
- def run_credit(file):
-     try:
-         df = _read_csv_any(file)
-         clean, issues, quality, colmap = prepare_credit(df)
-         flagged, stats = detect_credit(clean, colmap)
-         ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
-         ai = summarize_ai(ctx)
-         return ai, stats, flagged, issues
-     except Exception as e:
-         return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
-
- # ------------------------
- # Tools (CSV text in → concise text out)
- # ------------------------
- def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
-     return pd.read_csv(io.StringIO(csv_text))
-
- class TransactionCSVInput(BaseModel):
-     csv_text: str = Field(..., description="Transactions CSV text")
-
- @tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
- def transactions_fraud_tool(csv_text: str) -> str:
-     df = _csv_text_to_df(csv_text)
-     clean, issues, quality, colmap = prepare_transactions(df)
-     flagged, stats = detect_transactions(clean, colmap)
-     return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
-
- class KYCCSVInput(BaseModel):
-     csv_text: str = Field(..., description="KYC CSV text")
-
- @tool("kyc_fraud_tool", args_schema=KYCCSVInput)
- def kyc_fraud_tool(csv_text: str) -> str:
-     df = _csv_text_to_df(csv_text)
-     clean, issues, quality, colmap = prepare_kyc(df)
-     flagged, stats = detect_kyc(clean, colmap)
-     return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
-
- class SanctionsCSVInput(BaseModel):
-     csv_text: str = Field(..., description="Customers CSV text with a 'name' column")
-
- @tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
- def sanctions_pep_tool(csv_text: str) -> str:
-     df = _csv_text_to_df(csv_text)
-     clean, issues, quality, colmap = prepare_sanctions(df)
-     flagged, stats = detect_sanctions(clean, colmap)
-     return f"{stats}\nDQ issues: {len(issues)}\nFirst matches:\n{flagged.head(5).to_csv(index=False)}"[:2800]
-
- class CreditCSVInput(BaseModel):
-     csv_text: str = Field(..., description="Credit CSV text")
-
- @tool("credit_risk_tool", args_schema=CreditCSVInput)
- def credit_risk_tool(csv_text: str) -> str:
-     df = _csv_text_to_df(csv_text)
-     clean, issues, quality, colmap = prepare_credit(df)
-     flagged, stats = detect_credit(clean, colmap)
-     return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
-
- TOOLS: List[Tool] = [
-     transactions_fraud_tool,
-     kyc_fraud_tool,
-     sanctions_pep_tool,
-     credit_risk_tool,
- ]
-
- # ------------------------
- # Agent (chat-completions)
- # ------------------------
- AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
- You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
- If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
- Be concise and actionable."""
-
- def build_agent():
-     if CHAT_LLM is None:
-         class Stub:
-             def invoke(self, prompt): return CHAT_NOTICE
-         return Stub()
-     return initialize_agent(
-         TOOLS,
-         CHAT_LLM,
-         agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
-         verbose=False,
-         agent_kwargs={"system_message": AGENT_SYSTEM},
-         handle_parsing_errors=True,
-     )
-
- AGENT = build_agent()
-
- def agent_reply(history: List[Dict], user_msg: str):
-     try:
-         looks_like_csv = ("," in user_msg) and ("\n" in user_msg) and (user_msg.count(",") >= 2)
-         prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}" if looks_like_csv else user_msg
-         res = AGENT.invoke(prompt)
-         if isinstance(res, dict) and "output" in res: return res["output"]
-         return str(res)
-     except Exception as e:
-         return f"Agent error: {e}"
-
- # ------------------------
- # UI
- # ------------------------
- with gr.Blocks(title="Fraud Detector Analyst — LangChain + MCP", theme=gr.themes.Soft()) as demo:
-     gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + MCP")
-     gr.Markdown(
-         "This prototype runs **rules & data checks locally**. "
-         "Chat + AI summaries require a remote inference provider (HF Inference)."
-     )
-
-     with gr.Tabs():
-         with gr.Tab("Transactions"):
-             gr.Markdown("Upload a **transactions** CSV.")
-             tx_file = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
-             tx_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
-             tx_stats = gr.Textbox(label="Stats", lines=3)
-             tx_flagged = gr.Dataframe(label="Flagged Transactions")
-             tx_issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
-             tx_file.upload(run_transactions, inputs=[tx_file], outputs=[tx_ai, tx_stats, tx_flagged, tx_issues])
-
-         with gr.Tab("KYC"):
-             gr.Markdown("Upload a **KYC** CSV.")
-             kyc_file = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
-             kyc_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
-             kyc_stats = gr.Textbox(label="Stats", lines=3)
-             kyc_flagged = gr.Dataframe(label="Flagged KYC Rows")
-             kyc_issues = gr.Dataframe(label="Data Quality Issues")
-             kyc_file.upload(run_kyc, inputs=[kyc_file], outputs=[kyc_ai, kyc_stats, kyc_flagged, kyc_issues])
-
-         with gr.Tab("Sanctions/PEP"):
-             gr.Markdown("Upload **customers** CSV (+ optional sanctions CSV).")
-             san_customers = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
-             san_list = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
-             san_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
-             san_stats = gr.Textbox(label="Stats", lines=3)
-             san_flagged = gr.Dataframe(label="Matches")
-             san_issues = gr.Dataframe(label="Data Quality Issues")
-             san_customers.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
-             san_list.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
-
-         with gr.Tab("Credit Risk"):
-             gr.Markdown("Upload a **credit** CSV.")
-             cr_file = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
-             cr_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
-             cr_stats = gr.Textbox(label="Stats", lines=3)
-             cr_flagged = gr.Dataframe(label="Flagged Applicants")
-             cr_issues = gr.Dataframe(label="Data Quality Issues")
-             cr_file.upload(run_credit, inputs=[cr_file], outputs=[cr_ai, cr_stats, cr_flagged, cr_issues])
-
-         with gr.Tab("AI Consultant (Agent)"):
-             gr.Markdown("Paste a small CSV snippet or ask questions. Uses chat-completions when configured.")
-             chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
-             user_in = gr.Textbox(label="Message or CSV snippet")
-             send_btn = gr.Button("Send")
-             def _chat_fn(history, msg):
-                 reply = agent_reply(history, msg)
-                 history = (history or []) + [{"role":"user","content":msg}, {"role":"assistant","content":reply}]
-                 return history, ""
-             send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
-
-     gr.Markdown(
-         "### ⚙️ Enable inference\n"
-         "- Set **HF_TOKEN** (or HF_SPACES on Spaces)\n"
-         "- Optional: **LC_CHAT_MODEL** (default Qwen 0.5B Instruct), **LC_CHAT_MODEL_FALLBACK** (default Mistral 7B Instruct)\n"
-         "- Optional MCP: `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`"
-     )
-
- if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)
+ """TODO: implement app.py (split from app_monolith_backup.py)"""
app_monolith_backup.py ADDED
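The backup file re-adds the 670-line monolith from the app.py hunk above unchanged, so its pieces stay importable while the split is in progress. A hypothetical smoke test for one of its tools (CSV rows invented; assumes the module imports cleanly in a local checkout with the requirements installed):

from app_monolith_backup import transactions_fraud_tool

# Column names use the synonyms TX_EXPECTED maps (txn_id, cust_id, amt, date, mcc).
csv_text = (
    "txn_id,cust_id,amt,date,mcc\n"
    "T1,C1,15000,2024-01-01,GAMBLING\n"
    "T2,C1,200,2024-01-01,GROCERY\n"
)
# StructuredTool.invoke takes a dict matching the tool's args_schema.
print(transactions_fraud_tool.invoke({"csv_text": csv_text}))
# Expected: T1 flagged (amount > 10k and high-risk merchant category).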
@@ -0,0 +1,670 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fraud Detector Analyst — LangChain + (optional) MCP
3
+ Advanced “prototype-first” build:
4
+ - Chat uses chat-completion models (LangChain ChatHuggingFace).
5
+ - AI Summary shows a notice when no inference is connected.
6
+
7
+ LLM env (serverless friendly):
8
+ HF_TOKEN (or HF_SPACES)
9
+ LC_CHAT_MODEL (default: "Qwen/Qwen2.5-0.5B-Instruct")
10
+ LC_CHAT_MODEL_FALLBACK (default: "mistralai/Mistral-7B-Instruct")
11
+
12
+ Summary behavior:
13
+ If no working inference/token -> summary fields display:
14
+ "🔌 Please connect to an inference point to generate summary."
15
+
16
+ Optional MCP:
17
+ ENABLE_MCP=1
18
+ MCP_SANCTIONS_URL, MCP_HIGH_RISK_MCC_URL
19
+ MCP_AUTH_HEADER="Authorization: Bearer <token>"
20
+
21
+ Run:
22
+ pip install -r requirements.txt
23
+ python app.py
24
+ On Spaces:
25
+ Add secret HF_TOKEN (or HF_SPACES). Launch.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import os, io, re, json, math, unicodedata, logging
31
+ from typing import Optional, Tuple, List, Dict
32
+
33
+ import numpy as np
34
+ import pandas as pd
35
+ import gradio as gr
36
+ from dotenv import load_dotenv
37
+
38
+ # LangChain
39
+ from langchain.tools import tool
40
+ from langchain_core.tools import Tool
41
+ from langchain.agents import initialize_agent, AgentType
42
+ from langchain.schema import HumanMessage, SystemMessage
43
+
44
+ from pydantic import BaseModel, Field
45
+ from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
46
+
47
+ # Phone normalization
48
+ try:
49
+ import phonenumbers
50
+ HAVE_PHONENUM = True
51
+ except Exception:
52
+ HAVE_PHONENUM = False
53
+
54
+ # ------------------------
55
+ # Setup
56
+ # ------------------------
57
+ load_dotenv()
58
+ logging.basicConfig(level=logging.INFO)
59
+ log = logging.getLogger("fraud-analyst")
60
+
61
+ HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HF_SPACES")
62
+
63
+ # Chat models (chat-completions)
64
+ DEFAULT_CHAT_MODEL = os.getenv("LC_CHAT_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
65
+ FALLBACK_CHAT_MODEL = os.getenv("LC_CHAT_MODEL_FALLBACK", "mistralai/Mistral-7B-Instruct")
66
+
67
+ SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
68
+ CHAT_NOTICE = "🔌 Chat model not configured. Set HF_TOKEN and LC_CHAT_MODEL to enable chat."
69
+
70
+ # ------------------------
71
+ # LLM builders
72
+ # ------------------------
73
+ def _mk_chat_llm(model_id: str) -> ChatHuggingFace:
74
+ """
75
+ ChatHuggingFace uses HF Inference under the hood.
76
+ Although the backend task is 'text-generation', this wrapper handles chat-style messages.
77
+ """
78
+ base = HuggingFaceEndpoint(
79
+ repo_id=model_id,
80
+ task="text-generation",
81
+ huggingfacehub_api_token=HF_TOKEN,
82
+ max_new_tokens=256,
83
+ temperature=0.2,
84
+ repetition_penalty=1.05,
85
+ timeout=60,
86
+ )
87
+ return ChatHuggingFace(llm=base)
88
+
89
+ def _heartbeat_chat(model_id: str) -> bool:
90
+ try:
91
+ chat = _mk_chat_llm(model_id)
92
+ _ = chat.invoke([HumanMessage(content="ok")])
93
+ return True
94
+ except Exception as e:
95
+ log.warning(f"Heartbeat failed for {model_id}: {str(e)[:160]}")
96
+ return False
97
+
98
+ def build_chat_llm() -> Optional[ChatHuggingFace]:
99
+ """
100
+ Returns a working ChatHuggingFace or None (if token/permissions missing).
101
+ """
102
+ log.info(f"HF token present: {bool(HF_TOKEN)} len={len(HF_TOKEN) if HF_TOKEN else 0}")
103
+ if HF_TOKEN and _heartbeat_chat(DEFAULT_CHAT_MODEL):
104
+ log.info(f"Using chat model: {DEFAULT_CHAT_MODEL}")
105
+ return _mk_chat_llm(DEFAULT_CHAT_MODEL)
106
+ if HF_TOKEN and _heartbeat_chat(FALLBACK_CHAT_MODEL):
107
+ log.info(f"Using fallback chat model: {FALLBACK_CHAT_MODEL}")
108
+ return _mk_chat_llm(FALLBACK_CHAT_MODEL)
109
+ log.warning("No working chat model; chat will show a notice.")
110
+ return None
111
+
112
+ CHAT_LLM = build_chat_llm()
113
+
114
+ # ------------------------
115
+ # Normalization helpers
116
+ # ------------------------
117
+ def _norm_colname(c: str) -> str:
118
+ c = c.strip().lower()
119
+ c = re.sub(r"\s+", "_", c)
120
+ c = re.sub(r"[^\w]+", "_", c)
121
+ return c.strip("_")
122
+
123
+ def _nfkc(s: str) -> str:
124
+ return unicodedata.normalize("NFKC", s)
125
+
126
+ def _collapse_ws(s: str) -> str:
127
+ return re.sub(r"\s+", " ", s).strip()
128
+
129
+ def _clean_str(x):
130
+ if pd.isna(x): return x
131
+ return _collapse_ws(_nfkc(str(x)))
132
+
133
+ def _is_email(s: str) -> bool:
134
+ return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
135
+
136
+ def _clean_phone(s: str, default_region: str = "IN"):
137
+ if s is None or str(s).strip() == "":
138
+ return None, "missing_phone"
139
+ raw = re.sub(r"[^\d+]", "", str(s))
140
+ if HAVE_PHONENUM:
141
+ try:
142
+ pn = phonenumbers.parse(raw, default_region)
143
+ if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
144
+ return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
145
+ return raw, "invalid_phone"
146
+ except Exception:
147
+ return raw, "invalid_phone"
148
+ digits = re.sub(r"\D", "", raw)
149
+ return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
150
+
151
+ def _parse_datetime(s):
152
+ try:
153
+ return pd.to_datetime(s, errors="coerce", utc=True)
154
+ except Exception:
155
+ return pd.NaT
156
+
157
+ def _to_numeric(series: pd.Series):
158
+ coerced = pd.to_numeric(series, errors="coerce")
159
+ return coerced, (coerced.isna() & series.notna())
160
+
161
+ def _read_csv_any(file_obj) -> pd.DataFrame:
162
+ if file_obj is None:
163
+ raise ValueError("No file uploaded.")
164
+ if hasattr(file_obj, "name"):
165
+ p = file_obj.name
166
+ try: return pd.read_csv(p)
167
+ except Exception: return pd.read_csv(p, encoding="latin-1")
168
+ try: return pd.read_csv(file_obj)
169
+ except Exception:
170
+ file_obj.seek(0)
171
+ return pd.read_csv(file_obj, encoding="latin-1")
172
+
173
+ def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
174
+ df = df.copy()
175
+ df.columns = [_norm_colname(c) for c in df.columns]
176
+ for c in df.select_dtypes(include=["object"]).columns:
177
+ df[c] = df[c].apply(_clean_str)
178
+ return df
179
+
180
+ def _prepare_generic(df: pd.DataFrame, expected: Dict[str, List[str]]):
181
+ issues = []
182
+ df0 = _standardize_df(df)
183
+
184
+ # Synonym mapping
185
+ colmap = {}
186
+ cols = set(df0.columns)
187
+ for canon, syns in expected.items():
188
+ found = None
189
+ for s in [canon] + syns:
190
+ s = _norm_colname(s)
191
+ if s in cols:
192
+ found = s; break
193
+ if found: colmap[canon] = found
194
+
195
+ # Email/phone quality
196
+ for c in list(df0.columns):
197
+ if "email" in c:
198
+ df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
199
+ for idx, v in df0[c].items():
200
+ if pd.isna(v) or str(v).strip()=="":
201
+ issues.append({"row": idx, "field": c, "issue":"missing_email","value":""})
202
+ elif not _is_email(v):
203
+ issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)})
204
+ if "phone" in c or "mobile" in c:
205
+ vals = []
206
+ for idx, v in df0[c].items():
207
+ e164, prob = _clean_phone(v)
208
+ vals.append(e164)
209
+ if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)})
210
+ df0[c] = vals
211
+
212
+ # Datetime parsing
213
+ for c in df0.columns:
214
+ if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]):
215
+ parsed = _parse_datetime(df0[c])
216
+ bad = parsed.isna() & df0[c].notna()
217
+ for idx in df0.index[bad]:
218
+ issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])})
219
+ df0[c] = parsed
220
+
221
+ # Numeric coercions for common fields
222
+ for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]:
223
+ for c in df0.columns:
224
+ if c == nc or c.endswith("_"+nc) or nc in c:
225
+ coerced, badmask = _to_numeric(df0[c])
226
+ for idx in df0.index[badmask]:
227
+ issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])})
228
+ df0[c] = coerced
229
+
230
+ issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"])
231
+ missing = [k for k in expected.keys() if k not in colmap]
232
+ quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
233
+ return df0, issues_df, quality_summary, colmap
234
+
235
+ # ------------------------
236
+ # Modules & Rules
237
+ # ------------------------
238
+ TX_EXPECTED = {
239
+ "transaction_id":["txn_id","transactionid","id","tx_id"],
240
+ "customer_id":["cust_id","user_id","client_id"],
241
+ "amount":["amt","amount_inr","value"],
242
+ "timestamp":["date","event_time","created_at","tx_time"],
243
+ "merchant_category":["mcc","merchant_cat","category"]
244
+ }
245
+ def prepare_transactions(df): return _prepare_generic(df, TX_EXPECTED)
246
+
247
+ def detect_transactions(clean_df, colmap, high_risk_mcc: Optional[List[str]]=None):
248
+ high_risk = set(["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"])
249
+ if high_risk_mcc:
250
+ high_risk.update([_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc])
251
+ if not all(k in colmap for k in ["customer_id","amount"]):
252
+ return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
253
+ df = clean_df.copy()
254
+ reasons = []
255
+ amtcol = colmap.get("amount")
256
+ if amtcol is not None:
257
+ reasons.append(("large_amount>10k", df[amtcol] > 10000))
258
+ reasons.append(("negative_amount", df[amtcol] < 0))
259
+ if "merchant_category" in colmap:
260
+ mcc = colmap["merchant_category"]
261
+ high = df[mcc].astype(str).str.upper().str.replace(" ","_", regex=False).isin(high_risk)
262
+ reasons.append(("merchant_category_high_risk", high))
263
+ if all(k in colmap for k in ["customer_id","timestamp","amount"]):
264
+ cid, ts, amt = colmap["customer_id"], colmap["timestamp"], colmap["amount"]
265
+ daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
266
+ reasons.append(("daily_sum_per_customer>50k", daily > 50000))
267
+ mask = None
268
+ for _, m in reasons:
269
+ mask = m if mask is None else (mask | m)
270
+ flagged = df[mask] if mask is not None else pd.DataFrame()
271
+ if not flagged.empty:
272
+ rr=[]
273
+ for _, row in flagged.iterrows():
274
+ hits=[]
275
+ a = row[amtcol] if amtcol in flagged.columns else None
276
+ if pd.notna(a) and a>10000: hits.append("large_amount")
277
+ if pd.notna(a) and a<0: hits.append("negative_amount")
278
+ if "merchant_category" in colmap:
279
+ val = str(row[colmap["merchant_category"]]).upper().replace(" ","_")
280
+ if val in high_risk: hits.append("mcc_high_risk")
281
+ # daily sum check reconstructed
282
+ try:
283
+ if all(k in colmap for k in ["customer_id","timestamp","amount"]):
284
+ sub = df[(df[colmap["customer_id"]]==row[colmap["customer_id"]]) &
285
+ (df[colmap["timestamp"]].dt.date==pd.to_datetime(row[colmap["timestamp"]], errors="coerce").date())]
286
+ if sub[colmap["amount"]].sum() > 50000: hits.append("daily_sum>50k")
287
+ except Exception: pass
288
+ rr.append(", ".join(sorted(set(hits))) or "rule_hit")
289
+ flagged = flagged.assign(risk_reason=rr)
290
+ stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
291
+ return flagged, stats
292
+
293
+ KYC_EXPECTED = {
294
+ "customer_id":["cust_id","user_id","client_id"],
295
+ "name":["full_name","customer_name"],
296
+ "email":["email_address","mail"],
297
+ "phone":["phone_number","mobile","contact"],
298
+ "dob":["date_of_birth","birthdate"]
299
+ }
300
+ def prepare_kyc(df): return _prepare_generic(df, KYC_EXPECTED)
301
+
302
+ def _age_years(dob: pd.Series) -> pd.Series:
303
+ now = pd.Timestamp.utcnow()
304
+ return (now - dob).dt.days / 365.25
305
+
306
+ def detect_kyc(clean_df, colmap):
307
+ if not all(k in colmap for k in ["customer_id","name"]):
308
+ return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
309
+ df = clean_df.copy()
310
+ reasons=[]
311
+ if "email" in colmap:
312
+ dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
313
+ reasons.append(("duplicate_email", dupe_email))
314
+ if "phone" in colmap:
315
+ dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
316
+ reasons.append(("duplicate_phone", dupe_phone))
317
+ if "dob" in colmap:
318
+ age = _age_years(df[colmap["dob"]])
319
+ invalid = (df[colmap["dob"]].isna()) | (df[colmap["dob"]] > pd.Timestamp.utcnow()) | (age > 120)
320
+ reasons.append(("invalid_dob", invalid))
321
+ if "name" in colmap:
322
+ name = df[colmap["name"]].astype(str)
323
+ susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
324
+ reasons.append(("suspicious_name", susp))
325
+ mask = None
326
+ for _, m in reasons:
327
+ mask = m if mask is None else (mask | m)
328
+ flagged = df[mask] if mask is not None else pd.DataFrame()
329
+ if not flagged.empty:
330
+ flagged = flagged.assign(risk_reason="kyc_rule_hit")
331
+ stats = f"KYC flagged: {len(flagged)} of {len(df)}."
332
+ return flagged, stats
333
+
334
+ SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
335
+ def prepare_sanctions(df): return _prepare_generic(df, SAN_EXPECTED)
336
+
337
+ DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
338
+
339
+ def token_overlap(a: str, b: str) -> int:
340
+ at = set(re.findall(r"[A-Za-z0-9]+", a.lower()))
341
+ bt = set(re.findall(r"[A-Za-z0-9]+", b.lower()))
342
+ return len(at & bt)
343
+
344
+ def detect_sanctions(clean_df, colmap, sanctions_df: Optional[pd.DataFrame]=None):
345
+ if "name" not in colmap:
346
+ return pd.DataFrame(), "Required column missing for Sanctions (need name)."
347
+ df = clean_df.copy()
348
+ sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
349
+ sanc = _standardize_df(sanc)
350
+ if "name" not in sanc.columns:
351
+ for c in sanc.columns:
352
+ if "name" in c: sanc = sanc.rename(columns={c:"name"}); break
353
+ sanc_names = sanc["name"].dropna().astype(str).tolist()
354
+ matches=[]
355
+ for idx, row in df.iterrows():
356
+ nm = str(row[colmap["name"]] or "").strip()
357
+ if not nm: continue
358
+ if any(nm.lower()==s.lower() for s in sanc_names):
359
+ matches.append((idx,"exact")); continue
360
+ if any(token_overlap(nm, s) >= 2 for s in sanc_names):
361
+ matches.append((idx,"fuzzy"))
362
+ flagged = df.loc[[i for i,_ in matches]].copy() if matches else pd.DataFrame()
363
+ if not flagged.empty:
364
+ mt = {i:t for i,t in matches}
365
+ flagged = flagged.assign(match_type=[mt.get(i,"") for i in flagged.index])
366
+ stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
367
+ return flagged, stats
368
+
369
+ CR_EXPECTED = {
370
+ "customer_id":["cust_id","user_id","client_id"],
371
+ "credit_score":["creditscore","score"],
372
+ "utilization":["util","credit_utilization","utilization_ratio"],
373
+ "dti":["debt_to_income","debt_to_income_ratio"],
374
+ "recent_defaults":["defaults","recentdefaults"],
375
+ "income":["annual_income","salary"]
376
+ }
377
+ def prepare_credit(df): return _prepare_generic(df, CR_EXPECTED)
378
+
379
+ def detect_credit(clean_df, colmap):
380
+ needed = ["credit_score","utilization","dti","recent_defaults","income"]
381
+ if not any(k in colmap for k in needed):
382
+ return pd.DataFrame(), "Required columns missing for Credit Risk."
383
+ df = clean_df.copy()
384
+ cs = df[colmap.get("credit_score","credit_score")] if "credit_score" in colmap else pd.Series([np.nan]*len(df))
385
+ util= df[colmap.get("utilization","utilization")] if "utilization" in colmap else pd.Series([np.nan]*len(df))
386
+ dti = df[colmap.get("dti","dti")] if "dti" in colmap else pd.Series([np.nan]*len(df))
387
+ rde = df[colmap.get("recent_defaults","recent_defaults")] if "recent_defaults" in colmap else pd.Series([np.nan]*len(df))
388
+ inc = df[colmap.get("income","income")] if "income" in colmap else pd.Series([np.nan]*len(df))
389
+ out=[]
390
+ for i in range(len(df)):
391
+ hits=0; reasons=[]
392
+ if pd.notna(cs.iloc[i]) and cs.iloc[i] < 600: hits+=1; reasons.append("credit_score<600")
393
+ if pd.notna(util.iloc[i]) and util.iloc[i] > 0.8: hits+=1; reasons.append("utilization>0.8")
394
+ if pd.notna(dti.iloc[i]) and dti.iloc[i] > 0.4: hits+=1; reasons.append("DTI>0.4")
395
+ if pd.notna(rde.iloc[i]) and rde.iloc[i] > 0: hits+=1; reasons.append("recent_defaults>0")
396
+ if pd.notna(inc.iloc[i]) and inc.iloc[i] < 30000: hits+=1; reasons.append("income<30000")
397
+ level = "High" if hits>=3 else ("Medium" if hits==2 else ("Low" if hits==1 else "None"))
398
+ out.append((hits, level, ", ".join(reasons)))
399
+ risk_score=[x[0] for x in out]; risk_level=[x[1] for x in out]; reason=[x[2] for x in out]
400
+ res = df.assign(risk_score=risk_score, risk_level=risk_level, risk_reason=reason)
401
+ flagged = res[res["risk_level"].isin(["High","Medium","Low"]) & (res["risk_level"]!="None")]
402
+ stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
403
+ return flagged, stats
404
+
405
+ # ------------------------
406
+ # Summarizer (notice-first)
407
+ # ------------------------
408
+ SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
409
+
410
+ def summarize_ai(context: str) -> str:
411
+ """
412
+ If chat LLM is available, use it to generate a short summary.
413
+ Otherwise return the prototype notice string.
414
+ """
415
+ if CHAT_LLM is None:
416
+ return SUMMARY_NOTICE
417
+ try:
418
+ out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
419
+ if hasattr(out, "content"): return out.content
420
+ return str(out)
421
+ except Exception as e:
422
+ msg = str(e)
423
+ if "401" in msg or "403" in msg:
424
+ return SUMMARY_NOTICE
425
+ return SUMMARY_NOTICE
426
+
427
+ # ------------------------
428
+ # Optional MCP
429
+ # ------------------------
430
+ from urllib.request import Request, urlopen
431
+ def _mcp_get_json(url: str, auth_header: Optional[str]):
432
+ try:
433
+ req = Request(url)
434
+ if auth_header:
435
+ k, v = auth_header.split(":", 1)
436
+ req.add_header(k.strip(), v.strip())
437
+ with urlopen(req, timeout=10) as r:
438
+ return json.loads(r.read().decode("utf-8"))
439
+ except Exception as e:
440
+ log.warning(f"MCP fetch failed: {e}")
441
+ return None
442
+
443
+ def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
444
+ if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
445
+ url = os.getenv("MCP_SANCTIONS_URL")
446
+ if not url: return None
447
+ data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
448
+ if not data: return None
449
+ if isinstance(data, list):
450
+ if all(isinstance(x, dict) for x in data):
451
+ rows = [{"name": x.get("name") or x.get("Name")} for x in data if x.get("name") or x.get("Name")]
452
+ return pd.DataFrame(rows) if rows else None
453
+ if all(isinstance(x, str) for x in data):
454
+ return pd.DataFrame({"name": data})
455
+ return None
456
+
457
+ def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
458
+ if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
459
+ url = os.getenv("MCP_HIGH_RISK_MCC_URL")
460
+ if not url: return None
461
+ data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
462
+ return [str(x) for x in data] if isinstance(data, list) else None
463
+
464
+ # ------------------------
465
+ # Pipelines (per tab)
466
+ # ------------------------
467
+ def run_transactions(file):
468
+ try:
469
+ df = _read_csv_any(file)
470
+ clean, issues, quality, colmap = prepare_transactions(df)
471
+ mcc = mcp_fetch_high_risk_mcc()
472
+ flagged, stats = detect_transactions(clean, colmap, mcc)
473
+ ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
474
+ ai = summarize_ai(ctx)
475
+ return ai, stats, flagged, issues
476
+ except Exception as e:
477
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
478
+
479
+ def run_kyc(file):
480
+ try:
481
+ df = _read_csv_any(file)
482
+ clean, issues, quality, colmap = prepare_kyc(df)
483
+ flagged, stats = detect_kyc(clean, colmap)
484
+ ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
485
+ ai = summarize_ai(ctx)
486
+ return ai, stats, flagged, issues
487
+ except Exception as e:
488
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
489
+
490
+ def run_sanctions(customers_file, sanctions_file):
491
+ try:
492
+ df = _read_csv_any(customers_file)
493
+ clean, issues, quality, colmap = prepare_sanctions(df)
494
+ sanc_df = mcp_fetch_sanctions()
495
+ if sanc_df is None and sanctions_file is not None:
496
+ sanc_df = _read_csv_any(sanctions_file)
497
+ flagged, stats = detect_sanctions(clean, colmap, sanc_df)
498
+ ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
499
+ ai = summarize_ai(ctx)
500
+ return ai, stats, flagged, issues
501
+ except Exception as e:
502
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
503
+
504
+ def run_credit(file):
505
+ try:
506
+ df = _read_csv_any(file)
507
+ clean, issues, quality, colmap = prepare_credit(df)
508
+ flagged, stats = detect_credit(clean, colmap)
509
+ ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
510
+ ai = summarize_ai(ctx)
511
+ return ai, stats, flagged, issues
512
+ except Exception as e:
513
+ return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
514
+
515
+ # ------------------------
516
+ # Tools (CSV text in → concise text out)
517
+ # ------------------------
518
+ def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
519
+ return pd.read_csv(io.StringIO(csv_text))
520
+
521
+ class TransactionCSVInput(BaseModel):
522
+ csv_text: str = Field(..., description="Transactions CSV text")
523
+
524
+ @tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
525
+ def transactions_fraud_tool(csv_text: str) -> str:
526
+ df = _csv_text_to_df(csv_text)
527
+ clean, issues, quality, colmap = prepare_transactions(df)
528
+ flagged, stats = detect_transactions(clean, colmap)
529
+ return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
530
+
531
+ class KYCCSVInput(BaseModel):
532
+ csv_text: str = Field(..., description="KYC CSV text")
533
+
534
+ @tool("kyc_fraud_tool", args_schema=KYCCSVInput)
535
+ def kyc_fraud_tool(csv_text: str) -> str:
536
+ df = _csv_text_to_df(csv_text)
537
+ clean, issues, quality, colmap = prepare_kyc(df)
538
+ flagged, stats = detect_kyc(clean, colmap)
539
+ return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
540
+
541
+ class SanctionsCSVInput(BaseModel):
542
+ csv_text: str = Field(..., description="Customers CSV text with a 'name' column")
543
+
544
+ @tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
545
+ def sanctions_pep_tool(csv_text: str) -> str:
546
+ df = _csv_text_to_df(csv_text)
547
+ clean, issues, quality, colmap = prepare_sanctions(df)
548
+ flagged, stats = detect_sanctions(clean, colmap)
549
+ return f"{stats}\nDQ issues: {len(issues)}\nFirst matches:\n{flagged.head(5).to_csv(index=False)}"[:2800]
550
+
551
+ class CreditCSVInput(BaseModel):
+     csv_text: str = Field(..., description="Credit CSV text")
+ 
+ @tool("credit_risk_tool", args_schema=CreditCSVInput)
+ def credit_risk_tool(csv_text: str) -> str:
+     df = _csv_text_to_df(csv_text)
+     clean, issues, quality, colmap = prepare_credit(df)
+     flagged, stats = detect_credit(clean, colmap)
+     return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
+ 
+ TOOLS: List[Tool] = [
+     transactions_fraud_tool,
+     kyc_fraud_tool,
+     sanctions_pep_tool,
+     credit_risk_tool,
+ ]
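
As a quick way to exercise one of these structured tools outside the UI, something like the following should work (a sketch: the import path and CSV columns are illustrative, not from the repo):

# Minimal smoke test (hypothetical: assumes app_monolith_backup.py is importable
# and that prepare_transactions/detect_transactions accept these example columns).
from app_monolith_backup import transactions_fraud_tool

sample_csv = "amount,merchant,mcc\n12.50,ACME GROCERY,5411\n9900.00,SHELLCO LTD,7995\n"
print(transactions_fraud_tool.invoke({"csv_text": sample_csv}))
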
+ 
+ # ------------------------
+ # Agent (chat-completions)
+ # ------------------------
+ AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
+ You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
+ If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
+ Be concise and actionable."""
+ 
+ def build_agent():
+     if CHAT_LLM is None:
+         class Stub:
+             def invoke(self, prompt):
+                 return CHAT_NOTICE
+         return Stub()
+     return initialize_agent(
+         TOOLS,
+         CHAT_LLM,
+         agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+         verbose=False,
+         # The zero-shot ReAct agent's prompt is customized via "prefix";
+         # "system_message" is not a recognized kwarg for this agent type.
+         agent_kwargs={"prefix": AGENT_SYSTEM},
+         handle_parsing_errors=True,
+     )
+ 
+ AGENT = build_agent()
+ 
+ def agent_reply(history: List[Dict], user_msg: str):
+     try:
+         looks_like_csv = ("," in user_msg) and ("\n" in user_msg) and (user_msg.count(",") >= 2)
+         prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}" if looks_like_csv else user_msg
+         res = AGENT.invoke(prompt)
+         if isinstance(res, dict) and "output" in res:
+             return res["output"]
+         return str(res)
+     except Exception as e:
+         return f"Agent error: {e}"
+ 
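
A similar spot check for the agent path (hypothetical REPL usage; with no token configured, the Stub simply returns the chat notice, while a configured model should route the snippet to a tool):

# Hypothetical REPL check; the sample row is fabricated for illustration.
from app_monolith_backup import agent_reply

print(agent_reply([], "amount,merchant,mcc\n9900.00,SHELLCO LTD,7995"))
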
+ # ------------------------
+ # UI
+ # ------------------------
+ with gr.Blocks(title="Fraud Detector Analyst — LangChain + MCP", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + MCP")
+     gr.Markdown(
+         "This prototype runs **rules & data checks locally**. "
+         "Chat + AI summaries require a remote inference provider (HF Inference)."
+     )
+ 
+     with gr.Tabs():
+         with gr.Tab("Transactions"):
+             gr.Markdown("Upload a **transactions** CSV.")
+             tx_file = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
+             tx_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+             tx_stats = gr.Textbox(label="Stats", lines=3)
+             tx_flagged = gr.Dataframe(label="Flagged Transactions")
+             tx_issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
+             tx_file.upload(run_transactions, inputs=[tx_file], outputs=[tx_ai, tx_stats, tx_flagged, tx_issues])
+ 
+         with gr.Tab("KYC"):
+             gr.Markdown("Upload a **KYC** CSV.")
+             kyc_file = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
+             kyc_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+             kyc_stats = gr.Textbox(label="Stats", lines=3)
+             kyc_flagged = gr.Dataframe(label="Flagged KYC Rows")
+             kyc_issues = gr.Dataframe(label="Data Quality Issues")
+             kyc_file.upload(run_kyc, inputs=[kyc_file], outputs=[kyc_ai, kyc_stats, kyc_flagged, kyc_issues])
+ 
+         with gr.Tab("Sanctions/PEP"):
+             gr.Markdown("Upload **customers** CSV (+ optional sanctions CSV).")
+             san_customers = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
+             san_list = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
+             san_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+             san_stats = gr.Textbox(label="Stats", lines=3)
+             san_flagged = gr.Dataframe(label="Matches")
+             san_issues = gr.Dataframe(label="Data Quality Issues")
+             san_customers.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
+             san_list.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
+ 
+         with gr.Tab("Credit Risk"):
+             gr.Markdown("Upload a **credit** CSV.")
+             cr_file = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
+             cr_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
+             cr_stats = gr.Textbox(label="Stats", lines=3)
+             cr_flagged = gr.Dataframe(label="Flagged Applicants")
+             cr_issues = gr.Dataframe(label="Data Quality Issues")
+             cr_file.upload(run_credit, inputs=[cr_file], outputs=[cr_ai, cr_stats, cr_flagged, cr_issues])
+ 
+         with gr.Tab("AI Consultant (Agent)"):
+             gr.Markdown("Paste a small CSV snippet or ask questions. Uses chat-completions when configured.")
+             chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
+             user_in = gr.Textbox(label="Message or CSV snippet")
+             send_btn = gr.Button("Send")
+ 
+             def _chat_fn(history, msg):
+                 reply = agent_reply(history, msg)
+                 history = (history or []) + [{"role": "user", "content": msg}, {"role": "assistant", "content": reply}]
+                 return history, ""
+ 
+             send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
+ 
+     gr.Markdown(
+         "### ⚙️ Enable inference\n"
+         "- Set **HF_TOKEN** (or HF_SPACES on Spaces)\n"
+         "- Optional: **LC_CHAT_MODEL** (default Qwen 0.5B Instruct), **LC_CHAT_MODEL_FALLBACK** (default Mistral 7B Instruct)\n"
+         "- Optional MCP: `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`"
+     )
+ 
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
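
For local testing, the settings listed in that final Markdown block could live in a .env file next to app.py; a sketch with placeholder values (the endpoints and token below are illustrative, not real):

# .env (placeholders only; never commit real tokens)
HF_TOKEN=hf_xxxxxxxxxxxxxxxx
LC_CHAT_MODEL=Qwen/Qwen2.5-0.5B-Instruct
LC_CHAT_MODEL_FALLBACK=<fallback chat model id>
ENABLE_MCP=1
MCP_SANCTIONS_URL=https://mcp.example.internal/sanctions
MCP_HIGH_RISK_MCC_URL=https://mcp.example.internal/high-risk-mcc
MCP_AUTH_HEADER=Authorization: Bearer <token>
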
llm_provider.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement llm_provider.py (split from app_monolith_backup.py)"""
mcp.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement mcp.py (split from app_monolith_backup.py)"""
modules/__init__.py ADDED
File without changes
modules/credit.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement modules/credit.py (split from app_monolith_backup.py)"""
modules/kyc.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement modules/kyc.py (split from app_monolith_backup.py)"""
modules/sanctions.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement modules/sanctions.py (split from app_monolith_backup.py)"""
modules/transactions.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement modules/transactions.py (split from app_monolith_backup.py)"""
threat_intel.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement threat_intel.py (split from app_monolith_backup.py)"""
tools.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement tools.py (split from app_monolith_backup.py)"""
ttp_guard.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement ttp_guard.py (split from app_monolith_backup.py)"""
validation.py ADDED
@@ -0,0 +1 @@
+ """TODO: implement validation.py (split from app_monolith_backup.py)"""