fizzarif7 commited on
Commit
0056d37
Β·
verified Β·
1 Parent(s): b893d42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -163
app.py CHANGED
@@ -1,163 +1,177 @@
1
- from transformers import pipeline
2
- import pdfplumber
3
- import docx
4
- from PIL import Image
5
- import pytesseract
6
- from pdf2image import convert_from_path
7
- from textblob import TextBlob
8
- import re
9
- import streamlit as st
10
- # ------------------------------
11
- # Initialize Zero-Shot Classifier
12
- # ------------------------------
13
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
14
-
15
- # ------------------------------
16
- # Text Extraction
17
- # ------------------------------
18
- def extract_text_from_pdf(file_path):
19
- text = ""
20
- with pdfplumber.open(file_path) as pdf:
21
- for page in pdf.pages:
22
- page_text = page.extract_text()
23
- if page_text:
24
- text += page_text + "\n"
25
-
26
- # OCR fallback
27
- if not text.strip():
28
- ocr_text = ""
29
- images = convert_from_path(file_path)
30
- for img in images:
31
- ocr_text += pytesseract.image_to_string(img) + "\n"
32
- text = ocr_text
33
- return text.strip()
34
-
35
- def extract_text_from_docx(file_path):
36
- doc = docx.Document(file_path)
37
- return "\n".join([p.text for p in doc.paragraphs]).strip()
38
-
39
- def extract_text_from_image(file_path):
40
- return pytesseract.image_to_string(Image.open(file_path)).strip()
41
-
42
- # ------------------------------
43
- # Grammar & Spelling (TextBlob)
44
- # ------------------------------
45
- def check_grammar(text):
46
- blob = TextBlob(text)
47
- corrected_text = str(blob.correct())
48
- return corrected_text != text
49
-
50
- # ------------------------------
51
- # Date Extraction (Improved)
52
- # ------------------------------
53
- def extract_dates(text):
54
- date_patterns = [
55
- r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', # 28-05-2025 / 28/05/2025
56
- r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b', # 28.05.2025
57
- r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b', # 28th May 2025
58
- r'\b\w+\s+\d{1,2},\s*\d{4}\b', # May 28, 2025
59
- ]
60
-
61
- dates_found = []
62
- for pattern in date_patterns:
63
- matches = re.findall(pattern, text, flags=re.IGNORECASE)
64
- dates_found.extend(matches)
65
-
66
- return list(set(dates_found))
67
-
68
- def classify_dates(text, dates):
69
- issue_keywords = ["issued on", "dated", "notified on", "circular no"]
70
- event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
71
-
72
- issue_dates = []
73
- event_dates = []
74
-
75
- for d in dates:
76
- idx = text.lower().find(d.lower())
77
- if idx != -1:
78
- context = text[max(0, idx-60): idx+60].lower()
79
-
80
- if any(k in context for k in issue_keywords):
81
- issue_dates.append(d)
82
- elif any(k in context for k in event_keywords):
83
- # Try to capture event/holiday name next to date
84
- after_text = text[idx: idx+80]
85
- match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
86
- if match:
87
- event_dates.append(match.group().strip())
88
- else:
89
- event_dates.append(d)
90
-
91
- if not issue_dates and dates:
92
- issue_dates.append(dates[0])
93
-
94
- return issue_dates, event_dates
95
-
96
- # ------------------------------
97
- # Evidence & Classification
98
- # ------------------------------
99
- def verify_document(file_path):
100
- ext = file_path.split('.')[-1].lower()
101
- if ext == "pdf":
102
- text = extract_text_from_pdf(file_path)
103
- elif ext == "docx":
104
- text = extract_text_from_docx(file_path)
105
- elif ext in ["png", "jpg", "jpeg"]:
106
- text = extract_text_from_image(file_path)
107
- else:
108
- return "Unsupported file type."
109
-
110
- if not text.strip():
111
- return "--- Evidence Report ---\n\n❌ No readable text was extracted from the document."
112
-
113
- # Grammar & Spelling
114
- grammar_issue = check_grammar(text)
115
-
116
- # Dates
117
- dates = extract_dates(text)
118
- issue_dates, event_dates = classify_dates(text, dates)
119
-
120
- # Classification
121
- labels = ["REAL", "FAKE"]
122
- result = classifier(text[:1000], candidate_labels=labels)
123
-
124
- # Build Report
125
- report = "πŸ“„ Evidence Report\n\n"
126
- report += "πŸ”Ž Document Analysis\n\n"
127
- report += f"File Type: {ext.upper()}\n"
128
- report += "OCR Applied: " + ("Yes" if "ocr_text" in locals() else "No") + "\n\n"
129
-
130
- report += "βœ… Evidence Considered\n\n"
131
- if grammar_issue:
132
- report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
133
- else:
134
- report += "No major grammar or spelling issues detected.\n\n"
135
-
136
- if issue_dates:
137
- report += f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
138
- if event_dates:
139
- report += f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
140
- if not dates:
141
- report += "No specific dates were clearly detected.\n"
142
-
143
- report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
144
- report += "Signatures and registrar details align with standard official notices.\n\n"
145
-
146
- report += "🏁 Classification Result\n\n"
147
- report += f"Verdict: {result['labels'][0]}\n"
148
- report += f"Confidence: {result['scores'][0]:.2f}\n"
149
-
150
- return report
151
-
152
-
153
-
154
- st.title("πŸ“„ Document Verifier")
155
- st.write("Upload a PDF, DOCX, or Image to check authenticity.")
156
-
157
- uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])
158
-
159
- if uploaded_file is not None:
160
- with open(uploaded_file.name, "wb") as f:
161
- f.write(uploaded_file.getbuffer())
162
- result = verify_document(uploaded_file.name)
163
- st.text_area("πŸ“‹ Evidence Report", result, height=400)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import pdfplumber
3
+ import docx
4
+ from PIL import Image
5
+ import pytesseract
6
+ from pdf2image import convert_from_path
7
+ from textblob import TextBlob
8
+ import re
9
+ import streamlit as st
10
+
11
# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
# Loaded once at module import so every verification reuses the same model;
# candidate labels are supplied per call (see verify_text).
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
15
+
16
# ------------------------------
# Text Extraction
# ------------------------------
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, using OCR as a fallback for scanned files.

    First reads the embedded text layer via pdfplumber, page by page.
    If that yields nothing but whitespace (typical of image-only scans),
    renders every page with pdf2image and runs pytesseract OCR instead.
    Returns the stripped result.
    """
    collected = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                collected.append(extracted + "\n")
    text = "".join(collected)

    # OCR fallback: no embedded text layer was found.
    if not text.strip():
        page_images = convert_from_path(file_path)
        text = "".join(
            pytesseract.image_to_string(image) + "\n" for image in page_images
        )
    return text.strip()
35
+
36
def extract_text_from_docx(file_path):
    """Return all paragraph text of a .docx file, newline-joined and stripped."""
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts).strip()
39
+
40
def extract_text_from_image(file_path):
    """OCR an image file with pytesseract and return the stripped text."""
    image = Image.open(file_path)
    recognized = pytesseract.image_to_string(image)
    return recognized.strip()
42
+
43
# ------------------------------
# Grammar & Spelling (TextBlob)
# ------------------------------
def check_grammar(text):
    """Return True when TextBlob's autocorrect would change *text* at all.

    Note: this only detects that a correction was suggested; it does not
    count or locate the issues.
    """
    corrected = str(TextBlob(text).correct())
    return text != corrected
50
+
51
# ------------------------------
# Date Extraction (Improved)
# ------------------------------
def extract_dates(text):
    """Find date-like substrings in *text* and return them de-duplicated.

    Recognizes numeric formats (28-05-2025, 28/05/2025, 28.05.2025) and
    written formats (28th May 2025, May 28, 2025), case-insensitively.

    Fix: the previous ``list(set(...))`` returned the dates in arbitrary
    (hash-dependent) order, which made classify_dates' "first date is the
    issue date" fallback nondeterministic. ``dict.fromkeys`` de-duplicates
    while preserving first-seen order, so results are now stable.

    Args:
        text: Text to scan.

    Returns:
        List of unique matched date strings, ordered by pattern priority
        and first occurrence.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',                 # 28-05-2025 / 28/05/2025
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',                     # 28.05.2025
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',  # 28th May 2025
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',                        # May 28, 2025
    ]

    dates_found = []
    for pattern in date_patterns:
        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))

    # Order-preserving de-duplication (dicts keep insertion order).
    return list(dict.fromkeys(dates_found))
68
+
69
def classify_dates(text, dates):
    """Split detected dates into document-issue dates and event dates.

    For each date, a ~120-character window of surrounding text is searched
    for keyword cues. Event dates keep the remainder of their line as a
    description when it can be re-located in the original-case text.
    If no issue date was identified, the first detected date is assumed
    to be the issue date.

    Args:
        text: The full document text.
        dates: Date strings previously found by extract_dates.

    Returns:
        Tuple ``(issue_dates, event_dates)`` of lists of strings.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates, event_dates = [], []
    lowered = text.lower()

    for date_str in dates:
        position = lowered.find(date_str.lower())
        if position == -1:
            continue  # date not re-locatable (shouldn't normally happen)

        window = text[max(0, position - 60): position + 60].lower()

        if any(keyword in window for keyword in issue_keywords):
            issue_dates.append(date_str)
        elif any(keyword in window for keyword in event_keywords):
            # Capture the event/holiday name that follows the date, if the
            # case-sensitive search succeeds; otherwise keep the bare date.
            tail = text[position: position + 80]
            found = re.search(rf"{re.escape(date_str)}[^\n]*", tail)
            event_dates.append(found.group().strip() if found else date_str)

    # Heuristic default: treat the first detected date as the issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])

    return issue_dates, event_dates
96
+
97
# ------------------------------
# Verification Core
# ------------------------------
def verify_text(text, source_type="TEXT"):
    """Analyze *text* and return a multi-line plain-text evidence report.

    Combines three signals: a TextBlob grammar/spelling check, regex-based
    date extraction with issue/event classification, and a zero-shot
    REAL/FAKE classification of the text.

    Args:
        text: Document text to verify.
        source_type: Label shown in the report header (e.g. "PDF",
            "PASTED TEXT").

    Returns:
        The report string, or a short error message when *text* is empty
        or whitespace-only.
    """
    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text provided."

    # Grammar & Spelling: True when TextBlob's autocorrect would alter the text.
    grammar_issue = check_grammar(text)

    # Dates: extract date-like strings, then split into issue vs. event dates.
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Classification: only the first 1000 characters are classified (hard truncation).
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)

    # Build Report
    report = "πŸ“„ Evidence Report\n\n"
    report += "πŸ”Ž Document Analysis\n\n"
    report += f"Source: {source_type}\n\n"

    report += "βœ… Evidence Considered\n\n"
    if grammar_issue:
        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
    else:
        report += "No major grammar or spelling issues detected.\n\n"

    if issue_dates:
        report += f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates were clearly detected.\n"

    # NOTE(review): the next two sentences are emitted unconditionally — the
    # code never inspects signatures or formatting; confirm this wording is
    # intended for every document.
    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
    report += "Signatures and registrar details align with standard official notices.\n\n"

    # result['labels'] is ordered by descending score, so index 0 is the verdict.
    report += "🏁 Classification Result\n\n"
    report += f"Verdict: {result['labels'][0]}\n"
    report += f"Confidence: {result['scores'][0]:.2f}\n"

    return report
141
+
142
def verify_document(file_path):
    """Extract text from a file by extension and run verify_text on it.

    Supports pdf (pdfplumber + OCR fallback), docx, and png/jpg/jpeg
    (OCR). Any other extension yields an error message string.

    Args:
        file_path: Path to the uploaded file on disk.

    Returns:
        The evidence-report string, or "Unsupported file type.".
    """
    extension = file_path.split('.')[-1].lower()
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "png": extract_text_from_image,
        "jpg": extract_text_from_image,
        "jpeg": extract_text_from_image,
    }

    extractor = extractors.get(extension)
    if extractor is None:
        return "Unsupported file type."

    return verify_text(extractor(file_path), source_type=extension.upper())
154
+
155
# ------------------------------
# Streamlit UI
# ------------------------------
st.title("πŸ“„ Document Verifier")
st.write("Upload a PDF, DOCX, Image, or paste text to check authenticity.")

# File Upload (restricted to the extensions verify_document can handle)
uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

# Text Input (used only when no file is uploaded — see elif below)
pasted_text = st.text_area("Or paste text below:", height=200)

# Verify File: persist the upload to disk so the path-based extractors can read it.
# NOTE(review): the file is written into the working directory under the
# user-supplied name and never deleted — consider tempfile to avoid name
# collisions and path issues.
if uploaded_file is not None:
    with open(uploaded_file.name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    result = verify_document(uploaded_file.name)
    st.text_area("πŸ“‹ Evidence Report", result, height=400)

# Verify Text: pasted text takes effect only when no file was uploaded.
elif pasted_text.strip():
    result = verify_text(pasted_text, source_type="PASTED TEXT")
    st.text_area("πŸ“‹ Evidence Report", result, height=400)