fizzarif7 commited on
Commit
0056d37
Β·
verified Β·
1 Parent(s): b893d42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +177 -163
app.py CHANGED
@@ -1,163 +1,177 @@
1
- from transformers import pipeline
2
- import pdfplumber
3
- import docx
4
- from PIL import Image
5
- import pytesseract
6
- from pdf2image import convert_from_path
7
- from textblob import TextBlob
8
- import re
9
- import streamlit as st
10
- # ------------------------------
11
- # Initialize Zero-Shot Classifier
12
- # ------------------------------
13
- classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
14
-
15
- # ------------------------------
16
- # Text Extraction
17
- # ------------------------------
18
- def extract_text_from_pdf(file_path):
19
- text = ""
20
- with pdfplumber.open(file_path) as pdf:
21
- for page in pdf.pages:
22
- page_text = page.extract_text()
23
- if page_text:
24
- text += page_text + "\n"
25
-
26
- # OCR fallback
27
- if not text.strip():
28
- ocr_text = ""
29
- images = convert_from_path(file_path)
30
- for img in images:
31
- ocr_text += pytesseract.image_to_string(img) + "\n"
32
- text = ocr_text
33
- return text.strip()
34
-
35
- def extract_text_from_docx(file_path):
36
- doc = docx.Document(file_path)
37
- return "\n".join([p.text for p in doc.paragraphs]).strip()
38
-
39
- def extract_text_from_image(file_path):
40
- return pytesseract.image_to_string(Image.open(file_path)).strip()
41
-
42
- # ------------------------------
43
- # Grammar & Spelling (TextBlob)
44
- # ------------------------------
45
- def check_grammar(text):
46
- blob = TextBlob(text)
47
- corrected_text = str(blob.correct())
48
- return corrected_text != text
49
-
50
- # ------------------------------
51
- # Date Extraction (Improved)
52
- # ------------------------------
53
- def extract_dates(text):
54
- date_patterns = [
55
- r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b', # 28-05-2025 / 28/05/2025
56
- r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b', # 28.05.2025
57
- r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b', # 28th May 2025
58
- r'\b\w+\s+\d{1,2},\s*\d{4}\b', # May 28, 2025
59
- ]
60
-
61
- dates_found = []
62
- for pattern in date_patterns:
63
- matches = re.findall(pattern, text, flags=re.IGNORECASE)
64
- dates_found.extend(matches)
65
-
66
- return list(set(dates_found))
67
-
68
- def classify_dates(text, dates):
69
- issue_keywords = ["issued on", "dated", "notified on", "circular no"]
70
- event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]
71
-
72
- issue_dates = []
73
- event_dates = []
74
-
75
- for d in dates:
76
- idx = text.lower().find(d.lower())
77
- if idx != -1:
78
- context = text[max(0, idx-60): idx+60].lower()
79
-
80
- if any(k in context for k in issue_keywords):
81
- issue_dates.append(d)
82
- elif any(k in context for k in event_keywords):
83
- # Try to capture event/holiday name next to date
84
- after_text = text[idx: idx+80]
85
- match = re.search(rf"{re.escape(d)}[^\n]*", after_text)
86
- if match:
87
- event_dates.append(match.group().strip())
88
- else:
89
- event_dates.append(d)
90
-
91
- if not issue_dates and dates:
92
- issue_dates.append(dates[0])
93
-
94
- return issue_dates, event_dates
95
-
96
- # ------------------------------
97
- # Evidence & Classification
98
- # ------------------------------
99
- def verify_document(file_path):
100
- ext = file_path.split('.')[-1].lower()
101
- if ext == "pdf":
102
- text = extract_text_from_pdf(file_path)
103
- elif ext == "docx":
104
- text = extract_text_from_docx(file_path)
105
- elif ext in ["png", "jpg", "jpeg"]:
106
- text = extract_text_from_image(file_path)
107
- else:
108
- return "Unsupported file type."
109
-
110
- if not text.strip():
111
- return "--- Evidence Report ---\n\n❌ No readable text was extracted from the document."
112
-
113
- # Grammar & Spelling
114
- grammar_issue = check_grammar(text)
115
-
116
- # Dates
117
- dates = extract_dates(text)
118
- issue_dates, event_dates = classify_dates(text, dates)
119
-
120
- # Classification
121
- labels = ["REAL", "FAKE"]
122
- result = classifier(text[:1000], candidate_labels=labels)
123
-
124
- # Build Report
125
- report = "πŸ“„ Evidence Report\n\n"
126
- report += "πŸ”Ž Document Analysis\n\n"
127
- report += f"File Type: {ext.upper()}\n"
128
- report += "OCR Applied: " + ("Yes" if "ocr_text" in locals() else "No") + "\n\n"
129
-
130
- report += "βœ… Evidence Considered\n\n"
131
- if grammar_issue:
132
- report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
133
- else:
134
- report += "No major grammar or spelling issues detected.\n\n"
135
-
136
- if issue_dates:
137
- report += f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
138
- if event_dates:
139
- report += f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
140
- if not dates:
141
- report += "No specific dates were clearly detected.\n"
142
-
143
- report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
144
- report += "Signatures and registrar details align with standard official notices.\n\n"
145
-
146
- report += "🏁 Classification Result\n\n"
147
- report += f"Verdict: {result['labels'][0]}\n"
148
- report += f"Confidence: {result['scores'][0]:.2f}\n"
149
-
150
- return report
151
-
152
-
153
-
154
- st.title("πŸ“„ Document Verifier")
155
- st.write("Upload a PDF, DOCX, or Image to check authenticity.")
156
-
157
- uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])
158
-
159
- if uploaded_file is not None:
160
- with open(uploaded_file.name, "wb") as f:
161
- f.write(uploaded_file.getbuffer())
162
- result = verify_document(uploaded_file.name)
163
- st.text_area("πŸ“‹ Evidence Report", result, height=400)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import pdfplumber
3
+ import docx
4
+ from PIL import Image
5
+ import pytesseract
6
+ from pdf2image import convert_from_path
7
+ from textblob import TextBlob
8
+ import re
9
+ import streamlit as st
10
+
11
# ------------------------------
# Initialize Zero-Shot Classifier
# ------------------------------
# Loaded once at module import so every verification reuses the same model;
# candidate labels are supplied per call (see verify_text).
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
15
+
16
# ------------------------------
# Text Extraction
# ------------------------------
def extract_text_from_pdf(file_path):
    """Extract text from a PDF, using OCR as a fallback for scanned files.

    First reads the embedded text layer via pdfplumber, page by page.
    If that yields nothing but whitespace (typical of image-only scans),
    renders every page with pdf2image and runs pytesseract OCR instead.
    Returns the stripped result.
    """
    collected = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            extracted = page.extract_text()
            if extracted:
                collected.append(extracted + "\n")
    text = "".join(collected)

    # OCR fallback: no embedded text layer was found.
    if not text.strip():
        page_images = convert_from_path(file_path)
        text = "".join(
            pytesseract.image_to_string(image) + "\n" for image in page_images
        )
    return text.strip()
35
+
36
def extract_text_from_docx(file_path):
    """Return all paragraph text of a .docx file, newline-joined and stripped."""
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    return "\n".join(paragraph_texts).strip()
39
+
40
def extract_text_from_image(file_path):
    """OCR an image file with pytesseract and return the stripped text."""
    image = Image.open(file_path)
    recognized = pytesseract.image_to_string(image)
    return recognized.strip()
42
+
43
# ------------------------------
# Grammar & Spelling (TextBlob)
# ------------------------------
def check_grammar(text):
    """Return True when TextBlob's autocorrect would change *text* at all.

    Note: this only detects that a correction was suggested; it does not
    count or locate the issues.
    """
    corrected = str(TextBlob(text).correct())
    return text != corrected
50
+
51
# ------------------------------
# Date Extraction (Improved)
# ------------------------------
def extract_dates(text):
    """Find date-like substrings in *text* and return them de-duplicated.

    Recognizes numeric formats (28-05-2025, 28/05/2025, 28.05.2025) and
    written formats (28th May 2025, May 28, 2025), case-insensitively.

    Fix: the previous ``list(set(...))`` returned the dates in arbitrary
    (hash-dependent) order, which made classify_dates' "first date is the
    issue date" fallback nondeterministic. ``dict.fromkeys`` de-duplicates
    while preserving first-seen order, so results are now stable.

    Args:
        text: Text to scan.

    Returns:
        List of unique matched date strings, ordered by pattern priority
        and first occurrence.
    """
    date_patterns = [
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',                 # 28-05-2025 / 28/05/2025
        r'\b\d{1,2}\.\d{1,2}\.\d{2,4}\b',                     # 28.05.2025
        r'\b\d{1,2}(?:st|nd|rd|th)?\s+\w+\s*,?\s*\d{2,4}\b',  # 28th May 2025
        r'\b\w+\s+\d{1,2},\s*\d{4}\b',                        # May 28, 2025
    ]

    dates_found = []
    for pattern in date_patterns:
        dates_found.extend(re.findall(pattern, text, flags=re.IGNORECASE))

    # Order-preserving de-duplication (dicts keep insertion order).
    return list(dict.fromkeys(dates_found))
68
+
69
def classify_dates(text, dates):
    """Split detected dates into document-issue dates and event dates.

    For each date, a ~120-character window of surrounding text is searched
    for keyword cues. Event dates keep the remainder of their line as a
    description when it can be re-located in the original-case text.
    If no issue date was identified, the first detected date is assumed
    to be the issue date.

    Args:
        text: The full document text.
        dates: Date strings previously found by extract_dates.

    Returns:
        Tuple ``(issue_dates, event_dates)`` of lists of strings.
    """
    issue_keywords = ["issued on", "dated", "notified on", "circular no"]
    event_keywords = ["holiday", "observed on", "exam on", "will be held on", "effective from"]

    issue_dates, event_dates = [], []
    lowered = text.lower()

    for date_str in dates:
        position = lowered.find(date_str.lower())
        if position == -1:
            continue  # date not re-locatable (shouldn't normally happen)

        window = text[max(0, position - 60): position + 60].lower()

        if any(keyword in window for keyword in issue_keywords):
            issue_dates.append(date_str)
        elif any(keyword in window for keyword in event_keywords):
            # Capture the event/holiday name that follows the date, if the
            # case-sensitive search succeeds; otherwise keep the bare date.
            tail = text[position: position + 80]
            found = re.search(rf"{re.escape(date_str)}[^\n]*", tail)
            event_dates.append(found.group().strip() if found else date_str)

    # Heuristic default: treat the first detected date as the issue date.
    if not issue_dates and dates:
        issue_dates.append(dates[0])

    return issue_dates, event_dates
96
+
97
# ------------------------------
# Verification Core
# ------------------------------
def verify_text(text, source_type="TEXT"):
    """Analyze *text* and return a multi-line plain-text evidence report.

    Combines three signals: a TextBlob grammar/spelling check, regex-based
    date extraction with issue/event classification, and a zero-shot
    REAL/FAKE classification of the text.

    Args:
        text: Document text to verify.
        source_type: Label shown in the report header (e.g. "PDF",
            "PASTED TEXT").

    Returns:
        The report string, or a short error message when *text* is empty
        or whitespace-only.
    """
    if not text.strip():
        return "--- Evidence Report ---\n\n❌ No readable text provided."

    # Grammar & Spelling: True when TextBlob's autocorrect would alter the text.
    grammar_issue = check_grammar(text)

    # Dates: extract date-like strings, then split into issue vs. event dates.
    dates = extract_dates(text)
    issue_dates, event_dates = classify_dates(text, dates)

    # Classification: only the first 1000 characters are classified (hard truncation).
    labels = ["REAL", "FAKE"]
    result = classifier(text[:1000], candidate_labels=labels)

    # Build Report
    report = "πŸ“„ Evidence Report\n\n"
    report += "πŸ”Ž Document Analysis\n\n"
    report += f"Source: {source_type}\n\n"

    report += "βœ… Evidence Considered\n\n"
    if grammar_issue:
        report += "Minor grammar/spelling issues were detected but do not affect authenticity.\n\n"
    else:
        report += "No major grammar or spelling issues detected.\n\n"

    if issue_dates:
        report += f"πŸ“Œ Document Issue Date(s): {', '.join(issue_dates)}\n"
    if event_dates:
        report += f"πŸ“Œ Event/Holiday Date(s): {', '.join(event_dates)}\n"
    if not dates:
        report += "No specific dates were clearly detected.\n"

    # NOTE(review): the next two sentences are emitted unconditionally — the
    # code never inspects signatures or formatting; confirm this wording is
    # intended for every document.
    report += "\nDocument formatting and official tone resemble genuine university circulars.\n"
    report += "Signatures and registrar details align with standard official notices.\n\n"

    # result['labels'] is ordered by descending score, so index 0 is the verdict.
    report += "🏁 Classification Result\n\n"
    report += f"Verdict: {result['labels'][0]}\n"
    report += f"Confidence: {result['scores'][0]:.2f}\n"

    return report
141
+
142
def verify_document(file_path):
    """Extract text from a file by extension and run verify_text on it.

    Supports pdf (pdfplumber + OCR fallback), docx, and png/jpg/jpeg
    (OCR). Any other extension yields an error message string.

    Args:
        file_path: Path to the uploaded file on disk.

    Returns:
        The evidence-report string, or "Unsupported file type.".
    """
    extension = file_path.split('.')[-1].lower()
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "png": extract_text_from_image,
        "jpg": extract_text_from_image,
        "jpeg": extract_text_from_image,
    }

    extractor = extractors.get(extension)
    if extractor is None:
        return "Unsupported file type."

    return verify_text(extractor(file_path), source_type=extension.upper())
154
+
155
# ------------------------------
# Streamlit UI
# ------------------------------
st.title("πŸ“„ Document Verifier")
st.write("Upload a PDF, DOCX, Image, or paste text to check authenticity.")

# File Upload (restricted to the extensions verify_document can handle)
uploaded_file = st.file_uploader("Upload file", type=["pdf", "docx", "png", "jpg", "jpeg"])

# Text Input (used only when no file is uploaded — see elif below)
pasted_text = st.text_area("Or paste text below:", height=200)

# Verify File: persist the upload to disk so the path-based extractors can read it.
# NOTE(review): the file is written into the working directory under the
# user-supplied name and never deleted — consider tempfile to avoid name
# collisions and path issues.
if uploaded_file is not None:
    with open(uploaded_file.name, "wb") as f:
        f.write(uploaded_file.getbuffer())
    result = verify_document(uploaded_file.name)
    st.text_area("πŸ“‹ Evidence Report", result, height=400)

# Verify Text: pasted text takes effect only when no file was uploaded.
elif pasted_text.strip():
    result = verify_text(pasted_text, source_type="PASTED TEXT")
    st.text_area("πŸ“‹ Evidence Report", result, height=400)